Diffstat (limited to 'searx/engines')
-rw-r--r--  searx/engines/__init__.py           |  51
-rw-r--r--  searx/engines/bing_images.py        |  16
-rw-r--r--  searx/engines/blekko_images.py      |   2
-rw-r--r--  searx/engines/btdigg.py             |   2
-rw-r--r--  searx/engines/deviantart.py         |   2
-rw-r--r--  searx/engines/digg.py               |   2
-rw-r--r--  searx/engines/faroo.py              |   2
-rw-r--r--  searx/engines/frinkiac.py           |  44
-rw-r--r--  searx/engines/gigablast.py          |  57
-rw-r--r--  searx/engines/google.py             |  50
-rw-r--r--  searx/engines/mediawiki.py          |  21
-rw-r--r--  searx/engines/searchcode_code.py    |   4
-rw-r--r--  searx/engines/searchcode_doc.py     |   4
-rw-r--r--  searx/engines/soundcloud.py         |  32
-rw-r--r--  searx/engines/stackoverflow.py      |   2
-rw-r--r--  searx/engines/startpage.py          |   8
-rw-r--r--  searx/engines/swisscows.py          |   9
-rw-r--r--  searx/engines/wikidata.py           |   4
-rw-r--r--  searx/engines/wolframalpha_api.py   | 122
-rw-r--r--  searx/engines/wolframalpha_noapi.py | 116
-rw-r--r--  searx/engines/www1x.py              |   2
-rw-r--r--  searx/engines/xpath.py              |   4
-rw-r--r--  searx/engines/yandex.py             |   7
23 files changed, 442 insertions(+), 121 deletions(-)
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index 447138d3b..6d5066733 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -34,6 +34,15 @@
 engines = {}
 categories = {'general': []}
 engine_shortcuts = {}
+engine_default_args = {'paging': False,
+                       'categories': ['general'],
+                       'language_support': True,
+                       'safesearch': False,
+                       'timeout': settings['outgoing']['request_timeout'],
+                       'shortcut': '-',
+                       'disabled': False,
+                       'suspend_end_time': 0,
+                       'continuous_errors': 0}
 
 
 def load_module(filename):
@@ -62,26 +71,9 @@ def load_engine(engine_data):
             continue
         setattr(engine, param_name, engine_data[param_name])
 
-    if not hasattr(engine, 'paging'):
-        engine.paging = False
-
-    if not hasattr(engine, 'categories'):
-        engine.categories = ['general']
-
-    if not hasattr(engine, 'language_support'):
-        engine.language_support = True
-
-    if not hasattr(engine, 'safesearch'):
-        engine.safesearch = False
-
-    if not hasattr(engine, 'timeout'):
-        engine.timeout = settings['outgoing']['request_timeout']
-
-    if not hasattr(engine, 'shortcut'):
-        engine.shortcut = ''
-
-    if not hasattr(engine, 'disabled'):
-        engine.disabled = False
+    for arg_name, arg_value in engine_default_args.iteritems():
+        if not hasattr(engine, arg_name):
+            setattr(engine, arg_name, arg_value)
 
     # checking required variables
     for engine_attr in dir(engine):
@@ -100,18 +92,15 @@ def load_engine(engine_data):
         'errors': 0
     }
 
-    if hasattr(engine, 'categories'):
-        for category_name in engine.categories:
-            categories.setdefault(category_name, []).append(engine)
-    else:
-        categories['general'].append(engine)
+    for category_name in engine.categories:
+        categories.setdefault(category_name, []).append(engine)
+
+    if engine.shortcut in engine_shortcuts:
+        logger.error('Engine config error: ambigious shortcut: {0}'.format(engine.shortcut))
+        sys.exit(1)
+
+    engine_shortcuts[engine.shortcut] = engine.name
 
-    if engine.shortcut:
-        if engine.shortcut in engine_shortcuts:
-            logger.error('Engine config error: ambigious shortcut: {0}'
-                         .format(engine.shortcut))
-            sys.exit(1)
-        engine_shortcuts[engine.shortcut] = engine.name
     return engine
diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
index 06850dfe1..2664b795f 100644
--- a/searx/engines/bing_images.py
+++ b/searx/engines/bing_images.py
@@ -17,7 +17,7 @@
 
 from urllib import urlencode
 from lxml import html
-from yaml import load
+from json import loads
 import re
 
 # engine dependent config
@@ -36,6 +36,9 @@ safesearch_types = {2: 'STRICT',
                     0: 'OFF'}
 
 
+_quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
+
+
 # do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
@@ -65,22 +68,19 @@ def response(resp):
 
     dom = html.fromstring(resp.text)
 
-    # init regex for yaml-parsing
-    p = re.compile('({|,)([a-z]+):(")')
-
     # parse results
     for result in dom.xpath('//div[@class="dg_u"]'):
         link = result.xpath('./a')[0]
 
-        # parse yaml-data (it is required to add a space, to make it parsable)
-        yaml_data = load(p.sub(r'\1\2: \3', link.attrib.get('m')))
+        # parse json-data (it is required to add a space, to make it parsable)
+        json_data = loads(_quote_keys_regex.sub(r'\1"\2": \3', link.attrib.get('m')))
 
         title = link.attrib.get('t1')
         ihk = link.attrib.get('ihk')
 
         # url = 'http://' + link.attrib.get('t3')
-        url = yaml_data.get('surl')
-        img_src = yaml_data.get('imgurl')
+        url = json_data.get('surl')
+        img_src = json_data.get('imgurl')
 
         # append result
         results.append({'template': 'images.html',
diff --git a/searx/engines/blekko_images.py b/searx/engines/blekko_images.py
index 93ac6616b..c0664f390 100644
--- a/searx/engines/blekko_images.py
+++ b/searx/engines/blekko_images.py
@@ -37,7 +37,7 @@ def request(query, params):
                                                  c=c)
 
     if params['pageno'] != 1:
-        params['url'] += '&page={pageno}'.format(pageno=(params['pageno']-1))
+        params['url'] += '&page={pageno}'.format(pageno=(params['pageno'] - 1))
 
     # let Blekko know we wan't have profiling
     params['cookies']['tag_lesslogging'] = '1'
diff --git a/searx/engines/btdigg.py b/searx/engines/btdigg.py
index 192ed6ee9..c2b22f003 100644
--- a/searx/engines/btdigg.py
+++ b/searx/engines/btdigg.py
@@ -29,7 +29,7 @@ search_url = url + '/search?q={search_term}&p={pageno}'
 # do search-request
 def request(query, params):
     params['url'] = search_url.format(search_term=quote(query),
-                                      pageno=params['pageno']-1)
+                                      pageno=params['pageno'] - 1)
 
     return params
diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py
index 60c8d7ea7..135aeb324 100644
--- a/searx/engines/deviantart.py
+++ b/searx/engines/deviantart.py
@@ -24,7 +24,7 @@ paging = True
 
 # search-url
 base_url = 'https://www.deviantart.com/'
-search_url = base_url+'browse/all/?offset={offset}&{query}'
+search_url = base_url + 'browse/all/?offset={offset}&{query}'
 
 
 # do search-request
diff --git a/searx/engines/digg.py b/searx/engines/digg.py
index 000f66ba2..a10b38bb6 100644
--- a/searx/engines/digg.py
+++ b/searx/engines/digg.py
@@ -22,7 +22,7 @@ paging = True
 
 # search-url
 base_url = 'https://digg.com/'
-search_url = base_url+'api/search/{query}.json?position={position}&format=html'
+search_url = base_url + 'api/search/{query}.json?position={position}&format=html'
 
 # specific xpath variables
 results_xpath = '//article'
diff --git a/searx/engines/faroo.py b/searx/engines/faroo.py
index 43df14eef..9fa244e77 100644
--- a/searx/engines/faroo.py
+++ b/searx/engines/faroo.py
@@ -88,7 +88,7 @@ def response(resp):
     for result in search_res['results']:
         if result['news']:
             # timestamp (milliseconds since 1970)
-            publishedDate = datetime.datetime.fromtimestamp(result['date']/1000.0)  # noqa
+            publishedDate = datetime.datetime.fromtimestamp(result['date'] / 1000.0)  # noqa
 
             # append news result
             results.append({'url': result['url'],
diff --git a/searx/engines/frinkiac.py b/searx/engines/frinkiac.py
new file mode 100644
index 000000000..a9383f862
--- /dev/null
+++ b/searx/engines/frinkiac.py
@@ -0,0 +1,44 @@
+"""
+Frinkiac (Images)
+
+@website     https://www.frinkiac.com
+@provide-api no
+@using-api   no
+@results     JSON
+@stable      no
+@parse       url, title, img_src
+"""
+
+from json import loads
+from urllib import urlencode
+
+categories = ['images']
+
+BASE = 'https://frinkiac.com/'
+SEARCH_URL = '{base}api/search?{query}'
+RESULT_URL = '{base}?{query}'
+THUMB_URL = '{base}img/{episode}/{timestamp}/medium.jpg'
+IMAGE_URL = '{base}img/{episode}/{timestamp}.jpg'
+
+
+def request(query, params):
+    params['url'] = SEARCH_URL.format(base=BASE, query=urlencode({'q': query}))
+    return params
+
+
+def response(resp):
+    results = []
+    response_data = loads(resp.text)
+    for result in response_data:
+        episode = result['Episode']
+        timestamp = result['Timestamp']
+
+        results.append({'template': 'images.html',
+                        'url': RESULT_URL.format(base=BASE,
+                                                 query=urlencode({'p': 'caption', 'e': episode, 't': timestamp})),
+                        'title': episode,
+                        'content': '',
+                        'thumbnail_src': THUMB_URL.format(base=BASE, episode=episode, timestamp=timestamp),
+                        'img_src': IMAGE_URL.format(base=BASE, episode=episode, timestamp=timestamp)})
+
+    return results
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index 3fef102f4..1cc243104 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -10,20 +10,30 @@
 @parse       url, title, content
 """
 
-from urllib import urlencode
 from cgi import escape
-from lxml import etree
+from json import loads
 from random import randint
 from time import time
+from urllib import urlencode
 
 # engine dependent config
 categories = ['general']
 paging = True
-number_of_results = 5
+number_of_results = 10
+language_support = True
+safesearch = True
 
-# search-url, invalid HTTPS certificate
+# search-url
 base_url = 'https://gigablast.com/'
-search_string = 'search?{query}&n={number_of_results}&s={offset}&format=xml&qh=0&rxiyd={rxiyd}&rand={rand}'
+search_string = 'search?{query}'\
+    '&n={number_of_results}'\
+    '&c=main'\
+    '&s={offset}'\
+    '&format=json'\
+    '&qh=0'\
+    '&rxiwd={rxiwd}'\
+    '&qlang={lang}'\
+    '&ff={safesearch}'
 
 # specific xpath variables
 results_xpath = '//response//result'
@@ -36,12 +46,23 @@ content_xpath = './/sum'
 def request(query, params):
     offset = (params['pageno'] - 1) * number_of_results
 
-    search_path = search_string.format(
-        query=urlencode({'q': query}),
-        offset=offset,
-        number_of_results=number_of_results,
-        rxiyd=randint(10000, 10000000),
-        rand=int(time()))
+    if params['language'] == 'all':
+        language = 'xx'
+    else:
+        language = params['language'][0:2]
+
+    if params['safesearch'] >= 1:
+        safesearch = 1
+    else:
+        safesearch = 0
+
+    search_path = search_string.format(query=urlencode({'q': query}),
+                                       offset=offset,
+                                       number_of_results=number_of_results,
+                                       rxiwd=1,
+                                       # rand=int(time()),
+                                       lang=language,
+                                       safesearch=safesearch)
 
     params['url'] = base_url + search_path
@@ -52,18 +73,14 @@ def request(query, params):
 def response(resp):
     results = []
 
-    dom = etree.fromstring(resp.content)
-
-    # parse results
-    for result in dom.xpath(results_xpath):
-        url = result.xpath(url_xpath)[0].text
-        title = result.xpath(title_xpath)[0].text
-        content = escape(result.xpath(content_xpath)[0].text)
+    response_json = loads(resp.text)
 
+    for result in response_json['results']:
         # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content})
+        results.append({'url': result['url'],
+                        'title': escape(result['title']),
+                        'content': escape(result['sum'])})
 
     # return results
     return results
diff --git a/searx/engines/google.py b/searx/engines/google.py
index e82260356..dbca205a1 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -90,7 +90,7 @@ url_map = 'https://www.openstreetmap.org/'\
 search_path = '/search'
 
 search_url = ('https://{hostname}' +
               search_path +
-              '?{query}&start={offset}&gbv=1&gws_rd=cr')
+              '?{query}&start={offset}&gbv=1&gws_rd=ssl')
 
 # other URLs
 map_hostname_start = 'maps.google.'
@@ -99,7 +99,7 @@ redirect_path = '/url'
 images_path = '/images'
 
 # specific xpath variables
-results_xpath = '//li[@class="g"]'
+results_xpath = '//div[@class="g"]'
 url_xpath = './/h3/a/@href'
 title_xpath = './/h3'
 content_xpath = './/span[@class="st"]'
@@ -209,29 +209,29 @@ def response(resp):
             parsed_url = urlparse(url, google_hostname)
 
             # map result
-            if ((parsed_url.netloc == google_hostname and parsed_url.path.startswith(maps_path))
-                    or (parsed_url.netloc.startswith(map_hostname_start))):
-                x = result.xpath(map_near)
-                if len(x) > 0:
-                    # map : near the location
-                    results = results + parse_map_near(parsed_url, x, google_hostname)
-                else:
-                    # map : detail about a location
-                    results = results + parse_map_detail(parsed_url, result, google_hostname)
-
-            # google news
-            elif (parsed_url.netloc == google_hostname
-                  and parsed_url.path == search_path):
-                # skipping news results
-                pass
-
-            # images result
-            elif (parsed_url.netloc == google_hostname
-                  and parsed_url.path == images_path):
-                # only thumbnail image provided,
-                # so skipping image results
-                # results = results + parse_images(result, google_hostname)
-                pass
+            if parsed_url.netloc == google_hostname:
+                # TODO fix inside links
+                continue
+                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
+                #     print "yooooo"*30
+                #     x = result.xpath(map_near)
+                #     if len(x) > 0:
+                #         # map : near the location
+                #         results = results + parse_map_near(parsed_url, x, google_hostname)
+                #     else:
+                #         # map : detail about a location
+                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
+                # # google news
+                # elif parsed_url.path == search_path:
+                #     # skipping news results
+                #     pass
+
+                # # images result
+                # elif parsed_url.path == images_path:
+                #     # only thumbnail image provided,
+                #     # so skipping image results
+                #     # results = results + parse_images(result, google_hostname)
+                #     pass
 
             else:
                 # normal result
diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py
index 9fb72e830..26d3720d9 100644
--- a/searx/engines/mediawiki.py
+++ b/searx/engines/mediawiki.py
@@ -24,13 +24,13 @@ number_of_results = 1
 
 # search-url
 base_url = 'https://{language}.wikipedia.org/'
-search_url = base_url + 'w/api.php?action=query'\
-    '&list=search'\
-    '&{query}'\
-    '&srprop=timestamp'\
-    '&format=json'\
-    '&sroffset={offset}'\
-    '&srlimit={limit}'  # noqa
+search_postfix = 'w/api.php?action=query'\
+    '&list=search'\
+    '&{query}'\
+    '&format=json'\
+    '&sroffset={offset}'\
+    '&srlimit={limit}'\
+    '&srwhat=nearmatch'  # search for a near match in the title
 
 
 # do search-request
@@ -48,12 +48,15 @@ def request(query, params):
     else:
         language = params['language'].split('_')[0]
 
-    if len(format_strings) > 1:
+    # format_string [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)]
+    if any(x[1] == 'language' for x in format_strings):
         string_args['language'] = language
 
     # write search-language back to params, required in response
     params['language'] = language
 
+    search_url = base_url + search_postfix
+
     params['url'] = search_url.format(**string_args)
 
     return params
@@ -71,6 +74,8 @@ def response(resp):
 
     # parse results
     for result in search_results['query']['search']:
+        if result.get('snippet', '').startswith('#REDIRECT'):
+            continue
         url = base_url.format(language=resp.search_params['language']) +\
             'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8'))
diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py
index bd5eb71d2..de8cd43be 100644
--- a/searx/engines/searchcode_code.py
+++ b/searx/engines/searchcode_code.py
@@ -20,7 +20,7 @@ paging = True
 
 # search-url
 url = 'https://searchcode.com/'
-search_url = url+'api/codesearch_I/?{query}&p={pageno}'
+search_url = url + 'api/codesearch_I/?{query}&p={pageno}'
 
 # special code-endings which are not recognised by the file ending
 code_endings = {'cs': 'c#',
@@ -32,7 +32,7 @@ code_endings = {'cs': 'c#',
 # do search-request
 def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}),
-                                      pageno=params['pageno']-1)
+                                      pageno=params['pageno'] - 1)
 
     # Disable SSL verification
     # error: (60) SSL certificate problem: unable to get local issuer
diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py
index 9453f31a4..f24fe6f90 100644
--- a/searx/engines/searchcode_doc.py
+++ b/searx/engines/searchcode_doc.py
@@ -19,13 +19,13 @@ paging = True
 
 # search-url
 url = 'https://searchcode.com/'
-search_url = url+'api/search_IV/?{query}&p={pageno}'
+search_url = url + 'api/search_IV/?{query}&p={pageno}'
 
 
 # do search-request
 def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}),
-                                      pageno=params['pageno']-1)
+                                      pageno=params['pageno'] - 1)
 
     # Disable SSL verification
     # error: (60) SSL certificate problem: unable to get local issuer
diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py
index 46e17fc81..ac23c1e83 100644
--- a/searx/engines/soundcloud.py
+++ b/searx/engines/soundcloud.py
@@ -10,17 +10,19 @@
 @parse       url, title, content, publishedDate, embedded
 """
 
+import re
+from StringIO import StringIO
 from json import loads
+from lxml import etree
 from urllib import urlencode, quote_plus
 from dateutil import parser
+from searx import logger
+from searx.poolrequests import get as http_get
 
 # engine dependent config
 categories = ['music']
 paging = True
 
-# api-key
-guest_client_id = 'b45b1aa10f1ac2941910a7f0d10f8e28'
-
 # search-url
 url = 'https://api.soundcloud.com/'
 search_url = url + 'search?{query}'\
@@ -35,6 +37,30 @@ embedded_url = '<iframe width="100%" height="166" ' +\
     'data-src="https://w.soundcloud.com/player/?url={uri}"></iframe>'
 
 
+def get_client_id():
+    response = http_get("https://soundcloud.com")
+    rx_namespace = {"re": "http://exslt.org/regular-expressions"}
+
+    if response.ok:
+        tree = etree.parse(StringIO(response.content), etree.HTMLParser())
+        script_tags = tree.xpath("//script[re:match(@src, '(.*app.*js)')]", namespaces=rx_namespace)
+        app_js_urls = [script_tag.get('src') for script_tag in script_tags if script_tag is not None]
+
+        # extracts valid app_js urls from soundcloud.com content
+        for app_js_url in app_js_urls:
+            # gets app_js and searches for the clientid
+            response = http_get(app_js_url)
+            if response.ok:
+                cids = re.search(r'client_id:"([^"]*)"', response.content, re.M | re.I)
+                if cids is not None and len(cids.groups()):
+                    return cids.groups()[0]
+    logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!")
+    return ""
+
+# api-key
+guest_client_id = get_client_id()
+
+
 # do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 20
diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py
index 34ecabae7..fdd3711a9 100644
--- a/searx/engines/stackoverflow.py
+++ b/searx/engines/stackoverflow.py
@@ -22,7 +22,7 @@ paging = True
 
 # search-url
 url = 'https://stackoverflow.com/'
-search_url = url+'search?{query}&page={pageno}'
+search_url = url + 'search?{query}&page={pageno}'
 
 # specific xpath variables
 results_xpath = '//div[contains(@class,"question-summary")]'
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index a91cafa00..52dd0b92f 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -90,8 +90,8 @@ def response(resp):
 
         # check if search result starts with something like: "2 Sep 2014 ... "
         if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
-            date_pos = content.find('...')+4
-            date_string = content[0:date_pos-5]
+            date_pos = content.find('...') + 4
+            date_string = content[0:date_pos - 5]
             published_date = parser.parse(date_string, dayfirst=True)
 
             # fix content string
@@ -99,8 +99,8 @@ def response(resp):
 
         # check if search result starts with something like: "5 days ago ... "
         elif re.match("^[0-9]+ days? ago \.\.\. ", content):
-            date_pos = content.find('...')+4
-            date_string = content[0:date_pos-5]
+            date_pos = content.find('...') + 4
+            date_string = content[0:date_pos - 5]
 
             # calculate datetime
             published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py
index 2d31264ca..864436a52 100644
--- a/searx/engines/swisscows.py
+++ b/searx/engines/swisscows.py
@@ -10,6 +10,7 @@
 @parse       url, title, content
 """
 
+from cgi import escape
 from json import loads
 from urllib import urlencode, unquote
 import re
@@ -77,7 +78,7 @@ def response(resp):
 
             # append result
             results.append({'url': result['SourceUrl'],
-                            'title': result['Title'],
+                            'title': escape(result['Title']),
                             'content': '',
                             'img_src': img_url,
                             'template': 'images.html'})
@@ -89,8 +90,8 @@ def response(resp):
 
             # append result
             results.append({'url': result_url,
-                            'title': result_title,
-                            'content': result_content})
+                            'title': escape(result_title),
+                            'content': escape(result_content)})
 
     # parse images
     for result in json.get('Images', []):
@@ -99,7 +100,7 @@ def response(resp):
 
         # append result
         results.append({'url': result['SourceUrl'],
-                        'title': result['Title'],
+                        'title': escape(result['Title']),
                         'content': '',
                         'img_src': img_url,
                         'template': 'images.html'})
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index fc840d47c..9f3496b72 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -295,7 +295,7 @@ def get_geolink(claims, propertyName, defaultValue=''):
     if precision < 0.0003:
         zoom = 19
     else:
-        zoom = int(15 - precision*8.8322 + precision*precision*0.625447)
+        zoom = int(15 - precision * 8.8322 + precision * precision * 0.625447)
 
     url = url_map\
         .replace('{latitude}', str(value.get('latitude', 0)))\
@@ -318,6 +318,6 @@ def get_wikilink(result, wikiid):
 
 def get_wiki_firstlanguage(result, wikipatternid):
     for k in result.get('sitelinks', {}).keys():
-        if k.endswith(wikipatternid) and len(k) == (2+len(wikipatternid)):
+        if k.endswith(wikipatternid) and len(k) == (2 + len(wikipatternid)):
             return k[0:2]
     return None
diff --git a/searx/engines/wolframalpha_api.py b/searx/engines/wolframalpha_api.py
new file mode 100644
index 000000000..4526c825f
--- /dev/null
+++ b/searx/engines/wolframalpha_api.py
@@ -0,0 +1,122 @@
+# Wolfram Alpha (Science)
+#
+# @website     https://www.wolframalpha.com
+# @provide-api yes (https://api.wolframalpha.com/v2/)
+#
+# @using-api   yes
+# @results     XML
+# @stable      yes
+# @parse       url, infobox
+
+from urllib import urlencode
+from lxml import etree
+
+# search-url
+search_url = 'https://api.wolframalpha.com/v2/query?appid={api_key}&{query}'
+site_url = 'https://www.wolframalpha.com/input/?{query}'
+api_key = ''  # defined in settings.yml
+
+# xpath variables
+failure_xpath = '/queryresult[attribute::success="false"]'
+answer_xpath = '//pod[attribute::primary="true"]/subpod/plaintext'
+input_xpath = '//pod[starts-with(attribute::id, "Input")]/subpod/plaintext'
+pods_xpath = '//pod'
+subpods_xpath = './subpod'
+pod_id_xpath = './@id'
+pod_title_xpath = './@title'
+plaintext_xpath = './plaintext'
+image_xpath = './img'
+img_src_xpath = './@src'
+img_alt_xpath = './@alt'
+
+# pods to display as image in infobox
+# this pods do return a plaintext, but they look better and are more useful as images
+image_pods = {'VisualRepresentation',
+              'Illustration'}
+
+
+# do search-request
+def request(query, params):
+    params['url'] = search_url.format(query=urlencode({'input': query}),
+                                      api_key=api_key)
+    params['headers']['Referer'] = site_url.format(query=urlencode({'i': query}))
+
+    return params
+
+
+# replace private user area characters to make text legible
+def replace_pua_chars(text):
+    pua_chars = {u'\uf522': u'\u2192',  # rigth arrow
+                 u'\uf7b1': u'\u2115',  # set of natural numbers
+                 u'\uf7b4': u'\u211a',  # set of rational numbers
+                 u'\uf7b5': u'\u211d',  # set of real numbers
+                 u'\uf7bd': u'\u2124',  # set of integer numbers
+                 u'\uf74c': 'd',        # differential
+                 u'\uf74d': u'\u212f',  # euler's number
+                 u'\uf74e': 'i',        # imaginary number
+                 u'\uf7d9': '='}        # equals sign
+
+    for k, v in pua_chars.iteritems():
+        text = text.replace(k, v)
+
+    return text
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    search_results = etree.XML(resp.content)
+
+    # return empty array if there are no results
+    if search_results.xpath(failure_xpath):
+        return []
+
+    try:
+        infobox_title = search_results.xpath(input_xpath)[0].text
+    except:
+        infobox_title = None
+
+    pods = search_results.xpath(pods_xpath)
+    result_chunks = []
+    for pod in pods:
+        pod_id = pod.xpath(pod_id_xpath)[0]
+        pod_title = pod.xpath(pod_title_xpath)[0]
+
+        subpods = pod.xpath(subpods_xpath)
+        if not subpods:
+            continue
+
+        # Appends either a text or an image, depending on which one is more suitable
+        for subpod in subpods:
+            content = subpod.xpath(plaintext_xpath)[0].text
+            image = subpod.xpath(image_xpath)
+
+            if content and pod_id not in image_pods:
+
+                # if no input pod was found, title is first plaintext pod
+                if not infobox_title:
+                    infobox_title = content
+
+                content = replace_pua_chars(content)
+                result_chunks.append({'label': pod_title, 'value': content})
+
+            elif image:
+                result_chunks.append({'label': pod_title,
+                                      'image': {'src': image[0].xpath(img_src_xpath)[0],
+                                                'alt': image[0].xpath(img_alt_xpath)[0]}})
+
+    if not result_chunks:
+        return []
+
+    # append infobox
+    results.append({'infobox': infobox_title,
+                    'attributes': result_chunks,
+                    'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer'].decode('utf8')}]})
+
+    # append link to site
+    results.append({'url': resp.request.headers['Referer'].decode('utf8'),
+                    'title': 'Wolfram|Alpha',
+                    'content': infobox_title})
+
+    return results
diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py
new file mode 100644
index 000000000..59629b833
--- /dev/null
+++ b/searx/engines/wolframalpha_noapi.py
@@ -0,0 +1,116 @@
+# Wolfram|Alpha (Science)
+#
+# @website     https://www.wolframalpha.com/
+# @provide-api yes (https://api.wolframalpha.com/v2/)
+#
+# @using-api   no
+# @results     JSON
+# @stable      no
+# @parse       url, infobox
+
+from cgi import escape
+from json import loads
+from time import time
+from urllib import urlencode
+from lxml.etree import XML
+
+from searx.poolrequests import get as http_get
+
+# search-url
+url = 'https://www.wolframalpha.com/'
+
+search_url = url + 'input/json.jsp'\
+    '?async=false'\
+    '&banners=raw'\
+    '&debuggingdata=false'\
+    '&format=image,plaintext,imagemap,minput,moutput'\
+    '&formattimeout=2'\
+    '&{query}'\
+    '&output=JSON'\
+    '&parsetimeout=2'\
+    '&proxycode={token}'\
+    '&scantimeout=0.5'\
+    '&sponsorcategories=true'\
+    '&statemethod=deploybutton'
+
+referer_url = url + 'input/?{query}'
+
+token = {'value': '',
+         'last_updated': None}
+
+# pods to display as image in infobox
+# this pods do return a plaintext, but they look better and are more useful as images
+image_pods = {'VisualRepresentation',
+              'Illustration',
+              'Symbol'}
+
+
+# seems, wolframalpha resets its token in every hour
+def obtain_token():
+    update_time = time() - (time() % 3600)
+    try:
+        token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0)
+        token['value'] = loads(token_response.text)['code']
+        token['last_updated'] = update_time
+    except:
+        pass
+    return token
+
+
+obtain_token()
+
+
+# do search-request
+def request(query, params):
+    # obtain token if last update was more than an hour
+    if time() - token['last_updated'] > 3600:
+        obtain_token()
+    params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value'])
+    params['headers']['Referer'] = referer_url.format(query=urlencode({'i': query}))
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    resp_json = loads(resp.text)
+
+    if not resp_json['queryresult']['success']:
+        return []
+
+    # TODO handle resp_json['queryresult']['assumptions']
+    result_chunks = []
+    infobox_title = None
+    for pod in resp_json['queryresult']['pods']:
+        pod_id = pod.get('id', '')
+        pod_title = pod.get('title', '')
+
+        if 'subpods' not in pod:
+            continue
+
+        if pod_id == 'Input' or not infobox_title:
+            infobox_title = pod['subpods'][0]['plaintext']
+
+        for subpod in pod['subpods']:
+            if subpod['plaintext'] != '' and pod_id not in image_pods:
+                # append unless it's not an actual answer
+                if subpod['plaintext'] != '(requires interactivity)':
+                    result_chunks.append({'label': pod_title, 'value': subpod['plaintext']})
+
+            elif 'img' in subpod:
+                result_chunks.append({'label': pod_title, 'image': subpod['img']})
+
+    if not result_chunks:
+        return []
+
+    results.append({'infobox': infobox_title,
+                    'attributes': result_chunks,
+                    'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer'].decode('utf8')}]})
+
+    results.append({'url': resp.request.headers['Referer'].decode('utf8'),
+                    'title': 'Wolfram|Alpha',
+                    'content': infobox_title})
+
+    return results
diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py
index ddb79bfea..1269a5422 100644
--- a/searx/engines/www1x.py
+++ b/searx/engines/www1x.py
@@ -22,7 +22,7 @@ paging = False
 
 # search-url
 base_url = 'https://1x.com'
-search_url = base_url+'/backend/search.php?{query}'
+search_url = base_url + '/backend/search.php?{query}'
 
 
 # do search-request
diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py
index 1a599dc0a..f51634be0 100644
--- a/searx/engines/xpath.py
+++ b/searx/engines/xpath.py
@@ -43,7 +43,7 @@ def extract_url(xpath_results, search_url):
     if url.startswith('//'):
         # add http or https to this kind of url //example.com/
         parsed_search_url = urlparse(search_url)
-        url = parsed_search_url.scheme+url
+        url = parsed_search_url.scheme + url
     elif url.startswith('/'):
         # fix relative url to the search engine
         url = urljoin(search_url, url)
@@ -69,7 +69,7 @@ def normalize_url(url):
     p = parsed_url.path
     mark = p.find('/**')
     if mark != -1:
-        return unquote(p[mark+3:]).decode('utf-8')
+        return unquote(p[mark + 3:]).decode('utf-8')
 
     return url
diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py
index edc6ad5f2..be3ec36ce 100644
--- a/searx/engines/yandex.py
+++ b/searx/engines/yandex.py
@@ -9,6 +9,7 @@
 @parse       url, title, content
 """
 
+from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.search import logger
@@ -38,7 +39,7 @@ content_xpath = './/div[@class="serp-item__text"]//text()'
 def request(query, params):
     lang = params['language'].split('_')[0]
     host = base_url.format(tld=language_map.get(lang) or default_tld)
-    params['url'] = host + search_url.format(page=params['pageno']-1,
+    params['url'] = host + search_url.format(page=params['pageno'] - 1,
                                              query=urlencode({'text': query}))
 
     return params
@@ -51,8 +52,8 @@ def response(resp):
     for result in dom.xpath(results_xpath):
         try:
             res = {'url': result.xpath(url_xpath)[0],
-                   'title': ''.join(result.xpath(title_xpath)),
-                   'content': ''.join(result.xpath(content_xpath))}
+                   'title': escape(''.join(result.xpath(title_xpath))),
+                   'content': escape(''.join(result.xpath(content_xpath)))}
         except:
             logger.exception('yandex parse crash')
             continue