From b51ba32f619e6b7a927444475b0ee986d4d13a60 Mon Sep 17 00:00:00 2001 From: a01200356 Date: Tue, 29 Dec 2015 20:59:51 -0600 Subject: Wolfram Alpha (no API needed now) --- searx/engines/wolframalpha_noapi.py | 66 +++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 searx/engines/wolframalpha_noapi.py (limited to 'searx/engines/wolframalpha_noapi.py') diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py new file mode 100644 index 000000000..1ce2aa1ff --- /dev/null +++ b/searx/engines/wolframalpha_noapi.py @@ -0,0 +1,66 @@ +# WolframAlpha (Maths) +# +# @website http://www.wolframalpha.com/ +# +# @using-api no +# @results HTML, JS +# @stable no +# @parse answer + +import re +import json +from urllib import urlencode +from lxml import html +from searx.engines.xpath import extract_text + +# search-url +url = 'http://www.wolframalpha.com/' +search_url = url+'input/?{query}' + + +# do search-request +def request(query, params): + params['url'] = search_url.format(query=urlencode({'i': query})) + + return params + + +# tries to find answer under the pattern given +def extract_answer(script_list, pattern): + answer = None + + # get line that matches the pattern + for script in script_list: + try: + line = re.search(pattern, script.text_content()).group(1) + except AttributeError: + continue + + # extract answer from json + answer = line[line.find('{') : line.rfind('}')+1] + answer = json.loads(answer.encode('unicode-escape')) + answer = answer['stringified'].decode('unicode-escape') + + return answer + + +# get response from search-request +def response(resp): + + dom = html.fromstring(resp.text) + + # the answer is inside a js script + scripts = dom.xpath('//script') + + results = [] + + # answer can be located in different 'pods', although by default it should be in pod_0200 + answer = extract_answer(scripts, 'pod_0200\.push(.*)\n') + if not answer: + answer = extract_answer(scripts, 'pod_0100\.push(.*)\n') + if answer: + results.append({'answer': answer}) + else: + results.append({'answer': answer}) + + return results -- cgit v1.2.3 From d827fc49a11b6f84bba3d006b54a70a6a05757fd Mon Sep 17 00:00:00 2001 From: a01200356 Date: Tue, 29 Dec 2015 21:11:49 -0600 Subject: Remove unnecessary code in wolframalpha_noapi engine The answer is scraped from a js function, so parsing the html tree doesn't achieve anything here. --- searx/engines/wolframalpha_noapi.py | 49 ++++++++++++++----------------------- 1 file changed, 18 insertions(+), 31 deletions(-) (limited to 'searx/engines/wolframalpha_noapi.py') diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index 1ce2aa1ff..29600ca1f 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -10,8 +10,6 @@ import re import json from urllib import urlencode -from lxml import html -from searx.engines.xpath import extract_text # search-url url = 'http://www.wolframalpha.com/' @@ -25,42 +23,31 @@ def request(query, params): return params -# tries to find answer under the pattern given -def extract_answer(script_list, pattern): - answer = None +# get response from search-request +def response(resp): + results = [] + + # the answer is inside a js function + # answer can be located in different 'pods', although by default it should be in pod_0200 + possible_locations = ['pod_0200\.push(.*)\n', + 'pod_0100\.push(.*)\n'] # get line that matches the pattern - for script in script_list: + for pattern in possible_locations: try: - line = re.search(pattern, script.text_content()).group(1) + line = re.search(pattern, resp.text).group(1) + break except AttributeError: continue - # extract answer from json - answer = line[line.find('{') : line.rfind('}')+1] - answer = json.loads(answer.encode('unicode-escape')) - answer = answer['stringified'].decode('unicode-escape') - - return answer - + if not line: + return results -# get response from search-request -def response(resp): - - dom = html.fromstring(resp.text) - - # the answer is inside a js script - scripts = dom.xpath('//script') + # extract answer from json + answer = line[line.find('{') : line.rfind('}')+1] + answer = json.loads(answer.encode('unicode-escape')) + answer = answer['stringified'].decode('unicode-escape') - results = [] - - # answer can be located in different 'pods', although by default it should be in pod_0200 - answer = extract_answer(scripts, 'pod_0200\.push(.*)\n') - if not answer: - answer = extract_answer(scripts, 'pod_0100\.push(.*)\n') - if answer: - results.append({'answer': answer}) - else: - results.append({'answer': answer}) + results.append({'answer': answer}) return results -- cgit v1.2.3 From 5ed8f4da80ecd119173d7db871256be8484a9ecb Mon Sep 17 00:00:00 2001 From: a01200356 Date: Tue, 29 Dec 2015 21:37:48 -0600 Subject: Make wolframalpha_noapi.py flake8 compliant --- searx/engines/wolframalpha_noapi.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'searx/engines/wolframalpha_noapi.py') diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index 29600ca1f..23e912a1e 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -3,7 +3,7 @@ # @website http://www.wolframalpha.com/ # # @using-api no -# @results HTML, JS +# @results HTML # @stable no # @parse answer @@ -26,7 +26,7 @@ def request(query, params): # get response from search-request def response(resp): results = [] - + # the answer is inside a js function # answer can be located in different 'pods', although by default it should be in pod_0200 possible_locations = ['pod_0200\.push(.*)\n', @@ -44,10 +44,10 @@ def response(resp): return results # extract answer from json - answer = line[line.find('{') : line.rfind('}')+1] + answer = line[line.find('{'):line.rfind('}')+1] answer = json.loads(answer.encode('unicode-escape')) answer = answer['stringified'].decode('unicode-escape') results.append({'answer': answer}) - + return results -- cgit v1.2.3 From be54e5269a982e272e2fe8a5064ed898373c9063 Mon Sep 17 00:00:00 2001 From: a01200356 Date: Wed, 30 Dec 2015 00:53:15 -0600 Subject: Add tests for the Wolfram Alpha engines (both API and NO API versions) --- searx/engines/wolframalpha_noapi.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'searx/engines/wolframalpha_noapi.py') diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index 23e912a1e..9d3afe658 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -7,8 +7,8 @@ # @stable no # @parse answer -import re -import json +from re import search +from json import loads from urllib import urlencode # search-url @@ -26,6 +26,8 @@ def request(query, params): # get response from search-request def response(resp): results = [] + webpage = resp.text + line = None # the answer is inside a js function # answer can be located in different 'pods', although by default it should be in pod_0200 @@ -35,7 +37,7 @@ def response(resp): # get line that matches the pattern for pattern in possible_locations: try: - line = re.search(pattern, resp.text).group(1) + line = search(pattern, webpage).group(1) break except AttributeError: continue @@ -45,7 +47,7 @@ def response(resp): # extract answer from json answer = line[line.find('{'):line.rfind('}')+1] - answer = json.loads(answer.encode('unicode-escape')) + answer = loads(answer.encode('unicode-escape')) answer = answer['stringified'].decode('unicode-escape') results.append({'answer': answer}) -- cgit v1.2.3 From 0871c7ca85cd19a2fa0971c7db28516a74255d5d Mon Sep 17 00:00:00 2001 From: a01200356 Date: Fri, 1 Jan 2016 22:02:10 -0600 Subject: [enh] wolframalpha appends result --- searx/engines/wolframalpha_noapi.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'searx/engines/wolframalpha_noapi.py') diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index 9d3afe658..89a3c45b5 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -1,6 +1,7 @@ # WolframAlpha (Maths) # # @website http://www.wolframalpha.com/ +# @provide-api yes (http://api.wolframalpha.com/v2/) # # @using-api no # @results HTML @@ -14,12 +15,17 @@ from urllib import urlencode # search-url url = 'http://www.wolframalpha.com/' search_url = url+'input/?{query}' +search_query = '' # do search-request def request(query, params): params['url'] = search_url.format(query=urlencode({'i': query})) + # used in response + global search_query + search_query = query + return params @@ -42,14 +48,20 @@ def response(resp): except AttributeError: continue - if not line: - return results + if line: + # extract answer from json + answer = line[line.find('{'):line.rfind('}')+1] + answer = loads(answer.encode('unicode-escape')) + answer = answer['stringified'].decode('unicode-escape') + + results.append({'answer': answer}) - # extract answer from json - answer = line[line.find('{'):line.rfind('}')+1] - answer = loads(answer.encode('unicode-escape')) - answer = answer['stringified'].decode('unicode-escape') + # failed result + elif search('pfail', webpage): + return results - results.append({'answer': answer}) + # append result + results.append({'url': request(search_query, {})['url'], + 'title': search_query + ' - Wolfram|Alpha'}) return results -- cgit v1.2.3 From e9d35c1309f05a0b214fb323049909ee7ec62ab8 Mon Sep 17 00:00:00 2001 From: a01200356 Date: Sat, 2 Jan 2016 00:41:14 -0600 Subject: update tests for wolframalpha --- searx/engines/wolframalpha_noapi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/engines/wolframalpha_noapi.py') diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index 89a3c45b5..d7442db5d 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -53,7 +53,7 @@ def response(resp): answer = line[line.find('{'):line.rfind('}')+1] answer = loads(answer.encode('unicode-escape')) answer = answer['stringified'].decode('unicode-escape') - + results.append({'answer': answer}) # failed result -- cgit v1.2.3 From 19d025f0e7ef9a5f41b81fc6c1a9a7114bdae78c Mon Sep 17 00:00:00 2001 From: a01200356 Date: Sat, 2 Jan 2016 01:49:32 -0600 Subject: [fix] pass wolframalpha_noapi tests --- searx/engines/wolframalpha_noapi.py | 43 +++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 16 deletions(-) (limited to 'searx/engines/wolframalpha_noapi.py') diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index d7442db5d..a730ed60b 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -8,60 +8,71 @@ # @stable no # @parse answer -from re import search +from re import search, sub from json import loads from urllib import urlencode +from lxml import html # search-url url = 'http://www.wolframalpha.com/' search_url = url+'input/?{query}' -search_query = '' + +# xpath variables +scripts_xpath = '//script' +title_xpath = '//title' +failure_xpath = '//p[attribute::class="pfail"]' # do search-request def request(query, params): params['url'] = search_url.format(query=urlencode({'i': query})) - # used in response - global search_query - search_query = query - return params # get response from search-request def response(resp): results = [] - webpage = resp.text line = None + dom = html.fromstring(resp.text) + scripts = dom.xpath(scripts_xpath) + # the answer is inside a js function # answer can be located in different 'pods', although by default it should be in pod_0200 possible_locations = ['pod_0200\.push(.*)\n', 'pod_0100\.push(.*)\n'] + # failed result + if dom.xpath(failure_xpath): + return results + # get line that matches the pattern for pattern in possible_locations: - try: - line = search(pattern, webpage).group(1) + for script in scripts: + try: + line = search(pattern, script.text_content()).group(1) + break + except AttributeError: + continue + if line: break - except AttributeError: - continue if line: # extract answer from json answer = line[line.find('{'):line.rfind('}')+1] answer = loads(answer.encode('unicode-escape')) answer = answer['stringified'].decode('unicode-escape') + answer = sub(r'\\', '', answer) results.append({'answer': answer}) - # failed result - elif search('pfail', webpage): - return results + # user input is in first part of title + title = dom.xpath(title_xpath)[0].text + result_url = request(title[:-16], {})['url'] # append result - results.append({'url': request(search_query, {})['url'], - 'title': search_query + ' - Wolfram|Alpha'}) + results.append({'url': result_url, + 'title': title}) return results -- cgit v1.2.3 From 576d37f256649b570a9c8591a795acd85ac499bc Mon Sep 17 00:00:00 2001 From: a01200356 Date: Sun, 3 Jan 2016 15:58:01 -0600 Subject: [fix] unescape htmlentities in wolframalpha_noapi's answer --- searx/engines/wolframalpha_noapi.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'searx/engines/wolframalpha_noapi.py') diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index a730ed60b..0f0315630 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -12,6 +12,7 @@ from re import search, sub from json import loads from urllib import urlencode from lxml import html +import HTMLParser # search-url url = 'http://www.wolframalpha.com/' @@ -62,7 +63,11 @@ def response(resp): # extract answer from json answer = line[line.find('{'):line.rfind('}')+1] answer = loads(answer.encode('unicode-escape')) - answer = answer['stringified'].decode('unicode-escape') + answer = answer['stringified'] + + # clean plaintext answer + h = HTMLParser.HTMLParser() + answer = h.unescape(answer.decode('unicode-escape')) answer = sub(r'\\', '', answer) results.append({'answer': answer}) -- cgit v1.2.3 From d997265e5599333b4316561ca18a8f4131e3e2d9 Mon Sep 17 00:00:00 2001 From: a01200356 Date: Sun, 3 Jan 2016 19:57:37 -0600 Subject: add tests for unicode strings in wolframalpha --- searx/engines/wolframalpha_noapi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'searx/engines/wolframalpha_noapi.py') diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index 0f0315630..71ad3b281 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -73,11 +73,11 @@ def response(resp): results.append({'answer': answer}) # user input is in first part of title - title = dom.xpath(title_xpath)[0].text + title = dom.xpath(title_xpath)[0].text.encode('utf-8') result_url = request(title[:-16], {})['url'] # append result results.append({'url': result_url, - 'title': title}) + 'title': title.decode('utf-8')}) return results -- cgit v1.2.3 From 2a15944b58089d84a930f36b42c6ef60d4e629b3 Mon Sep 17 00:00:00 2001 From: a01200356 Date: Sun, 3 Jan 2016 22:03:33 -0600 Subject: [fix] test in wolframalpha_noapi --- searx/engines/wolframalpha_noapi.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'searx/engines/wolframalpha_noapi.py') diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index 71ad3b281..442e894b5 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -41,8 +41,8 @@ def response(resp): # the answer is inside a js function # answer can be located in different 'pods', although by default it should be in pod_0200 - possible_locations = ['pod_0200\.push(.*)\n', - 'pod_0100\.push(.*)\n'] + possible_locations = ['pod_0200\.push\((.*)', + 'pod_0100\.push\((.*)'] # failed result if dom.xpath(failure_xpath): @@ -62,7 +62,10 @@ def response(resp): if line: # extract answer from json answer = line[line.find('{'):line.rfind('}')+1] - answer = loads(answer.encode('unicode-escape')) + try: + answer = loads(answer) + except Exception: + answer = loads(answer.encode('unicode-escape')) answer = answer['stringified'] # clean plaintext answer -- cgit v1.2.3 From 30bfbf2e07def8911d0b293e8032699812f43599 Mon Sep 17 00:00:00 2001 From: a01200356 Date: Mon, 18 Jan 2016 11:34:38 -0600 Subject: [fix] pep8 --- searx/engines/wolframalpha_noapi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'searx/engines/wolframalpha_noapi.py') diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index 442e894b5..291fee04d 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -16,7 +16,7 @@ import HTMLParser # search-url url = 'http://www.wolframalpha.com/' -search_url = url+'input/?{query}' +search_url = url + 'input/?{query}' # xpath variables scripts_xpath = '//script' @@ -61,7 +61,7 @@ def response(resp): if line: # extract answer from json - answer = line[line.find('{'):line.rfind('}')+1] + answer = line[line.find('{'):line.rfind('}') + 1] try: answer = loads(answer) except Exception: -- cgit v1.2.3