diff options
| author | Adam Tauber <asciimoo@gmail.com> | 2016-01-19 17:02:14 +0100 |
|---|---|---|
| committer | Adam Tauber <asciimoo@gmail.com> | 2016-01-19 17:02:14 +0100 |
| commit | b5a3dfca60f23bac10ade068c40729f030bbad63 (patch) | |
| tree | da3142cd5f1c1d05b922be2d1e6aaaf029fa8d8d /searx | |
| parent | 09b7673fbd271349b6878959bd2e1ae846981e13 (diff) | |
| parent | 30bfbf2e07def8911d0b293e8032699812f43599 (diff) | |
Merge pull request #486 from a01200356/master
[enh] WolframAlpha no API engine (and tests for both)
Diffstat (limited to 'searx')
| -rw-r--r-- | searx/engines/wolframalpha_api.py | 35 |
| -rw-r--r-- | searx/engines/wolframalpha_noapi.py | 86 |
| -rw-r--r-- | searx/settings.yml | 16 |
3 files changed, 121 insertions, 16 deletions
```diff
diff --git a/searx/engines/wolframalpha_api.py b/searx/engines/wolframalpha_api.py
index d61d25747..303c6c165 100644
--- a/searx/engines/wolframalpha_api.py
+++ b/searx/engines/wolframalpha_api.py
@@ -10,11 +10,18 @@

 from urllib import urlencode
 from lxml import etree
+from re import search

 # search-url
 base_url = 'http://api.wolframalpha.com/v2/query'
 search_url = base_url + '?appid={api_key}&{query}&format=plaintext'
-api_key = ''
+site_url = 'http://www.wolframalpha.com/input/?{query}'
+api_key = '' # defined in settings.yml
+
+# xpath variables
+failure_xpath = '/queryresult[attribute::success="false"]'
+answer_xpath = '//pod[attribute::primary="true"]/subpod/plaintext'
+input_xpath = '//pod[starts-with(attribute::title, "Input")]/subpod/plaintext'


 # do search-request
@@ -45,16 +52,26 @@ def response(resp):
     search_results = etree.XML(resp.content)

     # return empty array if there are no results
-    if search_results.xpath('/queryresult[attribute::success="false"]'):
+    if search_results.xpath(failure_xpath):
         return []

-    # parse result
-    result = search_results.xpath('//pod[attribute::primary="true"]/subpod/plaintext')[0].text
-    result = replace_pua_chars(result)
+    # parse answers
+    answers = search_results.xpath(answer_xpath)
+    if answers:
+        for answer in answers:
+            answer = replace_pua_chars(answer.text)
+
+            results.append({'answer': answer})
+
+    # if there's no input section in search_results, check if answer has the input embedded (before their "=" sign)
+    try:
+        query_input = search_results.xpath(input_xpath)[0].text
+    except IndexError:
+        query_input = search(u'([^\uf7d9]+)', answers[0].text).group(1)

-    # append result
-    # TODO: shouldn't it bind the source too?
-    results.append({'answer': result})
+    # append link to site
+    result_url = site_url.format(query=urlencode({'i': query_input.encode('utf-8')}))
+    results.append({'url': result_url,
+                    'title': query_input + " - Wolfram|Alpha"})

-    # return results
     return results
diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py
new file mode 100644
index 000000000..291fee04d
--- /dev/null
+++ b/searx/engines/wolframalpha_noapi.py
@@ -0,0 +1,86 @@
+# WolframAlpha (Maths)
+#
+# @website http://www.wolframalpha.com/
+# @provide-api yes (http://api.wolframalpha.com/v2/)
+#
+# @using-api no
+# @results HTML
+# @stable no
+# @parse answer
+
+from re import search, sub
+from json import loads
+from urllib import urlencode
+from lxml import html
+import HTMLParser
+
+# search-url
+url = 'http://www.wolframalpha.com/'
+search_url = url + 'input/?{query}'
+
+# xpath variables
+scripts_xpath = '//script'
+title_xpath = '//title'
+failure_xpath = '//p[attribute::class="pfail"]'
+
+
+# do search-request
+def request(query, params):
+    params['url'] = search_url.format(query=urlencode({'i': query}))
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+    line = None
+
+    dom = html.fromstring(resp.text)
+    scripts = dom.xpath(scripts_xpath)
+
+    # the answer is inside a js function
+    # answer can be located in different 'pods', although by default it should be in pod_0200
+    possible_locations = ['pod_0200\.push\((.*)',
+                          'pod_0100\.push\((.*)']
+
+    # failed result
+    if dom.xpath(failure_xpath):
+        return results
+
+    # get line that matches the pattern
+    for pattern in possible_locations:
+        for script in scripts:
+            try:
+                line = search(pattern, script.text_content()).group(1)
+                break
+            except AttributeError:
+                continue
+        if line:
+            break
+
+    if line:
+        # extract answer from json
+        answer = line[line.find('{'):line.rfind('}') + 1]
+        try:
+            answer = loads(answer)
+        except Exception:
+            answer = loads(answer.encode('unicode-escape'))
+        answer = answer['stringified']
+
+        # clean plaintext answer
+        h = HTMLParser.HTMLParser()
+        answer = h.unescape(answer.decode('unicode-escape'))
+        answer = sub(r'\\', '', answer)
+
+        results.append({'answer': answer})
+
+    # user input is in first part of title
+    title = dom.xpath(title_xpath)[0].text.encode('utf-8')
+    result_url = request(title[:-16], {})['url']
+
+    # append result
+    results.append({'url': result_url,
+                    'title': title.decode('utf-8')})
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index e23e4c390..7a6fc6d8a 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -300,13 +300,15 @@ engines:
     engine : vimeo
     shortcut : vm

-# You can use the engine using the official stable API, but you need an API key
-# See : http://products.wolframalpha.com/api/
-# - name : wolframalpha
-# shortcut : wa
-# engine : wolframalpha_api
-# api_key: 'apikey' # required!
-# timeout: 6.0
+  - name : wolframalpha
+    shortcut : wa
+    # You can use the engine using the official stable API, but you need an API key
+    # See : http://products.wolframalpha.com/api/
+    # engine : wolframalpha_api
+    # api_key: 'apikey' # required!
+    engine : wolframalpha_noapi
+    timeout: 6.0
+    disabled : True

 #The blekko technology and team have joined IBM Watson! -> https://blekko.com/
 # - name : blekko images
```