From f4525880ed5f965ac4c241638933842a27a2acf7 Mon Sep 17 00:00:00 2001 From: potato Date: Tue, 6 Sep 2016 11:47:27 +0200 Subject: [enh] dictionary engine added --- searx/engines/dictionary.py | 70 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 searx/engines/dictionary.py (limited to 'searx/engines') diff --git a/searx/engines/dictionary.py b/searx/engines/dictionary.py new file mode 100644 index 000000000..1849322f5 --- /dev/null +++ b/searx/engines/dictionary.py @@ -0,0 +1,70 @@ +import re +from lxml import html +from searx.engines.xpath import extract_text +from searx.languages import language_codes + +categories = [] +url = 'http://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}' +weight = 100 + +parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) (.+)', re.I) +results_xpath = './/table[@id="r"]/tr' + + +def request(query, params): + m = parser_re.match(unicode(query, 'utf8')) + if not m: + return params + + from_lang, to_lang, query = m.groups() + + if len(from_lang) == 2: + lan = filter(lambda x: x[0][:2] == from_lang, language_codes) + if lan: + from_lang = lan[0][1].lower() + else: + return params + + if len(to_lang) == 2: + lan = filter(lambda x: x[0][:2] == to_lang, language_codes) + if lan: + to_lang = lan[0][1].lower() + else: + return params + + params['url'] = url.format(from_lang=from_lang, to_lang=to_lang,query=query) + params['from_lang'] = from_lang + params['to_lang'] = to_lang + params['query'] = query + + return params + +def response(resp): + results = [] + answers = [] + + dom = html.fromstring(resp.text) + + for result in dom.xpath(results_xpath)[1:]: + try: + from_result, to_results_raw = result.xpath('./td') + except: + continue + + to_results = [] + for to_result in to_results_raw.xpath('./p/a'): + t = to_result.text_content() + if t.strip(): + to_results.append(to_result.text_content()) + + results.append({ + 'answer': u'{0} - {1}'.format( + from_result.text_content(), + '; '.join(to_results) + ), + 'url': url + }) + + return results + + -- cgit v1.2.3 From 3f4cc2146c81e12a890b8ea4c4ac5ad600f34618 Mon Sep 17 00:00:00 2001 From: potato Date: Tue, 6 Sep 2016 12:34:20 +0200 Subject: [enh] return results instead of answers --- searx/engines/dictionary.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/dictionary.py b/searx/engines/dictionary.py index 1849322f5..e3abaa1a4 100644 --- a/searx/engines/dictionary.py +++ b/searx/engines/dictionary.py @@ -3,7 +3,7 @@ from lxml import html from searx.engines.xpath import extract_text from searx.languages import language_codes -categories = [] +categories = ['general'] url = 'http://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}' weight = 100 @@ -41,7 +41,6 @@ def request(query, params): def response(resp): results = [] - answers = [] dom = html.fromstring(resp.text) @@ -58,11 +57,9 @@ def response(resp): to_results.append(to_result.text_content()) results.append({ - 'answer': u'{0} - {1}'.format( - from_result.text_content(), - '; '.join(to_results) - ), - 'url': url + 'url': resp.url, + 'title': from_result.text_content(), + 'content': '; '.join(to_results) }) return results -- cgit v1.2.3 From b808a2e26670e06d6f912f7d169a9c59ee7ac8ee Mon Sep 17 00:00:00 2001 From: potato Date: Tue, 6 Sep 2016 12:37:26 +0200 Subject: [fix] don't merge with suggestions --- searx/engines/dictionary.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/dictionary.py b/searx/engines/dictionary.py index e3abaa1a4..080f7b4a4 100644 --- a/searx/engines/dictionary.py +++ b/searx/engines/dictionary.py @@ -1,4 +1,5 @@ import re +from urlparse import urljoin from lxml import html from searx.engines.xpath import extract_text from searx.languages import language_codes @@ -44,7 +45,7 @@ def response(resp): dom = html.fromstring(resp.text) - for result in dom.xpath(results_xpath)[1:]: + for k, result in enumerate(dom.xpath(results_xpath)[1:]): try: from_result, to_results_raw = result.xpath('./td') except: @@ -57,7 +58,7 @@ def response(resp): to_results.append(to_result.text_content()) results.append({ - 'url': resp.url, + 'url': urljoin(resp.url, '?%d' % k), 'title': from_result.text_content(), 'content': '; '.join(to_results) }) -- cgit v1.2.3 From 84ff6e289ea608207755b01bc648575a87ea55ba Mon Sep 17 00:00:00 2001 From: potato Date: Tue, 6 Sep 2016 12:46:18 +0200 Subject: [enh] filter non-existing language code/name containing requests --- searx/engines/dictionary.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'searx/engines') diff --git a/searx/engines/dictionary.py b/searx/engines/dictionary.py index 080f7b4a4..b255463aa 100644 --- a/searx/engines/dictionary.py +++ b/searx/engines/dictionary.py @@ -25,6 +25,9 @@ def request(query, params): from_lang = lan[0][1].lower() else: return params + elif from_lang.lower() not in [x[1].lower() for x in language_codes]: + return params + if len(to_lang) == 2: lan = filter(lambda x: x[0][:2] == to_lang, language_codes) @@ -32,6 +35,8 @@ def request(query, params): to_lang = lan[0][1].lower() else: return params + elif to_lang.lower() not in [x[1].lower() for x in language_codes]: + return params params['url'] = url.format(from_lang=from_lang, to_lang=to_lang,query=query) params['from_lang'] = from_lang -- cgit v1.2.3 From 5416f0f248e1c8072c69b4a272af07bd4c0d8e5e Mon Sep 17 00:00:00 2001 From: potato Date: Tue, 6 Sep 2016 12:50:56 +0200 Subject: [enh] dictionary engine renamed to dictzone --- searx/engines/dictionary.py | 73 --------------------------------------------- searx/engines/dictzone.py | 73 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 73 deletions(-) delete mode 100644 searx/engines/dictionary.py create mode 100644 searx/engines/dictzone.py (limited to 'searx/engines') diff --git a/searx/engines/dictionary.py b/searx/engines/dictionary.py deleted file mode 100644 index b255463aa..000000000 --- a/searx/engines/dictionary.py +++ /dev/null @@ -1,73 +0,0 @@ -import re -from urlparse import urljoin -from lxml import html -from searx.engines.xpath import extract_text -from searx.languages import language_codes - -categories = ['general'] -url = 'http://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}' -weight = 100 - -parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) (.+)', re.I) -results_xpath = './/table[@id="r"]/tr' - - -def request(query, params): - m = parser_re.match(unicode(query, 'utf8')) - if not m: - return params - - from_lang, to_lang, query = m.groups() - - if len(from_lang) == 2: - lan = filter(lambda x: x[0][:2] == from_lang, language_codes) - if lan: - from_lang = lan[0][1].lower() - else: - return params - elif from_lang.lower() not in [x[1].lower() for x in language_codes]: - return params - - - if len(to_lang) == 2: - lan = filter(lambda x: x[0][:2] == to_lang, language_codes) - if lan: - to_lang = lan[0][1].lower() - else: - return params - elif to_lang.lower() not in [x[1].lower() for x in language_codes]: - return params - - params['url'] = url.format(from_lang=from_lang, to_lang=to_lang,query=query) - params['from_lang'] = from_lang - params['to_lang'] = to_lang - params['query'] = query - - return params - -def response(resp): - results = [] - - dom = html.fromstring(resp.text) - - for k, result in enumerate(dom.xpath(results_xpath)[1:]): - try: - from_result, to_results_raw = result.xpath('./td') - except: - continue - - to_results = [] - for to_result in to_results_raw.xpath('./p/a'): - t = to_result.text_content() - if t.strip(): - to_results.append(to_result.text_content()) - - results.append({ - 'url': urljoin(resp.url, '?%d' % k), - 'title': from_result.text_content(), - 'content': '; '.join(to_results) - }) - - return results - - diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py new file mode 100644 index 000000000..b255463aa --- /dev/null +++ b/searx/engines/dictzone.py @@ -0,0 +1,73 @@ +import re +from urlparse import urljoin +from lxml import html +from searx.engines.xpath import extract_text +from searx.languages import language_codes + +categories = ['general'] +url = 'http://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}' +weight = 100 + +parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) (.+)', re.I) +results_xpath = './/table[@id="r"]/tr' + + +def request(query, params): + m = parser_re.match(unicode(query, 'utf8')) + if not m: + return params + + from_lang, to_lang, query = m.groups() + + if len(from_lang) == 2: + lan = filter(lambda x: x[0][:2] == from_lang, language_codes) + if lan: + from_lang = lan[0][1].lower() + else: + return params + elif from_lang.lower() not in [x[1].lower() for x in language_codes]: + return params + + + if len(to_lang) == 2: + lan = filter(lambda x: x[0][:2] == to_lang, language_codes) + if lan: + to_lang = lan[0][1].lower() + else: + return params + elif to_lang.lower() not in [x[1].lower() for x in language_codes]: + return params + + params['url'] = url.format(from_lang=from_lang, to_lang=to_lang,query=query) + params['from_lang'] = from_lang + params['to_lang'] = to_lang + params['query'] = query + + return params + +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + for k, result in enumerate(dom.xpath(results_xpath)[1:]): + try: + from_result, to_results_raw = result.xpath('./td') + except: + continue + + to_results = [] + for to_result in to_results_raw.xpath('./p/a'): + t = to_result.text_content() + if t.strip(): + to_results.append(to_result.text_content()) + + results.append({ + 'url': urljoin(resp.url, '?%d' % k), + 'title': from_result.text_content(), + 'content': '; '.join(to_results) + }) + + return results + + -- cgit v1.2.3 From bc806bfab1cc75279dc912bf443dc39178a872dd Mon Sep 17 00:00:00 2001 From: potato Date: Tue, 6 Sep 2016 14:12:46 +0200 Subject: [fix] no lambda anymore, cgi.escape --- searx/engines/dictzone.py | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py index b255463aa..f68f44887 100644 --- a/searx/engines/dictzone.py +++ b/searx/engines/dictzone.py @@ -1,6 +1,7 @@ import re from urlparse import urljoin from lxml import html +from cgi import escape from searx.engines.xpath import extract_text from searx.languages import language_codes @@ -12,6 +13,19 @@ parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) (.+)', re.I) results_xpath = './/table[@id="r"]/tr' +def is_valid_lang(lang): + is_abbr = (len(lang) == 2) + if is_abbr: + for l in language_codes: + if l[0][:2] == lang.lower(): + return (True, l[1].lower()) + return False + else: + for l in language_codes: + if l[1].lower() == lang.lower(): + return (True, l[1].lower()) + return False + def request(query, params): m = parser_re.match(unicode(query, 'utf8')) if not m: @@ -19,28 +33,15 @@ def request(query, params): from_lang, to_lang, query = m.groups() - if len(from_lang) == 2: - lan = filter(lambda x: x[0][:2] == from_lang, language_codes) - if lan: - from_lang = lan[0][1].lower() - else: - return params - elif from_lang.lower() not in [x[1].lower() for x in language_codes]: - return params - + from_lang = is_valid_lang(from_lang) + to_lang = is_valid_lang(to_lang) - if len(to_lang) == 2: - lan = filter(lambda x: x[0][:2] == to_lang, language_codes) - if lan: - to_lang = lan[0][1].lower() - else: - return params - elif to_lang.lower() not in [x[1].lower() for x in language_codes]: + if not from_lang or not to_lang: return params - params['url'] = url.format(from_lang=from_lang, to_lang=to_lang,query=query) - params['from_lang'] = from_lang - params['to_lang'] = to_lang + params['url'] = url.format(from_lang=from_lang[1], to_lang=to_lang[1],query=query) + params['from_lang'] = from_lang[1] + params['to_lang'] = to_lang[1] params['query'] = query return params @@ -64,8 +65,8 @@ def response(resp): results.append({ 'url': urljoin(resp.url, '?%d' % k), - 'title': from_result.text_content(), - 'content': '; '.join(to_results) + 'title': escape(from_result.text_content()), + 'content': escape('; '.join(to_results)) }) return results -- cgit v1.2.3 From 7bf1013c1591c1af177063477fb4ac9ed178ff2a Mon Sep 17 00:00:00 2001 From: potato Date: Tue, 6 Sep 2016 14:24:08 +0200 Subject: [enh] removed missing params; [fix] pep8 --- searx/engines/dictzone.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py index f68f44887..212218343 100644 --- a/searx/engines/dictzone.py +++ b/searx/engines/dictzone.py @@ -26,6 +26,7 @@ def is_valid_lang(lang): return (True, l[1].lower()) return False + def request(query, params): m = parser_re.match(unicode(query, 'utf8')) if not m: @@ -39,13 +40,13 @@ def request(query, params): if not from_lang or not to_lang: return params - params['url'] = url.format(from_lang=from_lang[1], to_lang=to_lang[1],query=query) - params['from_lang'] = from_lang[1] - params['to_lang'] = to_lang[1] - params['query'] = query + params['url'] = url.format(from_lang=from_lang[1], + to_lang=to_lang[1], + query=query) return params + def response(resp): results = [] @@ -70,5 +71,3 @@ def response(resp): }) return results - - -- cgit v1.2.3 From 22bd39fd42e469339ff1ccac9f8c16cc00f52211 Mon Sep 17 00:00:00 2001 From: potato Date: Tue, 6 Sep 2016 15:07:47 +0200 Subject: [fix] only 1-word search triggers the engine --- searx/engines/dictzone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py index 212218343..b58d7ec62 100644 --- a/searx/engines/dictzone.py +++ b/searx/engines/dictzone.py @@ -9,7 +9,7 @@ categories = ['general'] url = 'http://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}' weight = 100 -parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) (.+)', re.I) +parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I) results_xpath = './/table[@id="r"]/tr' -- cgit v1.2.3 From ab471fd13b3891a5a924e8c2cd18a1079e7ac8e0 Mon Sep 17 00:00:00 2001 From: potato Date: Tue, 6 Sep 2016 15:40:07 +0200 Subject: [enh] mymemory translated engine added for multi-word translations --- searx/engines/translated.py | 63 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 searx/engines/translated.py (limited to 'searx/engines') diff --git a/searx/engines/translated.py b/searx/engines/translated.py new file mode 100644 index 000000000..9f194b76b --- /dev/null +++ b/searx/engines/translated.py @@ -0,0 +1,63 @@ +import re +from urlparse import urljoin +from lxml import html +from cgi import escape +from searx.engines.xpath import extract_text +from searx.languages import language_codes + +categories = ['general'] +url = 'http://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}' +web_url = 'http://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}' +weight = 100 + +parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) (.{2,})$', re.I) + +def is_valid_lang(lang): + is_abbr = (len(lang) == 2) + if is_abbr: + for l in language_codes: + if l[0][:2] == lang.lower(): + return (True, l[0][:2], l[1].lower()) + return False + else: + for l in language_codes: + if l[1].lower() == lang.lower(): + return (True, l[0][:2], l[1].lower()) + return False + + +def request(query, params): + m = parser_re.match(unicode(query, 'utf8')) + if not m: + return params + + from_lang, to_lang, query = m.groups() + + from_lang = is_valid_lang(from_lang) + to_lang = is_valid_lang(to_lang) + + if not from_lang or not to_lang: + return params + + params['url'] = url.format(from_lang=from_lang[1], + to_lang=to_lang[1], + query=query) + params['query'] = query + params['from_lang'] = from_lang + params['to_lang'] = to_lang + + return params + + +def response(resp): + results = [] + results.append({ + 'url': escape(web_url.format(from_lang=resp.search_params['from_lang'][2], + to_lang=resp.search_params['to_lang'][2], + query=resp.search_params['query'])), + 'title': escape('[{0}-{1}] {2}'.format(resp.search_params['from_lang'][1], + resp.search_params['to_lang'][1], + resp.search_params['query'])), + 'content': escape(resp.json()['responseData']['translatedText']) + }) + return results -- cgit v1.2.3 From c051e6a2c3e97419983d552594a6a8340339c1d5 Mon Sep 17 00:00:00 2001 From: potato Date: Tue, 6 Sep 2016 15:44:05 +0200 Subject: [fix] pep8 --- searx/engines/translated.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/translated.py b/searx/engines/translated.py index 9f194b76b..2f535140c 100644 --- a/searx/engines/translated.py +++ b/searx/engines/translated.py @@ -6,12 +6,14 @@ from searx.engines.xpath import extract_text from searx.languages import language_codes categories = ['general'] -url = 'http://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}' +url = 'http://api.mymemory.translated.net/get?q={query}' \ + '&langpair={from_lang}|{to_lang}' web_url = 'http://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}' weight = 100 parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) (.{2,})$', re.I) + def is_valid_lang(lang): is_abbr = (len(lang) == 2) if is_abbr: @@ -52,12 +54,14 @@ def request(query, params): def response(resp): results = [] results.append({ - 'url': escape(web_url.format(from_lang=resp.search_params['from_lang'][2], - to_lang=resp.search_params['to_lang'][2], - query=resp.search_params['query'])), - 'title': escape('[{0}-{1}] {2}'.format(resp.search_params['from_lang'][1], - resp.search_params['to_lang'][1], - resp.search_params['query'])), + 'url': escape(web_url.format( + from_lang=resp.search_params['from_lang'][2], + to_lang=resp.search_params['to_lang'][2], + query=resp.search_params['query'])), + 'title': escape('[{0}-{1}] {2}'.format( + resp.search_params['from_lang'][1], + resp.search_params['to_lang'][1], + resp.search_params['query'])), 'content': escape(resp.json()['responseData']['translatedText']) }) return results -- cgit v1.2.3 From 8c72a22757290754fc15fecb82dd157f6ea56a7f Mon Sep 17 00:00:00 2001 From: potato Date: Tue, 6 Sep 2016 16:12:34 +0200 Subject: [enh] api_key usage, disable the engine by default --- searx/engines/translated.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/translated.py b/searx/engines/translated.py index 2f535140c..3be9d4adf 100644 --- a/searx/engines/translated.py +++ b/searx/engines/translated.py @@ -7,11 +7,12 @@ from searx.languages import language_codes categories = ['general'] url = 'http://api.mymemory.translated.net/get?q={query}' \ - '&langpair={from_lang}|{to_lang}' + '&langpair={from_lang}|{to_lang}{key}' web_url = 'http://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}' weight = 100 parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) (.{2,})$', re.I) +api_key = '' def is_valid_lang(lang): @@ -41,9 +42,14 @@ def request(query, params): if not from_lang or not to_lang: return params + if api_key: + key_form = '&key=' + api_key + else: + key_form = '' params['url'] = url.format(from_lang=from_lang[1], to_lang=to_lang[1], - query=query) + query=query, + key=key_form) params['query'] = query params['from_lang'] = from_lang params['to_lang'] = to_lang -- cgit v1.2.3 From b7d578ae8041658fe6f088eb337f42238c25e2f5 Mon Sep 17 00:00:00 2001 From: potato Date: Tue, 6 Sep 2016 16:36:04 +0200 Subject: [enh] engine header comments --- searx/engines/dictzone.py | 11 +++++++++++ searx/engines/translated.py | 10 ++++++++++ 2 files changed, 21 insertions(+) (limited to 'searx/engines') diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py index b58d7ec62..2c2ec3abc 100644 --- a/searx/engines/dictzone.py +++ b/searx/engines/dictzone.py @@ -1,3 +1,14 @@ +""" + Dictzone + + @website https://dictzone.com/ + @provide-api no + @using-api no + @results HTML (using search portal) + @stable no (HTML can change) + @parse url, title, content +""" + import re from urlparse import urljoin from lxml import html diff --git a/searx/engines/translated.py b/searx/engines/translated.py index 3be9d4adf..1b75e4f4e 100644 --- a/searx/engines/translated.py +++ b/searx/engines/translated.py @@ -1,3 +1,13 @@ +""" + MyMemory Translated + + @website https://mymemory.translated.net/ + @provide-api yes (https://mymemory.translated.net/doc/spec.php) + @using-api yes + @results JSON + @stable yes + @parse url, title, content +""" import re from urlparse import urljoin from lxml import html -- cgit v1.2.3 From 983415bc38937a637e9b2aae191f2e087765800b Mon Sep 17 00:00:00 2001 From: potato Date: Tue, 6 Sep 2016 16:43:48 +0200 Subject: [enh] is_valid_lang moved to utils --- searx/engines/dictzone.py | 20 +++----------------- searx/engines/translated.py | 16 +--------------- 2 files changed, 4 insertions(+), 32 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py index 2c2ec3abc..5de6c5b98 100644 --- a/searx/engines/dictzone.py +++ b/searx/engines/dictzone.py @@ -14,7 +14,7 @@ from urlparse import urljoin from lxml import html from cgi import escape from searx.engines.xpath import extract_text -from searx.languages import language_codes +from searx.utils import is_valid_lang categories = ['general'] url = 'http://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}' @@ -24,20 +24,6 @@ parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I) results_xpath = './/table[@id="r"]/tr' -def is_valid_lang(lang): - is_abbr = (len(lang) == 2) - if is_abbr: - for l in language_codes: - if l[0][:2] == lang.lower(): - return (True, l[1].lower()) - return False - else: - for l in language_codes: - if l[1].lower() == lang.lower(): - return (True, l[1].lower()) - return False - - def request(query, params): m = parser_re.match(unicode(query, 'utf8')) if not m: @@ -51,8 +37,8 @@ def request(query, params): if not from_lang or not to_lang: return params - params['url'] = url.format(from_lang=from_lang[1], - to_lang=to_lang[1], + params['url'] = url.format(from_lang=from_lang[2], + to_lang=to_lang[2], query=query) return params diff --git a/searx/engines/translated.py b/searx/engines/translated.py index 1b75e4f4e..3a077ae8e 100644 --- a/searx/engines/translated.py +++ b/searx/engines/translated.py @@ -13,7 +13,7 @@ from urlparse import urljoin from lxml import html from cgi import escape from searx.engines.xpath import extract_text -from searx.languages import language_codes +from searx.utils import is_valid_lang categories = ['general'] url = 'http://api.mymemory.translated.net/get?q={query}' \ @@ -25,20 +25,6 @@ parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) (.{2,})$', re.I) api_key = '' -def is_valid_lang(lang): - is_abbr = (len(lang) == 2) - if is_abbr: - for l in language_codes: - if l[0][:2] == lang.lower(): - return (True, l[0][:2], l[1].lower()) - return False - else: - for l in language_codes: - if l[1].lower() == lang.lower(): - return (True, l[0][:2], l[1].lower()) - return False - - def request(query, params): m = parser_re.match(unicode(query, 'utf8')) if not m: -- cgit v1.2.3 From 3144ec1d5926a2f445da63fc7d6ea7efe00c6d26 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Tue, 6 Sep 2016 17:17:42 +0200 Subject: [fix] unicode urls --- searx/engines/dictzone.py | 3 +-- searx/engines/translated.py | 8 ++------ 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py index 5de6c5b98..9765d5f60 100644 --- a/searx/engines/dictzone.py +++ b/searx/engines/dictzone.py @@ -13,11 +13,10 @@ import re from urlparse import urljoin from lxml import html from cgi import escape -from searx.engines.xpath import extract_text from searx.utils import is_valid_lang categories = ['general'] -url = 'http://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}' +url = u'http://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}' weight = 100 parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I) diff --git a/searx/engines/translated.py b/searx/engines/translated.py index 3a077ae8e..02047bc93 100644 --- a/searx/engines/translated.py +++ b/searx/engines/translated.py @@ -9,16 +9,12 @@ @parse url, title, content """ import re -from urlparse import urljoin -from lxml import html from cgi import escape -from searx.engines.xpath import extract_text from searx.utils import is_valid_lang categories = ['general'] -url = 'http://api.mymemory.translated.net/get?q={query}' \ - '&langpair={from_lang}|{to_lang}{key}' -web_url = 'http://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}' +url = u'http://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}{key}' +web_url = u'http://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}' weight = 100 parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) (.{2,})$', re.I) -- cgit v1.2.3 From 09ee2aa69dbd4815e0e1e1de53f3571972e04903 Mon Sep 17 00:00:00 2001 From: marc Date: Wed, 6 Jul 2016 17:29:40 -0500 Subject: [fix] Result text in Wolfram|Alpha (#607) --- searx/engines/wolframalpha_api.py | 10 ++++++++-- searx/engines/wolframalpha_noapi.py | 9 +++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/wolframalpha_api.py b/searx/engines/wolframalpha_api.py index 4526c825f..0e38051d1 100644 --- a/searx/engines/wolframalpha_api.py +++ b/searx/engines/wolframalpha_api.py @@ -22,6 +22,7 @@ answer_xpath = '//pod[attribute::primary="true"]/subpod/plaintext' input_xpath = '//pod[starts-with(attribute::id, "Input")]/subpod/plaintext' pods_xpath = '//pod' subpods_xpath = './subpod' +pod_primary_xpath = './@primary' pod_id_xpath = './@id' pod_title_xpath = './@title' plaintext_xpath = './plaintext' @@ -78,10 +79,12 @@ def response(resp): infobox_title = None pods = search_results.xpath(pods_xpath) + result = "" result_chunks = [] for pod in pods: pod_id = pod.xpath(pod_id_xpath)[0] pod_title = pod.xpath(pod_title_xpath)[0] + pod_is_result = pod.xpath(pod_primary_xpath) subpods = pod.xpath(subpods_xpath) if not subpods: @@ -94,6 +97,9 @@ def response(resp): if content and pod_id not in image_pods: + if pod_is_result: + result = content + # if no input pod was found, title is first plaintext pod if not infobox_title: infobox_title = content @@ -116,7 +122,7 @@ def response(resp): # append link to site results.append({'url': resp.request.headers['Referer'].decode('utf8'), - 'title': 'Wolfram|Alpha', - 'content': infobox_title}) + 'title': infobox_title + ' - Wolfram|Alpha', + 'content': result}) return results diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index 3a8180f04..80a510e3a 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -81,9 +81,11 @@ def response(resp): # TODO handle resp_json['queryresult']['assumptions'] result_chunks = [] infobox_title = None + result = "" for pod in resp_json['queryresult']['pods']: pod_id = pod.get('id', '') pod_title = pod.get('title', '') + pod_is_result = pod.get('primary', None) if 'subpods' not in pod: continue @@ -97,6 +99,9 @@ def response(resp): if subpod['plaintext'] != '(requires interactivity)': result_chunks.append({'label': pod_title, 'value': subpod['plaintext']}) + if pod_is_result: + result = subpod['plaintext'] + elif 'img' in subpod: result_chunks.append({'label': pod_title, 'image': subpod['img']}) @@ -108,7 +113,7 @@ def response(resp): 'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer'].decode('utf8')}]}) results.append({'url': resp.request.headers['Referer'].decode('utf8'), - 'title': 'Wolfram|Alpha', - 'content': infobox_title}) + 'title': infobox_title + ' - Wolfram|Alpha', + 'content': result}) return results -- cgit v1.2.3 From a8907224a1c433b1227fd707e9bb2524dd405109 Mon Sep 17 00:00:00 2001 From: "Lorenzo J. Lucchini" Date: Thu, 7 Jul 2016 00:33:03 +0200 Subject: Improving Wolfram Alpha search hit content Making WA search hits contain - the (parsed) input inside the "title" instead of just "Wolfram|Alpha", to better match other hit titles and to confirm correct parsing of input to the user - the first output field that contains any text (skipping ones that are only pictures; this is usually the most meaningful "result" field) instead of the raw input as the "content", making it additionally possible to obtain WA computations from JSON API calls --- searx/engines/wolframalpha_api.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/wolframalpha_api.py b/searx/engines/wolframalpha_api.py index 0e38051d1..e743c8f56 100644 --- a/searx/engines/wolframalpha_api.py +++ b/searx/engines/wolframalpha_api.py @@ -18,7 +18,6 @@ api_key = '' # defined in settings.yml # xpath variables failure_xpath = '/queryresult[attribute::success="false"]' -answer_xpath = '//pod[attribute::primary="true"]/subpod/plaintext' input_xpath = '//pod[starts-with(attribute::id, "Input")]/subpod/plaintext' pods_xpath = '//pod' subpods_xpath = './subpod' @@ -76,11 +75,11 @@ def response(resp): try: infobox_title = search_results.xpath(input_xpath)[0].text except: - infobox_title = None + infobox_title = "" pods = search_results.xpath(pods_xpath) - result = "" result_chunks = [] + result_content = "" for pod in pods: pod_id = pod.xpath(pod_id_xpath)[0] pod_title = pod.xpath(pod_title_xpath)[0] @@ -97,8 +96,9 @@ def response(resp): if content and pod_id not in image_pods: - if pod_is_result: - result = content + if pod_is_result or not result_content: + if pod_id != "Input": + result_content = "%s: %s" % (pod_title, content) # if no input pod was found, title is first plaintext pod if not infobox_title: @@ -115,6 +115,8 @@ def response(resp): if not result_chunks: return [] + title = "Wolfram|Alpha (%s)" % infobox_title + # append infobox results.append({'infobox': infobox_title, 'attributes': result_chunks, @@ -122,7 +124,7 @@ def response(resp): # append link to site results.append({'url': resp.request.headers['Referer'].decode('utf8'), - 'title': infobox_title + ' - Wolfram|Alpha', - 'content': result}) + 'title': title, + 'content': result_content}) return results -- cgit v1.2.3 From e145fdb86d0cd9dd8421ed63b3635f4bebcafa74 Mon Sep 17 00:00:00 2001 From: firebovine Date: Thu, 7 Jul 2016 19:41:33 -0400 Subject: #607 - noapi fix --- searx/engines/wolframalpha_noapi.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index 80a510e3a..e318d93e6 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -8,9 +8,11 @@ # @stable no # @parse url, infobox +from cgi import escape from json import loads from time import time from urllib import urlencode +from lxml.etree import XML from searx.poolrequests import get as http_get @@ -34,7 +36,7 @@ search_url = url + 'input/json.jsp'\ referer_url = url + 'input/?{query}' token = {'value': '', - 'last_updated': 0} + 'last_updated': None} # pods to display as image in infobox # this pods do return a plaintext, but they look better and are more useful as images @@ -80,8 +82,8 @@ def response(resp): # TODO handle resp_json['queryresult']['assumptions'] result_chunks = [] - infobox_title = None - result = "" + infobox_title = "" + result_content = "" for pod in resp_json['queryresult']['pods']: pod_id = pod.get('id', '') pod_title = pod.get('title', '') @@ -99,8 +101,9 @@ def response(resp): if subpod['plaintext'] != '(requires interactivity)': result_chunks.append({'label': pod_title, 'value': subpod['plaintext']}) - if pod_is_result: - result = subpod['plaintext'] + if pod_is_result or not result_content: + if pod_id != "Input": + result_content = pod_title + ': ' + subpod['plaintext'] elif 'img' in subpod: result_chunks.append({'label': pod_title, 'image': subpod['img']}) @@ -113,7 +116,7 @@ def response(resp): 'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer'].decode('utf8')}]}) results.append({'url': resp.request.headers['Referer'].decode('utf8'), - 'title': infobox_title + ' - Wolfram|Alpha', - 'content': result}) + 'title': 'Wolfram|Alpha (' + infobox_title + ')', + 'content': result_content}) return results -- cgit v1.2.3 From d1d4ed4376e41fa380b5b3a72e1b08e1f36a35e8 Mon Sep 17 00:00:00 2001 From: marc Date: Tue, 20 Sep 2016 15:35:54 -0500 Subject: [fix] results with digbit don't truncate anymore --- searx/engines/digbt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/digbt.py b/searx/engines/digbt.py index c35327e8c..b55d7747a 100644 --- a/searx/engines/digbt.py +++ b/searx/engines/digbt.py @@ -40,7 +40,7 @@ def response(resp): results = list() for result in search_res: url = urljoin(URL, result.xpath('.//a[@title]/@href')[0]) - title = result.xpath('.//a[@title]/text()')[0] + title = extract_text(result.xpath('.//a[@title]')) content = extract_text(result.xpath('.//div[@class="files"]')) files_data = extract_text(result.xpath('.//div[@class="tail"]')).split() filesize = get_torrent_size(files_data[FILESIZE], files_data[FILESIZE_MULTIPLIER]) -- cgit v1.2.3 From 86daef2063a83a6aee90e9f269644e0803ae9cb9 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Wed, 28 Sep 2016 22:30:05 +0200 Subject: [fix] do not allow underscore in engine names - closes #708 --- searx/engines/__init__.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 782b622b0..14376c31f 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -57,11 +57,17 @@ def load_module(filename): def load_engine(engine_data): - engine_name = engine_data['engine'] + + if '_' in engine_data['name']: + logger.error('Engine name conains underscore: "{}"'.format(engine_data['name'])) + sys.exit(1) + + engine_module = engine_data['engine'] + try: - engine = load_module(engine_name + '.py') + engine = load_module(engine_module + '.py') except: - logger.exception('Cannot load engine "{}"'.format(engine_name)) + logger.exception('Cannot load engine "{}"'.format(engine_module)) return None for param_name in engine_data: -- cgit v1.2.3