From c329ea135ed8c7b56a16e08bf0ee8f6f82609406 Mon Sep 17 00:00:00 2001 From: 0xhtml <34682885+0xhtml@users.noreply.github.com> Date: Wed, 31 Jul 2019 20:44:41 +0200 Subject: Fix spotify engine --- searx/engines/spotify.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'searx/engines') diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py index aed756be3..da32b334c 100644 --- a/searx/engines/spotify.py +++ b/searx/engines/spotify.py @@ -12,10 +12,14 @@ from json import loads from searx.url_utils import urlencode +import requests +import base64 # engine dependent config categories = ['music'] paging = True +api_client_id = None +api_client_secret = None # search-url url = 'https://api.spotify.com/' @@ -31,6 +35,16 @@ def request(query, params): params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset) + r = requests.post( + 'https://accounts.spotify.com/api/token', + data={'grant_type': 'client_credentials'}, + headers={'Authorization': 'Basic ' + str(base64.b64encode( + (api_client_id + ":" + api_client_secret).encode('utf-8') + ), 'utf-8')} + ) + j = loads(r.text) + params['headers'] = {'Authorization': 'Bearer ' + j['access_token']} + return params -- cgit v1.2.3 From 275b37cc7c87b562d08576be5268a4f8797b84ea Mon Sep 17 00:00:00 2001 From: 0xhtml <34682885+0xhtml@users.noreply.github.com> Date: Wed, 31 Jul 2019 21:01:24 +0200 Subject: Fix error if the user hasn't set api credentials --- searx/engines/spotify.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py index da32b334c..57b08a1e4 100644 --- a/searx/engines/spotify.py +++ b/searx/engines/spotify.py @@ -39,7 +39,7 @@ def request(query, params): 'https://accounts.spotify.com/api/token', data={'grant_type': 'client_credentials'}, headers={'Authorization': 'Basic ' + str(base64.b64encode( - (api_client_id + ":" + api_client_secret).encode('utf-8') + "{}:{}".format(api_client_id, api_client_secret).encode('utf-8') ), 'utf-8')} ) j = loads(r.text) -- cgit v1.2.3 From b2e1ee8d35050033b41765a2de49c0eea5f8b4b4 Mon Sep 17 00:00:00 2001 From: 0xhtml <34682885+0xhtml@users.noreply.github.com> Date: Wed, 31 Jul 2019 21:09:02 +0200 Subject: Fix some more errors with none/wrong credentials --- searx/engines/spotify.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py index 57b08a1e4..00c395706 100644 --- a/searx/engines/spotify.py +++ b/searx/engines/spotify.py @@ -38,12 +38,12 @@ def request(query, params): r = requests.post( 'https://accounts.spotify.com/api/token', data={'grant_type': 'client_credentials'}, - headers={'Authorization': 'Basic ' + str(base64.b64encode( + headers={'Authorization': 'Basic ' + base64.b64encode( "{}:{}".format(api_client_id, api_client_secret).encode('utf-8') - ), 'utf-8')} + ).decode('utf-8')} ) j = loads(r.text) - params['headers'] = {'Authorization': 'Bearer ' + j['access_token']} + params['headers'] = {'Authorization': 'Bearer {}'.format(j.get('access_token'))} return params -- cgit v1.2.3 From 8850036ded3af2ba7455cef53a8134022e1b544d Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Sat, 21 Dec 2019 20:25:39 +0100 Subject: [fix] add explicit useragent header to requests - closes #1459 --- searx/engines/qwant.py | 1 + 1 file changed, 1 insertion(+) (limited to 'searx/engines') diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index de12955c6..54e9dafad 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -50,6 +50,7 @@ def request(query, params): language = match_language(params['language'], supported_languages, language_aliases) params['url'] += '&locale=' + language.replace('-', '_').lower() + params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0' return params -- cgit v1.2.3 From e5305f886c0d7d5fb3f34d1fbd7f9a545c14c284 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Sat, 21 Dec 2019 20:51:30 +0100 Subject: [fix] fetch extra search param of gigablast - fixes #1293 --- searx/engines/gigablast.py | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index a84f3f69d..2a5067bc3 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -14,6 +14,7 @@ import random from json import loads from time import time from lxml.html import fromstring +from searx.poolrequests import get from searx.url_utils import urlencode from searx.utils import eval_xpath @@ -31,13 +32,9 @@ search_string = 'search?{query}'\ '&c=main'\ '&s={offset}'\ '&format=json'\ - '&qh=0'\ - '&qlang={lang}'\ + '&langcountry={lang}'\ '&ff={safesearch}'\ - '&rxiec={rxieu}'\ - '&ulse={ulse}'\ - '&rand={rxikd}'\ - '&dbez={dbez}' + '&rand={rxikd}' # specific xpath variables results_xpath = '//response//result' url_xpath = './/url' @@ -46,9 +43,26 @@ content_xpath = './/sum' supported_languages_url = 'https://gigablast.com/search?&rxikd=1' +extra_param = '' # gigablast requires a random extra parameter +# which can be extracted from the source code of the search page + + +def parse_extra_param(text): + global extra_param + param_lines = [x for x in text.splitlines() if x.startswith('var url=') or x.startswith('url=url+')] + extra_param = '' + for l in param_lines: + extra_param += l.split("'")[1] + extra_param = extra_param.split('&')[-1] + + +def init(engine_settings=None): + parse_extra_param(get('http://gigablast.com/search?c=main&qlangcountry=en-us&q=south&s=10').text) + # do search-request def request(query, params): + print("EXTRAPARAM:", extra_param) offset = (params['pageno'] - 1) * number_of_results if params['language'] == 'all': @@ -67,14 +81,11 @@ def request(query, params): search_path = search_string.format(query=urlencode({'q': query}), offset=offset, number_of_results=number_of_results, - rxikd=int(time() * 1000), - rxieu=random.randint(1000000000, 9999999999), - ulse=random.randint(100000000, 999999999), lang=language, - safesearch=safesearch, - dbez=random.randint(100000000, 999999999)) + rxikd=int(time() * 1000), + safesearch=safesearch) - params['url'] = base_url + search_path + params['url'] = base_url + search_path + '&' + extra_param return params @@ -84,7 +95,11 @@ def response(resp): results = [] # parse results - response_json = loads(resp.text) + try: + response_json = loads(resp.text) + except: + parse_extra_param(resp.text) + return results for result in response_json['results']: # append result -- cgit v1.2.3 From f8713512bedf19d4495e0b9a0fd86679daaf7f79 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Sat, 21 Dec 2019 20:56:38 +0100 Subject: [fix] convert byte query to string in osm engine - fixes #1220 --- searx/engines/openstreetmap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/openstreetmap.py b/searx/engines/openstreetmap.py index 733ba6203..cec10a3c7 100644 --- a/searx/engines/openstreetmap.py +++ b/searx/engines/openstreetmap.py @@ -24,7 +24,7 @@ result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}' # do search-request def request(query, params): - params['url'] = base_url + search_string.format(query=query) + params['url'] = base_url + search_string.format(query=query.decode('utf-8')) return params -- cgit v1.2.3 From 00512e36c133312eb74a82f6a2dec6d06214c42b Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Sat, 21 Dec 2019 21:01:08 +0100 Subject: [fix] handle empty response from wikipedia engine - closes #1114 --- searx/engines/wikipedia.py | 3 +++ 1 file changed, 3 insertions(+) (limited to 'searx/engines') diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 4dae735d1..690da72fe 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -79,6 +79,9 @@ def response(resp): # wikipedia article's unique id # first valid id is assumed to be the requested article + if 'pages' not in search_result['query']: + return results + for article_id in search_result['query']['pages']: page = search_result['query']['pages'][article_id] if int(article_id) > 0: -- cgit v1.2.3 From fc457569f757dd10ff55393f472ea9ed49a42374 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Sat, 21 Dec 2019 21:13:43 +0100 Subject: [fix] pep8 --- searx/engines/gigablast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index 2a5067bc3..5af593e36 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -43,7 +43,7 @@ content_xpath = './/sum' supported_languages_url = 'https://gigablast.com/search?&rxikd=1' -extra_param = '' # gigablast requires a random extra parameter +extra_param = '' # gigablast requires a random extra parameter # which can be extracted from the source code of the search page -- cgit v1.2.3 From 34ad3d6b34017523a9502f86b92c17fe389918eb Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Sat, 21 Dec 2019 21:25:50 +0100 Subject: [enh] display error message if gigablast extra param expired --- searx/engines/gigablast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index 5af593e36..2bb29a9fe 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -99,7 +99,7 @@ def response(resp): response_json = loads(resp.text) except: parse_extra_param(resp.text) - return results + raise Exception('extra param expired, please reload') for result in response_json['results']: # append result -- cgit v1.2.3 From c18048e0454f4e3dc75c778940903091fbeae06a Mon Sep 17 00:00:00 2001 From: Marc Abonce Seguin Date: Sun, 25 Aug 2019 22:23:37 -0700 Subject: exclude disambiguation pages from wikipedia infobox --- searx/engines/wikipedia.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 690da72fe..44dea56fa 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -21,7 +21,8 @@ search_url = base_url + u'w/api.php?'\ 'action=query'\ '&format=json'\ '&{query}'\ - '&prop=extracts|pageimages'\ + '&prop=extracts|pageimages|pageprops'\ + '&ppprop=disambiguation'\ '&exintro'\ '&explaintext'\ '&pithumbsize=300'\ @@ -87,7 +88,7 @@ def response(resp): if int(article_id) > 0: break - if int(article_id) < 0: + if int(article_id) < 0 or 'disambiguation' in page.get('pageprops', {}): return [] title = page.get('title') -- cgit v1.2.3 From 5706c12fba98e169c7c76a4d3c29aabf48242d63 Mon Sep 17 00:00:00 2001 From: Marc Abonce Seguin Date: Sun, 25 Aug 2019 22:47:23 -0700 Subject: remove empty parenthesis in wikipedia's summary They're usually IPA pronunciations which are removed by the API. --- searx/engines/wikipedia.py | 1 + 1 file changed, 1 insertion(+) (limited to 'searx/engines') diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 44dea56fa..a216ba886 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -100,6 +100,7 @@ def response(resp): extract = page.get('extract') summary = extract_first_paragraph(extract, title, image) + summary = summary.replace('() ', '') # link to wikipedia article wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \ -- cgit v1.2.3 From 2292e6e130dca104cb324197b63611a012e4ef3c Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Thu, 2 Jan 2020 22:28:47 +0100 Subject: [fix] handle missing result size --- searx/engines/bing.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/bing.py b/searx/engines/bing.py index ed0b87dbd..24776c400 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -63,6 +63,8 @@ def response(resp): results = [] result_len = 0 + + dom = html.fromstring(resp.text) # parse results for result in eval_xpath(dom, '//div[@class="sa_cc"]'): @@ -89,8 +91,7 @@ def response(resp): 'content': content}) try: - result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]/text()')) - result_len_container = utils.to_string(result_len_container) + result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()')) if "-" in result_len_container: # Remove the part "from-to" for paginated request ... result_len_container = result_len_container[result_len_container.find("-") * 2 + 2:] @@ -102,7 +103,7 @@ def response(resp): logger.debug('result error :\n%s', e) pass - if _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len: + if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len: return [] results.append({'number_of_results': result_len}) -- cgit v1.2.3 From 2dc2e1e8f9c8ae0d28df56f42b2f4949d8611624 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Thu, 2 Jan 2020 22:29:10 +0100 Subject: [fix] skip invalid encoded attributes --- searx/engines/flickr_noapi.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py index 198ac2cff..e1abb378f 100644 --- a/searx/engines/flickr_noapi.py +++ b/searx/engines/flickr_noapi.py @@ -109,14 +109,22 @@ def response(resp): else: url = build_flickr_url(photo['ownerNsid'], photo['id']) - results.append({'url': url, - 'title': title, - 'img_src': img_src, - 'thumbnail_src': thumbnail_src, - 'content': content, - 'author': author, - 'source': source, - 'img_format': img_format, - 'template': 'images.html'}) + result = { + 'url': url, + 'img_src': img_src, + 'thumbnail_src': thumbnail_src, + 'source': source, + 'img_format': img_format, + 'template': 'images.html' + } + try: + result['author'] = author.encode('utf-8') + result['title'] = title.encode('utf-8') + result['content'] = content.encode('utf-8') + except: + result['author'] = '' + result['title'] = '' + result['content'] = '' + results.append(result) return results -- cgit v1.2.3 From 86a378bd0109684bd45c917f94068e3c98441904 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Thu, 2 Jan 2020 22:29:28 +0100 Subject: [fix] handle missing thumbnail --- searx/engines/ina.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/ina.py b/searx/engines/ina.py index 37a05f099..ea509649f 100644 --- a/searx/engines/ina.py +++ b/searx/engines/ina.py @@ -32,7 +32,7 @@ base_url = 'https://www.ina.fr' search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}' # specific xpath variables -results_xpath = '//div[contains(@class,"search-results--list")]/div[@class="media"]' +results_xpath = '//div[contains(@class,"search-results--list")]//div[@class="media-body"]' url_xpath = './/a/@href' title_xpath = './/h3[@class="h3--title media-heading"]' thumbnail_xpath = './/img/@src' @@ -65,8 +65,11 @@ def response(resp): videoid = result.xpath(url_xpath)[0] url = base_url + videoid title = p.unescape(extract_text(result.xpath(title_xpath))) - thumbnail = extract_text(result.xpath(thumbnail_xpath)[0]) - if thumbnail[0] == '/': + try: + thumbnail = extract_text(result.xpath(thumbnail_xpath)[0]) + except: + thumbnail = '' + if thumbnail and thumbnail[0] == '/': thumbnail = base_url + thumbnail d = extract_text(result.xpath(publishedDate_xpath)[0]) d = d.split('/') -- cgit v1.2.3 From 1e6253ce16346fc6f439a07211b56770d06ba225 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Thu, 2 Jan 2020 22:29:55 +0100 Subject: [fix] handle empty response --- searx/engines/microsoft_academic.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'searx/engines') diff --git a/searx/engines/microsoft_academic.py b/searx/engines/microsoft_academic.py index 9387b08d0..9bac0069c 100644 --- a/searx/engines/microsoft_academic.py +++ b/searx/engines/microsoft_academic.py @@ -45,6 +45,8 @@ def request(query, params): def response(resp): results = [] response_data = loads(resp.text) + if not response_data: + return results for result in response_data['results']: url = _get_url(result) -- cgit v1.2.3 From ad5bb994b1cff56c4f021f88bfa62f38055f1416 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Thu, 2 Jan 2020 22:30:18 +0100 Subject: [fix] add py3 compatibility --- searx/engines/scanr_structures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/scanr_structures.py b/searx/engines/scanr_structures.py index 72fd2b3c9..7208dcb70 100644 --- a/searx/engines/scanr_structures.py +++ b/searx/engines/scanr_structures.py @@ -29,7 +29,7 @@ def request(query, params): params['url'] = search_url params['method'] = 'POST' params['headers']['Content-type'] = "application/json" - params['data'] = dumps({"query": query, + params['data'] = dumps({"query": query.decode('utf-8'), "searchField": "ALL", "sortDirection": "ASC", "sortOrder": "RELEVANCY", -- cgit v1.2.3 From 17b6faa4c3c1cf14a327f4a3538fc70dce08b756 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Thu, 2 Jan 2020 22:37:06 +0100 Subject: [fix] pep8 --- searx/engines/bing.py | 2 -- 1 file changed, 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 24776c400..b193f7c60 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -63,8 +63,6 @@ def response(resp): results = [] result_len = 0 - - dom = html.fromstring(resp.text) # parse results for result in eval_xpath(dom, '//div[@class="sa_cc"]'): -- cgit v1.2.3 From db9d7d47bdce4e4e68e681748af01f84f993434c Mon Sep 17 00:00:00 2001 From: frankdelange Date: Tue, 7 Jan 2020 21:41:43 +0100 Subject: Fix double-encode error (fixes #1799) --- searx/engines/flickr_noapi.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py index e1abb378f..c8ee34f7a 100644 --- a/searx/engines/flickr_noapi.py +++ b/searx/engines/flickr_noapi.py @@ -118,9 +118,9 @@ def response(resp): 'template': 'images.html' } try: - result['author'] = author.encode('utf-8') - result['title'] = title.encode('utf-8') - result['content'] = content.encode('utf-8') + result['author'] = author + result['title'] = title + result['content'] = content except: result['author'] = '' result['title'] = '' -- cgit v1.2.3 From 99435381a84072b110c32004b2fb778af9b96f77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?No=C3=A9mi=20V=C3=A1nyi?= Date: Sat, 1 Feb 2020 11:01:17 +0100 Subject: [enh] introduce private engines This PR adds a new setting to engines named `tokens`. It expects a list of tokens which lets searx validate if the request should be accepted or not. --- searx/engines/__init__.py | 9 +++++++-- searx/engines/dummy-offline.py | 12 ++++++++++++ searx/engines/genius.py | 1 + 3 files changed, 20 insertions(+), 2 deletions(-) create mode 100644 searx/engines/dummy-offline.py (limited to 'searx/engines') diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 2393f52b6..9ccef8b54 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -54,7 +54,8 @@ engine_default_args = {'paging': False, 'suspend_end_time': 0, 'continuous_errors': 0, 'time_range_support': False, - 'offline': False} + 'offline': False, + 'tokens': []} def load_engine(engine_data): @@ -160,7 +161,7 @@ def to_percentage(stats, maxvalue): return stats -def get_engines_stats(): +def get_engines_stats(preferences): # TODO refactor pageloads = [] engine_times = [] @@ -171,8 +172,12 @@ def get_engines_stats(): max_pageload = max_engine_times = max_results = max_score = max_errors = max_score_per_result = 0 # noqa for engine in engines.values(): + if not preferences.validate_token(engine): + continue + if engine.stats['search_count'] == 0: continue + results_num = \ engine.stats['result_count'] / float(engine.stats['search_count']) diff --git a/searx/engines/dummy-offline.py b/searx/engines/dummy-offline.py new file mode 100644 index 000000000..13a9ecc01 --- /dev/null +++ b/searx/engines/dummy-offline.py @@ -0,0 +1,12 @@ +""" + Dummy Offline + + @results one result + @stable yes +""" + + +def search(query, request_params): + return [{ + 'result': 'this is what you get', + }] diff --git a/searx/engines/genius.py b/searx/engines/genius.py index b265e9d76..aa5afad9b 100644 --- a/searx/engines/genius.py +++ b/searx/engines/genius.py @@ -72,6 +72,7 @@ def parse_album(hit): result.update({'content': 'Released: {}'.format(year)}) return result + parse = {'lyric': parse_lyric, 'song': parse_lyric, 'artist': parse_artist, 'album': parse_album} -- cgit v1.2.3 From f0684a5bb5860c2b9caffefb47dc55781092819e Mon Sep 17 00:00:00 2001 From: piplongrun Date: Wed, 12 Feb 2020 23:58:50 +0100 Subject: Add eTools engine --- searx/engines/etools.py | 54 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 searx/engines/etools.py (limited to 'searx/engines') diff --git a/searx/engines/etools.py b/searx/engines/etools.py new file mode 100644 index 000000000..a9eb0980d --- /dev/null +++ b/searx/engines/etools.py @@ -0,0 +1,54 @@ +""" + eTools (Web) + + @website https://www.etools.ch + @provide-api no + @using-api no + @results HTML + @stable no (HTML can change) + @parse url, title, content +""" + +from lxml import html +from searx.engines.xpath import extract_text +from searx.url_utils import quote +from searx.utils import eval_xpath + +categories = ['general'] +paging = False +language_support = False +safesearch = True + +base_url = 'https://www.etools.ch' +search_path = '/searchAdvancedSubmit.do'\ + '?query={search_term}'\ + '&pageResults=20'\ + '&safeSearch={safesearch}' + + +def request(query, params): + if params['safesearch']: + safesearch = 'true' + else: + safesearch = 'false' + + params['url'] = base_url + search_path.format(search_term=quote(query), safesearch=safesearch) + + return params + + +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + for result in eval_xpath(dom, '//table[@class="result"]//td[@class="record"]'): + url = eval_xpath(result, './a/@href')[0] + title = extract_text(eval_xpath(result, './a//text()')) + content = extract_text(eval_xpath(result, './/div[@class="text"]//text()')) + + results.append({'url': url, + 'title': title, + 'content': content}) + + return results -- cgit v1.2.3 From c89c05bcebb44699c94d6da2fcb8ad376b60fe67 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Tue, 25 Feb 2020 18:44:28 +0100 Subject: bugfix: google-news and bing-news has changed the language parameter closes: https://github.com/asciimoo/searx/issues/1838 Signed-off-by: Markus Heiser --- searx/engines/bing_news.py | 3 ++- searx/engines/google_news.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 669130c42..827555bd7 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -58,8 +58,9 @@ def _get_url(query, language, offset, time_range): offset=offset, interval=time_range_dict[time_range]) else: + # e.g. setmkt=de-de&setlang=de search_path = search_string.format( - query=urlencode({'q': query, 'setmkt': language}), + query=urlencode({'q': query, 'setlang': language}), offset=offset) return base_url + search_path diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index 9c837b45b..c9cc75435 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -54,7 +54,7 @@ def request(query, params): if params['language'] != 'all': language = match_language(params['language'], supported_languages, language_aliases).split('-')[0] if language: - params['url'] += '&lr=lang_' + language + params['url'] += '&hl=' + language return params -- cgit v1.2.3 From e0c99d9dcbe4c2eee0a7c6f4a7326a8376467640 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 1 Mar 2020 08:01:36 +0100 Subject: bugfix: fetch_supported_languages bing, -news, -videos, -images Signed-off-by: Markus Heiser --- searx/engines/bing.py | 23 ++++++++++++++--------- searx/engines/bing_images.py | 21 ++------------------- searx/engines/bing_news.py | 3 ++- searx/engines/bing_videos.py | 2 +- 4 files changed, 19 insertions(+), 30 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/bing.py b/searx/engines/bing.py index b193f7c60..c8fc4fa2e 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -110,13 +110,18 @@ def response(resp): # get supported languages from their site def _fetch_supported_languages(resp): - supported_languages = [] + lang_tags = set() + + setmkt = re.compile('setmkt=([^&]*)') dom = html.fromstring(resp.text) - options = eval_xpath(dom, '//div[@id="limit-languages"]//input') - for option in options: - code = eval_xpath(option, './@id')[0].replace('_', '-') - if code == 'nb': - code = 'no' - supported_languages.append(code) - - return supported_languages + lang_links = eval_xpath(dom, "//li/a[contains(@href, 'setmkt')]") + + for a in lang_links: + href = eval_xpath(a, './@href')[0] + match = setmkt.search(href) + l_tag = match.groups()[0] + _lang, _nation = l_tag.split('-',1) + l_tag = _lang.lower() + '-' + _nation.upper() + lang_tags.add(l_tag) + + return list(lang_tags) diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py index 44e2c3bbc..138ed11c6 100644 --- a/searx/engines/bing_images.py +++ b/searx/engines/bing_images.py @@ -18,6 +18,8 @@ import re from searx.url_utils import urlencode from searx.utils import match_language +from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases + # engine dependent config categories = ['images'] paging = True @@ -103,22 +105,3 @@ def response(resp): continue return results - - -# get supported languages from their site -def _fetch_supported_languages(resp): - supported_languages = [] - dom = html.fromstring(resp.text) - - regions_xpath = '//div[@id="region-section-content"]' \ - + '//ul[@class="b_vList"]/li/a/@href' - - regions = dom.xpath(regions_xpath) - for region in regions: - code = re.search('setmkt=[^\&]+', region).group()[7:] - if code == 'nb-NO': - code = 'no-NO' - - supported_languages.append(code) - - return supported_languages diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 827555bd7..14fd7b99a 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -15,9 +15,10 @@ from datetime import datetime from dateutil import parser from lxml import etree from searx.utils import list_get, match_language -from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases from searx.url_utils import urlencode, urlparse, parse_qsl +from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases + # engine dependent config categories = ['news'] paging = True diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py index f1e636819..6e7b6d3aa 100644 --- a/searx/engines/bing_videos.py +++ b/searx/engines/bing_videos.py @@ -12,10 +12,10 @@ from json import loads from lxml import html -from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url from searx.url_utils import urlencode from searx.utils import match_language +from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases categories = ['videos'] paging = True -- cgit v1.2.3 From 1c853f9573a85d9885d3c99b7e3210c834e5e5f3 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 1 Mar 2020 11:07:59 +0100 Subject: bing_news: parital rollback of c89c05bc The bing_news bug (discussed in #1838) was caused by wrong language tags, which was fixed e0c99d9d / no need to change the bing_news search string. closes: https://github.com/asciimoo/searx/issues/1838 Signed-off-by: Markus Heiser --- searx/engines/bing.py | 2 +- searx/engines/bing_news.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/bing.py b/searx/engines/bing.py index c8fc4fa2e..afb776acd 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -120,7 +120,7 @@ def _fetch_supported_languages(resp): href = eval_xpath(a, './@href')[0] match = setmkt.search(href) l_tag = match.groups()[0] - _lang, _nation = l_tag.split('-',1) + _lang, _nation = l_tag.split('-', 1) l_tag = _lang.lower() + '-' + _nation.upper() lang_tags.add(l_tag) diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 14fd7b99a..d13be777c 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -61,7 +61,7 @@ def _get_url(query, language, offset, time_range): else: # e.g. setmkt=de-de&setlang=de search_path = search_string.format( - query=urlencode({'q': query, 'setlang': language}), + query=urlencode({'q': query, 'setmkt': language}), offset=offset) return base_url + search_path -- cgit v1.2.3 From ad7a6e6e1022923850343b2c19e47bbd9fbad050 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 1 Mar 2020 17:28:59 +0100 Subject: bugfix(!biv) : bing-video do not like "older" User-Agents When selecting other languages than 'en', bing-video did not handle the language correct and gave very bad results. Since User-Agent is normaly rotated in searx, the behavior of a !biv search was unpredictable and paging was broken. Signed-off-by: Markus Heiser --- searx/engines/bing_videos.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'searx/engines') diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py index 6e7b6d3aa..f048f0d8e 100644 --- a/searx/engines/bing_videos.py +++ b/searx/engines/bing_videos.py @@ -67,6 +67,10 @@ def request(query, params): if params['time_range'] in time_range_dict: params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']]) + # bing videos did not like "older" versions < 70.0.1 when selectin other + # languages then 'en' .. very strange ?!?! + params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:73.0.1) Gecko/20100101 Firefox/73.0.1' + return params -- cgit v1.2.3 From 4f90fb6a9236a9699e80c249153ba372bf308cb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sp=C3=BChler=20Stefan?= Date: Mon, 2 Mar 2020 18:55:48 +0100 Subject: [Fix] Startpage ValueError on Spanish date format datetime.parser.parse() does not know the Spanish date format which leads to a ValueError. Fixes #1870 Traceback (most recent call last): File "/usr/local/searx/searx/search.py", line 160, in search_one_http_request_safe search_results = search_one_http_request(engine, query, request_params) File "/usr/local/searx/searx/search.py", line 97, in search_one_http_request return engine.response(response) File "/usr/local/searx/searx/engines/startpage.py", line 102, in response published_date = parser.parse(date_string, dayfirst=True) File "/usr/local/searx/searx-ve/lib/python3.6/site-packages/dateutil/parser/_parser.py", line 1358, in parse return DEFAULTPARSER.parse(timestr, **kwargs) File "/usr/local/searx/searx-ve/lib/python3.6/site-packages/dateutil/parser/_parser.py", line 649, in parse raise ValueError("Unknown string format:", timestr) ValueError: ('Unknown string format:', '24 Ene 2013') --- searx/engines/startpage.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 76567396f..953734934 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -99,11 +99,14 @@ def response(resp): if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content): date_pos = content.find('...') + 4 date_string = content[0:date_pos - 5] - published_date = parser.parse(date_string, dayfirst=True) - # fix content string content = content[date_pos:] + try: + published_date = parser.parse(date_string, dayfirst=True) + except ValueError: + pass + # check if search result starts with something like: "5 days ago ... " elif re.match(r"^[0-9]+ days? ago \.\.\. ", content): date_pos = content.find('...') + 4 -- cgit v1.2.3 From 3a26093c46469a320c2dfa4d732634a462c8f0f4 Mon Sep 17 00:00:00 2001 From: HLFH Date: Wed, 15 Apr 2020 16:46:15 +0100 Subject: Remove discontinued faroo engine --- searx/engines/faroo.py | 96 -------------------------------------------------- 1 file changed, 96 deletions(-) delete mode 100644 searx/engines/faroo.py (limited to 'searx/engines') diff --git a/searx/engines/faroo.py b/searx/engines/faroo.py deleted file mode 100644 index a36ecf778..000000000 --- a/searx/engines/faroo.py +++ /dev/null @@ -1,96 +0,0 @@ -""" - Faroo (Web, News) - - @website http://www.faroo.com - @provide-api yes (http://www.faroo.com/hp/api/api.html), require API-key - - @using-api no - @results JSON - @stable yes - @parse url, title, content, publishedDate, img_src -""" - -from json import loads -import datetime -from searx.utils import searx_useragent -from searx.url_utils import urlencode - -# engine dependent config -categories = ['general', 'news'] -paging = True -language_support = True -number_of_results = 10 - -# search-url -url = 'http://www.faroo.com/' -search_url = url + 'instant.json?{query}'\ - '&start={offset}'\ - '&length={number_of_results}'\ - '&l={language}'\ - '&src={categorie}'\ - '&i=false'\ - '&c=false' - -search_category = {'general': 'web', - 'news': 'news'} - - -# do search-request -def request(query, params): - offset = (params['pageno'] - 1) * number_of_results + 1 - categorie = search_category.get(params['category'], 'web') - - if params['language'] == 'all': - language = 'en' - else: - language = params['language'].split('-')[0] - - # if language is not supported, put it in english - if language != 'en' and\ - language != 'de' and\ - language != 'zh': - language = 'en' - - params['url'] = search_url.format(offset=offset, - number_of_results=number_of_results, - query=urlencode({'q': query}), - language=language, - categorie=categorie) - - params['headers']['Referer'] = url - - return params - - -# get response from search-request -def response(resp): - # HTTP-Code 429: rate limit exceeded - if resp.status_code == 429: - raise Exception("rate limit has been exceeded!") - - results = [] - - search_res = loads(resp.text) - - # return empty array if there are no results - if not search_res.get('results', {}): - return [] - - # parse results - for result in search_res['results']: - publishedDate = None - result_json = {'url': result['url'], 'title': result['title'], - 'content': result['kwic']} - if result['news']: - result_json['publishedDate'] = \ - datetime.datetime.fromtimestamp(result['date'] / 1000.0) - - # append image result if image url is set - if result['iurl']: - result_json['template'] = 'videos.html' - result_json['thumbnail'] = result['iurl'] - - results.append(result_json) - - # return results - return results -- cgit v1.2.3