From c89c05bcebb44699c94d6da2fcb8ad376b60fe67 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Tue, 25 Feb 2020 18:44:28 +0100 Subject: bugfix: google-news and bing-news has changed the language parameter closes: https://github.com/asciimoo/searx/issues/1838 Signed-off-by: Markus Heiser --- searx/engines/bing_news.py | 3 ++- searx/engines/google_news.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 669130c42..827555bd7 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -58,8 +58,9 @@ def _get_url(query, language, offset, time_range): offset=offset, interval=time_range_dict[time_range]) else: + # e.g. setmkt=de-de&setlang=de search_path = search_string.format( - query=urlencode({'q': query, 'setmkt': language}), + query=urlencode({'q': query, 'setlang': language}), offset=offset) return base_url + search_path diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index 9c837b45b..c9cc75435 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -54,7 +54,7 @@ def request(query, params): if params['language'] != 'all': language = match_language(params['language'], supported_languages, language_aliases).split('-')[0] if language: - params['url'] += '&lr=lang_' + language + params['url'] += '&hl=' + language return params -- cgit v1.2.3 From e0c99d9dcbe4c2eee0a7c6f4a7326a8376467640 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 1 Mar 2020 08:01:36 +0100 Subject: bugfix: fetch_supported_languages bing, -news, -videos, -images Signed-off-by: Markus Heiser --- searx/engines/bing.py | 23 ++++++++++++++--------- searx/engines/bing_images.py | 21 ++------------------- searx/engines/bing_news.py | 3 ++- searx/engines/bing_videos.py | 2 +- 4 files changed, 19 insertions(+), 30 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 
b193f7c60..c8fc4fa2e 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -110,13 +110,18 @@ def response(resp): # get supported languages from their site def _fetch_supported_languages(resp): - supported_languages = [] + lang_tags = set() + + setmkt = re.compile('setmkt=([^&]*)') dom = html.fromstring(resp.text) - options = eval_xpath(dom, '//div[@id="limit-languages"]//input') - for option in options: - code = eval_xpath(option, './@id')[0].replace('_', '-') - if code == 'nb': - code = 'no' - supported_languages.append(code) - - return supported_languages + lang_links = eval_xpath(dom, "//li/a[contains(@href, 'setmkt')]") + + for a in lang_links: + href = eval_xpath(a, './@href')[0] + match = setmkt.search(href) + l_tag = match.groups()[0] + _lang, _nation = l_tag.split('-',1) + l_tag = _lang.lower() + '-' + _nation.upper() + lang_tags.add(l_tag) + + return list(lang_tags) diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py index 44e2c3bbc..138ed11c6 100644 --- a/searx/engines/bing_images.py +++ b/searx/engines/bing_images.py @@ -18,6 +18,8 @@ import re from searx.url_utils import urlencode from searx.utils import match_language +from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases + # engine dependent config categories = ['images'] paging = True @@ -103,22 +105,3 @@ def response(resp): continue return results - - -# get supported languages from their site -def _fetch_supported_languages(resp): - supported_languages = [] - dom = html.fromstring(resp.text) - - regions_xpath = '//div[@id="region-section-content"]' \ - + '//ul[@class="b_vList"]/li/a/@href' - - regions = dom.xpath(regions_xpath) - for region in regions: - code = re.search('setmkt=[^\&]+', region).group()[7:] - if code == 'nb-NO': - code = 'no-NO' - - supported_languages.append(code) - - return supported_languages diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 827555bd7..14fd7b99a 100644 --- 
a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -15,9 +15,10 @@ from datetime import datetime from dateutil import parser from lxml import etree from searx.utils import list_get, match_language -from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases from searx.url_utils import urlencode, urlparse, parse_qsl +from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases + # engine dependent config categories = ['news'] paging = True diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py index f1e636819..6e7b6d3aa 100644 --- a/searx/engines/bing_videos.py +++ b/searx/engines/bing_videos.py @@ -12,10 +12,10 @@ from json import loads from lxml import html -from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url from searx.url_utils import urlencode from searx.utils import match_language +from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases categories = ['videos'] paging = True -- cgit v1.2.3 From 1c853f9573a85d9885d3c99b7e3210c834e5e5f3 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 1 Mar 2020 11:07:59 +0100 Subject: bing_news: partial rollback of c89c05bc The bing_news bug (discussed in #1838) was caused by wrong language tags, which was fixed in e0c99d9d / no need to change the bing_news search string. 
closes: https://github.com/asciimoo/searx/issues/1838 Signed-off-by: Markus Heiser --- searx/engines/bing.py | 2 +- searx/engines/bing_news.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/bing.py b/searx/engines/bing.py index c8fc4fa2e..afb776acd 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -120,7 +120,7 @@ def _fetch_supported_languages(resp): href = eval_xpath(a, './@href')[0] match = setmkt.search(href) l_tag = match.groups()[0] - _lang, _nation = l_tag.split('-',1) + _lang, _nation = l_tag.split('-', 1) l_tag = _lang.lower() + '-' + _nation.upper() lang_tags.add(l_tag) diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 14fd7b99a..d13be777c 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -61,7 +61,7 @@ def _get_url(query, language, offset, time_range): else: # e.g. setmkt=de-de&setlang=de search_path = search_string.format( - query=urlencode({'q': query, 'setlang': language}), + query=urlencode({'q': query, 'setmkt': language}), offset=offset) return base_url + search_path -- cgit v1.2.3 From ad7a6e6e1022923850343b2c19e47bbd9fbad050 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 1 Mar 2020 17:28:59 +0100 Subject: bugfix(!biv) : bing-video does not like "older" User-Agents When selecting other languages than 'en', bing-video did not handle the language correctly and gave very bad results. Since User-Agent is normally rotated in searx, the behavior of a !biv search was unpredictable and paging was broken. 
Signed-off-by: Markus Heiser --- searx/engines/bing_videos.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'searx/engines') diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py index 6e7b6d3aa..f048f0d8e 100644 --- a/searx/engines/bing_videos.py +++ b/searx/engines/bing_videos.py @@ -67,6 +67,10 @@ def request(query, params): if params['time_range'] in time_range_dict: params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']]) + # bing videos did not like "older" versions < 70.0.1 when selectin other + # languages then 'en' .. very strange ?!?! + params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:73.0.1) Gecko/20100101 Firefox/73.0.1' + return params -- cgit v1.2.3 From 4f90fb6a9236a9699e80c249153ba372bf308cb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sp=C3=BChler=20Stefan?= Date: Mon, 2 Mar 2020 18:55:48 +0100 Subject: [Fix] Startpage ValueError on Spanish date format dateutil.parser.parse() does not know the Spanish date format, which leads to a ValueError. 
Fixes #1870 Traceback (most recent call last): File "/usr/local/searx/searx/search.py", line 160, in search_one_http_request_safe search_results = search_one_http_request(engine, query, request_params) File "/usr/local/searx/searx/search.py", line 97, in search_one_http_request return engine.response(response) File "/usr/local/searx/searx/engines/startpage.py", line 102, in response published_date = parser.parse(date_string, dayfirst=True) File "/usr/local/searx/searx-ve/lib/python3.6/site-packages/dateutil/parser/_parser.py", line 1358, in parse return DEFAULTPARSER.parse(timestr, **kwargs) File "/usr/local/searx/searx-ve/lib/python3.6/site-packages/dateutil/parser/_parser.py", line 649, in parse raise ValueError("Unknown string format:", timestr) ValueError: ('Unknown string format:', '24 Ene 2013') --- searx/engines/startpage.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 76567396f..953734934 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -99,11 +99,14 @@ def response(resp): if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content): date_pos = content.find('...') + 4 date_string = content[0:date_pos - 5] - published_date = parser.parse(date_string, dayfirst=True) - # fix content string content = content[date_pos:] + try: + published_date = parser.parse(date_string, dayfirst=True) + except ValueError: + pass + # check if search result starts with something like: "5 days ago ... " elif re.match(r"^[0-9]+ days? ago \.\.\. 
", content): date_pos = content.find('...') + 4 -- cgit v1.2.3 From 3a26093c46469a320c2dfa4d732634a462c8f0f4 Mon Sep 17 00:00:00 2001 From: HLFH Date: Wed, 15 Apr 2020 16:46:15 +0100 Subject: Remove discontinued faroo engine --- searx/engines/faroo.py | 96 -------------------------------------------------- 1 file changed, 96 deletions(-) delete mode 100644 searx/engines/faroo.py (limited to 'searx/engines') diff --git a/searx/engines/faroo.py b/searx/engines/faroo.py deleted file mode 100644 index a36ecf778..000000000 --- a/searx/engines/faroo.py +++ /dev/null @@ -1,96 +0,0 @@ -""" - Faroo (Web, News) - - @website http://www.faroo.com - @provide-api yes (http://www.faroo.com/hp/api/api.html), require API-key - - @using-api no - @results JSON - @stable yes - @parse url, title, content, publishedDate, img_src -""" - -from json import loads -import datetime -from searx.utils import searx_useragent -from searx.url_utils import urlencode - -# engine dependent config -categories = ['general', 'news'] -paging = True -language_support = True -number_of_results = 10 - -# search-url -url = 'http://www.faroo.com/' -search_url = url + 'instant.json?{query}'\ - '&start={offset}'\ - '&length={number_of_results}'\ - '&l={language}'\ - '&src={categorie}'\ - '&i=false'\ - '&c=false' - -search_category = {'general': 'web', - 'news': 'news'} - - -# do search-request -def request(query, params): - offset = (params['pageno'] - 1) * number_of_results + 1 - categorie = search_category.get(params['category'], 'web') - - if params['language'] == 'all': - language = 'en' - else: - language = params['language'].split('-')[0] - - # if language is not supported, put it in english - if language != 'en' and\ - language != 'de' and\ - language != 'zh': - language = 'en' - - params['url'] = search_url.format(offset=offset, - number_of_results=number_of_results, - query=urlencode({'q': query}), - language=language, - categorie=categorie) - - params['headers']['Referer'] = url - - return params - - 
-# get response from search-request -def response(resp): - # HTTP-Code 429: rate limit exceeded - if resp.status_code == 429: - raise Exception("rate limit has been exceeded!") - - results = [] - - search_res = loads(resp.text) - - # return empty array if there are no results - if not search_res.get('results', {}): - return [] - - # parse results - for result in search_res['results']: - publishedDate = None - result_json = {'url': result['url'], 'title': result['title'], - 'content': result['kwic']} - if result['news']: - result_json['publishedDate'] = \ - datetime.datetime.fromtimestamp(result['date'] / 1000.0) - - # append image result if image url is set - if result['iurl']: - result_json['template'] = 'videos.html' - result_json['thumbnail'] = result['iurl'] - - results.append(result_json) - - # return results - return results -- cgit v1.2.3