diff options
Diffstat (limited to 'searx/engines')
| -rw-r--r-- | searx/engines/qwant.py | 7 | ||||
| -rw-r--r-- | searx/engines/yahoo.py | 208 |
2 files changed, 115 insertions, 100 deletions
diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index 18e6a65d8..249a1f4e4 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -85,14 +85,14 @@ def request(query, params): # add language tag if params['language'] == 'all': - params['url'] += '&locale=en_us' + params['url'] += '&locale=en_US' else: language = match_language( params['language'], supported_languages, language_aliases, ) - params['url'] += '&locale=' + language.replace('-', '_').lower() + params['url'] += '&locale=' + language.replace('-', '_') params['raise_for_httperror'] = False return params @@ -138,6 +138,9 @@ def response(resp): for row in mainline: mainline_type = row.get('type', 'web') + if mainline_type != keyword: + continue + if mainline_type == 'ads': # ignore adds continue diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index 26a4469b9..bd6e6721c 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -1,11 +1,24 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -""" - Yahoo (Web) +# lint: pylint +"""Yahoo Search (Web) + +Languages are supported by mapping the language to a domain. If domain is not +found in :py:obj:`lang2domain` URL ``<lang>.search.yahoo.com`` is used. + """ -from urllib.parse import unquote, urlencode +from urllib.parse import ( + unquote, + urlencode, +) from lxml import html -from searx.utils import extract_text, extract_url, match_language, eval_xpath + +from searx.utils import ( + eval_xpath_getindex, + eval_xpath_list, + extract_text, + match_language, +) # about about = { @@ -21,30 +34,78 @@ about = { categories = ['general'] paging = True time_range_support = True +supported_languages_url = 'https://search.yahoo.com/preferences/languages' +"""Supported languages are read from Yahoo preference page.""" -# search-url -base_url = 'https://search.yahoo.com/' -search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}' -search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time' +time_range_dict = { + 'day': ('1d', 'd'), + 'week': ('1w', 'w'), + 'month': ('1m', 'm'), +} -supported_languages_url = 'https://search.yahoo.com/web/advanced' +language_aliases = { + 'zh-HK': 'zh_chs', + 'zh-CN': 'zh_chs', # dead since 2015 / routed to hk.search.yahoo.com + 'zh-TW': 'zh_cht', +} -# specific xpath variables -results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]" -url_xpath = './/h3/a/@href' -title_xpath = './/h3/a' -content_xpath = './/div[contains(@class, "compText")]' -suggestion_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' AlsoTry ')]//a" +lang2domain = { + 'zh_chs' : 'hk.search.yahoo.com', + 'zh_cht' : 'tw.search.yahoo.com', + 'en' : 'search.yahoo.com', + + 'bg' : 'search.yahoo.com', + 'cs' : 'search.yahoo.com', + 'da' : 'search.yahoo.com', + 'el' : 'search.yahoo.com', + 'et' : 'search.yahoo.com', + 'he' : 'search.yahoo.com', + 'hr' : 'search.yahoo.com', + 'ja' : 'search.yahoo.com', + 'ko' : 'search.yahoo.com', + 'sk' : 'search.yahoo.com', + 'sl' : 'search.yahoo.com', -time_range_dict = {'day': ['1d', 'd'], - 'week': ['1w', 'w'], - 'month': ['1m', 'm']} +} +"""Map language to domain""" -language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'} +def _get_language(params): + lang = language_aliases.get(params['language']) + if lang is None: + lang = match_language( + params['language'], supported_languages, language_aliases + ) + lang = lang.split('-')[0] + logger.debug("params['language']: %s --> %s" , params['language'], lang) + return lang + +def request(query, params): + """build request""" + offset = (params['pageno'] - 1) * 7 + 1 + lang = _get_language(params) + age, btf = time_range_dict.get( + params['time_range'], ('', '')) + + args = urlencode({ + 'p' : query, + 'ei' : 'UTF-8', + 'fl' : 1, + 'vl' : 'lang_' + lang, + 'btf' : btf, + 'fr2' : 'time', + 'age' : age, + 'b' : offset, + 'xargs' :0 + }) + + domain = lang2domain.get(lang, '%s.search.yahoo.com' % lang) + params['url'] = 'https://%s/search?%s' % (domain, args) + return params -# remove yahoo-specific tracking-url def parse_url(url_string): + """remove yahoo-specific tracking-url""" + endings = ['/RS', '/RK'] endpositions = [] start = url_string.find('http', url_string.find('/RU=') + 1) @@ -56,91 +117,46 @@ def parse_url(url_string): if start == 0 or len(endpositions) == 0: return url_string - else: - end = min(endpositions) - return unquote(url_string[start:end]) - - -def _get_url(query, offset, language, time_range): - if time_range in time_range_dict: - return base_url + search_url_with_time.format(offset=offset, - query=urlencode({'p': query}), - lang=language, - age=time_range_dict[time_range][0], - btf=time_range_dict[time_range][1]) - return base_url + search_url.format(offset=offset, - query=urlencode({'p': query}), - lang=language) - - -def _get_language(params): - if params['language'] == 'all': - return 'en' - language = match_language(params['language'], supported_languages, language_aliases) - if language not in language_aliases.values(): - language = language.split('-')[0] - language = language.replace('-', '_').lower() + end = min(endpositions) + return unquote(url_string[start:end]) - return language - - -# do search-request -def request(query, params): - if params['time_range'] and params['time_range'] not in time_range_dict: - return params - - offset = (params['pageno'] - 1) * 10 + 1 - language = _get_language(params) - - params['url'] = _get_url(query, offset, language, params['time_range']) - - # TODO required? - params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\ - .format(lang=language) - - return params - - -# get response from search-request def response(resp): - results = [] + """parse response""" + results = [] dom = html.fromstring(resp.text) - try: - results_num = int(eval_xpath(dom, '//div[@class="compPagination"]/span[last()]/text()')[0] - .split()[0].replace(',', '')) - results.append({'number_of_results': results_num}) - except: - pass - # parse results - for result in eval_xpath(dom, results_xpath): - try: - url = parse_url(extract_url(eval_xpath(result, url_xpath), search_url)) - title = extract_text(eval_xpath(result, title_xpath)[0]) - except: + for result in eval_xpath_list(dom, '//div[contains(@class,"algo-sr")]'): + url = eval_xpath_getindex(result, './/h3/a/@href', 0, default=None) + if url is None: continue + url = parse_url(url) - content = extract_text(eval_xpath(result, content_xpath)[0]) + title = eval_xpath_getindex(result, './/h3/a', 0, default=None) + if title is None: + continue + offset = len(extract_text(title.xpath('span'))) + title = extract_text(title)[offset:] - # append result - results.append({'url': url, - 'title': title, - 'content': content}) + content = eval_xpath_getindex( + result, './/div[contains(@class, "compText")]', 0, default='' + ) + if content: + content = extract_text(content) - # if no suggestion found, return results - suggestions = eval_xpath(dom, suggestion_xpath) - if not suggestions: - return results + # append result + results.append({ + 'url': url, + 'title': title, + 'content': content + }) - # parse suggestion - for suggestion in suggestions: + for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]//table//a'): # append suggestion results.append({'suggestion': extract_text(suggestion)}) - # return results return results @@ -148,13 +164,9 @@ def response(resp): def _fetch_supported_languages(resp): supported_languages = [] dom = html.fromstring(resp.text) - options = eval_xpath(dom, '//div[@id="yschlang"]/span/label/input') - for option in options: - code_parts = eval_xpath(option, './@value')[0][5:].split('_') - if len(code_parts) == 2: - code = code_parts[0] + '-' + code_parts[1].upper() - else: - code = code_parts[0] - supported_languages.append(code) + offset = len('lang_') + + for val in eval_xpath_list(dom, '//div[contains(@class, "lang-item")]/input/@value'): + supported_languages.append( val[offset:] ) return supported_languages |