| author | Markus Heiser <markus.heiser@darmarIT.de> | 2023-03-29 09:47:21 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-03-29 09:47:21 +0200 |
| commit | f950119ca87363aec81591dc4985f11371aa2b3e (patch) | |
| tree | ab893ff1f60d8c969ff0f5c2fad0cff49148aa3c /searx/engines | |
| parent | 64fea2f9cb079bd0055c6a23360097d285204515 (diff) | |
| parent | 6f9e678346e5978a09ee453a62fa133cdc0ee0bd (diff) | |
Merge pull request #2269 from return42/locale-revision
Revision of the locale- and language-handling in SearXNG
Diffstat (limited to 'searx/engines')
25 files changed, 2582 insertions, 1394 deletions
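The per-engine changes below all follow one pattern: the old `supported_languages` / `language_aliases` / `_fetch_supported_languages()` machinery is removed, and each engine instead reads locale information from a module-level `traits: EngineTraits` object and provides a `fetch_traits()` function that fills it. The following is a minimal, illustrative sketch of that pattern, assembled only from the names visible in this diff (`traits.get_region`, `traits.get_language`, `fetch_traits`, `EngineTraitsMap.from_data()` / `set_traits()`); the `EngineTraits` class itself lives in `searx/enginelib/traits.py`, which is outside this excerpt, and the URL and locale entries used here are made up for illustration.

```python
# Illustrative sketch of the engine-module pattern introduced by this PR.
# Names are taken from the diff below; the example URL and locale mappings
# are hypothetical.
from searx.enginelib.traits import EngineTraits

traits: EngineTraits
# The engine loader populates this module-level name, see load_engine():
#   trait_map = EngineTraitsMap.from_data()
#   trait_map.set_traits(engine)


def request(query, params):
    # Map SearXNG's locale (params['searxng_locale'], e.g. 'de-CH') to the
    # region / language codes this engine understands, with a fallback.
    eng_region = traits.get_region(params['searxng_locale'], 'en-US')
    eng_lang = traits.get_language(params['searxng_locale'], 'en')
    # hypothetical search URL, just to show where the values end up
    params['url'] = 'https://example.org/search?q=%s&mkt=%s&lang=%s' % (query, eng_region, eng_lang)
    return params


def fetch_traits(engine_traits: EngineTraits):
    # Runs offline when the engine data is (re)generated: query or scrape the
    # engine and record which SearXNG locales it supports.
    engine_traits.regions['de-CH'] = 'de-ch'      # SearXNG region tag -> engine region code
    engine_traits.languages['de'] = 'de'          # SearXNG language tag -> engine language code
    engine_traits.custom['wiki_netloc'] = {}      # engine-specific extras (see archlinux.py)
```

In the diffs that follow, `request()` reads from `traits` (archlinux.py, bing*.py, dailymotion.py, duckduckgo.py), and the new `fetch_traits()` functions replace the removed `_fetch_supported_languages()` hooks.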
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 52bb5f20d..c8e8e7241 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -11,24 +11,22 @@ usage:: """ +from __future__ import annotations + import sys import copy -from typing import Dict, List, Optional - from os.path import realpath, dirname -from babel.localedata import locale_identifiers + +from typing import TYPE_CHECKING, Dict, Optional + from searx import logger, settings -from searx.data import ENGINES_LANGUAGES -from searx.network import get -from searx.utils import load_module, match_language, gen_useragent +from searx.utils import load_module +if TYPE_CHECKING: + from searx.enginelib import Engine logger = logger.getChild('engines') ENGINE_DIR = dirname(realpath(__file__)) -BABEL_LANGS = [ - lang_parts[0] + '-' + lang_parts[-1] if len(lang_parts) > 1 else lang_parts[0] - for lang_parts in (lang_code.split('_') for lang_code in locale_identifiers()) -] ENGINE_DEFAULT_ARGS = { "engine_type": "online", "inactive": False, @@ -36,8 +34,6 @@ ENGINE_DEFAULT_ARGS = { "timeout": settings["outgoing"]["request_timeout"], "shortcut": "-", "categories": ["general"], - "supported_languages": [], - "language_aliases": {}, "paging": False, "safesearch": False, "time_range_support": False, @@ -52,24 +48,6 @@ ENGINE_DEFAULT_ARGS = { OTHER_CATEGORY = 'other' -class Engine: # pylint: disable=too-few-public-methods - """This class is currently never initialized and only used for type hinting.""" - - name: str - engine: str - shortcut: str - categories: List[str] - supported_languages: List[str] - about: dict - inactive: bool - disabled: bool - language_support: bool - paging: bool - safesearch: bool - time_range_support: bool - timeout: float - - # Defaults for the namespace of an engine module, see :py:func:`load_engine` categories = {'general': []} @@ -136,9 +114,15 @@ def load_engine(engine_data: dict) -> Optional[Engine]: return None update_engine_attributes(engine, engine_data) - set_language_attributes(engine) update_attributes_for_tor(engine) + # avoid cyclic imports + # pylint: disable=import-outside-toplevel + from searx.enginelib.traits import EngineTraitsMap + + trait_map = EngineTraitsMap.from_data() + trait_map.set_traits(engine) + if not is_engine_active(engine): return None @@ -190,60 +174,6 @@ def update_engine_attributes(engine: Engine, engine_data): setattr(engine, arg_name, copy.deepcopy(arg_value)) -def set_language_attributes(engine: Engine): - # assign supported languages from json file - if engine.name in ENGINES_LANGUAGES: - engine.supported_languages = ENGINES_LANGUAGES[engine.name] - - elif engine.engine in ENGINES_LANGUAGES: - # The key of the dictionary ENGINES_LANGUAGES is the *engine name* - # configured in settings.xml. When multiple engines are configured in - # settings.yml to use the same origin engine (python module) these - # additional engines can use the languages from the origin engine. 
- # For this use the configured ``engine: ...`` from settings.yml - engine.supported_languages = ENGINES_LANGUAGES[engine.engine] - - if hasattr(engine, 'language'): - # For an engine, when there is `language: ...` in the YAML settings, the - # engine supports only one language, in this case - # engine.supported_languages should contains this value defined in - # settings.yml - if engine.language not in engine.supported_languages: - raise ValueError( - "settings.yml - engine: '%s' / language: '%s' not supported" % (engine.name, engine.language) - ) - - if isinstance(engine.supported_languages, dict): - engine.supported_languages = {engine.language: engine.supported_languages[engine.language]} - else: - engine.supported_languages = [engine.language] - - # find custom aliases for non standard language codes - for engine_lang in engine.supported_languages: - iso_lang = match_language(engine_lang, BABEL_LANGS, fallback=None) - if ( - iso_lang - and iso_lang != engine_lang - and not engine_lang.startswith(iso_lang) - and iso_lang not in engine.supported_languages - ): - engine.language_aliases[iso_lang] = engine_lang - - # language_support - engine.language_support = len(engine.supported_languages) > 0 - - # assign language fetching method if auxiliary method exists - if hasattr(engine, '_fetch_supported_languages'): - headers = { - 'User-Agent': gen_useragent(), - 'Accept-Language': "en-US,en;q=0.5", # bing needs to set the English language - } - engine.fetch_supported_languages = ( - # pylint: disable=protected-access - lambda: engine._fetch_supported_languages(get(engine.supported_languages_url, headers=headers)) - ) - - def update_attributes_for_tor(engine: Engine) -> bool: if using_tor_proxy(engine) and hasattr(engine, 'onion_url'): engine.search_url = engine.onion_url + getattr(engine, 'search_path', '') diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py index b5e426107..56c3b447f 100644 --- a/searx/engines/archlinux.py +++ b/searx/engines/archlinux.py @@ -1,15 +1,32 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint """ - Arch Linux Wiki +Arch Linux Wiki +~~~~~~~~~~~~~~~ + +This implementation does not use a official API: Mediawiki provides API, but +Arch Wiki blocks access to it. 
- API: Mediawiki provides API, but Arch Wiki blocks access to it """ -from urllib.parse import urlencode, urljoin -from lxml import html +from typing import TYPE_CHECKING +from urllib.parse import urlencode, urljoin, urlparse +import lxml +import babel + +from searx import network from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex +from searx.enginelib.traits import EngineTraits +from searx.locales import language_tag + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + -# about about = { "website": 'https://wiki.archlinux.org/', "wikidata_id": 'Q101445877', @@ -22,125 +39,113 @@ about = { # engine dependent config categories = ['it', 'software wikis'] paging = True -base_url = 'https://wiki.archlinux.org' - -# xpath queries -xpath_results = '//ul[@class="mw-search-results"]/li' -xpath_link = './/div[@class="mw-search-result-heading"]/a' - - -# cut 'en' from 'en-US', 'de' from 'de-CH', and so on -def locale_to_lang_code(locale): - if locale.find('-') >= 0: - locale = locale.split('-')[0] - return locale - - -# wikis for some languages were moved off from the main site, we need to make -# requests to correct URLs to be able to get results in those languages -lang_urls = { - # fmt: off - 'all': { - 'base': 'https://wiki.archlinux.org', - 'search': '/index.php?title=Special:Search&offset={offset}&{query}' - }, - 'de': { - 'base': 'https://wiki.archlinux.de', - 'search': '/index.php?title=Spezial:Suche&offset={offset}&{query}' - }, - 'fr': { - 'base': 'https://wiki.archlinux.fr', - 'search': '/index.php?title=Spécial:Recherche&offset={offset}&{query}' - }, - 'ja': { - 'base': 'https://wiki.archlinuxjp.org', - 'search': '/index.php?title=特別:検索&offset={offset}&{query}' - }, - 'ro': { - 'base': 'http://wiki.archlinux.ro', - 'search': '/index.php?title=Special:Căutare&offset={offset}&{query}' - }, - 'tr': { - 'base': 'http://archtr.org/wiki', - 'search': '/index.php?title=Özel:Ara&offset={offset}&{query}' - } - # fmt: on -} - - -# get base & search URLs for selected language -def get_lang_urls(language): - if language in lang_urls: - return lang_urls[language] - return lang_urls['all'] - - -# Language names to build search requests for -# those languages which are hosted on the main site. -main_langs = { - 'ar': 'العربية', - 'bg': 'Български', - 'cs': 'Česky', - 'da': 'Dansk', - 'el': 'Ελληνικά', - 'es': 'Español', - 'he': 'עברית', - 'hr': 'Hrvatski', - 'hu': 'Magyar', - 'it': 'Italiano', - 'ko': '한국어', - 'lt': 'Lietuviškai', - 'nl': 'Nederlands', - 'pl': 'Polski', - 'pt': 'Português', - 'ru': 'Русский', - 'sl': 'Slovenský', - 'th': 'ไทย', - 'uk': 'Українська', - 'zh': '简体中文', -} -supported_languages = dict(lang_urls, **main_langs) +main_wiki = 'wiki.archlinux.org' -# do search-request def request(query, params): - # translate the locale (e.g. 'en-US') to language code ('en') - language = locale_to_lang_code(params['language']) - - # if our language is hosted on the main site, we need to add its name - # to the query in order to narrow the results to that language - if language in main_langs: - query += ' (' + main_langs[language] + ')' - # prepare the request parameters - query = urlencode({'search': query}) + sxng_lang = params['searxng_locale'].split('-')[0] + netloc = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) + title = traits.custom['title'].get(sxng_lang, 'Special:Search') + base_url = 'https://' + netloc + '/index.php?' 
offset = (params['pageno'] - 1) * 20 - # get request URLs for our language of choice - urls = get_lang_urls(language) - search_url = urls['base'] + urls['search'] - - params['url'] = search_url.format(query=query, offset=offset) + if netloc == main_wiki: + eng_lang: str = traits.get_language(sxng_lang, 'English') + query += ' (' + eng_lang + ')' + elif netloc == 'wiki.archlinuxcn.org': + base_url = 'https://' + netloc + '/wzh/index.php?' + + args = { + 'search': query, + 'title': title, + 'limit': 20, + 'offset': offset, + 'profile': 'default', + } + params['url'] = base_url + urlencode(args) return params -# get response from search-request def response(resp): - # get the base URL for the language in which request was made - language = locale_to_lang_code(resp.search_params['language']) - base_url = get_lang_urls(language)['base'] results = [] + dom = lxml.html.fromstring(resp.text) - dom = html.fromstring(resp.text) + # get the base URL for the language in which request was made + sxng_lang = resp.search_params['searxng_locale'].split('-')[0] + netloc = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) + base_url = 'https://' + netloc + '/index.php?' + + for result in eval_xpath_list(dom, '//ul[@class="mw-search-results"]/li'): + link = eval_xpath_getindex(result, './/div[@class="mw-search-result-heading"]/a', 0) + content = extract_text(result.xpath('.//div[@class="searchresult"]')) + results.append( + { + 'url': urljoin(base_url, link.get('href')), + 'title': extract_text(link), + 'content': content, + } + ) - # parse results - for result in eval_xpath_list(dom, xpath_results): - link = eval_xpath_getindex(result, xpath_link, 0) - href = urljoin(base_url, link.attrib.get('href')) - title = extract_text(link) + return results - results.append({'url': href, 'title': title}) - return results +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages from Archlinix-Wiki. The location of the Wiki address of a + language is mapped in a :py:obj:`custom field + <searx.enginelib.traits.EngineTraits.custom>` (``wiki_netloc``). Depending + on the location, the ``title`` argument in the request is translated. + + .. code:: python + + "custom": { + "wiki_netloc": { + "de": "wiki.archlinux.de", + # ... + "zh": "wiki.archlinuxcn.org" + } + "title": { + "de": "Spezial:Suche", + # ... 
+ "zh": "Special:\u641c\u7d22" + }, + }, + + """ + + engine_traits.custom['wiki_netloc'] = {} + engine_traits.custom['title'] = {} + + title_map = { + 'de': 'Spezial:Suche', + 'fa': 'ویژه:جستجو', + 'ja': '特別:検索', + 'zh': 'Special:搜索', + } + + resp = network.get('https://wiki.archlinux.org/') + if not resp.ok: + print("ERROR: response from wiki.archlinix.org is not OK.") + + dom = lxml.html.fromstring(resp.text) + for a in eval_xpath_list(dom, "//a[@class='interlanguage-link-target']"): + + sxng_tag = language_tag(babel.Locale.parse(a.get('lang'), sep='-')) + # zh_Hans --> zh + sxng_tag = sxng_tag.split('_')[0] + + netloc = urlparse(a.get('href')).netloc + if netloc != 'wiki.archlinux.org': + title = title_map.get(sxng_tag) + if not title: + print("ERROR: title tag from %s (%s) is unknown" % (netloc, sxng_tag)) + continue + engine_traits.custom['wiki_netloc'][sxng_tag] = netloc + engine_traits.custom['title'][sxng_tag] = title + + eng_tag = extract_text(eval_xpath_list(a, ".//span")) + engine_traits.languages[sxng_tag] = eng_tag + + engine_traits.languages['en'] = 'English' diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 783c0056a..0f85c7036 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -1,16 +1,53 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Bing (Web) +"""This is the implementation of the Bing-WEB engine. Some of this +implementations are shared by other engines: + +- :ref:`bing images engine` +- :ref:`bing news engine` +- :ref:`bing videos engine` + +On the `preference page`_ Bing offers a lot of languages an regions (see section +'Search results languages' and 'Country/region'). However, the abundant choice +does not correspond to reality, where Bing has a full-text indexer only for a +limited number of languages. By example: you can select a language like Māori +but you never get a result in this language. + +What comes a bit closer to the truth are the `search-APIs`_ but they don`t seem +to be completely correct either (if you take a closer look you will find some +inaccuracies there too): + +- :py:obj:`searx.engines.bing.bing_traits_url` +- :py:obj:`searx.engines.bing_videos.bing_traits_url` +- :py:obj:`searx.engines.bing_images.bing_traits_url` +- :py:obj:`searx.engines.bing_news.bing_traits_url` + +.. _preference page: https://www.bing.com/account/general +.. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/ -- https://github.com/searx/searx/issues/2019#issuecomment-648227442 """ -# pylint: disable=too-many-branches +# pylint: disable=too-many-branches, invalid-name +from typing import TYPE_CHECKING +import datetime import re -from urllib.parse import urlencode, urlparse, parse_qs +import uuid +from urllib.parse import urlencode from lxml import html -from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language, eval_xpath_getindex -from searx.network import multi_requests, Request +import babel +import babel.languages + +from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex +from searx import network +from searx.locales import language_tag, region_tag +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits about = { "website": 'https://www.bing.com', @@ -21,56 +58,124 @@ about = { "results": 'HTML', } +send_accept_language_header = True +"""Bing tries to guess user's language and territory from the HTTP +Accept-Language. 
Optional the user can select a search-language (can be +different to the UI language) and a region (market code).""" + # engine dependent config categories = ['general', 'web'] paging = True -time_range_support = False -safesearch = False -send_accept_language_header = True -supported_languages_url = 'https://www.bing.com/account/general' -language_aliases = {} - -# search-url -base_url = 'https://www.bing.com/' +time_range_support = True +safesearch = True +safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'} # cookie: ADLT=STRICT -# initial query: https://www.bing.com/search?q=foo&search=&form=QBLH -inital_query = 'search?{query}&search=&form=QBLH' +base_url = 'https://www.bing.com/search' +"""Bing (Web) search URL""" -# following queries: https://www.bing.com/search?q=foo&search=&first=11&FORM=PERE -page_query = 'search?{query}&search=&first={offset}&FORM=PERE' +bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes' +"""Bing (Web) search API description""" def _get_offset_from_pageno(pageno): return (pageno - 1) * 10 + 1 -def request(query, params): +def set_bing_cookies(params, engine_language, engine_region, SID): + + # set cookies + # ----------- + + params['cookies']['_EDGE_V'] = '1' + + # _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw + _EDGE_S = [ + 'F=1', + 'SID=%s' % SID, + 'mkt=%s' % engine_region.lower(), + 'ui=%s' % engine_language.lower(), + ] + params['cookies']['_EDGE_S'] = '&'.join(_EDGE_S) + logger.debug("cookie _EDGE_S=%s", params['cookies']['_EDGE_S']) + + # "_EDGE_CD": "m=zh-tw", + + _EDGE_CD = [ # pylint: disable=invalid-name + 'm=%s' % engine_region.lower(), # search region: zh-cn + 'u=%s' % engine_language.lower(), # UI: en-us + ] + + params['cookies']['_EDGE_CD'] = '&'.join(_EDGE_CD) + ';' + logger.debug("cookie _EDGE_CD=%s", params['cookies']['_EDGE_CD']) - offset = _get_offset_from_pageno(params.get('pageno', 1)) + SRCHHPGUSR = [ # pylint: disable=invalid-name + 'SRCHLANG=%s' % engine_language, + # Trying to set ADLT cookie here seems not to have any effect, I assume + # there is some age verification by a cookie (and/or session ID) needed, + # to disable the SafeSearch. 
+ 'ADLT=%s' % safesearch_types.get(params['safesearch'], 'DEMOTE'), + ] + params['cookies']['SRCHHPGUSR'] = '&'.join(SRCHHPGUSR) + logger.debug("cookie SRCHHPGUSR=%s", params['cookies']['SRCHHPGUSR']) + + +def request(query, params): + """Assemble a Bing-Web request.""" - # logger.debug("params['pageno'] --> %s", params.get('pageno')) - # logger.debug(" offset --> %s", offset) + engine_region = traits.get_region(params['searxng_locale'], 'en-US') + engine_language = traits.get_language(params['searxng_locale'], 'en') - search_string = page_query - if offset == 1: - search_string = inital_query + SID = uuid.uuid1().hex.upper() + CVID = uuid.uuid1().hex.upper() - if params['language'] == 'all': - lang = 'EN' - else: - lang = match_language(params['language'], supported_languages, language_aliases) + set_bing_cookies(params, engine_language, engine_region, SID) - query = 'language:{} {}'.format(lang.split('-')[0].upper(), query) + # build URL query + # --------------- - search_path = search_string.format(query=urlencode({'q': query}), offset=offset) + # query term + page = int(params.get('pageno', 1)) + query_params = { + # fmt: off + 'q': query, + 'pq': query, + 'cvid': CVID, + 'qs': 'n', + 'sp': '-1' + # fmt: on + } - if offset > 1: - referer = base_url + inital_query.format(query=urlencode({'q': query})) + # page + if page > 1: + referer = base_url + '?' + urlencode(query_params) params['headers']['Referer'] = referer logger.debug("headers.Referer --> %s", referer) - params['url'] = base_url + search_path - params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' + query_params['first'] = _get_offset_from_pageno(page) + + if page == 2: + query_params['FORM'] = 'PERE' + elif page > 2: + query_params['FORM'] = 'PERE%s' % (page - 2) + + filters = '' + if params['time_range']: + query_params['filt'] = 'custom' + + if params['time_range'] == 'day': + filters = 'ex1:"ez1"' + elif params['time_range'] == 'week': + filters = 'ex1:"ez2"' + elif params['time_range'] == 'month': + filters = 'ex1:"ez3"' + elif params['time_range'] == 'year': + epoch_1970 = datetime.date(1970, 1, 1) + today_no = (datetime.date.today() - epoch_1970).days + filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no) + + params['url'] = base_url + '?' + urlencode(query_params) + if filters: + params['url'] = params['url'] + '&filters=' + filters return params @@ -107,7 +212,8 @@ def response(resp): url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite')) # Bing can shorten the URL either at the end or in the middle of the string if ( - url_cite.startswith('https://') + url_cite + and url_cite.startswith('https://') and '…' not in url_cite and '...' 
not in url_cite and '›' not in url_cite @@ -127,9 +233,9 @@ def response(resp): # resolve all Bing redirections in parallel request_list = [ - Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve + network.Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve ] - response_list = multi_requests(request_list) + response_list = network.multi_requests(request_list) for i, redirect_response in enumerate(response_list): if not isinstance(redirect_response, Exception): results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location'] @@ -157,27 +263,71 @@ def response(resp): return results -# get supported languages from their site -def _fetch_supported_languages(resp): +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages and regions from Bing-Web.""" + + xpath_market_codes = '//table[1]/tbody/tr/td[3]' + # xpath_country_codes = '//table[2]/tbody/tr/td[2]' + xpath_language_codes = '//table[3]/tbody/tr/td[2]' + + _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes) + + +def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str): + + # insert alias to map from a language (zh) to a language + script (zh_Hans) + engine_traits.languages['zh'] = 'zh-hans' - lang_tags = set() + resp = network.get(url) + + if not resp.ok: + print("ERROR: response from peertube is not OK.") dom = html.fromstring(resp.text) - lang_links = eval_xpath(dom, '//div[@id="language-section"]//li') - for _li in lang_links: + map_lang = {'jp': 'ja'} + for td in eval_xpath(dom, xpath_language_codes): + eng_lang = td.text - href = eval_xpath(_li, './/@href')[0] - (_scheme, _netloc, _path, _params, query, _fragment) = urlparse(href) - query = parse_qs(query, keep_blank_values=True) + if eng_lang in ('en-gb', 'pt-br'): + # language 'en' is already in the list and a language 'en-gb' can't + # be handled in SearXNG, same with pt-br which is covered by pt-pt. 
+ continue - # fmt: off - setlang = query.get('setlang', [None, ])[0] - # example: 'mn-Cyrl-MN' --> '['mn', 'Cyrl-MN'] - lang, nation = (setlang.split('-', maxsplit=1) + [None,])[:2] # fmt: skip - # fmt: on + babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_') + try: + sxng_tag = language_tag(babel.Locale.parse(babel_lang)) + except babel.UnknownLocaleError: + print("ERROR: language (%s) is unknown by babel" % (eng_lang)) + continue + conflict = engine_traits.languages.get(sxng_tag) + if conflict: + if conflict != eng_lang: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang)) + continue + engine_traits.languages[sxng_tag] = eng_lang - tag = lang + '-' + nation if nation else lang - lang_tags.add(tag) + map_region = { + 'en-ID': 'id_ID', + 'no-NO': 'nb_NO', + } - return list(lang_tags) + for td in eval_xpath(dom, xpath_market_codes): + eng_region = td.text + babel_region = map_region.get(eng_region, eng_region).replace('-', '_') + + if eng_region == 'en-WW': + engine_traits.all_locale = eng_region + continue + + try: + sxng_tag = region_tag(babel.Locale.parse(babel_region)) + except babel.UnknownLocaleError: + print("ERROR: region (%s) is unknown by babel" % (eng_region)) + continue + conflict = engine_traits.regions.get(sxng_tag) + if conflict: + if conflict != eng_region: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region)) + continue + engine_traits.regions[sxng_tag] = eng_region diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py index 107ce3cff..bd3a34aa5 100644 --- a/searx/engines/bing_images.py +++ b/searx/engines/bing_images.py @@ -1,20 +1,30 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Bing (Images) - +"""Bing-Images: description see :py:obj:`searx.engines.bing`. 
""" +# pylint: disable=invalid-name + -from json import loads +from typing import TYPE_CHECKING +import uuid +import json from urllib.parse import urlencode from lxml import html -from searx.utils import match_language -from searx.engines.bing import language_aliases -from searx.engines.bing import ( # pylint: disable=unused-import - _fetch_supported_languages, - supported_languages_url, +from searx.enginelib.traits import EngineTraits +from searx.engines.bing import ( + set_bing_cookies, + _fetch_traits, ) +from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits # about about = { @@ -31,77 +41,92 @@ categories = ['images', 'web'] paging = True safesearch = True time_range_support = True -send_accept_language_header = True -supported_languages_url = 'https://www.bing.com/account/general' -number_of_results = 28 -# search-url -base_url = 'https://www.bing.com/' -search_string = ( +base_url = 'https://www.bing.com/images/async' +"""Bing (Images) search URL""" + +bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-image-search/reference/market-codes' +"""Bing (Images) search API description""" + +time_map = { # fmt: off - 'images/search' - '?{query}' - '&count={count}' - '&first={first}' - '&tsc=ImageHoverTitle' + 'day': 60 * 24, + 'week': 60 * 24 * 7, + 'month': 60 * 24 * 31, + 'year': 60 * 24 * 365, # fmt: on -) -time_range_string = '&qft=+filterui:age-lt{interval}' -time_range_dict = {'day': '1440', 'week': '10080', 'month': '43200', 'year': '525600'} - -# safesearch definitions -safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'} +} -# do search-request def request(query, params): - offset = ((params['pageno'] - 1) * number_of_results) + 1 + """Assemble a Bing-Image request.""" - search_path = search_string.format(query=urlencode({'q': query}), count=number_of_results, first=offset) + engine_region = traits.get_region(params['searxng_locale'], 'en-US') + engine_language = traits.get_language(params['searxng_locale'], 'en') - language = match_language(params['language'], supported_languages, language_aliases).lower() + SID = uuid.uuid1().hex.upper() + set_bing_cookies(params, engine_language, engine_region, SID) - params['cookies']['SRCHHPGUSR'] = 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') + # build URL query + # - example: https://www.bing.com/images/async?q=foo&first=155&count=35 - params['cookies']['_EDGE_S'] = 'mkt=' + language + '&ui=' + language + '&F=1' + query_params = { + # fmt: off + 'q': query, + 'async' : 'content', + # to simplify the page count lets use the default of 35 images per page + 'first' : (int(params.get('pageno', 1)) - 1) * 35 + 1, + 'count' : 35, + # fmt: on + } - params['url'] = base_url + search_path - if params['time_range'] in time_range_dict: - params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']]) + # time range + # - example: one year (525600 minutes) 'qft=+filterui:age-lt525600' + + if params['time_range']: + query_params['qft'] = 'filterui:age-lt%s' % time_map[params['time_range']] + + params['url'] = base_url + '?' 
+ urlencode(query_params) return params -# get response from search-request def response(resp): - results = [] + """Get response from Bing-Images""" + results = [] dom = html.fromstring(resp.text) - # parse results - for result in dom.xpath('//div[@class="imgpt"]'): - img_format = result.xpath('./div[contains(@class, "img_info")]/span/text()')[0] - # Microsoft seems to experiment with this code so don't make the path too specific, - # just catch the text section for the first anchor in img_info assuming this to be - # the originating site. - source = result.xpath('./div[contains(@class, "img_info")]//a/text()')[0] + for result in dom.xpath('//ul[contains(@class, "dgControl_list")]/li'): - m = loads(result.xpath('./a/@m')[0]) + metadata = result.xpath('.//a[@class="iusc"]/@m') + if not metadata: + continue - # strip 'Unicode private use area' highlighting, they render to Tux - # the Linux penguin and a standing diamond on my machine... - title = m.get('t', '').replace('\ue000', '').replace('\ue001', '') + metadata = json.loads(result.xpath('.//a[@class="iusc"]/@m')[0]) + title = ' '.join(result.xpath('.//div[@class="infnmpt"]//a/text()')).strip() + img_format = ' '.join(result.xpath('.//div[@class="imgpt"]/div/span/text()')).strip() + source = ' '.join(result.xpath('.//div[@class="imgpt"]//div[@class="lnkw"]//a/text()')).strip() results.append( { 'template': 'images.html', - 'url': m['purl'], - 'thumbnail_src': m['turl'], - 'img_src': m['murl'], - 'content': '', + 'url': metadata['purl'], + 'thumbnail_src': metadata['turl'], + 'img_src': metadata['murl'], + 'content': metadata['desc'], 'title': title, 'source': source, 'img_format': img_format, } ) - return results + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages and regions from Bing-News.""" + + xpath_market_codes = '//table[1]/tbody/tr/td[3]' + # xpath_country_codes = '//table[2]/tbody/tr/td[2]' + xpath_language_codes = '//table[3]/tbody/tr/td[2]' + + _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes) diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 7eea17bb4..d8c63857a 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -1,24 +1,30 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Bing (News) +"""Bing-News: description see :py:obj:`searx.engines.bing`. 
""" -from urllib.parse import ( - urlencode, - urlparse, - parse_qsl, - quote, -) -from datetime import datetime -from dateutil import parser -from lxml import etree -from lxml.etree import XPath -from searx.utils import match_language, eval_xpath_getindex -from searx.engines.bing import ( # pylint: disable=unused-import - language_aliases, - _fetch_supported_languages, - supported_languages_url, +# pylint: disable=invalid-name + +from typing import TYPE_CHECKING +import uuid +from urllib.parse import urlencode + +from lxml import html + +from searx.enginelib.traits import EngineTraits +from searx.engines.bing import ( + set_bing_cookies, + _fetch_traits, ) +from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + # about about = { @@ -34,108 +40,111 @@ about = { categories = ['news'] paging = True time_range_support = True -send_accept_language_header = True - -# search-url -base_url = 'https://www.bing.com/' -search_string = 'news/search?{query}&first={offset}&format=RSS' -search_string_with_time = 'news/search?{query}&first={offset}&qft=interval%3d"{interval}"&format=RSS' -time_range_dict = {'day': '7', 'week': '8', 'month': '9'} - - -def url_cleanup(url_string): - """remove click""" - - parsed_url = urlparse(url_string) - if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx': - query = dict(parse_qsl(parsed_url.query)) - url_string = query.get('url', None) - return url_string - - -def image_url_cleanup(url_string): - """replace the http://*bing.com/th?id=... by https://www.bing.com/th?id=...""" - - parsed_url = urlparse(url_string) - if parsed_url.netloc.endswith('bing.com') and parsed_url.path == '/th': - query = dict(parse_qsl(parsed_url.query)) - url_string = "https://www.bing.com/th?id=" + quote(query.get('id')) - return url_string - - -def _get_url(query, language, offset, time_range): - if time_range in time_range_dict: - search_path = search_string_with_time.format( - # fmt: off - query = urlencode({ - 'q': query, - 'setmkt': language - }), - offset = offset, - interval = time_range_dict[time_range] - # fmt: on - ) - else: - # e.g. setmkt=de-de&setlang=de - search_path = search_string.format( - # fmt: off - query = urlencode({ - 'q': query, - 'setmkt': language - }), - offset = offset - # fmt: on - ) - return base_url + search_path +time_map = { + 'day': '4', + 'week': '8', + 'month': '9', +} +"""A string '4' means *last hour*. We use *last hour* for ``day`` here since the +difference of *last day* and *last week* in the result list is just marginally. +""" + +base_url = 'https://www.bing.com/news/infinitescrollajax' +"""Bing (News) search URL""" + +bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-news-search/reference/market-codes' +"""Bing (News) search API description""" + +mkt_alias = { + 'zh': 'en-WW', + 'zh-CN': 'en-WW', +} +"""Bing News has an official market code 'zh-CN' but we won't get a result with +this market code. For 'zh' and 'zh-CN' we better use the *Worldwide aggregate* +market code (en-WW). 
+""" def request(query, params): + """Assemble a Bing-News request.""" + + sxng_locale = params['searxng_locale'] + engine_region = traits.get_region(mkt_alias.get(sxng_locale, sxng_locale), traits.all_locale) + engine_language = traits.get_language(sxng_locale, 'en') + + SID = uuid.uuid1().hex.upper() + set_bing_cookies(params, engine_language, engine_region, SID) - if params['time_range'] and params['time_range'] not in time_range_dict: - return params + # build URL query + # + # example: https://www.bing.com/news/infinitescrollajax?q=london&first=1 - offset = (params['pageno'] - 1) * 10 + 1 - if params['language'] == 'all': - language = 'en-US' - else: - language = match_language(params['language'], supported_languages, language_aliases) - params['url'] = _get_url(query, language, offset, params['time_range']) + query_params = { + # fmt: off + 'q': query, + 'InfiniteScroll': 1, + # to simplify the page count lets use the default of 10 images per page + 'first' : (int(params.get('pageno', 1)) - 1) * 10 + 1, + # fmt: on + } + + if params['time_range']: + # qft=interval:"7" + query_params['qft'] = 'qft=interval="%s"' % time_map.get(params['time_range'], '9') + + params['url'] = base_url + '?' + urlencode(query_params) return params def response(resp): - + """Get response from Bing-Video""" results = [] - rss = etree.fromstring(resp.content) - namespaces = rss.nsmap - - for item in rss.xpath('./channel/item'): - # url / title / content - url = url_cleanup(eval_xpath_getindex(item, './link/text()', 0, default=None)) - title = eval_xpath_getindex(item, './title/text()', 0, default=url) - content = eval_xpath_getindex(item, './description/text()', 0, default='') - - # publishedDate - publishedDate = eval_xpath_getindex(item, './pubDate/text()', 0, default=None) - try: - publishedDate = parser.parse(publishedDate, dayfirst=False) - except TypeError: - publishedDate = datetime.now() - except ValueError: - publishedDate = datetime.now() - - # thumbnail - thumbnail = eval_xpath_getindex(item, XPath('./News:Image/text()', namespaces=namespaces), 0, default=None) - if thumbnail is not None: - thumbnail = image_url_cleanup(thumbnail) - - # append result - if thumbnail is not None: - results.append( - {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content, 'img_src': thumbnail} - ) - else: - results.append({'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content}) + + if not resp.ok or not resp.text: + return results + + dom = html.fromstring(resp.text) + + for newsitem in dom.xpath('//div[contains(@class, "newsitem")]'): + + url = newsitem.xpath('./@url')[0] + title = ' '.join(newsitem.xpath('.//div[@class="caption"]//a[@class="title"]/text()')).strip() + content = ' '.join(newsitem.xpath('.//div[@class="snippet"]/text()')).strip() + thumbnail = None + author = newsitem.xpath('./@data-author')[0] + metadata = ' '.join(newsitem.xpath('.//div[@class="source"]/span/text()')).strip() + + img_src = newsitem.xpath('.//a[@class="imagelink"]//img/@src') + if img_src: + thumbnail = 'https://www.bing.com/' + img_src[0] + + results.append( + { + 'url': url, + 'title': title, + 'content': content, + 'img_src': thumbnail, + 'author': author, + 'metadata': metadata, + } + ) return results + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages and regions from Bing-News. + + The :py:obj:`description <searx.engines.bing_news.bing_traits_url>` of the + first table says *"query parameter when calling the Video Search API."* + .. thats why I use the 4. 
table "News Category API markets" for the + ``xpath_market_codes``. + + """ + + xpath_market_codes = '//table[4]/tbody/tr/td[3]' + # xpath_country_codes = '//table[2]/tbody/tr/td[2]' + xpath_language_codes = '//table[3]/tbody/tr/td[2]' + + _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes) diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py index 85071de21..8ee0bb66e 100644 --- a/searx/engines/bing_videos.py +++ b/searx/engines/bing_videos.py @@ -1,21 +1,30 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Bing (Videos) - +"""Bing-Videos: description see :py:obj:`searx.engines.bing`. """ +# pylint: disable=invalid-name -from json import loads +from typing import TYPE_CHECKING +import uuid +import json from urllib.parse import urlencode from lxml import html -from searx.utils import match_language -from searx.engines.bing import language_aliases - -from searx.engines.bing import ( # pylint: disable=unused-import - _fetch_supported_languages, - supported_languages_url, +from searx.enginelib.traits import EngineTraits +from searx.engines.bing import ( + set_bing_cookies, + _fetch_traits, ) +from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + about = { "website": 'https://www.bing.com/videos', @@ -26,65 +35,76 @@ about = { "results": 'HTML', } +# engine dependent config categories = ['videos', 'web'] paging = True safesearch = True time_range_support = True -send_accept_language_header = True -number_of_results = 28 -base_url = 'https://www.bing.com/' -search_string = ( +base_url = 'https://www.bing.com/videos/asyncv2' +"""Bing (Videos) async search URL.""" + +bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-video-search/reference/market-codes' +"""Bing (Video) search API description""" + +time_map = { # fmt: off - 'videos/search' - '?{query}' - '&count={count}' - '&first={first}' - '&scope=video' - '&FORM=QBLH' + 'day': 60 * 24, + 'week': 60 * 24 * 7, + 'month': 60 * 24 * 31, + 'year': 60 * 24 * 365, # fmt: on -) -time_range_string = '&qft=+filterui:videoage-lt{interval}' -time_range_dict = {'day': '1440', 'week': '10080', 'month': '43200', 'year': '525600'} - -# safesearch definitions -safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'} +} -# do search-request def request(query, params): - offset = ((params['pageno'] - 1) * number_of_results) + 1 + """Assemble a Bing-Video request.""" - search_path = search_string.format(query=urlencode({'q': query}), count=number_of_results, first=offset) + engine_region = traits.get_region(params['searxng_locale'], 'en-US') + engine_language = traits.get_language(params['searxng_locale'], 'en') - # safesearch cookie - params['cookies']['SRCHHPGUSR'] = 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') + SID = uuid.uuid1().hex.upper() + set_bing_cookies(params, engine_language, engine_region, SID) - # language cookie - language = match_language(params['language'], supported_languages, language_aliases).lower() - params['cookies']['_EDGE_S'] = 'mkt=' + language + '&F=1' + # build URL query + # + # example: https://www.bing.com/videos/asyncv2?q=foo&async=content&first=1&count=35 - # query and paging - params['url'] = base_url + search_path + query_params = { + # fmt: off + 'q': query, + 'async' : 'content', + # to simplify the page count lets use the default of 35 images per page + 'first' : (int(params.get('pageno', 
1)) - 1) * 35 + 1, + 'count' : 35, + # fmt: on + } # time range - if params['time_range'] in time_range_dict: - params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']]) + # + # example: one week (10080 minutes) '&qft= filterui:videoage-lt10080' '&form=VRFLTR' + + if params['time_range']: + query_params['form'] = 'VRFLTR' + query_params['qft'] = ' filterui:videoage-lt%s' % time_map[params['time_range']] + + params['url'] = base_url + '?' + urlencode(query_params) return params -# get response from search-request def response(resp): + """Get response from Bing-Video""" results = [] dom = html.fromstring(resp.text) - for result in dom.xpath('//div[@class="dg_u"]/div[contains(@class, "mc_vtvc")]'): - metadata = loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0]) + for result in dom.xpath('//div[@class="dg_u"]//div[contains(@id, "mc_vtvc_video")]'): + metadata = json.loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0]) info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip() content = '{0} - {1}'.format(metadata['du'], info) - thumbnail = '{0}th?id={1}'.format(base_url, metadata['thid']) + thumbnail = result.xpath('.//div[contains(@class, "mc_vtvc_th")]//img/@src')[0] + results.append( { 'url': metadata['murl'], @@ -96,3 +116,13 @@ def response(resp): ) return results + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages and regions from Bing-Videos.""" + + xpath_market_codes = '//table[1]/tbody/tr/td[3]' + # xpath_country_codes = '//table[2]/tbody/tr/td[2]' + xpath_language_codes = '//table[3]/tbody/tr/td[2]' + + _fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes) diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py index 7dd84dd27..d734ec3c8 100644 --- a/searx/engines/dailymotion.py +++ b/searx/engines/dailymotion.py @@ -1,17 +1,35 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -"""Dailymotion (Videos) +# lint: pylint +""" +Dailymotion (Videos) +~~~~~~~~~~~~~~~~~~~~ + +.. _REST GET: https://developers.dailymotion.com/tools/ +.. _Global API Parameters: https://developers.dailymotion.com/api/#global-parameters +.. _Video filters API: https://developers.dailymotion.com/api/#video-filters +.. _Fields selection: https://developers.dailymotion.com/api/#fields-selection """ -from typing import Set +from typing import TYPE_CHECKING + from datetime import datetime, timedelta from urllib.parse import urlencode import time import babel from searx.exceptions import SearxEngineAPIException -from searx.network import raise_for_httperror +from searx import network from searx.utils import html_to_text +from searx.locales import region_tag, language_tag +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits # about about = { @@ -37,11 +55,24 @@ time_delta_dict = { } safesearch = True -safesearch_params = {2: '&is_created_for_kids=true', 1: '&is_created_for_kids=true', 0: ''} +safesearch_params = { + 2: {'is_created_for_kids': 'true'}, + 1: {'is_created_for_kids': 'true'}, + 0: {}, +} +"""True if this video is "Created for Kids" / intends to target an audience +under the age of 16 (``is_created_for_kids`` in `Video filters API`_ ) +""" -# search-url -# - https://developers.dailymotion.com/tools/ -# - https://www.dailymotion.com/doc/api/obj-video.html +family_filter_map = { + 2: 'true', + 1: 'true', + 0: 'false', +} +"""By default, the family filter is turned on. 
Setting this parameter to +``false`` will stop filtering-out explicit content from searches and global +contexts (``family_filter`` in `Global API Parameters`_ ). +""" result_fields = [ 'allow_embed', @@ -53,27 +84,21 @@ result_fields = [ 'thumbnail_360_url', 'id', ] -search_url = ( - 'https://api.dailymotion.com/videos?' - 'fields={fields}&password_protected={password_protected}&private={private}&sort={sort}&limit={limit}' -).format( - fields=','.join(result_fields), - password_protected='false', - private='false', - sort='relevance', - limit=number_of_results, -) -iframe_src = "https://www.dailymotion.com/embed/video/{video_id}" +"""`Fields selection`_, by default, a few fields are returned. To request more +specific fields, the ``fields`` parameter is used with the list of fields +SearXNG needs in the response to build a video result list. +""" -# The request query filters by 'languages' & 'country', therefore instead of -# fetching only languages we need to fetch locales. -supported_languages_url = 'https://api.dailymotion.com/locales' -supported_languages_iso639: Set[str] = set() +search_url = 'https://api.dailymotion.com/videos?' +"""URL to retrieve a list of videos. +- `REST GET`_ +- `Global API Parameters`_ +- `Video filters API`_ +""" -def init(_engine_settings): - global supported_languages_iso639 - supported_languages_iso639 = set([language.split('_')[0] for language in supported_languages]) +iframe_src = "https://www.dailymotion.com/embed/video/{video_id}" +"""URL template to embed video in SearXNG's result list.""" def request(query, params): @@ -81,34 +106,42 @@ def request(query, params): if not query: return False - language = params['language'] - if language == 'all': - language = 'en-US' - locale = babel.Locale.parse(language, sep='-') + eng_region = traits.get_region(params['searxng_locale'], 'en_US') + eng_lang = traits.get_language(params['searxng_locale'], 'en') - language_iso639 = locale.language - if locale.language not in supported_languages_iso639: - language_iso639 = 'en' - - query_args = { + args = { 'search': query, - 'languages': language_iso639, + 'family_filter': family_filter_map.get(params['safesearch'], 'false'), + 'thumbnail_ratio': 'original', # original|widescreen|square + # https://developers.dailymotion.com/api/#video-filters + 'languages': eng_lang, 'page': params['pageno'], + 'password_protected': 'false', + 'private': 'false', + 'sort': 'relevance', + 'limit': number_of_results, + 'fields': ','.join(result_fields), } - if locale.territory: - localization = locale.language + '_' + locale.territory - if localization in supported_languages: - query_args['country'] = locale.territory + args.update(safesearch_params.get(params['safesearch'], {})) + + # Don't add localization and country arguments if the user does select a + # language (:de, :en, ..) 
+ + if len(params['searxng_locale'].split('-')) > 1: + # https://developers.dailymotion.com/api/#global-parameters + args['localization'] = eng_region + args['country'] = eng_region.split('_')[1] + # Insufficient rights for the `ams_country' parameter of route `GET /videos' + # 'ams_country': eng_region.split('_')[1], time_delta = time_delta_dict.get(params["time_range"]) if time_delta: created_after = datetime.now() - time_delta - query_args['created_after'] = datetime.timestamp(created_after) + args['created_after'] = datetime.timestamp(created_after) - query_str = urlencode(query_args) - params['url'] = search_url + '&' + query_str + safesearch_params.get(params['safesearch'], '') - params['raise_for_httperror'] = False + query_str = urlencode(args) + params['url'] = search_url + query_str return params @@ -123,7 +156,7 @@ def response(resp): if 'error' in search_res: raise SearxEngineAPIException(search_res['error'].get('message')) - raise_for_httperror(resp) + network.raise_for_httperror(resp) # parse results for res in search_res.get('list', []): @@ -167,7 +200,53 @@ def response(resp): return results -# get supported languages from their site -def _fetch_supported_languages(resp): - response_json = resp.json() - return [item['locale'] for item in response_json['list']] +def fetch_traits(engine_traits: EngineTraits): + """Fetch locales & languages from dailymotion. + + Locales fetched from `api/locales <https://api.dailymotion.com/locales>`_. + There are duplications in the locale codes returned from Dailymotion which + can be ignored:: + + en_EN --> en_GB, en_US + ar_AA --> ar_EG, ar_AE, ar_SA + + The language list `api/languages <https://api.dailymotion.com/languages>`_ + contains over 7000 *languages* codes (see PR1071_). We use only those + language codes that are used in the locales. + + .. 
_PR1071: https://github.com/searxng/searxng/pull/1071 + + """ + + resp = network.get('https://api.dailymotion.com/locales') + if not resp.ok: + print("ERROR: response from dailymotion/locales is not OK.") + + for item in resp.json()['list']: + eng_tag = item['locale'] + if eng_tag in ('en_EN', 'ar_AA'): + continue + try: + sxng_tag = region_tag(babel.Locale.parse(eng_tag)) + except babel.UnknownLocaleError: + print("ERROR: item unknown --> %s" % item) + continue + + conflict = engine_traits.regions.get(sxng_tag) + if conflict: + if conflict != eng_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) + continue + engine_traits.regions[sxng_tag] = eng_tag + + locale_lang_list = [x.split('_')[0] for x in engine_traits.regions.values()] + + resp = network.get('https://api.dailymotion.com/languages') + if not resp.ok: + print("ERROR: response from dailymotion/languages is not OK.") + + for item in resp.json()['list']: + eng_tag = item['code'] + if eng_tag in locale_lang_list: + sxng_tag = language_tag(babel.Locale.parse(eng_tag)) + engine_traits.languages[sxng_tag] = eng_tag diff --git a/searx/engines/demo_offline.py b/searx/engines/demo_offline.py index aeb74f443..9d6e3b52d 100644 --- a/searx/engines/demo_offline.py +++ b/searx/engines/demo_offline.py @@ -63,7 +63,7 @@ def search(query, request_params): for row in result_list: entry = { 'query': query, - 'language': request_params['language'], + 'language': request_params['searxng_locale'], 'value': row.get("value"), # choose a result template or comment out to use the *default* 'template': 'key-value.html', diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 2a7956ca8..85e977bdb 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -1,71 +1,207 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""DuckDuckGo Lite +""" +DuckDuckGo Lite +~~~~~~~~~~~~~~~ """ -from json import loads - -from lxml.html import fromstring +from typing import TYPE_CHECKING +from urllib.parse import urlencode +import json +import babel +import lxml.html +from searx import ( + network, + locales, + redislib, +) +from searx import redisdb from searx.utils import ( - dict_subset, eval_xpath, eval_xpath_getindex, extract_text, - match_language, ) -from searx.network import get +from searx.enginelib.traits import EngineTraits +from searx.exceptions import SearxEngineAPIException + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits -# about about = { "website": 'https://lite.duckduckgo.com/lite/', "wikidata_id": 'Q12805', - "official_api_documentation": 'https://duckduckgo.com/api', "use_official_api": False, "require_api_key": False, "results": 'HTML', } +send_accept_language_header = True +"""DuckDuckGo-Lite tries to guess user's prefered language from the HTTP +``Accept-Language``. Optional the user can select a region filter (but not a +language). 
+""" + # engine dependent config categories = ['general', 'web'] paging = True -supported_languages_url = 'https://duckduckgo.com/util/u588.js' time_range_support = True -send_accept_language_header = True +safesearch = True # user can't select but the results are filtered -language_aliases = { - 'ar-SA': 'ar-XA', - 'es-419': 'es-XL', - 'ja': 'jp-JP', - 'ko': 'kr-KR', - 'sl-SI': 'sl-SL', - 'zh-TW': 'tzh-TW', - 'zh-HK': 'tzh-HK', -} +url = 'https://lite.duckduckgo.com/lite/' +# url_ping = 'https://duckduckgo.com/t/sl_l' time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} +form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'} -# search-url -url = 'https://lite.duckduckgo.com/lite/' -url_ping = 'https://duckduckgo.com/t/sl_l' -# match query's language to a region code that duckduckgo will accept -def get_region_code(lang, lang_list=None): - if lang == 'all': - return None +def cache_vqd(query, value): + """Caches a ``vqd`` value from a query. + + The vqd value depends on the query string and is needed for the follow up + pages or the images loaded by a XMLHttpRequest: + + - DuckDuckGo Web: `https://links.duckduckgo.com/d.js?q=...&vqd=...` + - DuckDuckGo Images: `https://duckduckgo.com/i.js??q=...&vqd=...` + + """ + c = redisdb.client() + if c: + logger.debug("cache vqd value: %s", value) + key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query) + c.set(key, value, ex=600) + + +def get_vqd(query, headers): + """Returns the ``vqd`` that fits to the *query*. If there is no ``vqd`` cached + (:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd value from the + response. + + """ + value = None + c = redisdb.client() + if c: + key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query) + value = c.get(key) + if value: + value = value.decode('utf-8') + logger.debug("re-use cached vqd value: %s", value) + return value - lang_code = match_language(lang, lang_list or [], language_aliases, 'wt-WT') - lang_parts = lang_code.split('-') + query_url = 'https://duckduckgo.com/?{query}&iar=images'.format(query=urlencode({'q': query})) + res = network.get(query_url, headers=headers) + content = res.text + if content.find('vqd=\'') == -1: + raise SearxEngineAPIException('Request failed') + value = content[content.find('vqd=\'') + 5 :] + value = value[: value.find('\'')] + logger.debug("new vqd value: %s", value) + cache_vqd(query, value) + return value - # country code goes first - return lang_parts[1].lower() + '-' + lang_parts[0].lower() + +def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'): + """Get DuckDuckGo's language identifier from SearXNG's locale. + + DuckDuckGo defines its lanaguages by region codes (see + :py:obj:`fetch_traits`). + + To get region and language of a DDG service use: + + .. code: python + + eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) + eng_lang = get_ddg_lang(traits, params['searxng_locale']) + + It might confuse, but the ``l`` value of the cookie is what SearXNG calls + the *region*: + + .. code:: python + + # !ddi paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'} + params['cookies']['ad'] = eng_lang + params['cookies']['ah'] = eng_region + params['cookies']['l'] = eng_region + + .. hint:: + + `DDG-lite <https://lite.duckduckgo.com/lite>`__ does not offer a language + selection to the user, only a region can be selected by the user + (``eng_region`` from the example above). 
DDG-lite stores the selected + region in a cookie:: + + params['cookies']['kl'] = eng_region # 'ar-es' + + """ + return eng_traits.custom['lang_region'].get(sxng_locale, eng_traits.get_language(sxng_locale, default)) + + +ddg_reg_map = { + 'tw-tzh': 'zh_TW', + 'hk-tzh': 'zh_HK', + 'ct-ca': 'skip', # ct-ca and es-ca both map to ca_ES + 'es-ca': 'ca_ES', + 'id-en': 'id_ID', + 'no-no': 'nb_NO', + 'jp-jp': 'ja_JP', + 'kr-kr': 'ko_KR', + 'xa-ar': 'ar_SA', + 'sl-sl': 'sl_SI', + 'th-en': 'th_TH', + 'vn-en': 'vi_VN', +} + +ddg_lang_map = { + # use ar --> ar_EG (Egypt's arabic) + "ar_DZ": 'lang_region', + "ar_JO": 'lang_region', + "ar_SA": 'lang_region', + # use bn --> bn_BD + 'bn_IN': 'lang_region', + # use de --> de_DE + 'de_CH': 'lang_region', + # use en --> en_US, + 'en_AU': 'lang_region', + 'en_CA': 'lang_region', + 'en_GB': 'lang_region', + # Esperanto + 'eo_XX': 'eo', + # use es --> es_ES, + 'es_AR': 'lang_region', + 'es_CL': 'lang_region', + 'es_CO': 'lang_region', + 'es_CR': 'lang_region', + 'es_EC': 'lang_region', + 'es_MX': 'lang_region', + 'es_PE': 'lang_region', + 'es_UY': 'lang_region', + 'es_VE': 'lang_region', + # use fr --> rf_FR + 'fr_CA': 'lang_region', + 'fr_CH': 'lang_region', + 'fr_BE': 'lang_region', + # use nl --> nl_NL + 'nl_BE': 'lang_region', + # use pt --> pt_PT + 'pt_BR': 'lang_region', + # skip these languages + 'od_IN': 'skip', + 'io_XX': 'skip', + 'tokipona_XX': 'skip', +} def request(query, params): + eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) + # eng_lang = get_ddg_lang(traits, params['searxng_locale']) + params['url'] = url params['method'] = 'POST' - params['data']['q'] = query # The API is not documented, so we do some reverse engineering and emulate @@ -88,23 +224,19 @@ def request(query, params): params['data']['s'] = offset params['data']['dc'] = offset + 1 + # request needs a vqd argument + params['data']['vqd'] = get_vqd(query, params["headers"]) + # initial page does not have additional data in the input form if params['pageno'] > 1: - # request the second page (and more pages) needs 'o' and 'api' arguments - params['data']['o'] = 'json' - params['data']['api'] = 'd.js' - # initial page does not have additional data in the input form - if params['pageno'] > 2: - # request the third page (and more pages) some more arguments - params['data']['nextParams'] = '' - params['data']['v'] = '' - params['data']['vqd'] = '' + params['data']['o'] = form_data.get('o', 'json') + params['data']['api'] = form_data.get('api', 'd.js') + params['data']['nextParams'] = form_data.get('nextParams', '') + params['data']['v'] = form_data.get('v', 'l') - region_code = get_region_code(params['language'], supported_languages) - if region_code: - params['data']['kl'] = region_code - params['cookies']['kl'] = region_code + params['data']['kl'] = eng_region + params['cookies']['kl'] = eng_region params['data']['df'] = '' if params['time_range'] in time_range_dict: @@ -116,26 +248,40 @@ def request(query, params): return params -# get response from search-request def response(resp): - headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie']) - get(url_ping, headers=headers_ping) - if resp.status_code == 303: return [] results = [] - doc = fromstring(resp.text) + doc = lxml.html.fromstring(resp.text) result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') - if not len(result_table) >= 3: + + if len(result_table) == 2: + # some locales (at least China) does not have a "next page" button 
and + # the layout of the HTML tables is different. + result_table = result_table[1] + elif not len(result_table) >= 3: # no more results return [] - result_table = result_table[2] + else: + result_table = result_table[2] + # update form data from response + form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..') + if len(form): + + form = form[0] + form_data['v'] = eval_xpath(form, '//input[@name="v"]/@value')[0] + form_data['api'] = eval_xpath(form, '//input[@name="api"]/@value')[0] + form_data['o'] = eval_xpath(form, '//input[@name="o"]/@value')[0] + logger.debug('form_data: %s', form_data) + + value = eval_xpath(form, '//input[@name="vqd"]/@value')[0] + query = resp.search_params['data']['q'] + cache_vqd(query, value) tr_rows = eval_xpath(result_table, './/tr') - # In the last <tr> is the form of the 'previous/next page' links tr_rows = tr_rows[:-1] @@ -172,15 +318,105 @@ def response(resp): return results -# get supported languages from their site -def _fetch_supported_languages(resp): +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages & regions from DuckDuckGo. + + SearXNG's ``all`` locale maps DuckDuckGo's "Alle regions" (``wt-wt``). + DuckDuckGo's language "Browsers prefered language" (``wt_WT``) makes no + sense in a SearXNG request since SearXNG's ``all`` will not add a + ``Accept-Language`` HTTP header. The value in ``engine_traits.all_locale`` + is ``wt-wt`` (the region). + + Beside regions DuckDuckGo also defines its lanaguages by region codes. By + example these are the english languages in DuckDuckGo: + + - en_US + - en_AU + - en_CA + - en_GB + + The function :py:obj:`get_ddg_lang` evaluates DuckDuckGo's language from + SearXNG's locale. - # response is a js file with regions as an embedded object - response_page = resp.text - response_page = response_page[response_page.find('regions:{') + 8 :] - response_page = response_page[: response_page.find('}') + 1] + """ + # pylint: disable=too-many-branches, too-many-statements + # fetch regions + + engine_traits.all_locale = 'wt-wt' + + # updated from u588 to u661 / should be updated automatically? 
+ resp = network.get('https://duckduckgo.com/util/u661.js') + + if not resp.ok: + print("ERROR: response from DuckDuckGo is not OK.") + + pos = resp.text.find('regions:{') + 8 + js_code = resp.text[pos:] + pos = js_code.find('}') + 1 + regions = json.loads(js_code[:pos]) + + for eng_tag, name in regions.items(): + + if eng_tag == 'wt-wt': + engine_traits.all_locale = 'wt-wt' + continue + + region = ddg_reg_map.get(eng_tag) + if region == 'skip': + continue - regions_json = loads(response_page) - supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys()) + if not region: + eng_territory, eng_lang = eng_tag.split('-') + region = eng_lang + '_' + eng_territory.upper() - return list(supported_languages) + try: + sxng_tag = locales.region_tag(babel.Locale.parse(region)) + except babel.UnknownLocaleError: + print("ERROR: %s (%s) -> %s is unknown by babel" % (name, eng_tag, region)) + continue + + conflict = engine_traits.regions.get(sxng_tag) + if conflict: + if conflict != eng_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) + continue + engine_traits.regions[sxng_tag] = eng_tag + + # fetch languages + + engine_traits.custom['lang_region'] = {} + + pos = resp.text.find('languages:{') + 10 + js_code = resp.text[pos:] + pos = js_code.find('}') + 1 + js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"') + languages = json.loads(js_code) + + for eng_lang, name in languages.items(): + + if eng_lang == 'wt_WT': + continue + + babel_tag = ddg_lang_map.get(eng_lang, eng_lang) + if babel_tag == 'skip': + continue + + try: + + if babel_tag == 'lang_region': + sxng_tag = locales.region_tag(babel.Locale.parse(eng_lang)) + engine_traits.custom['lang_region'][sxng_tag] = eng_lang + continue + + sxng_tag = locales.language_tag(babel.Locale.parse(babel_tag)) + + except babel.UnknownLocaleError: + print("ERROR: language %s (%s) is unknown by babel" % (name, eng_lang)) + continue + + conflict = engine_traits.languages.get(sxng_tag) + if conflict: + if conflict != eng_lang: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang)) + continue + engine_traits.languages[sxng_tag] = eng_lang diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index 7ed0de35c..39fed87e7 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -1,22 +1,33 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""DuckDuckGo (Instant Answer API) +""" +DuckDuckGo Instant Answer API +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented but from +reverse engineering we can see that some services (e.g. instant answers) still +in use from the DDG search engine. + +As far we can say the *instant answers* API does not support languages, or at +least we could not find out how language support should work. It seems that +most of the features are based on English terms. 
""" -import json +from typing import TYPE_CHECKING + from urllib.parse import urlencode, urlparse, urljoin from lxml import html from searx.data import WIKIDATA_UNITS -from searx.engines.duckduckgo import language_aliases -from searx.engines.duckduckgo import ( # pylint: disable=unused-import - _fetch_supported_languages, - supported_languages_url, -) -from searx.utils import extract_text, html_to_text, match_language, get_string_replaces_function +from searx.utils import extract_text, html_to_text, get_string_replaces_function from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom +if TYPE_CHECKING: + import logging + + logger: logging.Logger + # about about = { "website": 'https://duckduckgo.com/', @@ -37,7 +48,7 @@ replace_http_by_https = get_string_replaces_function({'http:': 'https:'}) def is_broken_text(text): - """duckduckgo may return something like "<a href="xxxx">http://somewhere Related website<a/>" + """duckduckgo may return something like ``<a href="xxxx">http://somewhere Related website<a/>`` The href URL is broken, the "Related website" may contains some HTML. @@ -62,8 +73,6 @@ def result_to_text(text, htmlResult): def request(query, params): params['url'] = URL.format(query=urlencode({'q': query})) - language = match_language(params['language'], supported_languages, language_aliases) - language = language.split('-')[0] return params @@ -71,7 +80,7 @@ def response(resp): # pylint: disable=too-many-locals, too-many-branches, too-many-statements results = [] - search_res = json.loads(resp.text) + search_res = resp.json() # search_res.get('Entity') possible values (not exhaustive) : # * continent / country / department / location / waterfall @@ -235,7 +244,7 @@ def unit_to_str(unit): def area_to_str(area): - """parse {'unit': 'http://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}""" + """parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``""" unit = unit_to_str(area.get('unit')) if unit is not None: try: diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py index 19f649ef4..d8a6f1340 100644 --- a/searx/engines/duckduckgo_images.py +++ b/searx/engines/duckduckgo_images.py @@ -1,26 +1,30 @@ # SPDX-License-Identifier: AGPL-3.0-or-later """ - DuckDuckGo (Images) +DuckDuckGo Images +~~~~~~~~~~~~~~~~~ """ -from json import loads +from typing import TYPE_CHECKING from urllib.parse import urlencode -from searx.exceptions import SearxEngineAPIException -from searx.engines.duckduckgo import get_region_code -from searx.engines.duckduckgo import ( # pylint: disable=unused-import - _fetch_supported_languages, - supported_languages_url, + +from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import +from searx.engines.duckduckgo import ( + get_ddg_lang, + get_vqd, ) -from searx.network import get +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits # about about = { "website": 'https://duckduckgo.com/', "wikidata_id": 'Q12805', - "official_api_documentation": { - 'url': 'https://duckduckgo.com/api', - 'comment': 'but images are not supported', - }, "use_official_api": False, "require_api_key": False, "results": 'JSON (site requires js to get images)', @@ -32,70 +36,64 @@ paging = True safesearch = True send_accept_language_header = True -# search-url -images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}' -site_url = 
'https://duckduckgo.com/?{query}&iar=images&iax=1&ia=images' +safesearch_cookies = {0: '-2', 1: None, 2: '1'} +safesearch_args = {0: '1', 1: None, 2: '1'} -# run query in site to get vqd number needed for requesting images -# TODO: find a way to get this number without an extra request (is it a hash of the query?) -def get_vqd(query, headers): - query_url = site_url.format(query=urlencode({'q': query})) - res = get(query_url, headers=headers) - content = res.text - if content.find('vqd=\'') == -1: - raise SearxEngineAPIException('Request failed') - vqd = content[content.find('vqd=\'') + 5 :] - vqd = vqd[: vqd.find('\'')] - return vqd +def request(query, params): + eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) + eng_lang = get_ddg_lang(traits, params['searxng_locale']) -# do search-request -def request(query, params): - # to avoid running actual external requests when testing - if 'is_test' not in params: - vqd = get_vqd(query, params['headers']) - else: - vqd = '12345' + args = { + 'q': query, + 'o': 'json', + # 'u': 'bing', + 'l': eng_region, + 'vqd': get_vqd(query, params["headers"]), + } - offset = (params['pageno'] - 1) * 50 + if params['pageno'] > 1: + args['s'] = (params['pageno'] - 1) * 100 - safesearch = params['safesearch'] - 1 + params['cookies']['ad'] = eng_lang # zh_CN + params['cookies']['ah'] = eng_region # "us-en,de-de" + params['cookies']['l'] = eng_region # "hk-tzh" + logger.debug("cookies: %s", params['cookies']) - region_code = get_region_code(params['language'], lang_list=supported_languages) - if region_code: - params['url'] = images_url.format( - query=urlencode({'q': query, 'l': region_code}), offset=offset, safesearch=safesearch, vqd=vqd - ) - else: - params['url'] = images_url.format(query=urlencode({'q': query}), offset=offset, safesearch=safesearch, vqd=vqd) + safe_search = safesearch_cookies.get(params['safesearch']) + if safe_search is not None: + params['cookies']['p'] = safe_search # "-2", "1" + safe_search = safesearch_args.get(params['safesearch']) + if safe_search is not None: + args['p'] = safe_search # "-1", "1" + + args = urlencode(args) + params['url'] = 'https://duckduckgo.com/i.js?{args}&f={f}'.format(args=args, f=',,,,,') + + params['headers']['Accept'] = 'application/json, text/javascript, */*; q=0.01' + params['headers']['Referer'] = 'https://duckduckgo.com/' + params['headers']['X-Requested-With'] = 'XMLHttpRequest' + logger.debug("headers: %s", params['headers']) return params -# get response from search-request def response(resp): results = [] + res_json = resp.json() - content = resp.text - res_json = loads(content) - - # parse results for result in res_json['results']: - title = result['title'] - url = result['url'] - thumbnail = result['thumbnail'] - image = result['image'] - - # append result results.append( { 'template': 'images.html', - 'title': title, + 'title': result['title'], 'content': '', - 'thumbnail_src': thumbnail, - 'img_src': image, - 'url': url, + 'thumbnail_src': result['thumbnail'], + 'img_src': result['image'], + 'url': result['url'], + 'img_format': '%s x %s' % (result['width'], result['height']), + 'source': result['source'], } ) diff --git a/searx/engines/duckduckgo_weather.py b/searx/engines/duckduckgo_weather.py index 0540cbcb5..4f0ce1b49 100644 --- a/searx/engines/duckduckgo_weather.py +++ b/searx/engines/duckduckgo_weather.py @@ -1,13 +1,29 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""DuckDuckGo Weather""" +""" +DuckDuckGo Weather +~~~~~~~~~~~~~~~~~~ +""" +from 
typing import TYPE_CHECKING from json import loads from urllib.parse import quote from datetime import datetime from flask_babel import gettext +from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import +from searx.engines.duckduckgo import get_ddg_lang +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + + about = { "website": 'https://duckduckgo.com/', "wikidata_id": 'Q12805', @@ -17,9 +33,11 @@ about = { "results": "JSON", } -categories = ["others"] +send_accept_language_header = True -url = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}" +# engine dependent config +categories = ["others"] +URL = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}" def generate_condition_table(condition): @@ -72,8 +90,17 @@ def generate_day_table(day): def request(query, params): - params["url"] = url.format(query=quote(query), lang=params['language'].split('-')[0]) + eng_region = traits.get_region(params['searxng_locale'], traits.all_locale) + eng_lang = get_ddg_lang(traits, params['searxng_locale']) + + # !ddw paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'} + params['cookies']['ad'] = eng_lang + params['cookies']['ah'] = eng_region + params['cookies']['l'] = eng_region + logger.debug("cookies: %s", params['cookies']) + + params["url"] = URL.format(query=quote(query), lang=eng_lang.split('_')[0]) return params diff --git a/searx/engines/gentoo.py b/searx/engines/gentoo.py index 856c93710..f0cb6a794 100644 --- a/searx/engines/gentoo.py +++ b/searx/engines/gentoo.py @@ -25,6 +25,7 @@ base_url = 'https://wiki.gentoo.org' # xpath queries xpath_results = '//ul[@class="mw-search-results"]/li' xpath_link = './/div[@class="mw-search-result-heading"]/a' +xpath_content = './/div[@class="searchresult"]' # cut 'en' from 'en-US', 'de' from 'de-CH', and so on @@ -77,8 +78,6 @@ main_langs = { 'uk': 'Українська', 'zh': '简体中文', } -supported_languages = dict(lang_urls, **main_langs) - # do search-request def request(query, params): @@ -118,7 +117,8 @@ def response(resp): link = result.xpath(xpath_link)[0] href = urljoin(base_url, link.attrib.get('href')) title = extract_text(link) + content = extract_text(result.xpath(xpath_content)) - results.append({'url': href, 'title': title}) + results.append({'url': href, 'title': title, 'content': content}) return results diff --git a/searx/engines/google.py b/searx/engines/google.py index bdb351432..708068f3a 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -1,34 +1,39 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""This is the implementation of the google WEB engine. Some of this -implementations are shared by other engines: +"""This is the implementation of the Google WEB engine. Some of this +implementations (manly the :py:obj:`get_google_info`) are shared by other +engines: - :ref:`google images engine` - :ref:`google news engine` - :ref:`google videos engine` - -The google WEB engine itself has a special setup option: - -.. code:: yaml - - - name: google - ... - use_mobile_ui: false - -``use_mobile_ui``: (default: ``false``) - Enables to use *mobile endpoint* to bypass the google blocking (see - :issue:`159`). On the mobile UI of Google Search, the button :guilabel:`More - results` is not affected by Google rate limiting and we can still do requests - while actively blocked by the original Google search. 
By activate - ``use_mobile_ui`` this behavior is simulated by adding the parameter - ``async=use_ac:true,_fmt:pc`` to the :py:func:`request`. +- :ref:`google scholar engine` +- :ref:`google autocomplete` """ +from typing import TYPE_CHECKING + +import re from urllib.parse import urlencode from lxml import html -from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex +import babel +import babel.core +import babel.languages + +from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex +from searx.locales import language_tag, region_tag, get_offical_locales +from searx import network from searx.exceptions import SearxEngineCaptchaException +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + # about about = { @@ -45,64 +50,6 @@ categories = ['general', 'web'] paging = True time_range_support = True safesearch = True -send_accept_language_header = True -use_mobile_ui = False -supported_languages_url = 'https://www.google.com/preferences?#languages' - -# based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests -google_domains = { - 'BG': 'google.bg', # Bulgaria - 'CZ': 'google.cz', # Czech Republic - 'DE': 'google.de', # Germany - 'DK': 'google.dk', # Denmark - 'AT': 'google.at', # Austria - 'CH': 'google.ch', # Switzerland - 'GR': 'google.gr', # Greece - 'AU': 'google.com.au', # Australia - 'CA': 'google.ca', # Canada - 'GB': 'google.co.uk', # United Kingdom - 'ID': 'google.co.id', # Indonesia - 'IE': 'google.ie', # Ireland - 'IN': 'google.co.in', # India - 'MY': 'google.com.my', # Malaysia - 'NZ': 'google.co.nz', # New Zealand - 'PH': 'google.com.ph', # Philippines - 'SG': 'google.com.sg', # Singapore - 'US': 'google.com', # United States (google.us) redirects to .com - 'ZA': 'google.co.za', # South Africa - 'AR': 'google.com.ar', # Argentina - 'CL': 'google.cl', # Chile - 'ES': 'google.es', # Spain - 'MX': 'google.com.mx', # Mexico - 'EE': 'google.ee', # Estonia - 'FI': 'google.fi', # Finland - 'BE': 'google.be', # Belgium - 'FR': 'google.fr', # France - 'IL': 'google.co.il', # Israel - 'HR': 'google.hr', # Croatia - 'HU': 'google.hu', # Hungary - 'IT': 'google.it', # Italy - 'JP': 'google.co.jp', # Japan - 'KR': 'google.co.kr', # South Korea - 'LT': 'google.lt', # Lithuania - 'LV': 'google.lv', # Latvia - 'NO': 'google.no', # Norway - 'NL': 'google.nl', # Netherlands - 'PL': 'google.pl', # Poland - 'BR': 'google.com.br', # Brazil - 'PT': 'google.pt', # Portugal - 'RO': 'google.ro', # Romania - 'RU': 'google.ru', # Russia - 'SK': 'google.sk', # Slovakia - 'SI': 'google.si', # Slovenia - 'SE': 'google.se', # Sweden - 'TH': 'google.co.th', # Thailand - 'TR': 'google.com.tr', # Turkey - 'UA': 'google.com.ua', # Ukraine - 'CN': 'google.com.hk', # There is no google.cn, we use .com.hk for zh-CN - 'HK': 'google.com.hk', # Hong Kong - 'TW': 'google.com.tw', # Taiwan -} time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} @@ -112,50 +59,50 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'} # specific xpath variables # ------------------------ -results_xpath = './/div[@data-sokoban-container]' +results_xpath = './/div[contains(@jscontroller, "SC7lYd")]' title_xpath = './/a/h3[1]' href_xpath = './/a[h3]/@href' -content_xpath = './/div[@data-content-feature=1]' - -# google *sections* are no usual *results*, we ignore them -g_section_with_header = './g-section-with-header' - +content_xpath = 
'.//div[@data-sncf]' # Suggestions are links placed in a *card-section*, we extract only the text # from the links not the links itself. suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a' +# UI_ASYNC = 'use_ac:true,_fmt:html' # returns a HTTP 500 when user search for +# # celebrities like '!google natasha allegri' +# # or '!google chris evans' +UI_ASYNC = 'use_ac:true,_fmt:prog' +"""Format of the response from UI's async request.""" + -def get_lang_info(params, lang_list, custom_aliases, supported_any_language): - """Composing various language properties for the google engines. +def get_google_info(params, eng_traits): + """Composing various (language) properties for the google engines (:ref:`google + API`). This function is called by the various google engines (:ref:`google web engine`, :ref:`google images engine`, :ref:`google news engine` and :ref:`google videos engine`). - :param dict param: request parameters of the engine - - :param list lang_list: list of supported languages of the engine - :py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>` - - :param dict lang_list: custom aliases for non standard language codes - (used when calling :py:func:`searx.utils.match_language`) + :param dict param: Request parameters of the engine. At least + a ``searxng_locale`` key should be in the dictionary. - :param bool supported_any_language: When a language is not specified, the - language interpretation is left up to Google to decide how the search - results should be delivered. This argument is ``True`` for the google - engine and ``False`` for the other engines (google-images, -news, - -scholar, -videos). + :param eng_traits: Engine's traits fetched from google preferences + (:py:obj:`searx.enginelib.traits.EngineTraits`) :rtype: dict :returns: Py-Dictionary with the key/value pairs: language: - Return value from :py:func:`searx.utils.match_language` + The language code that is used by google (e.g. ``lang_en`` or + ``lang_zh-TW``) country: - The country code (e.g. US, AT, CA, FR, DE ..) + The country code that is used by google (e.g. ``US`` or ``TW``) + + locale: + A instance of :py:obj:`babel.core.Locale` build from the + ``searxng_locale`` value. subdomain: Google subdomain :py:obj:`google_domains` that fits to the country @@ -165,52 +112,67 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language): Py-Dictionary with additional request arguments (can be passed to :py:func:`urllib.parse.urlencode`). + - ``hl`` parameter: specifies the interface language of user interface. + - ``lr`` parameter: restricts search results to documents written in + a particular language. + - ``cr`` parameter: restricts search results to documents + originating in a particular country. + - ``ie`` parameter: sets the character encoding scheme that should + be used to interpret the query string ('utf8'). + - ``oe`` parameter: sets the character encoding scheme that should + be used to decode the XML result ('utf8'). + headers: Py-Dictionary with additional HTTP headers (can be passed to request's headers) + + - ``Accept: '*/*`` + """ + ret_val = { 'language': None, 'country': None, 'subdomain': None, 'params': {}, 'headers': {}, + 'cookies': {}, + 'locale': None, } - # language ... 
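The docstring above lists the keys of the returned dictionary; to make that shape concrete, a rough sketch of what a Google engine might get back for ``searxng_locale = 'de-CH'`` (all values are illustrative, the real ones come from the fetched traits)::

    google_info = get_google_info(params, traits)
    # roughly:
    # {
    #     'language': 'lang_de',
    #     'country': 'CH',
    #     'locale': babel.Locale('de', territory='CH'),
    #     'subdomain': 'www.google.ch',
    #     'params': {'hl': 'de', 'lr': 'lang_de', 'cr': 'countryCH',
    #                'gl': 'CH', 'ie': 'utf8', 'oe': 'utf8'},
    #     'headers': {'Accept': '*/*'},
    #     'cookies': {'CONSENT': 'YES+'},
    # }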
+ sxng_locale = params.get('searxng_locale', 'all') + try: + locale = babel.Locale.parse(sxng_locale, sep='-') + except babel.core.UnknownLocaleError: + locale = None - _lang = params['language'] - _any_language = _lang.lower() == 'all' - if _any_language: - _lang = 'en-US' - language = match_language(_lang, lang_list, custom_aliases) - ret_val['language'] = language + eng_lang = eng_traits.get_language(sxng_locale, 'lang_en') + lang_code = eng_lang.split('_')[-1] # lang_zh-TW --> zh-TW / lang_en --> en + country = eng_traits.get_region(sxng_locale, eng_traits.all_locale) - # country ... + # Test zh_hans & zh_hant --> in the topmost links in the result list of list + # TW and HK you should a find wiktionary.org zh_hant link. In the result + # list of zh-CN should not be no hant link instead you should find + # zh.m.wikipedia.org/zh somewhere in the top. - _l = _lang.split('-') - if len(_l) == 2: - country = _l[1] - else: - country = _l[0].upper() - if country == 'EN': - country = 'US' - ret_val['country'] = country - - # subdomain ... - - ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com') - - # params & headers + # '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5 + # '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5 - lang_country = '%s-%s' % (language, country) # (en-US, en-EN, de-DE, de-AU, fr-FR ..) + ret_val['language'] = eng_lang + ret_val['country'] = country + ret_val['locale'] = locale + ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com') # hl parameter: - # https://developers.google.com/custom-search/docs/xml_results#hlsp The - # Interface Language: + # The hl parameter specifies the interface language (host language) of + # your user interface. To improve the performance and the quality of your + # search results, you are strongly encouraged to set this parameter + # explicitly. + # https://developers.google.com/custom-search/docs/xml_results#hlsp + # The Interface Language: # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages - ret_val['params']['hl'] = lang_list.get(lang_country, language) + ret_val['params']['hl'] = lang_code # lr parameter: # The lr (language restrict) parameter restricts search results to @@ -218,22 +180,72 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language): # https://developers.google.com/custom-search/docs/xml_results#lrsp # Language Collection Values: # https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections + # + # To select 'all' languages an empty 'lr' value is used. + # + # Different to other google services, Google Schloar supports to select more + # than one language. The languages are seperated by a pipe '|' (logical OR). + # By example: &lr=lang_zh-TW%7Clang_de selects articles written in + # traditional chinese OR german language. - if _any_language and supported_any_language: + ret_val['params']['lr'] = eng_lang + if sxng_locale == 'all': + ret_val['params']['lr'] = '' - # interpretation is left up to Google (based on whoogle) - # - # - add parameter ``source=lnt`` - # - don't use parameter ``lr`` - # - don't add a ``Accept-Language`` HTTP header. + # cr parameter: + # The cr parameter restricts search results to documents originating in a + # particular country. 
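    # e.g. cr=countryCH when the SearXNG locale is de-CH; left empty when the
    # locale is 'all' (see the assignment below).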
+ # https://developers.google.com/custom-search/docs/xml_results#crsp - ret_val['params']['source'] = 'lnt' + ret_val['params']['cr'] = 'country' + country + if sxng_locale == 'all': + ret_val['params']['cr'] = '' - else: + # gl parameter: (mandatory by Geeogle News) + # The gl parameter value is a two-letter country code. For WebSearch + # results, the gl parameter boosts search results whose country of origin + # matches the parameter value. See the Country Codes section for a list of + # valid values. + # Specifying a gl parameter value in WebSearch requests should improve the + # relevance of results. This is particularly true for international + # customers and, even more specifically, for customers in English-speaking + # countries other than the United States. + # https://developers.google.com/custom-search/docs/xml_results#glsp + + ret_val['params']['gl'] = country + + # ie parameter: + # The ie parameter sets the character encoding scheme that should be used + # to interpret the query string. The default ie value is latin1. + # https://developers.google.com/custom-search/docs/xml_results#iesp + + ret_val['params']['ie'] = 'utf8' - # restricts search results to documents written in a particular - # language. - ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language) + # oe parameter: + # The oe parameter sets the character encoding scheme that should be used + # to decode the XML result. The default oe value is latin1. + # https://developers.google.com/custom-search/docs/xml_results#oesp + + ret_val['params']['oe'] = 'utf8' + + # num parameter: + # The num parameter identifies the number of search results to return. + # The default num value is 10, and the maximum value is 20. If you request + # more than 20 results, only 20 results will be returned. + # https://developers.google.com/custom-search/docs/xml_results#numsp + + # HINT: seems to have no effect (tested in google WEB & Images) + # ret_val['params']['num'] = 20 + + # HTTP headers + + ret_val['headers']['Accept'] = '*/*' + + # Cookies + + # - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746 + # - https://github.com/searxng/searxng/issues/1555 + ret_val['cookies']['CONSENT'] = "YES+" return ret_val @@ -245,33 +257,34 @@ def detect_google_sorry(resp): def request(query, params): """Google search request""" - + # pylint: disable=line-too-long offset = (params['pageno'] - 1) * 10 - - lang_info = get_lang_info(params, supported_languages, language_aliases, True) - - additional_parameters = {} - if use_mobile_ui: - additional_parameters = { - 'asearch': 'arc', - 'async': 'use_ac:true,_fmt:prog', - } + google_info = get_google_info(params, traits) # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium query_url = ( 'https://' - + lang_info['subdomain'] + + google_info['subdomain'] + '/search' + "?" 
+ urlencode( { 'q': query, - **lang_info['params'], - 'ie': "utf8", - 'oe': "utf8", - 'start': offset, + **google_info['params'], 'filter': '0', - **additional_parameters, + 'start': offset, + # 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i', + # 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG', + # 'cs' : 1, + # 'sa': 'N', + # 'yv': 3, + # 'prmd': 'vin', + # 'ei': 'GASaY6TxOcy_xc8PtYeY6AE', + # 'sa': 'N', + # 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg' + # formally known as use_mobile_ui + 'asearch': 'arc', + 'async': UI_ASYNC, } ) ) @@ -282,25 +295,38 @@ def request(query, params): query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) params['url'] = query_url - params['cookies']['CONSENT'] = "YES+" - params['headers'].update(lang_info['headers']) - if use_mobile_ui: - params['headers']['Accept'] = '*/*' - else: - params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' - + params['cookies'] = google_info['cookies'] + params['headers'].update(google_info['headers']) return params +# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA +# ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26; +RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);') + + +def _parse_data_images(dom): + data_image_map = {} + for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()): + end_pos = data_image.rfind('=') + if end_pos > 0: + data_image = data_image[: end_pos + 1] + data_image_map[img_id] = data_image + logger.debug('data:image objects --> %s', list(data_image_map.keys())) + return data_image_map + + def response(resp): """Get response from google's search request""" - + # pylint: disable=too-many-branches, too-many-statements detect_google_sorry(resp) results = [] # convert the text to dom dom = html.fromstring(resp.text) + data_image_map = _parse_data_images(dom) + # results --> answer answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]') if answer_list: @@ -309,25 +335,9 @@ def response(resp): else: logger.debug("did not find 'answer'") - # results --> number_of_results - if not use_mobile_ui: - try: - _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0) - _digit = ''.join([n for n in _txt if n.isdigit()]) - number_of_results = int(_digit) - results.append({'number_of_results': number_of_results}) - except Exception as e: # pylint: disable=broad-except - logger.debug("did not 'number_of_results'") - logger.error(e, exc_info=True) - # parse results - for result in eval_xpath_list(dom, results_xpath): - - # google *sections* - if extract_text(eval_xpath(result, g_section_with_header)): - logger.debug("ignoring <g-section-with-header>") - continue + for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks try: title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) @@ -336,16 +346,30 @@ def response(resp): logger.debug('ignoring item from the result_xpath list: missing title') continue title = extract_text(title_tag) + url = eval_xpath_getindex(result, href_xpath, 0, None) if url is None: + logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title) continue - content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True) - if content is None: + + content_nodes = eval_xpath(result, content_xpath) + content = extract_text(content_nodes) + + if 
not content: logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title) continue - logger.debug('add link to results: %s', title) - results.append({'url': url, 'title': title, 'content': content}) + img_src = content_nodes[0].xpath('.//img/@src') + if img_src: + img_src = img_src[0] + if img_src.startswith('data:image'): + img_id = content_nodes[0].xpath('.//img/@id') + if img_id: + img_src = data_image_map.get(img_id[0]) + else: + img_src = None + + results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src}) except Exception as e: # pylint: disable=broad-except logger.error(e, exc_info=True) @@ -361,15 +385,107 @@ def response(resp): # get supported languages from their site -def _fetch_supported_languages(resp): - ret_val = {} + + +skip_countries = [ + # official language of google-country not in google-languages + 'AL', # Albanien (sq) + 'AZ', # Aserbaidschan (az) + 'BD', # Bangladesch (bn) + 'BN', # Brunei Darussalam (ms) + 'BT', # Bhutan (dz) + 'ET', # Äthiopien (am) + 'GE', # Georgien (ka, os) + 'GL', # Grönland (kl) + 'KH', # Kambodscha (km) + 'LA', # Laos (lo) + 'LK', # Sri Lanka (si, ta) + 'ME', # Montenegro (sr) + 'MK', # Nordmazedonien (mk, sq) + 'MM', # Myanmar (my) + 'MN', # Mongolei (mn) + 'MV', # Malediven (dv) // dv_MV is unknown by babel + 'MY', # Malaysia (ms) + 'NP', # Nepal (ne) + 'TJ', # Tadschikistan (tg) + 'TM', # Turkmenistan (tk) + 'UZ', # Usbekistan (uz) +] + + +def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True): + """Fetch languages from Google.""" + # pylint: disable=import-outside-toplevel, too-many-branches + + engine_traits.custom['supported_domains'] = {} + + resp = network.get('https://www.google.com/preferences') + if not resp.ok: + raise RuntimeError("Response from Google's preferences is not OK.") + dom = html.fromstring(resp.text) - radio_buttons = eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]') + # supported language codes - for x in radio_buttons: - name = x.get("data-name") - code = x.get("value").split('_')[-1] - ret_val[code] = {"name": name} + lang_map = {'no': 'nb'} + for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'): - return ret_val + eng_lang = x.get("value").split('_')[-1] + try: + locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-') + except babel.UnknownLocaleError: + print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang)) + continue + sxng_lang = language_tag(locale) + + conflict = engine_traits.languages.get(sxng_lang) + if conflict: + if conflict != eng_lang: + print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang)) + continue + engine_traits.languages[sxng_lang] = 'lang_' + eng_lang + + # alias languages + engine_traits.languages['zh'] = 'lang_zh-CN' + + # supported region codes + + for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'): + eng_country = x.get("value") + + if eng_country in skip_countries: + continue + if eng_country == 'ZZ': + engine_traits.all_locale = 'ZZ' + continue + + sxng_locales = get_offical_locales(eng_country, engine_traits.languages.keys(), regional=True) + + if not sxng_locales: + print("ERROR: can't map from google country %s (%s) to a babel region." 
% (x.get('data-name'), eng_country)) + continue + + for sxng_locale in sxng_locales: + engine_traits.regions[region_tag(sxng_locale)] = eng_country + + # alias regions + engine_traits.regions['zh-CN'] = 'HK' + + # supported domains + + if add_domains: + resp = network.get('https://www.google.com/supported_domains') + if not resp.ok: + raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.") + + for domain in resp.text.split(): + domain = domain.strip() + if not domain or domain in [ + '.google.com', + ]: + continue + region = domain.split('.')[-1].upper() + engine_traits.custom['supported_domains'][region] = 'www' + domain + if region == 'HK': + # There is no google.cn, we use .com.hk for zh-CN + engine_traits.custom['supported_domains']['CN'] = 'www' + domain diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py index 528f8d21d..e6445b1c4 100644 --- a/searx/engines/google_images.py +++ b/searx/engines/google_images.py @@ -1,31 +1,38 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""This is the implementation of the google images engine using the google -internal API used the Google Go Android app. +"""This is the implementation of the Google Images engine using the internal +Google API used by the Google Go Android app. This internal API offer results in -- JSON (_fmt:json) -- Protobuf (_fmt:pb) -- Protobuf compressed? (_fmt:pc) -- HTML (_fmt:html) -- Protobuf encoded in JSON (_fmt:jspb). +- JSON (``_fmt:json``) +- Protobuf_ (``_fmt:pb``) +- Protobuf_ compressed? (``_fmt:pc``) +- HTML (``_fmt:html``) +- Protobuf_ encoded in JSON (``_fmt:jspb``). +.. _Protobuf: https://en.wikipedia.org/wiki/Protocol_Buffers """ +from typing import TYPE_CHECKING + from urllib.parse import urlencode from json import loads +from searx.engines.google import fetch_traits # pylint: disable=unused-import from searx.engines.google import ( - get_lang_info, + get_google_info, time_range_dict, detect_google_sorry, ) -# pylint: disable=unused-import -from searx.engines.google import supported_languages_url, _fetch_supported_languages +if TYPE_CHECKING: + import logging + from searx.enginelib.traits import EngineTraits + + logger: logging.Logger + traits: EngineTraits -# pylint: enable=unused-import # about about = { @@ -40,7 +47,6 @@ about = { # engine dependent config categories = ['images', 'web'] paging = True -use_locale_domain = True time_range_support = True safesearch = True send_accept_language_header = True @@ -51,20 +57,18 @@ filter_mapping = {0: 'images', 1: 'active', 2: 'active'} def request(query, params): """Google-Image search request""" - lang_info = get_lang_info(params, supported_languages, language_aliases, False) + google_info = get_google_info(params, traits) query_url = ( 'https://' - + lang_info['subdomain'] + + google_info['subdomain'] + '/search' + "?" 
+ urlencode( { 'q': query, 'tbm': "isch", - **lang_info['params'], - 'ie': "utf8", - 'oe': "utf8", + **google_info['params'], 'asearch': 'isch', 'async': '_fmt:json,p:1,ijn:' + str(params['pageno']), } @@ -77,9 +81,8 @@ def request(query, params): query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) params['url'] = query_url - params['headers'].update(lang_info['headers']) - params['headers']['User-Agent'] = 'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12; US) gzip' - params['headers']['Accept'] = '*/*' + params['cookies'] = google_info['cookies'] + params['headers'].update(google_info['headers']) return params @@ -111,7 +114,11 @@ def response(resp): copyright_notice = item["result"].get('iptc', {}).get('copyright_notice') if copyright_notice: - result_item['source'] += ' / ' + copyright_notice + result_item['source'] += ' | ' + copyright_notice + + freshness_date = item["result"].get("freshness_date") + if freshness_date: + result_item['source'] += ' | ' + freshness_date file_size = item.get('gsa', {}).get('file_size') if file_size: diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index 1ada2d64d..ae55ca9cb 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -1,24 +1,40 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""This is the implementation of the google news engine. The google news API -ignores some parameters from the common :ref:`google API`: +"""This is the implementation of the Google News engine. -- num_ : the number of search results is ignored +Google News has a different region handling compared to Google WEB. + +- the ``ceid`` argument has to be set (:py:obj:`ceid_list`) +- the hl_ argument has to be set correctly (and different to Google WEB) +- the gl_ argument is mandatory + +If one of this argument is not set correctly, the request is redirected to +CONSENT dialog:: + + https://consent.google.com/m?continue= + +The google news API ignores some parameters from the common :ref:`google API`: + +- num_ : the number of search results is ignored / there is no paging all + results for a query term are in the first response. - save_ : is ignored / Google-News results are always *SafeSearch* +.. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp +.. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp .. 
_save: https://developers.google.com/custom-search/docs/xml_results#safesp - """ -# pylint: disable=invalid-name +from typing import TYPE_CHECKING import binascii import re from urllib.parse import urlencode from base64 import b64decode from lxml import html +import babel +from searx import locales from searx.utils import ( eval_xpath, eval_xpath_list, @@ -26,18 +42,19 @@ from searx.utils import ( extract_text, ) -# pylint: disable=unused-import +from searx.engines.google import fetch_traits as _fetch_traits # pylint: disable=unused-import from searx.engines.google import ( - supported_languages_url, - _fetch_supported_languages, + get_google_info, + detect_google_sorry, ) +from searx.enginelib.traits import EngineTraits -# pylint: enable=unused-import +if TYPE_CHECKING: + import logging -from searx.engines.google import ( - get_lang_info, - detect_google_sorry, -) + logger: logging.Logger + +traits: EngineTraits # about about = { @@ -49,70 +66,77 @@ about = { "results": 'HTML', } -# compared to other google engines google-news has a different time range -# support. The time range is included in the search term. -time_range_dict = { - 'day': 'when:1d', - 'week': 'when:7d', - 'month': 'when:1m', - 'year': 'when:1y', -} - # engine dependent config - categories = ['news'] paging = False -use_locale_domain = True -time_range_support = True +time_range_support = False # Google-News results are always *SafeSearch*. Option 'safesearch' is set to # False here, otherwise checker will report safesearch-errors:: # # safesearch : results are identitical for safesearch=0 and safesearch=2 -safesearch = False -send_accept_language_header = True +safesearch = True +# send_accept_language_header = True def request(query, params): """Google-News search request""" - lang_info = get_lang_info(params, supported_languages, language_aliases, False) + sxng_locale = params.get('searxng_locale', 'en-US') + ceid = locales.get_engine_locale(sxng_locale, traits.custom['ceid'], default='US:en') + google_info = get_google_info(params, traits) + google_info['subdomain'] = 'news.google.com' # google news has only one domain - # google news has only one domain - lang_info['subdomain'] = 'news.google.com' + ceid_region, ceid_lang = ceid.split(':') + ceid_lang, ceid_suffix = ( + ceid_lang.split('-') + + [ + None, + ] + )[:2] - ceid = "%s:%s" % (lang_info['country'], lang_info['language']) + google_info['params']['hl'] = ceid_lang - # google news redirects en to en-US - if lang_info['params']['hl'] == 'en': - lang_info['params']['hl'] = 'en-US' + if ceid_suffix and ceid_suffix not in ['Hans', 'Hant']: - # Very special to google-news compared to other google engines, the time - # range is included in the search term. - if params['time_range']: - query += ' ' + time_range_dict[params['time_range']] + if ceid_region.lower() == ceid_lang: + google_info['params']['hl'] = ceid_lang + '-' + ceid_region + else: + google_info['params']['hl'] = ceid_lang + '-' + ceid_suffix + + elif ceid_region.lower() != ceid_lang: + + if ceid_region in ['AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT']: + google_info['params']['hl'] = ceid_lang + else: + google_info['params']['hl'] = ceid_lang + '-' + ceid_region + + google_info['params']['lr'] = 'lang_' + ceid_lang.split('-')[0] + google_info['params']['gl'] = ceid_region query_url = ( 'https://' - + lang_info['subdomain'] - + '/search' - + "?" - + urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'gl': lang_info['country']}) + + google_info['subdomain'] + + "/search?" 
+ + urlencode( + { + 'q': query, + **google_info['params'], + } + ) + # ceid includes a ':' character which must not be urlencoded + ('&ceid=%s' % ceid) - ) # ceid includes a ':' character which must not be urlencoded - params['url'] = query_url - - params['cookies']['CONSENT'] = "YES+" - params['headers'].update(lang_info['headers']) - params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' + ) + params['url'] = query_url + params['cookies'] = google_info['cookies'] + params['headers'].update(google_info['headers']) return params def response(resp): """Get response from google's search request""" results = [] - detect_google_sorry(resp) # convert the text to dom @@ -152,8 +176,8 @@ def response(resp): # The pub_date is mostly a string like 'yesertday', not a real # timezone date or time. Therefore we can't use publishedDate. - pub_date = extract_text(eval_xpath(result, './article/div[1]/div[1]/time')) - pub_origin = extract_text(eval_xpath(result, './article/div[1]/div[1]/a')) + pub_date = extract_text(eval_xpath(result, './article//time')) + pub_origin = extract_text(eval_xpath(result, './article//a[@data-n-tid]')) content = ' / '.join([x for x in [pub_origin, pub_date] if x]) @@ -174,3 +198,127 @@ def response(resp): # return results return results + + +ceid_list = [ + 'AE:ar', + 'AR:es-419', + 'AT:de', + 'AU:en', + 'BD:bn', + 'BE:fr', + 'BE:nl', + 'BG:bg', + 'BR:pt-419', + 'BW:en', + 'CA:en', + 'CA:fr', + 'CH:de', + 'CH:fr', + 'CL:es-419', + 'CN:zh-Hans', + 'CO:es-419', + 'CU:es-419', + 'CZ:cs', + 'DE:de', + 'EG:ar', + 'ES:es', + 'ET:en', + 'FR:fr', + 'GB:en', + 'GH:en', + 'GR:el', + 'HK:zh-Hant', + 'HU:hu', + 'ID:en', + 'ID:id', + 'IE:en', + 'IL:en', + 'IL:he', + 'IN:bn', + 'IN:en', + 'IN:hi', + 'IN:ml', + 'IN:mr', + 'IN:ta', + 'IN:te', + 'IT:it', + 'JP:ja', + 'KE:en', + 'KR:ko', + 'LB:ar', + 'LT:lt', + 'LV:en', + 'LV:lv', + 'MA:fr', + 'MX:es-419', + 'MY:en', + 'NA:en', + 'NG:en', + 'NL:nl', + 'NO:no', + 'NZ:en', + 'PE:es-419', + 'PH:en', + 'PK:en', + 'PL:pl', + 'PT:pt-150', + 'RO:ro', + 'RS:sr', + 'RU:ru', + 'SA:ar', + 'SE:sv', + 'SG:en', + 'SI:sl', + 'SK:sk', + 'SN:fr', + 'TH:th', + 'TR:tr', + 'TW:zh-Hant', + 'TZ:en', + 'UA:ru', + 'UA:uk', + 'UG:en', + 'US:en', + 'US:es-419', + 'VE:es-419', + 'VN:vi', + 'ZA:en', + 'ZW:en', +] +"""List of region/language combinations supported by Google News. 
Values of the +``ceid`` argument of the Google News REST API.""" + + +_skip_values = [ + 'ET:en', # english (ethiopia) + 'ID:en', # english (indonesia) + 'LV:en', # english (latvia) +] + +_ceid_locale_map = {'NO:no': 'nb-NO'} + + +def fetch_traits(engine_traits: EngineTraits): + _fetch_traits(engine_traits, add_domains=False) + + engine_traits.custom['ceid'] = {} + + for ceid in ceid_list: + if ceid in _skip_values: + continue + + region, lang = ceid.split(':') + x = lang.split('-') + if len(x) > 1: + if x[1] not in ['Hant', 'Hans']: + lang = x[0] + + sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region) + try: + locale = babel.Locale.parse(sxng_locale, sep='-') + except babel.UnknownLocaleError: + print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale)) + continue + + engine_traits.custom['ceid'][locales.region_tag(locale)] = ceid diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py index c07cd4cea..6f33d1e1a 100644 --- a/searx/engines/google_scholar.py +++ b/searx/engines/google_scholar.py @@ -1,19 +1,18 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Google (Scholar) +"""This is the implementation of the Google Scholar engine. -For detailed description of the *REST-full* API see: `Query Parameter -Definitions`_. - -.. _Query Parameter Definitions: - https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions +Compared to other Google services the Scholar engine has a simple GET REST-API +and there does not exists `async` API. Even though the API slightly vintage we +can make use of the :ref:`google API` to assemble the arguments of the GET +request. """ -# pylint: disable=invalid-name +from typing import TYPE_CHECKING +from typing import Optional from urllib.parse import urlencode from datetime import datetime -from typing import Optional from lxml import html from searx.utils import ( @@ -23,19 +22,21 @@ from searx.utils import ( extract_text, ) +from searx.exceptions import SearxEngineCaptchaException + +from searx.engines.google import fetch_traits # pylint: disable=unused-import from searx.engines.google import ( - get_lang_info, + get_google_info, time_range_dict, - detect_google_sorry, ) +from searx.enginelib.traits import EngineTraits -# pylint: disable=unused-import -from searx.engines.google import ( - supported_languages_url, - _fetch_supported_languages, -) +if TYPE_CHECKING: + import logging -# pylint: enable=unused-import + logger: logging.Logger + +traits: EngineTraits # about about = { @@ -51,53 +52,62 @@ about = { categories = ['science', 'scientific publications'] paging = True language_support = True -use_locale_domain = True time_range_support = True safesearch = False send_accept_language_header = True -def time_range_url(params): - """Returns a URL query component for a google-Scholar time range based on - ``params['time_range']``. Google-Scholar does only support ranges in years. - To have any effect, all the Searx ranges (*day*, *week*, *month*, *year*) - are mapped to *year*. If no range is set, an empty string is returned. - Example:: +def time_range_args(params): + """Returns a dictionary with a time range arguments based on + ``params['time_range']``. - &as_ylo=2019 - """ - # as_ylo=2016&as_yhi=2019 - ret_val = '' - if params['time_range'] in time_range_dict: - ret_val = urlencode({'as_ylo': datetime.now().year - 1}) - return '&' + ret_val + Google Scholar supports a detailed search by year. 
Searching by *last + month* or *last week* (as offered by SearXNG) is uncommon for scientific + publications and is not supported by Google Scholar. + To limit the result list when the users selects a range, all the SearXNG + ranges (*day*, *week*, *month*, *year*) are mapped to *year*. If no range + is set an empty dictionary of arguments is returned. Example; when + user selects a time range (current year minus one in 2022): -def request(query, params): - """Google-Scholar search request""" + .. code:: python - offset = (params['pageno'] - 1) * 10 - lang_info = get_lang_info(params, supported_languages, language_aliases, False) + { 'as_ylo' : 2021 } - # subdomain is: scholar.google.xy - lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.") + """ + ret_val = {} + if params['time_range'] in time_range_dict: + ret_val['as_ylo'] = datetime.now().year - 1 + return ret_val - query_url = ( - 'https://' - + lang_info['subdomain'] - + '/scholar' - + "?" - + urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'start': offset}) - ) - query_url += time_range_url(params) - params['url'] = query_url +def detect_google_captcha(dom): + """In case of CAPTCHA Google Scholar open its own *not a Robot* dialog and is + not redirected to ``sorry.google.com``. + """ + if eval_xpath(dom, "//form[@id='gs_captcha_f']"): + raise SearxEngineCaptchaException() + - params['cookies']['CONSENT'] = "YES+" - params['headers'].update(lang_info['headers']) - params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' +def request(query, params): + """Google-Scholar search request""" - # params['google_subdomain'] = subdomain + google_info = get_google_info(params, traits) + # subdomain is: scholar.google.xy + google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.") + + args = { + 'q': query, + **google_info['params'], + 'start': (params['pageno'] - 1) * 10, + 'as_sdt': '2007', # include patents / to disable set '0,5' + 'as_vis': '0', # include citations / to disable set '1' + } + args.update(time_range_args(params)) + + params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args) + params['cookies'] = google_info['cookies'] + params['headers'].update(google_info['headers']) return params @@ -138,19 +148,15 @@ def parse_gs_a(text: Optional[str]): def response(resp): # pylint: disable=too-many-locals - """Get response from google's search request""" + """Parse response from Google Scholar""" results = [] - detect_google_sorry(resp) - - # which subdomain ? 
- # subdomain = resp.search_params.get('google_subdomain') - # convert the text to dom dom = html.fromstring(resp.text) + detect_google_captcha(dom) # parse results - for result in eval_xpath_list(dom, '//div[@data-cid]'): + for result in eval_xpath_list(dom, '//div[@data-rp]'): title = extract_text(eval_xpath(result, './/h3[1]//a')) @@ -158,7 +164,7 @@ def response(resp): # pylint: disable=too-many-locals # this is a [ZITATION] block continue - pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]')) + pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]')) if pub_type: pub_type = pub_type[1:-1].lower() diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py index fc574bd48..985189df5 100644 --- a/searx/engines/google_videos.py +++ b/searx/engines/google_videos.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""This is the implementation of the google videos engine. +"""This is the implementation of the Google Videos engine. .. admonition:: Content-Security-Policy (CSP) @@ -14,9 +14,8 @@ """ -# pylint: disable=invalid-name +from typing import TYPE_CHECKING -import re from urllib.parse import urlencode from lxml import html @@ -27,20 +26,22 @@ from searx.utils import ( extract_text, ) +from searx.engines.google import fetch_traits # pylint: disable=unused-import from searx.engines.google import ( - get_lang_info, + get_google_info, time_range_dict, filter_mapping, - g_section_with_header, - title_xpath, suggestion_xpath, detect_google_sorry, ) +from searx.enginelib.traits import EngineTraits -# pylint: disable=unused-import -from searx.engines.google import supported_languages_url, _fetch_supported_languages +if TYPE_CHECKING: + import logging -# pylint: enable=unused-import + logger: logging.Logger + +traits: EngineTraits # about about = { @@ -55,70 +56,32 @@ about = { # engine dependent config categories = ['videos', 'web'] -paging = False +paging = True language_support = True -use_locale_domain = True time_range_support = True safesearch = True -send_accept_language_header = True - -RE_CACHE = {} - - -def _re(regexpr): - """returns compiled regular expression""" - RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr)) - return RE_CACHE[regexpr] - - -def scrap_out_thumbs_src(dom): - ret_val = {} - thumb_name = 'dimg_' - for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'): - _script = script.text - # "dimg_35":"https://i.ytimg.c....", - _dimurl = _re("s='([^']*)").findall(_script) - for k, v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)').findall(_script): - v = v.replace(r'\u003d', '=') - v = v.replace(r'\u0026', '&') - ret_val[k] = v - logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys()) - return ret_val - - -def scrap_out_thumbs(dom): - """Scrap out thumbnail data from <script> tags.""" - ret_val = {} - thumb_name = 'dimg_' - - for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'): - _script = script.text - - # var s='data:image/jpeg;base64, ...' 
- _imgdata = _re("s='([^']*)").findall(_script) - if not _imgdata: - continue - - # var ii=['dimg_17'] - for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script): - # At least the equal sign in the URL needs to be decoded - ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=") - - logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys()) - return ret_val def request(query, params): """Google-Video search request""" - lang_info = get_lang_info(params, supported_languages, language_aliases, False) + google_info = get_google_info(params, traits) query_url = ( 'https://' - + lang_info['subdomain'] + + google_info['subdomain'] + '/search' + "?" - + urlencode({'q': query, 'tbm': "vid", **lang_info['params'], 'ie': "utf8", 'oe': "utf8"}) + + urlencode( + { + 'q': query, + 'tbm': "vid", + 'start': 10 * params['pageno'], + **google_info['params'], + 'asearch': 'arc', + 'async': 'use_ac:true,_fmt:html', + } + ) ) if params['time_range'] in time_range_dict: @@ -127,9 +90,8 @@ def request(query, params): query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) params['url'] = query_url - params['cookies']['CONSENT'] = "YES+" - params['headers'].update(lang_info['headers']) - params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' + params['cookies'] = google_info['cookies'] + params['headers'].update(google_info['headers']) return params @@ -141,43 +103,30 @@ def response(resp): # convert the text to dom dom = html.fromstring(resp.text) - vidthumb_imgdata = scrap_out_thumbs(dom) - thumbs_src = scrap_out_thumbs_src(dom) - logger.debug(str(thumbs_src)) # parse results for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'): - # ignore google *sections* - if extract_text(eval_xpath(result, g_section_with_header)): - logger.debug("ignoring <g-section-with-header>") - continue - - # ingnore articles without an image id / e.g. 
news articles - img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None) - if img_id is None: - logger.error("no img_id found in item %s (news article?)", len(results) + 1) + img_src = eval_xpath_getindex(result, './/img/@src', 0, None) + if img_src is None: continue - img_src = vidthumb_imgdata.get(img_id, None) - if not img_src: - img_src = thumbs_src.get(img_id, "") + title = extract_text(eval_xpath_getindex(result, './/a/h3[1]', 0)) + url = eval_xpath_getindex(result, './/a/h3[1]/../@href', 0) - title = extract_text(eval_xpath_getindex(result, title_xpath, 0)) - url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0) - length = extract_text(eval_xpath(result, './/div[contains(@class, "P7xzyf")]/span/span')) c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0) content = extract_text(c_node) - pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]')) + pub_info = extract_text(eval_xpath(result, './/div[@class="P7xzyf"]')) + length = extract_text(eval_xpath(result, './/div[@class="J1mWY"]')) results.append( { 'url': url, 'title': title, 'content': content, - 'length': length, 'author': pub_info, 'thumbnail': img_src, + 'length': length, 'template': 'videos.html', } ) diff --git a/searx/engines/peertube.py b/searx/engines/peertube.py index 345c2f991..87b386d7a 100644 --- a/searx/engines/peertube.py +++ b/searx/engines/peertube.py @@ -1,18 +1,30 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -""" - peertube (Videos) +# lint: pylint +"""Peertube and :py:obj:`SepiaSearch <searx.engines.sepiasearch>` do share +(more or less) the same REST API and the schema of the JSON result is identical. + """ -from json import loads -from datetime import datetime +import re from urllib.parse import urlencode +from datetime import datetime +from dateutil.parser import parse +from dateutil.relativedelta import relativedelta + +import babel + +from searx import network +from searx.locales import language_tag from searx.utils import html_to_text +from searx.enginelib.traits import EngineTraits + +traits: EngineTraits -# about about = { + # pylint: disable=line-too-long "website": 'https://joinpeertube.org', "wikidata_id": 'Q50938515', - "official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html', + "official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html#tag/Search/operation/searchVideos', "use_official_api": True, "require_api_key": False, "results": 'JSON', @@ -22,66 +34,155 @@ about = { categories = ["videos"] paging = True base_url = "https://peer.tube" -supported_languages_url = 'https://peer.tube/api/v1/videos/languages' +"""Base URL of the Peertube instance. 
A list of instances is available at: + +- https://instances.joinpeertube.org/instances +""" + +time_range_support = True +time_range_table = { + 'day': relativedelta(), + 'week': relativedelta(weeks=-1), + 'month': relativedelta(months=-1), + 'year': relativedelta(years=-1), +} + +safesearch = True +safesearch_table = {0: 'both', 1: 'false', 2: 'false'} + + +def minute_to_hm(minute): + if isinstance(minute, int): + return "%d:%02d" % (divmod(minute, 60)) + return None -# do search-request def request(query, params): - sanitized_url = base_url.rstrip("/") - pageno = (params["pageno"] - 1) * 15 - search_url = sanitized_url + "/api/v1/search/videos/?pageno={pageno}&{query}" - query_dict = {"search": query} - language = params["language"].split("-")[0] - if "all" != language and language in supported_languages: - query_dict["languageOneOf"] = language - params["url"] = search_url.format(query=urlencode(query_dict), pageno=pageno) - return params + """Assemble request for the Peertube API""" + + if not query: + return False + + # eng_region = traits.get_region(params['searxng_locale'], 'en_US') + eng_lang = traits.get_language(params['searxng_locale'], None) + + params['url'] = ( + base_url.rstrip("/") + + "/api/v1/search/videos?" + + urlencode( + { + 'search': query, + 'searchTarget': 'search-index', # Vidiversum + 'resultType': 'videos', + 'start': (params['pageno'] - 1) * 10, + 'count': 10, + # -createdAt: sort by date ascending / createdAt: date descending + 'sort': '-match', # sort by *match descending* + 'nsfw': safesearch_table[params['safesearch']], + } + ) + ) + + if eng_lang is not None: + params['url'] += '&languageOneOf[]=' + eng_lang + params['url'] += '&boostLanguages[]=' + eng_lang + if params['time_range'] in time_range_table: + time = datetime.now().date() + time_range_table[params['time_range']] + params['url'] += '&startDate=' + time.isoformat() -def _get_offset_from_pageno(pageno): - return (pageno - 1) * 15 + 1 + return params -# get response from search-request def response(resp): - sanitized_url = base_url.rstrip("/") + return video_response(resp) + + +def video_response(resp): + """Parse video response from SepiaSearch and Peertube instances.""" results = [] - search_res = loads(resp.text) + json_data = resp.json() - # return empty array if there are no results - if "data" not in search_res: + if 'data' not in json_data: return [] - # parse results - for res in search_res["data"]: - title = res["name"] - url = sanitized_url + "/videos/watch/" + res["uuid"] - description = res["description"] - if description: - content = html_to_text(res["description"]) - else: - content = "" - thumbnail = sanitized_url + res["thumbnailPath"] - publishedDate = datetime.strptime(res["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ") + for result in json_data['data']: + metadata = [ + x + for x in [ + result.get('channel', {}).get('displayName'), + result.get('channel', {}).get('name') + '@' + result.get('channel', {}).get('host'), + ', '.join(result.get('tags', [])), + ] + if x + ] results.append( { - "template": "videos.html", - "url": url, - "title": title, - "content": content, - "publishedDate": publishedDate, - "iframe_src": sanitized_url + res["embedPath"], - "thumbnail": thumbnail, + 'url': result['url'], + 'title': result['name'], + 'content': html_to_text(result.get('description') or ''), + 'author': result.get('account', {}).get('displayName'), + 'length': minute_to_hm(result.get('duration')), + 'template': 'videos.html', + 'publishedDate': parse(result['publishedAt']), + 'iframe_src': 
result.get('embedUrl'), + 'thumbnail': result.get('thumbnailUrl') or result.get('previewUrl'), + 'metadata': ' | '.join(metadata), } ) - # return results return results -def _fetch_supported_languages(resp): - videolanguages = resp.json() - peertube_languages = list(videolanguages.keys()) - return peertube_languages +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages from peertube's search-index source code. + + See videoLanguages_ in commit `8ed5c729 - Refactor and redesign client`_ + + .. _8ed5c729 - Refactor and redesign client: + https://framagit.org/framasoft/peertube/search-index/-/commit/8ed5c729 + .. _videoLanguages: + https://framagit.org/framasoft/peertube/search-index/-/commit/8ed5c729#3d8747f9a60695c367c70bb64efba8f403721fad_0_291 + """ + + resp = network.get( + 'https://framagit.org/framasoft/peertube/search-index/-/raw/master/client/src/components/Filters.vue', + # the response from search-index repository is very slow + timeout=60, + ) + + if not resp.ok: + print("ERROR: response from peertube is not OK.") + return + + js_lang = re.search(r"videoLanguages \(\)[^\n]+(.*?)\]", resp.text, re.DOTALL) + if not js_lang: + print("ERROR: can't determine languages from peertube") + return + + for lang in re.finditer(r"\{ id: '([a-z]+)', label:", js_lang.group(1)): + try: + eng_tag = lang.group(1) + if eng_tag == 'oc': + # Occitanis not known by babel, its closest relative is Catalan + # but 'ca' is already in the list of engine_traits.languages --> + # 'oc' will be ignored. + continue + + sxng_tag = language_tag(babel.Locale.parse(eng_tag)) + + except babel.UnknownLocaleError: + print("ERROR: %s is unknown by babel" % eng_tag) + continue + + conflict = engine_traits.languages.get(sxng_tag) + if conflict: + if conflict != eng_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) + continue + engine_traits.languages[sxng_tag] = eng_tag + + engine_traits.languages['zh_Hans'] = 'zh' + engine_traits.languages['zh_Hant'] = 'zh' diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index 6de2176d0..4a41676c5 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -34,7 +34,9 @@ import babel from searx.exceptions import SearxEngineAPIException from searx.network import raise_for_httperror -from searx.locales import get_engine_locale +from searx.enginelib.traits import EngineTraits + +traits: EngineTraits # about about = { @@ -49,7 +51,6 @@ about = { # engine dependent config categories = [] paging = True -supported_languages_url = about['website'] qwant_categ = None # web|news|inages|videos safesearch = True @@ -95,7 +96,7 @@ def request(query, params): ) # add quant's locale - q_locale = get_engine_locale(params['language'], supported_languages, default='en_US') + q_locale = traits.get_region(params["searxng_locale"], default='en_US') params['url'] += '&locale=' + q_locale # add safesearch option @@ -243,15 +244,20 @@ def response(resp): return results -def _fetch_supported_languages(resp): +def fetch_traits(engine_traits: EngineTraits): + + # pylint: disable=import-outside-toplevel + from searx import network + from searx.locales import region_tag + resp = network.get(about['website']) text = resp.text text = text[text.find('INITIAL_PROPS') :] text = text[text.find('{') : text.find('</script>')] q_initial_props = loads(text) q_locales = q_initial_props.get('locales') - q_valid_locales = [] + eng_tag_list = set() for country, v in q_locales.items(): for lang in v['langs']: @@ -261,25 +267,18 @@ def _fetch_supported_languages(resp): 
# qwant-news does not support all locales from qwant-web: continue - q_valid_locales.append(_locale) - - supported_languages = {} + eng_tag_list.add(_locale) - for q_locale in q_valid_locales: + for eng_tag in eng_tag_list: try: - locale = babel.Locale.parse(q_locale, sep='_') - except babel.core.UnknownLocaleError: - print("ERROR: can't determine babel locale of quant's locale %s" % q_locale) + sxng_tag = region_tag(babel.Locale.parse(eng_tag, sep='_')) + except babel.UnknownLocaleError: + print("ERROR: can't determine babel locale of quant's locale %s" % eng_tag) continue - # note: supported_languages (dict) - # - # dict's key is a string build up from a babel.Locale object / the - # notation 'xx-XX' (and 'xx') conforms to SearXNG's locale (and - # language) notation and dict's values are the locale strings used by - # the engine. - - searxng_locale = locale.language + '-' + locale.territory # --> params['language'] - supported_languages[searxng_locale] = q_locale - - return supported_languages + conflict = engine_traits.regions.get(sxng_tag) + if conflict: + if conflict != eng_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) + continue + engine_traits.regions[sxng_tag] = eng_tag diff --git a/searx/engines/sepiasearch.py b/searx/engines/sepiasearch.py index 9c45d6c43..72157b253 100644 --- a/searx/engines/sepiasearch.py +++ b/searx/engines/sepiasearch.py @@ -1,70 +1,80 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""SepiaSearch uses the same languages as :py:obj:`Peertube +<searx.engines.peertube>` and the response is identical to the response from the +peertube engines. + """ - SepiaSearch (Videos) -""" -from json import loads -from dateutil import parser, relativedelta +from typing import TYPE_CHECKING + from urllib.parse import urlencode from datetime import datetime -# about +from searx.engines.peertube import fetch_traits # pylint: disable=unused-import +from searx.engines.peertube import ( + # pylint: disable=unused-import + video_response, + safesearch_table, + time_range_table, +) +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + about = { + # pylint: disable=line-too-long "website": 'https://sepiasearch.org', "wikidata_id": None, - "official_api_documentation": "https://framagit.org/framasoft/peertube/search-index/-/tree/master/server/controllers/api", # NOQA + "official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html#tag/Search/operation/searchVideos', "use_official_api": True, "require_api_key": False, "results": 'JSON', } +# engine dependent config categories = ['videos'] paging = True + +base_url = 'https://sepiasearch.org' + time_range_support = True safesearch = True -supported_languages = [ - # fmt: off - 'en', 'fr', 'ja', 'eu', 'ca', 'cs', 'eo', 'el', - 'de', 'it', 'nl', 'es', 'oc', 'gd', 'zh', 'pt', - 'sv', 'pl', 'fi', 'ru' - # fmt: on -] -base_url = 'https://sepiasearch.org/api/v1/search/videos' - -safesearch_table = {0: 'both', 1: 'false', 2: 'false'} - -time_range_table = { - 'day': relativedelta.relativedelta(), - 'week': relativedelta.relativedelta(weeks=-1), - 'month': relativedelta.relativedelta(months=-1), - 'year': relativedelta.relativedelta(years=-1), -} -def minute_to_hm(minute): - if isinstance(minute, int): - return "%d:%02d" % (divmod(minute, 60)) - return None +def request(query, params): + """Assemble request for the SepiaSearch API""" + + if not query: + return False + # eng_region = 
traits.get_region(params['searxng_locale'], 'en_US') + eng_lang = traits.get_language(params['searxng_locale'], None) -def request(query, params): params['url'] = ( - base_url - + '?' + base_url.rstrip("/") + + "/api/v1/search/videos?" + urlencode( { 'search': query, 'start': (params['pageno'] - 1) * 10, 'count': 10, - 'sort': '-match', + # -createdAt: sort by date ascending / createdAt: date descending + 'sort': '-match', # sort by *match descending* 'nsfw': safesearch_table[params['safesearch']], } ) ) - language = params['language'].split('-')[0] - if language in supported_languages: - params['url'] += '&languageOneOf[]=' + language + if eng_lang is not None: + params['url'] += '&languageOneOf[]=' + eng_lang + params['url'] += '&boostLanguages[]=' + eng_lang + if params['time_range'] in time_range_table: time = datetime.now().date() + time_range_table[params['time_range']] params['url'] += '&startDate=' + time.isoformat() @@ -73,34 +83,4 @@ def request(query, params): def response(resp): - results = [] - - search_results = loads(resp.text) - - if 'data' not in search_results: - return [] - - for result in search_results['data']: - title = result['name'] - content = result['description'] - thumbnail = result['thumbnailUrl'] - publishedDate = parser.parse(result['publishedAt']) - author = result.get('account', {}).get('displayName') - length = minute_to_hm(result.get('duration')) - url = result['url'] - - results.append( - { - 'url': url, - 'title': title, - 'content': content, - 'author': author, - 'length': length, - 'template': 'videos.html', - 'publishedDate': publishedDate, - 'iframe_src': result.get('embedUrl'), - 'thumbnail': thumbnail, - } - ) - - return results + return video_response(resp) diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index f857f7b6d..2813d0bf3 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -1,28 +1,108 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Startpage (Web) +"""Startpage's language & region selectors are a mess .. + +.. _startpage regions: + +Startpage regions +================= + +In the list of regions there are tags we need to map to common region tags:: + + pt-BR_BR --> pt_BR + zh-CN_CN --> zh_Hans_CN + zh-TW_TW --> zh_Hant_TW + zh-TW_HK --> zh_Hant_HK + en-GB_GB --> en_GB + +and there is at least one tag with a three letter language tag (ISO 639-2):: + + fil_PH --> fil_PH + +The locale code ``no_NO`` from Startpage does not exists and is mapped to +``nb-NO``:: + + babel.core.UnknownLocaleError: unknown locale 'no_NO' + +For reference see languages-subtag at iana; ``no`` is the macrolanguage [1]_ and +W3C recommends subtag over macrolanguage [2]_. + +.. [1] `iana: language-subtag-registry + <https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry>`_ :: + + type: language + Subtag: nb + Description: Norwegian Bokmål + Added: 2005-10-16 + Suppress-Script: Latn + Macrolanguage: no + +.. [2] + Use macrolanguages with care. Some language subtags have a Scope field set to + macrolanguage, i.e. this primary language subtag encompasses a number of more + specific primary language subtags in the registry. ... As we recommended for + the collection subtags mentioned above, in most cases you should try to use + the more specific subtags ... `W3: The primary language subtag + <https://www.w3.org/International/questions/qa-choosing-language-tags#langsubtag>`_ + +.. 
_startpage languages: + +Startpage languages +=================== + +:py:obj:`send_accept_language_header`: + The displayed name in Startpage's settings page depend on the location of the + IP when ``Accept-Language`` HTTP header is unset. In :py:obj:`fetch_traits` + we use:: + + 'Accept-Language': "en-US,en;q=0.5", + .. + + to get uniform names independent from the IP). + +.. _startpage categories: + +Startpage categories +==================== + +Startpage's category (for Web-search, News, Videos, ..) is set by +:py:obj:`startpage_categ` in settings.yml:: + + - name: startpage + engine: startpage + startpage_categ: web + ... + +.. hint:: + + The default category is ``web`` .. and other categories than ``web`` are not + yet implemented. """ +from typing import TYPE_CHECKING +from collections import OrderedDict import re -from time import time - -from urllib.parse import urlencode from unicodedata import normalize, combining +from time import time from datetime import datetime, timedelta -from dateutil import parser -from lxml import html -from babel import Locale -from babel.localedata import locale_identifiers +import dateutil.parser +import lxml.html +import babel + +from searx import network +from searx.utils import extract_text, eval_xpath, gen_useragent +from searx.exceptions import SearxEngineCaptchaException +from searx.locales import region_tag +from searx.enginelib.traits import EngineTraits -from searx.network import get -from searx.utils import extract_text, eval_xpath, match_language -from searx.exceptions import ( - SearxEngineResponseException, - SearxEngineCaptchaException, -) +if TYPE_CHECKING: + import logging + logger: logging.Logger + +traits: EngineTraits # about about = { @@ -34,18 +114,28 @@ about = { "results": 'HTML', } +startpage_categ = 'web' +"""Startpage's category, visit :ref:`startpage categories`. +""" + +send_accept_language_header = True +"""Startpage tries to guess user's language and territory from the HTTP +``Accept-Language``. Optional the user can select a search-language (can be +different to the UI language) and a region filter. +""" + # engine dependent config categories = ['general', 'web'] -# there is a mechanism to block "bot" search -# (probably the parameter qid), require -# storing of qid's between mulitble search-calls - paging = True -supported_languages_url = 'https://www.startpage.com/do/settings' +time_range_support = True +safesearch = True + +time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} +safesearch_dict = {0: '0', 1: '1', 2: '1'} # search-url -base_url = 'https://startpage.com/' -search_url = base_url + 'sp/search?' +base_url = 'https://www.startpage.com' +search_url = base_url + '/sp/search' # specific xpath variables # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] @@ -53,92 +143,193 @@ search_url = base_url + 'sp/search?' results_xpath = '//div[@class="w-gl__result__main"]' link_xpath = './/a[@class="w-gl__result-title result-link"]' content_xpath = './/p[@class="w-gl__description"]' +search_form_xpath = '//form[@id="search"]' +"""XPath of Startpage's origin search form + +.. 
code: html + + <form action="/sp/search" method="post"> + <input type="text" name="query" value="" ..> + <input type="hidden" name="t" value="device"> + <input type="hidden" name="lui" value="english"> + <input type="hidden" name="sc" value="Q7Mt5TRqowKB00"> + <input type="hidden" name="cat" value="web"> + <input type="hidden" class="abp" id="abp-input" name="abp" value="1"> + </form> +""" # timestamp of the last fetch of 'sc' code sc_code_ts = 0 sc_code = '' +sc_code_cache_sec = 30 +"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`.""" -def raise_captcha(resp): +def get_sc_code(searxng_locale, params): + """Get an actual ``sc`` argument from Startpage's search form (HTML page). - if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): - raise SearxEngineCaptchaException() + Startpage puts a ``sc`` argument on every HTML :py:obj:`search form + <search_form_xpath>`. Without this argument Startpage considers the request + is from a bot. We do not know what is encoded in the value of the ``sc`` + argument, but it seems to be a kind of a *time-stamp*. + Startpage's search form generates a new sc-code on each request. This + function scrap a new sc-code from Startpage's home page every + :py:obj:`sc_code_cache_sec` seconds. -def get_sc_code(headers): - """Get an actual `sc` argument from startpage's home page. + """ - Startpage puts a `sc` argument on every link. Without this argument - startpage considers the request is from a bot. We do not know what is - encoded in the value of the `sc` argument, but it seems to be a kind of a - *time-stamp*. This *time-stamp* is valid for a few hours. + global sc_code_ts, sc_code # pylint: disable=global-statement - This function scrap a new *time-stamp* from startpage's home page every hour - (3000 sec). + if sc_code and (time() < (sc_code_ts + sc_code_cache_sec)): + logger.debug("get_sc_code: reuse '%s'", sc_code) + return sc_code + + headers = {**params['headers']} + headers['Origin'] = base_url + headers['Referer'] = base_url + '/' + # headers['Connection'] = 'keep-alive' + # headers['Accept-Encoding'] = 'gzip, deflate, br' + # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' + # headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0' + + # add Accept-Language header + if searxng_locale == 'all': + searxng_locale = 'en-US' + locale = babel.Locale.parse(searxng_locale, sep='-') + + if send_accept_language_header: + ac_lang = locale.language + if locale.territory: + ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % ( + locale.language, + locale.territory, + locale.language, + ) + headers['Accept-Language'] = ac_lang + + get_sc_url = base_url + '/?sc=%s' % (sc_code) + logger.debug("query new sc time-stamp ... %s", get_sc_url) + logger.debug("headers: %s", headers) + resp = network.get(get_sc_url, headers=headers) + + # ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers) + # ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg + # ?? 
ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21 - """ + if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): + raise SearxEngineCaptchaException( + message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha", + ) + + dom = lxml.html.fromstring(resp.text) + + try: + sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0] + except IndexError as exc: + logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695") + raise SearxEngineCaptchaException( + message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url, + ) from exc + + sc_code_ts = time() + logger.debug("get_sc_code: new value is: %s", sc_code) + return sc_code - global sc_code_ts, sc_code # pylint: disable=global-statement - if time() > (sc_code_ts + 3000): - logger.debug("query new sc time-stamp ...") +def request(query, params): + """Assemble a Startpage request. - resp = get(base_url, headers=headers) - raise_captcha(resp) - dom = html.fromstring(resp.text) + To avoid CAPTCHA we need to send a well formed HTTP POST request with a + cookie. We need to form a request that is identical to the request build by + Startpage's search form: - try: - # <input type="hidden" name="sc" value="..."> - sc_code = eval_xpath(dom, '//input[@name="sc"]/@value')[0] - except IndexError as exc: - # suspend startpage API --> https://github.com/searxng/searxng/pull/695 - raise SearxEngineResponseException( - suspended_time=7 * 24 * 3600, message="PR-695: query new sc time-stamp failed!" - ) from exc + - in the cookie the **region** is selected + - in the HTTP POST data the **language** is selected - sc_code_ts = time() - logger.debug("new value is: %s", sc_code) + Additionally the arguments form Startpage's search form needs to be set in + HTML POST data / compare ``<input>`` elements: :py:obj:`search_form_xpath`. 
+ """ + if startpage_categ == 'web': + return _request_cat_web(query, params) - return sc_code + logger.error("Startpages's category '%' is not yet implemented.", startpage_categ) + return params -# do search-request -def request(query, params): +def _request_cat_web(query, params): - # pylint: disable=line-too-long - # The format string from Startpage's FFox add-on [1]:: - # - # https://www.startpage.com/do/dsearch?query={searchTerms}&cat=web&pl=ext-ff&language=__MSG_extensionUrlLanguage__&extVersion=1.3.0 - # - # [1] https://addons.mozilla.org/en-US/firefox/addon/startpage-private-search/ + engine_region = traits.get_region(params['searxng_locale'], 'en-US') + engine_language = traits.get_language(params['searxng_locale'], 'en') + # build arguments args = { 'query': query, - 'page': params['pageno'], 'cat': 'web', - # 'pl': 'ext-ff', - # 'extVersion': '1.3.0', - # 'abp': "-1", - 'sc': get_sc_code(params['headers']), + 't': 'device', + 'sc': get_sc_code(params['searxng_locale'], params), # hint: this func needs HTTP headers, + 'with_date': time_range_dict.get(params['time_range'], ''), } - # set language if specified - if params['language'] != 'all': - lang_code = match_language(params['language'], supported_languages, fallback=None) - if lang_code: - language_name = supported_languages[lang_code]['alias'] - args['language'] = language_name - args['lui'] = language_name + if engine_language: + args['language'] = engine_language + args['lui'] = engine_language + + args['abp'] = '1' + if params['pageno'] > 1: + args['page'] = params['pageno'] + + # build cookie + lang_homepage = 'en' + cookie = OrderedDict() + cookie['date_time'] = 'world' + cookie['disable_family_filter'] = safesearch_dict[params['safesearch']] + cookie['disable_open_in_new_window'] = '0' + cookie['enable_post_method'] = '1' # hint: POST + cookie['enable_proxy_safety_suggest'] = '1' + cookie['enable_stay_control'] = '1' + cookie['instant_answers'] = '1' + cookie['lang_homepage'] = 's/device/%s/' % lang_homepage + cookie['num_of_results'] = '10' + cookie['suggestions'] = '1' + cookie['wt_unit'] = 'celsius' + + if engine_language: + cookie['language'] = engine_language + cookie['language_ui'] = engine_language + + if engine_region: + cookie['search_results_region'] = engine_region + + params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()]) + logger.debug('cookie preferences: %s', params['cookies']['preferences']) + + # POST request + logger.debug("data: %s", args) + params['data'] = args + params['method'] = 'POST' + params['url'] = search_url + params['headers']['Origin'] = base_url + params['headers']['Referer'] = base_url + '/' + # is the Accept header needed? 
+ # params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' - params['url'] = search_url + urlencode(args) return params # get response from search-request def response(resp): - results = [] + dom = lxml.html.fromstring(resp.text) - dom = html.fromstring(resp.text) + if startpage_categ == 'web': + return _response_cat_web(dom) + + logger.error("Startpages's category '%' is not yet implemented.", startpage_categ) + return [] + + +def _response_cat_web(dom): + results = [] # parse results for result in eval_xpath(dom, results_xpath): @@ -173,7 +364,7 @@ def response(resp): content = content[date_pos:] try: - published_date = parser.parse(date_string, dayfirst=True) + published_date = dateutil.parser.parse(date_string, dayfirst=True) except ValueError: pass @@ -199,62 +390,103 @@ def response(resp): return results -# get supported languages from their site -def _fetch_supported_languages(resp): - # startpage's language selector is a mess each option has a displayed name - # and a value, either of which may represent the language name in the native - # script, the language name in English, an English transliteration of the - # native name, the English name of the writing script used by the language, - # or occasionally something else entirely. - - # this cases are so special they need to be hardcoded, a couple of them are misspellings - language_names = { - 'english_uk': 'en-GB', - 'fantizhengwen': ['zh-TW', 'zh-HK'], - 'hangul': 'ko', - 'malayam': 'ml', - 'norsk': 'nb', - 'sinhalese': 'si', - 'sudanese': 'su', +def fetch_traits(engine_traits: EngineTraits): + """Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage + regions>` from Startpage.""" + # pylint: disable=too-many-branches + + headers = { + 'User-Agent': gen_useragent(), + 'Accept-Language': "en-US,en;q=0.5", # bing needs to set the English language } + resp = network.get('https://www.startpage.com/do/settings', headers=headers) - # get the English name of every language known by babel - language_names.update( - { - # fmt: off - name.lower(): lang_code - # pylint: disable=protected-access - for lang_code, name in Locale('en')._data['languages'].items() - # fmt: on - } - ) + if not resp.ok: + print("ERROR: response from Startpage is not OK.") + + dom = lxml.html.fromstring(resp.text) + + # regions + + sp_region_names = [] + for option in dom.xpath('//form[@name="settings"]//select[@name="search_results_region"]/option'): + sp_region_names.append(option.get('value')) + + for eng_tag in sp_region_names: + if eng_tag == 'all': + continue + babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag) # norway + + if '-' in babel_region_tag: + l, r = babel_region_tag.split('-') + r = r.split('_')[-1] + sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_')) + + else: + try: + sxng_tag = region_tag(babel.Locale.parse(babel_region_tag, sep='_')) + + except babel.UnknownLocaleError: + print("ERROR: can't determine babel locale of startpage's locale %s" % eng_tag) + continue + + conflict = engine_traits.regions.get(sxng_tag) + if conflict: + if conflict != eng_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) + continue + engine_traits.regions[sxng_tag] = eng_tag + + # languages + + catalog_engine2code = {name.lower(): lang_code for lang_code, name in babel.Locale('en').languages.items()} # get the native name of every language known by babel - for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, locale_identifiers()): - 
native_name = Locale(lang_code).get_language_name().lower() + + for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers()): + native_name = babel.Locale(lang_code).get_language_name().lower() # add native name exactly as it is - language_names[native_name] = lang_code + catalog_engine2code[native_name] = lang_code # add "normalized" language name (i.e. français becomes francais and español becomes espanol) unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name))) if len(unaccented_name) == len(unaccented_name.encode()): # add only if result is ascii (otherwise "normalization" didn't work) - language_names[unaccented_name] = lang_code + catalog_engine2code[unaccented_name] = lang_code + + # values that can't be determined by babel's languages names + + catalog_engine2code.update( + { + # traditional chinese used in .. + 'fantizhengwen': 'zh_Hant', + # Korean alphabet + 'hangul': 'ko', + # Malayalam is one of 22 scheduled languages of India. + 'malayam': 'ml', + 'norsk': 'nb', + 'sinhalese': 'si', + } + ) + + skip_eng_tags = { + 'english_uk', # SearXNG lang 'en' already maps to 'english' + } - dom = html.fromstring(resp.text) - sp_lang_names = [] for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'): - sp_lang_names.append((option.get('value'), extract_text(option).lower())) - - supported_languages = {} - for sp_option_value, sp_option_text in sp_lang_names: - lang_code = language_names.get(sp_option_value) or language_names.get(sp_option_text) - if isinstance(lang_code, str): - supported_languages[lang_code] = {'alias': sp_option_value} - elif isinstance(lang_code, list): - for _lc in lang_code: - supported_languages[_lc] = {'alias': sp_option_value} - else: - print('Unknown language option in Startpage: {} ({})'.format(sp_option_value, sp_option_text)) - return supported_languages + eng_tag = option.get('value') + if eng_tag in skip_eng_tags: + continue + name = extract_text(option).lower() + + sxng_tag = catalog_engine2code.get(eng_tag) + if sxng_tag is None: + sxng_tag = catalog_engine2code[name] + + conflict = engine_traits.languages.get(sxng_tag) + if conflict: + if conflict != eng_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) + continue + engine_traits.languages[sxng_tag] = eng_tag diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index 8d3b0839a..6ea77f092 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -1,9 +1,12 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Wikidata +"""This module implements the Wikidata engine. Some implementations are shared +from :ref:`wikipedia engine`. 
+ """ # pylint: disable=missing-class-docstring +from typing import TYPE_CHECKING from hashlib import md5 from urllib.parse import urlencode, unquote from json import loads @@ -13,12 +16,17 @@ from babel.dates import format_datetime, format_date, format_time, get_datetime_ from searx.data import WIKIDATA_UNITS from searx.network import post, get -from searx.utils import match_language, searx_useragent, get_string_replaces_function +from searx.utils import searx_useragent, get_string_replaces_function from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom -from searx.engines.wikipedia import ( # pylint: disable=unused-import - _fetch_supported_languages, - supported_languages_url, -) +from searx.engines.wikipedia import fetch_traits as _fetch_traits +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits # about about = { @@ -154,33 +162,35 @@ def send_wikidata_query(query, method='GET'): def request(query, params): - language = params['language'].split('-')[0] - if language == 'all': - language = 'en' - else: - language = match_language(params['language'], supported_languages, language_aliases).split('-')[0] + + # wikidata does not support zh-classical (zh_Hans) / zh-TW, zh-HK and zh-CN + # mapped to zh + sxng_lang = params['searxng_locale'].split('-')[0] + language = traits.get_language(sxng_lang, 'en') query, attributes = get_query(query, language) + logger.debug("request --> language %s // len(attributes): %s", language, len(attributes)) params['method'] = 'POST' params['url'] = SPARQL_ENDPOINT_URL params['data'] = {'query': query} params['headers'] = get_headers() - params['language'] = language params['attributes'] = attributes + return params def response(resp): + results = [] jsonresponse = loads(resp.content.decode()) - language = resp.search_params['language'].lower() + language = resp.search_params['language'] attributes = resp.search_params['attributes'] + logger.debug("request --> language %s // len(attributes): %s", language, len(attributes)) seen_entities = set() - for result in jsonresponse.get('results', {}).get('bindings', []): attribute_result = {key: value['value'] for key, value in result.items()} entity_url = attribute_result['item'] @@ -756,3 +766,15 @@ def init(engine_settings=None): # pylint: disable=unused-argument lang = result['name']['xml:lang'] entity_id = result['item']['value'].replace('http://www.wikidata.org/entity/', '') WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize() + + +def fetch_traits(engine_traits: EngineTraits): + """Use languages evaluated from :py:obj:`wikipedia.fetch_traits + <searx.engines.wikipedia.fetch_traits>` except zh-classical (zh_Hans) what + is not supported by wikidata.""" + + _fetch_traits(engine_traits) + # wikidata does not support zh-classical (zh_Hans) + engine_traits.languages.pop('zh_Hans') + # wikidata does not have net-locations for the languages + engine_traits.custom['wiki_netloc'] = {} diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index ca841e8b3..9d2d30afa 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -1,13 +1,26 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""This module implements the Wikipedia engine. Some of this implementations +are shared by other engines: + +- :ref:`wikidata engine` + +The list of supported languages is fetched from the article linked by +:py:obj:`wikipedia_article_depth`. 
Unlike traditional search engines, wikipedia +does not support one Wikipedia for all the languages, but there is one Wikipedia +for every language (:py:obj:`fetch_traits`). """ - Wikipedia (Web) -""" -from urllib.parse import quote -from json import loads -from lxml.html import fromstring -from searx.utils import match_language, searx_useragent -from searx.network import raise_for_httperror +import urllib.parse +import babel + +from lxml import html + +from searx import network +from searx.locales import language_tag +from searx.enginelib.traits import EngineTraits + +traits: EngineTraits # about about = { @@ -19,32 +32,40 @@ about = { "results": 'JSON', } - send_accept_language_header = True -# search-url -search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}' -supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias' -language_variants = {"zh": ("zh-cn", "zh-hk", "zh-mo", "zh-my", "zh-sg", "zh-tw")} +wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth' +"""The *editing depth* of Wikipedia is one of several possible rough indicators +of the encyclopedia's collaborative quality, showing how frequently its articles +are updated. The measurement of depth was introduced after some limitations of +the classic measurement of article count were realized. +""" + +# example: https://zh-classical.wikipedia.org/api/rest_v1/page/summary/日 +rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}' +"""`wikipedia rest_v1 summary API`_: The summary response includes an extract of +the first paragraph of the page in plain text and HTML as well as the type of +page. This is useful for page previews (fka. Hovercards, aka. Popups) on the web +and link previews in the apps. +.. 
_wikipedia rest_v1 summary API: https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_ -# set language in base_url -def url_lang(lang): - lang_pre = lang.split('-')[0] - if lang_pre == 'all' or lang_pre not in supported_languages and lang_pre not in language_aliases: - return 'en' - return match_language(lang, supported_languages, language_aliases).split('-')[0] +""" -# do search-request def request(query, params): + """Assemble a request (`wikipedia rest_v1 summary API`_).""" if query.islower(): query = query.title() - language = url_lang(params['language']) - params['url'] = search_url.format(title=quote(query), language=language) + engine_language = traits.get_language(params['searxng_locale'], 'en') + wiki_netloc = traits.custom['wiki_netloc'].get(engine_language, 'https://en.wikipedia.org/wiki/') + title = urllib.parse.quote(query) + + # '!wikipedia 日 :zh-TW' --> https://zh-classical.wikipedia.org/ + # '!wikipedia 日 :zh' --> https://zh.wikipedia.org/ + params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title) - params['headers']['User-Agent'] = searx_useragent() params['raise_for_httperror'] = False params['soft_max_redirects'] = 2 @@ -53,13 +74,14 @@ def request(query, params): # get response from search-request def response(resp): + + results = [] if resp.status_code == 404: return [] - if resp.status_code == 400: try: - api_result = loads(resp.text) - except: + api_result = resp.json() + except Exception: # pylint: disable=broad-except pass else: if ( @@ -68,49 +90,135 @@ def response(resp): ): return [] - raise_for_httperror(resp) - - results = [] - api_result = loads(resp.text) - - # skip disambiguation pages - if api_result.get('type') != 'standard': - return [] + network.raise_for_httperror(resp) + api_result = resp.json() title = api_result['title'] wikipedia_link = api_result['content_urls']['desktop']['page'] - - results.append({'url': wikipedia_link, 'title': title}) - - results.append( - { - 'infobox': title, - 'id': wikipedia_link, - 'content': api_result.get('extract', ''), - 'img_src': api_result.get('thumbnail', {}).get('source'), - 'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}], - } - ) + results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')}) + + if api_result.get('type') == 'standard': + results.append( + { + 'infobox': title, + 'id': wikipedia_link, + 'content': api_result.get('extract', ''), + 'img_src': api_result.get('thumbnail', {}).get('source'), + 'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}], + } + ) return results -# get supported languages from their site -def _fetch_supported_languages(resp): - supported_languages = {} - dom = fromstring(resp.text) - tables = dom.xpath('//table[contains(@class,"sortable")]') - for table in tables: - # exclude header row - trs = table.xpath('.//tr')[1:] - for tr in trs: - td = tr.xpath('./td') - code = td[3].xpath('./a')[0].text - name = td[1].xpath('./a')[0].text - english_name = td[1].xpath('./a')[0].text - articles = int(td[4].xpath('./a')[0].text.replace(',', '')) +# Nonstandard language codes +# +# These Wikipedias use language codes that do not conform to the ISO 639 +# standard (which is how wiki subdomains are chosen nowadays). 
+ +lang_map = { + 'be-tarask': 'bel', + 'ak': 'aka', + 'als': 'gsw', + 'bat-smg': 'sgs', + 'cbk-zam': 'cbk', + 'fiu-vro': 'vro', + 'map-bms': 'map', + 'nrm': 'nrf', + 'roa-rup': 'rup', + 'nds-nl': 'nds', + #'simple: – invented code used for the Simple English Wikipedia (not the official IETF code en-simple) + 'zh-min-nan': 'nan', + 'zh-yue': 'yue', + 'an': 'arg', + 'zh-classical': 'zh-Hant', # babel maps classical to zh-Hans (for whatever reason) +} + +unknown_langs = [ + 'an', # Aragonese + 'ba', # Bashkir + 'bar', # Bavarian + 'bcl', # Central Bicolano + 'be-tarask', # Belarusian variant / Belarusian is already covered by 'be' + 'bpy', # Bishnupriya Manipuri is unknown by babel + 'hif', # Fiji Hindi + 'ilo', # Ilokano + 'li', # Limburgish + 'sco', # Scots (sco) is not known by babel, Scottish Gaelic (gd) is known by babel + 'sh', # Serbo-Croatian + 'simple', # simple english is not know as a natural language different to english (babel) + 'vo', # Volapük + 'wa', # Walloon +] + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages from Wikipedia. + + The location of the Wikipedia address of a language is mapped in a + :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>` + (``wiki_netloc``). Here is a reduced example: + + .. code:: python + + traits.custom['wiki_netloc'] = { + "en": "en.wikipedia.org", + .. + "gsw": "als.wikipedia.org", + .. + "zh": "zh.wikipedia.org", + "zh-classical": "zh-classical.wikipedia.org" + } + + """ + + engine_traits.custom['wiki_netloc'] = {} + + # insert alias to map from a region like zh-CN to a language zh_Hans + engine_traits.languages['zh_Hans'] = 'zh' + + resp = network.get(wikipedia_article_depth) + if not resp.ok: + print("ERROR: response from Wikipedia is not OK.") + + dom = html.fromstring(resp.text) + for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'): + + cols = row.xpath('./td') + if not cols: + continue + cols = [c.text_content().strip() for c in cols] + + depth = float(cols[3].replace('-', '0').replace(',', '')) + articles = int(cols[4].replace(',', '').replace(',', '')) + + if articles < 10000: # exclude languages with too few articles - if articles >= 100: - supported_languages[code] = {"name": name, "english_name": english_name} + continue + + if int(depth) < 20: + # Rough indicator of a Wikipedia’s quality, showing how frequently + # its articles are updated. 
+ continue - return supported_languages + eng_tag = cols[2] + wiki_url = row.xpath('./td[3]/a/@href')[0] + wiki_url = urllib.parse.urlparse(wiki_url) + + if eng_tag in unknown_langs: + continue + + try: + sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-')) + except babel.UnknownLocaleError: + print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag)) + continue + + conflict = engine_traits.languages.get(sxng_tag) + if conflict: + if conflict != eng_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) + continue + + engine_traits.languages[sxng_tag] = eng_tag + engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index c13ce6d78..0fdeacec2 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -17,8 +17,10 @@ from searx.utils import ( eval_xpath_getindex, eval_xpath_list, extract_text, - match_language, ) +from searx.enginelib.traits import EngineTraits + +traits: EngineTraits # about about = { @@ -34,8 +36,7 @@ about = { categories = ['general', 'web'] paging = True time_range_support = True -supported_languages_url = 'https://search.yahoo.com/preferences/languages' -"""Supported languages are read from Yahoo preference page.""" +# send_accept_language_header = True time_range_dict = { 'day': ('1d', 'd'), @@ -43,15 +44,10 @@ time_range_dict = { 'month': ('1m', 'm'), } -language_aliases = { - 'zh-HK': 'zh_chs', - 'zh-CN': 'zh_chs', # dead since 2015 / routed to hk.search.yahoo.com - 'zh-TW': 'zh_cht', -} - lang2domain = { 'zh_chs': 'hk.search.yahoo.com', 'zh_cht': 'tw.search.yahoo.com', + 'any': 'search.yahoo.com', 'en': 'search.yahoo.com', 'bg': 'search.yahoo.com', 'cs': 'search.yahoo.com', @@ -67,21 +63,23 @@ lang2domain = { } """Map language to domain""" - -def _get_language(params): - - lang = language_aliases.get(params['language']) - if lang is None: - lang = match_language(params['language'], supported_languages, language_aliases) - lang = lang.split('-')[0] - logger.debug("params['language']: %s --> %s", params['language'], lang) - return lang +locale_aliases = { + 'zh': 'zh_Hans', + 'zh-HK': 'zh_Hans', + 'zh-CN': 'zh_Hans', # dead since 2015 / routed to hk.search.yahoo.com + 'zh-TW': 'zh_Hant', +} def request(query, params): """build request""" + + lang = locale_aliases.get(params['language'], None) + if not lang: + lang = params['language'].split('-')[0] + lang = traits.get_language(lang, traits.all_locale) + offset = (params['pageno'] - 1) * 7 + 1 - lang = _get_language(params) age, btf = time_range_dict.get(params['time_range'], ('', '')) args = urlencode( @@ -154,13 +152,37 @@ def response(resp): return results -# get supported languages from their site -def _fetch_supported_languages(resp): - supported_languages = [] +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages from yahoo""" + + # pylint: disable=import-outside-toplevel + import babel + from searx import network + from searx.locales import language_tag + + engine_traits.all_locale = 'any' + + resp = network.get('https://search.yahoo.com/preferences/languages') + if not resp.ok: + print("ERROR: response from peertube is not OK.") + dom = html.fromstring(resp.text) offset = len('lang_') + eng2sxng = {'zh_chs': 'zh_Hans', 'zh_cht': 'zh_Hant'} + for val in eval_xpath_list(dom, '//div[contains(@class, "lang-item")]/input/@value'): - supported_languages.append(val[offset:]) + eng_tag = val[offset:] + + try: + sxng_tag = 
language_tag(babel.Locale.parse(eng2sxng.get(eng_tag, eng_tag))) + except babel.UnknownLocaleError: + print('ERROR: unknown language --> %s' % eng_tag) + continue - return supported_languages + conflict = engine_traits.languages.get(sxng_tag) + if conflict: + if conflict != eng_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag)) + continue + engine_traits.languages[sxng_tag] = eng_tag
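
All of the engines touched by this revision converge on the same traits pattern: fetch_traits() maps the tags an engine actually understands to SearXNG's babel-based tags (engine_traits.languages / engine_traits.regions, plus engine_traits.custom for extras such as wiki_netloc), and request() later resolves params['searxng_locale'] back to an engine tag via traits.get_language() or traits.get_region(). Below is a minimal sketch of that pattern for a hypothetical engine; the tag list and the example.org URL are placeholders, only EngineTraits, language_tag and the traits getters are taken from the hunks above.

# illustrative sketch only -- 'example.org' and the tag list are made up,
# they stand in for whatever a real engine would expose in its preferences
# page or API (compare the peertube, qwant, startpage and yahoo hunks above)
from urllib.parse import urlencode

import babel

from searx.locales import language_tag
from searx.enginelib.traits import EngineTraits

traits: EngineTraits  # filled in by the engine loader from the traits map


def fetch_traits(engine_traits: EngineTraits):
    """Map the engine's own language tags to SearXNG (babel) tags."""
    engine_tags = ['en', 'fr', 'pt-br', 'xx']  # placeholder data

    for eng_tag in engine_tags:
        try:
            sxng_tag = language_tag(babel.Locale.parse(eng_tag, sep='-'))
        except babel.UnknownLocaleError:
            print("ERROR: %s is unknown by babel" % eng_tag)  # e.g. 'xx'
            continue

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.languages[sxng_tag] = eng_tag


def request(query, params):
    """Resolve the SearXNG locale back to a tag the engine understands."""
    eng_lang = traits.get_language(params['searxng_locale'], 'en')
    params['url'] = 'https://example.org/search?' + urlencode({'q': query, 'lang': eng_lang})
    return params

The conflict check mirrors the one used in the peertube, qwant, startpage, wikipedia and yahoo hunks: when two engine tags map to the same SearXNG tag, the first mapping wins and a CONFLICT message is printed instead of overwriting it.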