From 249989955497cd048fa3312d115971282983b269 Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Sun, 4 Dec 2022 22:57:22 +0100
Subject: [mod] Google: reverse engineered & upgrade to data_type: traits_v1

Partial reverse engineering of the Google engines including an improved
language and region handling based on the engine.traits_v1 data.

Whenever possible the implementations of the Google engines try to make use of
the async REST APIs. The get_lang_info() function has been generalized to a
get_google_info() function; especially the region handling has been improved by
adding the cr parameter.

searx/data/engine_traits.json

  Add data type "traits_v1" generated by the fetch_traits() functions from:

  - Google (WEB),
  - Google images,
  - Google news,
  - Google scholar and
  - Google videos

  and remove data from obsolete data type "supported_languages".

  A traits.custom type that maps region codes to *supported_domains* is fetched
  from https://www.google.com/supported_domains

searx/autocomplete.py:

  Reverse engineered autocomplete from Google WEB. Supports Google's languages
  and subdomains. The old API suggestqueries.google.com/complete has been
  replaced by the async REST API: https://{subdomain}/complete/search?{args}

searx/engines/google.py

  Reverse engineering and extensive testing ..

  - fetch_traits(): Fetch languages & regions from Google properties.
  - always use the async REST API (formerly known as 'use_mobile_ui')
  - use *supported_domains* from traits
  - improved the result list by fetching './/div[@data-content-feature]'
    and parsing the type of the various *content features* --> thumbnails are
    added

searx/engines/google_images.py

  Reverse engineering and extensive testing ..

  - fetch_traits(): Fetch languages & regions from Google properties.
  - use *supported_domains* from traits
  - if it exists, freshness_date is added to the result
  - issue 1864: result list has been improved a lot (due to the new cr
    parameter)

searx/engines/google_news.py

  Reverse engineering and extensive testing ..

  - fetch_traits(): Fetch languages & regions from Google properties.
    *supported_domains* is not needed but a ceid list has been added.
  - different region handling compared to Google WEB
  - fixed for various languages & regions (due to the new ceid parameter) /
    avoid CONSENT page
  - Google News no longer supports time ranges
  - result list has been fixed: XPath of pub_date and pub_origin

searx/engines/google_videos.py

  - fetch_traits(): Fetch languages & regions from Google properties.
  - use *supported_domains* from traits
  - add paging support
  - implement an async request ('asearch': 'arc' & 'async':
    'use_ac:true,_fmt:html')
  - simplified code (thanks to '_fmt:html' request)
  - issue 1359: fixed XPath of video length data

searx/engines/google_scholar.py

  - fetch_traits(): Fetch languages & regions from Google properties.
  - use *supported_domains* from traits
  - request(): include patents & citations
  - response(): fixed CAPTCHA detection (Scholar has its own CAPTCHA manager)
  - hardened XPath to iterate over results
  - fixed XPath of pub_type (has been changed from the gs_ct1 to the gs_ctg2
    class)
  - issue 1769 fixed: new request implementation is no longer incompatible

Signed-off-by: Markus Heiser
---
 searx/engines/google.py         | 440 ++++++++++++++++++++++------------------
 searx/engines/google_images.py  |  49 +++--
 searx/engines/google_news.py    | 251 ++++++++++++++++++-----
 searx/engines/google_scholar.py | 121 +++++------
 searx/engines/google_videos.py  | 115 +++-------
 5 files changed, 566 insertions(+), 410 deletions(-)
(limited to 'searx/engines')

diff --git a/searx/engines/google.py b/searx/engines/google.py
index bee7085ec..bfdd4f1e5 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -1,38 +1,40 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
-"""This is the implementation of the google WEB engine. Some of this
-implementations are shared by other engines:
+"""This is the implementation of the Google WEB engine. Some of these
+implementations (mainly the :py:obj:`get_google_info`) are shared by other
+engines:
 
 - :ref:`google images engine`
 - :ref:`google news engine`
 - :ref:`google videos engine`
-
-The google WEB engine itself has a special setup option:
-
-.. code:: yaml
-
-  - name: google
-    ...
-    use_mobile_ui: false
-
-``use_mobile_ui``: (default: ``false``)
-  Enables to use *mobile endpoint* to bypass the google blocking (see
-  :issue:`159`). On the mobile UI of Google Search, the button :guilabel:`More
-  results` is not affected by Google rate limiting and we can still do requests
-  while actively blocked by the original Google search. By activate
-  ``use_mobile_ui`` this behavior is simulated by adding the parameter
-  ``async=use_ac:true,_fmt:pc`` to the :py:func:`request`.
+- :ref:`google scholar engine`
+- :ref:`google autocomplete`
 """
 
+from typing import TYPE_CHECKING
+
+import re
 from urllib.parse import urlencode
 from lxml import html
 
-from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
+import babel
+import babel.core
+import babel.languages
+
+from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
+from searx.locales import language_tag, region_tag, get_offical_locales
+from searx import network
 from searx.exceptions import SearxEngineCaptchaException
 from searx.enginelib.traits import EngineTraits
 
+if TYPE_CHECKING:
+    import logging
+
+    logger: logging.Logger
+
+traits: EngineTraits
+
 # about
 about = {
     "website": 'https://www.google.com',
@@ -48,64 +50,6 @@ categories = ['general', 'web']
 paging = True
 time_range_support = True
 safesearch = True
-send_accept_language_header = True
-use_mobile_ui = False
-supported_languages_url = 'https://www.google.com/preferences?#languages'
-
-# based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
-google_domains = {
-    'BG': 'google.bg',  # Bulgaria
-    'CZ': 'google.cz',  # Czech Republic
-    'DE': 'google.de',  # Germany
-    'DK': 'google.dk',  # Denmark
-    'AT': 'google.at',  # Austria
-    'CH': 'google.ch',  # Switzerland
-    'GR': 'google.gr',  # Greece
-    'AU': 'google.com.au',  # Australia
-    'CA': 'google.ca',  # Canada
-    'GB': 'google.co.uk',  # United Kingdom
-    'ID': 'google.co.id',  # Indonesia
-    'IE': 'google.ie',  # Ireland
-    'IN': 'google.co.in',  # India
-    'MY': 'google.com.my',  # Malaysia
-    'NZ': 'google.co.nz',  # New Zealand
-    'PH': 'google.com.ph',  # Philippines
-    'SG': 'google.com.sg',  # Singapore
-    'US': 'google.com',  # United States (google.us) redirects to .com
-    'ZA': 'google.co.za',  # South Africa
-    'AR': 'google.com.ar',  # Argentina
-    'CL': 'google.cl',  # Chile
-    'ES': 'google.es',  # Spain
-    'MX': 'google.com.mx',  # Mexico
-    'EE': 'google.ee',  # Estonia
-    'FI': 'google.fi',  # Finland
-    'BE': 'google.be',  # Belgium
-    'FR': 'google.fr',  # France
-    'IL': 'google.co.il',  # Israel
-    'HR': 'google.hr',  # Croatia
-    'HU': 'google.hu',  # Hungary
-    'IT': 'google.it',  # Italy
-    'JP': 'google.co.jp',  # Japan
-    'KR': 'google.co.kr',  # South Korea
-    'LT': 'google.lt',  # Lithuania
-    'LV': 'google.lv',  # Latvia
-    'NO': 'google.no',  # Norway
-    'NL': 'google.nl',  # Netherlands
-    'PL': 'google.pl',  # Poland
-    'BR': 'google.com.br',  # Brazil
-    'PT': 'google.pt',  # Portugal
-    'RO': 'google.ro',  # Romania
-    'RU': 'google.ru',  # Russia
-    'SK': 'google.sk',  # Slovakia
-    'SI': 'google.si',  # Slovenia
-    'SE': 'google.se',  # Sweden
-    'TH': 'google.co.th',  # Thailand
-    'TR': 'google.com.tr',  # Turkey
-    'UA': 'google.com.ua',  # Ukraine
-    'CN': 'google.com.hk',  # There is no google.cn, we use .com.hk for zh-CN
-    'HK': 'google.com.hk',  # Hong Kong
-    'TW': 'google.com.tw',  # Taiwan
-}
 
 time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
 
@@ -118,47 +62,50 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
 results_xpath = './/div[@data-sokoban-container]'
 title_xpath = './/a/h3[1]'
 href_xpath = './/a[h3]/@href'
-content_xpath = './/div[@data-content-feature=1]'
+content_xpath = './/div[@data-content-feature]'
 
 # google *sections* are not usual *results*, we ignore them
 g_section_with_header = './g-section-with-header'
-
 # Suggestions are links placed in a *card-section*, we extract only the text
 # from the links, not the links themselves.
 suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
 
+# UI_ASYNC = 'use_ac:true,_fmt:html' # returns an HTTP 500 when a user searches for
+#           # celebrities like '!google natasha allegri'
+#           # or '!google chris evans'
+UI_ASYNC = 'use_ac:true,_fmt:prog'
+"""Format of the response from UI's async request."""
+
 
-def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
-    """Composing various language properties for the google engines.
+def get_google_info(params, eng_traits):
+    """Composing various (language) properties for the google engines (:ref:`google
+    API`).
 
     This function is called by the various google engines (:ref:`google web
    engine`, :ref:`google images engine`, :ref:`google news engine` and
    :ref:`google videos engine`).
 
-    :param dict param: request parameters of the engine
-
-    :param list lang_list: list of supported languages of the engine
-        :py:obj:`ENGINES_LANGUAGES[engine-name] `
-
-    :param dict lang_list: custom aliases for non standard language codes
-        (used when calling :py:func:`searx.utils.match_language`)
+    :param dict params: Request parameters of the engine. At least
+        a ``searxng_locale`` key should be in the dictionary.
 
-    :param bool supported_any_language: When a language is not specified, the
-        language interpretation is left up to Google to decide how the search
-        results should be delivered. This argument is ``True`` for the google
-        engine and ``False`` for the other engines (google-images, -news,
-        -scholar, -videos).
+    :param eng_traits: Engine's traits fetched from google preferences
+        (:py:obj:`searx.enginelib.traits.EngineTraits`)
 
    :rtype: dict
    :returns:
       Py-Dictionary with the key/value pairs:
 
       language:
-           Return value from :py:func:`searx.utils.match_language`
+           The language code that is used by google (e.g. ``lang_en`` or
+           ``lang_zh-TW``)
 
       country:
-           The country code (e.g. US, AT, CA, FR, DE ..)
+           The country code that is used by google (e.g. ``US`` or ``TW``)
+
+       locale:
+           An instance of :py:obj:`babel.core.Locale` built from the
+           ``searxng_locale`` value.
 
       subdomain:
          Google subdomain :py:obj:`google_domains` that fits to the country
@@ -168,52 +115,67 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
         Py-Dictionary with additional request arguments (can be passed to
         :py:func:`urllib.parse.urlencode`).
 
+         - ``hl`` parameter: specifies the interface language of the user
+           interface.
+         - ``lr`` parameter: restricts search results to documents written in
+           a particular language.
+         - ``cr`` parameter: restricts search results to documents
+           originating in a particular country.
+         - ``ie`` parameter: sets the character encoding scheme that should
+           be used to interpret the query string ('utf8').
+         - ``oe`` parameter: sets the character encoding scheme that should
+           be used to decode the XML result ('utf8').
+
       headers:
         Py-Dictionary with additional HTTP headers (can be passed to
         request's headers)
+
+         - ``Accept: */*``
+
    """
+
    ret_val = {
        'language': None,
        'country': None,
        'subdomain': None,
        'params': {},
        'headers': {},
+        'cookies': {},
+        'locale': None,
    }
 
-    # language ...
-
-    _lang = params['language']
-    _any_language = _lang.lower() == 'all'
-    if _any_language:
-        _lang = 'en-US'
-    language = match_language(_lang, lang_list, custom_aliases)
-    ret_val['language'] = language
-
-    # country ...
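+    # Example (an illustrative sketch, assuming the fetched traits data
+    # contains the corresponding zh-TW entries): for a request with
+    # params['searxng_locale'] == 'zh-TW' the lookups below should resolve to:
+    #
+    #   locale   --> babel.Locale.parse('zh-TW', sep='-')
+    #   eng_lang --> 'lang_zh-TW'   (via eng_traits.get_language)
+    #   country  --> 'TW'           (via eng_traits.get_region)
+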
+    sxng_locale = params.get('searxng_locale', 'all')
+    try:
+        locale = babel.Locale.parse(sxng_locale, sep='-')
+    except babel.core.UnknownLocaleError:
+        locale = None
 
-    _l = _lang.split('-')
-    if len(_l) == 2:
-        country = _l[1]
-    else:
-        country = _l[0].upper()
-        if country == 'EN':
-            country = 'US'
-    ret_val['country'] = country
+    eng_lang = eng_traits.get_language(sxng_locale, 'lang_en')
+    lang_code = eng_lang.split('_')[-1]  # lang_zh-TW --> zh-TW / lang_en --> en
+    country = eng_traits.get_region(sxng_locale, eng_traits.all_locale)
 
-    # subdomain ...
+    # Test zh_hans & zh_hant --> in the topmost links of the result list of TW
+    # and HK you should find a wiktionary.org zh_hant link. In the result list
+    # of zh-CN there should be no hant link; instead you should find
+    # zh.m.wikipedia.org/zh somewhere in the top.
 
-    ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com')
+    # '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5
+    # '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5
 
-    # params & headers
-
-    lang_country = '%s-%s' % (language, country)  # (en-US, en-EN, de-DE, de-AU, fr-FR ..)
+    ret_val['language'] = eng_lang
+    ret_val['country'] = country
+    ret_val['locale'] = locale
+    ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com')
 
     # hl parameter:
-    # https://developers.google.com/custom-search/docs/xml_results#hlsp The
-    # Interface Language:
+    # The hl parameter specifies the interface language (host language) of
+    # your user interface. To improve the performance and the quality of your
+    # search results, you are strongly encouraged to set this parameter
+    # explicitly.
+    # https://developers.google.com/custom-search/docs/xml_results#hlsp
+    # The Interface Language:
     # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
 
-    ret_val['params']['hl'] = lang_list.get(lang_country, language)
+    ret_val['params']['hl'] = lang_code
 
     # lr parameter:
     # The lr (language restrict) parameter restricts search results to
@@ -221,22 +183,72 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
     # documents written in a particular language.
     # https://developers.google.com/custom-search/docs/xml_results#lrsp
     # Language Collection Values:
     # https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
+    #
+    # To select 'all' languages an empty 'lr' value is used.
+    #
+    # In contrast to other google services, Google Scholar supports selecting
+    # more than one language. The languages are separated by a pipe '|'
+    # (logical OR). For example: &lr=lang_zh-TW%7Clang_de selects articles
+    # written in traditional Chinese OR German.
 
-    if _any_language and supported_any_language:
+    ret_val['params']['lr'] = eng_lang
+    if sxng_locale == 'all':
+        ret_val['params']['lr'] = ''
 
-        # interpretation is left up to Google (based on whoogle)
-        #
-        # - add parameter ``source=lnt``
-        # - don't use parameter ``lr``
-        # - don't add a ``Accept-Language`` HTTP header.
+    # cr parameter:
+    # The cr parameter restricts search results to documents originating in a
+    # particular country.
+    # https://developers.google.com/custom-search/docs/xml_results#crsp
 
-        ret_val['params']['source'] = 'lnt'
+    ret_val['params']['cr'] = 'country' + country
+    if sxng_locale == 'all':
+        ret_val['params']['cr'] = ''
 
-    else:
+    # gl parameter: (mandatory for Google News)
+    # The gl parameter value is a two-letter country code. For WebSearch
+    # results, the gl parameter boosts search results whose country of origin
+    # matches the parameter value. See the Country Codes section for a list of
+    # valid values.
+    # Specifying a gl parameter value in WebSearch requests should improve the
+    # relevance of results. This is particularly true for international
+    # customers and, even more specifically, for customers in English-speaking
+    # countries other than the United States.
+    # https://developers.google.com/custom-search/docs/xml_results#glsp
+
+    ret_val['params']['gl'] = country
+
+    # ie parameter:
+    # The ie parameter sets the character encoding scheme that should be used
+    # to interpret the query string. The default ie value is latin1.
+    # https://developers.google.com/custom-search/docs/xml_results#iesp
+
+    ret_val['params']['ie'] = 'utf8'
+
+    # oe parameter:
+    # The oe parameter sets the character encoding scheme that should be used
+    # to decode the XML result. The default oe value is latin1.
+    # https://developers.google.com/custom-search/docs/xml_results#oesp
+
+    ret_val['params']['oe'] = 'utf8'
+
+    # num parameter:
+    # The num parameter identifies the number of search results to return.
+    # The default num value is 10, and the maximum value is 20. If you request
+    # more than 20 results, only 20 results will be returned.
+    # https://developers.google.com/custom-search/docs/xml_results#numsp
+
+    # HINT: seems to have no effect (tested in google WEB & Images)
+    # ret_val['params']['num'] = 20
+
+    # HTTP headers
+
+    ret_val['headers']['Accept'] = '*/*'
 
-        # restricts search results to documents written in a particular
-        # language.
-        ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language)
+    # Cookies
+
+    # - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
+    # - https://github.com/searxng/searxng/issues/1555
+    ret_val['cookies']['CONSENT'] = "YES+"
 
     return ret_val
 
@@ -248,33 +260,34 @@ def detect_google_sorry(resp):
 
 def request(query, params):
     """Google search request"""
-
+    # pylint: disable=line-too-long
     offset = (params['pageno'] - 1) * 10
-
-    lang_info = get_lang_info(params, supported_languages, language_aliases, True)
-
-    additional_parameters = {}
-    if use_mobile_ui:
-        additional_parameters = {
-            'asearch': 'arc',
-            'async': 'use_ac:true,_fmt:prog',
-        }
+    google_info = get_google_info(params, traits)
 
     # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
     query_url = (
         'https://'
-        + lang_info['subdomain']
+        + google_info['subdomain']
         + '/search'
         + "?"
         + urlencode(
             {
                 'q': query,
-                **lang_info['params'],
-                'ie': "utf8",
-                'oe': "utf8",
-                'start': offset,
+                **google_info['params'],
                 'filter': '0',
-                **additional_parameters,
+                'start': offset,
+                # 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
+                # 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
+                # 'cs' : 1,
+                # 'sa': 'N',
+                # 'yv': 3,
+                # 'prmd': 'vin',
+                # 'ei': 'GASaY6TxOcy_xc8PtYeY6AE',
+                # 'sa': 'N',
+                # 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
+                # formerly known as use_mobile_ui
+                'asearch': 'arc',
+                'async': UI_ASYNC,
             }
         )
     )
@@ -285,25 +298,45 @@ def request(query, params):
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
 
     params['url'] = query_url
-    params['cookies']['CONSENT'] = "YES+"
-    params['headers'].update(lang_info['headers'])
-    if use_mobile_ui:
-        params['headers']['Accept'] = '*/*'
-    else:
-        params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
-
+    params['cookies'] = google_info['cookies']
+    params['headers'].update(google_info['headers'])
     return params
 
 
+# (function(){var s='data:image/jpeg;base64,/9j/4AAQ ...
+# ... DX/Ff5XSpSgdU32xSlKDJ//9k\x3d';var ii=['dimg_21'];_setImagesSrc(ii,s);})();
+RE_DATA_IMAGE = re.compile(r"'(data:image[^']*)'[^']*ii=\['([^']*)'\];_setImagesSrc")
+
+
+def _parse_data_images(dom):
+    data_image_map = {}
+    for _script in eval_xpath_list(dom, "//script[@nonce]"):
+        script = _script.text
+        if not script:
+            continue
+        script = RE_DATA_IMAGE.search(script)
+        if not script:
+            continue
+        data_image_map[script.group(2)] = script.group(1).replace(r'\x3d', '=')
+    logger.debug('data:image objects --> %s', list(data_image_map.keys()))
+    return data_image_map
+
+
 def response(resp):
     """Get response from google's search request"""
-
+    # pylint: disable=too-many-branches, too-many-statements
     detect_google_sorry(resp)
 
     results = []
 
     # convert the text to dom
     dom = html.fromstring(resp.text)
+
+    data_image_map = {}
+    if '_fmt:html' in UI_ASYNC:
+        # in this format images are embedded by a base64 encoded 'data:image'
+        data_image_map = _parse_data_images(dom)
+
     # results --> answer
     answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
     if answer_list:
@@ -312,20 +345,9 @@ def response(resp):
     else:
         logger.debug("did not find 'answer'")
 
-    # results --> number_of_results
-    if not use_mobile_ui:
-        try:
-            _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0)
-            _digit = ''.join([n for n in _txt if n.isdigit()])
-            number_of_results = int(_digit)
-            results.append({'number_of_results': number_of_results})
-        except Exception as e:  # pylint: disable=broad-except
-            logger.debug("did not 'number_of_results'")
-            logger.error(e, exc_info=True)
-
     # parse results
-    for result in eval_xpath_list(dom, results_xpath):
+    for result in eval_xpath_list(dom, results_xpath):  # pylint: disable=too-many-nested-blocks
 
         # google *sections*
         if extract_text(eval_xpath(result, g_section_with_header)):
@@ -342,13 +364,31 @@ def response(resp):
             url = eval_xpath_getindex(result, href_xpath, 0, None)
             if url is None:
                 continue
-            content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
-            if content is None:
+
+            content = []
+            img_list = []
+            for content_feature in eval_xpath(result, content_xpath):
+                val = content_feature.attrib['data-content-feature']
+                if val in ['1', '2']:
+                    txt = extract_text(content_feature, allow_none=True)
+                    if txt:
+                        content.append(txt)
+                elif '0' in val:
+                    img = content_feature.xpath('.//img/@src')
+                    if img:
+                        img = img[0]
+                        if img.startswith('data:image'):
+                            img_id = content_feature.xpath('.//img/@id')
+                            if img_id:
+                                img = data_image_map.get(img_id[0])
+                        img_list.append(img)
+
+            if not content:
                 logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
                 continue
-
-            logger.debug('add link to results: %s', title)
-            results.append({'url': url, 'title': title, 'content': content})
+            content = ' / '.join(content)
+            img_src = img_list[0] if img_list else None
+            results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src})
 
         except Exception as e:  # pylint: disable=broad-except
             logger.error(e, exc_info=True)
@@ -364,18 +404,6 @@ def response(resp):
 
 
 # get supported languages from their site
-def _fetch_supported_languages(resp):
-    ret_val = {}
-    dom = html.fromstring(resp.text)
-
-    radio_buttons = eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]')
-
-    for x in radio_buttons:
-        name = x.get("data-name")
-        code = x.get("value").split('_')[-1]
-        ret_val[code] = {"name": name}
-
-    return ret_val
 
 
 skip_countries = [
@@ -404,25 +432,21 @@ skip_countries = [
 ]
 
 
-def fetch_traits(engine_traits: EngineTraits):
+def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
     """Fetch languages from Google."""
-    # pylint: disable=import-outside-toplevel
-
-    engine_traits.data_type = 'supported_languages'  # deprecated
+    # pylint: disable=import-outside-toplevel, too-many-branches
 
-    import babel
-    import babel.languages
-    from searx import network
-    from searx.locales import language_tag, region_tag, get_offical_locales
+    engine_traits.custom['supported_domains'] = {}
 
     resp = network.get('https://www.google.com/preferences')
     if not resp.ok:
-        print("ERROR: response from Google is not OK.")
+        raise RuntimeError("Response from Google's preferences is not OK.")
 
     dom = html.fromstring(resp.text)
 
-    lang_map = {'no': 'nb'}
+    # supported language codes
 
+    lang_map = {'no': 'nb'}
     for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'):
 
         eng_lang = x.get("value").split('_')[-1]
@@ -443,6 +467,8 @@ def fetch_traits(engine_traits: EngineTraits):
     # alias languages
     engine_traits.languages['zh'] = 'lang_zh-CN'
 
+    # supported region codes
+
     for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'):
         eng_country = x.get("value")
 
@@ -459,4 +485,26 @@ def fetch_traits(engine_traits: EngineTraits):
             continue
 
         for sxng_locale in sxng_locales:
-            engine_traits.regions[region_tag(sxng_locale)] = 'country' + eng_country
+            engine_traits.regions[region_tag(sxng_locale)] = eng_country
+
+    # alias regions
+    engine_traits.regions['zh-CN'] = 'HK'
+
+    # supported domains
+
+    if add_domains:
+        resp = network.get('https://www.google.com/supported_domains')
+        if not resp.ok:
+            raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.")
+
+        for domain in resp.text.split():
+            domain = domain.strip()
+            if not domain or domain in [
+                '.google.com',
+            ]:
+                continue
+            region = domain.split('.')[-1].upper()
+            engine_traits.custom['supported_domains'][region] = 'www' + domain
+            if region == 'HK':
+                # There is no google.cn, we use .com.hk for zh-CN
+                engine_traits.custom['supported_domains']['CN'] = 'www' + domain
diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py
index 219f2adee..e6445b1c4 100644
--- a/searx/engines/google_images.py
+++ b/searx/engines/google_images.py
@@ -1,31 +1,38 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
-"""This is the implementation of the google images engine using the google
-internal API used the Google Go Android app.
+"""This is the implementation of the Google Images engine using the internal
+Google API used by the Google Go Android app.
 
 This internal API offers results in
 
-- JSON (_fmt:json)
-- Protobuf (_fmt:pb)
-- Protobuf compressed? (_fmt:pc)
-- HTML (_fmt:html)
-- Protobuf encoded in JSON (_fmt:jspb).
+- JSON (``_fmt:json``)
+- Protobuf_ (``_fmt:pb``)
+- Protobuf_ compressed? (``_fmt:pc``)
+- HTML (``_fmt:html``)
+- Protobuf_ encoded in JSON (``_fmt:jspb``).
 
+.. _Protobuf: https://en.wikipedia.org/wiki/Protocol_Buffers
 """
 
+from typing import TYPE_CHECKING
+
 from urllib.parse import urlencode
 from json import loads
 
+from searx.engines.google import fetch_traits  # pylint: disable=unused-import
 from searx.engines.google import (
-    get_lang_info,
+    get_google_info,
     time_range_dict,
     detect_google_sorry,
 )
 
-# pylint: disable=unused-import
-from searx.engines.google import supported_languages_url, _fetch_supported_languages, fetch_traits
+if TYPE_CHECKING:
+    import logging
+    from searx.enginelib.traits import EngineTraits
+
+    logger: logging.Logger
+    traits: EngineTraits
 
-# pylint: enable=unused-import
 
 # about
 about = {
@@ -40,7 +47,6 @@ about = {
 # engine dependent config
 categories = ['images', 'web']
 paging = True
-use_locale_domain = True
 time_range_support = True
 safesearch = True
 send_accept_language_header = True
@@ -51,20 +57,18 @@ filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
 
 def request(query, params):
     """Google-Image search request"""
-    lang_info = get_lang_info(params, supported_languages, language_aliases, False)
+    google_info = get_google_info(params, traits)
 
     query_url = (
         'https://'
-        + lang_info['subdomain']
+        + google_info['subdomain']
        + '/search'
        + "?"
        + urlencode(
            {
                'q': query,
                'tbm': "isch",
-                **lang_info['params'],
-                'ie': "utf8",
-                'oe': "utf8",
+                **google_info['params'],
                'asearch': 'isch',
                'async': '_fmt:json,p:1,ijn:' + str(params['pageno']),
            }
@@ -77,9 +81,8 @@ def request(query, params):
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
 
     params['url'] = query_url
-    params['headers'].update(lang_info['headers'])
-    params['headers']['User-Agent'] = 'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12; US) gzip'
-    params['headers']['Accept'] = '*/*'
+    params['cookies'] = google_info['cookies']
+    params['headers'].update(google_info['headers'])
     return params
 
@@ -111,7 +114,11 @@ def response(resp):
 
         copyright_notice = item["result"].get('iptc', {}).get('copyright_notice')
         if copyright_notice:
-            result_item['source'] += ' / ' + copyright_notice
+            result_item['source'] += ' | ' + copyright_notice
+
+        freshness_date = item["result"].get("freshness_date")
+        if freshness_date:
+            result_item['source'] += ' | ' + freshness_date
 
         file_size = item.get('gsa', {}).get('file_size')
         if file_size:
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index 8962af36a..ae55ca9cb 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -1,24 +1,40 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
-"""This is the implementation of the google news engine. The google news API
-ignores some parameters from the common :ref:`google API`:
+"""This is the implementation of the Google News engine.
 
-- num_ : the number of search results is ignored
+Google News has a different region handling compared to Google WEB.
+
+- the ``ceid`` argument has to be set (:py:obj:`ceid_list`)
+- the hl_ argument has to be set correctly (and different to Google WEB)
+- the gl_ argument is mandatory
+
+If one of these arguments is not set correctly, the request is redirected to
+the CONSENT dialog::
+
+    https://consent.google.com/m?continue=
+
+The google news API ignores some parameters from the common :ref:`google API`:
+
+- num_ : the number of search results is ignored / there is no paging; all
+  results for a query term are in the first response.
 - save_ : is ignored / Google-News results are always *SafeSearch*
 
+.. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp
+.. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp
 .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
 .. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
-
 """
 
-# pylint: disable=invalid-name
+from typing import TYPE_CHECKING
 
 import binascii
 import re
 from urllib.parse import urlencode
 from base64 import b64decode
 from lxml import html
 
+import babel
+from searx import locales
 from searx.utils import (
     eval_xpath,
     eval_xpath_list,
@@ -26,19 +42,19 @@ from searx.utils import (
     extract_text,
 )
 
-# pylint: disable=unused-import
+from searx.engines.google import fetch_traits as _fetch_traits  # pylint: disable=unused-import
 from searx.engines.google import (
-    fetch_traits,
-    supported_languages_url,
-    _fetch_supported_languages,
+    get_google_info,
+    detect_google_sorry,
 )
+from searx.enginelib.traits import EngineTraits
 
-# pylint: enable=unused-import
+if TYPE_CHECKING:
+    import logging
 
-from searx.engines.google import (
-    get_lang_info,
-    detect_google_sorry,
-)
+    logger: logging.Logger
+
+traits: EngineTraits
 
 # about
 about = {
@@ -50,70 +66,77 @@ about = {
     "results": 'HTML',
 }
 
-# compared to other google engines google-news has a different time range
-# support. The time range is included in the search term.
-time_range_dict = {
-    'day': 'when:1d',
-    'week': 'when:7d',
-    'month': 'when:1m',
-    'year': 'when:1y',
-}
-
 # engine dependent config
-
 categories = ['news']
 paging = False
-use_locale_domain = True
-time_range_support = True
+time_range_support = False
 
 # Google-News results are always *SafeSearch*. Option 'safesearch' is set to
 # False here, otherwise checker will report safesearch-errors::
 #
 #   safesearch : results are identical for safesearch=0 and safesearch=2
-safesearch = False
-send_accept_language_header = True
+safesearch = True
+# send_accept_language_header = True
 
 
 def request(query, params):
     """Google-News search request"""
 
-    lang_info = get_lang_info(params, supported_languages, language_aliases, False)
+    sxng_locale = params.get('searxng_locale', 'en-US')
+    ceid = locales.get_engine_locale(sxng_locale, traits.custom['ceid'], default='US:en')
+    google_info = get_google_info(params, traits)
+    google_info['subdomain'] = 'news.google.com'  # google news has only one domain
 
-    # google news has only one domain
-    lang_info['subdomain'] = 'news.google.com'
+    ceid_region, ceid_lang = ceid.split(':')
+    ceid_lang, ceid_suffix = (
+        ceid_lang.split('-')
+        + [
+            None,
+        ]
+    )[:2]
 
-    ceid = "%s:%s" % (lang_info['country'], lang_info['language'])
+    google_info['params']['hl'] = ceid_lang
 
-    # google news redirects en to en-US
-    if lang_info['params']['hl'] == 'en':
-        lang_info['params']['hl'] = 'en-US'
+    if ceid_suffix and ceid_suffix not in ['Hans', 'Hant']:
 
-    # Very special to google-news compared to other google engines, the time
-    # range is included in the search term.
-    if params['time_range']:
-        query += ' ' + time_range_dict[params['time_range']]
+        if ceid_region.lower() == ceid_lang:
+            google_info['params']['hl'] = ceid_lang + '-' + ceid_region
+        else:
+            google_info['params']['hl'] = ceid_lang + '-' + ceid_suffix
+
+    elif ceid_region.lower() != ceid_lang:
+
+        if ceid_region in ['AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT']:
+            google_info['params']['hl'] = ceid_lang
+        else:
+            google_info['params']['hl'] = ceid_lang + '-' + ceid_region
+
+    google_info['params']['lr'] = 'lang_' + ceid_lang.split('-')[0]
+    google_info['params']['gl'] = ceid_region
 
     query_url = (
         'https://'
-        + lang_info['subdomain']
-        + '/search'
-        + "?"
-        + urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'gl': lang_info['country']})
+        + google_info['subdomain']
+        + "/search?"
+        + urlencode(
+            {
+                'q': query,
+                **google_info['params'],
+            }
+        )
+        # ceid includes a ':' character which must not be urlencoded
+        + ('&ceid=%s' % ceid)
-    )  # ceid includes a ':' character which must not be urlencoded
-
-    params['url'] = query_url
-
-    params['cookies']['CONSENT'] = "YES+"
-    params['headers'].update(lang_info['headers'])
-    params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
+    )
+    params['url'] = query_url
+    params['cookies'] = google_info['cookies']
+    params['headers'].update(google_info['headers'])
 
     return params
 
 
 def response(resp):
     """Get response from google's search request"""
     results = []
-
     detect_google_sorry(resp)
 
     # convert the text to dom
@@ -153,8 +176,8 @@ def response(resp):
 
         # The pub_date is mostly a string like 'yesterday', not a real
         # timezone date or time. Therefore we can't use publishedDate.
-        pub_date = extract_text(eval_xpath(result, './article/div[1]/div[1]/time'))
-        pub_origin = extract_text(eval_xpath(result, './article/div[1]/div[1]/a'))
+        pub_date = extract_text(eval_xpath(result, './article//time'))
+        pub_origin = extract_text(eval_xpath(result, './article//a[@data-n-tid]'))
 
         content = ' / '.join([x for x in [pub_origin, pub_date] if x])
 
@@ -175,3 +198,127 @@
 
     # return results
     return results
+
+
+ceid_list = [
+    'AE:ar',
+    'AR:es-419',
+    'AT:de',
+    'AU:en',
+    'BD:bn',
+    'BE:fr',
+    'BE:nl',
+    'BG:bg',
+    'BR:pt-419',
+    'BW:en',
+    'CA:en',
+    'CA:fr',
+    'CH:de',
+    'CH:fr',
+    'CL:es-419',
+    'CN:zh-Hans',
+    'CO:es-419',
+    'CU:es-419',
+    'CZ:cs',
+    'DE:de',
+    'EG:ar',
+    'ES:es',
+    'ET:en',
+    'FR:fr',
+    'GB:en',
+    'GH:en',
+    'GR:el',
+    'HK:zh-Hant',
+    'HU:hu',
+    'ID:en',
+    'ID:id',
+    'IE:en',
+    'IL:en',
+    'IL:he',
+    'IN:bn',
+    'IN:en',
+    'IN:hi',
+    'IN:ml',
+    'IN:mr',
+    'IN:ta',
+    'IN:te',
+    'IT:it',
+    'JP:ja',
+    'KE:en',
+    'KR:ko',
+    'LB:ar',
+    'LT:lt',
+    'LV:en',
+    'LV:lv',
+    'MA:fr',
+    'MX:es-419',
+    'MY:en',
+    'NA:en',
+    'NG:en',
+    'NL:nl',
+    'NO:no',
+    'NZ:en',
+    'PE:es-419',
+    'PH:en',
+    'PK:en',
+    'PL:pl',
+    'PT:pt-150',
+    'RO:ro',
+    'RS:sr',
+    'RU:ru',
+    'SA:ar',
+    'SE:sv',
+    'SG:en',
+    'SI:sl',
+    'SK:sk',
+    'SN:fr',
+    'TH:th',
+    'TR:tr',
+    'TW:zh-Hant',
+    'TZ:en',
+    'UA:ru',
+    'UA:uk',
+    'UG:en',
+    'US:en',
+    'US:es-419',
+    'VE:es-419',
+    'VN:vi',
+    'ZA:en',
+    'ZW:en',
+]
+"""List of region/language combinations supported by Google News. Values of the
+``ceid`` argument of the Google News REST API."""
+
+
+_skip_values = [
+    'ET:en',  # english (ethiopia)
+    'ID:en',  # english (indonesia)
+    'LV:en',  # english (latvia)
+]
+
+_ceid_locale_map = {'NO:no': 'nb-NO'}
+
+
+def fetch_traits(engine_traits: EngineTraits):
+    _fetch_traits(engine_traits, add_domains=False)
+
+    engine_traits.custom['ceid'] = {}
+
+    for ceid in ceid_list:
+        if ceid in _skip_values:
+            continue
+
+        region, lang = ceid.split(':')
+        x = lang.split('-')
+        if len(x) > 1:
+            if x[1] not in ['Hant', 'Hans']:
+                lang = x[0]
+
+        sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region)
+        try:
+            locale = babel.Locale.parse(sxng_locale, sep='-')
+        except babel.UnknownLocaleError:
+            print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale))
+            continue
+
+        engine_traits.custom['ceid'][locales.region_tag(locale)] = ceid
diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py
index 38aaf904b..6f33d1e1a 100644
--- a/searx/engines/google_scholar.py
+++ b/searx/engines/google_scholar.py
@@ -1,19 +1,18 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
-"""Google (Scholar)
+"""This is the implementation of the Google Scholar engine.
 
-For detailed description of the *REST-full* API see: `Query Parameter
-Definitions`_.
-
-.. _Query Parameter Definitions:
-   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
+Compared to other Google services, the Scholar engine has a simple GET REST-API
+and there exists no `async` API. Even though the API is slightly vintage, we
+can make use of the :ref:`google API` to assemble the arguments of the GET
+request.
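+
+For example a Scholar query for *foo* with the SearXNG locale ``en-US`` might
+be assembled into a GET request like the one sketched below (an illustration
+built from the arguments described in this module, not a captured request)::
+
+    https://scholar.google.com/scholar?q=foo&hl=en&lr=lang_en&cr=countryUS&gl=US&ie=utf8&oe=utf8&start=0&as_sdt=2007&as_vis=0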
""" -# pylint: disable=invalid-name +from typing import TYPE_CHECKING +from typing import Optional from urllib.parse import urlencode from datetime import datetime -from typing import Optional from lxml import html from searx.utils import ( @@ -23,20 +22,21 @@ from searx.utils import ( extract_text, ) +from searx.exceptions import SearxEngineCaptchaException + +from searx.engines.google import fetch_traits # pylint: disable=unused-import from searx.engines.google import ( - get_lang_info, + get_google_info, time_range_dict, - detect_google_sorry, ) +from searx.enginelib.traits import EngineTraits -# pylint: disable=unused-import -from searx.engines.google import ( - fetch_traits, - supported_languages_url, - _fetch_supported_languages, -) +if TYPE_CHECKING: + import logging -# pylint: enable=unused-import + logger: logging.Logger + +traits: EngineTraits # about about = { @@ -52,53 +52,62 @@ about = { categories = ['science', 'scientific publications'] paging = True language_support = True -use_locale_domain = True time_range_support = True safesearch = False send_accept_language_header = True -def time_range_url(params): - """Returns a URL query component for a google-Scholar time range based on - ``params['time_range']``. Google-Scholar does only support ranges in years. - To have any effect, all the Searx ranges (*day*, *week*, *month*, *year*) - are mapped to *year*. If no range is set, an empty string is returned. - Example:: +def time_range_args(params): + """Returns a dictionary with a time range arguments based on + ``params['time_range']``. - &as_ylo=2019 - """ - # as_ylo=2016&as_yhi=2019 - ret_val = '' - if params['time_range'] in time_range_dict: - ret_val = urlencode({'as_ylo': datetime.now().year - 1}) - return '&' + ret_val + Google Scholar supports a detailed search by year. Searching by *last + month* or *last week* (as offered by SearXNG) is uncommon for scientific + publications and is not supported by Google Scholar. + To limit the result list when the users selects a range, all the SearXNG + ranges (*day*, *week*, *month*, *year*) are mapped to *year*. If no range + is set an empty dictionary of arguments is returned. Example; when + user selects a time range (current year minus one in 2022): -def request(query, params): - """Google-Scholar search request""" + .. code:: python - offset = (params['pageno'] - 1) * 10 - lang_info = get_lang_info(params, supported_languages, language_aliases, False) + { 'as_ylo' : 2021 } - # subdomain is: scholar.google.xy - lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.") + """ + ret_val = {} + if params['time_range'] in time_range_dict: + ret_val['as_ylo'] = datetime.now().year - 1 + return ret_val - query_url = ( - 'https://' - + lang_info['subdomain'] - + '/scholar' - + "?" - + urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'start': offset}) - ) - query_url += time_range_url(params) - params['url'] = query_url +def detect_google_captcha(dom): + """In case of CAPTCHA Google Scholar open its own *not a Robot* dialog and is + not redirected to ``sorry.google.com``. 
+ """ + if eval_xpath(dom, "//form[@id='gs_captcha_f']"): + raise SearxEngineCaptchaException() + - params['cookies']['CONSENT'] = "YES+" - params['headers'].update(lang_info['headers']) - params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' +def request(query, params): + """Google-Scholar search request""" - # params['google_subdomain'] = subdomain + google_info = get_google_info(params, traits) + # subdomain is: scholar.google.xy + google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.") + + args = { + 'q': query, + **google_info['params'], + 'start': (params['pageno'] - 1) * 10, + 'as_sdt': '2007', # include patents / to disable set '0,5' + 'as_vis': '0', # include citations / to disable set '1' + } + args.update(time_range_args(params)) + + params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args) + params['cookies'] = google_info['cookies'] + params['headers'].update(google_info['headers']) return params @@ -139,19 +148,15 @@ def parse_gs_a(text: Optional[str]): def response(resp): # pylint: disable=too-many-locals - """Get response from google's search request""" + """Parse response from Google Scholar""" results = [] - detect_google_sorry(resp) - - # which subdomain ? - # subdomain = resp.search_params.get('google_subdomain') - # convert the text to dom dom = html.fromstring(resp.text) + detect_google_captcha(dom) # parse results - for result in eval_xpath_list(dom, '//div[@data-cid]'): + for result in eval_xpath_list(dom, '//div[@data-rp]'): title = extract_text(eval_xpath(result, './/h3[1]//a')) @@ -159,7 +164,7 @@ def response(resp): # pylint: disable=too-many-locals # this is a [ZITATION] block continue - pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]')) + pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]')) if pub_type: pub_type = pub_type[1:-1].lower() diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py index 5ab29f9ff..985189df5 100644 --- a/searx/engines/google_videos.py +++ b/searx/engines/google_videos.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""This is the implementation of the google videos engine. +"""This is the implementation of the Google Videos engine. .. 
 .. admonition:: Content-Security-Policy (CSP)
 
@@ -14,9 +14,8 @@
 
 """
 
-# pylint: disable=invalid-name
+from typing import TYPE_CHECKING
 
-import re
 from urllib.parse import urlencode
 from lxml import html
 
@@ -27,20 +26,22 @@ from searx.utils import (
     extract_text,
 )
 
+from searx.engines.google import fetch_traits  # pylint: disable=unused-import
 from searx.engines.google import (
-    get_lang_info,
+    get_google_info,
     time_range_dict,
     filter_mapping,
-    g_section_with_header,
-    title_xpath,
     suggestion_xpath,
     detect_google_sorry,
 )
+from searx.enginelib.traits import EngineTraits
 
-# pylint: disable=unused-import
-from searx.engines.google import supported_languages_url, _fetch_supported_languages, fetch_traits
+if TYPE_CHECKING:
+    import logging
 
-# pylint: enable=unused-import
+    logger: logging.Logger
+
+traits: EngineTraits
 
 # about
 about = {
@@ -55,70 +56,32 @@ about = {
 
 # engine dependent config
 categories = ['videos', 'web']
-paging = False
+paging = True
 language_support = True
-use_locale_domain = True
 time_range_support = True
 safesearch = True
-send_accept_language_header = True
-
-RE_CACHE = {}
-
-
-def _re(regexpr):
-    """returns compiled regular expression"""
-    RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
-    return RE_CACHE[regexpr]
-
-
-def scrap_out_thumbs_src(dom):
-    ret_val = {}
-    thumb_name = 'dimg_'
-    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
-        _script = script.text
-        # "dimg_35":"https://i.ytimg.c....",
-        _dimurl = _re("s='([^']*)").findall(_script)
-        for k, v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)').findall(_script):
-            v = v.replace(r'\u003d', '=')
-            v = v.replace(r'\u0026', '&')
-            ret_val[k] = v
-    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
-    return ret_val
-
-
-def scrap_out_thumbs(dom):
-    """Scrap out thumbnail data from