From f78f9083836be851c224b4334b53b9686835e300 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sat, 8 Oct 2022 11:32:08 +0200 Subject: [mod] Google: fetch engine traits (data_type: supported_languages) Implements a fetch_traits function for the Google engines. .. note:: Does not include migration of the request methode from 'supported_languages' to 'traits' (EngineTraits) object! Signed-off-by: Markus Heiser --- searx/engines/google_news.py | 1 + 1 file changed, 1 insertion(+) (limited to 'searx/engines/google_news.py') diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index 1ada2d64d..8962af36a 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -28,6 +28,7 @@ from searx.utils import ( # pylint: disable=unused-import from searx.engines.google import ( + fetch_traits, supported_languages_url, _fetch_supported_languages, ) -- cgit v1.2.3 From 249989955497cd048fa3312d115971282983b269 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 4 Dec 2022 22:57:22 +0100 Subject: [mod] Google: reversed engineered & upgrade to data_type: traits_v1 Partial reverse engineering of the Google engines including a improved language and region handling based on the engine.traits_v1 data. When ever possible the implementations of the Google engines try to make use of the async REST APIs. The get_lang_info() has been generalized to a get_google_info() function / especially the region handling has been improved by adding the cr parameter. searx/data/engine_traits.json Add data type "traits_v1" generated by the fetch_traits() functions from: - Google (WEB), - Google images, - Google news, - Google scholar and - Google videos and remove data from obsolete data type "supported_languages". A traits.custom type that maps region codes to *supported_domains* is fetched from https://www.google.com/supported_domains searx/autocomplete.py: Reversed engineered autocomplete from Google WEB. Supports Google's languages and subdomains. The old API suggestqueries.google.com/complete has been replaced by the async REST API: https://{subdomain}/complete/search?{args} searx/engines/google.py Reverse engineering and extensive testing .. - fetch_traits(): Fetch languages & regions from Google properties. - always use the async REST API (formally known as 'use_mobile_ui') - use *supported_domains* from traits - improved the result list by fetching './/div[@data-content-feature]' and parsing the type of the various *content features* --> thumbnails are added searx/engines/google_images.py Reverse engineering and extensive testing .. - fetch_traits(): Fetch languages & regions from Google properties. - use *supported_domains* from traits - if exists, freshness_date is added to the result - issue 1864: result list has been improved a lot (due to the new cr parameter) searx/engines/google_news.py Reverse engineering and extensive testing .. - fetch_traits(): Fetch languages & regions from Google properties. *supported_domains* is not needed but a ceid list has been added. - different region handling compared to Google WEB - fixed for various languages & regions (due to the new ceid parameter) / avoid CONSENT page - Google News do no longer support time range - result list has been fixed: XPath of pub_date and pub_origin searx/engines/google_videos.py - fetch_traits(): Fetch languages & regions from Google properties. - use *supported_domains* from traits - add paging support - implement a async request ('asearch': 'arc' & 'async': 'use_ac:true,_fmt:html') - simplified code (thanks to '_fmt:html' request) - issue 1359: fixed xpath of video length data searx/engines/google_scholar.py - fetch_traits(): Fetch languages & regions from Google properties. - use *supported_domains* from traits - request(): include patents & citations - response(): fixed CAPTCHA detection (Scholar has its own CATCHA manager) - hardening XPath to iterate over results - fixed XPath of pub_type (has been change from gs_ct1 to gs_cgt2 class) - issue 1769 fixed: new request implementation is no longer incompatible Signed-off-by: Markus Heiser --- searx/engines/google_news.py | 251 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 199 insertions(+), 52 deletions(-) (limited to 'searx/engines/google_news.py') diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index 8962af36a..ae55ca9cb 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -1,24 +1,40 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""This is the implementation of the google news engine. The google news API -ignores some parameters from the common :ref:`google API`: +"""This is the implementation of the Google News engine. -- num_ : the number of search results is ignored +Google News has a different region handling compared to Google WEB. + +- the ``ceid`` argument has to be set (:py:obj:`ceid_list`) +- the hl_ argument has to be set correctly (and different to Google WEB) +- the gl_ argument is mandatory + +If one of this argument is not set correctly, the request is redirected to +CONSENT dialog:: + + https://consent.google.com/m?continue= + +The google news API ignores some parameters from the common :ref:`google API`: + +- num_ : the number of search results is ignored / there is no paging all + results for a query term are in the first response. - save_ : is ignored / Google-News results are always *SafeSearch* +.. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp +.. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp .. _save: https://developers.google.com/custom-search/docs/xml_results#safesp - """ -# pylint: disable=invalid-name +from typing import TYPE_CHECKING import binascii import re from urllib.parse import urlencode from base64 import b64decode from lxml import html +import babel +from searx import locales from searx.utils import ( eval_xpath, eval_xpath_list, @@ -26,19 +42,19 @@ from searx.utils import ( extract_text, ) -# pylint: disable=unused-import +from searx.engines.google import fetch_traits as _fetch_traits # pylint: disable=unused-import from searx.engines.google import ( - fetch_traits, - supported_languages_url, - _fetch_supported_languages, + get_google_info, + detect_google_sorry, ) +from searx.enginelib.traits import EngineTraits -# pylint: enable=unused-import +if TYPE_CHECKING: + import logging -from searx.engines.google import ( - get_lang_info, - detect_google_sorry, -) + logger: logging.Logger + +traits: EngineTraits # about about = { @@ -50,70 +66,77 @@ about = { "results": 'HTML', } -# compared to other google engines google-news has a different time range -# support. The time range is included in the search term. -time_range_dict = { - 'day': 'when:1d', - 'week': 'when:7d', - 'month': 'when:1m', - 'year': 'when:1y', -} - # engine dependent config - categories = ['news'] paging = False -use_locale_domain = True -time_range_support = True +time_range_support = False # Google-News results are always *SafeSearch*. Option 'safesearch' is set to # False here, otherwise checker will report safesearch-errors:: # # safesearch : results are identitical for safesearch=0 and safesearch=2 -safesearch = False -send_accept_language_header = True +safesearch = True +# send_accept_language_header = True def request(query, params): """Google-News search request""" - lang_info = get_lang_info(params, supported_languages, language_aliases, False) + sxng_locale = params.get('searxng_locale', 'en-US') + ceid = locales.get_engine_locale(sxng_locale, traits.custom['ceid'], default='US:en') + google_info = get_google_info(params, traits) + google_info['subdomain'] = 'news.google.com' # google news has only one domain - # google news has only one domain - lang_info['subdomain'] = 'news.google.com' + ceid_region, ceid_lang = ceid.split(':') + ceid_lang, ceid_suffix = ( + ceid_lang.split('-') + + [ + None, + ] + )[:2] - ceid = "%s:%s" % (lang_info['country'], lang_info['language']) + google_info['params']['hl'] = ceid_lang - # google news redirects en to en-US - if lang_info['params']['hl'] == 'en': - lang_info['params']['hl'] = 'en-US' + if ceid_suffix and ceid_suffix not in ['Hans', 'Hant']: - # Very special to google-news compared to other google engines, the time - # range is included in the search term. - if params['time_range']: - query += ' ' + time_range_dict[params['time_range']] + if ceid_region.lower() == ceid_lang: + google_info['params']['hl'] = ceid_lang + '-' + ceid_region + else: + google_info['params']['hl'] = ceid_lang + '-' + ceid_suffix + + elif ceid_region.lower() != ceid_lang: + + if ceid_region in ['AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT']: + google_info['params']['hl'] = ceid_lang + else: + google_info['params']['hl'] = ceid_lang + '-' + ceid_region + + google_info['params']['lr'] = 'lang_' + ceid_lang.split('-')[0] + google_info['params']['gl'] = ceid_region query_url = ( 'https://' - + lang_info['subdomain'] - + '/search' - + "?" - + urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'gl': lang_info['country']}) + + google_info['subdomain'] + + "/search?" + + urlencode( + { + 'q': query, + **google_info['params'], + } + ) + # ceid includes a ':' character which must not be urlencoded + ('&ceid=%s' % ceid) - ) # ceid includes a ':' character which must not be urlencoded - params['url'] = query_url - - params['cookies']['CONSENT'] = "YES+" - params['headers'].update(lang_info['headers']) - params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' + ) + params['url'] = query_url + params['cookies'] = google_info['cookies'] + params['headers'].update(google_info['headers']) return params def response(resp): """Get response from google's search request""" results = [] - detect_google_sorry(resp) # convert the text to dom @@ -153,8 +176,8 @@ def response(resp): # The pub_date is mostly a string like 'yesertday', not a real # timezone date or time. Therefore we can't use publishedDate. - pub_date = extract_text(eval_xpath(result, './article/div[1]/div[1]/time')) - pub_origin = extract_text(eval_xpath(result, './article/div[1]/div[1]/a')) + pub_date = extract_text(eval_xpath(result, './article//time')) + pub_origin = extract_text(eval_xpath(result, './article//a[@data-n-tid]')) content = ' / '.join([x for x in [pub_origin, pub_date] if x]) @@ -175,3 +198,127 @@ def response(resp): # return results return results + + +ceid_list = [ + 'AE:ar', + 'AR:es-419', + 'AT:de', + 'AU:en', + 'BD:bn', + 'BE:fr', + 'BE:nl', + 'BG:bg', + 'BR:pt-419', + 'BW:en', + 'CA:en', + 'CA:fr', + 'CH:de', + 'CH:fr', + 'CL:es-419', + 'CN:zh-Hans', + 'CO:es-419', + 'CU:es-419', + 'CZ:cs', + 'DE:de', + 'EG:ar', + 'ES:es', + 'ET:en', + 'FR:fr', + 'GB:en', + 'GH:en', + 'GR:el', + 'HK:zh-Hant', + 'HU:hu', + 'ID:en', + 'ID:id', + 'IE:en', + 'IL:en', + 'IL:he', + 'IN:bn', + 'IN:en', + 'IN:hi', + 'IN:ml', + 'IN:mr', + 'IN:ta', + 'IN:te', + 'IT:it', + 'JP:ja', + 'KE:en', + 'KR:ko', + 'LB:ar', + 'LT:lt', + 'LV:en', + 'LV:lv', + 'MA:fr', + 'MX:es-419', + 'MY:en', + 'NA:en', + 'NG:en', + 'NL:nl', + 'NO:no', + 'NZ:en', + 'PE:es-419', + 'PH:en', + 'PK:en', + 'PL:pl', + 'PT:pt-150', + 'RO:ro', + 'RS:sr', + 'RU:ru', + 'SA:ar', + 'SE:sv', + 'SG:en', + 'SI:sl', + 'SK:sk', + 'SN:fr', + 'TH:th', + 'TR:tr', + 'TW:zh-Hant', + 'TZ:en', + 'UA:ru', + 'UA:uk', + 'UG:en', + 'US:en', + 'US:es-419', + 'VE:es-419', + 'VN:vi', + 'ZA:en', + 'ZW:en', +] +"""List of region/language combinations supported by Google News. Values of the +``ceid`` argument of the Google News REST API.""" + + +_skip_values = [ + 'ET:en', # english (ethiopia) + 'ID:en', # english (indonesia) + 'LV:en', # english (latvia) +] + +_ceid_locale_map = {'NO:no': 'nb-NO'} + + +def fetch_traits(engine_traits: EngineTraits): + _fetch_traits(engine_traits, add_domains=False) + + engine_traits.custom['ceid'] = {} + + for ceid in ceid_list: + if ceid in _skip_values: + continue + + region, lang = ceid.split(':') + x = lang.split('-') + if len(x) > 1: + if x[1] not in ['Hant', 'Hans']: + lang = x[0] + + sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region) + try: + locale = babel.Locale.parse(sxng_locale, sep='-') + except babel.UnknownLocaleError: + print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale)) + continue + + engine_traits.custom['ceid'][locales.region_tag(locale)] = ceid -- cgit v1.2.3