From 6e5f22e5583cfc2a413e0afac66d3c5ea9f628b1 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Thu, 29 Sep 2022 20:54:46 +0200 Subject: [mod] replace engines_languages.json by engines_traits.json Implementations of the *traits* of the engines. Engine's traits are fetched from the origin engine and stored in a JSON file in the *data folder*. Most often traits are languages and region codes and their mapping from SearXNG's representation to the representation in the origin search engine. To load traits from the persistence:: searx.enginelib.traits.EngineTraitsMap.from_data() For new traits new properties can be added to the class:: searx.enginelib.traits.EngineTraits .. hint:: Implementation is downward compatible to the deprecated *supported_languages method* from the vintage implementation. The vintage code is tagged as *deprecated* and can be removed when all engines have been ported to the *traits method*. Signed-off-by: Markus Heiser --- searx/search/processors/__init__.py | 5 ++++- searx/search/processors/abstract.py | 12 +++++++++++- searx/search/processors/online.py | 3 +++ searx/search/processors/online_currency.py | 4 ++-- searx/search/processors/online_dictionary.py | 5 +++-- searx/search/processors/online_url_search.py | 5 +++-- 6 files changed, 26 insertions(+), 8 deletions(-) (limited to 'searx/search/processors') diff --git a/searx/search/processors/__init__.py b/searx/search/processors/__init__.py index a270b4ef5..1390de456 100644 --- a/searx/search/processors/__init__.py +++ b/searx/search/processors/__init__.py @@ -30,7 +30,10 @@ from .abstract import EngineProcessor logger = logger.getChild('search.processors') PROCESSORS: Dict[str, EngineProcessor] = {} -"""Cache request processores, stored by *engine-name* (:py:func:`initialize`)""" +"""Cache request processores, stored by *engine-name* (:py:func:`initialize`) + +:meta hide-value: +""" def get_processor_class(engine_type): diff --git a/searx/search/processors/abstract.py 
b/searx/search/processors/abstract.py index d74616db0..5f1882ca4 100644 --- a/searx/search/processors/abstract.py +++ b/searx/search/processors/abstract.py @@ -138,7 +138,8 @@ class EngineProcessor(ABC): return False def get_params(self, search_query, engine_category): - """Returns a set of *request params* or ``None`` if request is not supported. + """Returns a set of (see :ref:`request params `) or + ``None`` if request is not supported. Not supported conditions (``None`` is returned): @@ -159,11 +160,20 @@ class EngineProcessor(ABC): params['safesearch'] = search_query.safesearch params['time_range'] = search_query.time_range params['engine_data'] = search_query.engine_data.get(self.engine_name, {}) + params['searxng_locale'] = search_query.lang + + # deprecated / vintage --> use params['searxng_locale'] + # + # Conditions related to engine's traits are implemented in engine.traits + # module. Don't do 'locale' decissions here in the abstract layer of the + # search processor, just pass the value from user's choice unchanged to + # the engine request. if hasattr(self.engine, 'language') and self.engine.language: params['language'] = self.engine.language else: params['language'] = search_query.lang + return params @abstractmethod diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py index 242718416..86e9eed89 100644 --- a/searx/search/processors/online.py +++ b/searx/search/processors/online.py @@ -51,6 +51,9 @@ class OnlineProcessor(EngineProcessor): super().initialize() def get_params(self, search_query, engine_category): + """Returns a set of :ref:`request params ` or ``None`` + if request is not supported. 
+ """ params = super().get_params(search_query, engine_category) if params is None: return None diff --git a/searx/search/processors/online_currency.py b/searx/search/processors/online_currency.py index 92398239f..7cb4205c9 100644 --- a/searx/search/processors/online_currency.py +++ b/searx/search/processors/online_currency.py @@ -38,8 +38,8 @@ class OnlineCurrencyProcessor(OnlineProcessor): engine_type = 'online_currency' def get_params(self, search_query, engine_category): - """Returns a set of *request params* or ``None`` if search query does not match - to :py:obj:`parser_re`.""" + """Returns a set of :ref:`request params ` + or ``None`` if search query does not match to :py:obj:`parser_re`.""" params = super().get_params(search_query, engine_category) if params is None: diff --git a/searx/search/processors/online_dictionary.py b/searx/search/processors/online_dictionary.py index fbfc9df8e..6145a47d1 100644 --- a/searx/search/processors/online_dictionary.py +++ b/searx/search/processors/online_dictionary.py @@ -18,8 +18,9 @@ class OnlineDictionaryProcessor(OnlineProcessor): engine_type = 'online_dictionary' def get_params(self, search_query, engine_category): - """Returns a set of *request params* or ``None`` if search query does not match - to :py:obj:`parser_re`.""" + """Returns a set of :ref:`request params ` or + ``None`` if search query does not match to :py:obj:`parser_re`. 
+ """ params = super().get_params(search_query, engine_category) if params is None: return None diff --git a/searx/search/processors/online_url_search.py b/searx/search/processors/online_url_search.py index 6383fa37f..a1dd6a018 100644 --- a/searx/search/processors/online_url_search.py +++ b/searx/search/processors/online_url_search.py @@ -20,9 +20,10 @@ class OnlineUrlSearchProcessor(OnlineProcessor): engine_type = 'online_url_search' def get_params(self, search_query, engine_category): - """Returns a set of *request params* or ``None`` if search query does not match - to at least one of :py:obj:`re_search_urls`. + """Returns a set of :ref:`request params ` or ``None`` if + search query does not match to :py:obj:`re_search_urls`. """ + params = super().get_params(search_query, engine_category) if params is None: return None -- cgit v1.2.3 From 249989955497cd048fa3312d115971282983b269 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 4 Dec 2022 22:57:22 +0100 Subject: [mod] Google: reversed engineered & upgrade to data_type: traits_v1 Partial reverse engineering of the Google engines including a improved language and region handling based on the engine.traits_v1 data. When ever possible the implementations of the Google engines try to make use of the async REST APIs. The get_lang_info() has been generalized to a get_google_info() function / especially the region handling has been improved by adding the cr parameter. searx/data/engine_traits.json Add data type "traits_v1" generated by the fetch_traits() functions from: - Google (WEB), - Google images, - Google news, - Google scholar and - Google videos and remove data from obsolete data type "supported_languages". A traits.custom type that maps region codes to *supported_domains* is fetched from https://www.google.com/supported_domains searx/autocomplete.py: Reversed engineered autocomplete from Google WEB. Supports Google's languages and subdomains. 
The old API suggestqueries.google.com/complete has been replaced by the async REST API: https://{subdomain}/complete/search?{args} searx/engines/google.py Reverse engineering and extensive testing .. - fetch_traits(): Fetch languages & regions from Google properties. - always use the async REST API (formerly known as 'use_mobile_ui') - use *supported_domains* from traits - improved the result list by fetching './/div[@data-content-feature]' and parsing the type of the various *content features* --> thumbnails are added searx/engines/google_images.py Reverse engineering and extensive testing .. - fetch_traits(): Fetch languages & regions from Google properties. - use *supported_domains* from traits - if exists, freshness_date is added to the result - issue 1864: result list has been improved a lot (due to the new cr parameter) searx/engines/google_news.py Reverse engineering and extensive testing .. - fetch_traits(): Fetch languages & regions from Google properties. *supported_domains* is not needed but a ceid list has been added. - different region handling compared to Google WEB - fixed for various languages & regions (due to the new ceid parameter) / avoid CONSENT page - Google News no longer supports time range - result list has been fixed: XPath of pub_date and pub_origin searx/engines/google_videos.py - fetch_traits(): Fetch languages & regions from Google properties. - use *supported_domains* from traits - add paging support - implement an async request ('asearch': 'arc' & 'async': 'use_ac:true,_fmt:html') - simplified code (thanks to '_fmt:html' request) - issue 1359: fixed xpath of video length data searx/engines/google_scholar.py - fetch_traits(): Fetch languages & regions from Google properties. 
- use *supported_domains* from traits - request(): include patents & citations - response(): fixed CAPTCHA detection (Scholar has its own CAPTCHA manager) - hardening XPath to iterate over results - fixed XPath of pub_type (has been changed from gs_ct1 to gs_cgt2 class) - issue 1769 fixed: new request implementation is no longer incompatible Signed-off-by: Markus Heiser --- searx/search/processors/online.py | 5 ----- 1 file changed, 5 deletions(-) (limited to 'searx/search/processors') diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py index 86e9eed89..48e3a2e92 100644 --- a/searx/search/processors/online.py +++ b/searx/search/processors/online.py @@ -187,11 +187,6 @@ class OnlineProcessor(EngineProcessor): self.handle_exception(result_container, e, suspend=True) self.logger.exception('CAPTCHA') except SearxEngineTooManyRequestsException as e: - if "google" in self.engine_name: - self.logger.warn( - "Set to 'true' the use_mobile_ui parameter in the 'engines:'" - " section of your settings.yml file if google is blocked for you." - ) self.handle_exception(result_container, e, suspend=True) self.logger.exception('Too many requests') except SearxEngineAccessDeniedException as e: -- cgit v1.2.3 From 4d4aa13e1f1d254e5d57c67973a7809d9c1e21f9 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Fri, 30 Dec 2022 18:28:02 +0100 Subject: [mod] remove obsolete EngineTraits.supported_languages All engines have been migrated from ``supported_languages`` to the ``fetch_traits`` concept. There is no longer a need for the obsolete code that implements the ``supported_languages`` concept. 
Signed-off-by: Markus Heiser --- searx/search/processors/online.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/search/processors') diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py index 48e3a2e92..697533d8c 100644 --- a/searx/search/processors/online.py +++ b/searx/search/processors/online.py @@ -221,7 +221,7 @@ class OnlineProcessor(EngineProcessor): 'test': ['unique_results'], } - if getattr(self.engine, 'supported_languages', []): + if getattr(self.engine, 'traits', False): tests['lang_fr'] = { 'matrix': {'query': 'paris', 'lang': 'fr'}, 'result_container': ['not_empty', ('has_language', 'fr')], -- cgit v1.2.3