From c9cd376186d12d2d281e655d0b5539d1359fe148 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Mon, 10 Oct 2022 19:31:22 +0200 Subject: [mod] replace searx.languages by searx.sxng_locales With the language and region tags from the EngineTraitsMap the handling of SearXNG's tags of languages and regions has been normalized and is no longer a *mystery*. The "languages" became "locales" that are supported by babel and by this, the update_engine_traits.py can be simplified a lot. Other code places can be simplified as well, but these simplifications should (respectively can) only be done when none of the engines work with the deprecated EngineTraits.supported_languages interface anymore. This commit replaces searx.languages by searx.sxng_locales and fix the naming of some names from "language" to "locale" (e.g. language_codes --> sxng_locales). Signed-off-by: Markus Heiser --- searx/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'searx/utils.py') diff --git a/searx/utils.py b/searx/utils.py index e6180906b..f7a71b649 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -24,7 +24,7 @@ from babel.core import get_global from searx import settings from searx.data import USER_AGENTS, data_dir from searx.version import VERSION_TAG -from searx.languages import language_codes +from searx.sxng_locales import sxng_locales from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException from searx import logger @@ -53,8 +53,8 @@ _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {} _FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None """fasttext model to predict laguage of a search term""" -SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in language_codes]) -"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`).""" +SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales]) +"""Languages supported by 
most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`).""" class _NotSetClass: # pylint: disable=too-few-public-methods @@ -355,11 +355,11 @@ def is_valid_lang(lang) -> Optional[Tuple[bool, str, str]]: is_abbr = len(lang) == 2 lang = lang.lower() if is_abbr: - for l in language_codes: + for l in sxng_locales: if l[0][:2] == lang: return (True, l[0][:2], l[3].lower()) return None - for l in language_codes: + for l in sxng_locales: if l[1].lower() == lang or l[3].lower() == lang: return (True, l[0][:2], l[3].lower()) return None -- cgit v1.2.3 From 16f0db44939c23d2980d6fd2e5dfada13d8f5ee9 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Tue, 7 Feb 2023 14:11:58 +0100 Subject: [mod] replace utils.match_language by locales.match_locale This patch replaces the *full of magic* ``utils.match_language`` function by a ``locales.match_locale``. The ``locales.match_locale`` function is based on the ``locales.build_engine_locales`` introduced in 9ae409a0 [1]. In the past SearXNG did only support a search by a language but not in a region. This has been changed a long time ago and regions have been added to SearXNG core but not to the engines. The ``utils.match_language`` was the function to handle the different aspects of language/regions in SearXNG core and the supported *languages* in the engine. The ``utils.match_language`` did it with some magic and worked well for most use cases but failed in some edge cases. To replace the concurrence of languages and regions in the SearXNG core the ``locales.build_engine_locales`` was introduced in 9ae409a0 [1]. With the last patches all engines have been migrated to a ``fetch_traits`` and a language/region concept that is based on ``locales.build_engine_locales``. To summarize: there is no longer a need for the ``utils.match_language``. 
[1] https://github.com/searxng/searxng/pull/1652 Signed-off-by: Markus Heiser --- searx/utils.py | 88 ---------------------------------------------------------- 1 file changed, 88 deletions(-) (limited to 'searx/utils.py') diff --git a/searx/utils.py b/searx/utils.py index f7a71b649..161983011 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -18,8 +18,6 @@ from urllib.parse import urljoin, urlparse from lxml import html from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult -from babel.core import get_global - from searx import settings from searx.data import USER_AGENTS, data_dir @@ -365,92 +363,6 @@ def is_valid_lang(lang) -> Optional[Tuple[bool, str, str]]: return None -def _get_lang_to_lc_dict(lang_list: List[str]) -> Dict[str, str]: - key = str(lang_list) - value = _LANG_TO_LC_CACHE.get(key, None) - if value is None: - value = {} - for lang in lang_list: - value.setdefault(lang.split('-')[0], lang) - _LANG_TO_LC_CACHE[key] = value - return value - - -# babel's get_global contains all sorts of miscellaneous locale and territory related data -# see get_global in: https://github.com/python-babel/babel/blob/master/babel/core.py -def _get_from_babel(lang_code: str, key): - match = get_global(key).get(lang_code.replace('-', '_')) - # for some keys, such as territory_aliases, match may be a list - if isinstance(match, str): - return match.replace('_', '-') - return match - - -def _match_language(lang_code: str, lang_list=[], custom_aliases={}) -> Optional[str]: # pylint: disable=W0102 - """auxiliary function to match lang_code in lang_list""" - # replace language code with a custom alias if necessary - if lang_code in custom_aliases: - lang_code = custom_aliases[lang_code] - - if lang_code in lang_list: - return lang_code - - # try to get the most likely country for this language - subtags = _get_from_babel(lang_code, 'likely_subtags') - if subtags: - if subtags in lang_list: - return subtags - 
subtag_parts = subtags.split('-') - new_code = subtag_parts[0] + '-' + subtag_parts[-1] - if new_code in custom_aliases: - new_code = custom_aliases[new_code] - if new_code in lang_list: - return new_code - - # try to get the any supported country for this language - return _get_lang_to_lc_dict(lang_list).get(lang_code) - - -def match_language( # pylint: disable=W0102 - locale_code, lang_list=[], custom_aliases={}, fallback: Optional[str] = 'en-US' -) -> Optional[str]: - """get the language code from lang_list that best matches locale_code""" - # try to get language from given locale_code - language = _match_language(locale_code, lang_list, custom_aliases) - if language: - return language - - locale_parts = locale_code.split('-') - lang_code = locale_parts[0] - - # if locale_code has script, try matching without it - if len(locale_parts) > 2: - language = _match_language(lang_code + '-' + locale_parts[-1], lang_list, custom_aliases) - if language: - return language - - # try to get language using an equivalent country code - if len(locale_parts) > 1: - country_alias = _get_from_babel(locale_parts[-1], 'territory_aliases') - if country_alias: - language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases) - if language: - return language - - # try to get language using an equivalent language code - alias = _get_from_babel(lang_code, 'language_aliases') - if alias: - language = _match_language(alias, lang_list, custom_aliases) - if language: - return language - - if lang_code != locale_code: - # try to get language from given language without giving the country - language = _match_language(lang_code, lang_list, custom_aliases) - - return language or fallback - - def load_module(filename: str, module_dir: str) -> types.ModuleType: modname = splitext(filename)[0] modpath = join(module_dir, filename) -- cgit v1.2.3