From 6e5f22e5583cfc2a413e0afac66d3c5ea9f628b1 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Thu, 29 Sep 2022 20:54:46 +0200 Subject: [mod] replace engines_languages.json by engines_traits.json Implementations of the *traits* of the engines. Engine's traits are fetched from the origin engine and stored in a JSON file in the *data folder*. Most often traits are languages and region codes and their mapping from SearXNG's representation to the representation in the origin search engine. To load traits from the persistence:: searx.enginelib.traits.EngineTraitsMap.from_data() For new traits new properties can be added to the class:: searx.enginelib.traits.EngineTraits .. hint:: Implementation is downward compatible to the deprecated *supported_languages method* from the vintage implementation. The vintage code is tagged as *deprecated* and can be removed when all engines have been ported to the *traits method*. Signed-off-by: Markus Heiser --- searxng_extra/update/update_engine_traits.py | 336 +++++++++++++++++++++++++++ searxng_extra/update/update_languages.py | 313 ------------------------- 2 files changed, 336 insertions(+), 313 deletions(-) create mode 100755 searxng_extra/update/update_engine_traits.py delete mode 100755 searxng_extra/update/update_languages.py (limited to 'searxng_extra') diff --git a/searxng_extra/update/update_engine_traits.py b/searxng_extra/update/update_engine_traits.py new file mode 100755 index 000000000..9630d6d24 --- /dev/null +++ b/searxng_extra/update/update_engine_traits.py @@ -0,0 +1,336 @@ +#!/usr/bin/env python +# lint: pylint +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Update :py:obj:`searx.enginelib.traits.EngineTraitsMap` and :origin:`searx/languages.py` + +:py:obj:`searx.enginelib.traits.EngineTraitsMap.ENGINE_TRAITS_FILE`: + Persistence of engines traits, fetched from the engines. + +:origin:`searx/languages.py` + Is generated from intersecting each engine's supported traits. 
+ +The script :origin:`searxng_extra/update/update_engine_traits.py` is called in +the :origin:`CI Update data ... <.github/workflows/data-update.yml>` + +""" + +# pylint: disable=invalid-name +from unicodedata import lookup +from pathlib import Path +from pprint import pformat +from babel import Locale, UnknownLocaleError +from babel.languages import get_global +from babel.core import parse_locale + +from searx import settings, searx_dir +from searx import network +from searx.engines import load_engines, engines +from searx.enginelib.traits import EngineTraitsMap + +# Output files. +languages_file = Path(searx_dir) / 'languages.py' + + +def fetch_traits_map(): + """Fetchs supported languages for each engine and writes json file with those.""" + network.set_timeout_for_thread(10.0) + + def log(msg): + print(msg) + + traits_map = EngineTraitsMap.fetch_traits(log=log) + print("fetched properties from %s engines" % len(traits_map)) + print("write json file: %s" % traits_map.ENGINE_TRAITS_FILE) + traits_map.save_data() + return traits_map + + +# Get babel Locale object from lang_code if possible. 
+def get_locale(lang_code): + try: + locale = Locale.parse(lang_code, sep='-') + return locale + except (UnknownLocaleError, ValueError): + return None + + +lang2emoji = { + 'ha': '\U0001F1F3\U0001F1EA', # Hausa / Niger + 'bs': '\U0001F1E7\U0001F1E6', # Bosnian / Bosnia & Herzegovina + 'jp': '\U0001F1EF\U0001F1F5', # Japanese + 'ua': '\U0001F1FA\U0001F1E6', # Ukrainian + 'he': '\U0001F1EE\U0001F1F7', # Hebrew +} + + +def get_unicode_flag(lang_code): + """Determine a unicode flag (emoji) that fits to the ``lang_code``""" + + emoji = lang2emoji.get(lang_code.lower()) + if emoji: + return emoji + + if len(lang_code) == 2: + return '\U0001F310' + + language = territory = script = variant = '' + try: + language, territory, script, variant = parse_locale(lang_code, '-') + except ValueError as exc: + print(exc) + + # https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2 + if not territory: + # https://www.unicode.org/emoji/charts/emoji-list.html#country-flag + emoji = lang2emoji.get(language) + if not emoji: + print( + "%s --> language: %s / territory: %s / script: %s / variant: %s" + % (lang_code, language, territory, script, variant) + ) + return emoji + + emoji = lang2emoji.get(territory.lower()) + if emoji: + return emoji + + try: + c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + territory[0]) + c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + territory[1]) + # print("%s --> territory: %s --> %s%s" %(lang_code, territory, c1, c2 )) + except KeyError as exc: + print("%s --> territory: %s --> %s" % (lang_code, territory, exc)) + return None + + return c1 + c2 + + +def get_territory_name(lang_code): + country_name = None + locale = get_locale(lang_code) + try: + if locale is not None: + country_name = locale.get_territory_name() + except FileNotFoundError as exc: + print("ERROR: %s --> %s" % (locale, exc)) + return country_name + + +def join_language_lists(traits_map: EngineTraitsMap): + """Join all languages of the engines into one list. 
The returned language list + contains language codes (``zh``) and region codes (``zh-TW``). The codes can + be parsed by babel:: + + babel.Locale.parse(language_list[n]) + """ + # pylint: disable=too-many-branches + language_list = {} + + for eng_name, eng_traits in traits_map.items(): + eng = engines[eng_name] + eng_codes = set() + + if eng_traits.data_type == 'traits_v1': + # items of type 'engine_traits' do have regions & languages, the + # list of eng_codes should contain both. + eng_codes.update(eng_traits.regions.keys()) + eng_codes.update(eng_traits.languages.keys()) + + elif eng_traits.data_type == 'supported_languages': + # vintage / deprecated + _codes = set() + if isinstance(eng_traits.supported_languages, dict): + _codes.update(eng_traits.supported_languages.keys()) + elif isinstance(eng_traits.supported_languages, list): + _codes.update(eng_traits.supported_languages) + else: + raise TypeError('engine.supported_languages type %s is unknown' % type(eng_traits.supported_languages)) + + for lang_code in _codes: + # apply custom fixes if necessary + if lang_code in getattr(eng, 'language_aliases', {}).values(): + lang_code = next(lc for lc, alias in eng.language_aliases.items() if lang_code == alias) + eng_codes.add(lang_code) + + for lang_code in eng_codes: + + locale = get_locale(lang_code) + + # ensure that lang_code uses standard language and country codes + if locale and locale.territory: + lang_code = "{lang}-{country}".format(lang=locale.language, country=locale.territory) + short_code = lang_code.split('-')[0] + + # add language without country if not in list + if short_code not in language_list: + if locale: + # get language's data from babel's Locale object + language_name = locale.get_language_name().title() + english_name = locale.english_name.split(' (')[0] + elif short_code in traits_map['wikipedia'].supported_languages: + # get language's data from wikipedia if not known by babel + language_name = 
traits_map['wikipedia'].supported_languages[short_code]['name'] + english_name = traits_map['wikipedia'].supported_languages[short_code]['english_name'] + else: + language_name = None + english_name = None + + # add language to list + language_list[short_code] = { + 'name': language_name, + 'english_name': english_name, + 'counter': set(), + 'countries': {}, + } + + # add language with country if not in list + if lang_code != short_code and lang_code not in language_list[short_code]['countries']: + country_name = '' + if locale: + # get country name from babel's Locale object + try: + country_name = locale.get_territory_name() + except FileNotFoundError as exc: + print("ERROR: %s --> %s" % (locale, exc)) + locale = None + + language_list[short_code]['countries'][lang_code] = { + 'country_name': country_name, + 'counter': set(), + } + + # count engine for both language_country combination and language alone + language_list[short_code]['counter'].add(eng_name) + if lang_code != short_code: + language_list[short_code]['countries'][lang_code]['counter'].add(eng_name) + + return language_list + + +# Filter language list so it only includes the most supported languages and countries +def filter_language_list(joined_languages_map): + min_engines_per_lang = 12 + min_engines_per_country = 7 + # pylint: disable=consider-using-dict-items, consider-iterating-dictionary + main_engines = [ + engine_name + for engine_name in engines.keys() + if 'general' in engines[engine_name].categories + and hasattr(engines[engine_name], 'supported_languages') + and engines[engine_name].supported_languages + and not engines[engine_name].disabled + ] + + # filter list to include only languages supported by most engines or all default general engines + filtered_languages = { + code: lang + for code, lang in joined_languages_map.items() + if ( + len(lang['counter']) >= min_engines_per_lang + or all(main_engine in lang['counter'] for main_engine in main_engines) + ) + } + + def 
_copy_lang_data(lang, country_name=None): + new_dict = {} + new_dict['name'] = joined_languages_map[lang]['name'] + new_dict['english_name'] = joined_languages_map[lang]['english_name'] + if country_name: + new_dict['country_name'] = country_name + return new_dict + + # for each language get country codes supported by most engines or at least one country code + filtered_languages_with_countries = {} + for lang, lang_data in filtered_languages.items(): + countries = lang_data['countries'] + filtered_countries = {} + + # get language's country codes with enough supported engines + for lang_country, country_data in countries.items(): + if len(country_data['counter']) >= min_engines_per_country: + filtered_countries[lang_country] = _copy_lang_data(lang, country_data['country_name']) + + # add language without countries too if there's more than one country to choose from + if len(filtered_countries) > 1: + filtered_countries[lang] = _copy_lang_data(lang, None) + elif len(filtered_countries) == 1: + lang_country = next(iter(filtered_countries)) + + # if no country has enough engines try to get most likely country code from babel + if not filtered_countries: + lang_country = None + subtags = get_global('likely_subtags').get(lang) + if subtags: + country_code = subtags.split('_')[-1] + if len(country_code) == 2: + lang_country = "{lang}-{country}".format(lang=lang, country=country_code) + + if lang_country: + filtered_countries[lang_country] = _copy_lang_data(lang, None) + else: + filtered_countries[lang] = _copy_lang_data(lang, None) + + filtered_languages_with_countries.update(filtered_countries) + + return filtered_languages_with_countries + + +class UnicodeEscape(str): + """Escape unicode string in :py:obj:`pprint.pformat`""" + + def __repr__(self): + return "'" + "".join([chr(c) for c in self.encode('unicode-escape')]) + "'" + + +# Write languages.py. 
+def write_languages_file(languages): + file_headers = ( + "# -*- coding: utf-8 -*-", + "# list of language codes", + "# this file is generated automatically by utils/fetch_languages.py", + "language_codes = (\n", + ) + + language_codes = [] + + for code in sorted(languages): + + name = languages[code]['name'] + if name is None: + print("ERROR: languages['%s'] --> %s" % (code, languages[code])) + continue + + flag = get_unicode_flag(code) or '' + item = ( + code, + languages[code]['name'].split(' (')[0], + get_territory_name(code) or '', + languages[code].get('english_name') or '', + UnicodeEscape(flag), + ) + + language_codes.append(item) + + language_codes = tuple(language_codes) + + with open(languages_file, 'w', encoding='utf-8') as new_file: + file_content = "{file_headers} {language_codes},\n)\n".format( + # fmt: off + file_headers = '\n'.join(file_headers), + language_codes = pformat(language_codes, indent=4)[1:-1] + # fmt: on + ) + new_file.write(file_content) + new_file.close() + + +def main(): + load_engines(settings['engines']) + traits_map = fetch_traits_map() + joined_languages_map = join_language_lists(traits_map) + filtered_languages = filter_language_list(joined_languages_map) + write_languages_file(filtered_languages) + + +if __name__ == "__main__": + main() diff --git a/searxng_extra/update/update_languages.py b/searxng_extra/update/update_languages.py deleted file mode 100755 index 87b13b276..000000000 --- a/searxng_extra/update/update_languages.py +++ /dev/null @@ -1,313 +0,0 @@ -#!/usr/bin/env python -# lint: pylint - -# SPDX-License-Identifier: AGPL-3.0-or-later -"""This script generates languages.py from intersecting each engine's supported -languages. - -Output files: :origin:`searx/data/engines_languages.json` and -:origin:`searx/languages.py` (:origin:`CI Update data ... -<.github/workflows/data-update.yml>`). 
- -""" - -# pylint: disable=invalid-name -from unicodedata import lookup -import json -from pathlib import Path -from pprint import pformat -from babel import Locale, UnknownLocaleError -from babel.languages import get_global -from babel.core import parse_locale - -from searx import settings, searx_dir -from searx.engines import load_engines, engines -from searx.network import set_timeout_for_thread - -# Output files. -engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json' -languages_file = Path(searx_dir) / 'languages.py' - - -# Fetches supported languages for each engine and writes json file with those. -def fetch_supported_languages(): - set_timeout_for_thread(10.0) - - engines_languages = {} - names = list(engines) - names.sort() - - for engine_name in names: - if hasattr(engines[engine_name], 'fetch_supported_languages'): - engines_languages[engine_name] = engines[engine_name].fetch_supported_languages() - print("fetched %s languages from engine %s" % (len(engines_languages[engine_name]), engine_name)) - if type(engines_languages[engine_name]) == list: # pylint: disable=unidiomatic-typecheck - engines_languages[engine_name] = sorted(engines_languages[engine_name]) - - print("fetched languages from %s engines" % len(engines_languages)) - - # write json file - with open(engines_languages_file, 'w', encoding='utf-8') as f: - json.dump(engines_languages, f, indent=2, sort_keys=True) - - return engines_languages - - -# Get babel Locale object from lang_code if possible. 
-def get_locale(lang_code): - try: - locale = Locale.parse(lang_code, sep='-') - return locale - except (UnknownLocaleError, ValueError): - return None - - -lang2emoji = { - 'ha': '\U0001F1F3\U0001F1EA', # Hausa / Niger - 'bs': '\U0001F1E7\U0001F1E6', # Bosnian / Bosnia & Herzegovina - 'jp': '\U0001F1EF\U0001F1F5', # Japanese - 'ua': '\U0001F1FA\U0001F1E6', # Ukrainian - 'he': '\U0001F1EE\U0001F1F7', # Hebrew -} - - -def get_unicode_flag(lang_code): - """Determine a unicode flag (emoji) that fits to the ``lang_code``""" - - emoji = lang2emoji.get(lang_code.lower()) - if emoji: - return emoji - - if len(lang_code) == 2: - return '\U0001F310' - - language = territory = script = variant = '' - try: - language, territory, script, variant = parse_locale(lang_code, '-') - except ValueError as exc: - print(exc) - - # https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2 - if not territory: - # https://www.unicode.org/emoji/charts/emoji-list.html#country-flag - emoji = lang2emoji.get(language) - if not emoji: - print( - "%s --> language: %s / territory: %s / script: %s / variant: %s" - % (lang_code, language, territory, script, variant) - ) - return emoji - - emoji = lang2emoji.get(territory.lower()) - if emoji: - return emoji - - try: - c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + territory[0]) - c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + territory[1]) - # print("%s --> territory: %s --> %s%s" %(lang_code, territory, c1, c2 )) - except KeyError as exc: - print("%s --> territory: %s --> %s" % (lang_code, territory, exc)) - return None - - return c1 + c2 - - -def get_territory_name(lang_code): - country_name = None - locale = get_locale(lang_code) - try: - if locale is not None: - country_name = locale.get_territory_name() - except FileNotFoundError as exc: - print("ERROR: %s --> %s" % (locale, exc)) - return country_name - - -# Join all language lists. 
-def join_language_lists(engines_languages): - language_list = {} - for engine_name in engines_languages: - for lang_code in engines_languages[engine_name]: - - # apply custom fixes if necessary - if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values(): - lang_code = next( - lc for lc, alias in engines[engine_name].language_aliases.items() if lang_code == alias - ) - - locale = get_locale(lang_code) - - # ensure that lang_code uses standard language and country codes - if locale and locale.territory: - lang_code = "{lang}-{country}".format(lang=locale.language, country=locale.territory) - short_code = lang_code.split('-')[0] - - # add language without country if not in list - if short_code not in language_list: - if locale: - # get language's data from babel's Locale object - language_name = locale.get_language_name().title() - english_name = locale.english_name.split(' (')[0] - elif short_code in engines_languages['wikipedia']: - # get language's data from wikipedia if not known by babel - language_name = engines_languages['wikipedia'][short_code]['name'] - english_name = engines_languages['wikipedia'][short_code]['english_name'] - else: - language_name = None - english_name = None - - # add language to list - language_list[short_code] = { - 'name': language_name, - 'english_name': english_name, - 'counter': set(), - 'countries': {}, - } - - # add language with country if not in list - if lang_code != short_code and lang_code not in language_list[short_code]['countries']: - country_name = '' - if locale: - # get country name from babel's Locale object - try: - country_name = locale.get_territory_name() - except FileNotFoundError as exc: - print("ERROR: %s --> %s" % (locale, exc)) - locale = None - - language_list[short_code]['countries'][lang_code] = { - 'country_name': country_name, - 'counter': set(), - } - - # count engine for both language_country combination and language alone - language_list[short_code]['counter'].add(engine_name) - if 
lang_code != short_code: - language_list[short_code]['countries'][lang_code]['counter'].add(engine_name) - - return language_list - - -# Filter language list so it only includes the most supported languages and countries -def filter_language_list(all_languages): - min_engines_per_lang = 12 - min_engines_per_country = 7 - # pylint: disable=consider-using-dict-items, consider-iterating-dictionary - main_engines = [ - engine_name - for engine_name in engines.keys() - if 'general' in engines[engine_name].categories - and engines[engine_name].supported_languages - and not engines[engine_name].disabled - ] - - # filter list to include only languages supported by most engines or all default general engines - filtered_languages = { - code: lang - for code, lang in all_languages.items() - if ( - len(lang['counter']) >= min_engines_per_lang - or all(main_engine in lang['counter'] for main_engine in main_engines) - ) - } - - def _copy_lang_data(lang, country_name=None): - new_dict = {} - new_dict['name'] = all_languages[lang]['name'] - new_dict['english_name'] = all_languages[lang]['english_name'] - if country_name: - new_dict['country_name'] = country_name - return new_dict - - # for each language get country codes supported by most engines or at least one country code - filtered_languages_with_countries = {} - for lang, lang_data in filtered_languages.items(): - countries = lang_data['countries'] - filtered_countries = {} - - # get language's country codes with enough supported engines - for lang_country, country_data in countries.items(): - if len(country_data['counter']) >= min_engines_per_country: - filtered_countries[lang_country] = _copy_lang_data(lang, country_data['country_name']) - - # add language without countries too if there's more than one country to choose from - if len(filtered_countries) > 1: - filtered_countries[lang] = _copy_lang_data(lang, None) - elif len(filtered_countries) == 1: - lang_country = next(iter(filtered_countries)) - - # if no country has 
enough engines try to get most likely country code from babel - if not filtered_countries: - lang_country = None - subtags = get_global('likely_subtags').get(lang) - if subtags: - country_code = subtags.split('_')[-1] - if len(country_code) == 2: - lang_country = "{lang}-{country}".format(lang=lang, country=country_code) - - if lang_country: - filtered_countries[lang_country] = _copy_lang_data(lang, None) - else: - filtered_countries[lang] = _copy_lang_data(lang, None) - - filtered_languages_with_countries.update(filtered_countries) - - return filtered_languages_with_countries - - -class UnicodeEscape(str): - """Escape unicode string in :py:obj:`pprint.pformat`""" - - def __repr__(self): - return "'" + "".join([chr(c) for c in self.encode('unicode-escape')]) + "'" - - -# Write languages.py. -def write_languages_file(languages): - file_headers = ( - "# -*- coding: utf-8 -*-", - "# list of language codes", - "# this file is generated automatically by utils/fetch_languages.py", - "language_codes = (\n", - ) - - language_codes = [] - - for code in sorted(languages): - - name = languages[code]['name'] - if name is None: - print("ERROR: languages['%s'] --> %s" % (code, languages[code])) - continue - - flag = get_unicode_flag(code) or '' - item = ( - code, - languages[code]['name'].split(' (')[0], - get_territory_name(code) or '', - languages[code].get('english_name') or '', - UnicodeEscape(flag), - ) - - language_codes.append(item) - - language_codes = tuple(language_codes) - - with open(languages_file, 'w', encoding='utf-8') as new_file: - file_content = "{file_headers} {language_codes},\n)\n".format( - # fmt: off - file_headers = '\n'.join(file_headers), - language_codes = pformat(language_codes, indent=4)[1:-1] - # fmt: on - ) - new_file.write(file_content) - new_file.close() - - -if __name__ == "__main__": - load_engines(settings['engines']) - _engines_languages = fetch_supported_languages() - _all_languages = join_language_lists(_engines_languages) - 
_filtered_languages = filter_language_list(_all_languages) - write_languages_file(_filtered_languages) -- cgit v1.2.3 From c9cd376186d12d2d281e655d0b5539d1359fe148 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Mon, 10 Oct 2022 19:31:22 +0200 Subject: [mod] replace searx.languages by searx.sxng_locales With the language and region tags from the EngineTraitsMap the handling of SearXNG's tags of languages and regions has been normalized and is no longer a *mystery*. The "languages" became "locales" that are supported by babel and by this, the update_engine_traits.py can be simplified a lot. Other code places can be simplified as well, but these simplifications should (respectively can) only be done when none of the engines work with the deprecated EngineTraits.supported_languages interface anymore. This commit replaces searx.languages by searx.sxng_locales and fixes the naming of some names from "language" to "locale" (e.g. language_codes --> sxng_locales). Signed-off-by: Markus Heiser --- searxng_extra/update/update_engine_traits.py | 382 +++++++++------------------ searxng_extra/update/update_osm_keys_tags.py | 4 +- 2 files changed, 124 insertions(+), 262 deletions(-) (limited to 'searxng_extra') diff --git a/searxng_extra/update/update_engine_traits.py b/searxng_extra/update/update_engine_traits.py index 9630d6d24..7449912dc 100755 --- a/searxng_extra/update/update_engine_traits.py +++ b/searxng_extra/update/update_engine_traits.py @@ -18,40 +18,52 @@ the :origin:`CI Update data ... <.github/workflows/data-update.yml>` from unicodedata import lookup from pathlib import Path from pprint import pformat -from babel import Locale, UnknownLocaleError -from babel.languages import get_global -from babel.core import parse_locale +import babel from searx import settings, searx_dir from searx import network -from searx.engines import load_engines, engines +from searx.engines import load_engines from searx.enginelib.traits import EngineTraitsMap # Output files. 
-languages_file = Path(searx_dir) / 'languages.py' +languages_file = Path(searx_dir) / 'sxng_locales.py' +languages_file_header = """\ +# -*- coding: utf-8 -*- +'''List of SearXNG's locale codes. +This file is generated automatically by:: -def fetch_traits_map(): - """Fetchs supported languages for each engine and writes json file with those.""" - network.set_timeout_for_thread(10.0) - - def log(msg): - print(msg) + ./manage pyenv.cmd searxng_extra/update/update_engine_traits.py +''' - traits_map = EngineTraitsMap.fetch_traits(log=log) - print("fetched properties from %s engines" % len(traits_map)) - print("write json file: %s" % traits_map.ENGINE_TRAITS_FILE) - traits_map.save_data() - return traits_map - - -# Get babel Locale object from lang_code if possible. -def get_locale(lang_code): - try: - locale = Locale.parse(lang_code, sep='-') - return locale - except (UnknownLocaleError, ValueError): - return None +sxng_locales = ( +""" +languages_file_footer = """, +) +''' +A list of five-digit tuples: + +0. SearXNG's internal locale tag (a language or region tag) +1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`) +2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`). + Empty string for language tags. +3. English language name (from :py:obj:`babel.core.Locale.english_name`) +4. Unicode flag (emoji) that fits to SearXNG's internal region tag. Languages + are represented by a globe (\U0001F310) + +.. code:: python + + ('en', 'English', '', 'English', '\U0001f310'), + ('en-CA', 'English', 'Canada', 'English', '\U0001f1e8\U0001f1e6'), + ('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'), + .. 
+ ('fr', 'Français', '', 'French', '\U0001f310'), + ('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'), + ('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'), + +:meta hide-value: +''' +""" lang2emoji = { @@ -63,249 +75,76 @@ lang2emoji = { } -def get_unicode_flag(lang_code): - """Determine a unicode flag (emoji) that fits to the ``lang_code``""" +def main(): + load_engines(settings['engines']) + # traits_map = EngineTraitsMap.from_data() + traits_map = fetch_traits_map() + sxng_tag_list = filter_locales(traits_map) + write_languages_file(sxng_tag_list) - emoji = lang2emoji.get(lang_code.lower()) - if emoji: - return emoji - if len(lang_code) == 2: - return '\U0001F310' +def fetch_traits_map(): + """Fetchs supported languages for each engine and writes json file with those.""" + network.set_timeout_for_thread(10.0) - language = territory = script = variant = '' - try: - language, territory, script, variant = parse_locale(lang_code, '-') - except ValueError as exc: - print(exc) - - # https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2 - if not territory: - # https://www.unicode.org/emoji/charts/emoji-list.html#country-flag - emoji = lang2emoji.get(language) - if not emoji: - print( - "%s --> language: %s / territory: %s / script: %s / variant: %s" - % (lang_code, language, territory, script, variant) - ) - return emoji + def log(msg): + print(msg) - emoji = lang2emoji.get(territory.lower()) - if emoji: - return emoji + traits_map = EngineTraitsMap.fetch_traits(log=log) + print("fetched properties from %s engines" % len(traits_map)) + print("write json file: %s" % traits_map.ENGINE_TRAITS_FILE) + traits_map.save_data() + return traits_map - try: - c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + territory[0]) - c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + territory[1]) - # print("%s --> territory: %s --> %s%s" %(lang_code, territory, c1, c2 )) - except KeyError as exc: - print("%s --> territory: %s --> %s" % (lang_code, territory, 
exc)) - return None - return c1 + c2 +def filter_locales(traits_map: EngineTraitsMap): + """Filter language & region tags by a threshold.""" + min_eng_per_region = 11 + min_eng_per_lang = 13 -def get_territory_name(lang_code): - country_name = None - locale = get_locale(lang_code) - try: - if locale is not None: - country_name = locale.get_territory_name() - except FileNotFoundError as exc: - print("ERROR: %s --> %s" % (locale, exc)) - return country_name - - -def join_language_lists(traits_map: EngineTraitsMap): - """Join all languages of the engines into one list. The returned language list - contains language codes (``zh``) and region codes (``zh-TW``). The codes can - be parsed by babel:: - - babel.Locale.parse(language_list[n]) - """ - # pylint: disable=too-many-branches - language_list = {} - - for eng_name, eng_traits in traits_map.items(): - eng = engines[eng_name] - eng_codes = set() - - if eng_traits.data_type == 'traits_v1': - # items of type 'engine_traits' do have regions & languages, the - # list of eng_codes should contain both. 
- eng_codes.update(eng_traits.regions.keys()) - eng_codes.update(eng_traits.languages.keys()) - - elif eng_traits.data_type == 'supported_languages': - # vintage / deprecated - _codes = set() - if isinstance(eng_traits.supported_languages, dict): - _codes.update(eng_traits.supported_languages.keys()) - elif isinstance(eng_traits.supported_languages, list): - _codes.update(eng_traits.supported_languages) - else: - raise TypeError('engine.supported_languages type %s is unknown' % type(eng_traits.supported_languages)) - - for lang_code in _codes: - # apply custom fixes if necessary - if lang_code in getattr(eng, 'language_aliases', {}).values(): - lang_code = next(lc for lc, alias in eng.language_aliases.items() if lang_code == alias) - eng_codes.add(lang_code) - - for lang_code in eng_codes: - - locale = get_locale(lang_code) - - # ensure that lang_code uses standard language and country codes - if locale and locale.territory: - lang_code = "{lang}-{country}".format(lang=locale.language, country=locale.territory) - short_code = lang_code.split('-')[0] - - # add language without country if not in list - if short_code not in language_list: - if locale: - # get language's data from babel's Locale object - language_name = locale.get_language_name().title() - english_name = locale.english_name.split(' (')[0] - elif short_code in traits_map['wikipedia'].supported_languages: - # get language's data from wikipedia if not known by babel - language_name = traits_map['wikipedia'].supported_languages[short_code]['name'] - english_name = traits_map['wikipedia'].supported_languages[short_code]['english_name'] - else: - language_name = None - english_name = None - - # add language to list - language_list[short_code] = { - 'name': language_name, - 'english_name': english_name, - 'counter': set(), - 'countries': {}, - } - - # add language with country if not in list - if lang_code != short_code and lang_code not in language_list[short_code]['countries']: - country_name = '' - if 
locale: - # get country name from babel's Locale object - try: - country_name = locale.get_territory_name() - except FileNotFoundError as exc: - print("ERROR: %s --> %s" % (locale, exc)) - locale = None - - language_list[short_code]['countries'][lang_code] = { - 'country_name': country_name, - 'counter': set(), - } - - # count engine for both language_country combination and language alone - language_list[short_code]['counter'].add(eng_name) - if lang_code != short_code: - language_list[short_code]['countries'][lang_code]['counter'].add(eng_name) - - return language_list - - -# Filter language list so it only includes the most supported languages and countries -def filter_language_list(joined_languages_map): - min_engines_per_lang = 12 - min_engines_per_country = 7 - # pylint: disable=consider-using-dict-items, consider-iterating-dictionary - main_engines = [ - engine_name - for engine_name in engines.keys() - if 'general' in engines[engine_name].categories - and hasattr(engines[engine_name], 'supported_languages') - and engines[engine_name].supported_languages - and not engines[engine_name].disabled - ] - - # filter list to include only languages supported by most engines or all default general engines - filtered_languages = { - code: lang - for code, lang in joined_languages_map.items() - if ( - len(lang['counter']) >= min_engines_per_lang - or all(main_engine in lang['counter'] for main_engine in main_engines) - ) - } - - def _copy_lang_data(lang, country_name=None): - new_dict = {} - new_dict['name'] = joined_languages_map[lang]['name'] - new_dict['english_name'] = joined_languages_map[lang]['english_name'] - if country_name: - new_dict['country_name'] = country_name - return new_dict - - # for each language get country codes supported by most engines or at least one country code - filtered_languages_with_countries = {} - for lang, lang_data in filtered_languages.items(): - countries = lang_data['countries'] - filtered_countries = {} - - # get language's 
country codes with enough supported engines - for lang_country, country_data in countries.items(): - if len(country_data['counter']) >= min_engines_per_country: - filtered_countries[lang_country] = _copy_lang_data(lang, country_data['country_name']) - - # add language without countries too if there's more than one country to choose from - if len(filtered_countries) > 1: - filtered_countries[lang] = _copy_lang_data(lang, None) - elif len(filtered_countries) == 1: - lang_country = next(iter(filtered_countries)) - - # if no country has enough engines try to get most likely country code from babel - if not filtered_countries: - lang_country = None - subtags = get_global('likely_subtags').get(lang) - if subtags: - country_code = subtags.split('_')[-1] - if len(country_code) == 2: - lang_country = "{lang}-{country}".format(lang=lang, country=country_code) - - if lang_country: - filtered_countries[lang_country] = _copy_lang_data(lang, None) - else: - filtered_countries[lang] = _copy_lang_data(lang, None) - - filtered_languages_with_countries.update(filtered_countries) - - return filtered_languages_with_countries + _ = {} + for eng in traits_map.values(): + for reg in eng.regions.keys(): + _[reg] = _.get(reg, 0) + 1 + regions = set(k for k, v in _.items() if v >= min_eng_per_region) + lang_from_region = set(k.split('-')[0] for k in regions) -class UnicodeEscape(str): - """Escape unicode string in :py:obj:`pprint.pformat`""" + _ = {} + for eng in traits_map.values(): + for lang in eng.languages.keys(): + # ignore script types like zh_Hant, zh_Hans or sr_Latin, pa_Arab (they + # already counted by existence of 'zh' or 'sr', 'pa') + if '_' in lang: + # print("ignore %s" % lang) + continue + _[lang] = _.get(lang, 0) + 1 - def __repr__(self): - return "'" + "".join([chr(c) for c in self.encode('unicode-escape')]) + "'" + languages = set(k for k, v in _.items() if v >= min_eng_per_lang) + sxng_tag_list = set() + sxng_tag_list.update(regions) + 
sxng_tag_list.update(lang_from_region) + sxng_tag_list.update(languages) -# Write languages.py. -def write_languages_file(languages): - file_headers = ( - "# -*- coding: utf-8 -*-", - "# list of language codes", - "# this file is generated automatically by utils/fetch_languages.py", - "language_codes = (\n", - ) + return sxng_tag_list + + +def write_languages_file(sxng_tag_list): language_codes = [] - for code in sorted(languages): + for sxng_tag in sorted(sxng_tag_list): + sxng_locale: babel.Locale = babel.Locale.parse(sxng_tag, sep='-') - name = languages[code]['name'] - if name is None: - print("ERROR: languages['%s'] --> %s" % (code, languages[code])) - continue + flag = get_unicode_flag(sxng_locale) or '' - flag = get_unicode_flag(code) or '' item = ( - code, - languages[code]['name'].split(' (')[0], - get_territory_name(code) or '', - languages[code].get('english_name') or '', + sxng_tag, + sxng_locale.get_language_name().title(), + sxng_locale.get_territory_name() or '', + sxng_locale.english_name.split(' (')[0], UnicodeEscape(flag), ) @@ -314,22 +153,45 @@ def write_languages_file(languages): language_codes = tuple(language_codes) with open(languages_file, 'w', encoding='utf-8') as new_file: - file_content = "{file_headers} {language_codes},\n)\n".format( - # fmt: off - file_headers = '\n'.join(file_headers), - language_codes = pformat(language_codes, indent=4)[1:-1] - # fmt: on + file_content = "{header} {language_codes}{footer}".format( + header=languages_file_header, + language_codes=pformat(language_codes, width=120, indent=4)[1:-1], + footer=languages_file_footer, ) new_file.write(file_content) new_file.close() -def main(): - load_engines(settings['engines']) - traits_map = fetch_traits_map() - joined_languages_map = join_language_lists(traits_map) - filtered_languages = filter_language_list(joined_languages_map) - write_languages_file(filtered_languages) +class UnicodeEscape(str): + """Escape unicode string in :py:obj:`pprint.pformat`""" + + def 
__repr__(self): + return "'" + "".join([chr(c) for c in self.encode('unicode-escape')]) + "'" + + +def get_unicode_flag(locale: babel.Locale): + """Determine a unicode flag (emoji) that fits to the ``locale``""" + + emoji = lang2emoji.get(locale.language) + if emoji: + return emoji + + if not locale.territory: + return '\U0001F310' + + emoji = lang2emoji.get(locale.territory.lower()) + if emoji: + return emoji + + try: + c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[0]) + c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[1]) + # print("OK : %s --> %s%s" % (locale, c1, c2)) + except KeyError as exc: + print("ERROR: %s --> %s" % (locale, exc)) + return None + + return c1 + c2 if __name__ == "__main__": diff --git a/searxng_extra/update/update_osm_keys_tags.py b/searxng_extra/update/update_osm_keys_tags.py index 72197498d..72f3d61c5 100755 --- a/searxng_extra/update/update_osm_keys_tags.py +++ b/searxng_extra/update/update_osm_keys_tags.py @@ -50,7 +50,7 @@ from pathlib import Path from searx import searx_dir from searx.network import set_timeout_for_thread from searx.engines import wikidata, set_loggers -from searx.languages import language_codes +from searx.sxng_locales import sxng_locales from searx.engines.openstreetmap import get_key_rank, VALUE_TO_LINK set_loggers(wikidata, 'wikidata') @@ -76,7 +76,7 @@ GROUP BY ?key ?item ?itemLabel ORDER BY ?key ?item ?itemLabel """ -LANGUAGES = [l[0].lower() for l in language_codes] +LANGUAGES = [l[0].lower() for l in sxng_locales] PRESET_KEYS = { ('wikidata',): {'en': 'Wikidata'}, -- cgit v1.2.3 From 16f0db44939c23d2980d6fd2e5dfada13d8f5ee9 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Tue, 7 Feb 2023 14:11:58 +0100 Subject: [mod] replace utils.match_language by locales.match_locale This patch replaces the *full of magic* ``utils.match_language`` function by a ``locales.match_locale``. 
The ``locales.match_locale`` function is based on the ``locales.build_engine_locales`` introduced in 9ae409a0 [1]. In the past SearXNG only supported searching by language, not by region. This was changed a long time ago and regions were added to the SearXNG core, but not to the engines. ``utils.match_language`` was the function that handled the different aspects of languages/regions in the SearXNG core and the supported *languages* in the engine. ``utils.match_language`` did this with some magic; it works well for most use cases but fails in some edge cases. To resolve the competition between languages and regions in the SearXNG core, ``locales.build_engine_locales`` was introduced in 9ae409a0 [1]. With the last patches all engines have been migrated to ``fetch_traits`` and a language/region concept that is based on ``locales.build_engine_locales``. To summarize: there is no longer a need for ``utils.match_language``. [1] https://github.com/searxng/searxng/pull/1652 Signed-off-by: Markus Heiser --- searxng_extra/update/update_engine_descriptions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'searxng_extra') diff --git a/searxng_extra/update/update_engine_descriptions.py b/searxng_extra/update/update_engine_descriptions.py index 6052bf084..66bc303db 100755 --- a/searxng_extra/update/update_engine_descriptions.py +++ b/searxng_extra/update/update_engine_descriptions.py @@ -18,8 +18,8 @@ from os.path import join from lxml.html import fromstring from searx.engines import wikidata, set_loggers -from searx.utils import extract_text, match_language -from searx.locales import LOCALE_NAMES, locales_initialize +from searx.utils import extract_text +from searx.locales import LOCALE_NAMES, locales_initialize, match_locale from searx import searx_dir from searx.utils import gen_useragent, detect_language import searx.search @@ -225,9 +225,9 @@ def fetch_website_description(engine_name, website): fetched_lang, desc =
get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang]) if fetched_lang is None or desc is None: continue - matched_lang = match_language(fetched_lang, LANGUAGES, fallback=None) + matched_lang = match_locale(fetched_lang, LANGUAGES, fallback=None) if matched_lang is None: - fetched_wikipedia_lang = match_language(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None) + fetched_wikipedia_lang = match_locale(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None) matched_lang = wikipedia_languages_r.get(fetched_wikipedia_lang) if matched_lang is not None: update_description(engine_name, matched_lang, desc, website, replace=False) -- cgit v1.2.3