summaryrefslogtreecommitdiff
path: root/searxng_extra/update/update_engine_traits.py
diff options
context:
space:
mode:
authorMarkus Heiser <markus.heiser@darmarIT.de>2023-03-29 09:47:21 +0200
committerGitHub <noreply@github.com>2023-03-29 09:47:21 +0200
commitf950119ca87363aec81591dc4985f11371aa2b3e (patch)
treeab893ff1f60d8c969ff0f5c2fad0cff49148aa3c /searxng_extra/update/update_engine_traits.py
parent64fea2f9cb079bd0055c6a23360097d285204515 (diff)
parent6f9e678346e5978a09ee453a62fa133cdc0ee0bd (diff)
Merge pull request #2269 from return42/locale-revision
Revision of the locale- and language- handling in SearXNG
Diffstat (limited to 'searxng_extra/update/update_engine_traits.py')
-rwxr-xr-xsearxng_extra/update/update_engine_traits.py198
1 files changed, 198 insertions, 0 deletions
diff --git a/searxng_extra/update/update_engine_traits.py b/searxng_extra/update/update_engine_traits.py
new file mode 100755
index 000000000..7449912dc
--- /dev/null
+++ b/searxng_extra/update/update_engine_traits.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python
+# lint: pylint
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Update :py:obj:`searx.enginelib.traits.EngineTraitsMap` and :origin:`searx/languages.py`
+
+:py:obj:`searx.enginelib.traits.EngineTraitsMap.ENGINE_TRAITS_FILE`:
+ Persistence of engines traits, fetched from the engines.
+
+:origin:`searx/languages.py`
+ Is generated from intersecting each engine's supported traits.
+
+The script :origin:`searxng_extra/update/update_engine_traits.py` is called in
+the :origin:`CI Update data ... <.github/workflows/data-update.yml>`
+
+"""
+
+# pylint: disable=invalid-name
+from unicodedata import lookup
+from pathlib import Path
+from pprint import pformat
+import babel
+
+from searx import settings, searx_dir
+from searx import network
+from searx.engines import load_engines
+from searx.enginelib.traits import EngineTraitsMap
+
+# Output files.
+languages_file = Path(searx_dir) / 'sxng_locales.py'
+languages_file_header = """\
+# -*- coding: utf-8 -*-
+'''List of SearXNG's locale codes.
+
+This file is generated automatically by::
+
+ ./manage pyenv.cmd searxng_extra/update/update_engine_traits.py
+'''
+
+sxng_locales = (
+"""
+languages_file_footer = """,
+)
+'''
+A list of five-digit tuples:
+
+0. SearXNG's internal locale tag (a language or region tag)
+1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`)
+2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`).
+ Empty string for language tags.
+3. English language name (from :py:obj:`babel.core.Locale.english_name`)
+4. Unicode flag (emoji) that fits to SearXNG's internal region tag. Languages
+ are represented by a globe (\U0001F310)
+
+.. code:: python
+
+ ('en', 'English', '', 'English', '\U0001f310'),
+ ('en-CA', 'English', 'Canada', 'English', '\U0001f1e8\U0001f1e6'),
+ ('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'),
+ ..
+ ('fr', 'Français', '', 'French', '\U0001f310'),
+ ('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
+ ('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
+
+:meta hide-value:
+'''
+"""
+
+
+lang2emoji = {
+ 'ha': '\U0001F1F3\U0001F1EA', # Hausa / Niger
+ 'bs': '\U0001F1E7\U0001F1E6', # Bosnian / Bosnia & Herzegovina
+ 'jp': '\U0001F1EF\U0001F1F5', # Japanese
+ 'ua': '\U0001F1FA\U0001F1E6', # Ukrainian
+ 'he': '\U0001F1EE\U0001F1F7', # Hebrew
+}
+
+
+def main():
+ load_engines(settings['engines'])
+ # traits_map = EngineTraitsMap.from_data()
+ traits_map = fetch_traits_map()
+ sxng_tag_list = filter_locales(traits_map)
+ write_languages_file(sxng_tag_list)
+
+
+def fetch_traits_map():
+ """Fetchs supported languages for each engine and writes json file with those."""
+ network.set_timeout_for_thread(10.0)
+
+ def log(msg):
+ print(msg)
+
+ traits_map = EngineTraitsMap.fetch_traits(log=log)
+ print("fetched properties from %s engines" % len(traits_map))
+ print("write json file: %s" % traits_map.ENGINE_TRAITS_FILE)
+ traits_map.save_data()
+ return traits_map
+
+
+def filter_locales(traits_map: EngineTraitsMap):
+ """Filter language & region tags by a threshold."""
+
+ min_eng_per_region = 11
+ min_eng_per_lang = 13
+
+ _ = {}
+ for eng in traits_map.values():
+ for reg in eng.regions.keys():
+ _[reg] = _.get(reg, 0) + 1
+
+ regions = set(k for k, v in _.items() if v >= min_eng_per_region)
+ lang_from_region = set(k.split('-')[0] for k in regions)
+
+ _ = {}
+ for eng in traits_map.values():
+ for lang in eng.languages.keys():
+ # ignore script types like zh_Hant, zh_Hans or sr_Latin, pa_Arab (they
+ # already counted by existence of 'zh' or 'sr', 'pa')
+ if '_' in lang:
+ # print("ignore %s" % lang)
+ continue
+ _[lang] = _.get(lang, 0) + 1
+
+ languages = set(k for k, v in _.items() if v >= min_eng_per_lang)
+
+ sxng_tag_list = set()
+ sxng_tag_list.update(regions)
+ sxng_tag_list.update(lang_from_region)
+ sxng_tag_list.update(languages)
+
+ return sxng_tag_list
+
+
+def write_languages_file(sxng_tag_list):
+
+ language_codes = []
+
+ for sxng_tag in sorted(sxng_tag_list):
+ sxng_locale: babel.Locale = babel.Locale.parse(sxng_tag, sep='-')
+
+ flag = get_unicode_flag(sxng_locale) or ''
+
+ item = (
+ sxng_tag,
+ sxng_locale.get_language_name().title(),
+ sxng_locale.get_territory_name() or '',
+ sxng_locale.english_name.split(' (')[0],
+ UnicodeEscape(flag),
+ )
+
+ language_codes.append(item)
+
+ language_codes = tuple(language_codes)
+
+ with open(languages_file, 'w', encoding='utf-8') as new_file:
+ file_content = "{header} {language_codes}{footer}".format(
+ header=languages_file_header,
+ language_codes=pformat(language_codes, width=120, indent=4)[1:-1],
+ footer=languages_file_footer,
+ )
+ new_file.write(file_content)
+ new_file.close()
+
+
+class UnicodeEscape(str):
+ """Escape unicode string in :py:obj:`pprint.pformat`"""
+
+ def __repr__(self):
+ return "'" + "".join([chr(c) for c in self.encode('unicode-escape')]) + "'"
+
+
+def get_unicode_flag(locale: babel.Locale):
+ """Determine a unicode flag (emoji) that fits to the ``locale``"""
+
+ emoji = lang2emoji.get(locale.language)
+ if emoji:
+ return emoji
+
+ if not locale.territory:
+ return '\U0001F310'
+
+ emoji = lang2emoji.get(locale.territory.lower())
+ if emoji:
+ return emoji
+
+ try:
+ c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[0])
+ c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[1])
+ # print("OK : %s --> %s%s" % (locale, c1, c2))
+ except KeyError as exc:
+ print("ERROR: %s --> %s" % (locale, exc))
+ return None
+
+ return c1 + c2
+
+
+if __name__ == "__main__":
+ main()