From 6748e8e2d5eff3c2202b2a714afb5534b1573101 Mon Sep 17 00:00:00 2001
From: Alexandre Flament <alex@al-f.net>
Date: Fri, 16 Dec 2022 20:28:57 +0000
Subject: Add "Auto-detected" as a language.

When the user chooses "Auto-detected", the choice remains in effect for the following queries.
The detected language is displayed.

For example "Auto-detected (en)":
* the next query language is going to be auto detected
* for the current query, the detected language is English.

This replaces the autodetect_search_language plugin.
---
 searx/utils.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 49 insertions(+), 4 deletions(-)

(limited to 'searx/utils.py')

diff --git a/searx/utils.py b/searx/utils.py
index cda336035..c3958ae78 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -53,6 +53,9 @@ _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
 _FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None
 """fasttext model to predict laguage of a search term"""
 
+SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in language_codes])
+"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
+
 
 class _NotSetClass:  # pylint: disable=too-few-public-methods
     """Internal class for this module, do not create instance of this class.
@@ -637,11 +640,53 @@ def _get_fasttext_model() -> "fasttext.FastText._FastText":
     return _FASTTEXT_MODEL
 
 
-def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]:
-    """https://fasttext.cc/docs/en/language-identification.html"""
+def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
+    """Detect the language of the text parameter
+
+    Args:
+        * text (str): the string whose language is to be detected.
+        * threshold (float): threshold filters the returned labels by a threshold on probability.
+          A choice of 0.3 will return labels with at least 0.3 probability.
+        * only_search_languages (bool): if True, returns only supported SearXNG search languages.
+          see :py:obj:`searx.languages`
+
+
+    Raises:
+        * ValueError: if text is not a string
+
+    Returns:
+        * result (str, None): the detected language code or None. See below.
+
+    The language detection is done by using `a fork`_ of the fastText_ library (`python
+    fasttext`_). fastText_ distributes the `language identification model`_, for
+    reference:
+
+    - `FastText.zip: Compressing text classification models`_
+    - `Bag of Tricks for Efficient Text Classification`_
+
+    The `language identification model`_ support the language codes (ISO-639-3)::
+    af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
+    ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa
+    fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io
+    is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv
+    mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn
+    no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd
+    sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep
+    vi vls vo wa war wuu xal xmf yi yo yue zh
+
+    .. _a fork: https://github.com/searxng/fasttext-predict
+    .. _fastText: https://fasttext.cc/
+    .. _python fasttext: https://pypi.org/project/fasttext/
+    .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
+    .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
+    .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
+    """
     if not isinstance(text, str):
         raise ValueError('text must a str')
     r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
-    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0 and r[1][0] > min_probability:
-        return r[0][0].split('__label__')[1]
+    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0:
+        language = r[0][0].split('__label__')[1]
+        if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
+            return None
+        return language
     return None
-- 
cgit v1.2.3


From 0b1444b61eb12a2d23d4c95e2440f24161daaec7 Mon Sep 17 00:00:00 2001
From: Markus Heiser <markus.heiser@darmarit.de>
Date: Mon, 30 Jan 2023 08:53:48 +0100
Subject: [doc] improved docs of implementations for automatic speech
 recognition

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
---
 searx/utils.py | 65 +++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 42 insertions(+), 23 deletions(-)

(limited to 'searx/utils.py')

diff --git a/searx/utils.py b/searx/utils.py
index c3958ae78..e6180906b 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -641,38 +641,56 @@ def _get_fasttext_model() -> "fasttext.FastText._FastText":
 
 
 def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
-    """Detect the language of the text parameter
+    """Detect the language of the ``text`` parameter.
 
-    Args:
-        * text (str): the string whose language is to be detected.
-        * threshold (float): threshold filters the returned labels by a threshold on probability.
-          A choice of 0.3 will return labels with at least 0.3 probability.
-        * only_search_languages (bool): if True, returns only supported SearXNG search languages.
-          see :py:obj:`searx.languages`
+    :param str text: The string whose language is to be detected.
 
+    :param float threshold: Threshold filters the returned labels by a threshold
+        on probability.  A choice of 0.3 will return labels with at least 0.3
+        probability.
 
-    Raises:
-        * ValueError: if text is not a string
+    :param bool only_search_languages: If ``True``, returns only supported
+        SearXNG search languages.  see :py:obj:`searx.languages`
 
-    Returns:
-        * result (str, None): the detected language code or None. See below.
+    :rtype: str, None
+    :returns:
+        The detected language code or ``None``. See below.
 
-    The language detection is done by using `a fork`_ of the fastText_ library (`python
-    fasttext`_). fastText_ distributes the `language identification model`_, for
-    reference:
+    :raises ValueError: If ``text`` is not a string.
+
+    The language detection is done by using `a fork`_ of the fastText_ library
+    (`python fasttext`_). fastText_ distributes the `language identification
+    model`_, for reference:
 
     - `FastText.zip: Compressing text classification models`_
     - `Bag of Tricks for Efficient Text Classification`_
 
-    The `language identification model`_ support the language codes (ISO-639-3)::
-    af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
-    ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa
-    fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io
-    is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv
-    mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn
-    no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd
-    sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep
-    vi vls vo wa war wuu xal xmf yi yo yue zh
+    The `language identification model`_ supports the language codes
+    (ISO-639-3)::
+
+        af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
+        bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
+        et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
+        id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
+        lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
+        nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
+        rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
+        tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh
+
+    By using ``only_search_languages=True`` the `language identification model`_
+    is harmonized with the SearXNG's language (locale) model.  General
+    conditions of SearXNG's locale model are:
+
+    a. SearXNG's locale of a query is passed to the
+       :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
+       code that is used by an engine.
+
+    b. Most of SearXNG's engines do not support all the languages from `language
+       identification model`_ and there is also a discrepancy in the ISO-639-3
+       (fastText) and ISO-639-2 (SearXNG) handling.  Furthermore, in SearXNG the
+       locales like ``zh-TW`` (``zh-CN``) are mapped to ``zh_Hant``
+       (``zh_Hans``) while the `language identification model`_ reduces both to
+       ``zh``.
 
     .. _a fork: https://github.com/searxng/fasttext-predict
     .. _fastText: https://fasttext.cc/
@@ -680,6 +698,7 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo
     .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
     .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
     .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
+
     """
     if not isinstance(text, str):
         raise ValueError('text must a str')
-- 
cgit v1.2.3