From 6748e8e2d5eff3c2202b2a714afb5534b1573101 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Fri, 16 Dec 2022 20:28:57 +0000 Subject: Add "Auto-detected" as a language. When the user choose "Auto-detected", the choice remains on the following queries. The detected language is displayed. For example "Auto-detected (en)": * the next query language is going to be auto detected * for the current query, the detected language is English. This replace the autodetect_search_language plugin. --- searx/utils.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 4 deletions(-) (limited to 'searx/utils.py') diff --git a/searx/utils.py b/searx/utils.py index cda336035..c3958ae78 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -53,6 +53,9 @@ _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {} _FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None """fasttext model to predict laguage of a search term""" +SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in language_codes]) +"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`).""" + class _NotSetClass: # pylint: disable=too-few-public-methods """Internal class for this module, do not create instance of this class. @@ -637,11 +640,53 @@ def _get_fasttext_model() -> "fasttext.FastText._FastText": return _FASTTEXT_MODEL -def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]: - """https://fasttext.cc/docs/en/language-identification.html""" +def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]: + """Detect the language of the text parameter + + Args: + * text (str): the string whose language is to be detected. + * threshold (float): threshold filters the returned labels by a threshold on probability. + A choice of 0.3 will return labels with at least 0.3 probability. 
+ * only_search_languages (bool): if True, returns only supported SearXNG search languages. + see :py:obj:`searx.languages` + + + Raises: + * ValueError: if text is not a string + + Returns: + * result (str, None): the detected language code or None. See below. + + The language detection is done by using `a fork`_ of the fastText_ library (`python + fasttext`_). fastText_ distributes the `language identification model`_, for + reference: + + - `FastText.zip: Compressing text classification models`_ + - `Bag of Tricks for Efficient Text Classification`_ + + The `language identification model`_ support the language codes (ISO-639-3):: + af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr + ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa + fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io + is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv + mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn + no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd + sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep + vi vls vo wa war wuu xal xmf yi yo yue zh + + .. _a fork: https://github.com/searxng/fasttext-predict + .. _fastText: https://fasttext.cc/ + .. _python fasttext: https://pypi.org/project/fasttext/ + .. _language identification model: https://fasttext.cc/docs/en/language-identification.html + .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759 + .. 
_`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651 + """ if not isinstance(text, str): raise ValueError('text must a str') r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold) - if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0 and r[1][0] > min_probability: - return r[0][0].split('__label__')[1] + if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0: + language = r[0][0].split('__label__')[1] + if only_search_languages and language not in SEARCH_LANGUAGE_CODES: + return None + return language return None -- cgit v1.2.3 From 0b1444b61eb12a2d23d4c95e2440f24161daaec7 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Mon, 30 Jan 2023 08:53:48 +0100 Subject: [doc] improved docs of implementations for automatic speech recognition Signed-off-by: Markus Heiser --- searx/utils.py | 65 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 23 deletions(-) (limited to 'searx/utils.py') diff --git a/searx/utils.py b/searx/utils.py index c3958ae78..e6180906b 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -641,38 +641,56 @@ def _get_fasttext_model() -> "fasttext.FastText._FastText": def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]: - """Detect the language of the text parameter + """Detect the language of the ``text`` parameter. - Args: - * text (str): the string whose language is to be detected. - * threshold (float): threshold filters the returned labels by a threshold on probability. - A choice of 0.3 will return labels with at least 0.3 probability. - * only_search_languages (bool): if True, returns only supported SearXNG search languages. - see :py:obj:`searx.languages` + :param str text: The string whose language is to be detected. + :param float threshold: Threshold filters the returned labels by a threshold + on probability. 
A choice of 0.3 will return labels with at least 0.3 + probability. - Raises: - * ValueError: if text is not a string + :param bool only_search_languages: If ``True``, returns only supported + SearXNG search languages. see :py:obj:`searx.languages` - Returns: - * result (str, None): the detected language code or None. See below. + :rtype: str, None + :returns: + The detected language code or ``None``. See below. - The language detection is done by using `a fork`_ of the fastText_ library (`python - fasttext`_). fastText_ distributes the `language identification model`_, for - reference: + :raises ValueError: If ``text`` is not a string. + + The language detection is done by using `a fork`_ of the fastText_ library + (`python fasttext`_). fastText_ distributes the `language identification + model`_, for reference: - `FastText.zip: Compressing text classification models`_ - `Bag of Tricks for Efficient Text Classification`_ - The `language identification model`_ support the language codes (ISO-639-3):: - af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr - ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa - fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io - is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv - mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn - no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd - sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep - vi vls vo wa war wuu xal xmf yi yo yue zh + The `language identification model`_ support the language codes + (ISO-639-3):: + + af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs + bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es + et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia + id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li + 
lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah + nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru + rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl + tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh + + By using ``only_search_languages=True`` the `language identification model`_ + is harmonized with SearXNG's language (locale) model. General + conditions of SearXNG's locale model are: + + a. SearXNG's locale of a query is passed to the + :py:obj:`searx.locales.get_engine_locale` to get a language and/or region + code that is used by an engine. + + b. Most of SearXNG's engines do not support all the languages from the `language + identification model`_ and there is also a discrepancy in the ISO-639-3 + (fastText) and ISO-639-2 (SearXNG) handling. Furthermore, in SearXNG the + locales like ``zh-TW`` (``zh-CN``) are mapped to ``zh_Hant`` + (``zh_Hans``) while the `language identification model`_ reduces both to + ``zh``. .. _a fork: https://github.com/searxng/fasttext-predict .. _fastText: https://fasttext.cc/ @@ -680,6 +698,7 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo .. _language identification model: https://fasttext.cc/docs/en/language-identification.html .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759 .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651 + """ if not isinstance(text, str): raise ValueError('text must a str') -- cgit v1.2.3