| field | value | date |
|---|---|---|
| author | Markus Heiser <markus.heiser@darmarIT.de> | 2023-04-15 16:10:53 +0200 |
| committer | GitHub <noreply@github.com> | 2023-04-15 16:10:53 +0200 |
| commit | 5c8d56e73a1859ac8ad5d433d6f3a02b2709d696 | |
| tree | b337fbf806d5772c96035eae7b27fcdb0575c397 /searxng_extra/update | |
| parent | 0adfed195e64c334117576f059b844e28e0d0d34 | |
| parent | 09295a3fd116130f5c5595e0eea8a7ccbf3a2355 | |
Merge pull request #2316 from return42/fix-2314-upd-desc
[fix] searxng_extra/update/update_engine_descriptions.py
Diffstat (limited to 'searxng_extra/update')
| mode | file | lines changed |
|---|---|---|
| -rwxr-xr-x | searxng_extra/update/update_engine_descriptions.py | 161 |

1 file changed, 113 insertions, 48 deletions
```diff
diff --git a/searxng_extra/update/update_engine_descriptions.py b/searxng_extra/update/update_engine_descriptions.py
index 66bc303db..301ce798d 100755
--- a/searxng_extra/update/update_engine_descriptions.py
+++ b/searxng_extra/update/update_engine_descriptions.py
@@ -18,7 +18,7 @@ from os.path import join
 from lxml.html import fromstring
 
 from searx.engines import wikidata, set_loggers
-from searx.utils import extract_text
+from searx.utils import extract_text, searx_useragent
 from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
 from searx import searx_dir
 from searx.utils import gen_useragent, detect_language
@@ -28,8 +28,12 @@ import searx.network
 set_loggers(wikidata, 'wikidata')
 locales_initialize()
 
+# you can run the query in https://query.wikidata.org
+# replace %IDS% by Wikidata entities separated by spaces with the prefix wd:
+# for example wd:Q182496 wd:Q1540899
+# replace %LANGUAGES_SPARQL% by languages
 SPARQL_WIKIPEDIA_ARTICLE = """
-SELECT DISTINCT ?item ?name
+SELECT DISTINCT ?item ?name ?article ?lang
 WHERE {
   hint:Query hint:optimizer "None".
   VALUES ?item { %IDS% }
@@ -40,6 +44,7 @@ WHERE {
   FILTER(?lang in (%LANGUAGES_SPARQL%)) .
   FILTER (!CONTAINS(?name, ':')) .
 }
+ORDER BY ?item ?lang
 """
 
 SPARQL_DESCRIPTION = """
@@ -69,10 +74,11 @@ SKIP_ENGINE_SOURCE = [
     # fmt: on
 ]
 
-LANGUAGES = LOCALE_NAMES.keys()
-WIKIPEDIA_LANGUAGES = {'language': 'wikipedia_language'}
+WIKIPEDIA_LANGUAGES = {}
 LANGUAGES_SPARQL = ''
 IDS = None
+WIKIPEDIA_LANGUAGE_VARIANTS = {'zh_Hant': 'zh-tw'}
+
 
 descriptions = {}
 wd_to_engine_name = {}
@@ -102,16 +108,31 @@ def update_description(engine_name, lang, description, source, replace=True):
     descriptions[engine_name][lang] = [description, source]
 
 
-def get_wikipedia_summary(lang, pageid):
-    params = {'language': lang.replace('_', '-'), 'headers': {}}
-    searx.engines.engines['wikipedia'].request(pageid, params)
+def get_wikipedia_summary(wikipedia_url, searxng_locale):
+    # get the REST API URL from the HTML URL
+
+    # Headers
+    headers = {'User-Agent': searx_useragent()}
+
+    if searxng_locale in WIKIPEDIA_LANGUAGE_VARIANTS:
+        headers['Accept-Language'] = WIKIPEDIA_LANGUAGE_VARIANTS.get(searxng_locale)
+
+    # URL path : from HTML URL to REST API URL
+    parsed_url = urlparse(wikipedia_url)
+    # remove the /wiki/ prefix
+    article_name = parsed_url.path.split('/wiki/')[1]
+    # article_name is already encoded but not the / which is required for the REST API call
+    encoded_article_name = article_name.replace('/', '%2F')
+    path = '/api/rest_v1/page/summary/' + encoded_article_name
+    wikipedia_rest_url = parsed_url._replace(path=path).geturl()
     try:
-        response = searx.network.get(params['url'], headers=params['headers'], timeout=10)
+        response = searx.network.get(wikipedia_rest_url, headers=headers, timeout=10)
         response.raise_for_status()
-        api_result = json.loads(response.text)
-        return api_result.get('extract')
-    except Exception:  # pylint: disable=broad-except
+    except Exception as e:  # pylint: disable=broad-except
+        print("     ", wikipedia_url, e)
         return None
+    api_result = json.loads(response.text)
+    return api_result.get('extract')
 
 
 def get_website_description(url, lang1, lang2=None):
@@ -154,11 +175,25 @@ def get_website_description(url, lang1, lang2=None):
 
 
 def initialize():
-    global IDS, WIKIPEDIA_LANGUAGES, LANGUAGES_SPARQL
+    global IDS, LANGUAGES_SPARQL
     searx.search.initialize()
     wikipedia_engine = searx.engines.engines['wikipedia']
-    WIKIPEDIA_LANGUAGES = {language: wikipedia_engine.url_lang(language.replace('_', '-')) for language in LANGUAGES}
-    WIKIPEDIA_LANGUAGES['nb_NO'] = 'no'
+
+    locale2lang = {'nl-BE': 'nl'}
+    for sxng_ui_lang in LOCALE_NAMES:
+
+        sxng_ui_alias = locale2lang.get(sxng_ui_lang, sxng_ui_lang)
+        wiki_lang = None
+
+        if sxng_ui_alias in wikipedia_engine.traits.custom['WIKIPEDIA_LANGUAGES']:
+            wiki_lang = sxng_ui_alias
+        if not wiki_lang:
+            wiki_lang = wikipedia_engine.traits.get_language(sxng_ui_alias)
+        if not wiki_lang:
+            print(f"WIKIPEDIA_LANGUAGES missing {sxng_ui_lang}")
+            continue
+
+        WIKIPEDIA_LANGUAGES[sxng_ui_lang] = wiki_lang
+
     LANGUAGES_SPARQL = ', '.join(f"'{l}'" for l in set(WIKIPEDIA_LANGUAGES.values()))
     for engine_name, engine in searx.engines.engines.items():
         descriptions[engine_name] = {}
@@ -170,6 +205,7 @@ def initialize():
 
 def fetch_wikidata_descriptions():
+    print('Fetching wikidata descriptions')
     searx.network.set_timeout_for_thread(60)
     result = wikidata.send_wikidata_query(
         SPARQL_DESCRIPTION.replace('%IDS%', IDS).replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)
     )
@@ -178,14 +214,20 @@ def fetch_wikidata_descriptions():
     for binding in result['results']['bindings']:
         wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
         wikidata_lang = binding['itemDescription']['xml:lang']
-        description = binding['itemDescription']['value']
+        desc = binding['itemDescription']['value']
         for engine_name in wd_to_engine_name[wikidata_id]:
-            for lang in LANGUAGES:
-                if WIKIPEDIA_LANGUAGES[lang] == wikidata_lang:
-                    update_description(engine_name, lang, description, 'wikidata')
+            for searxng_locale in LOCALE_NAMES:
+                if WIKIPEDIA_LANGUAGES[searxng_locale] != wikidata_lang:
+                    continue
+                print(
+                    f"    engine: {engine_name:20} / wikidata_lang: {wikidata_lang:5}",
+                    f"/ len(wikidata_desc): {len(desc)}",
+                )
+                update_description(engine_name, searxng_locale, desc, 'wikidata')
 
 
 def fetch_wikipedia_descriptions():
+    print('Fetching wikipedia descriptions')
     result = wikidata.send_wikidata_query(
         SPARQL_WIKIPEDIA_ARTICLE.replace('%IDS%', IDS).replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)
     )
@@ -193,12 +235,19 @@ def fetch_wikipedia_descriptions():
     for binding in result['results']['bindings']:
         wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
         wikidata_lang = binding['name']['xml:lang']
-        pageid = binding['name']['value']
+        wikipedia_url = binding['article']['value']  # for example the URL https://de.wikipedia.org/wiki/PubMed
         for engine_name in wd_to_engine_name[wikidata_id]:
-            for lang in LANGUAGES:
-                if WIKIPEDIA_LANGUAGES[lang] == wikidata_lang:
-                    description = get_wikipedia_summary(lang, pageid)
-                    update_description(engine_name, lang, description, 'wikipedia')
+            for searxng_locale in LOCALE_NAMES:
+                if WIKIPEDIA_LANGUAGES[searxng_locale] != wikidata_lang:
+                    continue
+                desc = get_wikipedia_summary(wikipedia_url, searxng_locale)
+                if not desc:
+                    continue
+                print(
+                    f"    engine: {engine_name:20} / wikidata_lang: {wikidata_lang:5}",
+                    f"/ len(wikipedia_desc): {len(desc)}",
+                )
+                update_description(engine_name, searxng_locale, desc, 'wikipedia')
 
 
 def normalize_url(url):
@@ -209,41 +258,60 @@ def normalize_url(url):
 
 
 def fetch_website_description(engine_name, website):
+    print(f"- fetch website descr: {engine_name} / {website}")
     default_lang, default_description = get_website_description(website, None, None)
+
     if default_lang is None or default_description is None:
         # the front page can't be fetched: skip this engine
         return
 
-    wikipedia_languages_r = {V: K for K, V in WIKIPEDIA_LANGUAGES.items()}
+    # to specify an order in where the most common languages are in front of the
+    # language list ..
     languages = ['en', 'es', 'pt', 'ru', 'tr', 'fr']
-    languages = languages + [l for l in LANGUAGES if l not in languages]
+    languages = languages + [l for l in LOCALE_NAMES if l not in languages]
 
     previous_matched_lang = None
     previous_count = 0
+
    for lang in languages:
-        if lang not in descriptions[engine_name]:
-            fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang])
-            if fetched_lang is None or desc is None:
-                continue
-            matched_lang = match_locale(fetched_lang, LANGUAGES, fallback=None)
-            if matched_lang is None:
-                fetched_wikipedia_lang = match_locale(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None)
-                matched_lang = wikipedia_languages_r.get(fetched_wikipedia_lang)
-            if matched_lang is not None:
-                update_description(engine_name, matched_lang, desc, website, replace=False)
-                # check if desc changed with the different lang values
-                if matched_lang == previous_matched_lang:
-                    previous_count += 1
-                    if previous_count == 6:
-                        # the website has returned the same description for 6 different languages in Accept-Language header
-                        # stop now
-                        break
-                else:
-                    previous_matched_lang = matched_lang
-                    previous_count = 0
+
+        if lang in descriptions[engine_name]:
+            continue
+
+        fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang])
+        if fetched_lang is None or desc is None:
+            continue
+
+        # check if desc changed with the different lang values
+
+        if fetched_lang == previous_matched_lang:
+            previous_count += 1
+            if previous_count == 6:
+                # the website has returned the same description for 6 different languages in Accept-Language header
+                # stop now
+                break
+        else:
+            previous_matched_lang = fetched_lang
+            previous_count = 0
+
+        # Don't trust in the value of fetched_lang, some websites return
+        # for some inappropriate values, by example bing-images::
+        #
+        #   requested lang: zh-Hans-CN / fetched lang: ceb / desc: 查看根据您的兴趣量身定制的提要
+        #
+        # The lang ceb is "Cebuano" but the description is given in zh-Hans-CN
+
+        print(
+            f"    engine: {engine_name:20} / requested lang:{lang:7}"
+            f" / fetched lang: {fetched_lang:7} / len(desc): {len(desc)}"
+        )
+
+        matched_lang = match_locale(fetched_lang, LOCALE_NAMES.keys(), fallback=lang)
+        update_description(engine_name, matched_lang, desc, website, replace=False)
 
 
 def fetch_website_descriptions():
+    print('Fetching website descriptions')
     for engine_name, engine in searx.engines.engines.items():
         website = getattr(engine, "about", {}).get('website')
         if website is None and hasattr(engine, "search_url"):
@@ -289,11 +357,8 @@ def get_output():
 
 def main():
     initialize()
-    print('Fetching wikidata descriptions')
     fetch_wikidata_descriptions()
-    print('Fetching wikipedia descriptions')
     fetch_wikipedia_descriptions()
-    print('Fetching website descriptions')
     fetch_website_descriptions()
 
     output = get_output()
```
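The new comment block above `SPARQL_WIKIPEDIA_ARTICLE` describes how the query can be tested by hand: substitute `%IDS%` and `%LANGUAGES_SPARQL%` and paste the result into https://query.wikidata.org. A minimal sketch of that substitution, using the `wd:` entities from the comment; the language list and the abbreviated query body are illustrative, not part of the patch:

```python
# Sketch of the placeholder substitution done before the query is sent.
# The query body is abbreviated - see the full SPARQL_WIKIPEDIA_ARTICLE
# in update_engine_descriptions.py.
SPARQL_WIKIPEDIA_ARTICLE = """
SELECT DISTINCT ?item ?name ?article ?lang
WHERE {
  hint:Query hint:optimizer "None".
  VALUES ?item { %IDS% }
  # ... article triples omitted ...
  FILTER(?lang in (%LANGUAGES_SPARQL%)) .
  FILTER (!CONTAINS(?name, ':')) .
}
ORDER BY ?item ?lang
"""

ids = 'wd:Q182496 wd:Q1540899'         # Wikidata entities, space separated, prefixed with wd:
languages_sparql = "'en', 'de', 'fr'"  # illustrative value of LANGUAGES_SPARQL

query = SPARQL_WIKIPEDIA_ARTICLE.replace('%IDS%', ids).replace('%LANGUAGES_SPARQL%', languages_sparql)
print(query)  # paste the printed query into https://query.wikidata.org to run it by hand
```

This mirrors the `.replace()` calls in `fetch_wikipedia_descriptions()`; the `ORDER BY ?item ?lang` added by the patch makes the result rows deterministic.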
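The rewritten `get_wikipedia_summary()` no longer goes through the wikipedia engine's `request()`; it derives the REST v1 summary endpoint directly from the article's HTML URL (and sets an `Accept-Language` header for variants such as `zh_Hant` → `zh-tw`). A standalone sketch of just the URL transformation, using only the standard library; the function name is hypothetical, the example URL is the one from the inline comment:

```python
from urllib.parse import urlparse

def wikipedia_rest_url(wikipedia_url: str) -> str:
    """Turn an article's HTML URL into its REST v1 summary endpoint,
    mirroring the transformation in get_wikipedia_summary()."""
    parsed_url = urlparse(wikipedia_url)
    # strip the /wiki/ prefix; the article name is already percent-encoded
    article_name = parsed_url.path.split('/wiki/')[1]
    # a literal '/' inside the article name must be encoded for the REST call
    encoded_article_name = article_name.replace('/', '%2F')
    path = '/api/rest_v1/page/summary/' + encoded_article_name
    return parsed_url._replace(path=path).geturl()

print(wikipedia_rest_url('https://de.wikipedia.org/wiki/PubMed'))
# -> https://de.wikipedia.org/api/rest_v1/page/summary/PubMed
```

Reusing the parsed HTML URL keeps the scheme and the per-language host (`de.wikipedia.org`), so only the path has to be rewritten.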
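`fetch_website_description()` probes the front page once per language and gives up when the site reports the same fetched language six times in a row, a sign that it ignores the `Accept-Language` header. A minimal standalone sketch of that early-exit heuristic; `fetch_lang` is a hypothetical callable standing in for `get_website_description()`:

```python
def probe_site_languages(fetch_lang, languages, give_up_after=6):
    """Early-exit loop as in fetch_website_description().
    fetch_lang(lang) returns the language the site actually answered
    with, or None if the page could not be fetched."""
    seen = []
    previous_matched_lang = None
    previous_count = 0
    for lang in languages:
        fetched_lang = fetch_lang(lang)
        if fetched_lang is None:
            continue
        if fetched_lang == previous_matched_lang:
            previous_count += 1
            if previous_count == give_up_after:
                # same answer for 6 requested languages in a row: the site
                # most likely ignores Accept-Language, stop probing
                break
        else:
            previous_matched_lang = fetched_lang
            previous_count = 0
        seen.append((lang, fetched_lang))
    return seen

# a site that answers 'en' no matter which language is requested:
result = probe_site_languages(lambda lang: 'en', ['en', 'es', 'pt', 'ru', 'tr', 'fr', 'de', 'it'])
print(result)  # six entries; the loop breaks on the seventh probe
```

This is also why the patch puts the most common languages (`en`, `es`, `pt`, `ru`, `tr`, `fr`) at the front of the probe order: a monolingual site is detected after a handful of requests instead of one per locale.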