diff options
Diffstat (limited to 'searx/engines/wikipedia.py')
| -rw-r--r-- | searx/engines/wikipedia.py | 21 |
1 files changed, 12 insertions, 9 deletions
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index db2fdc000..a216ba886 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -13,6 +13,7 @@
 from json import loads
 from lxml.html import fromstring
 from searx.url_utils import quote, urlencode
+from searx.utils import match_language
 
 # search-url
 base_url = u'https://{language}.wikipedia.org/'
@@ -20,7 +21,8 @@ search_url = base_url + u'w/api.php?'\
     'action=query'\
     '&format=json'\
     '&{query}'\
-    '&prop=extracts|pageimages'\
+    '&prop=extracts|pageimages|pageprops'\
+    '&ppprop=disambiguation'\
     '&exintro'\
     '&explaintext'\
     '&pithumbsize=300'\
@@ -30,13 +32,10 @@ supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
 
 # set language in base_url
 def url_lang(lang):
-    lang = lang.split('-')[0]
-    if lang == 'all' or lang not in supported_languages:
-        language = 'en'
-    else:
-        language = lang
-
-    return language
+    lang_pre = lang.split('-')[0]
+    if lang_pre == 'all' or lang_pre not in supported_languages and lang_pre not in language_aliases:
+        return 'en'
+    return match_language(lang, supported_languages, language_aliases).split('-')[0]
 
 
 # do search-request
@@ -81,12 +80,15 @@ def response(resp):
 
     # wikipedia article's unique id
     # first valid id is assumed to be the requested article
+    if 'pages' not in search_result['query']:
+        return results
+
     for article_id in search_result['query']['pages']:
         page = search_result['query']['pages'][article_id]
         if int(article_id) > 0:
             break
 
-    if int(article_id) < 0:
+    if int(article_id) < 0 or 'disambiguation' in page.get('pageprops', {}):
         return []
 
     title = page.get('title')
@@ -98,6 +100,7 @@ def response(resp):
     extract = page.get('extract')
 
     summary = extract_first_paragraph(extract, title, image)
+    summary = summary.replace('() ', '')
 
     # link to wikipedia article
     wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \