From c18048e0454f4e3dc75c778940903091fbeae06a Mon Sep 17 00:00:00 2001 From: Marc Abonce Seguin Date: Sun, 25 Aug 2019 22:23:37 -0700 Subject: exclude disambiguation pages from wikipedia infobox --- searx/engines/wikipedia.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 690da72fe..44dea56fa 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -21,7 +21,8 @@ search_url = base_url + u'w/api.php?'\ 'action=query'\ '&format=json'\ '&{query}'\ - '&prop=extracts|pageimages'\ + '&prop=extracts|pageimages|pageprops'\ + '&ppprop=disambiguation'\ '&exintro'\ '&explaintext'\ '&pithumbsize=300'\ @@ -87,7 +88,7 @@ def response(resp): if int(article_id) > 0: break - if int(article_id) < 0: + if int(article_id) < 0 or 'disambiguation' in page.get('pageprops', {}): return [] title = page.get('title') -- cgit v1.2.3 From 5706c12fba98e169c7c76a4d3c29aabf48242d63 Mon Sep 17 00:00:00 2001 From: Marc Abonce Seguin Date: Sun, 25 Aug 2019 22:47:23 -0700 Subject: remove empty parenthesis in wikipedia's summary They're usually IPA pronunciations which are removed by the API. --- searx/engines/wikipedia.py | 1 + 1 file changed, 1 insertion(+) (limited to 'searx/engines') diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 44dea56fa..a216ba886 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -100,6 +100,7 @@ def response(resp): extract = page.get('extract') summary = extract_first_paragraph(extract, title, image) + summary = summary.replace('() ', '') # link to wikipedia article wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \ -- cgit v1.2.3