From 8d335dbdaedd6113242e785e8fabac86128d069a Mon Sep 17 00:00:00 2001 From: a01200356 Date: Mon, 14 Mar 2016 00:32:36 -0600 Subject: [enh] wikipedia infobox creates simple multilingual infobox using wikipedia's api --- searx/engines/wikidata.py | 16 +++--- searx/engines/wikipedia.py | 114 +++++++++++++++++++++++++++++++++++++ searx/results.py | 12 +++- searx/settings.yml | 4 +- searx/templates/oscar/infobox.html | 4 +- 5 files changed, 136 insertions(+), 14 deletions(-) create mode 100644 searx/engines/wikipedia.py (limited to 'searx') diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index 9f3496b72..8aa2fcd5c 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -86,15 +86,15 @@ def getDetail(jsonresponse, wikidata_id, language, locale): results.append({'title': title, 'url': official_website}) wikipedia_link_count = 0 + wikipedia_link = get_wikilink(result, language + 'wiki') + wikipedia_link_count += add_url(urls, + 'Wikipedia (' + language + ')', + wikipedia_link) if language != 'en': + wikipedia_en_link = get_wikilink(result, 'enwiki') wikipedia_link_count += add_url(urls, - 'Wikipedia (' + language + ')', - get_wikilink(result, language + - 'wiki')) - wikipedia_en_link = get_wikilink(result, 'enwiki') - wikipedia_link_count += add_url(urls, - 'Wikipedia (en)', - wikipedia_en_link) + 'Wikipedia (en)', + wikipedia_en_link) if wikipedia_link_count == 0: misc_language = get_wiki_firstlanguage(result, 'wiki') if misc_language is not None: @@ -188,7 +188,7 @@ def getDetail(jsonresponse, wikidata_id, language, locale): else: results.append({ 'infobox': title, - 'id': wikipedia_en_link, + 'id': wikipedia_link, 'content': description, 'attributes': attributes, 'urls': urls diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py new file mode 100644 index 000000000..fed7b263f --- /dev/null +++ b/searx/engines/wikipedia.py @@ -0,0 +1,114 @@ +""" + Wikipedia (Web) + + @website https://{language}.wikipedia.org + @provide-api yes + + @using-api yes + @results JSON + @stable yes + @parse url, infobox +""" + +from json import loads +from urllib import urlencode, quote + +# search-url +base_url = 'https://{language}.wikipedia.org/' +search_postfix = 'w/api.php?'\ + 'action=query'\ + '&format=json'\ + '&{query}'\ + '&prop=extracts|pageimages'\ + '&exintro'\ + '&explaintext'\ + '&pithumbsize=300'\ + '&redirects' + + +# set language in base_url +def url_lang(lang): + if lang == 'all': + language = 'en' + else: + language = lang.split('_')[0] + + return base_url.format(language=language) + + +# do search-request +def request(query, params): + if query.islower(): + query += '|' + query.title() + + params['url'] = url_lang(params['language']) \ + + search_postfix.format(query=urlencode({'titles': query})) + + return params + + +# get first meaningful paragraph +# this should filter out disambiguation pages and notes above first paragraph +# "magic numbers" were obtained by fine tuning +def extract_first_paragraph(content, title, image): + first_paragraph = None + + failed_attempts = 0 + for paragraph in content.split('\n'): + + starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35) + length = len(paragraph) + + if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)): + first_paragraph = paragraph + break + + failed_attempts += 1 + if failed_attempts > 3: + return None + + return first_paragraph + + +# get response from search-request +def response(resp): + results = [] + + search_result = loads(resp.content) + + # wikipedia article's unique id + # first valid id is assumed to be the requested article + for article_id in search_result['query']['pages']: + page = search_result['query']['pages'][article_id] + if int(article_id) > 0: + break + + if int(article_id) < 0: + return [] + + title = page.get('title') + + image = page.get('thumbnail') + if image: + image = image.get('source') + + extract = page.get('extract') + + summary = extract_first_paragraph(extract, title, image) + if not summary: + return [] + + # link to wikipedia article + # parenthesis are not quoted to make infobox mergeable with wikidata's + wikipedia_link = url_lang(resp.search_params['language']) \ + + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')).replace('%28', '(').replace('%29', ')') + + results.append({'url': wikipedia_link, 'title': title}) + + results.append({'infobox': title, + 'id': wikipedia_link, + 'content': summary, + 'img_src': image, + 'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]}) + + return results diff --git a/searx/results.py b/searx/results.py index 5d51eb5b5..c3040b305 100644 --- a/searx/results.py +++ b/searx/results.py @@ -37,7 +37,7 @@ def merge_two_infoboxes(infobox1, infobox2): urls1 = infobox1.get('urls', None) if urls1 is None: urls1 = [] - infobox1.set('urls', urls1) + infobox1['urls'] = urls1 urlSet = set() for url in infobox1.get('urls', []): @@ -47,11 +47,17 @@ def merge_two_infoboxes(infobox1, infobox2): if url.get('url', None) not in urlSet: urls1.append(url) + if 'img_src' in infobox2: + img1 = infobox1.get('img_src', None) + img2 = infobox2.get('img_src') + if img1 is None: + infobox1['img_src'] = img2 + if 'attributes' in infobox2: attributes1 = infobox1.get('attributes', None) if attributes1 is None: attributes1 = [] - infobox1.set('attributes', attributes1) + infobox1['attributes'] = attributes1 attributeSet = set() for attribute in infobox1.get('attributes', []): @@ -68,7 +74,7 @@ def merge_two_infoboxes(infobox1, infobox2): if result_content_len(content2) > result_content_len(content1): infobox1['content'] = content2 else: - infobox1.set('content', content2) + infobox1['content'] = content2 def result_score(result): diff --git a/searx/settings.yml b/searx/settings.yml index 96ac4e716..ff85684ac 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -43,10 +43,9 @@ engines: shortcut : bs - name : wikipedia - engine : mediawiki + engine : wikipedia shortcut : wp base_url : 'https://{language}.wikipedia.org/' - number_of_results : 1 - name : bing engine : bing @@ -93,6 +92,7 @@ engines: - name : ddg definitions engine : duckduckgo_definitions shortcut : ddd + disabled : True - name : digg engine : digg diff --git a/searx/templates/oscar/infobox.html b/searx/templates/oscar/infobox.html index d87d98453..606a5d22c 100644 --- a/searx/templates/oscar/infobox.html +++ b/searx/templates/oscar/infobox.html @@ -1,8 +1,9 @@
-

{{ infobox.infobox }}

+

{{ infobox.infobox }}

+ {% if infobox.img_src %}{{ infobox.infobox }}{% endif %} {% if infobox.content %}

{{ infobox.content }}

{% endif %} @@ -28,5 +29,6 @@ {% endfor %}
{% endif %} +
-- cgit v1.2.3 From 6dca14e95d08479fb49314cb4093be36ac49cf94 Mon Sep 17 00:00:00 2001 From: a01200356 Date: Sun, 17 Apr 2016 16:21:44 -0500 Subject: [enh] multilingual duckduckgo_definitions --- searx/engines/duckduckgo_definitions.py | 7 ++++++- searx/templates/oscar/infobox.html | 6 +++--- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'searx') diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index 793e97d22..dc25d416f 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -1,5 +1,6 @@ import json from urllib import urlencode +from re import sub from lxml import html from searx.utils import html_to_text from searx.engines.xpath import extract_text @@ -19,8 +20,8 @@ def result_to_text(url, text, htmlResult): def request(query, params): - # TODO add kl={locale} params['url'] = url.format(query=urlencode({'q': query})) + params['headers']['Accept-Language'] = params['language'] return params @@ -103,6 +104,10 @@ def response(resp): urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL}) + # to merge with wikidata's infobox + if infobox_id: + infobox_id = sub(r'^http:', r'https:', infobox_id) + # entity entity = search_res.get('Entity', None) # TODO continent / country / department / location / waterfall / diff --git a/searx/templates/oscar/infobox.html b/searx/templates/oscar/infobox.html index 606a5d22c..c72cfb638 100644 --- a/searx/templates/oscar/infobox.html +++ b/searx/templates/oscar/infobox.html @@ -1,9 +1,9 @@
-

{{ infobox.infobox }}

+

{{ infobox.infobox }}

- + {% if infobox.img_src %}{{ infobox.infobox }}{% endif %} {% if infobox.content %}

{{ infobox.content }}

{% endif %} @@ -29,6 +29,6 @@ {% endfor %}
{% endif %} - +
-- cgit v1.2.3 From a44faa77167980a414df2cbe936a52359351f455 Mon Sep 17 00:00:00 2001 From: a01200356 Date: Mon, 18 Apr 2016 10:52:16 -0500 Subject: [fix] compile regex in ddg_definitions --- searx/engines/duckduckgo_definitions.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'searx') diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index dc25d416f..208ccca28 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -1,6 +1,6 @@ import json from urllib import urlencode -from re import sub +from re import compile, sub from lxml import html from searx.utils import html_to_text from searx.engines.xpath import extract_text @@ -8,6 +8,8 @@ from searx.engines.xpath import extract_text url = 'https://api.duckduckgo.com/'\ + '?{query}&format=json&pretty=0&no_redirect=1&d=1' +http_regex = compile(r'^http:') + def result_to_text(url, text, htmlResult): # TODO : remove result ending with "Meaning" or "Category" @@ -106,7 +108,7 @@ def response(resp): # to merge with wikidata's infobox if infobox_id: - infobox_id = sub(r'^http:', r'https:', infobox_id) + infobox_id = http_regex.sub('https:', infobox_id) # entity entity = search_res.get('Entity', None) -- cgit v1.2.3