From 8d335dbdaedd6113242e785e8fabac86128d069a Mon Sep 17 00:00:00 2001
From: a01200356 <a01200356@itesm.mx>
Date: Mon, 14 Mar 2016 00:32:36 -0600
Subject: [enh] wikipedia infobox

Creates a simple multilingual infobox using Wikipedia's API.
---
 searx/engines/wikidata.py  |  16 +++----
 searx/engines/wikipedia.py | 114 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+), 8 deletions(-)
 create mode 100644 searx/engines/wikipedia.py

(limited to 'searx/engines')

diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index 9f3496b72..8aa2fcd5c 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -86,15 +86,15 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
         results.append({'title': title, 'url': official_website})
 
     wikipedia_link_count = 0
+    wikipedia_link = get_wikilink(result, language + 'wiki')
+    wikipedia_link_count += add_url(urls,
+                                    'Wikipedia (' + language + ')',
+                                    wikipedia_link)
     if language != 'en':
+        wikipedia_en_link = get_wikilink(result, 'enwiki')
         wikipedia_link_count += add_url(urls,
-                                        'Wikipedia (' + language + ')',
-                                        get_wikilink(result, language +
-                                                     'wiki'))
-    wikipedia_en_link = get_wikilink(result, 'enwiki')
-    wikipedia_link_count += add_url(urls,
-                                    'Wikipedia (en)',
-                                    wikipedia_en_link)
+                                        'Wikipedia (en)',
+                                        wikipedia_en_link)
     if wikipedia_link_count == 0:
         misc_language = get_wiki_firstlanguage(result, 'wiki')
         if misc_language is not None:
@@ -188,7 +188,7 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
     else:
         results.append({
                        'infobox': title,
-                       'id': wikipedia_en_link,
+                       'id': wikipedia_link,
                        'content': description,
                        'attributes': attributes,
                        'urls': urls
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
new file mode 100644
index 000000000..fed7b263f
--- /dev/null
+++ b/searx/engines/wikipedia.py
@@ -0,0 +1,114 @@
+"""
+ Wikipedia (Web)
+
+ @website     https://{language}.wikipedia.org
+ @provide-api yes
+
+ @using-api   yes
+ @results     JSON
+ @stable      yes
+ @parse       url, infobox
+"""
+
+from json import loads
+from urllib import urlencode, quote
+
+# search-url
+base_url = 'https://{language}.wikipedia.org/'
+search_postfix = 'w/api.php?'\
+    'action=query'\
+    '&format=json'\
+    '&{query}'\
+    '&prop=extracts|pageimages'\
+    '&exintro'\
+    '&explaintext'\
+    '&pithumbsize=300'\
+    '&redirects'
+
+
+# set language in base_url
+def url_lang(lang):
+    if lang == 'all':
+        language = 'en'
+    else:
+        language = lang.split('_')[0]
+
+    return base_url.format(language=language)
+
+
+# do search-request
+def request(query, params):
+    if query.islower():
+        query += '|' + query.title()
+
+    params['url'] = url_lang(params['language']) \
+        + search_postfix.format(query=urlencode({'titles': query}))
+
+    return params
+
+
+# get the first meaningful paragraph
+# this should filter out disambiguation pages and notes above the first paragraph
+# the "magic numbers" (35, 150, 200 and the 3 failed attempts) were obtained by fine-tuning
+def extract_first_paragraph(content, title, image):
+    first_paragraph = None
+
+    failed_attempts = 0
+    for paragraph in content.split('\n'):
+
+        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
+        length = len(paragraph)
+
+        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
+            first_paragraph = paragraph
+            break
+
+        failed_attempts += 1
+        if failed_attempts > 3:
+            return None
+
+    return first_paragraph
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    search_result = loads(resp.content)
+
+    # pages are keyed by the wikipedia article's unique id
+    # the first valid (positive) id is assumed to be the requested article
+    for article_id in search_result['query']['pages']:
+        page = search_result['query']['pages'][article_id]
+        if int(article_id) > 0:
+            break
+
+    if int(article_id) < 0:
+        return []
+
+    title = page.get('title')
+
+    image = page.get('thumbnail')
+    if image:
+        image = image.get('source')
+
+    extract = page.get('extract')
+
+    summary = extract_first_paragraph(extract, title, image)
+    if not summary:
+        return []
+
+    # link to wikipedia article
+    # parentheses are not quoted to make the infobox mergeable with wikidata's
+    wikipedia_link = url_lang(resp.search_params['language']) \
+        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')).replace('%28', '(').replace('%29', ')')
+
+    results.append({'url': wikipedia_link, 'title': title})
+
+    results.append({'infobox': title,
+                    'id': wikipedia_link,
+                    'content': summary,
+                    'img_src': image,
+                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
+
+    return results
-- 
cgit v1.2.3


From 6dca14e95d08479fb49314cb4093be36ac49cf94 Mon Sep 17 00:00:00 2001
From: a01200356 <a01200356@itesm.mx>
Date: Sun, 17 Apr 2016 16:21:44 -0500
Subject: [enh] multilingual duckduckgo_definitions

---
 searx/engines/duckduckgo_definitions.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'searx/engines')

diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py
index 793e97d22..dc25d416f 100644
--- a/searx/engines/duckduckgo_definitions.py
+++ b/searx/engines/duckduckgo_definitions.py
@@ -1,5 +1,6 @@
 import json
 from urllib import urlencode
+from re import sub
 from lxml import html
 from searx.utils import html_to_text
 from searx.engines.xpath import extract_text
@@ -19,8 +20,8 @@ def result_to_text(url, text, htmlResult):
 
 
 def request(query, params):
-    # TODO add kl={locale}
     params['url'] = url.format(query=urlencode({'q': query}))
+    params['headers']['Accept-Language'] = params['language']
     return params
 
 
@@ -103,6 +104,10 @@ def response(resp):
         urls.append({'title': search_res.get('DefinitionSource'),
                     'url': definitionURL})
 
+    # to merge with wikidata's infobox
+    if infobox_id:
+        infobox_id = sub(r'^http:', r'https:', infobox_id)
+
     # entity
     entity = search_res.get('Entity', None)
     # TODO continent / country / department / location / waterfall /
-- 
cgit v1.2.3


From a44faa77167980a414df2cbe936a52359351f455 Mon Sep 17 00:00:00 2001
From: a01200356 <a01200356@itesm.mx>
Date: Mon, 18 Apr 2016 10:52:16 -0500
Subject: [fix] compile regex in ddg_definitions

---
 searx/engines/duckduckgo_definitions.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'searx/engines')

diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py
index dc25d416f..208ccca28 100644
--- a/searx/engines/duckduckgo_definitions.py
+++ b/searx/engines/duckduckgo_definitions.py
@@ -1,6 +1,6 @@
 import json
 from urllib import urlencode
-from re import sub
+from re import compile, sub
 from lxml import html
 from searx.utils import html_to_text
 from searx.engines.xpath import extract_text
@@ -8,6 +8,8 @@ from searx.engines.xpath import extract_text
 url = 'https://api.duckduckgo.com/'\
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
 
+http_regex = compile(r'^http:')
+
 
 def result_to_text(url, text, htmlResult):
     # TODO : remove result ending with "Meaning" or "Category"
@@ -106,7 +108,7 @@ def response(resp):
 
     # to merge with wikidata's infobox
     if infobox_id:
-        infobox_id = sub(r'^http:', r'https:', infobox_id)
+        infobox_id = http_regex.sub('https:', infobox_id)
 
     # entity
     entity = search_res.get('Entity', None)
-- 
cgit v1.2.3