diff options
| author | Adam Tauber <asciimoo@gmail.com> | 2016-12-28 20:09:57 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2016-12-28 20:09:57 +0100 |
| commit | 9743bde25ef2ce6b765b8192aafcdc0a15739b17 (patch) | |
| tree | 00fd6b0b14773c0e20425d4a6478d67f244d64ed /searx/engines/wikipedia.py | |
| parent | ea034fafa994227ea89662710901e73cb901e28c (diff) | |
| parent | 8bff42f049dcac77559beaf2932a47921feb1d49 (diff) | |
Merge pull request #748 from a01200356/languages
[mod] Allow users to search in most engine supported languages
Diffstat (limited to 'searx/engines/wikipedia.py')
| -rw-r--r-- | searx/engines/wikipedia.py | 29 |
1 file changed, 27 insertions(+), 2 deletions(-)
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 70191d22b..78acd349d 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -12,6 +12,8 @@ from json import loads from urllib import urlencode, quote +from lxml.html import fromstring + # search-url base_url = 'https://{language}.wikipedia.org/' @@ -24,14 +26,16 @@ search_postfix = 'w/api.php?'\ '&explaintext'\ '&pithumbsize=300'\ '&redirects' +supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias' # set language in base_url def url_lang(lang): - if lang == 'all': + lang = lang.split('-')[0] + if lang == 'all' or lang not in supported_languages: language = 'en' else: - language = lang.split('_')[0] + language = lang return base_url.format(language=language) @@ -111,3 +115,24 @@ def response(resp): 'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]}) return results + + +# get supported languages from their site +def _fetch_supported_languages(resp): + supported_languages = {} + dom = fromstring(resp.text) + tables = dom.xpath('//table[contains(@class,"sortable")]') + for table in tables: + # exclude header row + trs = table.xpath('.//tr')[1:] + for tr in trs: + td = tr.xpath('./td') + code = td[3].xpath('./a')[0].text + name = td[2].xpath('./a')[0].text + english_name = td[1].xpath('./a')[0].text + articles = int(td[4].xpath('./a/b')[0].text.replace(',', '')) + # exclude languages with too few articles + if articles >= 100000: + supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles} + + return supported_languages |