summary | refs | log | tree | commit | diff
path: root/searx/engines/wikipedia.py
diff options
context:
space:
mode:
Diffstat (limited to 'searx/engines/wikipedia.py')
-rw-r--r-- searx/engines/wikipedia.py 29
1 files changed, 27 insertions, 2 deletions
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index 70191d22b..78acd349d 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -12,6 +12,8 @@
from json import loads
from urllib import urlencode, quote
+from lxml.html import fromstring
+
# search-url
base_url = 'https://{language}.wikipedia.org/'
@@ -24,14 +26,16 @@ search_postfix = 'w/api.php?'\
'&explaintext'\
'&pithumbsize=300'\
'&redirects'
+supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
# set language in base_url
def url_lang(lang):
-    if lang == 'all':
+    # Build the Wikipedia base URL for the requested search language.
+    # keep only the part before the first '-', e.g. 'pt-BR' -> 'pt'
+    language = lang.split('-')[0]
+    # 'all' and codes missing from supported_languages fall back to English
+    if language == 'all' or language not in supported_languages:
         language = 'en'
-    else:
-        language = lang.split('_')[0]
     return base_url.format(language=language)
@@ -111,3 +115,24 @@ def response(resp):
'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
return results
+
+
+# get supported languages from their site
+def _fetch_supported_languages(resp):
+    """Parse the "List of Wikipedias" page into a dict of usable languages.
+
+    Reads ``resp.text`` as HTML and scans every table whose class contains
+    "sortable".  Rows that lack the expected cells, links or a numeric
+    article count are skipped instead of aborting the whole fetch.
+
+    Returns a dict mapping language code to a dict with the keys "name",
+    "english_name" and "articles"; only languages with at least 100000
+    articles are kept.
+    """
+    supported_languages = {}
+    dom = fromstring(resp.text)
+    for table in dom.xpath('//table[contains(@class,"sortable")]'):
+        # exclude header row
+        for tr in table.xpath('.//tr')[1:]:
+            td = tr.xpath('./td')
+            try:
+                code = td[3].xpath('./a')[0].text
+                name = td[2].xpath('./a')[0].text
+                english_name = td[1].xpath('./a')[0].text
+                articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
+            except (IndexError, AttributeError, ValueError):
+                # non-data or malformed row (missing cell, link or count):
+                # skip it rather than lose every language
+                continue
+            # exclude rows without a code and languages with too few articles
+            if code and articles >= 100000:
+                supported_languages[code] = {"name": name,
+                                             "english_name": english_name,
+                                             "articles": articles}
+
+    return supported_languages