diff options
| author | Adam Tauber <asciimoo@gmail.com> | 2016-12-28 20:09:57 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2016-12-28 20:09:57 +0100 |
| commit | 9743bde25ef2ce6b765b8192aafcdc0a15739b17 (patch) | |
| tree | 00fd6b0b14773c0e20425d4a6478d67f244d64ed /searx/engines/wikipedia.py | |
| parent | ea034fafa994227ea89662710901e73cb901e28c (diff) | |
| parent | 8bff42f049dcac77559beaf2932a47921feb1d49 (diff) | |
Merge pull request #748 from a01200356/languages
[mod] Allow users to search in most engine supported languages
Diffstat (limited to 'searx/engines/wikipedia.py')
| -rw-r--r-- | searx/engines/wikipedia.py | 29 |
1 file changed, 27 insertions(+), 2 deletions(-)
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 70191d22b..78acd349d 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -12,6 +12,8 @@ from json import loads from urllib import urlencode, quote +from lxml.html import fromstring + # search-url base_url = 'https://{language}.wikipedia.org/' @@ -24,14 +26,16 @@ search_postfix = 'w/api.php?'\ '&explaintext'\ '&pithumbsize=300'\ '&redirects' +supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias' # set language in base_url def url_lang(lang): - if lang == 'all': + lang = lang.split('-')[0] + if lang == 'all' or lang not in supported_languages: language = 'en' else: - language = lang.split('_')[0] + language = lang return base_url.format(language=language) @@ -111,3 +115,24 @@ def response(resp): 'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]}) return results + + +# get supported languages from their site +def _fetch_supported_languages(resp): + supported_languages = {} + dom = fromstring(resp.text) + tables = dom.xpath('//table[contains(@class,"sortable")]') + for table in tables: + # exclude header row + trs = table.xpath('.//tr')[1:] + for tr in trs: + td = tr.xpath('./td') + code = td[3].xpath('./a')[0].text + name = td[2].xpath('./a')[0].text + english_name = td[1].xpath('./a')[0].text + articles = int(td[4].xpath('./a/b')[0].text.replace(',', '')) + # exclude languages with too few articles + if articles >= 100000: + supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles} + + return supported_languages |