From 149802c56926bf48520c98932c4c36b8152b3d2d Mon Sep 17 00:00:00 2001
From: marc <a01200356@itesm.mx>
Date: Fri, 5 Aug 2016 23:34:56 -0500
Subject: [enh] add supported_languages on engines and auto-generate
 languages.py

---
 searx/engines/wikipedia.py | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

(limited to 'searx/engines/wikipedia.py')

diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index 70191d22b..fdba5ed68 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -13,6 +13,36 @@
 from json import loads
 from urllib import urlencode, quote
 
+supported_languages = ["en", "sv", "ceb", "de", "nl", "fr", "ru", "it", "es", "war",
+                       "pl", "vi", "ja", "pt", "zh", "uk", "ca", "fa", "no", "sh",
+                       "ar", "fi", "hu", "id", "ro", "cs", "ko", "sr", "ms", "tr",
+                       "eu", "eo", "min", "bg", "da", "kk", "sk", "hy", "he", "zh-min-nan",
+                       "lt", "hr", "sl", "et", "ce", "gl", "nn", "uz", "la", "vo",
+                       "el", "simple", "be", "az", "th", "ur", "ka", "hi", "oc", "ta",
+                       "mk", "mg", "new", "lv", "cy", "bs", "tt", "tl", "te", "pms",
+                       "be-tarask", "br", "sq", "ky", "ht", "jv", "tg", "ast", "zh-yue", "lb",
+                       "mr", "ml", "bn", "pnb", "is", "af", "sco", "ga", "ba", "fy",
+                       "cv", "lmo", "sw", "my", "an", "yo", "ne", "io", "gu", "nds",
+                       "scn", "bpy", "pa", "ku", "als", "kn", "bar", "ia", "qu", "su",
+                       "ckb", "bat-smg", "mn", "arz", "nap", "wa", "bug", "gd", "yi", "map-bms",
+                       "am", "mzn", "fo", "si", "nah", "li", "sah", "vec", "hsb", "or",
+                       "os", "mrj", "sa", "hif", "mhr", "roa-tara", "azb", "pam", "ilo",
+                       "sd", "ps", "se", "mi", "bh", "eml", "bcl", "xmf", "diq", "hak",
+                       "gan", "glk", "vls", "nds-nl", "rue", "bo", "fiu-vro", "co", "sc",
+                       "tk", "csb", "lrc", "vep", "wuu", "km", "szl", "gv", "crh", "kv",
+                       "zh-classical", "frr", "zea", "as", "so", "kw", "nso", "ay", "stq",
+                       "udm", "cdo", "nrm", "ie", "koi", "rm", "pcd", "myv", "mt", "fur",
+                       "ace", "lad", "gn", "lij", "dsb", "dv", "cbk-zam", "ext", "gom",
+                       "kab", "ksh", "ang", "mai", "mwl", "lez", "gag", "ln", "ug", "pi",
+                       "pag", "frp", "sn", "nv", "av", "pfl", "haw", "xal", "krc", "kaa",
+                       "rw", "bxr", "pdc", "to", "kl", "nov", "arc", "kbd", "lo", "bjn",
+                       "pap", "ha", "tet", "ki", "tyv", "tpi", "na", "lbe", "ig", "jbo",
+                       "roa-rup", "ty", "jam", "za", "kg", "mdf", "lg", "wo", "srn", "ab",
+                       "ltg", "zu", "sm", "chr", "om", "tn", "chy", "rmy", "cu", "tw", "tum",
+                       "xh", "bi", "rn", "pih", "got", "ss", "pnt", "bm", "ch", "mo", "ts",
+                       "ady", "iu", "st", "ee", "ny", "fj", "ks", "ak", "ik", "sg", "ve",
+                       "dz", "ff", "ti", "cr", "ng", "cho", "kj", "mh", "ho", "ii", "aa", "mus", "hz", "kr"]
+
 # search-url
 base_url = 'https://{language}.wikipedia.org/'
 search_postfix = 'w/api.php?'\
@@ -28,10 +58,11 @@ search_postfix = 'w/api.php?'\
 
 # set language in base_url
 def url_lang(lang):
-    if lang == 'all':
+    lang = lang.split('-')[0]
+    if lang == 'all' or lang not in supported_languages:
         language = 'en'
     else:
-        language = lang.split('_')[0]
+        language = lang
 
     return base_url.format(language=language)
 
-- 
cgit v1.2.3


From f62ce21f50b540315a708ebfbf36878ddec9d1c4 Mon Sep 17 00:00:00 2001
From: marc <a01200356@itesm.mx>
Date: Sat, 5 Nov 2016 20:51:38 -0600
Subject: [mod] fetch supported languages for several engines
 utils/fetch_languages.py gets languages supported by each engine and
 generates engines_languages.json with each engine's supported language.

---
 searx/engines/wikipedia.py | 53 +++++++++++++++++++++-------------------------
 1 file changed, 24 insertions(+), 29 deletions(-)

(limited to 'searx/engines/wikipedia.py')

diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index fdba5ed68..0dee325a7 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -12,36 +12,9 @@
 
 from json import loads
 from urllib import urlencode, quote
+from requests import get
+from lxml.html import fromstring
 
-supported_languages = ["en", "sv", "ceb", "de", "nl", "fr", "ru", "it", "es", "war",
-                       "pl", "vi", "ja", "pt", "zh", "uk", "ca", "fa", "no", "sh",
-                       "ar", "fi", "hu", "id", "ro", "cs", "ko", "sr", "ms", "tr",
-                       "eu", "eo", "min", "bg", "da", "kk", "sk", "hy", "he", "zh-min-nan",
-                       "lt", "hr", "sl", "et", "ce", "gl", "nn", "uz", "la", "vo",
-                       "el", "simple", "be", "az", "th", "ur", "ka", "hi", "oc", "ta",
-                       "mk", "mg", "new", "lv", "cy", "bs", "tt", "tl", "te", "pms",
-                       "be-tarask", "br", "sq", "ky", "ht", "jv", "tg", "ast", "zh-yue", "lb",
-                       "mr", "ml", "bn", "pnb", "is", "af", "sco", "ga", "ba", "fy",
-                       "cv", "lmo", "sw", "my", "an", "yo", "ne", "io", "gu", "nds",
-                       "scn", "bpy", "pa", "ku", "als", "kn", "bar", "ia", "qu", "su",
-                       "ckb", "bat-smg", "mn", "arz", "nap", "wa", "bug", "gd", "yi", "map-bms",
-                       "am", "mzn", "fo", "si", "nah", "li", "sah", "vec", "hsb", "or",
-                       "os", "mrj", "sa", "hif", "mhr", "roa-tara", "azb", "pam", "ilo",
-                       "sd", "ps", "se", "mi", "bh", "eml", "bcl", "xmf", "diq", "hak",
-                       "gan", "glk", "vls", "nds-nl", "rue", "bo", "fiu-vro", "co", "sc",
-                       "tk", "csb", "lrc", "vep", "wuu", "km", "szl", "gv", "crh", "kv",
-                       "zh-classical", "frr", "zea", "as", "so", "kw", "nso", "ay", "stq",
-                       "udm", "cdo", "nrm", "ie", "koi", "rm", "pcd", "myv", "mt", "fur",
-                       "ace", "lad", "gn", "lij", "dsb", "dv", "cbk-zam", "ext", "gom",
-                       "kab", "ksh", "ang", "mai", "mwl", "lez", "gag", "ln", "ug", "pi",
-                       "pag", "frp", "sn", "nv", "av", "pfl", "haw", "xal", "krc", "kaa",
-                       "rw", "bxr", "pdc", "to", "kl", "nov", "arc", "kbd", "lo", "bjn",
-                       "pap", "ha", "tet", "ki", "tyv", "tpi", "na", "lbe", "ig", "jbo",
-                       "roa-rup", "ty", "jam", "za", "kg", "mdf", "lg", "wo", "srn", "ab",
-                       "ltg", "zu", "sm", "chr", "om", "tn", "chy", "rmy", "cu", "tw", "tum",
-                       "xh", "bi", "rn", "pih", "got", "ss", "pnt", "bm", "ch", "mo", "ts",
-                       "ady", "iu", "st", "ee", "ny", "fj", "ks", "ak", "ik", "sg", "ve",
-                       "dz", "ff", "ti", "cr", "ng", "cho", "kj", "mh", "ho", "ii", "aa", "mus", "hz", "kr"]
 
 # search-url
 base_url = 'https://{language}.wikipedia.org/'
@@ -54,6 +27,7 @@ search_postfix = 'w/api.php?'\
     '&explaintext'\
     '&pithumbsize=300'\
     '&redirects'
+supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
 
 
 # set language in base_url
@@ -142,3 +116,24 @@ def response(resp):
                     'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
 
     return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+    supported_languages = {}
+    response = get(supported_languages_url)
+    dom = fromstring(response.text)
+    tables = dom.xpath('//table[contains(@class,"sortable")]')
+    for table in tables:
+        # exclude header row
+        trs = table.xpath('.//tr')[1:]
+        for tr in trs:
+            td = tr.xpath('./td')
+            code = td[3].xpath('./a')[0].text
+            name = td[2].xpath('./a')[0].text
+            english_name = td[1].xpath('./a')[0].text
+            articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
+            if articles >= 10000:
+                supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}
+
+    return supported_languages
-- 
cgit v1.2.3


From af35eee10b98940c51c6e5e18629de514b4bd48d Mon Sep 17 00:00:00 2001
From: marc <a01200356@itesm.mx>
Date: Thu, 15 Dec 2016 00:34:43 -0600
Subject: tests for _fetch_supported_languages in engines and refactor method
 to make it testable without making requests

---
 searx/engines/wikipedia.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'searx/engines/wikipedia.py')

diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index 0dee325a7..322e8d128 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -12,7 +12,6 @@
 
 from json import loads
 from urllib import urlencode, quote
-from requests import get
 from lxml.html import fromstring
 
 
@@ -119,10 +118,9 @@ def response(resp):
 
 
 # get supported languages from their site
-def fetch_supported_languages():
+def _fetch_supported_languages(resp):
     supported_languages = {}
-    response = get(supported_languages_url)
-    dom = fromstring(response.text)
+    dom = fromstring(resp.text)
     tables = dom.xpath('//table[contains(@class,"sortable")]')
     for table in tables:
         # exclude header row
-- 
cgit v1.2.3


From 4a1ff56389d6ad560594ba82b448aef1d70bbbf4 Mon Sep 17 00:00:00 2001
From: marc <a01200356@itesm.mx>
Date: Fri, 16 Dec 2016 22:14:14 -0600
Subject: minor fixes in utils/fetch_languages.py

---
 searx/engines/wikipedia.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'searx/engines/wikipedia.py')

diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index 322e8d128..78acd349d 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -131,7 +131,8 @@ def _fetch_supported_languages(resp):
             name = td[2].xpath('./a')[0].text
             english_name = td[1].xpath('./a')[0].text
             articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
-            if articles >= 10000:
+            # exclude languages with too few articles
+            if articles >= 100000:
                 supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}
 
     return supported_languages
-- 
cgit v1.2.3