From 149802c56926bf48520c98932c4c36b8152b3d2d Mon Sep 17 00:00:00 2001 From: marc Date: Fri, 5 Aug 2016 23:34:56 -0500 Subject: [enh] add supported_languages on engines and auto-generate languages.py --- searx/engines/google.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'searx/engines/google.py') diff --git a/searx/engines/google.py b/searx/engines/google.py index a02b6940e..375e627ba 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -23,6 +23,20 @@ categories = ['general'] paging = True language_support = True use_locale_domain = True +supported_languages = ['de', 'en', 'es', 'es_419', 'fr', 'hr', 'it', 'nl', 'pl', 'pt-BR', + 'pt-PT', 'vi', 'tr', 'ru', 'ar', 'th', 'ko', 'zh-CN', 'zh-TW', 'ja', + 'ach', 'af', 'ak', 'az', 'ms', 'ban', 'xx_bork', 'bs', 'br', 'ca', + 'ceb', 'ckb', 'cs', 'sn', 'co', 'cy', 'da', 'yo', 'et', 'xx_elmer', + 'eo', 'eu', 'ee', 'tl', 'fo', 'gaa', 'ga', 'gd', 'gl', 'gn', 'xx_hacker', + 'ht', 'ha', 'haw', 'bem', 'ig', 'rn', 'id', 'ia', 'zu', 'is', 'jw', 'rw', + 'sw', 'tlh', 'kg', 'mfe', 'kri', 'la', 'lv', 'to', 'lt', 'ln', 'loz', + 'lua', 'lg', 'hu', 'mg', 'mt', 'mi', 'pcm', 'no', 'nso', 'ny', 'nn', + 'uz', 'oc', 'om', 'xx_pirate', 'pt', 'ro', 'mo', 'rm', 'qu', 'nyn', 'crs', + 'sq', 'sd', 'sk', 'sl', 'so', 'st', 'sr_ME', 'sr_Latn', 'su', 'fi', 'sv', + 'tg', 'tt', 'tn', 'tum', 'tk', 'tw', 'fy', 'wo', 'xh', 'el', 'be', 'bg', + 'ky', 'kk', 'mk', 'mn', 'sr', 'uk', 'ka', 'hy', 'yi', 'iw', 'ug', 'ur', + 'ps', 'fa', 'ti', 'am', 'ne', 'mr', 'hi', 'bn', 'pa', 'gu', 'or', 'ta', + 'te', 'kn', 'ml', 'si', 'lo', 'my', 'km', 'chr'] time_range_support = True # based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests -- cgit v1.2.3 From c677aee58a4eca1015262eb24530620a333ddcef Mon Sep 17 00:00:00 2001 From: marc Date: Sat, 6 Aug 2016 22:19:21 -0500 Subject: filter languages --- searx/engines/google.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 
'searx/engines/google.py') diff --git a/searx/engines/google.py b/searx/engines/google.py index 375e627ba..31035be69 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -23,20 +23,20 @@ categories = ['general'] paging = True language_support = True use_locale_domain = True -supported_languages = ['de', 'en', 'es', 'es_419', 'fr', 'hr', 'it', 'nl', 'pl', 'pt-BR', - 'pt-PT', 'vi', 'tr', 'ru', 'ar', 'th', 'ko', 'zh-CN', 'zh-TW', 'ja', - 'ach', 'af', 'ak', 'az', 'ms', 'ban', 'xx_bork', 'bs', 'br', 'ca', - 'ceb', 'ckb', 'cs', 'sn', 'co', 'cy', 'da', 'yo', 'et', 'xx_elmer', - 'eo', 'eu', 'ee', 'tl', 'fo', 'gaa', 'ga', 'gd', 'gl', 'gn', 'xx_hacker', - 'ht', 'ha', 'haw', 'bem', 'ig', 'rn', 'id', 'ia', 'zu', 'is', 'jw', 'rw', - 'sw', 'tlh', 'kg', 'mfe', 'kri', 'la', 'lv', 'to', 'lt', 'ln', 'loz', - 'lua', 'lg', 'hu', 'mg', 'mt', 'mi', 'pcm', 'no', 'nso', 'ny', 'nn', - 'uz', 'oc', 'om', 'xx_pirate', 'pt', 'ro', 'mo', 'rm', 'qu', 'nyn', 'crs', - 'sq', 'sd', 'sk', 'sl', 'so', 'st', 'sr_ME', 'sr_Latn', 'su', 'fi', 'sv', - 'tg', 'tt', 'tn', 'tum', 'tk', 'tw', 'fy', 'wo', 'xh', 'el', 'be', 'bg', - 'ky', 'kk', 'mk', 'mn', 'sr', 'uk', 'ka', 'hy', 'yi', 'iw', 'ug', 'ur', - 'ps', 'fa', 'ti', 'am', 'ne', 'mr', 'hi', 'bn', 'pa', 'gu', 'or', 'ta', - 'te', 'kn', 'ml', 'si', 'lo', 'my', 'km', 'chr'] +supported_languages = ["ach", "af", "ak", "az", "ms", "ban", "xx-bork", "bs", "br", "ca", + "ceb", "ckb", "cs", "sn", "co", "cy", "da", "de", "yo", "et", + "xx-elmer", "en", "es", "es-419", "eo", "eu", "ee", "tl", "fo", "fr", + "gaa", "ga", "gd", "gl", "gn", "xx-hacker", "ht", "ha", "hr", "haw", + "bem", "ig", "rn", "id", "ia", "zu", "is", "it", "jw", "rw", "sw", + "tlh", "kg", "mfe", "kri", "la", "lv", "to", "lt", "ln", "loz", + "lua", "lg", "hu", "mg", "mt", "mi", "nl", "pcm", "no", "nso", + "ny", "nn", "uz", "oc", "om", "xx-pirate", "pl", "pt-BR", "pt-PT", + "ro", "rm", "qu", "nyn", "crs", "sq", "sd", "sk", "sl", "so", "st", + "sr-ME", "sr-Latn", "su", "fi", "sv", "tg", 
"tt", "vi", "tn", "tum", + "tr", "tk", "tw", "fy", "wo", "xh", "el", "be", "bg", "ky", "kk", "mk", + "mn", "ru", "sr", "uk", "ka", "hy", "yi", "iw", "ug", "ur", "ar", "ps", + "fa", "ti", "am", "ne", "mr", "hi", "bn", "pa", "gu", "or", "ta", "te", + "kn", "ml", "si", "th", "lo", "my", "km", "chr", "ko", "zh-CN", "zh-TW", "ja"] time_range_support = True # based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests @@ -181,8 +181,12 @@ def request(query, params): language = 'en' country = 'US' url_lang = '' + elif params['language'][:2] == 'jv': + language = 'jw' + country = 'ID' + url_lang = 'lang_jw' else: - language_array = params['language'].lower().split('_') + language_array = params['language'].lower().split('-') if len(language_array) == 2: country = language_array[1] else: -- cgit v1.2.3 From f62ce21f50b540315a708ebfbf36878ddec9d1c4 Mon Sep 17 00:00:00 2001 From: marc Date: Sat, 5 Nov 2016 20:51:38 -0600 Subject: [mod] fetch supported languages for several engines utils/fetch_languages.py gets languages supported by each engine and generates engines_languages.json with each engine's supported language. 
--- searx/engines/google.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'searx/engines/google.py') diff --git a/searx/engines/google.py b/searx/engines/google.py index 31035be69..a82a0b5a7 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -12,6 +12,7 @@ import re from urllib import urlencode from urlparse import urlparse, parse_qsl from lxml import html, etree +from requests import get from searx.engines.xpath import extract_text, extract_url from searx.search import logger @@ -23,20 +24,6 @@ categories = ['general'] paging = True language_support = True use_locale_domain = True -supported_languages = ["ach", "af", "ak", "az", "ms", "ban", "xx-bork", "bs", "br", "ca", - "ceb", "ckb", "cs", "sn", "co", "cy", "da", "de", "yo", "et", - "xx-elmer", "en", "es", "es-419", "eo", "eu", "ee", "tl", "fo", "fr", - "gaa", "ga", "gd", "gl", "gn", "xx-hacker", "ht", "ha", "hr", "haw", - "bem", "ig", "rn", "id", "ia", "zu", "is", "it", "jw", "rw", "sw", - "tlh", "kg", "mfe", "kri", "la", "lv", "to", "lt", "ln", "loz", - "lua", "lg", "hu", "mg", "mt", "mi", "nl", "pcm", "no", "nso", - "ny", "nn", "uz", "oc", "om", "xx-pirate", "pl", "pt-BR", "pt-PT", - "ro", "rm", "qu", "nyn", "crs", "sq", "sd", "sk", "sl", "so", "st", - "sr-ME", "sr-Latn", "su", "fi", "sv", "tg", "tt", "vi", "tn", "tum", - "tr", "tk", "tw", "fy", "wo", "xh", "el", "be", "bg", "ky", "kk", "mk", - "mn", "ru", "sr", "uk", "ka", "hy", "yi", "iw", "ug", "ur", "ar", "ps", - "fa", "ti", "am", "ne", "mr", "hi", "bn", "pa", "gu", "or", "ta", "te", - "kn", "ml", "si", "th", "lo", "my", "km", "chr", "ko", "zh-CN", "zh-TW", "ja"] time_range_support = True # based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests @@ -117,6 +104,7 @@ map_hostname_start = 'maps.google.' 
maps_path = '/maps' redirect_path = '/url' images_path = '/images' +supported_languages_url = 'https://www.google.com/preferences?#languages' # specific xpath variables results_xpath = '//div[@class="g"]' @@ -373,3 +361,17 @@ def attributes_to_html(attributes): retval = retval + '' + a.get('label') + '' + value + '' retval = retval + '' return retval + + +# get supported languages from their site +def fetch_supported_languages(): + supported_languages = {} + response = get(supported_languages_url) + dom = html.fromstring(response.text) + options = dom.xpath('//select[@name="hl"]/option') + for option in options: + code = option.xpath('./@value')[0].split('-')[0] + name = option.text[:-1].title() + supported_languages[code] = {"name": name} + + return supported_languages -- cgit v1.2.3 From af35eee10b98940c51c6e5e18629de514b4bd48d Mon Sep 17 00:00:00 2001 From: marc Date: Thu, 15 Dec 2016 00:34:43 -0600 Subject: tests for _fetch_supported_languages in engines and refactor method to make it testable without making requests --- searx/engines/google.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'searx/engines/google.py') diff --git a/searx/engines/google.py b/searx/engines/google.py index a82a0b5a7..803cd307e 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -12,7 +12,6 @@ import re from urllib import urlencode from urlparse import urlparse, parse_qsl from lxml import html, etree -from requests import get from searx.engines.xpath import extract_text, extract_url from searx.search import logger @@ -364,14 +363,13 @@ def attributes_to_html(attributes): # get supported languages from their site -def fetch_supported_languages(): +def _fetch_supported_languages(resp): supported_languages = {} - response = get(supported_languages_url) - dom = html.fromstring(response.text) - options = dom.xpath('//select[@name="hl"]/option') + dom = html.fromstring(resp.text) + options = dom.xpath('//table//td/font/label/span') for 
option in options: - code = option.xpath('./@value')[0].split('-')[0] - name = option.text[:-1].title() + code = option.xpath('./@id')[0][1:] + name = option.text.title() supported_languages[code] = {"name": name} return supported_languages -- cgit v1.2.3