From 149802c56926bf48520c98932c4c36b8152b3d2d Mon Sep 17 00:00:00 2001 From: marc Date: Fri, 5 Aug 2016 23:34:56 -0500 Subject: [enh] add supported_languages on engines and auto-generate languages.py --- searx/engines/__init__.py | 1 + searx/engines/archlinux.py | 5 ++-- searx/engines/bing.py | 2 +- searx/engines/bing_images.py | 3 ++- searx/engines/bing_news.py | 3 ++- searx/engines/duckduckgo.py | 42 +++++++++++++++++++++++++++++++-- searx/engines/duckduckgo_definitions.py | 3 ++- searx/engines/gigablast.py | 2 +- searx/engines/google.py | 14 +++++++++++ searx/engines/google_news.py | 4 +++- searx/engines/mediawiki.py | 3 ++- searx/engines/photon.py | 4 ++-- searx/engines/startpage.py | 2 +- searx/engines/subtitleseeker.py | 9 +++++-- searx/engines/swisscows.py | 4 ++-- searx/engines/twitter.py | 2 +- searx/engines/wikidata.py | 2 ++ searx/engines/wikipedia.py | 35 +++++++++++++++++++++++++-- searx/engines/yacy.py | 2 +- searx/engines/yahoo.py | 12 +++++++++- searx/engines/yahoo_news.py | 2 +- searx/engines/yandex.py | 2 +- searx/engines/youtube_api.py | 2 +- 23 files changed, 134 insertions(+), 26 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 87b1b0eb4..ab3677984 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -38,6 +38,7 @@ engine_shortcuts = {} engine_default_args = {'paging': False, 'categories': ['general'], 'language_support': True, + 'supported_languages': [], 'safesearch': False, 'timeout': settings['outgoing']['request_timeout'], 'shortcut': '-', diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py index 5ba512766..dca825790 100644 --- a/searx/engines/archlinux.py +++ b/searx/engines/archlinux.py @@ -29,8 +29,8 @@ xpath_link = './/div[@class="mw-search-result-heading"]/a' # cut 'en' from 'en_US', 'de' from 'de_CH', and so on def locale_to_lang_code(locale): - if locale.find('_') >= 0: - locale = locale.split('_')[0] + if locale.find('-') >= 0: + locale = locale.split('-')[0] return locale @@ -95,6 +95,7 @@ main_langs = { 'uk': 'Українська', 'zh': '简体中文' } +supported_languages = dict(lang_urls, **main_langs) # do search-request diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 58db61251..052b66448 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -32,7 +32,7 @@ def request(query, params): offset = (params['pageno'] - 1) * 10 + 1 if params['language'] != 'all': - query = u'language:{} {}'.format(params['language'].split('_')[0].upper(), + query = u'language:{} {}'.format(params['language'].split('-')[0].upper(), query.decode('utf-8')).encode('utf-8') search_path = search_string.format( diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py index 4dd362cb3..c0deaf6b2 100644 --- a/searx/engines/bing_images.py +++ b/searx/engines/bing_images.py @@ -19,6 +19,7 @@ from urllib import urlencode from lxml import html from json import loads import re +from searx.engines.bing import supported_languages # engine dependent config categories = ['images'] @@ -53,7 +54,7 @@ def request(query, params): if params['language'] == 'all': language = 'en-US' else: - language = params['language'].replace('_', '-') + language = params['language'] search_path = search_string.format( query=urlencode({'q': query}), diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 4e7c33129..4bac5bbce 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -17,6 +17,7 @@ from datetime import datetime from dateutil import parser from lxml import etree from searx.utils import list_get +from searx.engines.bing import supported_languages # engine dependent config categories = ['news'] @@ -74,7 +75,7 @@ def request(query, params): if params['language'] == 'all': language = 'en-US' else: - language = params['language'].replace('_', '-') + language = params['language'] params['url'] = _get_url(query, language, offset, params['time_range']) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 9959a52e6..a1cb5882c 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -22,6 +22,13 @@ from searx.languages import language_codes categories = ['general'] paging = True language_support = True +supported_languages = ["es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA", "ca-CT", + "es-CL", "zh-CN", "es-CO", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE", + "el-GR", "tzh-HK", "hu-HU", "en-IN", "id-ID", "en-ID", "en-IE", "he-IL", "it-IT", "jp-JP", + "kr-KR", "es-XL", "lv-LV", "lt-LT", "ms-MY", "en-MY", "es-MX", "nl-NL", "en-NZ", "no-NO", + "es-PE", "en-PH", "tl-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU", "ar-XA", "en-XA", "en-SG", + "sk-SK", "sl-SL", "en-ZA", "es-ES", "ca-ES", "sv-SE", "de-CH", "fr-CH", "it-CH", "tzh-TW", + "th-TH", "tr-TR", "uk-UA", "en-UK", "en-US", "es-US", "vi-VN"] time_range_support = True # search-url @@ -46,10 +53,23 @@ def request(query, params): offset = (params['pageno'] - 1) * 30 + # custom fixes for languages if params['language'] == 'all': locale = None + elif params['language'][:2] == 'ja': + locale = 'jp-jp' + elif params['language'] == 'zh-TW': + locale = 'tw-tzh' + elif params['language'] == 'zh-HK': + locale = 'hk-tzh' + elif params['language'][-2:] == 'SA': + locale = 'xa' + params['language'].split('-')[0] + elif params['language'][-2:] == 'GB': + locale = 'uk' + params['language'].split('-')[0] + elif params['language'] == 'es-419': + locale = 'xl-es' else: - locale = params['language'].split('_') + locale = params['language'].split('-') if len(locale) == 2: # country code goes first locale = locale[1].lower() + '-' + locale[0].lower() @@ -58,7 +78,25 @@ def request(query, params): locale = locale[0].lower() lang_codes = [x[0] for x in language_codes] for lc in lang_codes: - lc = lc.split('_') + lc = lc.split('-') + if locale == lc[0] and len(lc) == 2: + locale = lc[1].lower() + '-' + lc[0].lower() + break + + if locale: + params['url'] = url.format( + query=urlencode({'q': query, 'kl': locale}), offset=offset) + else: + locale = params['language'].split('-') + if len(locale) == 2: + # country code goes first + locale = locale[1].lower() + '-' + locale[0].lower() + else: + # tries to get a country code from language + locale = locale[0].lower() + lang_codes = [x[0] for x in language_codes] + for lc in lang_codes: + lc = lc.split('-') if locale == lc[0]: locale = lc[1].lower() + '-' + lc[0].lower() break diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index 208ccca28..23a2f3be3 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -4,6 +4,7 @@ from re import compile, sub from lxml import html from searx.utils import html_to_text from searx.engines.xpath import extract_text +from searx.engines.duckduckgo import supported_languages url = 'https://api.duckduckgo.com/'\ + '?{query}&format=json&pretty=0&no_redirect=1&d=1' @@ -23,7 +24,7 @@ def result_to_text(url, text, htmlResult): def request(query, params): params['url'] = url.format(query=urlencode({'q': query})) - params['headers']['Accept-Language'] = params['language'] + params['headers']['Accept-Language'] = params['language'].split('-')[0] return params diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index 5430eb3ba..e139842fa 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -48,7 +48,7 @@ def request(query, params): if params['language'] == 'all': language = 'xx' else: - language = params['language'][0:2] + language = params['language'].split('-')[0] if params['safesearch'] >= 1: safesearch = 1 diff --git a/searx/engines/google.py b/searx/engines/google.py index a02b6940e..375e627ba 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -23,6 +23,20 @@ categories = ['general'] paging = True language_support = True use_locale_domain = True +supported_languages = ['de', 'en', 'es', 'es_419', 'fr', 'hr', 'it', 'nl', 'pl', 'pt-BR', + 'pt-PT', 'vi', 'tr', 'ru', 'ar', 'th', 'ko', 'zh-CN', 'zh-TW', 'ja', + 'ach', 'af', 'ak', 'az', 'ms', 'ban', 'xx_bork', 'bs', 'br', 'ca', + 'ceb', 'ckb', 'cs', 'sn', 'co', 'cy', 'da', 'yo', 'et', 'xx_elmer', + 'eo', 'eu', 'ee', 'tl', 'fo', 'gaa', 'ga', 'gd', 'gl', 'gn', 'xx_hacker', + 'ht', 'ha', 'haw', 'bem', 'ig', 'rn', 'id', 'ia', 'zu', 'is', 'jw', 'rw', + 'sw', 'tlh', 'kg', 'mfe', 'kri', 'la', 'lv', 'to', 'lt', 'ln', 'loz', + 'lua', 'lg', 'hu', 'mg', 'mt', 'mi', 'pcm', 'no', 'nso', 'ny', 'nn', + 'uz', 'oc', 'om', 'xx_pirate', 'pt', 'ro', 'mo', 'rm', 'qu', 'nyn', 'crs', + 'sq', 'sd', 'sk', 'sl', 'so', 'st', 'sr_ME', 'sr_Latn', 'su', 'fi', 'sv', + 'tg', 'tt', 'tn', 'tum', 'tk', 'tw', 'fy', 'wo', 'xh', 'el', 'be', 'bg', + 'ky', 'kk', 'mk', 'mn', 'sr', 'uk', 'ka', 'hy', 'yi', 'iw', 'ug', 'ur', + 'ps', 'fa', 'ti', 'am', 'ne', 'mr', 'hi', 'bn', 'pa', 'gu', 'or', 'ta', + 'te', 'kn', 'ml', 'si', 'lo', 'my', 'km', 'chr'] time_range_support = True # based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index 37253c6a7..6d1430248 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -12,6 +12,8 @@ from lxml import html from urllib import urlencode +from json import loads +from searx.engines.google import supported_languages # search-url categories = ['news'] @@ -50,7 +52,7 @@ def request(query, params): search_options=urlencode(search_options)) if params['language'] != 'all': - language_array = params['language'].lower().split('_') + language_array = params['language'].lower().split('-') params['url'] += '&lr=lang_' + language_array[0] return params diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py index 26d3720d9..b17cb38e4 100644 --- a/searx/engines/mediawiki.py +++ b/searx/engines/mediawiki.py @@ -15,6 +15,7 @@ from json import loads from string import Formatter from urllib import urlencode, quote +from searx.engines.wikipedia import supported_engines # engine dependent config categories = ['general'] @@ -46,7 +47,7 @@ def request(query, params): if params['language'] == 'all': language = 'en' else: - language = params['language'].split('_')[0] + language = params['language'].split('-')[0] # format_string [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)] if any(x[1] == 'language' for x in format_strings): diff --git a/searx/engines/photon.py b/searx/engines/photon.py index 2197005e5..a029bbfef 100644 --- a/searx/engines/photon.py +++ b/searx/engines/photon.py @@ -26,7 +26,7 @@ search_string = 'api/?{query}&limit={limit}' result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}' # list of supported languages -allowed_languages = ['de', 'en', 'fr', 'it'] +supported_languages = ['de', 'en', 'fr', 'it'] # do search-request @@ -37,7 +37,7 @@ def request(query, params): if params['language'] != 'all': language = params['language'].split('_')[0] - if language in allowed_languages: + if language in supported_languages: params['url'] = params['url'] + "&lang=" + language # using searx User-Agent diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 6f6eae1cf..54aafdee5 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -47,7 +47,7 @@ def request(query, params): # set language if specified if params['language'] != 'all': - params['data']['with_language'] = ('lang_' + params['language'].split('_')[0]) + params['data']['with_language'] = ('lang_' + params['language'].split('-')[0]) return params diff --git a/searx/engines/subtitleseeker.py b/searx/engines/subtitleseeker.py index daba68be7..2c0a94f08 100644 --- a/searx/engines/subtitleseeker.py +++ b/searx/engines/subtitleseeker.py @@ -43,8 +43,13 @@ def response(resp): search_lang = "" - if resp.search_params['language'] != 'all': - search_lang = [lc[1] + # dirty fix for languages named differenly in their site + if resp.search_params['language'][:2] == 'fa': + search_lang = 'Farsi' + elif resp.search_params['language'] == 'pt_BR': + search_lang = 'Brazilian' + elif resp.search_params['language'] != 'all': + search_lang = [lc[3] for lc in language_codes if lc[0][:2] == resp.search_params['language'].split('_')[0]][0] diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py index 72184e428..68632a15a 100644 --- a/searx/engines/swisscows.py +++ b/searx/engines/swisscows.py @@ -36,8 +36,8 @@ def request(query, params): ui_language = 'browser' region = 'browser' else: - region = params['language'].replace('_', '-') - ui_language = params['language'].split('_')[0] + region = params['language'] + ui_language = params['language'].split('-')[0] search_path = search_string.format( query=urlencode({'query': query, diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py index 36efac186..6cca05f70 100644 --- a/searx/engines/twitter.py +++ b/searx/engines/twitter.py @@ -40,7 +40,7 @@ def request(query, params): # set language if specified if params['language'] != 'all': - params['cookies']['lang'] = params['language'].split('_')[0] + params['cookies']['lang'] = params['language'].split('-')[0] else: params['cookies']['lang'] = 'en' diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index 91040e218..edb6d75fe 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -14,6 +14,8 @@ from searx import logger from searx.poolrequests import get from searx.engines.xpath import extract_text +from searx.utils import format_date_by_locale +from searx.engines.wikipedia import supported_languages from json import loads from lxml.html import fromstring diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 70191d22b..fdba5ed68 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -13,6 +13,36 @@ from json import loads from urllib import urlencode, quote +supported_languages = ["en", "sv", "ceb", "de", "nl", "fr", "ru", "it", "es", "war", + "pl", "vi", "ja", "pt", "zh", "uk", "ca", "fa", "no", "sh", + "ar", "fi", "hu", "id", "ro", "cs", "ko", "sr", "ms", "tr", + "eu", "eo", "min", "bg", "da", "kk", "sk", "hy", "he", "zh-min-nan", + "lt", "hr", "sl", "et", "ce", "gl", "nn", "uz", "la", "vo", + "el", "simple", "be", "az", "th", "ur", "ka", "hi", "oc", "ta", + "mk", "mg", "new", "lv", "cy", "bs", "tt", "tl", "te", "pms", + "be-tarask", "br", "sq", "ky", "ht", "jv", "tg", "ast", "zh-yue", "lb", + "mr", "ml", "bn", "pnb", "is", "af", "sco", "ga", "ba", "fy", + "cv", "lmo", "sw", "my", "an", "yo", "ne", "io", "gu", "nds", + "scn", "bpy", "pa", "ku", "als", "kn", "bar", "ia", "qu", "su", + "ckb", "bat-smg", "mn", "arz", "nap", "wa", "bug", "gd", "yi", "map-bms", + "am", "mzn", "fo", "si", "nah", "li", "sah", "vec", "hsb", "or", + "os", "mrj", "sa", "hif", "mhr", "roa-tara", "azb", "pam", "ilo", + "sd", "ps", "se", "mi", "bh", "eml", "bcl", "xmf", "diq", "hak", + "gan", "glk", "vls", "nds-nl", "rue", "bo", "fiu-vro", "co", "sc", + "tk", "csb", "lrc", "vep", "wuu", "km", "szl", "gv", "crh", "kv", + "zh-classical", "frr", "zea", "as", "so", "kw", "nso", "ay", "stq", + "udm", "cdo", "nrm", "ie", "koi", "rm", "pcd", "myv", "mt", "fur", + "ace", "lad", "gn", "lij", "dsb", "dv", "cbk-zam", "ext", "gom", + "kab", "ksh", "ang", "mai", "mwl", "lez", "gag", "ln", "ug", "pi", + "pag", "frp", "sn", "nv", "av", "pfl", "haw", "xal", "krc", "kaa", + "rw", "bxr", "pdc", "to", "kl", "nov", "arc", "kbd", "lo", "bjn", + "pap", "ha", "tet", "ki", "tyv", "tpi", "na", "lbe", "ig", "jbo", + "roa-rup", "ty", "jam", "za", "kg", "mdf", "lg", "wo", "srn", "ab", + "ltg", "zu", "sm", "chr", "om", "tn", "chy", "rmy", "cu", "tw", "tum", + "xh", "bi", "rn", "pih", "got", "ss", "pnt", "bm", "ch", "mo", "ts", + "ady", "iu", "st", "ee", "ny", "fj", "ks", "ak", "ik", "sg", "ve", + "dz", "ff", "ti", "cr", "ng", "cho", "kj", "mh", "ho", "ii", "aa", "mus", "hz", "kr"] + # search-url base_url = 'https://{language}.wikipedia.org/' search_postfix = 'w/api.php?'\ @@ -28,10 +58,11 @@ search_postfix = 'w/api.php?'\ # set language in base_url def url_lang(lang): - if lang == 'all': + lang = lang.split('-')[0] + if lang == 'all' or lang not in supported_languages: language = 'en' else: - language = lang.split('_')[0] + language = lang return base_url.format(language=language) diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py index 92cf881c0..7b1b6b35d 100644 --- a/searx/engines/yacy.py +++ b/searx/engines/yacy.py @@ -53,7 +53,7 @@ def request(query, params): # add language tag if specified if params['language'] != 'all': - params['url'] += '&lr=lang_' + params['language'].split('_')[0] + params['url'] += '&lr=lang_' + params['language'].split('-')[0] return params diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index 2bb34b83d..c00e42368 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -20,6 +20,10 @@ from searx.engines.xpath import extract_text, extract_url categories = ['general'] paging = True language_support = True +supported_languages = ["ar", "bg", "ca", "szh", "tzh", "hr", "cs", "da", "nl", "en", + "et", "fi", "fr", "de", "el", "he", "hu", "is", "id", "it", "ja", + "ko", "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sk", "sr", + "sl", "es", "sv", "th", "tr"] time_range_support = True # search-url @@ -72,7 +76,13 @@ def _get_url(query, offset, language, time_range): def _get_language(params): if params['language'] == 'all': return 'en' - return params['language'].split('_')[0] + elif params['language'][:2] == 'zh': + if params['language'] == 'zh' or params['language'] == 'zh-CH': + return 'szh' + else: + return 'tzh' + else: + return params['language'].split('-')[0] # do search-request diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py index e91c1d34e..613513e59 100644 --- a/searx/engines/yahoo_news.py +++ b/searx/engines/yahoo_news.py @@ -12,7 +12,7 @@ from urllib import urlencode from lxml import html from searx.engines.xpath import extract_text, extract_url -from searx.engines.yahoo import parse_url +from searx.engines.yahoo import parse_url, supported_languages from datetime import datetime, timedelta import re from dateutil import parser diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py index b83a747f9..eee345c45 100644 --- a/searx/engines/yandex.py +++ b/searx/engines/yandex.py @@ -36,7 +36,7 @@ content_xpath = './/div[@class="text-container typo typo_text_m typo_line_m orga def request(query, params): - lang = params['language'].split('_')[0] + lang = params['language'].split('-')[0] host = base_url.format(tld=language_map.get(lang) or default_tld) params['url'] = host + search_url.format(page=params['pageno'] - 1, query=urlencode({'text': query})) diff --git a/searx/engines/youtube_api.py b/searx/engines/youtube_api.py index 8fd939a25..1dfca5166 100644 --- a/searx/engines/youtube_api.py +++ b/searx/engines/youtube_api.py @@ -36,7 +36,7 @@ def request(query, params): # add language tag if specified if params['language'] != 'all': - params['url'] += '&relevanceLanguage=' + params['language'].split('_')[0] + params['url'] += '&relevanceLanguage=' + params['language'].split('-')[0] return params -- cgit v1.2.3 From c677aee58a4eca1015262eb24530620a333ddcef Mon Sep 17 00:00:00 2001 From: marc Date: Sat, 6 Aug 2016 22:19:21 -0500 Subject: filter langauges --- searx/engines/gigablast.py | 10 +++++++++- searx/engines/google.py | 34 +++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 16 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index e139842fa..125ffa0a6 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -40,6 +40,12 @@ url_xpath = './/url' title_xpath = './/title' content_xpath = './/sum' +supported_languages = ["en", "fr", "es", "ru", "tr", "ja", "zh-CN", "zh-TW", "ko", "de", + "nl", "it", "fi", "sv", "no", "pt", "vi", "ar", "he", "id", "el", + "th", "hi", "bn", "pl", "tl", "la", "eo", "ca", "bg", "tx", "sr", + "hu", "da", "lt", "cs", "gl", "ka", "gd", "go", "ro", "ga", "lv", + "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"] + # do search-request def request(query, params): @@ -48,7 +54,9 @@ def request(query, params): if params['language'] == 'all': language = 'xx' else: - language = params['language'].split('-')[0] + language = params['language'].replace('-', '_').lower() + if language.split('-')[0] != 'zh': + language = language.split('-')[0] if params['safesearch'] >= 1: safesearch = 1 diff --git a/searx/engines/google.py b/searx/engines/google.py index 375e627ba..31035be69 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -23,20 +23,20 @@ categories = ['general'] paging = True language_support = True use_locale_domain = True -supported_languages = ['de', 'en', 'es', 'es_419', 'fr', 'hr', 'it', 'nl', 'pl', 'pt-BR', - 'pt-PT', 'vi', 'tr', 'ru', 'ar', 'th', 'ko', 'zh-CN', 'zh-TW', 'ja', - 'ach', 'af', 'ak', 'az', 'ms', 'ban', 'xx_bork', 'bs', 'br', 'ca', - 'ceb', 'ckb', 'cs', 'sn', 'co', 'cy', 'da', 'yo', 'et', 'xx_elmer', - 'eo', 'eu', 'ee', 'tl', 'fo', 'gaa', 'ga', 'gd', 'gl', 'gn', 'xx_hacker', - 'ht', 'ha', 'haw', 'bem', 'ig', 'rn', 'id', 'ia', 'zu', 'is', 'jw', 'rw', - 'sw', 'tlh', 'kg', 'mfe', 'kri', 'la', 'lv', 'to', 'lt', 'ln', 'loz', - 'lua', 'lg', 'hu', 'mg', 'mt', 'mi', 'pcm', 'no', 'nso', 'ny', 'nn', - 'uz', 'oc', 'om', 'xx_pirate', 'pt', 'ro', 'mo', 'rm', 'qu', 'nyn', 'crs', - 'sq', 'sd', 'sk', 'sl', 'so', 'st', 'sr_ME', 'sr_Latn', 'su', 'fi', 'sv', - 'tg', 'tt', 'tn', 'tum', 'tk', 'tw', 'fy', 'wo', 'xh', 'el', 'be', 'bg', - 'ky', 'kk', 'mk', 'mn', 'sr', 'uk', 'ka', 'hy', 'yi', 'iw', 'ug', 'ur', - 'ps', 'fa', 'ti', 'am', 'ne', 'mr', 'hi', 'bn', 'pa', 'gu', 'or', 'ta', - 'te', 'kn', 'ml', 'si', 'lo', 'my', 'km', 'chr'] +supported_languages = ["ach", "af", "ak", "az", "ms", "ban", "xx-bork", "bs", "br", "ca", + "ceb", "ckb", "cs", "sn", "co", "cy", "da", "de", "yo", "et", + "xx-elmer", "en", "es", "es-419", "eo", "eu", "ee", "tl", "fo", "fr", + "gaa", "ga", "gd", "gl", "gn", "xx-hacker", "ht", "ha", "hr", "haw", + "bem", "ig", "rn", "id", "ia", "zu", "is", "it", "jw", "rw", "sw", + "tlh", "kg", "mfe", "kri", "la", "lv", "to", "lt", "ln", "loz", + "lua", "lg", "hu", "mg", "mt", "mi", "nl", "pcm", "no", "nso", + "ny", "nn", "uz", "oc", "om", "xx-pirate", "pl", "pt-BR", "pt-PT", + "ro", "rm", "qu", "nyn", "crs", "sq", "sd", "sk", "sl", "so", "st", + "sr-ME", "sr-Latn", "su", "fi", "sv", "tg", "tt", "vi", "tn", "tum", + "tr", "tk", "tw", "fy", "wo", "xh", "el", "be", "bg", "ky", "kk", "mk", + "mn", "ru", "sr", "uk", "ka", "hy", "yi", "iw", "ug", "ur", "ar", "ps", + "fa", "ti", "am", "ne", "mr", "hi", "bn", "pa", "gu", "or", "ta", "te", + "kn", "ml", "si", "th", "lo", "my", "km", "chr", "ko", "zh-CN", "zh-TW", "ja"] time_range_support = True # based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests @@ -181,8 +181,12 @@ def request(query, params): language = 'en' country = 'US' url_lang = '' + elif params['language'][:2] == 'jv': + language = 'jw' + country = 'ID' + url_lang = 'lang_jw' else: - language_array = params['language'].lower().split('_') + language_array = params['language'].lower().split('-') if len(language_array) == 2: country = language_array[1] else: -- cgit v1.2.3 From a11948c71bfe7b2aac6e50e7634874d5073c7d84 Mon Sep 17 00:00:00 2001 From: marc Date: Sat, 29 Oct 2016 21:04:01 -0500 Subject: Add language support for more engines. --- searx/engines/dailymotion.py | 18 ++++++++++++++++++ searx/engines/duckduckgo.py | 22 +--------------------- searx/engines/gigablast.py | 2 +- searx/engines/qwant.py | 15 ++++++++++++++- searx/engines/startpage.py | 5 +++++ searx/engines/swisscows.py | 8 ++++++++ searx/engines/yandex.py | 4 +++- 7 files changed, 50 insertions(+), 24 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py index 317f34f59..4a7d7b6a8 100644 --- a/searx/engines/dailymotion.py +++ b/searx/engines/dailymotion.py @@ -20,6 +20,24 @@ from datetime import datetime categories = ['videos'] paging = True language_support = True +supported_languages = ["af", "ak", "am", "ar", "an", "as", "av", "ae", "ay", "az", + "ba", "bm", "be", "bn", "bi", "bo", "bs", "br", "bg", "ca", + "cs", "ch", "ce", "cu", "cv", "kw", "co", "cr", "cy", "da", + "de", "dv", "dz", "el", "en", "eo", "et", "eu", "ee", "fo", + "fa", "fj", "fi", "fr", "fy", "ff", "gd", "ga", "gl", "gv", + "gn", "gu", "ht", "ha", "sh", "he", "hz", "hi", "ho", "hr", + "hu", "hy", "ig", "io", "ii", "iu", "ie", "ia", "id", "ik", + "is", "it", "jv", "ja", "kl", "kn", "ks", "ka", "kr", "kk", + "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", + "la", "lv", "li", "ln", "lt", "lb", "lu", "lg", "mh", "ml", + "mr", "mk", "mg", "mt", "mn", "mi", "ms", "my", "na", "nv", + "nr", "nd", "ng", "ne", "nl", "nn", "nb", "no", "ny", "oc", + "oj", "or", "om", "os", "pa", "pi", "pl", "pt", "ps", "qu", + "rm", "ro", "rn", "ru", "sg", "sa", "si", "sk", "sl", "se", + "sm", "sn", "sd", "so", "st", "es", "sq", "sc", "sr", "ss", + "su", "sw", "sv", "ty", "ta", "tt", "te", "tg", "tl", "th", + "ti", "to", "tn", "ts", "tk", "tr", "tw", "ug", "uk", "ur", + "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi", "yo", "za", "zh", "zu"] # search-url # see http://www.dailymotion.com/doc/api/obj-video.html diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index a1cb5882c..3e1752dd0 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -16,7 +16,6 @@ from urllib import urlencode from lxml.html import fromstring from searx.engines.xpath import extract_text -from searx.languages import language_codes # engine dependent config categories = ['general'] @@ -76,26 +75,7 @@ def request(query, params): else: # tries to get a country code from language locale = locale[0].lower() - lang_codes = [x[0] for x in language_codes] - for lc in lang_codes: - lc = lc.split('-') - if locale == lc[0] and len(lc) == 2: - locale = lc[1].lower() + '-' + lc[0].lower() - break - - if locale: - params['url'] = url.format( - query=urlencode({'q': query, 'kl': locale}), offset=offset) - else: - locale = params['language'].split('-') - if len(locale) == 2: - # country code goes first - locale = locale[1].lower() + '-' + locale[0].lower() - else: - # tries to get a country code from language - locale = locale[0].lower() - lang_codes = [x[0] for x in language_codes] - for lc in lang_codes: + for lc in supported_languages: lc = lc.split('-') if locale == lc[0]: locale = lc[1].lower() + '-' + lc[0].lower() diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index 125ffa0a6..f012e1df2 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -44,7 +44,7 @@ supported_languages = ["en", "fr", "es", "ru", "tr", "ja", "zh-CN", "zh-TW", "ko "nl", "it", "fi", "sv", "no", "pt", "vi", "ar", "he", "id", "el", "th", "hi", "bn", "pl", "tl", "la", "eo", "ca", "bg", "tx", "sr", "hu", "da", "lt", "cs", "gl", "ka", "gd", "go", "ro", "ga", "lv", - "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"] + "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"] # do search-request diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index d8b084292..200e9ada9 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -20,6 +20,11 @@ from searx.utils import html_to_text categories = None paging = True language_support = True +supported_languages = ["fr-FR", "de-DE", "en-GB", "it-IT", "es-ES", "pt-PT", "de-CH", "fr-CH", "it-CH", "de-AT", + "fr-BE", "nl-BE", "nl-NL", "da-DK", "fi-FI", "sv-SE", "en-IE", "no-NO", "pl-PL", "ru-RU", + "el-GR", "bg-BG", "cs-CZ", "et-EE", "hu-HU", "ro-RO", "en-US", "en-CA", "fr-CA", "pt-BR", + "es-AR", "es-CL", "es-MX", "ja-JP", "en-SG", "en-IN", "en-MY", "ms-MY", "ko-KR", "tl-PH", + "th-TH", "he-IL", "tr-TR", "en-AU", "en-NZ"] category_to_keyword = {'general': 'web', 'images': 'images', @@ -46,7 +51,15 @@ def request(query, params): # add language tag if specified if params['language'] != 'all': - params['url'] += '&locale=' + params['language'].lower() + locale = params['language'].split('-') + if len(locale) == 2 and params['language'] in supported_languages: + params['url'] += '&locale=' + params['language'].replace('-', '_').lower() + else: + # try to get a country code for language + for lang in supported_languages: + if locale[0] == lang.split('-')[0]: + params['url'] += '&locale=' + lang.replace('-', '_').lower() + break return params diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 54aafdee5..3814d9949 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -24,6 +24,11 @@ categories = ['general'] # paging = False language_support = True +supported_languages = ["af", "de", "ar", "hy", "be", "bg", "ca", "cs", "zh-CN", "zh-TW", + "ko", "hr", "da", "sk", "sl", "es", "eo", "et", "fi", "fr", + "el", "iw", "hi", "nl", "hu", "id", "en", "is", "it", "ja", + "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sw", + "sv", "tl", "th", "tr", "uk", "vi"] # search-url base_url = 'https://startpage.com/' diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py index 68632a15a..5c6b051a9 100644 --- a/searx/engines/swisscows.py +++ b/searx/engines/swisscows.py @@ -18,6 +18,12 @@ import re categories = ['general', 'images'] paging = True language_support = True +supported_languages = ["ar-SA", "es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA", + "es-CL", "zh-CN", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE", "el-GR", + "zh-HK", "hu-HU", "en-IN", "en-IE", "he-IL", "it-IT", "ja-JP", "ko-KR", "lv-LV", "lt-LT", + "en-MY", "es-MX", "nl-NL", "en-NZ", "nb-NO", "en-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU", + "en-SG", "sk-SK", "sl-SI", "en-ZA", "es-ES", "sv-SE", "de-CH", "fr-CH", "zh-TW", "th-TH", + "tr-TR", "uk-UA", "en-GB", "en-US", "es-US"] # search-url base_url = 'https://swisscows.ch/' @@ -35,6 +41,8 @@ def request(query, params): if params['language'] == 'all': ui_language = 'browser' region = 'browser' + elif params['language'].split('-')[0] == 'no': + region = 'nb-NO' else: region = params['language'] ui_language = params['language'].split('-')[0] diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py index eee345c45..65aee28b8 100644 --- a/searx/engines/yandex.py +++ b/searx/engines/yandex.py @@ -22,7 +22,9 @@ language_support = True # TODO default_tld = 'com' language_map = {'ru': 'ru', - 'ua': 'uk', + 'ua': 'ua', + 'be': 'by', + 'kk': 'kz', 'tr': 'com.tr'} # search-url -- cgit v1.2.3 From 92c6e88ad3e5ba57bd6e2ba64d0c38e8fd72ea09 Mon Sep 17 00:00:00 2001 From: marc Date: Mon, 31 Oct 2016 23:52:08 -0600 Subject: small fixes --- searx/engines/mediawiki.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py index b17cb38e4..ea607dd60 100644 --- a/searx/engines/mediawiki.py +++ b/searx/engines/mediawiki.py @@ -15,7 +15,7 @@ from json import loads from string import Formatter from urllib import urlencode, quote -from searx.engines.wikipedia import supported_engines +from searx.engines.wikipedia import supported_languages # engine dependent config categories = ['general'] -- cgit v1.2.3 From f62ce21f50b540315a708ebfbf36878ddec9d1c4 Mon Sep 17 00:00:00 2001 From: marc Date: Sat, 5 Nov 2016 20:51:38 -0600 Subject: [mod] fetch supported languages for several engines utils/fetch_languages.py gets languages supported by each engine and generates engines_languages.json with each engine's supported language. --- searx/engines/__init__.py | 6 ++++ searx/engines/bing.py | 15 ++++++++++ searx/engines/bing_images.py | 2 +- searx/engines/bing_news.py | 2 +- searx/engines/dailymotion.py | 41 ++++++++++++++----------- searx/engines/duckduckgo.py | 27 +++++++++++------ searx/engines/duckduckgo_definitions.py | 2 +- searx/engines/gigablast.py | 22 ++++++++++---- searx/engines/google.py | 30 ++++++++++--------- searx/engines/google_news.py | 2 +- searx/engines/mediawiki.py | 1 - searx/engines/qwant.py | 15 +--------- searx/engines/startpage.py | 5 ---- searx/engines/subtitleseeker.py | 5 ++-- searx/engines/swisscows.py | 21 +++++++++---- searx/engines/wikidata.py | 6 ++-- searx/engines/wikipedia.py | 53 +++++++++++++++------------------ searx/engines/yahoo.py | 20 ++++++++++--- searx/engines/yahoo_news.py | 2 +- 19 files changed, 162 insertions(+), 115 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index ab3677984..7a64fd25b 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -20,6 +20,7 @@ from os.path import realpath, dirname import sys from flask_babel import gettext from operator import itemgetter +from json import loads from searx import settings from searx import logger from searx.utils import load_module @@ -78,6 +79,9 @@ def load_engine(engine_data): if not hasattr(engine, arg_name): setattr(engine, arg_name, arg_value) + if engine_data['name'] in languages: + setattr(engine, 'supported_languages', languages[engine_data['name']]) + # checking required variables for engine_attr in dir(engine): if engine_attr.startswith('_'): @@ -207,6 +211,8 @@ if 'engines' not in settings or not settings['engines']: logger.error('No engines found. Edit your settings.yml') exit(2) +languages = loads(open(engine_dir + '/../data/engines_languages.json').read()) + for engine_data in settings['engines']: engine = load_engine(engine_data) if engine is not None: diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 052b66448..354003399 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -15,12 +15,14 @@ from urllib import urlencode from lxml import html +from requests import get from searx.engines.xpath import extract_text # engine dependent config categories = ['general'] paging = True language_support = True +supported_languages_url = 'https://www.bing.com/account/general' # search-url base_url = 'https://www.bing.com/' @@ -81,3 +83,16 @@ def response(resp): # return results return results + + +# get supported languages from their site +def fetch_supported_languages(): + supported_languages = [] + response = get(supported_languages_url) + dom = html.fromstring(response.text) + options = dom.xpath('//div[@id="limit-languages"]//input') + for option in options: + code = option.xpath('./@id')[0].replace('_', '-') + supported_languages.append(code) + + return supported_languages diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py index c0deaf6b2..746d3abc4 100644 --- a/searx/engines/bing_images.py +++ b/searx/engines/bing_images.py @@ -19,7 +19,7 @@ from urllib import urlencode from lxml import html from json import loads import re -from searx.engines.bing import supported_languages +from searx.engines.bing import fetch_supported_languages # engine dependent config categories = ['images'] diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 4bac5bbce..2d936fa53 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -17,7 +17,7 @@ from datetime import datetime from dateutil import parser from lxml import etree from searx.utils import list_get -from searx.engines.bing import supported_languages +from searx.engines.bing import fetch_supported_languages # engine dependent config categories = ['news'] diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py index 4a7d7b6a8..813dd951f 100644 --- a/searx/engines/dailymotion.py +++ b/searx/engines/dailymotion.py @@ -15,29 +15,12 @@ from urllib import urlencode from json import loads from datetime import datetime +from requests import get # engine dependent config categories = ['videos'] paging = True language_support = True -supported_languages = ["af", "ak", "am", "ar", "an", "as", "av", "ae", "ay", "az", - "ba", "bm", "be", "bn", "bi", "bo", "bs", "br", "bg", "ca", - "cs", "ch", "ce", "cu", "cv", "kw", "co", "cr", "cy", "da", - "de", "dv", "dz", "el", "en", "eo", "et", "eu", "ee", "fo", - "fa", "fj", "fi", "fr", "fy", "ff", "gd", "ga", "gl", "gv", - "gn", "gu", "ht", "ha", "sh", "he", "hz", "hi", "ho", "hr", - "hu", "hy", "ig", "io", "ii", "iu", "ie", "ia", "id", "ik", - "is", "it", "jv", "ja", "kl", "kn", "ks", "ka", "kr", "kk", - "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", - "la", "lv", "li", "ln", "lt", "lb", "lu", "lg", "mh", "ml", - "mr", "mk", "mg", "mt", "mn", "mi", "ms", "my", "na", "nv", - "nr", "nd", "ng", "ne", "nl", "nn", "nb", "no", "ny", "oc", - "oj", "or", "om", "os", "pa", "pi", "pl", "pt", "ps", "qu", - "rm", "ro", "rn", "ru", "sg", "sa", "si", "sk", "sl", "se", - "sm", "sn", "sd", "so", "st", "es", "sq", "sc", "sr", "ss", - "su", "sw", "sv", "ty", "ta", "tt", "te", "tg", "tl", "th", - "ti", "to", "tn", "ts", "tk", "tr", "tw", "ug", "uk", "ur", - "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi", "yo", "za", "zh", "zu"] # search-url # see http://www.dailymotion.com/doc/api/obj-video.html @@ -45,6 +28,8 @@ search_url = 'https://api.dailymotion.com/videos?fields=created_time,title,descr embedded_url = '' +supported_languages_url = 'https://api.dailymotion.com/languages' + # do search-request def request(query, params): @@ -92,3 +77,23 @@ def response(resp): # return results return results + + +# get supported languages from their site +def fetch_supported_languages(): + supported_languages = {} + + response = get(supported_languages_url) + response_json = loads(response.text) + + for language in response_json['list']: + supported_languages[language['code']] = {} + + name = language['native_name'] + if name: + supported_languages[language['code']]['name'] = name + english_name = language['name'] + if english_name: + supported_languages[language['code']]['english_name'] = english_name + + return supported_languages diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 3e1752dd0..d37d2778b 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -15,19 +15,15 @@ from urllib import urlencode from lxml.html import fromstring +from requests import get +from json import loads from searx.engines.xpath import extract_text # engine dependent config categories = ['general'] paging = True language_support = True -supported_languages = ["es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA", "ca-CT", - "es-CL", "zh-CN", "es-CO", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE", - "el-GR", "tzh-HK", "hu-HU", "en-IN", "id-ID", "en-ID", "en-IE", "he-IL", "it-IT", "jp-JP", - "kr-KR", "es-XL", "lv-LV", "lt-LT", "ms-MY", "en-MY", "es-MX", "nl-NL", "en-NZ", "no-NO", - "es-PE", "en-PH", "tl-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU", "ar-XA", "en-XA", "en-SG", - "sk-SK", "sl-SL", "en-ZA", "es-ES", "ca-ES", "sv-SE", "de-CH", "fr-CH", "it-CH", "tzh-TW", - "th-TH", "tr-TR", "uk-UA", "en-UK", "en-US", "es-US", "vi-VN"] +supported_languages_url = 'https://duckduckgo.com/d2030.js' time_range_support = True # search-url @@ -65,8 +61,6 @@ def request(query, params): locale = 'xa' + params['language'].split('-')[0] elif params['language'][-2:] == 'GB': locale = 'uk' + params['language'].split('-')[0] - elif params['language'] == 'es-419': - locale = 'xl-es' else: locale = params['language'].split('-') if len(locale) == 2: @@ -120,3 +114,18 @@ def response(resp): # return results return results + + +# get supported languages from their site +def fetch_supported_languages(): + response = get(supported_languages_url) + + # response is a js file with regions as an embedded object + response_page = response.text + response_page = response_page[response_page.find('regions:{') + 8:] + response_page = response_page[:response_page.find('}') + 1] + + regions_json = loads(response_page) + supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys()) + + return supported_languages diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index 23a2f3be3..b965c02e9 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -4,7 +4,7 @@ from re import compile, sub from lxml import html from searx.utils import html_to_text from searx.engines.xpath import extract_text -from searx.engines.duckduckgo import supported_languages +from searx.engines.duckduckgo import fetch_supported_languages url = 'https://api.duckduckgo.com/'\ + '?{query}&format=json&pretty=0&no_redirect=1&d=1' diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index f012e1df2..e598e55c4 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -14,6 +14,8 @@ from json import loads from random import randint from time import time from urllib import urlencode +from requests import get +from lxml.html import fromstring # engine dependent config categories = ['general'] @@ -40,11 +42,7 @@ url_xpath = './/url' title_xpath = './/title' content_xpath = './/sum' -supported_languages = ["en", "fr", "es", "ru", "tr", "ja", "zh-CN", "zh-TW", "ko", "de", - "nl", "it", "fi", "sv", "no", "pt", "vi", "ar", "he", "id", "el", - "th", "hi", "bn", "pl", "tl", "la", "eo", "ca", "bg", "tx", "sr", - "hu", "da", "lt", "cs", "gl", "ka", "gd", "go", "ro", "ga", "lv", - "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"] +supported_languages_url = 'https://gigablast.com/search?&rxikd=1' # do search-request @@ -90,3 +88,17 @@ def response(resp): # return results return results + + +# get supported languages from their site +def fetch_supported_languages(): + supported_languages = [] + response = get(supported_languages_url) + dom = fromstring(response.text) + links = dom.xpath('//span[@id="menu2"]/a') + for link in links: + code = link.xpath('./@href')[0][-2:] + if code != 'xx' and code not in supported_languages: + supported_languages.append(code) + + return supported_languages diff --git a/searx/engines/google.py b/searx/engines/google.py index 31035be69..a82a0b5a7 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -12,6 +12,7 @@ import re from urllib import urlencode from urlparse import urlparse, parse_qsl from lxml import html, etree +from requests import get from searx.engines.xpath import extract_text, extract_url from searx.search import logger @@ -23,20 +24,6 @@ categories = ['general'] paging = True language_support = True use_locale_domain = True -supported_languages = ["ach", "af", "ak", "az", "ms", "ban", "xx-bork", "bs", "br", "ca", - "ceb", "ckb", "cs", "sn", "co", "cy", "da", "de", "yo", "et", - "xx-elmer", "en", "es", "es-419", "eo", "eu", "ee", "tl", "fo", "fr", - "gaa", "ga", "gd", "gl", "gn", "xx-hacker", "ht", "ha", "hr", "haw", - "bem", "ig", "rn", "id", "ia", "zu", "is", "it", "jw", "rw", "sw", - "tlh", "kg", "mfe", "kri", "la", "lv", "to", "lt", "ln", "loz", - "lua", "lg", "hu", "mg", "mt", "mi", "nl", "pcm", "no", "nso", - "ny", "nn", "uz", "oc", "om", "xx-pirate", "pl", "pt-BR", "pt-PT", - "ro", "rm", "qu", "nyn", "crs", "sq", "sd", "sk", "sl", "so", "st", - "sr-ME", "sr-Latn", "su", "fi", "sv", "tg", "tt", "vi", "tn", "tum", - "tr", "tk", "tw", "fy", "wo", "xh", "el", "be", "bg", "ky", "kk", "mk", - "mn", "ru", "sr", "uk", "ka", "hy", "yi", "iw", "ug", "ur", "ar", "ps", - "fa", "ti", "am", "ne", "mr", "hi", "bn", "pa", "gu", "or", "ta", "te", - "kn", "ml", "si", "th", "lo", "my", "km", "chr", "ko", "zh-CN", "zh-TW", "ja"] time_range_support = True # based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests @@ -117,6 +104,7 @@ map_hostname_start = 'maps.google.' maps_path = '/maps' redirect_path = '/url' images_path = '/images' +supported_languages_url = 'https://www.google.com/preferences?#languages' # specific xpath variables results_xpath = '//div[@class="g"]' @@ -373,3 +361,17 @@ def attributes_to_html(attributes): retval = retval + '' + a.get('label') + '' + value + '' retval = retval + '' return retval + + +# get supported languages from their site +def fetch_supported_languages(): + supported_languages = {} + response = get(supported_languages_url) + dom = html.fromstring(response.text) + options = dom.xpath('//select[@name="hl"]/option') + for option in options: + code = option.xpath('./@value')[0].split('-')[0] + name = option.text[:-1].title() + supported_languages[code] = {"name": name} + + return supported_languages diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index 6d1430248..d138f99f5 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -13,7 +13,7 @@ from lxml import html from urllib import urlencode from json import loads -from searx.engines.google import supported_languages +from searx.engines.google import fetch_supported_languages # search-url categories = ['news'] diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py index ea607dd60..93d98d3aa 100644 --- a/searx/engines/mediawiki.py +++ b/searx/engines/mediawiki.py @@ -15,7 +15,6 @@ from json import loads from string import Formatter from urllib import urlencode, quote -from searx.engines.wikipedia import supported_languages # engine dependent config categories = ['general'] diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index 200e9ada9..67803fa94 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -20,11 +20,6 @@ from searx.utils import html_to_text categories = None paging = True language_support = True -supported_languages = ["fr-FR", "de-DE", "en-GB", "it-IT", "es-ES", "pt-PT", "de-CH", "fr-CH", "it-CH", "de-AT", - "fr-BE", "nl-BE", "nl-NL", "da-DK", "fi-FI", "sv-SE", "en-IE", "no-NO", "pl-PL", "ru-RU", - "el-GR", "bg-BG", "cs-CZ", "et-EE", "hu-HU", "ro-RO", "en-US", "en-CA", "fr-CA", "pt-BR", - "es-AR", "es-CL", "es-MX", "ja-JP", "en-SG", "en-IN", "en-MY", "ms-MY", "ko-KR", "tl-PH", - "th-TH", "he-IL", "tr-TR", "en-AU", "en-NZ"] category_to_keyword = {'general': 'web', 'images': 'images', @@ -51,15 +46,7 @@ def request(query, params): # add language tag if specified if params['language'] != 'all': - locale = params['language'].split('-') - if len(locale) == 2 and params['language'] in supported_languages: - params['url'] += '&locale=' + params['language'].replace('-', '_').lower() - else: - # try to get a country code for language - for lang in supported_languages: - if locale[0] == lang.split('-')[0]: - params['url'] += '&locale=' + lang.replace('-', '_').lower() - break + params['url'] += '&locale=' + params['language'].replace('-', '_').lower() return params diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 3814d9949..54aafdee5 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -24,11 +24,6 @@ categories = ['general'] # paging = False language_support = True -supported_languages = ["af", "de", "ar", "hy", "be", "bg", "ca", "cs", "zh-CN", "zh-TW", - "ko", "hr", "da", "sk", "sl", "es", "eo", "et", "fi", "fr", - "el", "iw", "hi", "nl", "hu", "id", "en", "is", "it", "ja", - "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sw", - "sv", "tl", "th", "tr", "uk", "vi"] # search-url base_url = 'https://startpage.com/' diff --git a/searx/engines/subtitleseeker.py b/searx/engines/subtitleseeker.py index 2c0a94f08..f979d0141 100644 --- a/searx/engines/subtitleseeker.py +++ b/searx/engines/subtitleseeker.py @@ -22,7 +22,7 @@ language = "" # search-url url = 'http://www.subtitleseeker.com/' -search_url = url + 'search/TITLES/{query}&p={pageno}' +search_url = url + 'search/TITLES/{query}?p={pageno}' # specific xpath variables results_xpath = '//div[@class="boxRows"]' @@ -51,7 +51,8 @@ def response(resp): elif resp.search_params['language'] != 'all': search_lang = [lc[3] for lc in language_codes - if lc[0][:2] == resp.search_params['language'].split('_')[0]][0] + if lc[0].split('-')[0] == resp.search_params['language'].split('-')[0]] + search_lang = search_lang[0].split(' (')[0] # parse results for result in dom.xpath(results_xpath): diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py index 5c6b051a9..7f85019a6 100644 --- a/searx/engines/swisscows.py +++ b/searx/engines/swisscows.py @@ -13,17 +13,13 @@ from json import loads from urllib import urlencode, unquote import re +from requests import get +from lxml.html import fromstring # engine dependent config categories = ['general', 'images'] paging = True language_support = True -supported_languages = ["ar-SA", "es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA", - "es-CL", "zh-CN", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE", "el-GR", - "zh-HK", "hu-HU", "en-IN", "en-IE", "he-IL", "it-IT", "ja-JP", "ko-KR", "lv-LV", "lt-LT", - "en-MY", "es-MX", "nl-NL", "en-NZ", "nb-NO", "en-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU", - "en-SG", "sk-SK", "sl-SI", "en-ZA", "es-ES", "sv-SE", "de-CH", "fr-CH", "zh-TW", "th-TH", - "tr-TR", "uk-UA", "en-GB", "en-US", "es-US"] # search-url base_url = 'https://swisscows.ch/' @@ -114,3 +110,16 @@ def response(resp): # return results return results + + +# get supported languages from their site +def fetch_supported_languages(): + supported_languages = [] + response = get(base_url) + dom = fromstring(response.text) + options = dom.xpath('//div[@id="regions-popup"]//ul/li/a') + for option in options: + code = option.xpath('./@data-val')[0] + supported_languages.append(code) + + return supported_languages diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index edb6d75fe..9c0a768e0 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -15,7 +15,7 @@ from searx import logger from searx.poolrequests import get from searx.engines.xpath import extract_text from searx.utils import format_date_by_locale -from searx.engines.wikipedia import supported_languages +from searx.engines.wikipedia import fetch_supported_languages from json import loads from lxml.html import fromstring @@ -57,7 +57,7 @@ calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]' def request(query, params): - language = params['language'].split('_')[0] + language = params['language'].split('-')[0] if language == 'all': language = 'en' @@ -72,7 +72,7 @@ def response(resp): html = fromstring(resp.content) wikidata_ids = html.xpath(wikidata_ids_xpath) - language = resp.search_params['language'].split('_')[0] + language = resp.search_params['language'].split('-')[0] if language == 'all': language = 'en' diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index fdba5ed68..0dee325a7 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -12,36 +12,9 @@ from json import loads from urllib import urlencode, quote +from requests import get +from lxml.html import fromstring -supported_languages = ["en", "sv", "ceb", "de", "nl", "fr", "ru", "it", "es", "war", - "pl", "vi", "ja", "pt", "zh", "uk", "ca", "fa", "no", "sh", - "ar", "fi", "hu", "id", "ro", "cs", "ko", "sr", "ms", "tr", - "eu", "eo", "min", "bg", "da", "kk", "sk", "hy", "he", "zh-min-nan", - "lt", "hr", "sl", "et", "ce", "gl", "nn", "uz", "la", "vo", - "el", "simple", "be", "az", "th", "ur", "ka", "hi", "oc", "ta", - "mk", "mg", "new", "lv", "cy", "bs", "tt", "tl", "te", "pms", - "be-tarask", "br", "sq", "ky", "ht", "jv", "tg", "ast", "zh-yue", "lb", - "mr", "ml", "bn", "pnb", "is", "af", "sco", "ga", "ba", "fy", - "cv", "lmo", "sw", "my", "an", "yo", "ne", "io", "gu", "nds", - "scn", "bpy", "pa", "ku", "als", "kn", "bar", "ia", "qu", "su", - "ckb", "bat-smg", "mn", "arz", "nap", "wa", "bug", "gd", "yi", "map-bms", - "am", "mzn", "fo", "si", "nah", "li", "sah", "vec", "hsb", "or", - "os", "mrj", "sa", "hif", "mhr", "roa-tara", "azb", "pam", "ilo", - "sd", "ps", "se", "mi", "bh", "eml", "bcl", "xmf", "diq", "hak", - "gan", "glk", "vls", "nds-nl", "rue", "bo", "fiu-vro", "co", "sc", - "tk", "csb", "lrc", "vep", "wuu", "km", "szl", "gv", "crh", "kv", - "zh-classical", "frr", "zea", "as", "so", "kw", "nso", "ay", "stq", - "udm", "cdo", "nrm", "ie", "koi", "rm", "pcd", "myv", "mt", "fur", - "ace", "lad", "gn", "lij", "dsb", "dv", "cbk-zam", "ext", "gom", - "kab", "ksh", "ang", "mai", "mwl", "lez", "gag", "ln", "ug", "pi", - "pag", "frp", "sn", "nv", "av", "pfl", "haw", "xal", "krc", "kaa", - "rw", "bxr", "pdc", "to", "kl", "nov", "arc", "kbd", "lo", "bjn", - "pap", "ha", "tet", "ki", "tyv", "tpi", "na", "lbe", "ig", "jbo", - "roa-rup", "ty", "jam", "za", "kg", "mdf", "lg", "wo", "srn", "ab", - "ltg", "zu", "sm", "chr", "om", "tn", "chy", "rmy", "cu", "tw", "tum", - "xh", "bi", "rn", "pih", "got", "ss", "pnt", "bm", "ch", "mo", "ts", - "ady", "iu", "st", "ee", "ny", "fj", "ks", "ak", "ik", "sg", "ve", - "dz", "ff", "ti", "cr", "ng", "cho", "kj", "mh", "ho", "ii", "aa", "mus", "hz", "kr"] # search-url base_url = 'https://{language}.wikipedia.org/' @@ -54,6 +27,7 @@ search_postfix = 'w/api.php?'\ '&explaintext'\ '&pithumbsize=300'\ '&redirects' +supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias' # set language in base_url @@ -142,3 +116,24 @@ def response(resp): 'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]}) return results + + +# get supported languages from their site +def fetch_supported_languages(): + supported_languages = {} + response = get(supported_languages_url) + dom = fromstring(response.text) + tables = dom.xpath('//table[contains(@class,"sortable")]') + for table in tables: + # exclude header row + trs = table.xpath('.//tr')[1:] + for tr in trs: + td = tr.xpath('./td') + code = td[3].xpath('./a')[0].text + name = td[2].xpath('./a')[0].text + english_name = td[1].xpath('./a')[0].text + articles = int(td[4].xpath('./a/b')[0].text.replace(',', '')) + if articles >= 10000: + supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles} + + return supported_languages diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index c00e42368..db10c8939 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -14,16 +14,13 @@ from urllib import urlencode from urlparse import unquote from lxml import html +from requests import get from searx.engines.xpath import extract_text, extract_url # engine dependent config categories = ['general'] paging = True language_support = True -supported_languages = ["ar", "bg", "ca", "szh", "tzh", "hr", "cs", "da", "nl", "en", - "et", "fi", "fr", "de", "el", "he", "hu", "is", "id", "it", "ja", - "ko", "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sk", "sr", - "sl", "es", "sv", "th", "tr"] time_range_support = True # search-url @@ -31,6 +28,8 @@ base_url = 'https://search.yahoo.com/' search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}' search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time' +supported_languages_url = 'https://search.yahoo.com/web/advanced' + # specific xpath variables results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]" url_xpath = './/h3/a/@href' @@ -142,3 +141,16 @@ def response(resp): # return results return results + + +# get supported languages from their site +def fetch_supported_languages(): + supported_languages = [] + response = get(supported_languages_url) + dom = html.fromstring(response.text) + options = dom.xpath('//div[@id="yschlang"]/span/label/input') + for option in options: + code = option.xpath('./@value')[0][5:] + supported_languages.append(code) + + return supported_languages diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py index 613513e59..bc7b5c368 100644 --- a/searx/engines/yahoo_news.py +++ b/searx/engines/yahoo_news.py @@ -12,7 +12,7 @@ from urllib import urlencode from lxml import html from searx.engines.xpath import extract_text, extract_url -from searx.engines.yahoo import parse_url, supported_languages +from searx.engines.yahoo import parse_url, fetch_supported_languages from datetime import datetime, timedelta import re from dateutil import parser -- cgit v1.2.3 From e0c270bd72f7b2a40222e3ed264e25d36cb0fc30 Mon Sep 17 00:00:00 2001 From: marc Date: Tue, 13 Dec 2016 23:51:15 -0600 Subject: tests for language support in engines --- searx/engines/.yandex.py.swp | Bin 0 -> 12288 bytes searx/engines/duckduckgo.py | 6 ++++-- searx/engines/subtitleseeker.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) create mode 100644 searx/engines/.yandex.py.swp (limited to 'searx/engines') diff --git a/searx/engines/.yandex.py.swp b/searx/engines/.yandex.py.swp new file mode 100644 index 000000000..ff2a8f648 Binary files /dev/null and b/searx/engines/.yandex.py.swp differ diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index d37d2778b..9cf5fb339 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -53,14 +53,16 @@ def request(query, params): locale = None elif params['language'][:2] == 'ja': locale = 'jp-jp' + elif params['language'][:2] == 'sl': + locale = 'sl-sl' elif params['language'] == 'zh-TW': locale = 'tw-tzh' elif params['language'] == 'zh-HK': locale = 'hk-tzh' elif params['language'][-2:] == 'SA': - locale = 'xa' + params['language'].split('-')[0] + locale = 'xa-' + params['language'].split('-')[0] elif params['language'][-2:] == 'GB': - locale = 'uk' + params['language'].split('-')[0] + locale = 'uk-' + params['language'].split('-')[0] else: locale = params['language'].split('-') if len(locale) == 2: diff --git a/searx/engines/subtitleseeker.py b/searx/engines/subtitleseeker.py index f979d0141..77b010c3f 100644 --- a/searx/engines/subtitleseeker.py +++ b/searx/engines/subtitleseeker.py @@ -46,7 +46,7 @@ def response(resp): # dirty fix for languages named differenly in their site if resp.search_params['language'][:2] == 'fa': search_lang = 'Farsi' - elif resp.search_params['language'] == 'pt_BR': + elif resp.search_params['language'] == 'pt-BR': search_lang = 'Brazilian' elif resp.search_params['language'] != 'all': search_lang = [lc[3] -- cgit v1.2.3 From af35eee10b98940c51c6e5e18629de514b4bd48d Mon Sep 17 00:00:00 2001 From: marc Date: Thu, 15 Dec 2016 00:34:43 -0600 Subject: tests for _fetch_supported_languages in engines and refactor method to make it testable without making requests --- searx/engines/.yandex.py.swp | Bin 12288 -> 0 bytes searx/engines/__init__.py | 13 ++++++++++--- searx/engines/bing.py | 6 ++---- searx/engines/bing_images.py | 2 +- searx/engines/bing_news.py | 2 +- searx/engines/dailymotion.py | 5 ++--- searx/engines/duckduckgo.py | 5 ++--- searx/engines/duckduckgo_definitions.py | 2 +- searx/engines/gigablast.py | 6 ++---- searx/engines/google.py | 12 +++++------- searx/engines/google_news.py | 2 +- searx/engines/swisscows.py | 8 ++++---- searx/engines/wikidata.py | 2 +- searx/engines/wikipedia.py | 6 ++---- searx/engines/yahoo.py | 8 +++----- searx/engines/yahoo_news.py | 2 +- 16 files changed, 38 insertions(+), 43 deletions(-) delete mode 100644 searx/engines/.yandex.py.swp (limited to 'searx/engines') diff --git a/searx/engines/.yandex.py.swp b/searx/engines/.yandex.py.swp deleted file mode 100644 index ff2a8f648..000000000 Binary files a/searx/engines/.yandex.py.swp and /dev/null differ diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 7a64fd25b..cc200a0d1 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -21,6 +21,7 @@ import sys from flask_babel import gettext from operator import itemgetter from json import loads +from requests import get from searx import settings from searx import logger from searx.utils import load_module @@ -79,9 +80,6 @@ def load_engine(engine_data): if not hasattr(engine, arg_name): setattr(engine, arg_name, arg_value) - if engine_data['name'] in languages: - setattr(engine, 'supported_languages', languages[engine_data['name']]) - # checking required variables for engine_attr in dir(engine): if engine_attr.startswith('_'): @@ -91,6 +89,15 @@ def load_engine(engine_data): .format(engine.name, engine_attr)) sys.exit(1) + # assign supported languages from json file + if engine_data['name'] in languages: + setattr(engine, 'supported_languages', languages[engine_data['name']]) + + # assign language fetching method if auxiliary method exists + if hasattr(engine, '_fetch_supported_languages'): + setattr(engine, 'fetch_supported_languages', + lambda: engine._fetch_supported_languages(get(engine.supported_languages_url))) + engine.stats = { 'result_count': 0, 'search_count': 0, diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 354003399..b2ad7b6cf 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -15,7 +15,6 @@ from urllib import urlencode from lxml import html -from requests import get from searx.engines.xpath import extract_text # engine dependent config @@ -86,10 +85,9 @@ def response(resp): # get supported languages from their site -def fetch_supported_languages(): +def _fetch_supported_languages(resp): supported_languages = [] - response = get(supported_languages_url) - dom = html.fromstring(response.text) + dom = html.fromstring(resp.text) options = dom.xpath('//div[@id="limit-languages"]//input') for option in options: code = option.xpath('./@id')[0].replace('_', '-') diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py index 746d3abc4..97f6dca37 100644 --- a/searx/engines/bing_images.py +++ b/searx/engines/bing_images.py @@ -19,7 +19,7 @@ from urllib import urlencode from lxml import html from json import loads import re -from searx.engines.bing import fetch_supported_languages +from searx.engines.bing import _fetch_supported_languages, supported_languages_url # engine dependent config categories = ['images'] diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 2d936fa53..765bcd38e 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -17,7 +17,7 @@ from datetime import datetime from dateutil import parser from lxml import etree from searx.utils import list_get -from searx.engines.bing import fetch_supported_languages +from searx.engines.bing import _fetch_supported_languages, supported_languages_url # engine dependent config categories = ['news'] diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py index 813dd951f..8c69aafe0 100644 --- a/searx/engines/dailymotion.py +++ b/searx/engines/dailymotion.py @@ -80,11 +80,10 @@ def response(resp): # get supported languages from their site -def fetch_supported_languages(): +def _fetch_supported_languages(resp): supported_languages = {} - response = get(supported_languages_url) - response_json = loads(response.text) + response_json = loads(resp.text) for language in response_json['list']: supported_languages[language['code']] = {} diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 9cf5fb339..df230222d 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -119,11 +119,10 @@ def response(resp): # get supported languages from their site -def fetch_supported_languages(): - response = get(supported_languages_url) +def _fetch_supported_languages(resp): # response is a js file with regions as an embedded object - response_page = response.text + response_page = resp.text response_page = response_page[response_page.find('regions:{') + 8:] response_page = response_page[:response_page.find('}') + 1] diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index b965c02e9..dd3f12e1e 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -4,7 +4,7 @@ from re import compile, sub from lxml import html from searx.utils import html_to_text from searx.engines.xpath import extract_text -from searx.engines.duckduckgo import fetch_supported_languages +from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url url = 'https://api.duckduckgo.com/'\ + '?{query}&format=json&pretty=0&no_redirect=1&d=1' diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index e598e55c4..827b9cd03 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -14,7 +14,6 @@ from json import loads from random import randint from time import time from urllib import urlencode -from requests import get from lxml.html import fromstring # engine dependent config @@ -91,10 +90,9 @@ def response(resp): # get supported languages from their site -def fetch_supported_languages(): +def _fetch_supported_languages(resp): supported_languages = [] - response = get(supported_languages_url) - dom = fromstring(response.text) + dom = fromstring(resp.text) links = dom.xpath('//span[@id="menu2"]/a') for link in links: code = link.xpath('./@href')[0][-2:] diff --git a/searx/engines/google.py b/searx/engines/google.py index a82a0b5a7..803cd307e 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -12,7 +12,6 @@ import re from urllib import urlencode from urlparse import urlparse, parse_qsl from lxml import html, etree -from requests import get from searx.engines.xpath import extract_text, extract_url from searx.search import logger @@ -364,14 +363,13 @@ def attributes_to_html(attributes): # get supported languages from their site -def fetch_supported_languages(): +def _fetch_supported_languages(resp): supported_languages = {} - response = get(supported_languages_url) - dom = html.fromstring(response.text) - options = dom.xpath('//select[@name="hl"]/option') + dom = html.fromstring(resp.text) + options = dom.xpath('//table//td/font/label/span') for option in options: - code = option.xpath('./@value')[0].split('-')[0] - name = option.text[:-1].title() + code = option.xpath('./@id')[0][1:] + name = option.text.title() supported_languages[code] = {"name": name} return supported_languages diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index d138f99f5..ddacd1a61 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -13,7 +13,7 @@ from lxml import html from urllib import urlencode from json import loads -from searx.engines.google import fetch_supported_languages +from searx.engines.google import _fetch_supported_languages, supported_languages_url # search-url categories = ['news'] diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py index 7f85019a6..d8a454039 100644 --- a/searx/engines/swisscows.py +++ b/searx/engines/swisscows.py @@ -13,7 +13,6 @@ from json import loads from urllib import urlencode, unquote import re -from requests import get from lxml.html import fromstring # engine dependent config @@ -25,6 +24,8 @@ language_support = True base_url = 'https://swisscows.ch/' search_string = '?{query}&page={page}' +supported_languages_url = base_url + # regex regex_json = re.compile(r'initialData: {"Request":(.|\n)*},\s*environment') regex_json_remove_start = re.compile(r'^initialData:\s*') @@ -113,10 +114,9 @@ def response(resp): # get supported languages from their site -def fetch_supported_languages(): +def _fetch_supported_languages(resp): supported_languages = [] - response = get(base_url) - dom = fromstring(response.text) + dom = fromstring(resp.text) options = dom.xpath('//div[@id="regions-popup"]//ul/li/a') for option in options: code = option.xpath('./@data-val')[0] diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index 9c0a768e0..3f849bc7d 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -15,7 +15,7 @@ from searx import logger from searx.poolrequests import get from searx.engines.xpath import extract_text from searx.utils import format_date_by_locale -from searx.engines.wikipedia import fetch_supported_languages +from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url from json import loads from lxml.html import fromstring diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 0dee325a7..322e8d128 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -12,7 +12,6 @@ from json import loads from urllib import urlencode, quote -from requests import get from lxml.html import fromstring @@ -119,10 +118,9 @@ def response(resp): # get supported languages from their site -def fetch_supported_languages(): +def _fetch_supported_languages(resp): supported_languages = {} - response = get(supported_languages_url) - dom = fromstring(response.text) + dom = fromstring(resp.text) tables = dom.xpath('//table[contains(@class,"sortable")]') for table in tables: # exclude header row diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index db10c8939..5c62c2ed8 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -14,7 +14,6 @@ from urllib import urlencode from urlparse import unquote from lxml import html -from requests import get from searx.engines.xpath import extract_text, extract_url # engine dependent config @@ -144,13 +143,12 @@ def response(resp): # get supported languages from their site -def fetch_supported_languages(): +def _fetch_supported_languages(resp): supported_languages = [] - response = get(supported_languages_url) - dom = html.fromstring(response.text) + dom = html.fromstring(resp.text) options = dom.xpath('//div[@id="yschlang"]/span/label/input') for option in options: - code = option.xpath('./@value')[0][5:] + code = option.xpath('./@value')[0][5:].replace('_', '-') supported_languages.append(code) return supported_languages diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py index bc7b5c368..3e4cf02eb 100644 --- a/searx/engines/yahoo_news.py +++ b/searx/engines/yahoo_news.py @@ -12,7 +12,7 @@ from urllib import urlencode from lxml import html from searx.engines.xpath import extract_text, extract_url -from searx.engines.yahoo import parse_url, fetch_supported_languages +from searx.engines.yahoo import parse_url, _fetch_supported_languages, supported_languages_url from datetime import datetime, timedelta import re from dateutil import parser -- cgit v1.2.3 From 4a1ff56389d6ad560594ba82b448aef1d70bbbf4 Mon Sep 17 00:00:00 2001 From: marc Date: Fri, 16 Dec 2016 22:14:14 -0600 Subject: minor fixes in utils/fetch_languages.py --- searx/engines/wikipedia.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 322e8d128..78acd349d 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -131,7 +131,8 @@ def _fetch_supported_languages(resp): name = td[2].xpath('./a')[0].text english_name = td[1].xpath('./a')[0].text articles = int(td[4].xpath('./a/b')[0].text.replace(',', '')) - if articles >= 10000: + # exclude languages with too few articles + if articles >= 100000: supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles} return supported_languages -- cgit v1.2.3