Diffstat (limited to 'searx/engines')
29 files changed, 136 insertions, 178 deletions
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index c2f9f3da4..f32b57202 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -113,8 +113,7 @@ def load_engine(engine_data):
                     iso_lang not in getattr(engine, 'supported_languages'):
                 language_aliases[iso_lang] = engine_lang
 
-    if language_aliases:
-        setattr(engine, 'language_aliases', language_aliases)
+    setattr(engine, 'language_aliases', language_aliases)
 
     # assign language fetching method if auxiliary method exists
     if hasattr(engine, '_fetch_supported_languages'):
diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py
index fc08112af..dce862f55 100644
--- a/searx/engines/archlinux.py
+++ b/searx/engines/archlinux.py
@@ -36,7 +36,7 @@ def locale_to_lang_code(locale):
 # wikis for some languages were moved off from the main site, we need to make
 # requests to correct URLs to be able to get results in those languages
 lang_urls = {
-    'en': {
+    'all': {
         'base': 'https://wiki.archlinux.org',
         'search': '/index.php?title=Special:Search&offset={offset}&{query}'
     },
@@ -67,7 +67,7 @@ lang_urls = {
 def get_lang_urls(language):
     if language in lang_urls:
         return lang_urls[language]
-    return lang_urls['en']
+    return lang_urls['all']
 
 
 # Language names to build search requests for
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index 2da40619d..742379c1a 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -34,7 +34,10 @@ search_string = 'search?{query}&first={offset}'
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
 
-    lang = match_language(params['language'], supported_languages, language_aliases)
+    if params['language'] == 'all':
+        lang = 'EN'
+    else:
+        lang = match_language(params['language'], supported_languages, language_aliases)
 
     query = u'language:{} {}'.format(lang.split('-')[0].upper(), query.decode('utf-8')).encode('utf-8')
 
diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
index 66e14c01f..e2495200c 100644
--- a/searx/engines/bing_images.py
+++ b/searx/engines/bing_images.py
@@ -55,7 +55,7 @@ def request(query, params):
         query=urlencode({'q': query}),
         offset=offset)
 
-    language = match_language(params['language'], supported_languages).lower()
+    language = match_language(params['language'], supported_languages, language_aliases).lower()
 
     params['cookies']['SRCHHPGUSR'] = \
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
@@ -88,9 +88,7 @@ def response(resp):
 
         url = json_data.get('purl')
         img_src = json_data.get('murl')
-
-        thumb_json_data = loads(_quote_keys_regex.sub(r'\1"\2": \3', link.attrib.get('mad')))
-        thumbnail = thumb_json_data.get('turl')
+        thumbnail = json_data.get('turl')
 
         # append result
         results.append({'template': 'images.html',
diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py
index 39048a1fc..669130c42 100644
--- a/searx/engines/bing_news.py
+++ b/searx/engines/bing_news.py
@@ -71,7 +71,10 @@ def request(query, params):
 
     offset = (params['pageno'] - 1) * 10 + 1
 
-    language = match_language(params['language'], supported_languages, language_aliases)
+    if params['language'] == 'all':
+        language = 'en-US'
+    else:
+        language = match_language(params['language'], supported_languages, language_aliases)
 
     params['url'] = _get_url(query, language, offset, params['time_range'])
 
diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py
index 7002ac861..bf17f9168 100644
--- a/searx/engines/bing_videos.py
+++ b/searx/engines/bing_videos.py
@@ -48,7 +48,7 @@ def request(query, params):
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
 
     # language cookie
-    language = match_language(params['language'], supported_languages).lower()
+    language = match_language(params['language'], supported_languages, language_aliases).lower()
     params['cookies']['_EDGE_S'] = 'mkt=' + language + '&F=1'
 
     # query and paging
diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py
index 8268b6257..06a9c41f3 100644
--- a/searx/engines/dailymotion.py
+++ b/searx/engines/dailymotion.py
@@ -33,7 +33,10 @@ supported_languages_url = 'https://api.dailymotion.com/languages'
 
 # do search-request
 def request(query, params):
-    locale = match_language(params['language'], supported_languages)
+    if params['language'] == 'all':
+        locale = 'en-US'
+    else:
+        locale = match_language(params['language'], supported_languages)
 
     params['url'] = search_url.format(
         query=urlencode({'search': query, 'localization': locale}),
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index 2c5dc50d8..fb8f523ac 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -54,6 +54,9 @@ content_xpath = './/a[@class="result__snippet"]'
 
 # match query's language to a region code that duckduckgo will accept
 def get_region_code(lang, lang_list=[]):
+    if lang == 'all':
+        return None
+
     lang_code = match_language(lang, lang_list, language_aliases, 'wt-WT')
     lang_parts = lang_code.split('-')
 
@@ -61,7 +64,6 @@ def get_region_code(lang, lang_list=[]):
     return lang_parts[1].lower() + '-' + lang_parts[0].lower()
 
 
-# do search-request
 def request(query, params):
     if params['time_range'] and params['time_range'] not in time_range_dict:
         return params
@@ -69,8 +71,12 @@ def request(query, params):
     offset = (params['pageno'] - 1) * 30
 
     region_code = get_region_code(params['language'], supported_languages)
-    params['url'] = url.format(
-        query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
+    if region_code:
+        params['url'] = url.format(
+            query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
+    else:
+        params['url'] = url.format(
+            query=urlencode({'q': query}), offset=offset, dc_param=offset)
 
     if params['time_range'] in time_range_dict:
         params['url'] += time_range_url.format(range=time_range_dict[params['time_range']])
diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py
index 7905d0bcd..8c46ec92d 100644
--- a/searx/engines/duckduckgo_images.py
+++ b/searx/engines/duckduckgo_images.py
@@ -56,8 +56,12 @@ def request(query, params):
     safesearch = params['safesearch'] - 1
 
     region_code = get_region_code(params['language'], lang_list=supported_languages)
-    params['url'] = images_url.format(
-        query=urlencode({'q': query, 'l': region_code}), offset=offset, safesearch=safesearch, vqd=vqd)
+    if region_code:
+        params['url'] = images_url.format(
+            query=urlencode({'q': query, 'l': region_code}), offset=offset, safesearch=safesearch, vqd=vqd)
+    else:
+        params['url'] = images_url.format(
+            query=urlencode({'q': query}), offset=offset, safesearch=safesearch, vqd=vqd)
 
     return params
 
diff --git a/searx/engines/faroo.py b/searx/engines/faroo.py
index 4e8b56743..a36ecf778 100644
--- a/searx/engines/faroo.py
+++ b/searx/engines/faroo.py
@@ -40,7 +40,10 @@ def request(query, params):
     offset = (params['pageno'] - 1) * number_of_results + 1
     categorie = search_category.get(params['category'], 'web')
 
-    language = params['language'].split('-')[0]
+    if params['language'] == 'all':
+        language = 'en'
+    else:
+        language = params['language'].split('-')[0]
 
     # if language is not supported, put it in english
     if language != 'en' and\
diff --git a/searx/engines/findx.py b/searx/engines/findx.py
deleted file mode 100644
index 87c9d503c..000000000
--- a/searx/engines/findx.py
+++ /dev/null
@@ -1,115 +0,0 @@
-"""
-FindX (General, Images, Videos)
-
-@website     https://www.findx.com
-@provide-api no
-@using-api   no
-@results     HTML
-@stable      no
-@parse       url, title, content, embedded, img_src, thumbnail_src
-"""
-
-from dateutil import parser
-from json import loads
-import re
-
-from lxml import html
-
-from searx import logger
-from searx.engines.xpath import extract_text
-from searx.engines.youtube_noapi import base_youtube_url, embedded_url
-from searx.url_utils import urlencode
-
-
-paging = True
-results_xpath = '//script[@id="initial-state"]'
-search_url = 'https://www.findx.com/{category}?{q}'
-type_map = {
-    'none': 'web',
-    'general': 'web',
-    'images': 'images',
-    'videos': 'videos',
-}
-
-
-def request(query, params):
-    params['url'] = search_url.format(
-        category=type_map[params['category']],
-        q=urlencode({
-            'q': query,
-            'page': params['pageno']
-        })
-    )
-    return params
-
-
-def response(resp):
-    dom = html.fromstring(resp.text)
-    results_raw_json = dom.xpath(results_xpath)
-    results_json = loads(extract_text(results_raw_json))
-
-    if len(results_json['web']['results']) > 0:
-        return _general_results(results_json['web']['results']['webSearch']['results'])
-
-    if len(results_json['images']['results']) > 0:
-        return _images_results(results_json['images']['results'])
-
-    if len(results_json['video']['results']) > 0:
-        return _videos_results(results_json['video']['results'])
-
-    return []
-
-
-def _general_results(general_results):
-    results = []
-    for result in general_results:
-        results.append({
-            'url': result['url'],
-            'title': result['title'],
-            'content': result['sum'],
-        })
-    return results
-
-
-def _images_results(image_results):
-    results = []
-    for result in image_results:
-        results.append({
-            'url': result['sourceURL'],
-            'title': result['title'],
-            'content': result['source'],
-            'thumbnail_src': _extract_url(result['assets']['thumb']['url']),
-            'img_src': _extract_url(result['assets']['file']['url']),
-            'template': 'images.html',
-        })
-    return results
-
-
-def _videos_results(video_results):
-    results = []
-    for result in video_results:
-        if not result['kind'].startswith('youtube'):
-            logger.warn('Unknown video kind in findx: {}'.format(result['kind']))
-            continue
-
-        description = result['snippet']['description']
-        if len(description) > 300:
-            description = description[:300] + '...'
-
-        results.append({
-            'url': base_youtube_url + result['id'],
-            'title': result['snippet']['title'],
-            'content': description,
-            'thumbnail': _extract_url(result['snippet']['thumbnails']['default']['url']),
-            'publishedDate': parser.parse(result['snippet']['publishedAt']),
-            'embedded': embedded_url.format(videoid=result['id']),
-            'template': 'videos.html',
-        })
-    return results
-
-
-def _extract_url(url):
-    matching = re.search('(/https?://[^)]+)', url)
-    if matching:
-        return matching.group(0)[1:]
-    return ''
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index 9b9b9bd77..a6aa5d718 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -50,9 +50,12 @@ supported_languages_url = 'https://gigablast.com/search?&rxikd=1'
 def request(query, params):
     offset = (params['pageno'] - 1) * number_of_results
 
-    language = params['language'].replace('-', '_').lower()
-    if language.split('-')[0] != 'zh':
-        language = language.split('-')[0]
+    if params['language'] == 'all':
+        language = 'xx'
+    else:
+        language = params['language'].replace('-', '_').lower()
+        if language.split('-')[0] != 'zh':
+            language = language.split('-')[0]
 
     if params['safesearch'] >= 1:
         safesearch = 1
diff --git a/searx/engines/google.py b/searx/engines/google.py
index 62e7d1170..36ca7a116 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -166,7 +166,11 @@ def extract_text_from_dom(result, xpath):
 def request(query, params):
     offset = (params['pageno'] - 1) * 10
 
-    language = match_language(params['language'], supported_languages)
+    if params['language'] == 'all' or params['language'] == 'en-US':
+        language = 'en-GB'
+    else:
+        language = match_language(params['language'], supported_languages, language_aliases)
+
     language_array = language.split('-')
     if params['language'].find('-') > 0:
         country = params['language'].split('-')[1]
@@ -381,10 +385,10 @@ def attributes_to_html(attributes):
 def _fetch_supported_languages(resp):
     supported_languages = {}
     dom = html.fromstring(resp.text)
-    options = dom.xpath('//table//td/font/label/span')
+    options = dom.xpath('//*[@id="langSec"]//input[@name="lr"]')
     for option in options:
-        code = option.xpath('./@id')[0][1:]
-        name = option.text.title()
+        code = option.xpath('./@value')[0].split('_')[-1]
+        name = option.xpath('./@data-name')[0].title()
         supported_languages[code] = {"name": name}
 
     return supported_languages
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index aadcb76df..9c837b45b 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -51,9 +51,10 @@ def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       search_options=urlencode(search_options))
 
-    language = match_language(params['language'], supported_languages).split('-')[0]
-    if language:
-        params['url'] += '&lr=lang_' + language
+    if params['language'] != 'all':
+        language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
+        if language:
+            params['url'] += '&lr=lang_' + language
 
     return params
 
diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py
index 310b31490..9a41b2dfa 100644
--- a/searx/engines/google_videos.py
+++ b/searx/engines/google_videos.py
@@ -7,7 +7,7 @@
  @using-api   no
  @results     HTML
  @stable      no
- @parse       url, title, content
+ @parse       url, title, content, thumbnail
 """
 
 from datetime import date, timedelta
@@ -15,7 +15,7 @@ from json import loads
 from lxml import html
 from searx.engines.xpath import extract_text
 from searx.url_utils import urlencode
-
+import re
 
 # engine dependent config
 categories = ['videos']
@@ -25,7 +25,7 @@ time_range_support = True
 number_of_results = 10
 
 search_url = 'https://www.google.com/search'\
-    '?{query}'\
+    '?q={query}'\
    '&tbm=vid'\
    '&{search_options}'
 time_range_attr = "qdr:{range}"
@@ -69,15 +69,27 @@ def response(resp):
 
     # parse results
     for result in dom.xpath('//div[@class="g"]'):
-        title = extract_text(result.xpath('.//h3/a'))
-        url = result.xpath('.//h3/a/@href')[0]
+        title = extract_text(result.xpath('.//h3'))
+        url = result.xpath('.//div[@class="r"]/a/@href')[0]
         content = extract_text(result.xpath('.//span[@class="st"]'))
 
+        # get thumbnails
+        script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)
+        id = result.xpath('.//div[@class="s"]//img/@id')[0]
+        thumbnails_data = re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + id,
+                                     script)
+        tmp = []
+        if len(thumbnails_data) != 0:
+            tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
+        thumbnail = ''
+        if len(tmp) != 0:
+            thumbnail = tmp[-1]
+
         # append result
         results.append({'url': url,
                         'title': title,
                         'content': content,
-                        'thumbnail': '',
+                        'thumbnail': thumbnail,
                         'template': 'videos.html'})
 
     return results
diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py
index c7b05ffcd..0607ac93b 100644
--- a/searx/engines/mediawiki.py
+++ b/searx/engines/mediawiki.py
@@ -45,7 +45,10 @@ def request(query, params):
 
     format_strings = list(Formatter().parse(base_url))
 
-    language = params['language'].split('-')[0]
+    if params['language'] == 'all':
+        language = 'en'
+    else:
+        language = params['language'].split('-')[0]
 
     # format_string [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)]
     if any(x[1] == 'language' for x in format_strings):
diff --git a/searx/engines/photon.py b/searx/engines/photon.py
index 240841954..15236f680 100644
--- a/searx/engines/photon.py
+++ b/searx/engines/photon.py
@@ -35,9 +35,10 @@ def request(query, params):
       search_string.format(query=urlencode({'q': query}),
                            limit=number_of_results)
 
-    language = params['language'].split('-')[0]
-    if language in supported_languages:
-        params['url'] = params['url'] + "&lang=" + language
+    if params['language'] != 'all':
+        language = params['language'].split('_')[0]
+        if language in supported_languages:
+            params['url'] = params['url'] + "&lang=" + language
 
     # using searx User-Agent
     params['headers']['User-Agent'] = searx_useragent()
diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py
index 4b0f1c87c..de12955c6 100644
--- a/searx/engines/qwant.py
+++ b/searx/engines/qwant.py
@@ -46,8 +46,9 @@ def request(query, params):
                                       offset=offset)
 
     # add language tag
-    language = match_language(params['language'], supported_languages)
-    params['url'] += '&locale=' + language.replace('-', '_').lower()
+    if params['language'] != 'all':
+        language = match_language(params['language'], supported_languages, language_aliases)
+        params['url'] += '&locale=' + language.replace('-', '_').lower()
 
     return params
 
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 3e067597e..6638f3d83 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -32,8 +32,9 @@ search_url = base_url + 'do/search'
 # specific xpath variables
 # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
 # not ads: div[@class="result"] are the direct childs of div[@id="results"]
-results_xpath = '//div[@class="result"]'
+results_xpath = '//li[contains(@class, "search-result") and contains(@class, "search-item")]'
 link_xpath = './/h3/a'
+content_xpath = './p[@class="search-item__body"]'
 
 
 # do search-request
@@ -45,8 +46,9 @@ def request(query, params):
     params['data'] = {'query': query,
                       'startat': offset}
 
-    # set language
-    params['data']['with_language'] = ('lang_' + params['language'].split('-')[0])
+    # set language if specified
+    if params['language'] != 'all':
+        params['data']['with_language'] = ('lang_' + params['language'].split('-')[0])
 
     return params
 
@@ -73,14 +75,10 @@ def response(resp):
         if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
             continue
 
-        # block ixquick search url's
-        if re.match(r"^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url):
-            continue
-
         title = extract_text(link)
 
-        if result.xpath('./p[@class="desc clk"]'):
-            content = extract_text(result.xpath('./p[@class="desc clk"]'))
+        if result.xpath(content_xpath):
+            content = extract_text(result.xpath(content_xpath))
         else:
             content = ''
 
diff --git a/searx/engines/subtitleseeker.py b/searx/engines/subtitleseeker.py
index 118504ffd..2cbc991b3 100644
--- a/searx/engines/subtitleseeker.py
+++ b/searx/engines/subtitleseeker.py
@@ -48,7 +48,7 @@ def response(resp):
         search_lang = 'Farsi'
     elif resp.search_params['language'] == 'pt-BR':
         search_lang = 'Brazilian'
-    else:
+    elif resp.search_params['language'] != 'all':
         search_lang = [lc[3] for lc in language_codes
                        if lc[0].split('-')[0] == resp.search_params['language'].split('-')[0]]
 
diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py
index ff4df24b7..e451bcffc 100644
--- a/searx/engines/swisscows.py
+++ b/searx/engines/swisscows.py
@@ -36,8 +36,12 @@ regex_img_url_remove_start = re.compile(b'^https?://i\.swisscows\.ch/\?link=')
 
 # do search-request
 def request(query, params):
-    region = match_language(params['language'], supported_languages)
-    ui_language = region.split('-')[0]
+    if params['language'] == 'all':
+        ui_language = 'browser'
+        region = 'browser'
+    else:
+        region = match_language(params['language'], supported_languages, language_aliases)
+        ui_language = region.split('-')[0]
 
     search_path = search_string.format(
         query=urlencode({'query': query, 'uiLanguage': ui_language, 'region': region}),
diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py
index 8641167dc..d2a8d2088 100644
--- a/searx/engines/twitter.py
+++ b/searx/engines/twitter.py
@@ -37,7 +37,12 @@ timestamp_xpath = './/span[contains(@class,"_timestamp")]'
 # do search-request
 def request(query, params):
     params['url'] = search_url + urlencode({'q': query})
-    params['cookies']['lang'] = params['language'].split('-')[0]
+
+    # set language if specified
+    if params['language'] != 'all':
+        params['cookies']['lang'] = params['language'].split('-')[0]
+    else:
+        params['cookies']['lang'] = 'en'
 
     return params
 
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index c315b30da..03a58a31a 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -68,7 +68,10 @@ def response(resp):
     html = fromstring(resp.text)
     search_results = html.xpath(wikidata_ids_xpath)
 
-    language = match_language(resp.search_params['language'], supported_languages).split('-')[0]
+    if resp.search_params['language'].split('-')[0] == 'all':
+        language = 'en'
+    else:
+        language = match_language(resp.search_params['language'], supported_languages, language_aliases).split('-')[0]
 
     # TODO: make requests asynchronous to avoid timeout when result_count > 1
     for search_result in search_results[:result_count]:
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index 6cd17e378..4dae735d1 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -31,7 +31,10 @@ supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
 
 # set language in base_url
 def url_lang(lang):
-    return match_language(lang, supported_languages).split('-')[0]
+    lang_pre = lang.split('-')[0]
+    if lang_pre == 'all' or lang_pre not in supported_languages and lang_pre not in language_aliases:
+        return 'en'
+    return match_language(lang, supported_languages, language_aliases).split('-')[0]
 
 
 # do search-request
diff --git a/searx/engines/wolframalpha_api.py b/searx/engines/wolframalpha_api.py
index 595c6b7de..1c58c4a9b 100644
--- a/searx/engines/wolframalpha_api.py
+++ b/searx/engines/wolframalpha_api.py
@@ -65,7 +65,7 @@ def replace_pua_chars(text):
 def response(resp):
     results = []
 
-    search_results = etree.XML(resp.text)
+    search_results = etree.XML(resp.content)
 
     # return empty array if there are no results
     if search_results.xpath(failure_xpath):
diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py
index 7c307ce53..25bc83687 100644
--- a/searx/engines/yacy.py
+++ b/searx/engines/yacy.py
@@ -51,7 +51,9 @@ def request(query, params):
                                      limit=number_of_results,
                                      search_type=search_type)
 
-    params['url'] += '&lr=lang_' + params['language'].split('-')[0]
+    # add language tag if specified
+    if params['language'] != 'all':
+        params['url'] += '&lr=lang_' + params['language'].split('-')[0]
 
     return params
 
diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py
index ba4cb6af8..73b78bcf7 100644
--- a/searx/engines/yahoo.py
+++ b/searx/engines/yahoo.py
@@ -73,16 +73,25 @@ def _get_url(query, offset, language, time_range):
                              lang=language)
 
 
+def _get_language(params):
+    if params['language'] == 'all':
+        return 'en'
+
+    language = match_language(params['language'], supported_languages, language_aliases)
+    if language not in language_aliases.values():
+        language = language.split('-')[0]
+    language = language.replace('-', '_').lower()
+
+    return language
+
+
 # do search-request
 def request(query, params):
     if params['time_range'] and params['time_range'] not in time_range_dict:
         return params
 
     offset = (params['pageno'] - 1) * 10 + 1
-    language = match_language(params['language'], supported_languages, language_aliases)
-    if language not in language_aliases.values():
-        language = language.split('-')[0]
-    language = language.replace('-', '_').lower()
+
+    language = _get_language(params)
 
     params['url'] = _get_url(query, offset, language, params['time_range'])
 
diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py
index b61384d06..9f6a4159b 100644
--- a/searx/engines/yahoo_news.py
+++ b/searx/engines/yahoo_news.py
@@ -41,7 +41,10 @@ suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a'
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
 
-    language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
+    if params['language'] == 'all':
+        language = 'en'
+    else:
+        language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
 
     params['url'] = search_url.format(offset=offset,
                                       query=urlencode({'p': query}),
diff --git a/searx/engines/youtube_api.py b/searx/engines/youtube_api.py
index f8bc353f4..6de18aa2c 100644
--- a/searx/engines/youtube_api.py
+++ b/searx/engines/youtube_api.py
@@ -34,7 +34,9 @@ def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       api_key=api_key)
 
-    params['url'] += '&relevanceLanguage=' + params['language'].split('-')[0]
+    # add language tag if specified
+    if params['language'] != 'all':
+        params['url'] += '&relevanceLanguage=' + params['language'].split('-')[0]
 
     return params
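
Nearly every hunk above applies the same guard for the 'all' language option: either skip the engine's language parameter entirely (qwant, startpage, yacy, youtube_api, google_news, photon) or substitute a concrete default before calling match_language (bing, bing_news, dailymotion, gigablast, wikidata, yahoo, yahoo_news). A minimal sketch of that shared pattern follows, assuming the match_language helper from searx.utils that the diff uses throughout; the pick_language name and default argument are illustrative, not part of the patch:

    # Hypothetical helper showing the guard this changeset repeats in each
    # engine's request(); not code from the patch itself.
    from searx.utils import match_language  # helper used throughout the diff

    def pick_language(params, supported_languages, language_aliases, default='en'):
        # 'all' means "no language preference": fall back to a default
        # instead of asking match_language to resolve a non-existent code.
        if params['language'] == 'all':
            return default
        return match_language(params['language'], supported_languages, language_aliases)

Engines whose backend tolerates omitting the language altogether instead guard the URL parameter, as in the yacy and youtube_api hunks:

    # if params['language'] != 'all':
    #     params['url'] += '&lr=lang_' + params['language'].split('-')[0]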