diff options
Diffstat (limited to 'searx/engines')
69 files changed, 1988 insertions, 1320 deletions
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 7a9cc56a2..48c02e2e7 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -19,13 +19,15 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >. import sys import threading from os.path import realpath, dirname +from io import open +from babel.localedata import locale_identifiers from flask_babel import gettext from operator import itemgetter from json import loads from requests import get from searx import settings from searx import logger -from searx.utils import load_module +from searx.utils import load_module, match_language, get_engine_from_settings logger = logger.getChild('engines') @@ -36,7 +38,9 @@ engines = {} categories = {'general': []} -languages = loads(open(engine_dir + '/../data/engines_languages.json').read()) +languages = loads(open(engine_dir + '/../data/engines_languages.json', 'r', encoding='utf-8').read()) +babel_langs = [lang_parts[0] + '-' + lang_parts[-1] if len(lang_parts) > 1 else lang_parts[0] + for lang_parts in (lang_code.split('_') for lang_code in locale_identifiers())] engine_shortcuts = {} engine_default_args = {'paging': False, @@ -49,15 +53,23 @@ engine_default_args = {'paging': False, 'disabled': False, 'suspend_end_time': 0, 'continuous_errors': 0, - 'time_range_support': False} + 'time_range_support': False, + 'offline': False, + 'display_error_messages': True, + 'tokens': []} def load_engine(engine_data): - - if '_' in engine_data['name']: - logger.error('Engine name conains underscore: "{}"'.format(engine_data['name'])) + engine_name = engine_data['name'] + if '_' in engine_name: + logger.error('Engine name contains underscore: "{}"'.format(engine_name)) sys.exit(1) + if engine_name.lower() != engine_name: + logger.warn('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name)) + engine_name = engine_name.lower() + engine_data['name'] = engine_name + engine_module = engine_data['engine'] try: @@ 
-96,6 +108,21 @@ def load_engine(engine_data): if engine_data['name'] in languages: setattr(engine, 'supported_languages', languages[engine_data['name']]) + # find custom aliases for non standard language codes + if hasattr(engine, 'supported_languages'): + if hasattr(engine, 'language_aliases'): + language_aliases = getattr(engine, 'language_aliases') + else: + language_aliases = {} + + for engine_lang in getattr(engine, 'supported_languages'): + iso_lang = match_language(engine_lang, babel_langs, fallback=None) + if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \ + iso_lang not in getattr(engine, 'supported_languages'): + language_aliases[iso_lang] = engine_lang + + setattr(engine, 'language_aliases', language_aliases) + # assign language fetching method if auxiliary method exists if hasattr(engine, '_fetch_supported_languages'): setattr(engine, 'fetch_supported_languages', @@ -104,14 +131,16 @@ def load_engine(engine_data): engine.stats = { 'result_count': 0, 'search_count': 0, - 'page_load_time': 0, - 'page_load_count': 0, 'engine_time': 0, 'engine_time_count': 0, 'score_count': 0, 'errors': 0 } + if not engine.offline: + engine.stats['page_load_time'] = 0 + engine.stats['page_load_count'] = 0 + for category_name in engine.categories: categories.setdefault(category_name, []).append(engine) @@ -133,7 +162,7 @@ def to_percentage(stats, maxvalue): return stats -def get_engines_stats(): +def get_engines_stats(preferences): # TODO refactor pageloads = [] engine_times = [] @@ -144,16 +173,15 @@ def get_engines_stats(): max_pageload = max_engine_times = max_results = max_score = max_errors = max_score_per_result = 0 # noqa for engine in engines.values(): + if not preferences.validate_token(engine): + continue + if engine.stats['search_count'] == 0: continue + results_num = \ engine.stats['result_count'] / float(engine.stats['search_count']) - if engine.stats['page_load_count'] != 0: - load_times = engine.stats['page_load_time'] / 
float(engine.stats['page_load_count']) # noqa - else: - load_times = 0 - if engine.stats['engine_time_count'] != 0: this_engine_time = engine.stats['engine_time'] / float(engine.stats['engine_time_count']) # noqa else: @@ -165,14 +193,19 @@ def get_engines_stats(): else: score = score_per_result = 0.0 - max_pageload = max(load_times, max_pageload) + if not engine.offline: + load_times = 0 + if engine.stats['page_load_count'] != 0: + load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count']) # noqa + max_pageload = max(load_times, max_pageload) + pageloads.append({'avg': load_times, 'name': engine.name}) + max_engine_times = max(this_engine_time, max_engine_times) max_results = max(results_num, max_results) max_score = max(score, max_score) max_score_per_result = max(score_per_result, max_score_per_result) max_errors = max(max_errors, engine.stats['errors']) - pageloads.append({'avg': load_times, 'name': engine.name}) engine_times.append({'avg': this_engine_time, 'name': engine.name}) results.append({'avg': results_num, 'name': engine.name}) scores.append({'avg': score, 'name': engine.name}) @@ -229,12 +262,14 @@ def load_engines(engine_list): def initialize_engines(engine_list): load_engines(engine_list) - for engine in engines.items(): - if hasattr(engine, 'init'): - init_fn = getattr(engine, engine_attr) - def engine_init(): - init_fn() - logger.debug('%s engine initialized', engine_data['name']) - logger.debug('Starting background initialization of %s engine', engine_data['name']) - threading.Thread(target=engine_init).start() + def engine_init(engine_name, init_fn): + init_fn(get_engine_from_settings(engine_name)) + logger.debug('%s engine: Initialized', engine_name) + + for engine_name, engine in engines.items(): + if hasattr(engine, 'init'): + init_fn = getattr(engine, 'init') + if init_fn: + logger.debug('%s engine: Starting background initialization', engine_name) + threading.Thread(target=engine_init, args=(engine_name, 
init_fn)).start() diff --git a/searx/engines/acgsou.py b/searx/engines/acgsou.py new file mode 100644 index 000000000..cca28f0db --- /dev/null +++ b/searx/engines/acgsou.py @@ -0,0 +1,75 @@ +""" + Acgsou (Japanese Animation/Music/Comics Bittorrent tracker) + + @website https://www.acgsou.com/ + @provide-api no + @using-api no + @results HTML + @stable no (HTML can change) + @parse url, title, content, seed, leech, torrentfile +""" + +from lxml import html +from searx.engines.xpath import extract_text +from searx.url_utils import urlencode +from searx.utils import get_torrent_size, int_or_zero + +# engine dependent config +categories = ['files', 'images', 'videos', 'music'] +paging = True + +# search-url +base_url = 'http://www.acgsou.com/' +search_url = base_url + 'search.php?{query}&page={offset}' +# xpath queries +xpath_results = '//table[contains(@class, "list_style table_fixed")]//tr[not(th)]' +xpath_category = './/td[2]/a[1]' +xpath_title = './/td[3]/a[last()]' +xpath_torrent_links = './/td[3]/a' +xpath_filesize = './/td[4]/text()' + + +def request(query, params): + query = urlencode({'keyword': query}) + params['url'] = search_url.format(query=query, offset=params['pageno']) + return params + + +def response(resp): + results = [] + dom = html.fromstring(resp.text) + for result in dom.xpath(xpath_results): + # defaults + filesize = 0 + magnet_link = "magnet:?xt=urn:btih:{}&tr=http://tracker.acgsou.com:2710/announce" + torrent_link = "" + + try: + category = extract_text(result.xpath(xpath_category)[0]) + except: + pass + + page_a = result.xpath(xpath_title)[0] + title = extract_text(page_a) + href = base_url + page_a.attrib.get('href') + + magnet_link = magnet_link.format(page_a.attrib.get('href')[5:-5]) + + try: + filesize_info = result.xpath(xpath_filesize)[0] + filesize = filesize_info[:-2] + filesize_multiplier = filesize_info[-2:] + filesize = get_torrent_size(filesize, filesize_multiplier) + except: + pass + # I didn't add download/seed/leech count since 
as I figured out they are generated randomly everytime + content = u'Category: "{category}".' + content = content.format(category=category) + + results.append({'url': href, + 'title': title, + 'content': content, + 'filesize': filesize, + 'magnetlink': magnet_link, + 'template': 'torrent.html'}) + return results diff --git a/searx/engines/apkmirror.py b/searx/engines/apkmirror.py new file mode 100644 index 000000000..f2ee12b29 --- /dev/null +++ b/searx/engines/apkmirror.py @@ -0,0 +1,61 @@ +""" + APK Mirror + + @website https://www.apkmirror.com + + @using-api no + @results HTML + @stable no (HTML can change) + @parse url, title, thumbnail_src +""" + +from lxml import html +from searx.engines.xpath import extract_text +from searx.url_utils import urlencode + +# engine dependent config +categories = ['it'] +paging = True + +# I am not 100% certain about this, as apkmirror appears to be a wordpress site, +# which might support time_range searching. If you want to implement it, go ahead. +time_range_support = False + +# search-url +base_url = 'https://www.apkmirror.com' +search_url = base_url + '/?post_type=app_release&searchtype=apk&page={pageno}&{query}' + + +# do search-request +def request(query, params): + + params['url'] = search_url.format(pageno=params['pageno'], + query=urlencode({'s': query})) + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + # parse results + for result in dom.xpath('.//div[@id="content"]/div[@class="listWidget"]/div[@class="appRow"]'): + + link = result.xpath('.//h5/a')[0] + url = base_url + link.attrib.get('href') + '#downloads' + title = extract_text(link) + thumbnail_src = base_url + result.xpath('.//img')[0].attrib.get('src').replace('&w=32&h=32', '&w=64&h=64') + + res = { + 'url': url, + 'title': title, + 'thumbnail_src': thumbnail_src + } + + # append result + results.append(res) + + # return results + return results diff --git 
a/searx/engines/archlinux.py b/searx/engines/archlinux.py index cad06f8c6..dce862f55 100644 --- a/searx/engines/archlinux.py +++ b/searx/engines/archlinux.py @@ -26,7 +26,7 @@ xpath_results = '//ul[@class="mw-search-results"]/li' xpath_link = './/div[@class="mw-search-result-heading"]/a' -# cut 'en' from 'en_US', 'de' from 'de_CH', and so on +# cut 'en' from 'en-US', 'de' from 'de-CH', and so on def locale_to_lang_code(locale): if locale.find('-') >= 0: locale = locale.split('-')[0] @@ -99,13 +99,13 @@ supported_languages = dict(lang_urls, **main_langs) # do search-request def request(query, params): - # translate the locale (e.g. 'en_US') to language code ('en') + # translate the locale (e.g. 'en-US') to language code ('en') language = locale_to_lang_code(params['language']) # if our language is hosted on the main site, we need to add its name # to the query in order to narrow the results to that language if language in main_langs: - query += '(' + main_langs[language] + ')' + query += b' (' + main_langs[language] + b')' # prepare the request parameters query = urlencode({'search': query}) diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py new file mode 100644 index 000000000..e3c871d17 --- /dev/null +++ b/searx/engines/arxiv.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python + +""" + ArXiV (Scientific preprints) + @website https://arxiv.org + @provide-api yes (export.arxiv.org/api/query) + @using-api yes + @results XML-RSS + @stable yes + @parse url, title, publishedDate, content + More info on api: https://arxiv.org/help/api/user-manual +""" + +from lxml import html +from datetime import datetime +from searx.url_utils import urlencode + + +categories = ['science'] +paging = True + +base_url = 'http://export.arxiv.org/api/query?search_query=all:'\ + + '{query}&start={offset}&max_results={number_of_results}' + +# engine dependent config +number_of_results = 10 + + +def request(query, params): + # basic search + offset = (params['pageno'] - 1) * number_of_results 
+ + string_args = dict(query=query.decode('utf-8'), + offset=offset, + number_of_results=number_of_results) + + params['url'] = base_url.format(**string_args) + + return params + + +def response(resp): + results = [] + + dom = html.fromstring(resp.content) + search_results = dom.xpath('//entry') + + for entry in search_results: + title = entry.xpath('.//title')[0].text + + url = entry.xpath('.//id')[0].text + + content_string = '{doi_content}{abstract_content}' + + abstract = entry.xpath('.//summary')[0].text + + # If a doi is available, add it to the snipppet + try: + doi_content = entry.xpath('.//link[@title="doi"]')[0].text + content = content_string.format(doi_content=doi_content, abstract_content=abstract) + except: + content = content_string.format(doi_content="", abstract_content=abstract) + + if len(content) > 300: + content = content[0:300] + "..." + # TODO: center snippet on query term + + publishedDate = datetime.strptime(entry.xpath('.//published')[0].text, '%Y-%m-%dT%H:%M:%SZ') + + res_dict = {'url': url, + 'title': title, + 'publishedDate': publishedDate, + 'content': content} + + results.append(res_dict) + + return results diff --git a/searx/engines/base.py b/searx/engines/base.py index ff006a3bc..f1b1cf671 100755 --- a/searx/engines/base.py +++ b/searx/engines/base.py @@ -55,7 +55,7 @@ shorcut_dict = { def request(query, params): # replace shortcuts with API advanced search keywords for key in shorcut_dict.keys(): - query = re.sub(str(key), str(shorcut_dict[key]), query) + query = re.sub(key, shorcut_dict[key], str(query)) # basic search offset = (params['pageno'] - 1) * number_of_results @@ -73,7 +73,7 @@ def request(query, params): def response(resp): results = [] - search_results = etree.XML(resp.text) + search_results = etree.XML(resp.content) for entry in search_results.xpath('./result/doc'): content = "No description available" diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 052d567ea..afb776acd 100644 --- 
a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -13,58 +13,63 @@ @todo publishedDate """ +import re from lxml import html +from searx import logger, utils from searx.engines.xpath import extract_text from searx.url_utils import urlencode +from searx.utils import match_language, gen_useragent, eval_xpath + +logger = logger.getChild('bing engine') # engine dependent config categories = ['general'] paging = True language_support = True supported_languages_url = 'https://www.bing.com/account/general' +language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'} # search-url base_url = 'https://www.bing.com/' search_string = 'search?{query}&first={offset}' +def _get_offset_from_pageno(pageno): + return (pageno - 1) * 10 + 1 + + # do search-request def request(query, params): - offset = (params['pageno'] - 1) * 10 + 1 + offset = _get_offset_from_pageno(params.get('pageno', 0)) - if params['language'] != 'all': - lang = params['language'].split('-')[0].upper() - else: + if params['language'] == 'all': lang = 'EN' + else: + lang = match_language(params['language'], supported_languages, language_aliases) - query = u'language:{} {}'.format(lang, query.decode('utf-8')).encode('utf-8') + query = u'language:{} {}'.format(lang.split('-')[0].upper(), query.decode('utf-8')).encode('utf-8') search_path = search_string.format( query=urlencode({'q': query}), offset=offset) params['url'] = base_url + search_path + return params # get response from search-request def response(resp): results = [] + result_len = 0 dom = html.fromstring(resp.text) - - try: - results.append({'number_of_results': int(dom.xpath('//span[@class="sb_count"]/text()')[0] - .split()[0].replace(',', ''))}) - except: - pass - # parse results - for result in dom.xpath('//div[@class="sa_cc"]'): - link = result.xpath('.//h3/a')[0] + for result in eval_xpath(dom, '//div[@class="sa_cc"]'): + link = eval_xpath(result, './/h3/a')[0] url = link.attrib.get('href') title = extract_text(link) - content = 
extract_text(result.xpath('.//p')) + content = extract_text(eval_xpath(result, './/p')) # append result results.append({'url': url, @@ -72,30 +77,51 @@ def response(resp): 'content': content}) # parse results again if nothing is found yet - for result in dom.xpath('//li[@class="b_algo"]'): - link = result.xpath('.//h2/a')[0] + for result in eval_xpath(dom, '//li[@class="b_algo"]'): + link = eval_xpath(result, './/h2/a')[0] url = link.attrib.get('href') title = extract_text(link) - content = extract_text(result.xpath('.//p')) + content = extract_text(eval_xpath(result, './/p')) # append result results.append({'url': url, 'title': title, 'content': content}) - # return results + try: + result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()')) + if "-" in result_len_container: + # Remove the part "from-to" for paginated request ... + result_len_container = result_len_container[result_len_container.find("-") * 2 + 2:] + + result_len_container = re.sub('[^0-9]', '', result_len_container) + if len(result_len_container) > 0: + result_len = int(result_len_container) + except Exception as e: + logger.debug('result error :\n%s', e) + pass + + if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len: + return [] + + results.append({'number_of_results': result_len}) return results # get supported languages from their site def _fetch_supported_languages(resp): - supported_languages = [] + lang_tags = set() + + setmkt = re.compile('setmkt=([^&]*)') dom = html.fromstring(resp.text) - options = dom.xpath('//div[@id="limit-languages"]//input') - for option in options: - code = option.xpath('./@id')[0].replace('_', '-') - if code == 'nb': - code = 'no' - supported_languages.append(code) - - return supported_languages + lang_links = eval_xpath(dom, "//li/a[contains(@href, 'setmkt')]") + + for a in lang_links: + href = eval_xpath(a, './@href')[0] + match = setmkt.search(href) + l_tag = match.groups()[0] + _lang, _nation = 
l_tag.split('-', 1) + l_tag = _lang.lower() + '-' + _nation.upper() + lang_tags.add(l_tag) + + return list(lang_tags) diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py index 6300c94e4..138ed11c6 100644 --- a/searx/engines/bing_images.py +++ b/searx/engines/bing_images.py @@ -10,26 +10,32 @@ @stable no (HTML can change) @parse url, title, img_src - @todo currently there are up to 35 images receive per page, - because bing does not parse count=10. - limited response to 10 images """ from lxml import html from json import loads import re -from searx.engines.bing import _fetch_supported_languages, supported_languages_url from searx.url_utils import urlencode +from searx.utils import match_language + +from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases # engine dependent config categories = ['images'] paging = True safesearch = True time_range_support = True +language_support = True +supported_languages_url = 'https://www.bing.com/account/general' +number_of_results = 28 # search-url base_url = 'https://www.bing.com/' -search_string = 'images/search?{query}&count=10&first={offset}' +search_string = 'images/search'\ + '?{query}'\ + '&count={count}'\ + '&first={first}'\ + '&FORM=IBASEP' time_range_string = '&qft=+filterui:age-lt{interval}' time_range_dict = {'day': '1440', 'week': '10080', @@ -42,26 +48,22 @@ safesearch_types = {2: 'STRICT', 0: 'OFF'} -_quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U) - - # do search-request def request(query, params): - offset = (params['pageno'] - 1) * 10 + 1 - - # required for cookie - if params['language'] == 'all': - language = 'en-US' - else: - language = params['language'] + offset = ((params['pageno'] - 1) * number_of_results) + 1 search_path = search_string.format( query=urlencode({'q': query}), - offset=offset) + count=number_of_results, + first=offset) + + language = match_language(params['language'], supported_languages, 
language_aliases).lower() params['cookies']['SRCHHPGUSR'] = \ - 'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] +\ - '&ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') + 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') + + params['cookies']['_EDGE_S'] = 'mkt=' + language +\ + '&ui=' + language + '&F=1' params['url'] = base_url + search_path if params['time_range'] in time_range_dict: @@ -77,32 +79,29 @@ def response(resp): dom = html.fromstring(resp.text) # parse results - for result in dom.xpath('//div[@id="mmComponent_images_1"]/ul/li/div/div[@class="imgpt"]'): - link = result.xpath('./a')[0] - - # TODO find actual title - title = link.xpath('.//img/@alt')[0] - - # parse json-data (it is required to add a space, to make it parsable) - json_data = loads(_quote_keys_regex.sub(r'\1"\2": \3', link.attrib.get('m'))) - - url = json_data.get('purl') - img_src = json_data.get('murl') - - thumb_json_data = loads(_quote_keys_regex.sub(r'\1"\2": \3', link.attrib.get('mad'))) - thumbnail = thumb_json_data.get('turl') - - # append result - results.append({'template': 'images.html', - 'url': url, - 'title': title, - 'content': '', - 'thumbnail_src': thumbnail, - 'img_src': img_src}) - - # TODO stop parsing if 10 images are found - # if len(results) >= 10: - # break + for result in dom.xpath('//div[@class="imgpt"]'): + + img_format = result.xpath('./div[contains(@class, "img_info")]/span/text()')[0] + # Microsoft seems to experiment with this code so don't make the path too specific, + # just catch the text section for the first anchor in img_info assuming this to be + # the originating site. + source = result.xpath('./div[contains(@class, "img_info")]//a/text()')[0] + + try: + m = loads(result.xpath('./a/@m')[0]) + + # strip 'Unicode private use area' highlighting, they render to Tux + # the Linux penguin and a standing diamond on my machine... 
+ title = m.get('t', '').replace(u'\ue000', '').replace(u'\ue001', '') + results.append({'template': 'images.html', + 'url': m['purl'], + 'thumbnail_src': m['turl'], + 'img_src': m['murl'], + 'content': '', + 'title': title, + 'source': source, + 'img_format': img_format}) + except: + continue - # return results return results diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index b999b2a39..d13be777c 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -14,10 +14,11 @@ from datetime import datetime from dateutil import parser from lxml import etree -from searx.utils import list_get -from searx.engines.bing import _fetch_supported_languages, supported_languages_url +from searx.utils import list_get, match_language from searx.url_utils import urlencode, urlparse, parse_qsl +from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases + # engine dependent config categories = ['news'] paging = True @@ -58,6 +59,7 @@ def _get_url(query, language, offset, time_range): offset=offset, interval=time_range_dict[time_range]) else: + # e.g. 
setmkt=de-de&setlang=de search_path = search_string.format( query=urlencode({'q': query, 'setmkt': language}), offset=offset) @@ -74,7 +76,7 @@ def request(query, params): if params['language'] == 'all': language = 'en-US' else: - language = params['language'] + language = match_language(params['language'], supported_languages, language_aliases) params['url'] = _get_url(query, language, offset, params['time_range']) diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py new file mode 100644 index 000000000..f048f0d8e --- /dev/null +++ b/searx/engines/bing_videos.py @@ -0,0 +1,98 @@ +""" + Bing (Videos) + + @website https://www.bing.com/videos + @provide-api yes (http://datamarket.azure.com/dataset/bing/search) + + @using-api no + @results HTML + @stable no + @parse url, title, content, thumbnail +""" + +from json import loads +from lxml import html +from searx.url_utils import urlencode +from searx.utils import match_language + +from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases + +categories = ['videos'] +paging = True +safesearch = True +time_range_support = True +number_of_results = 28 +language_support = True + +base_url = 'https://www.bing.com/' +search_string = 'videos/search'\ + '?{query}'\ + '&count={count}'\ + '&first={first}'\ + '&scope=video'\ + '&FORM=QBLH' +time_range_string = '&qft=+filterui:videoage-lt{interval}' +time_range_dict = {'day': '1440', + 'week': '10080', + 'month': '43200', + 'year': '525600'} + +# safesearch definitions +safesearch_types = {2: 'STRICT', + 1: 'DEMOTE', + 0: 'OFF'} + + +# do search-request +def request(query, params): + offset = ((params['pageno'] - 1) * number_of_results) + 1 + + search_path = search_string.format( + query=urlencode({'q': query}), + count=number_of_results, + first=offset) + + # safesearch cookie + params['cookies']['SRCHHPGUSR'] = \ + 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') + + # language cookie + language = 
match_language(params['language'], supported_languages, language_aliases).lower() + params['cookies']['_EDGE_S'] = 'mkt=' + language + '&F=1' + + # query and paging + params['url'] = base_url + search_path + + # time range + if params['time_range'] in time_range_dict: + params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']]) + + # bing videos did not like "older" versions < 70.0.1 when selectin other + # languages then 'en' .. very strange ?!?! + params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:73.0.1) Gecko/20100101 Firefox/73.0.1' + + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + for result in dom.xpath('//div[@class="dg_u"]'): + try: + metadata = loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0]) + info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip() + content = '{0} - {1}'.format(metadata['du'], info) + thumbnail = '{0}th?id={1}'.format(base_url, metadata['thid']) + results.append({'url': metadata['murl'], + 'thumbnail': thumbnail, + 'title': metadata.get('vt', ''), + 'content': content, + 'template': 'videos.html'}) + + except: + continue + + return results diff --git a/searx/engines/blekko_images.py b/searx/engines/blekko_images.py deleted file mode 100644 index f71645634..000000000 --- a/searx/engines/blekko_images.py +++ /dev/null @@ -1,70 +0,0 @@ -""" - Blekko (Images) - - @website https://blekko.com - @provide-api yes (inofficial) - - @using-api yes - @results JSON - @stable yes - @parse url, title, img_src -""" - -from json import loads -from searx.url_utils import urlencode - -# engine dependent config -categories = ['images'] -paging = True -safesearch = True - -# search-url -base_url = 'https://blekko.com' -search_url = '/api/images?{query}&c={c}' - -# safesearch definitions -safesearch_types = {2: '1', - 1: '', - 0: '0'} - - -# do search-request -def request(query, 
params): - c = (params['pageno'] - 1) * 48 - - params['url'] = base_url +\ - search_url.format(query=urlencode({'q': query}), - c=c) - - if params['pageno'] != 1: - params['url'] += '&page={pageno}'.format(pageno=(params['pageno'] - 1)) - - # let Blekko know we wan't have profiling - params['cookies']['tag_lesslogging'] = '1' - - # parse safesearch argument - params['cookies']['safesearch'] = safesearch_types.get(params['safesearch'], '') - - return params - - -# get response from search-request -def response(resp): - results = [] - - search_results = loads(resp.text) - - # return empty array if there are no results - if not search_results: - return [] - - for result in search_results: - # append result - results.append({'url': result['page_url'], - 'title': result['title'], - 'content': '', - 'img_src': result['url'], - 'template': 'images.html'}) - - # return results - return results diff --git a/searx/engines/btdigg.py b/searx/engines/btdigg.py index 40438673f..82eedc24b 100644 --- a/searx/engines/btdigg.py +++ b/searx/engines/btdigg.py @@ -1,7 +1,7 @@ """ BTDigg (Videos, Music, Files) - @website https://btdigg.org + @website https://btdig.com @provide-api yes (on demand) @using-api no @@ -21,7 +21,7 @@ categories = ['videos', 'music', 'files'] paging = True # search-url -url = 'https://btdigg.org' +url = 'https://btdig.com' search_url = url + '/search?q={search_term}&p={pageno}' @@ -39,7 +39,7 @@ def response(resp): dom = html.fromstring(resp.text) - search_res = dom.xpath('//div[@id="search_res"]/table/tr') + search_res = dom.xpath('//div[@class="one_result"]') # return empty array if nothing is found if not search_res: @@ -47,46 +47,39 @@ def response(resp): # parse results for result in search_res: - link = result.xpath('.//td[@class="torrent_name"]//a')[0] + link = result.xpath('.//div[@class="torrent_name"]//a')[0] href = urljoin(url, link.attrib.get('href')) title = extract_text(link) - content = extract_text(result.xpath('.//pre[@class="snippet"]')[0]) - 
content = "<br />".join(content.split("\n")) - filesize = result.xpath('.//span[@class="attr_val"]/text()')[0].split()[0] - filesize_multiplier = result.xpath('.//span[@class="attr_val"]/text()')[0].split()[1] - files = result.xpath('.//span[@class="attr_val"]/text()')[1] - seed = result.xpath('.//span[@class="attr_val"]/text()')[2] + excerpt = result.xpath('.//div[@class="torrent_excerpt"]')[0] + content = html.tostring(excerpt, encoding='unicode', method='text', with_tail=False) + # it is better to emit <br/> instead of |, but html tags are verboten + content = content.strip().replace('\n', ' | ') + content = ' '.join(content.split()) - # convert seed to int if possible - if seed.isdigit(): - seed = int(seed) - else: - seed = 0 - - leech = 0 + filesize = result.xpath('.//span[@class="torrent_size"]/text()')[0].split()[0] + filesize_multiplier = result.xpath('.//span[@class="torrent_size"]/text()')[0].split()[1] + files = (result.xpath('.//span[@class="torrent_files"]/text()') or ['1'])[0] # convert filesize to byte if possible filesize = get_torrent_size(filesize, filesize_multiplier) # convert files to int if possible - if files.isdigit(): + try: files = int(files) - else: + except: files = None - magnetlink = result.xpath('.//td[@class="ttth"]//a')[0].attrib['href'] + magnetlink = result.xpath('.//div[@class="torrent_magnet"]//a')[0].attrib['href'] # append result results.append({'url': href, 'title': title, 'content': content, - 'seed': seed, - 'leech': leech, 'filesize': filesize, 'files': files, 'magnetlink': magnetlink, 'template': 'torrent.html'}) # return results sorted by seeder - return sorted(results, key=itemgetter('seed'), reverse=True) + return results diff --git a/searx/engines/currency_convert.py b/searx/engines/currency_convert.py index 1218d4849..8eab8f673 100644 --- a/searx/engines/currency_convert.py +++ b/searx/engines/currency_convert.py @@ -4,13 +4,14 @@ import os import sys import unicodedata +from io import open from datetime import 
datetime if sys.version_info[0] == 3: unicode = str categories = [] -url = 'https://download.finance.yahoo.com/d/quotes.csv?e=.csv&f=sl1d1t1&s={query}=X' +url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}' weight = 100 parser_re = re.compile(b'.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I) @@ -43,16 +44,15 @@ def request(query, params): if not m: # wrong query return params - - ammount, from_currency, to_currency = m.groups() - ammount = float(ammount) + amount, from_currency, to_currency = m.groups() + amount = float(amount) from_currency = name_to_iso4217(from_currency.strip()) to_currency = name_to_iso4217(to_currency.strip()) q = (from_currency + to_currency).upper() - params['url'] = url.format(query=q) - params['ammount'] = ammount + params['url'] = url.format(from_currency, to_currency) + params['amount'] = amount params['from'] = from_currency params['to'] = to_currency params['from_name'] = iso4217_to_name(from_currency, 'en') @@ -62,31 +62,25 @@ def request(query, params): def response(resp): + """remove first and last lines to get only json""" + json_resp = resp.text[resp.text.find('\n') + 1:resp.text.rfind('\n') - 2] results = [] try: - _, conversion_rate, _ = resp.text.split(',', 2) - conversion_rate = float(conversion_rate) + conversion_rate = float(json.loads(json_resp)['conversion']['converted-amount']) except: return results - answer = '{0} {1} = {2} {3}, 1 {1} ({5}) = {4} {3} ({6})'.format( - resp.search_params['ammount'], + resp.search_params['amount'], resp.search_params['from'], - resp.search_params['ammount'] * conversion_rate, + resp.search_params['amount'] * conversion_rate, resp.search_params['to'], conversion_rate, resp.search_params['from_name'], resp.search_params['to_name'], ) - now_date = datetime.now().strftime('%Y%m%d') - url = 'https://finance.yahoo.com/currency/converter-results/{0}/{1}-{2}-to-{3}.html' # noqa - url = url.format( - now_date, - resp.search_params['ammount'], - 
resp.search_params['from'].lower(), - resp.search_params['to'].lower() - ) + url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}'.format( + resp.search_params['from'].upper(), resp.search_params['to']) results.append({'answer': answer, 'url': url}) @@ -97,7 +91,7 @@ def load(): global db current_dir = os.path.dirname(os.path.realpath(__file__)) - json_data = open(current_dir + "/../data/currencies.json").read() + json_data = open(current_dir + "/../data/currencies.json", 'r', encoding='utf-8').read() db = json.loads(json_data) diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py index fad7e596c..1038e64bf 100644 --- a/searx/engines/dailymotion.py +++ b/searx/engines/dailymotion.py @@ -15,6 +15,7 @@ from json import loads from datetime import datetime from searx.url_utils import urlencode +from searx.utils import match_language, html_to_text # engine dependent config categories = ['videos'] @@ -25,7 +26,7 @@ language_support = True # see http://www.dailymotion.com/doc/api/obj-video.html search_url = 'https://api.dailymotion.com/videos?fields=created_time,title,description,duration,url,thumbnail_360_url,id&sort=relevance&limit=5&page={pageno}&{query}' # noqa embedded_url = '<iframe frameborder="0" width="540" height="304" ' +\ - 'data-src="//www.dailymotion.com/embed/video/{videoid}" allowfullscreen></iframe>' + 'data-src="https://www.dailymotion.com/embed/video/{videoid}" allowfullscreen></iframe>' supported_languages_url = 'https://api.dailymotion.com/languages' @@ -35,7 +36,7 @@ def request(query, params): if params['language'] == 'all': locale = 'en-US' else: - locale = params['language'] + locale = match_language(params['language'], supported_languages) params['url'] = search_url.format( query=urlencode({'search': query, 'localization': locale}), @@ -58,7 +59,7 @@ def response(resp): for res in search_res['list']: title = res['title'] url = res['url'] - content = res['description'] + content = html_to_text(res['description']) thumbnail 
= res['thumbnail_360_url'] publishedDate = datetime.fromtimestamp(res['created_time'], None) embedded = embedded_url.format(videoid=res['id']) diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py index bb85c6dc5..a0e27e622 100644 --- a/searx/engines/deviantart.py +++ b/searx/engines/deviantart.py @@ -24,7 +24,7 @@ time_range_support = True # search-url base_url = 'https://www.deviantart.com/' -search_url = base_url + 'browse/all/?offset={offset}&{query}' +search_url = base_url + 'search?page={page}&{query}' time_range_url = '&order={range}' time_range_dict = {'day': 11, @@ -37,9 +37,7 @@ def request(query, params): if params['time_range'] and params['time_range'] not in time_range_dict: return params - offset = (params['pageno'] - 1) * 24 - - params['url'] = search_url.format(offset=offset, + params['url'] = search_url.format(page=params['pageno'], query=urlencode({'q': query})) if params['time_range'] in time_range_dict: params['url'] += time_range_url.format(range=time_range_dict[params['time_range']]) @@ -57,28 +55,27 @@ def response(resp): dom = html.fromstring(resp.text) - regex = re.compile(r'\/200H\/') - # parse results - for result in dom.xpath('.//span[@class="thumb wide"]'): - link = result.xpath('.//a[@class="torpedo-thumb-link"]')[0] - url = link.attrib.get('href') - title = extract_text(result.xpath('.//span[@class="title"]')) - thumbnail_src = link.xpath('.//img')[0].attrib.get('src') - img_src = regex.sub('/', thumbnail_src) - - # http to https, remove domain sharding - thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src) - thumbnail_src = re.sub(r"http://", "https://", thumbnail_src) - - url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url) - - # append result - results.append({'url': url, - 'title': title, - 'img_src': img_src, - 'thumbnail_src': thumbnail_src, - 'template': 'images.html'}) + for row in dom.xpath('//div[contains(@data-hook, "content_row")]'): + for result in 
row.xpath('./div'): + link = result.xpath('.//a[@data-hook="deviation_link"]')[0] + url = link.attrib.get('href') + title = link.attrib.get('title') + thumbnail_src = result.xpath('.//img')[0].attrib.get('src') + img_src = thumbnail_src + + # http to https, remove domain sharding + thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src) + thumbnail_src = re.sub(r"http://", "https://", thumbnail_src) + + url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url) + + # append result + results.append({'url': url, + 'title': title, + 'img_src': img_src, + 'thumbnail_src': thumbnail_src, + 'template': 'images.html'}) # return results return results diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py index 7cc44df73..423af0971 100644 --- a/searx/engines/dictzone.py +++ b/searx/engines/dictzone.py @@ -11,11 +11,11 @@ import re from lxml import html -from searx.utils import is_valid_lang +from searx.utils import is_valid_lang, eval_xpath from searx.url_utils import urljoin categories = ['general'] -url = u'http://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}' +url = u'https://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}' weight = 100 parser_re = re.compile(b'.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I) @@ -47,14 +47,14 @@ def response(resp): dom = html.fromstring(resp.text) - for k, result in enumerate(dom.xpath(results_xpath)[1:]): + for k, result in enumerate(eval_xpath(dom, results_xpath)[1:]): try: - from_result, to_results_raw = result.xpath('./td') + from_result, to_results_raw = eval_xpath(result, './td') except: continue to_results = [] - for to_result in to_results_raw.xpath('./p/a'): + for to_result in eval_xpath(to_results_raw, './p/a'): t = to_result.text_content() if t.strip(): to_results.append(to_result.text_content()) diff --git a/searx/engines/digg.py b/searx/engines/digg.py index 606747a4d..073410eb0 100644 --- a/searx/engines/digg.py +++ b/searx/engines/digg.py @@ -10,10 +10,13 @@ 
@parse url, title, content, publishedDate, thumbnail """ +import random +import string from dateutil import parser from json import loads from lxml import html -from searx.url_utils import quote_plus +from searx.url_utils import urlencode +from datetime import datetime # engine dependent config categories = ['news', 'social media'] @@ -21,7 +24,7 @@ paging = True # search-url base_url = 'https://digg.com/' -search_url = base_url + 'api/search/{query}.json?position={position}&format=html' +search_url = base_url + 'api/search/?{query}&from={position}&size=20&format=html' # specific xpath variables results_xpath = '//article' @@ -30,12 +33,17 @@ title_xpath = './/h2//a//text()' content_xpath = './/p//text()' pubdate_xpath = './/time' +digg_cookie_chars = string.ascii_uppercase + string.ascii_lowercase +\ + string.digits + "+_" + # do search-request def request(query, params): - offset = (params['pageno'] - 1) * 10 + offset = (params['pageno'] - 1) * 20 params['url'] = search_url.format(position=offset, - query=quote_plus(query)) + query=urlencode({'q': query})) + params['cookies']['frontend.auid'] = ''.join(random.choice( + digg_cookie_chars) for _ in range(22)) return params @@ -45,30 +53,17 @@ def response(resp): search_result = loads(resp.text) - if 'html' not in search_result or search_result['html'] == '': - return results - - dom = html.fromstring(search_result['html']) - # parse results - for result in dom.xpath(results_xpath): - url = result.attrib.get('data-contenturl') - thumbnail = result.xpath('.//img')[0].attrib.get('src') - title = ''.join(result.xpath(title_xpath)) - content = ''.join(result.xpath(content_xpath)) - pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime') - publishedDate = parser.parse(pubdate) - - # http to https - thumbnail = thumbnail.replace("http://static.digg.com", "https://static.digg.com") + for result in search_result['mapped']: + published = datetime.strptime(result['created']['ISO'], "%Y-%m-%d %H:%M:%S") # append result 
- results.append({'url': url, - 'title': title, - 'content': content, + results.append({'url': result['url'], + 'title': result['title'], + 'content': result['excerpt'], 'template': 'videos.html', - 'publishedDate': publishedDate, - 'thumbnail': thumbnail}) + 'publishedDate': published, + 'thumbnail': result['images']['thumbImage']}) # return results return results diff --git a/searx/engines/doku.py b/searx/engines/doku.py index a391be444..d20e66026 100644 --- a/searx/engines/doku.py +++ b/searx/engines/doku.py @@ -11,6 +11,7 @@ from lxml.html import fromstring from searx.engines.xpath import extract_text +from searx.utils import eval_xpath from searx.url_utils import urlencode # engine dependent config @@ -45,16 +46,16 @@ def response(resp): # parse results # Quickhits - for r in doc.xpath('//div[@class="search_quickresult"]/ul/li'): + for r in eval_xpath(doc, '//div[@class="search_quickresult"]/ul/li'): try: - res_url = r.xpath('.//a[@class="wikilink1"]/@href')[-1] + res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1] except: continue if not res_url: continue - title = extract_text(r.xpath('.//a[@class="wikilink1"]/@title')) + title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title')) # append result results.append({'title': title, @@ -62,13 +63,13 @@ def response(resp): 'url': base_url + res_url}) # Search results - for r in doc.xpath('//dl[@class="search_results"]/*'): + for r in eval_xpath(doc, '//dl[@class="search_results"]/*'): try: if r.tag == "dt": - res_url = r.xpath('.//a[@class="wikilink1"]/@href')[-1] - title = extract_text(r.xpath('.//a[@class="wikilink1"]/@title')) + res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1] + title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title')) elif r.tag == "dd": - content = extract_text(r.xpath('.')) + content = extract_text(eval_xpath(r, '.')) # append result results.append({'title': title, diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 
407d731f0..0d2c0af2d 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -18,16 +18,27 @@ from json import loads from searx.engines.xpath import extract_text from searx.poolrequests import get from searx.url_utils import urlencode +from searx.utils import match_language, eval_xpath # engine dependent config categories = ['general'] paging = True language_support = True -supported_languages_url = 'https://duckduckgo.com/d2030.js' +supported_languages_url = 'https://duckduckgo.com/util/u172.js' time_range_support = True +language_aliases = { + 'ar-SA': 'ar-XA', + 'es-419': 'es-XL', + 'ja': 'jp-JP', + 'ko': 'kr-KR', + 'sl-SI': 'sl-SL', + 'zh-TW': 'tzh-TW', + 'zh-HK': 'tzh-HK' +} + # search-url -url = 'https://duckduckgo.com/html?{query}&s={offset}&api=/d.js&o=json&dc={dc_param}' +url = 'https://duckduckgo.com/html?{query}&s={offset}&dc={dc_param}' time_range_url = '&df={range}' time_range_dict = {'day': 'd', @@ -42,55 +53,48 @@ content_xpath = './/a[@class="result__snippet"]' # match query's language to a region code that duckduckgo will accept -def get_region_code(lang, lang_list=None): - # custom fixes for languages +def get_region_code(lang, lang_list=[]): if lang == 'all': - region_code = None - elif lang[:2] == 'ja': - region_code = 'jp-jp' - elif lang[:2] == 'sl': - region_code = 'sl-sl' - elif lang == 'zh-TW': - region_code = 'tw-tzh' - elif lang == 'zh-HK': - region_code = 'hk-tzh' - elif lang[-2:] == 'SA': - region_code = 'xa-' + lang.split('-')[0] - elif lang[-2:] == 'GB': - region_code = 'uk-' + lang.split('-')[0] - else: - region_code = lang.split('-') - if len(region_code) == 2: - # country code goes first - region_code = region_code[1].lower() + '-' + region_code[0].lower() - else: - # tries to get a country code from language - region_code = region_code[0].lower() - for lc in (lang_list or supported_languages): - lc = lc.split('-') - if region_code == lc[0]: - region_code = lc[1].lower() + '-' + lc[0].lower() - break - return 
region_code + return None + + lang_code = match_language(lang, lang_list, language_aliases, 'wt-WT') + lang_parts = lang_code.split('-') + + # country code goes first + return lang_parts[1].lower() + '-' + lang_parts[0].lower() -# do search-request def request(query, params): - if params['time_range'] and params['time_range'] not in time_range_dict: + if params['time_range'] not in (None, 'None', '') and params['time_range'] not in time_range_dict: return params offset = (params['pageno'] - 1) * 30 - region_code = get_region_code(params['language']) - if region_code: - params['url'] = url.format( - query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset) + region_code = get_region_code(params['language'], supported_languages) + params['url'] = 'https://duckduckgo.com/html/' + if params['pageno'] > 1: + params['method'] = 'POST' + params['data']['q'] = query + params['data']['s'] = offset + params['data']['dc'] = 30 + params['data']['nextParams'] = '' + params['data']['v'] = 'l' + params['data']['o'] = 'json' + params['data']['api'] = '/d.js' + if params['time_range'] in time_range_dict: + params['data']['df'] = time_range_dict[params['time_range']] + if region_code: + params['data']['kl'] = region_code else: - params['url'] = url.format( - query=urlencode({'q': query}), offset=offset, dc_param=offset) + if region_code: + params['url'] = url.format( + query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset) + else: + params['url'] = url.format( + query=urlencode({'q': query}), offset=offset, dc_param=offset) - if params['time_range'] in time_range_dict: - params['url'] += time_range_url.format(range=time_range_dict[params['time_range']]) + if params['time_range'] in time_range_dict: + params['url'] += time_range_url.format(range=time_range_dict[params['time_range']]) return params @@ -102,17 +106,19 @@ def response(resp): doc = fromstring(resp.text) # parse results - for r in doc.xpath(result_xpath): + for i, r in 
enumerate(eval_xpath(doc, result_xpath)): + if i >= 30: + break try: - res_url = r.xpath(url_xpath)[-1] + res_url = eval_xpath(r, url_xpath)[-1] except: continue if not res_url: continue - title = extract_text(r.xpath(title_xpath)) - content = extract_text(r.xpath(content_xpath)) + title = extract_text(eval_xpath(r, title_xpath)) + content = extract_text(eval_xpath(r, content_xpath)) # append result results.append({'title': title, @@ -134,4 +140,4 @@ def _fetch_supported_languages(resp): regions_json = loads(response_page) supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys()) - return supported_languages + return list(supported_languages) diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index 21c6a6578..79d10c303 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -1,10 +1,21 @@ +""" +DuckDuckGo (definitions) + +- `Instant Answer API`_ +- `DuckDuckGo query`_ + +.. _Instant Answer API: https://duckduckgo.com/api +.. 
_DuckDuckGo query: https://api.duckduckgo.com/?q=DuckDuckGo&format=json&pretty=1 + +""" + import json from lxml import html from re import compile from searx.engines.xpath import extract_text -from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url +from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases from searx.url_utils import urlencode -from searx.utils import html_to_text +from searx.utils import html_to_text, match_language url = 'https://api.duckduckgo.com/'\ + '?{query}&format=json&pretty=0&no_redirect=1&d=1' @@ -24,7 +35,9 @@ def result_to_text(url, text, htmlResult): def request(query, params): params['url'] = url.format(query=urlencode({'q': query})) - params['headers']['Accept-Language'] = params['language'].split('-')[0] + language = match_language(params['language'], supported_languages, language_aliases) + language = language.split('-')[0] + params['headers']['Accept-Language'] = language return params @@ -42,8 +55,9 @@ def response(resp): # add answer if there is one answer = search_res.get('Answer', '') - if answer != '': - results.append({'answer': html_to_text(answer)}) + if answer: + if search_res.get('AnswerType', '') not in ['calc']: + results.append({'answer': html_to_text(answer)}) # add infobox if 'Definition' in search_res: diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py index dbd78b074..89924b71c 100644 --- a/searx/engines/duckduckgo_images.py +++ b/searx/engines/duckduckgo_images.py @@ -15,7 +15,10 @@ from json import loads from searx.engines.xpath import extract_text -from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, get_region_code +from searx.engines.duckduckgo import ( + _fetch_supported_languages, supported_languages_url, + get_region_code, language_aliases +) from searx.poolrequests import get from searx.url_utils import urlencode @@ -32,9 +35,12 @@ site_url = 
'https://duckduckgo.com/?{query}&iar=images&iax=1&ia=images' # run query in site to get vqd number needed for requesting images # TODO: find a way to get this number without an extra request (is it a hash of the query?) -def get_vqd(query): - res = get(site_url.format(query=urlencode({'q': query}))) +def get_vqd(query, headers): + query_url = site_url.format(query=urlencode({'q': query})) + res = get(query_url, headers=headers) content = res.text + if content.find('vqd=\'') == -1: + raise Exception('Request failed') vqd = content[content.find('vqd=\'') + 5:] vqd = vqd[:vqd.find('\'')] return vqd @@ -44,7 +50,7 @@ def get_vqd(query): def request(query, params): # to avoid running actual external requests when testing if 'is_test' not in params: - vqd = get_vqd(query) + vqd = get_vqd(query, params['headers']) else: vqd = '12345' @@ -71,7 +77,7 @@ def response(resp): try: res_json = loads(content) except: - return [] + raise Exception('Cannot parse results') # parse results for result in res_json['results']: diff --git a/searx/engines/duden.py b/searx/engines/duden.py new file mode 100644 index 000000000..cf2f1a278 --- /dev/null +++ b/searx/engines/duden.py @@ -0,0 +1,80 @@ +""" + Duden + @website https://www.duden.de + @provide-api no + @using-api no + @results HTML (using search portal) + @stable no (HTML can change) + @parse url, title, content +""" + +from lxml import html, etree +import re +from searx.engines.xpath import extract_text +from searx.utils import eval_xpath +from searx.url_utils import quote, urljoin +from searx import logger + +categories = ['general'] +paging = True +language_support = False + +# search-url +base_url = 'https://www.duden.de/' +search_url = base_url + 'suchen/dudenonline/{query}?search_api_fulltext=&page={offset}' + + +def request(query, params): + '''pre-request callback + params<dict>: + method : POST/GET + headers : {} + data : {} # if method == POST + url : '' + category: 'search category' + pageno : 1 # number of the requested 
page + ''' + + offset = (params['pageno'] - 1) + if offset == 0: + search_url_fmt = base_url + 'suchen/dudenonline/{query}' + params['url'] = search_url_fmt.format(query=quote(query)) + else: + params['url'] = search_url.format(offset=offset, query=quote(query)) + return params + + +def response(resp): + '''post-response callback + resp: requests response object + ''' + results = [] + + dom = html.fromstring(resp.text) + + try: + number_of_results_string =\ + re.sub('[^0-9]', '', + eval_xpath(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0]) + + results.append({'number_of_results': int(number_of_results_string)}) + + except: + logger.debug("Couldn't read number of results.") + pass + + for result in eval_xpath(dom, '//section[not(contains(@class, "essay"))]'): + try: + url = eval_xpath(result, './/h2/a')[0].get('href') + url = urljoin(base_url, url) + title = eval_xpath(result, 'string(.//h2/a)').strip() + content = extract_text(eval_xpath(result, './/p')) + # append result + results.append({'url': url, + 'title': title, + 'content': content}) + except: + logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True)) + continue + + return results diff --git a/searx/engines/dummy-offline.py b/searx/engines/dummy-offline.py new file mode 100644 index 000000000..13a9ecc01 --- /dev/null +++ b/searx/engines/dummy-offline.py @@ -0,0 +1,12 @@ +""" + Dummy Offline + + @results one result + @stable yes +""" + + +def search(query, request_params): + return [{ + 'result': 'this is what you get', + }] diff --git a/searx/engines/etools.py b/searx/engines/etools.py new file mode 100644 index 000000000..a9eb0980d --- /dev/null +++ b/searx/engines/etools.py @@ -0,0 +1,54 @@ +""" + eTools (Web) + + @website https://www.etools.ch + @provide-api no + @using-api no + @results HTML + @stable no (HTML can change) + @parse url, title, content +""" + +from lxml import html +from searx.engines.xpath import extract_text +from 
searx.url_utils import quote +from searx.utils import eval_xpath + +categories = ['general'] +paging = False +language_support = False +safesearch = True + +base_url = 'https://www.etools.ch' +search_path = '/searchAdvancedSubmit.do'\ + '?query={search_term}'\ + '&pageResults=20'\ + '&safeSearch={safesearch}' + + +def request(query, params): + if params['safesearch']: + safesearch = 'true' + else: + safesearch = 'false' + + params['url'] = base_url + search_path.format(search_term=quote(query), safesearch=safesearch) + + return params + + +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + for result in eval_xpath(dom, '//table[@class="result"]//td[@class="record"]'): + url = eval_xpath(result, './a/@href')[0] + title = extract_text(eval_xpath(result, './a//text()')) + content = extract_text(eval_xpath(result, './/div[@class="text"]//text()')) + + results.append({'url': url, + 'title': title, + 'content': content}) + + return results diff --git a/searx/engines/faroo.py b/searx/engines/faroo.py deleted file mode 100644 index e24d1b7dc..000000000 --- a/searx/engines/faroo.py +++ /dev/null @@ -1,116 +0,0 @@ -""" - Faroo (Web, News) - - @website http://www.faroo.com - @provide-api yes (http://www.faroo.com/hp/api/api.html), require API-key - - @using-api yes - @results JSON - @stable yes - @parse url, title, content, publishedDate, img_src -""" - -from json import loads -import datetime -from searx.utils import searx_useragent -from searx.url_utils import urlencode - -# engine dependent config -categories = ['general', 'news'] -paging = True -language_support = True -number_of_results = 10 -api_key = None - -# search-url -url = 'http://www.faroo.com/' -search_url = url + 'api?{query}'\ - '&start={offset}'\ - '&length={number_of_results}'\ - '&l={language}'\ - '&src={categorie}'\ - '&i=false'\ - '&f=json'\ - '&key={api_key}' # noqa - -search_category = {'general': 'web', - 'news': 'news'} - - -# do search-request -def request(query, params): - 
offset = (params['pageno'] - 1) * number_of_results + 1 - categorie = search_category.get(params['category'], 'web') - - if params['language'] == 'all': - language = 'en' - else: - language = params['language'].split('_')[0] - - # if language is not supported, put it in english - if language != 'en' and\ - language != 'de' and\ - language != 'zh': - language = 'en' - - params['url'] = search_url.format(offset=offset, - number_of_results=number_of_results, - query=urlencode({'q': query}), - language=language, - categorie=categorie, - api_key=api_key) - - # using searx User-Agent - params['headers']['User-Agent'] = searx_useragent() - - return params - - -# get response from search-request -def response(resp): - # HTTP-Code 401: api-key is not valide - if resp.status_code == 401: - raise Exception("API key is not valide") - - # HTTP-Code 429: rate limit exceeded - if resp.status_code == 429: - raise Exception("rate limit has been exceeded!") - - results = [] - - search_res = loads(resp.text) - - # return empty array if there are no results - if not search_res.get('results', {}): - return [] - - # parse results - for result in search_res['results']: - if result['news']: - # timestamp (milliseconds since 1970) - publishedDate = datetime.datetime.fromtimestamp(result['date'] / 1000.0) # noqa - - # append news result - results.append({'url': result['url'], - 'title': result['title'], - 'publishedDate': publishedDate, - 'content': result['kwic']}) - - else: - # append general result - # TODO, publishedDate correct? 
- results.append({'url': result['url'], - 'title': result['title'], - 'content': result['kwic']}) - - # append image result if image url is set - # TODO, show results with an image like in faroo - if result['iurl']: - results.append({'template': 'images.html', - 'url': result['url'], - 'title': result['title'], - 'content': result['kwic'], - 'img_src': result['iurl']}) - - # return results - return results diff --git a/searx/engines/fdroid.py b/searx/engines/fdroid.py index a6b01a8ee..4066dc716 100644 --- a/searx/engines/fdroid.py +++ b/searx/engines/fdroid.py @@ -18,13 +18,13 @@ categories = ['files'] paging = True # search-url -base_url = 'https://f-droid.org/' -search_url = base_url + 'repository/browse/?{query}' +base_url = 'https://search.f-droid.org/' +search_url = base_url + '?{query}' # do search-request def request(query, params): - query = urlencode({'fdfilter': query, 'fdpage': params['pageno']}) + query = urlencode({'q': query, 'page': params['pageno'], 'lang': ''}) params['url'] = search_url.format(query=query) return params @@ -35,17 +35,16 @@ def response(resp): dom = html.fromstring(resp.text) - for app in dom.xpath('//div[@id="appheader"]'): - url = app.xpath('./ancestor::a/@href')[0] - title = app.xpath('./p/span/text()')[0] - img_src = app.xpath('.//img/@src')[0] - - content = extract_text(app.xpath('./p')[0]) - content = content.replace(title, '', 1).strip() - - results.append({'url': url, - 'title': title, - 'content': content, - 'img_src': img_src}) + for app in dom.xpath('//a[@class="package-header"]'): + app_url = app.xpath('./@href')[0] + app_title = extract_text(app.xpath('./div/h4[@class="package-name"]/text()')) + app_content = extract_text(app.xpath('./div/div/span[@class="package-summary"]')).strip() \ + + ' - ' + extract_text(app.xpath('./div/div/span[@class="package-license"]')).strip() + app_img_src = app.xpath('./img[@class="package-icon"]/@src')[0] + + results.append({'url': app_url, + 'title': app_title, + 'content': app_content, 
+ 'img_src': app_img_src}) return results diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py index 08f07f7ce..c8ee34f7a 100644 --- a/searx/engines/flickr_noapi.py +++ b/searx/engines/flickr_noapi.py @@ -17,7 +17,7 @@ from time import time import re from searx.engines import logger from searx.url_utils import urlencode - +from searx.utils import ecma_unescape, html_to_text logger = logger.getChild('flickr-noapi') @@ -27,7 +27,7 @@ url = 'https://www.flickr.com/' search_url = url + 'search?{query}&page={page}' time_range_url = '&min_upload_date={start}&max_upload_date={end}' photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}' -regex = re.compile(r"\"search-photos-lite-models\",\"photos\":(.*}),\"totalItems\":", re.DOTALL) +modelexport_re = re.compile(r"^\s*modelExport:\s*({.*}),$", re.M) image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's') paging = True @@ -57,40 +57,44 @@ def request(query, params): def response(resp): results = [] - matches = regex.search(resp.text) + matches = modelexport_re.search(resp.text) if matches is None: return results match = matches.group(1) - search_results = loads(match) - - if '_data' not in search_results: - return [] + model_export = loads(match) - photos = search_results['_data'] + if 'legend' not in model_export: + return results - for photo in photos: + legend = model_export['legend'] - # In paged configuration, the first pages' photos - # are represented by a None object - if photo is None: - continue + # handle empty page + if not legend or not legend[0]: + return results + for index in legend: + photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][int(index[4])] + author = ecma_unescape(photo.get('realname', '')) + source = ecma_unescape(photo.get('username', '')) + ' @ Flickr' + title = ecma_unescape(photo.get('title', '')) + content = html_to_text(ecma_unescape(photo.get('description', ''))) img_src = None # From the biggest to the lowest format for 
image_size in image_sizes: if image_size in photo['sizes']: img_src = photo['sizes'][image_size]['url'] + img_format = 'jpg ' \ + + str(photo['sizes'][image_size]['width']) \ + + 'x' \ + + str(photo['sizes'][image_size]['height']) break if not img_src: logger.debug('cannot find valid image size: {0}'.format(repr(photo))) continue - if 'ownerNsid' not in photo: - continue - # For a bigger thumbnail, keep only the url_z, not the url_n if 'n' in photo['sizes']: thumbnail_src = photo['sizes']['n']['url'] @@ -99,19 +103,28 @@ def response(resp): else: thumbnail_src = img_src - url = build_flickr_url(photo['ownerNsid'], photo['id']) - - title = photo.get('title', '') - - author = photo['username'] - - # append result - results.append({'url': url, - 'title': title, - 'img_src': img_src, - 'thumbnail_src': thumbnail_src, - 'content': '', - 'author': author, - 'template': 'images.html'}) + if 'ownerNsid' not in photo: + # should not happen, disowned photo? Show it anyway + url = img_src + else: + url = build_flickr_url(photo['ownerNsid'], photo['id']) + + result = { + 'url': url, + 'img_src': img_src, + 'thumbnail_src': thumbnail_src, + 'source': source, + 'img_format': img_format, + 'template': 'images.html' + } + try: + result['author'] = author + result['title'] = title + result['content'] = content + except: + result['author'] = '' + result['title'] = '' + result['content'] = '' + results.append(result) return results diff --git a/searx/engines/framalibre.py b/searx/engines/framalibre.py index 146cdaeec..f3441fa5f 100644 --- a/searx/engines/framalibre.py +++ b/searx/engines/framalibre.py @@ -10,7 +10,10 @@ @parse url, title, content, thumbnail, img_src """ -from cgi import escape +try: + from cgi import escape +except: + from html import escape from lxml import html from searx.engines.xpath import extract_text from searx.url_utils import urljoin, urlencode diff --git a/searx/engines/generalfile.py b/searx/engines/generalfile.py deleted file mode 100644 index 
3bb27444f..000000000 --- a/searx/engines/generalfile.py +++ /dev/null @@ -1,62 +0,0 @@ -""" - General Files (Files) - - @website http://www.general-files.org - @provide-api no (nothing found) - - @using-api no (because nothing found) - @results HTML (using search portal) - @stable no (HTML can change) - @parse url, title, content - - @todo detect torrents? -""" - -from lxml import html - -# engine dependent config -categories = ['files'] -paging = True - -# search-url -base_url = 'http://www.general-file.com' -search_url = base_url + '/files-{letter}/{query}/{pageno}' - -# specific xpath variables -result_xpath = '//table[@class="block-file"]' -title_xpath = './/h2/a//text()' -url_xpath = './/h2/a/@href' -content_xpath = './/p//text()' - - -# do search-request -def request(query, params): - - params['url'] = search_url.format(query=query, - letter=query[0], - pageno=params['pageno']) - - return params - - -# get response from search-request -def response(resp): - results = [] - - dom = html.fromstring(resp.text) - - # parse results - for result in dom.xpath(result_xpath): - url = result.xpath(url_xpath)[0] - - # skip fast download links - if not url.startswith('/'): - continue - - # append result - results.append({'url': base_url + url, - 'title': ''.join(result.xpath(title_xpath)), - 'content': ''.join(result.xpath(content_xpath))}) - - # return results - return results diff --git a/searx/engines/genius.py b/searx/engines/genius.py new file mode 100644 index 000000000..aa5afad9b --- /dev/null +++ b/searx/engines/genius.py @@ -0,0 +1,89 @@ +""" +Genius + + @website https://www.genius.com/ + @provide-api yes (https://docs.genius.com/) + + @using-api yes + @results JSON + @stable yes + @parse url, title, content, thumbnail, publishedDate +""" + +from json import loads +from searx.url_utils import urlencode +from datetime import datetime + +# engine dependent config +categories = ['music'] +paging = True +language_support = False +page_size = 5 + +url = 
'https://genius.com/api/' +search_url = url + 'search/{index}?{query}&page={pageno}&per_page={page_size}' + + +def request(query, params): + params['url'] = search_url.format(query=urlencode({'q': query}), + index='multi', + page_size=page_size, + pageno=params['pageno']) + return params + + +def parse_lyric(hit): + try: + content = hit['highlights'][0]['value'] + except: + content = None + timestamp = hit['result']['lyrics_updated_at'] + result = {'url': hit['result']['url'], + 'title': hit['result']['full_title'], + 'content': content, + 'thumbnail': hit['result']['song_art_image_thumbnail_url'], + 'template': 'videos.html'} + if timestamp: + result.update({'publishedDate': datetime.fromtimestamp(timestamp)}) + return result + + +def parse_artist(hit): + result = {'url': hit['result']['url'], + 'title': hit['result']['name'], + 'content': None, + 'thumbnail': hit['result']['image_url'], + 'template': 'videos.html'} + return result + + +def parse_album(hit): + result = {'url': hit['result']['url'], + 'title': hit['result']['full_title'], + 'thumbnail': hit['result']['cover_art_url'], + # 'thumbnail': hit['result']['cover_art_thumbnail_url'], + 'template': 'videos.html'} + try: + year = hit['result']['release_date_components']['year'] + except: + pass + else: + if year: + result.update({'content': 'Released: {}'.format(year)}) + return result + + +parse = {'lyric': parse_lyric, 'song': parse_lyric, 'artist': parse_artist, 'album': parse_album} + + +def response(resp): + results = [] + json = loads(resp.text) + hits = [hit for section in json['response']['sections'] for hit in section['hits']] + for hit in hits: + try: + func = parse[hit['type']] + except KeyError: + continue + results.append(func(hit)) + return results diff --git a/searx/engines/gentoo.py b/searx/engines/gentoo.py new file mode 100644 index 000000000..a7a966cc9 --- /dev/null +++ b/searx/engines/gentoo.py @@ -0,0 +1,128 @@ +# -*- coding: utf-8 -*- + +""" + Gentoo Wiki + + @website 
https://wiki.gentoo.org + @provide-api yes + @using-api no + @results HTML + @stable no (HTML can change) + @parse url, title +""" + +from lxml import html +from searx.engines.xpath import extract_text +from searx.url_utils import urlencode, urljoin + +# engine dependent config +categories = ['it'] +language_support = True +paging = True +base_url = 'https://wiki.gentoo.org' + +# xpath queries +xpath_results = '//ul[@class="mw-search-results"]/li' +xpath_link = './/div[@class="mw-search-result-heading"]/a' + + +# cut 'en' from 'en-US', 'de' from 'de-CH', and so on +def locale_to_lang_code(locale): + if locale.find('-') >= 0: + locale = locale.split('-')[0] + return locale + + +# wikis for some languages were moved off from the main site, we need to make +# requests to correct URLs to be able to get results in those languages +lang_urls = { + 'en': { + 'base': 'https://wiki.gentoo.org', + 'search': '/index.php?title=Special:Search&offset={offset}&{query}' + }, + 'others': { + 'base': 'https://wiki.gentoo.org', + 'search': '/index.php?title=Special:Search&offset={offset}&{query}\ + &profile=translation&languagefilter={language}' + } +} + + +# get base & search URLs for selected language +def get_lang_urls(language): + if language != 'en': + return lang_urls['others'] + return lang_urls['en'] + + +# Language names to build search requests for +# those languages which are hosted on the main site. +main_langs = { + 'ar': 'العربية', + 'bg': 'Български', + 'cs': 'Česky', + 'da': 'Dansk', + 'el': 'Ελληνικά', + 'es': 'Español', + 'he': 'עברית', + 'hr': 'Hrvatski', + 'hu': 'Magyar', + 'it': 'Italiano', + 'ko': '한국어', + 'lt': 'Lietuviškai', + 'nl': 'Nederlands', + 'pl': 'Polski', + 'pt': 'Português', + 'ru': 'Русский', + 'sl': 'Slovenský', + 'th': 'ไทย', + 'uk': 'Українська', + 'zh': '简体中文' +} +supported_languages = dict(lang_urls, **main_langs) + + +# do search-request +def request(query, params): + # translate the locale (e.g. 
'en-US') to language code ('en') + language = locale_to_lang_code(params['language']) + + # if our language is hosted on the main site, we need to add its name + # to the query in order to narrow the results to that language + if language in main_langs: + query += b' (' + (main_langs[language]).encode('utf-8') + b')' + + # prepare the request parameters + query = urlencode({'search': query}) + offset = (params['pageno'] - 1) * 20 + + # get request URLs for our language of choice + urls = get_lang_urls(language) + search_url = urls['base'] + urls['search'] + + params['url'] = search_url.format(query=query, offset=offset, + language=language) + + return params + + +# get response from search-request +def response(resp): + # get the base URL for the language in which request was made + language = locale_to_lang_code(resp.search_params['language']) + base_url = get_lang_urls(language)['base'] + + results = [] + + dom = html.fromstring(resp.text) + + # parse results + for result in dom.xpath(xpath_results): + link = result.xpath(xpath_link)[0] + href = urljoin(base_url, link.attrib.get('href')) + title = extract_text(link) + + results.append({'url': href, + 'title': title}) + + return results diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index 37933c69b..2bb29a9fe 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -10,10 +10,13 @@ @parse url, title, content """ +import random from json import loads from time import time from lxml.html import fromstring +from searx.poolrequests import get from searx.url_utils import urlencode +from searx.utils import eval_xpath # engine dependent config categories = ['general'] @@ -29,11 +32,9 @@ search_string = 'search?{query}'\ '&c=main'\ '&s={offset}'\ '&format=json'\ - '&qh=0'\ - '&qlang={lang}'\ + '&langcountry={lang}'\ '&ff={safesearch}'\ - '&rxikd={rxikd}' # random number - 9 digits - + '&rand={rxikd}' # specific xpath variables results_xpath = '//response//result' url_xpath = 
'.//url' @@ -42,9 +43,26 @@ content_xpath = './/sum' supported_languages_url = 'https://gigablast.com/search?&rxikd=1' +extra_param = '' # gigablast requires a random extra parameter +# which can be extracted from the source code of the search page + + +def parse_extra_param(text): + global extra_param + param_lines = [x for x in text.splitlines() if x.startswith('var url=') or x.startswith('url=url+')] + extra_param = '' + for l in param_lines: + extra_param += l.split("'")[1] + extra_param = extra_param.split('&')[-1] + + +def init(engine_settings=None): + parse_extra_param(get('http://gigablast.com/search?c=main&qlangcountry=en-us&q=south&s=10').text) + # do search-request def request(query, params): + print("EXTRAPARAM:", extra_param) offset = (params['pageno'] - 1) * number_of_results if params['language'] == 'all': @@ -59,14 +77,15 @@ def request(query, params): else: safesearch = 0 + # rxieu is some kind of hash from the search query, but accepts random atm search_path = search_string.format(query=urlencode({'q': query}), offset=offset, number_of_results=number_of_results, - rxikd=str(time())[:9], lang=language, + rxikd=int(time() * 1000), safesearch=safesearch) - params['url'] = base_url + search_path + params['url'] = base_url + search_path + '&' + extra_param return params @@ -76,7 +95,11 @@ def response(resp): results = [] # parse results - response_json = loads(resp.text) + try: + response_json = loads(resp.text) + except: + parse_extra_param(resp.text) + raise Exception('extra param expired, please reload') for result in response_json['results']: # append result @@ -92,9 +115,9 @@ def response(resp): def _fetch_supported_languages(resp): supported_languages = [] dom = fromstring(resp.text) - links = dom.xpath('//span[@id="menu2"]/a') + links = eval_xpath(dom, '//span[@id="menu2"]/a') for link in links: - href = link.xpath('./@href')[0].split('lang%3A') + href = eval_xpath(link, './@href')[0].split('lang%3A') if len(href) == 2: code = href[1].split('_') 
if len(code) == 2: diff --git a/searx/engines/google.py b/searx/engines/google.py index 934f5c29a..eed3a044e 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -9,10 +9,12 @@ # @parse url, title, content, suggestion import re +from flask_babel import gettext from lxml import html, etree from searx.engines.xpath import extract_text, extract_url from searx import logger from searx.url_utils import urlencode, urlparse, parse_qsl +from searx.utils import match_language, eval_xpath logger = logger.getChild('google engine') @@ -71,7 +73,7 @@ country_to_hostname = { 'RO': 'www.google.ro', # Romania 'RU': 'www.google.ru', # Russia 'SK': 'www.google.sk', # Slovakia - 'SL': 'www.google.si', # Slovenia (SL -> si) + 'SI': 'www.google.si', # Slovenia 'SE': 'www.google.se', # Sweden 'TH': 'www.google.co.th', # Thailand 'TR': 'www.google.com.tr', # Turkey @@ -89,7 +91,7 @@ url_map = 'https://www.openstreetmap.org/'\ search_path = '/search' search_url = ('https://{hostname}' + search_path + - '?{query}&start={offset}&gws_rd=cr&gbv=1&lr={lang}&ei=x') + '?{query}&start={offset}&gws_rd=cr&gbv=1&lr={lang}&hl={lang_short}&ei=x') time_range_search = "&tbs=qdr:{range}" time_range_dict = {'day': 'd', @@ -105,13 +107,12 @@ images_path = '/images' supported_languages_url = 'https://www.google.com/preferences?#languages' # specific xpath variables -results_xpath = '//div[@class="g"]' -url_xpath = './/h3/a/@href' -title_xpath = './/h3' -content_xpath = './/span[@class="st"]' -content_misc_xpath = './/div[@class="f slp"]' -suggestion_xpath = '//p[@class="_Bmc"]' -spelling_suggestion_xpath = '//a[@class="spell"]' +results_xpath = '//div[contains(@class, "ZINbbc")]' +url_xpath = './/div[@class="kCrYT"][1]/a/@href' +title_xpath = './/div[@class="kCrYT"][1]/a/div[1]' +content_xpath = './/div[@class="kCrYT"][2]//div[contains(@class, "BNeawe")]//div[contains(@class, "BNeawe")]' +suggestion_xpath = '//div[contains(@class, 
"ZINbbc")][last()]//div[@class="rVLSBd"]/a//div[contains(@class, "BNeawe")]' +spelling_suggestion_xpath = '//div[@id="scc"]//a' # map : detail location map_address_xpath = './/div[@class="s"]//table//td[2]/span/text()' @@ -154,7 +155,7 @@ def parse_url(url_string, google_hostname): # returns extract_text on the first result selected by the xpath or None def extract_text_from_dom(result, xpath): - r = result.xpath(xpath) + r = eval_xpath(result, xpath) if len(r) > 0: return extract_text(r[0]) return None @@ -164,36 +165,37 @@ def extract_text_from_dom(result, xpath): def request(query, params): offset = (params['pageno'] - 1) * 10 - if params['language'] == 'all': - language = 'en' - country = 'US' - url_lang = '' - elif params['language'][:2] == 'jv': - language = 'jw' - country = 'ID' - url_lang = 'lang_jw' + if params['language'] == 'all' or params['language'] == 'en-US': + language = 'en-GB' + else: + language = match_language(params['language'], supported_languages, language_aliases) + + language_array = language.split('-') + if params['language'].find('-') > 0: + country = params['language'].split('-')[1] + elif len(language_array) == 2: + country = language_array[1] else: - language_array = params['language'].lower().split('-') - if len(language_array) == 2: - country = language_array[1] - else: - country = 'US' - language = language_array[0] + ',' + language_array[0] + '-' + country - url_lang = 'lang_' + language_array[0] + country = 'US' + + url_lang = 'lang_' + language if use_locale_domain: google_hostname = country_to_hostname.get(country.upper(), default_hostname) else: google_hostname = default_hostname + # original format: ID=3e2b6616cee08557:TM=5556667580:C=r:IP=4.1.12.5-:S=23ASdf0soFgF2d34dfgf-_22JJOmHdfgg + params['cookies']['GOOGLE_ABUSE_EXEMPTION'] = 'x' params['url'] = search_url.format(offset=offset, query=urlencode({'q': query}), hostname=google_hostname, - lang=url_lang) + lang=url_lang, + lang_short=language) if params['time_range'] in 
time_range_dict: params['url'] += time_range_search.format(range=time_range_dict[params['time_range']]) - params['headers']['Accept-Language'] = language + params['headers']['Accept-Language'] = language + ',' + language + '-' + country params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' params['google_hostname'] = google_hostname @@ -210,6 +212,9 @@ def response(resp): if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect': raise RuntimeWarning('sorry.google.com') + if resp_url.path.startswith('/sorry'): + raise RuntimeWarning(gettext('CAPTCHA required')) + # which hostname ? google_hostname = resp.search_params.get('google_hostname') google_url = "https://" + google_hostname @@ -217,21 +222,21 @@ def response(resp): # convert the text to dom dom = html.fromstring(resp.text) - instant_answer = dom.xpath('//div[@id="_vBb"]//text()') + instant_answer = eval_xpath(dom, '//div[@id="_vBb"]//text()') if instant_answer: results.append({'answer': u' '.join(instant_answer)}) try: - results_num = int(dom.xpath('//div[@id="resultStats"]//text()')[0] + results_num = int(eval_xpath(dom, '//div[@id="resultStats"]//text()')[0] .split()[1].replace(',', '')) results.append({'number_of_results': results_num}) except: pass # parse results - for result in dom.xpath(results_xpath): + for result in eval_xpath(dom, results_xpath): try: - title = extract_text(result.xpath(title_xpath)[0]) - url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname) + title = extract_text(eval_xpath(result, title_xpath)[0]) + url = parse_url(extract_url(eval_xpath(result, url_xpath), google_url), google_hostname) parsed_url = urlparse(url, google_hostname) # map result @@ -240,7 +245,7 @@ def response(resp): continue # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start): # print "yooooo"*30 - # x = result.xpath(map_near) + # x = eval_xpath(result, map_near) # if len(x) 
> 0: # # map : near the location # results = results + parse_map_near(parsed_url, x, google_hostname) @@ -264,9 +269,7 @@ def response(resp): content = extract_text_from_dom(result, content_xpath) if content is None: continue - content_misc = extract_text_from_dom(result, content_misc_xpath) - if content_misc is not None: - content = content_misc + "<br />" + content + # append result results.append({'url': url, 'title': title, @@ -277,11 +280,11 @@ def response(resp): continue # parse suggestion - for suggestion in dom.xpath(suggestion_xpath): + for suggestion in eval_xpath(dom, suggestion_xpath): # append suggestion results.append({'suggestion': extract_text(suggestion)}) - for correction in dom.xpath(spelling_suggestion_xpath): + for correction in eval_xpath(dom, spelling_suggestion_xpath): results.append({'correction': extract_text(correction)}) # return results @@ -290,9 +293,9 @@ def response(resp): def parse_images(result, google_hostname): results = [] - for image in result.xpath(images_xpath): - url = parse_url(extract_text(image.xpath(image_url_xpath)[0]), google_hostname) - img_src = extract_text(image.xpath(image_img_src_xpath)[0]) + for image in eval_xpath(result, images_xpath): + url = parse_url(extract_text(eval_xpath(image, image_url_xpath)[0]), google_hostname) + img_src = extract_text(eval_xpath(image, image_img_src_xpath)[0]) # append result results.append({'url': url, @@ -379,10 +382,10 @@ def attributes_to_html(attributes): def _fetch_supported_languages(resp): supported_languages = {} dom = html.fromstring(resp.text) - options = dom.xpath('//table//td/font/label/span') + options = eval_xpath(dom, '//*[@id="langSec"]//input[@name="lr"]') for option in options: - code = option.xpath('./@id')[0][1:] - name = option.text.title() + code = eval_xpath(option, './@value')[0].split('_')[-1] + name = eval_xpath(option, './@data-name')[0].title() supported_languages[code] = {"name": name} return supported_languages diff --git 
a/searx/engines/google_images.py b/searx/engines/google_images.py index a380170ca..636913114 100644 --- a/searx/engines/google_images.py +++ b/searx/engines/google_images.py @@ -15,7 +15,6 @@ from json import loads from lxml import html from searx.url_utils import urlencode - # engine dependent config categories = ['images'] paging = True @@ -25,8 +24,6 @@ number_of_results = 100 search_url = 'https://www.google.com/search'\ '?{query}'\ - '&asearch=ichunk'\ - '&async=_id:rg_s,_pms:s'\ '&tbm=isch'\ '&yv=2'\ '&{search_options}' @@ -66,30 +63,35 @@ def request(query, params): def response(resp): results = [] - g_result = loads(resp.text) - - dom = html.fromstring(g_result[1][1]) + dom = html.fromstring(resp.text) # parse results - for result in dom.xpath('//div[@data-ved]'): + for result in dom.xpath('//div[contains(@class, "rg_meta")]/text()'): try: - metadata = loads(''.join(result.xpath('./div[contains(@class, "rg_meta")]/text()'))) + metadata = loads(result) + + img_format = metadata.get('ity', '') + img_width = metadata.get('ow', '') + img_height = metadata.get('oh', '') + if img_width and img_height: + img_format += " {0}x{1}".format(img_width, img_height) + + source = metadata.get('st', '') + source_url = metadata.get('isu', '') + if source_url: + source += " ({0})".format(source_url) + + results.append({'url': metadata['ru'], + 'title': metadata['pt'], + 'content': metadata.get('s', ''), + 'source': source, + 'img_format': img_format, + 'thumbnail_src': metadata['tu'], + 'img_src': metadata['ou'], + 'template': 'images.html'}) + except: continue - thumbnail_src = metadata['tu'] - - # http to https - thumbnail_src = thumbnail_src.replace("http://", "https://") - - # append result - results.append({'url': metadata['ru'], - 'title': metadata['pt'], - 'content': metadata['s'], - 'thumbnail_src': thumbnail_src, - 'img_src': metadata['ou'], - 'template': 'images.html'}) - - # return results return results diff --git a/searx/engines/google_news.py 
b/searx/engines/google_news.py index 7344b5289..c9cc75435 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -13,6 +13,7 @@ from lxml import html from searx.engines.google import _fetch_supported_languages, supported_languages_url from searx.url_utils import urlencode +from searx.utils import match_language # search-url categories = ['news'] @@ -51,8 +52,9 @@ def request(query, params): search_options=urlencode(search_options)) if params['language'] != 'all': - language_array = params['language'].lower().split('-') - params['url'] += '&lr=lang_' + language_array[0] + language = match_language(params['language'], supported_languages, language_aliases).split('-')[0] + if language: + params['url'] += '&hl=' + language return params @@ -67,8 +69,8 @@ def response(resp): for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'): try: r = { - 'url': result.xpath('.//div[@class="_cnc"]//a/@href')[0], - 'title': ''.join(result.xpath('.//div[@class="_cnc"]//h3//text()')), + 'url': result.xpath('.//a[@class="l lLrAF"]')[0].attrib.get("href"), + 'title': ''.join(result.xpath('.//a[@class="l lLrAF"]//text()')), 'content': ''.join(result.xpath('.//div[@class="st"]//text()')), } except: diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py index 310b31490..fd6b2e3be 100644 --- a/searx/engines/google_videos.py +++ b/searx/engines/google_videos.py @@ -7,7 +7,7 @@ @using-api no @results HTML @stable no - @parse url, title, content + @parse url, title, content, thumbnail """ from datetime import date, timedelta @@ -15,7 +15,7 @@ from json import loads from lxml import html from searx.engines.xpath import extract_text from searx.url_utils import urlencode - +import re # engine dependent config categories = ['videos'] @@ -25,7 +25,7 @@ time_range_support = True number_of_results = 10 search_url = 'https://www.google.com/search'\ - '?{query}'\ + '?q={query}'\ '&tbm=vid'\ '&{search_options}' time_range_attr = "qdr:{range}" @@ 
-69,15 +69,29 @@ def response(resp): # parse results for result in dom.xpath('//div[@class="g"]'): - title = extract_text(result.xpath('.//h3/a')) - url = result.xpath('.//h3/a/@href')[0] + title = extract_text(result.xpath('.//h3')) + url = result.xpath('.//div[@class="r"]/a/@href')[0] content = extract_text(result.xpath('.//span[@class="st"]')) + # get thumbnails + script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text) + ids = result.xpath('.//div[@class="s"]//img/@id') + if len(ids) > 0: + thumbnails_data = \ + re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + ids[0], + script) + tmp = [] + if len(thumbnails_data) != 0: + tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0]) + thumbnail = '' + if len(tmp) != 0: + thumbnail = tmp[-1] + # append result results.append({'url': url, 'title': title, 'content': content, - 'thumbnail': '', + 'thumbnail': thumbnail, 'template': 'videos.html'}) return results diff --git a/searx/engines/ina.py b/searx/engines/ina.py index 37a05f099..ea509649f 100644 --- a/searx/engines/ina.py +++ b/searx/engines/ina.py @@ -32,7 +32,7 @@ base_url = 'https://www.ina.fr' search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}' # specific xpath variables -results_xpath = '//div[contains(@class,"search-results--list")]/div[@class="media"]' +results_xpath = '//div[contains(@class,"search-results--list")]//div[@class="media-body"]' url_xpath = './/a/@href' title_xpath = './/h3[@class="h3--title media-heading"]' thumbnail_xpath = './/img/@src' @@ -65,8 +65,11 @@ def response(resp): videoid = result.xpath(url_xpath)[0] url = base_url + videoid title = p.unescape(extract_text(result.xpath(title_xpath))) - thumbnail = extract_text(result.xpath(thumbnail_xpath)[0]) - if thumbnail[0] == '/': + try: + thumbnail = extract_text(result.xpath(thumbnail_xpath)[0]) + except: + thumbnail = '' + if thumbnail and 
thumbnail[0] == '/': thumbnail = base_url + thumbnail d = extract_text(result.xpath(publishedDate_xpath)[0]) d = d.split('/') diff --git a/searx/engines/invidious.py b/searx/engines/invidious.py new file mode 100644 index 000000000..8d81691fc --- /dev/null +++ b/searx/engines/invidious.py @@ -0,0 +1,100 @@ +# Invidious (Videos) +# +# @website https://invidio.us/ +# @provide-api yes (https://github.com/omarroth/invidious/wiki/API) +# +# @using-api yes +# @results JSON +# @stable yes +# @parse url, title, content, publishedDate, thumbnail, embedded + +from searx.url_utils import quote_plus +from dateutil import parser +import time + +# engine dependent config +categories = ["videos", "music"] +paging = True +language_support = True +time_range_support = True + +# search-url +base_url = "https://invidio.us/" + + +# do search-request +def request(query, params): + time_range_dict = { + "day": "today", + "week": "week", + "month": "month", + "year": "year", + } + search_url = base_url + "api/v1/search?q={query}" + params["url"] = search_url.format( + query=quote_plus(query) + ) + "&page={pageno}".format(pageno=params["pageno"]) + + if params["time_range"] in time_range_dict: + params["url"] += "&date={timerange}".format( + timerange=time_range_dict[params["time_range"]] + ) + + if params["language"] != "all": + lang = params["language"].split("-") + if len(lang) == 2: + params["url"] += "&range={lrange}".format(lrange=lang[1]) + + return params + + +# get response from search-request +def response(resp): + results = [] + + search_results = resp.json() + embedded_url = ( + '<iframe width="540" height="304" ' + + 'data-src="' + + base_url + + 'embed/{videoid}" ' + + 'frameborder="0" allowfullscreen></iframe>' + ) + + base_invidious_url = base_url + "watch?v=" + + for result in search_results: + rtype = result.get("type", None) + if rtype == "video": + videoid = result.get("videoId", None) + if not videoid: + continue + + url = base_invidious_url + videoid + embedded = 
embedded_url.format(videoid=videoid) + thumbs = result.get("videoThumbnails", []) + thumb = next( + (th for th in thumbs if th["quality"] == "sddefault"), None + ) + if thumb: + thumbnail = thumb.get("url", "") + else: + thumbnail = "" + + publishedDate = parser.parse( + time.ctime(result.get("published", 0)) + ) + + results.append( + { + "url": url, + "title": result.get("title", ""), + "content": result.get("description", ""), + "template": "videos.html", + "publishedDate": publishedDate, + "embedded": embedded, + "thumbnail": thumbnail, + } + ) + + return results diff --git a/searx/engines/json_engine.py b/searx/engines/json_engine.py index 67d6a5a65..785b0c490 100644 --- a/searx/engines/json_engine.py +++ b/searx/engines/json_engine.py @@ -2,6 +2,7 @@ from collections import Iterable from json import loads from sys import version_info from searx.url_utils import urlencode +from searx.utils import to_string if version_info[0] == 3: unicode = str @@ -98,18 +99,35 @@ def response(resp): results = [] json = loads(resp.text) if results_query: - for result in query(json, results_query)[0]: - url = query(result, url_query)[0] - title = query(result, title_query)[0] - content = query(result, content_query)[0] - results.append({'url': url, 'title': title, 'content': content}) + rs = query(json, results_query) + if not len(rs): + return results + for result in rs[0]: + try: + url = query(result, url_query)[0] + title = query(result, title_query)[0] + except: + continue + try: + content = query(result, content_query)[0] + except: + content = "" + results.append({ + 'url': to_string(url), + 'title': to_string(title), + 'content': to_string(content), + }) else: for url, title, content in zip( query(json, url_query), query(json, title_query), query(json, content_query) ): - results.append({'url': url, 'title': title, 'content': content}) + results.append({ + 'url': to_string(url), + 'title': to_string(title), + 'content': to_string(content), + }) if not suggestion_query: 
return results diff --git a/searx/engines/microsoft_academic.py b/searx/engines/microsoft_academic.py new file mode 100644 index 000000000..9bac0069c --- /dev/null +++ b/searx/engines/microsoft_academic.py @@ -0,0 +1,77 @@ +""" +Microsoft Academic (Science) + +@website https://academic.microsoft.com +@provide-api yes +@using-api no +@results JSON +@stable no +@parse url, title, content +""" + +from datetime import datetime +from json import loads +from uuid import uuid4 + +from searx.url_utils import urlencode +from searx.utils import html_to_text + +categories = ['images'] +paging = True +result_url = 'https://academic.microsoft.com/api/search/GetEntityResults?{query}' + + +def request(query, params): + correlation_id = uuid4() + msacademic = uuid4() + time_now = datetime.now() + + params['url'] = result_url.format(query=urlencode({'correlationId': correlation_id})) + params['cookies']['msacademic'] = str(msacademic) + params['cookies']['ai_user'] = 'vhd0H|{now}'.format(now=str(time_now)) + params['method'] = 'POST' + params['data'] = { + 'Query': '@{query}@'.format(query=query), + 'Limit': 10, + 'Offset': params['pageno'] - 1, + 'Filters': '', + 'OrderBy': '', + 'SortAscending': False, + } + + return params + + +def response(resp): + results = [] + response_data = loads(resp.text) + if not response_data: + return results + + for result in response_data['results']: + url = _get_url(result) + title = result['e']['dn'] + content = _get_content(result) + results.append({ + 'url': url, + 'title': html_to_text(title), + 'content': html_to_text(content), + }) + + return results + + +def _get_url(result): + if 's' in result['e']: + return result['e']['s'][0]['u'] + return 'https://academic.microsoft.com/#/detail/{pid}'.format(pid=result['id']) + + +def _get_content(result): + if 'd' in result['e']: + content = result['e']['d'] + if len(content) > 300: + return content[:300] + '...' 
+ return content + + return '' diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py index 272c712c4..c57979a5f 100644 --- a/searx/engines/nyaa.py +++ b/searx/engines/nyaa.py @@ -1,7 +1,7 @@ """ - Nyaa.se (Anime Bittorrent tracker) + Nyaa.si (Anime Bittorrent tracker) - @website http://www.nyaa.se/ + @website https://nyaa.si/ @provide-api no @using-api no @results HTML @@ -12,50 +12,25 @@ from lxml import html from searx.engines.xpath import extract_text from searx.url_utils import urlencode +from searx.utils import get_torrent_size, int_or_zero # engine dependent config categories = ['files', 'images', 'videos', 'music'] paging = True # search-url -base_url = 'http://www.nyaa.se/' +base_url = 'https://nyaa.si/' search_url = base_url + '?page=search&{query}&offset={offset}' # xpath queries -xpath_results = '//table[@class="tlist"]//tr[contains(@class, "tlistrow")]' -xpath_category = './/td[@class="tlisticon"]/a' -xpath_title = './/td[@class="tlistname"]/a' -xpath_torrent_file = './/td[@class="tlistdownload"]/a' -xpath_filesize = './/td[@class="tlistsize"]/text()' -xpath_seeds = './/td[@class="tlistsn"]/text()' -xpath_leeches = './/td[@class="tlistln"]/text()' -xpath_downloads = './/td[@class="tlistdn"]/text()' - - -# convert a variable to integer or return 0 if it's not a number -def int_or_zero(num): - if isinstance(num, list): - if len(num) < 1: - return 0 - num = num[0] - if num.isdigit(): - return int(num) - return 0 - - -# get multiplier to convert torrent size to bytes -def get_filesize_mul(suffix): - return { - 'KB': 1024, - 'MB': 1024 ** 2, - 'GB': 1024 ** 3, - 'TB': 1024 ** 4, - - 'KIB': 1024, - 'MIB': 1024 ** 2, - 'GIB': 1024 ** 3, - 'TIB': 1024 ** 4 - }[str(suffix).upper()] +xpath_results = '//table[contains(@class, "torrent-list")]//tr[not(th)]' +xpath_category = './/td[1]/a[1]' +xpath_title = './/td[2]/a[last()]' +xpath_torrent_links = './/td[3]/a' +xpath_filesize = './/td[4]/text()' +xpath_seeds = './/td[6]/text()' +xpath_leeches = 
'.//td[7]/text()' +xpath_downloads = './/td[8]/text()' # do search-request @@ -72,25 +47,32 @@ def response(resp): dom = html.fromstring(resp.text) for result in dom.xpath(xpath_results): + # defaults + filesize = 0 + magnet_link = "" + torrent_link = "" + # category in which our torrent belongs - category = result.xpath(xpath_category)[0].attrib.get('title') + try: + category = result.xpath(xpath_category)[0].attrib.get('title') + except: + pass # torrent title page_a = result.xpath(xpath_title)[0] title = extract_text(page_a) # link to the page - href = page_a.attrib.get('href') + href = base_url + page_a.attrib.get('href') - # link to the torrent file - torrent_link = result.xpath(xpath_torrent_file)[0].attrib.get('href') - - # torrent size - try: - file_size, suffix = result.xpath(xpath_filesize)[0].split(' ') - file_size = int(float(file_size) * get_filesize_mul(suffix)) - except: - file_size = None + for link in result.xpath(xpath_torrent_links): + url = link.attrib.get('href') + if 'magnet' in url: + # link to the magnet + magnet_link = url + else: + # link to the torrent file + torrent_link = url # seed count seed = int_or_zero(result.xpath(xpath_seeds)) @@ -101,6 +83,14 @@ def response(resp): # torrent downloads count downloads = int_or_zero(result.xpath(xpath_downloads)) + # let's try to calculate the torrent size + try: + filesize_info = result.xpath(xpath_filesize)[0] + filesize, filesize_multiplier = filesize_info.split() + filesize = get_torrent_size(filesize, filesize_multiplier) + except: + pass + # content string contains all information not included into template content = 'Category: "{category}". Downloaded {downloads} times.' 
content = content.format(category=category, downloads=downloads) @@ -110,8 +100,9 @@ def response(resp): 'content': content, 'seed': seed, 'leech': leech, - 'filesize': file_size, + 'filesize': filesize, 'torrentfile': torrent_link, + 'magnetlink': magnet_link, 'template': 'torrent.html'}) return results diff --git a/searx/engines/openstreetmap.py b/searx/engines/openstreetmap.py index 733ba6203..cec10a3c7 100644 --- a/searx/engines/openstreetmap.py +++ b/searx/engines/openstreetmap.py @@ -24,7 +24,7 @@ result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}' # do search-request def request(query, params): - params['url'] = base_url + search_string.format(query=query) + params['url'] = base_url + search_string.format(query=query.decode('utf-8')) return params diff --git a/searx/engines/pdbe.py b/searx/engines/pdbe.py index f784e106f..2db92868a 100644 --- a/searx/engines/pdbe.py +++ b/searx/engines/pdbe.py @@ -43,7 +43,7 @@ def construct_body(result): title = result['title'] # construct content body - content = """{title}<br />{authors} {journal} <strong>{volume}</strong> {page} ({year})""" + content = """{title} - {authors} {journal} ({volume}) {page} ({year})""" # replace placeholders with actual content try: @@ -84,15 +84,18 @@ def response(resp): continue if result['status'] == 'OBS': # expand title to add some sort of warning message - title = gettext('{title} (OBSOLETE)').format(title=result['title']) - superseded_url = pdbe_entry_url.format(pdb_id=result['superseded_by']) + title = gettext('{title} (OBSOLETE)').format(title=result['title']) + try: + superseded_url = pdbe_entry_url.format(pdb_id=result['superseded_by']) + except: + continue # since we can't construct a proper body from the response, we'll make up our own msg_superseded = gettext("This entry has been superseded by") - content = '<em>{msg_superseded} \<a href="{url}">{pdb_id}</a></em>'.format( + content = '{msg_superseded}: {url} ({pdb_id})'.format( msg_superseded=msg_superseded, 
url=superseded_url, - pdb_id=result['superseded_by'], ) + pdb_id=result['superseded_by']) # obsoleted entries don't have preview images img_src = None diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py index a5af8d824..2f3f22a97 100644 --- a/searx/engines/piratebay.py +++ b/searx/engines/piratebay.py @@ -18,7 +18,7 @@ categories = ['videos', 'music', 'files'] paging = True # search-url -url = 'https://thepiratebay.se/' +url = 'https://thepiratebay.org/' search_url = url + 'search/{search_term}/{pageno}/99/{search_type}' # piratebay specific type-definitions diff --git a/searx/engines/pubmed.py b/searx/engines/pubmed.py new file mode 100644 index 000000000..055f09226 --- /dev/null +++ b/searx/engines/pubmed.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python + +""" + PubMed (Scholar publications) + @website https://www.ncbi.nlm.nih.gov/pubmed/ + @provide-api yes (https://www.ncbi.nlm.nih.gov/home/develop/api/) + @using-api yes + @results XML + @stable yes + @parse url, title, publishedDate, content + More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/ +""" + +from flask_babel import gettext +from lxml import etree +from datetime import datetime +from searx.url_utils import urlencode +from searx.poolrequests import get + + +categories = ['science'] + +base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'\ + + '?db=pubmed&{query}&retstart={offset}&retmax={hits}' + +# engine dependent config +number_of_results = 10 +pubmed_url = 'https://www.ncbi.nlm.nih.gov/pubmed/' + + +def request(query, params): + # basic search + offset = (params['pageno'] - 1) * number_of_results + + string_args = dict(query=urlencode({'term': query}), + offset=offset, + hits=number_of_results) + + params['url'] = base_url.format(**string_args) + + return params + + +def response(resp): + results = [] + + # First retrieve notice of each result + pubmed_retrieve_api_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'\ + + 
'db=pubmed&retmode=xml&id={pmids_string}' + + pmids_results = etree.XML(resp.content) + pmids = pmids_results.xpath('//eSearchResult/IdList/Id') + pmids_string = '' + + for item in pmids: + pmids_string += item.text + ',' + + retrieve_notice_args = dict(pmids_string=pmids_string) + + retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args) + + search_results_xml = get(retrieve_url_encoded).content + search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation') + + for entry in search_results: + title = entry.xpath('.//Article/ArticleTitle')[0].text + + pmid = entry.xpath('.//PMID')[0].text + url = pubmed_url + pmid + + try: + content = entry.xpath('.//Abstract/AbstractText')[0].text + except: + content = gettext('No abstract is available for this publication.') + + # If a doi is available, add it to the snipppet + try: + doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text + content = 'DOI: {doi} Abstract: {content}'.format(doi=doi, content=content) + except: + pass + + if len(content) > 300: + content = content[0:300] + "..." 
+ # TODO: center snippet on query term + + res_dict = {'url': url, + 'title': title, + 'content': content} + + try: + publishedDate = datetime.strptime(entry.xpath('.//DateCreated/Year')[0].text + + '-' + entry.xpath('.//DateCreated/Month')[0].text + + '-' + entry.xpath('.//DateCreated/Day')[0].text, '%Y-%m-%d') + res_dict['publishedDate'] = publishedDate + except: + pass + + results.append(res_dict) + + return results diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index 3d266e228..54e9dafad 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -14,6 +14,7 @@ from datetime import datetime from json import loads from searx.utils import html_to_text from searx.url_utils import urlencode +from searx.utils import match_language # engine dependent config categories = None @@ -27,7 +28,7 @@ category_to_keyword = {'general': 'web', 'social media': 'social'} # search-url -url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}' +url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}&t={keyword}&uiv=4' # do search-request @@ -44,19 +45,12 @@ def request(query, params): query=urlencode({'q': query}), offset=offset) - # add language tag if specified + # add language tag if params['language'] != 'all': - if params['language'] == 'no' or params['language'].startswith('no-'): - params['language'] = params['language'].replace('no', 'nb', 1) - if params['language'].find('-') < 0: - # tries to get a country code from language - for lang in supported_languages: - lc = lang.split('-') - if params['language'] == lc[0]: - params['language'] = lang - break - params['url'] += '&locale=' + params['language'].replace('-', '_').lower() + language = match_language(params['language'], supported_languages, language_aliases) + params['url'] += '&locale=' + language.replace('-', '_').lower() + params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0' return 
params diff --git a/searx/engines/scanr_structures.py b/searx/engines/scanr_structures.py index 72fd2b3c9..7208dcb70 100644 --- a/searx/engines/scanr_structures.py +++ b/searx/engines/scanr_structures.py @@ -29,7 +29,7 @@ def request(query, params): params['url'] = search_url params['method'] = 'POST' params['headers']['Content-type'] = "application/json" - params['data'] = dumps({"query": query, + params['data'] = dumps({"query": query.decode('utf-8'), "searchField": "ALL", "sortDirection": "ASC", "sortOrder": "RELEVANCY", diff --git a/searx/engines/seedpeer.py b/searx/engines/seedpeer.py index 3770dacac..f9b1f99c8 100644 --- a/searx/engines/seedpeer.py +++ b/searx/engines/seedpeer.py @@ -1,6 +1,6 @@ # Seedpeer (Videos, Music, Files) # -# @website http://seedpeer.eu +# @website https://seedpeer.me # @provide-api no (nothing found) # # @using-api no @@ -9,31 +9,28 @@ # @parse url, title, content, seed, leech, magnetlink from lxml import html +from json import loads from operator import itemgetter from searx.url_utils import quote, urljoin +from searx.engines.xpath import extract_text -url = 'http://www.seedpeer.eu/' -search_url = url + 'search/{search_term}/7/{page_no}.html' +url = 'https://seedpeer.me/' +search_url = url + 'search/{search_term}?page={page_no}' +torrent_file_url = url + 'torrent/{torrent_hash}' + # specific xpath variables -torrent_xpath = '//*[@id="body"]/center/center/table[2]/tr/td/a' -alternative_torrent_xpath = '//*[@id="body"]/center/center/table[1]/tr/td/a' -title_xpath = '//*[@id="body"]/center/center/table[2]/tr/td/a/text()' -alternative_title_xpath = '//*[@id="body"]/center/center/table/tr/td/a' -seeds_xpath = '//*[@id="body"]/center/center/table[2]/tr/td[4]/font/text()' -alternative_seeds_xpath = '//*[@id="body"]/center/center/table/tr/td[4]/font/text()' -peers_xpath = '//*[@id="body"]/center/center/table[2]/tr/td[5]/font/text()' -alternative_peers_xpath = '//*[@id="body"]/center/center/table/tr/td[5]/font/text()' -age_xpath = 
'//*[@id="body"]/center/center/table[2]/tr/td[2]/text()' -alternative_age_xpath = '//*[@id="body"]/center/center/table/tr/td[2]/text()' -size_xpath = '//*[@id="body"]/center/center/table[2]/tr/td[3]/text()' -alternative_size_xpath = '//*[@id="body"]/center/center/table/tr/td[3]/text()' +script_xpath = '//script[@type="text/javascript"][not(@src)]' +torrent_xpath = '(//table)[2]/tbody/tr' +link_xpath = '(./td)[1]/a/@href' +age_xpath = '(./td)[2]' +size_xpath = '(./td)[3]' # do search-request def request(query, params): params['url'] = search_url.format(search_term=quote(query), - page_no=params['pageno'] - 1) + page_no=params['pageno']) return params @@ -41,34 +38,40 @@ def request(query, params): def response(resp): results = [] dom = html.fromstring(resp.text) - torrent_links = dom.xpath(torrent_xpath) - if len(torrent_links) > 0: - seeds = dom.xpath(seeds_xpath) - peers = dom.xpath(peers_xpath) - titles = dom.xpath(title_xpath) - sizes = dom.xpath(size_xpath) - ages = dom.xpath(age_xpath) - else: # under ~5 results uses a different xpath - torrent_links = dom.xpath(alternative_torrent_xpath) - seeds = dom.xpath(alternative_seeds_xpath) - peers = dom.xpath(alternative_peers_xpath) - titles = dom.xpath(alternative_title_xpath) - sizes = dom.xpath(alternative_size_xpath) - ages = dom.xpath(alternative_age_xpath) - # return empty array if nothing is found - if not torrent_links: + result_rows = dom.xpath(torrent_xpath) + + try: + script_element = dom.xpath(script_xpath)[0] + json_string = script_element.text[script_element.text.find('{'):] + torrents_json = loads(json_string) + except: return [] # parse results - for index, result in enumerate(torrent_links): - link = result.attrib.get('href') + for torrent_row, torrent_json in zip(result_rows, torrents_json['data']['list']): + title = torrent_json['name'] + seed = int(torrent_json['seeds']) + leech = int(torrent_json['peers']) + size = int(torrent_json['size']) + torrent_hash = torrent_json['hash'] + + torrentfile = 
torrent_file_url.format(torrent_hash=torrent_hash) + magnetlink = 'magnet:?xt=urn:btih:{}'.format(torrent_hash) + + age = extract_text(torrent_row.xpath(age_xpath)) + link = torrent_row.xpath(link_xpath)[0] + href = urljoin(url, link) - results.append({'url': href, - 'title': titles[index].text_content(), - 'content': '{}, {}'.format(sizes[index], ages[index]), - 'seed': seeds[index], - 'leech': peers[index], + # append result + results.append({'url': href, + 'title': title, + 'content': age, + 'seed': seed, + 'leech': leech, + 'filesize': size, + 'torrentfile': torrentfile, + 'magnetlink': magnetlink, 'template': 'torrent.html'}) # return results sorted by seeder diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py index d59755e04..284689bf6 100644 --- a/searx/engines/soundcloud.py +++ b/searx/engines/soundcloud.py @@ -28,8 +28,10 @@ categories = ['music'] paging = True # search-url -url = 'https://api.soundcloud.com/' +# missing attribute: user_id, app_version, app_locale +url = 'https://api-v2.soundcloud.com/' search_url = url + 'search?{query}'\ + '&variant_ids='\ '&facet=model'\ '&limit=20'\ '&offset={offset}'\ @@ -49,7 +51,9 @@ def get_client_id(): if response.ok: tree = html.fromstring(response.content) - script_tags = tree.xpath("//script[contains(@src, '/assets/app')]") + # script_tags has been moved from /assets/app/ to /assets/ path. 
I + # found client_id in https://a-v2.sndcdn.com/assets/49-a0c01933-3.js + script_tags = tree.xpath("//script[contains(@src, '/assets/')]") app_js_urls = [script_tag.get('src') for script_tag in script_tags if script_tag is not None] # extracts valid app_js urls from soundcloud.com content @@ -57,14 +61,14 @@ def get_client_id(): # gets app_js and searches for the clientid response = http_get(app_js_url) if response.ok: - cids = cid_re.search(response.text) + cids = cid_re.search(response.content.decode("utf-8")) if cids is not None and len(cids.groups()): return cids.groups()[0] logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!") return "" -def init(): +def init(engine_settings=None): global guest_client_id # api-key guest_client_id = get_client_id() diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py index aed756be3..00c395706 100644 --- a/searx/engines/spotify.py +++ b/searx/engines/spotify.py @@ -12,10 +12,14 @@ from json import loads from searx.url_utils import urlencode +import requests +import base64 # engine dependent config categories = ['music'] paging = True +api_client_id = None +api_client_secret = None # search-url url = 'https://api.spotify.com/' @@ -31,6 +35,16 @@ def request(query, params): params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset) + r = requests.post( + 'https://accounts.spotify.com/api/token', + data={'grant_type': 'client_credentials'}, + headers={'Authorization': 'Basic ' + base64.b64encode( + "{}:{}".format(api_client_id, api_client_secret).encode('utf-8') + ).decode('utf-8')} + ) + j = loads(r.text) + params['headers'] = {'Authorization': 'Bearer {}'.format(j.get('access_token'))} + return params diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 314b7b9a8..953734934 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -15,6 +15,8 @@ from dateutil import parser from datetime import datetime, timedelta import re from 
searx.engines.xpath import extract_text +from searx.languages import language_codes +from searx.utils import eval_xpath # engine dependent config categories = ['general'] @@ -22,7 +24,7 @@ categories = ['general'] # (probably the parameter qid), require # storing of qid's between mulitble search-calls -# paging = False +paging = True language_support = True # search-url @@ -32,22 +34,32 @@ search_url = base_url + 'do/search' # specific xpath variables # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] # not ads: div[@class="result"] are the direct childs of div[@id="results"] -results_xpath = '//div[@class="result"]' -link_xpath = './/h3/a' +results_xpath = '//div[@class="w-gl__result"]' +link_xpath = './/a[@class="w-gl__result-title"]' +content_xpath = './/p[@class="w-gl__description"]' # do search-request def request(query, params): - offset = (params['pageno'] - 1) * 10 params['url'] = search_url params['method'] = 'POST' - params['data'] = {'query': query, - 'startat': offset} + params['data'] = { + 'query': query, + 'page': params['pageno'], + 'cat': 'web', + 'cmd': 'process_search', + 'engine0': 'v1all', + } # set language if specified if params['language'] != 'all': - params['data']['with_language'] = ('lang_' + params['language'].split('-')[0]) + language = 'english' + for lc, _, _, lang in language_codes: + if lc == params['language']: + language = lang + params['data']['language'] = language + params['data']['lui'] = language return params @@ -59,8 +71,8 @@ def response(resp): dom = html.fromstring(resp.text) # parse results - for result in dom.xpath(results_xpath): - links = result.xpath(link_xpath) + for result in eval_xpath(dom, results_xpath): + links = eval_xpath(result, link_xpath) if not links: continue link = links[0] @@ -74,14 +86,10 @@ def response(resp): if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url): continue - # block ixquick search url's - if 
re.match(r"^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url): - continue - title = extract_text(link) - if result.xpath('./p[@class="desc clk"]'): - content = extract_text(result.xpath('./p[@class="desc clk"]')) + if eval_xpath(result, content_xpath): + content = extract_text(eval_xpath(result, content_xpath)) else: content = '' @@ -91,11 +99,14 @@ def response(resp): if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content): date_pos = content.find('...') + 4 date_string = content[0:date_pos - 5] - published_date = parser.parse(date_string, dayfirst=True) - # fix content string content = content[date_pos:] + try: + published_date = parser.parse(date_string, dayfirst=True) + except ValueError: + pass + # check if search result starts with something like: "5 days ago ... " elif re.match(r"^[0-9]+ days? ago \.\.\. ", content): date_pos = content.find('...') + 4 diff --git a/searx/engines/subtitleseeker.py b/searx/engines/subtitleseeker.py deleted file mode 100644 index 2cbc991b3..000000000 --- a/searx/engines/subtitleseeker.py +++ /dev/null @@ -1,86 +0,0 @@ -""" - Subtitleseeker (Video) - - @website http://www.subtitleseeker.com - @provide-api no - - @using-api no - @results HTML - @stable no (HTML can change) - @parse url, title, content -""" - -from lxml import html -from searx.languages import language_codes -from searx.engines.xpath import extract_text -from searx.url_utils import quote_plus - -# engine dependent config -categories = ['videos'] -paging = True -language = "" - -# search-url -url = 'http://www.subtitleseeker.com/' -search_url = url + 'search/TITLES/{query}?p={pageno}' - -# specific xpath variables -results_xpath = '//div[@class="boxRows"]' - - -# do search-request -def request(query, params): - params['url'] = search_url.format(query=quote_plus(query), - pageno=params['pageno']) - return params - - -# get response from search-request -def response(resp): - results = [] - - dom = html.fromstring(resp.text) - - 
search_lang = "" - - # dirty fix for languages named differenly in their site - if resp.search_params['language'][:2] == 'fa': - search_lang = 'Farsi' - elif resp.search_params['language'] == 'pt-BR': - search_lang = 'Brazilian' - elif resp.search_params['language'] != 'all': - search_lang = [lc[3] - for lc in language_codes - if lc[0].split('-')[0] == resp.search_params['language'].split('-')[0]] - search_lang = search_lang[0].split(' (')[0] - - # parse results - for result in dom.xpath(results_xpath): - link = result.xpath(".//a")[0] - href = link.attrib.get('href') - - if language is not "": - href = href + language + '/' - elif search_lang: - href = href + search_lang + '/' - - title = extract_text(link) - - content = extract_text(result.xpath('.//div[contains(@class,"red")]')) - content = content + " - " - text = extract_text(result.xpath('.//div[contains(@class,"grey-web")]')[0]) - content = content + text - - if result.xpath(".//span") != []: - content = content +\ - " - (" +\ - extract_text(result.xpath(".//span")) +\ - ")" - - # append result - results.append({'url': href, - 'title': title, - 'content': content}) - - # return results - return results diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py deleted file mode 100644 index e9c13ca24..000000000 --- a/searx/engines/swisscows.py +++ /dev/null @@ -1,126 +0,0 @@ -""" - Swisscows (Web, Images) - - @website https://swisscows.ch - @provide-api no - - @using-api no - @results HTML (using search portal) - @stable no (HTML can change) - @parse url, title, content -""" - -from json import loads -import re -from lxml.html import fromstring -from searx.url_utils import unquote, urlencode - -# engine dependent config -categories = ['general', 'images'] -paging = True -language_support = True - -# search-url -base_url = 'https://swisscows.ch/' -search_string = '?{query}&page={page}' - -supported_languages_url = base_url - -# regex -regex_json = re.compile(b'initialData: 
{"Request":(.|\n)*},\s*environment') -regex_json_remove_start = re.compile(b'^initialData:\s*') -regex_json_remove_end = re.compile(b',\s*environment$') -regex_img_url_remove_start = re.compile(b'^https?://i\.swisscows\.ch/\?link=') - - -# do search-request -def request(query, params): - if params['language'] == 'all': - ui_language = 'browser' - region = 'browser' - elif params['language'].split('-')[0] == 'no': - region = 'nb-NO' - else: - region = params['language'] - ui_language = params['language'].split('-')[0] - - search_path = search_string.format( - query=urlencode({'query': query, 'uiLanguage': ui_language, 'region': region}), - page=params['pageno'] - ) - - # image search query is something like 'image?{query}&page={page}' - if params['category'] == 'images': - search_path = 'image' + search_path - - params['url'] = base_url + search_path - - return params - - -# get response from search-request -def response(resp): - results = [] - - json_regex = regex_json.search(resp.text) - - # check if results are returned - if not json_regex: - return [] - - json_raw = regex_json_remove_end.sub(b'', regex_json_remove_start.sub(b'', json_regex.group())) - json = loads(json_raw.decode('utf-8')) - - # parse results - for result in json['Results'].get('items', []): - result_title = result['Title'].replace(u'\uE000', '').replace(u'\uE001', '') - - # parse image results - if result.get('ContentType', '').startswith('image'): - img_url = unquote(regex_img_url_remove_start.sub(b'', result['Url'].encode('utf-8')).decode('utf-8')) - - # append result - results.append({'url': result['SourceUrl'], - 'title': result['Title'], - 'content': '', - 'img_src': img_url, - 'template': 'images.html'}) - - # parse general results - else: - result_url = result['Url'].replace(u'\uE000', '').replace(u'\uE001', '') - result_content = result['Description'].replace(u'\uE000', '').replace(u'\uE001', '') - - # append result - results.append({'url': result_url, - 'title': result_title, - 
'content': result_content}) - - # parse images - for result in json.get('Images', []): - # decode image url - img_url = unquote(regex_img_url_remove_start.sub(b'', result['Url'].encode('utf-8')).decode('utf-8')) - - # append result - results.append({'url': result['SourceUrl'], - 'title': result['Title'], - 'content': '', - 'img_src': img_url, - 'template': 'images.html'}) - - # return results - return results - - -# get supported languages from their site -def _fetch_supported_languages(resp): - supported_languages = [] - dom = fromstring(resp.text) - options = dom.xpath('//div[@id="regions-popup"]//ul/li/a') - for option in options: - code = option.xpath('./@data-val')[0] - if code.startswith('nb-'): - code = code.replace('nb', 'no', 1) - supported_languages.append(code) - - return supported_languages diff --git a/searx/engines/tokyotoshokan.py b/searx/engines/tokyotoshokan.py index 9a6b5e57d..773212043 100644 --- a/searx/engines/tokyotoshokan.py +++ b/searx/engines/tokyotoshokan.py @@ -14,8 +14,8 @@ import re from lxml import html from searx.engines.xpath import extract_text from datetime import datetime -from searx.engines.nyaa import int_or_zero, get_filesize_mul from searx.url_utils import urlencode +from searx.utils import get_torrent_size, int_or_zero # engine dependent config categories = ['files', 'videos', 'music'] @@ -76,8 +76,7 @@ def response(resp): try: # ('1.228', 'GB') groups = size_re.match(item).groups() - multiplier = get_filesize_mul(groups[1]) - params['filesize'] = int(multiplier * float(groups[0])) + params['filesize'] = get_torrent_size(groups[0], groups[1]) except: pass elif item.startswith('Date:'): diff --git a/searx/engines/torrentz.py b/searx/engines/torrentz.py index dda56fc22..fd4164a66 100644 --- a/searx/engines/torrentz.py +++ b/searx/engines/torrentz.py @@ -1,7 +1,7 @@ """ - Torrentz.eu (BitTorrent meta-search engine) + Torrentz2.eu (BitTorrent meta-search engine) - @website https://torrentz.eu/ + @website https://torrentz2.eu/ 
@provide-api no @using-api no @@ -14,24 +14,24 @@ import re from lxml import html from datetime import datetime -from searx.engines.nyaa import int_or_zero, get_filesize_mul from searx.engines.xpath import extract_text from searx.url_utils import urlencode +from searx.utils import get_torrent_size # engine dependent config categories = ['files', 'videos', 'music'] paging = True # search-url -# https://torrentz.eu/search?f=EXAMPLE&p=6 -base_url = 'https://torrentz.eu/' +# https://torrentz2.eu/search?f=EXAMPLE&p=6 +base_url = 'https://torrentz2.eu/' search_url = base_url + 'search?{query}' # do search-request def request(query, params): page = params['pageno'] - 1 - query = urlencode({'q': query, 'p': page}) + query = urlencode({'f': query, 'p': page}) params['url'] = search_url.format(query=query) return params @@ -54,22 +54,29 @@ def response(resp): # extract url and remove a slash in the beginning link = links[0].attrib.get('href').lstrip('/') - seed = result.xpath('./dd/span[@class="u"]/text()')[0].replace(',', '') - leech = result.xpath('./dd/span[@class="d"]/text()')[0].replace(',', '') + seed = 0 + leech = 0 + try: + seed = int(result.xpath('./dd/span[4]/text()')[0].replace(',', '')) + leech = int(result.xpath('./dd/span[5]/text()')[0].replace(',', '')) + except: + pass params = { 'url': base_url + link, 'title': title, - 'seed': int_or_zero(seed), - 'leech': int_or_zero(leech), + 'seed': seed, + 'leech': leech, 'template': 'torrent.html' } # let's try to calculate the torrent size try: - size_str = result.xpath('./dd/span[@class="s"]/text()')[0] - size, suffix = size_str.split() - params['filesize'] = int(size) * get_filesize_mul(suffix) + filesize_info = result.xpath('./dd/span[3]/text()')[0] + filesize, filesize_multiplier = filesize_info.split() + filesize = get_torrent_size(filesize, filesize_multiplier) + + params['filesize'] = filesize except: pass @@ -80,9 +87,8 @@ def response(resp): # extract and convert creation date try: - date_str = 
result.xpath('./dd/span[@class="a"]/span')[0].attrib.get('title') - # Fri, 25 Mar 2016 16:29:01 - date = datetime.strptime(date_str, '%a, %d %b %Y %H:%M:%S') + date_ts = result.xpath('./dd/span[2]')[0].attrib.get('title') + date = datetime.fromtimestamp(float(date_ts)) params['publishedDate'] = date except: pass diff --git a/searx/engines/unsplash.py b/searx/engines/unsplash.py new file mode 100644 index 000000000..2e8d6fdfc --- /dev/null +++ b/searx/engines/unsplash.py @@ -0,0 +1,52 @@ +""" + Unsplash + + @website https://unsplash.com + @provide-api yes (https://unsplash.com/developers) + + @using-api no + @results JSON (using search portal's infiniscroll API) + @stable no (JSON format could change any time) + @parse url, title, img_src, thumbnail_src +""" + +from searx.url_utils import urlencode, urlparse, urlunparse, parse_qsl +from json import loads + +url = 'https://unsplash.com/' +search_url = url + 'napi/search/photos?' +categories = ['images'] +page_size = 20 +paging = True + + +def clean_url(url): + parsed = urlparse(url) + query = [(k, v) for (k, v) in parse_qsl(parsed.query) if k not in ['ixid', 's']] + + return urlunparse((parsed.scheme, + parsed.netloc, + parsed.path, + parsed.params, + urlencode(query), + parsed.fragment)) + + +def request(query, params): + params['url'] = search_url + urlencode({'query': query, 'page': params['pageno'], 'per_page': page_size}) + return params + + +def response(resp): + results = [] + json_data = loads(resp.text) + + if 'results' in json_data: + for result in json_data['results']: + results.append({'template': 'images.html', + 'url': clean_url(result['links']['html']), + 'thumbnail_src': clean_url(result['urls']['thumb']), + 'img_src': clean_url(result['urls']['raw']), + 'title': result['description'], + 'content': ''}) + return results diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py index 1408be8df..a92271019 100644 --- a/searx/engines/vimeo.py +++ b/searx/engines/vimeo.py @@ -24,7 +24,7 @@ paging = True 
base_url = 'https://vimeo.com/' search_url = base_url + '/search/page:{pageno}?{query}' -embedded_url = '<iframe data-src="//player.vimeo.com/video/{videoid}" ' +\ +embedded_url = '<iframe data-src="https://player.vimeo.com/video/{videoid}" ' +\ 'width="540" height="304" frameborder="0" ' +\ 'webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>' diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index be217463c..e913b3915 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -16,9 +16,11 @@ from searx.poolrequests import get from searx.engines.xpath import extract_text from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url from searx.url_utils import urlencode +from searx.utils import match_language, eval_xpath from json import loads from lxml.html import fromstring +from lxml import etree logger = logger.getChild('wikidata') result_count = 1 @@ -26,23 +28,23 @@ result_count = 1 # urls wikidata_host = 'https://www.wikidata.org' url_search = wikidata_host \ - + '/wiki/Special:ItemDisambiguation?{query}' + + '/w/index.php?{query}&ns0=1' wikidata_api = wikidata_host + '/w/api.php' url_detail = wikidata_api\ + '?action=parse&format=json&{query}'\ - + '&redirects=1&prop=text%7Cdisplaytitle%7Clanglinks%7Crevid'\ - + '&disableeditsection=1&disabletidy=1&preview=1&sectionpreview=1&disabletoc=1&utf8=1&formatversion=2' + + '&redirects=1&prop=text%7Cdisplaytitle%7Cparsewarnings'\ + + '&disableeditsection=1&preview=1&sectionpreview=1&disabletoc=1&utf8=1&formatversion=2' url_map = 'https://www.openstreetmap.org/'\ + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M' url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400' # xpaths -wikidata_ids_xpath = '//div/ul[@class="wikibase-disambiguation"]/li/a/@title' +div_ids_xpath = '//div[@id]' +wikidata_ids_xpath = '//ul[@class="mw-search-results"]/li//a/@href' title_xpath = 
'//*[contains(@class,"wikibase-title-label")]' description_xpath = '//div[contains(@class,"wikibase-entitytermsview-heading-description")]' -property_xpath = '//div[@id="{propertyid}"]' label_xpath = './/div[contains(@class,"wikibase-statementgroupview-property-label")]/a' url_xpath = './/a[contains(@class,"external free") or contains(@class, "wb-external-id")]' wikilink_xpath = './/ul[contains(@class,"wikibase-sitelinklistview-listview")]'\ @@ -53,38 +55,47 @@ value_xpath = './/div[contains(@class,"wikibase-statementview-mainsnak")]'\ + '/*/div[contains(@class,"wikibase-snakview-value")]' language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator")]' calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]' +media_xpath = value_xpath + '//div[contains(@class,"commons-media-caption")]//a' -def request(query, params): - language = params['language'].split('-')[0] - if language == 'all': - language = 'en' +def get_id_cache(result): + id_cache = {} + for e in eval_xpath(result, div_ids_xpath): + id = e.get('id') + if id.startswith('P'): + id_cache[id] = e + return id_cache + +def request(query, params): params['url'] = url_search.format( - query=urlencode({'label': query, 'language': language})) + query=urlencode({'search': query})) return params def response(resp): results = [] - html = fromstring(resp.text) - wikidata_ids = html.xpath(wikidata_ids_xpath) + htmlparser = etree.HTMLParser() + html = fromstring(resp.content.decode("utf-8"), parser=htmlparser) + search_results = eval_xpath(html, wikidata_ids_xpath) - language = resp.search_params['language'].split('-')[0] - if language == 'all': + if resp.search_params['language'].split('-')[0] == 'all': language = 'en' + else: + language = match_language(resp.search_params['language'], supported_languages, language_aliases).split('-')[0] # TODO: make requests asynchronous to avoid timeout when result_count > 1 - for wikidata_id in wikidata_ids[:result_count]: + for search_result in 
search_results[:result_count]: + wikidata_id = search_result.split('/')[-1] url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language})) htmlresponse = get(url) - jsonresponse = loads(htmlresponse.text) - results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language']) + jsonresponse = loads(htmlresponse.content.decode("utf-8")) + results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'], htmlparser) return results -def getDetail(jsonresponse, wikidata_id, language, locale): +def getDetail(jsonresponse, wikidata_id, language, locale, htmlparser): results = [] urls = [] attributes = [] @@ -95,21 +106,23 @@ def getDetail(jsonresponse, wikidata_id, language, locale): if not title or not result: return results - title = fromstring(title) - for elem in title.xpath(language_fallback_xpath): + title = fromstring(title, parser=htmlparser) + for elem in eval_xpath(title, language_fallback_xpath): elem.getparent().remove(elem) - title = extract_text(title.xpath(title_xpath)) + title = extract_text(eval_xpath(title, title_xpath)) - result = fromstring(result) - for elem in result.xpath(language_fallback_xpath): + result = fromstring(result, parser=htmlparser) + for elem in eval_xpath(result, language_fallback_xpath): elem.getparent().remove(elem) - description = extract_text(result.xpath(description_xpath)) + description = extract_text(eval_xpath(result, description_xpath)) + + id_cache = get_id_cache(result) # URLS # official website - add_url(urls, result, 'P856', results=results) + add_url(urls, result, id_cache, 'P856', results=results) # wikipedia wikipedia_link_count = 0 @@ -130,30 +143,30 @@ def getDetail(jsonresponse, wikidata_id, language, locale): # if wikipedia_link_count == 0: # more wikis - add_url(urls, result, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage') - add_url(urls, result, default_label='Wikiquote (' + language + ')', link_type=language + 
'wikiquote') - add_url(urls, result, default_label='Wikimedia Commons', link_type='commonswiki') + add_url(urls, result, id_cache, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage') + add_url(urls, result, id_cache, default_label='Wikiquote (' + language + ')', link_type=language + 'wikiquote') + add_url(urls, result, id_cache, default_label='Wikimedia Commons', link_type='commonswiki') - add_url(urls, result, 'P625', 'OpenStreetMap', link_type='geo') + add_url(urls, result, id_cache, 'P625', 'OpenStreetMap', link_type='geo') # musicbrainz - add_url(urls, result, 'P434', 'MusicBrainz', 'http://musicbrainz.org/artist/') - add_url(urls, result, 'P435', 'MusicBrainz', 'http://musicbrainz.org/work/') - add_url(urls, result, 'P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/') - add_url(urls, result, 'P966', 'MusicBrainz', 'http://musicbrainz.org/label/') + add_url(urls, result, id_cache, 'P434', 'MusicBrainz', 'http://musicbrainz.org/artist/') + add_url(urls, result, id_cache, 'P435', 'MusicBrainz', 'http://musicbrainz.org/work/') + add_url(urls, result, id_cache, 'P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/') + add_url(urls, result, id_cache, 'P966', 'MusicBrainz', 'http://musicbrainz.org/label/') # IMDb - add_url(urls, result, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb') + add_url(urls, result, id_cache, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb') # source code repository - add_url(urls, result, 'P1324') + add_url(urls, result, id_cache, 'P1324') # blog - add_url(urls, result, 'P1581') + add_url(urls, result, id_cache, 'P1581') # social media links - add_url(urls, result, 'P2397', 'YouTube', 'https://www.youtube.com/channel/') - add_url(urls, result, 'P1651', 'YouTube', 'https://www.youtube.com/watch?v=') - add_url(urls, result, 'P2002', 'Twitter', 'https://twitter.com/') - add_url(urls, result, 'P2013', 'Facebook', 'https://facebook.com/') - add_url(urls, result, 'P2003', 
'Instagram', 'https://instagram.com/') + add_url(urls, result, id_cache, 'P2397', 'YouTube', 'https://www.youtube.com/channel/') + add_url(urls, result, id_cache, 'P1651', 'YouTube', 'https://www.youtube.com/watch?v=') + add_url(urls, result, id_cache, 'P2002', 'Twitter', 'https://twitter.com/') + add_url(urls, result, id_cache, 'P2013', 'Facebook', 'https://facebook.com/') + add_url(urls, result, id_cache, 'P2003', 'Instagram', 'https://instagram.com/') urls.append({'title': 'Wikidata', 'url': 'https://www.wikidata.org/wiki/' @@ -163,132 +176,132 @@ def getDetail(jsonresponse, wikidata_id, language, locale): # DATES # inception date - add_attribute(attributes, result, 'P571', date=True) + add_attribute(attributes, id_cache, 'P571', date=True) # dissolution date - add_attribute(attributes, result, 'P576', date=True) + add_attribute(attributes, id_cache, 'P576', date=True) # start date - add_attribute(attributes, result, 'P580', date=True) + add_attribute(attributes, id_cache, 'P580', date=True) # end date - add_attribute(attributes, result, 'P582', date=True) + add_attribute(attributes, id_cache, 'P582', date=True) # date of birth - add_attribute(attributes, result, 'P569', date=True) + add_attribute(attributes, id_cache, 'P569', date=True) # date of death - add_attribute(attributes, result, 'P570', date=True) + add_attribute(attributes, id_cache, 'P570', date=True) # date of spacecraft launch - add_attribute(attributes, result, 'P619', date=True) + add_attribute(attributes, id_cache, 'P619', date=True) # date of spacecraft landing - add_attribute(attributes, result, 'P620', date=True) + add_attribute(attributes, id_cache, 'P620', date=True) # nationality - add_attribute(attributes, result, 'P27') + add_attribute(attributes, id_cache, 'P27') # country of origin - add_attribute(attributes, result, 'P495') + add_attribute(attributes, id_cache, 'P495') # country - add_attribute(attributes, result, 'P17') + add_attribute(attributes, id_cache, 'P17') # headquarters - 
add_attribute(attributes, result, 'Q180') + add_attribute(attributes, id_cache, 'Q180') # PLACES # capital - add_attribute(attributes, result, 'P36', trim=True) + add_attribute(attributes, id_cache, 'P36', trim=True) # head of state - add_attribute(attributes, result, 'P35', trim=True) + add_attribute(attributes, id_cache, 'P35', trim=True) # head of government - add_attribute(attributes, result, 'P6', trim=True) + add_attribute(attributes, id_cache, 'P6', trim=True) # type of government - add_attribute(attributes, result, 'P122') + add_attribute(attributes, id_cache, 'P122') # official language - add_attribute(attributes, result, 'P37') + add_attribute(attributes, id_cache, 'P37') # population - add_attribute(attributes, result, 'P1082', trim=True) + add_attribute(attributes, id_cache, 'P1082', trim=True) # area - add_attribute(attributes, result, 'P2046') + add_attribute(attributes, id_cache, 'P2046') # currency - add_attribute(attributes, result, 'P38', trim=True) + add_attribute(attributes, id_cache, 'P38', trim=True) # heigth (building) - add_attribute(attributes, result, 'P2048') + add_attribute(attributes, id_cache, 'P2048') # MEDIA # platform (videogames) - add_attribute(attributes, result, 'P400') + add_attribute(attributes, id_cache, 'P400') # author - add_attribute(attributes, result, 'P50') + add_attribute(attributes, id_cache, 'P50') # creator - add_attribute(attributes, result, 'P170') + add_attribute(attributes, id_cache, 'P170') # director - add_attribute(attributes, result, 'P57') + add_attribute(attributes, id_cache, 'P57') # performer - add_attribute(attributes, result, 'P175') + add_attribute(attributes, id_cache, 'P175') # developer - add_attribute(attributes, result, 'P178') + add_attribute(attributes, id_cache, 'P178') # producer - add_attribute(attributes, result, 'P162') + add_attribute(attributes, id_cache, 'P162') # manufacturer - add_attribute(attributes, result, 'P176') + add_attribute(attributes, id_cache, 'P176') # screenwriter - 
add_attribute(attributes, result, 'P58') + add_attribute(attributes, id_cache, 'P58') # production company - add_attribute(attributes, result, 'P272') + add_attribute(attributes, id_cache, 'P272') # record label - add_attribute(attributes, result, 'P264') + add_attribute(attributes, id_cache, 'P264') # publisher - add_attribute(attributes, result, 'P123') + add_attribute(attributes, id_cache, 'P123') # original network - add_attribute(attributes, result, 'P449') + add_attribute(attributes, id_cache, 'P449') # distributor - add_attribute(attributes, result, 'P750') + add_attribute(attributes, id_cache, 'P750') # composer - add_attribute(attributes, result, 'P86') + add_attribute(attributes, id_cache, 'P86') # publication date - add_attribute(attributes, result, 'P577', date=True) + add_attribute(attributes, id_cache, 'P577', date=True) # genre - add_attribute(attributes, result, 'P136') + add_attribute(attributes, id_cache, 'P136') # original language - add_attribute(attributes, result, 'P364') + add_attribute(attributes, id_cache, 'P364') # isbn - add_attribute(attributes, result, 'Q33057') + add_attribute(attributes, id_cache, 'Q33057') # software license - add_attribute(attributes, result, 'P275') + add_attribute(attributes, id_cache, 'P275') # programming language - add_attribute(attributes, result, 'P277') + add_attribute(attributes, id_cache, 'P277') # version - add_attribute(attributes, result, 'P348', trim=True) + add_attribute(attributes, id_cache, 'P348', trim=True) # narrative location - add_attribute(attributes, result, 'P840') + add_attribute(attributes, id_cache, 'P840') # LANGUAGES # number of speakers - add_attribute(attributes, result, 'P1098') + add_attribute(attributes, id_cache, 'P1098') # writing system - add_attribute(attributes, result, 'P282') + add_attribute(attributes, id_cache, 'P282') # regulatory body - add_attribute(attributes, result, 'P1018') + add_attribute(attributes, id_cache, 'P1018') # language code - add_attribute(attributes, 
result, 'P218') + add_attribute(attributes, id_cache, 'P218') # OTHER # ceo - add_attribute(attributes, result, 'P169', trim=True) + add_attribute(attributes, id_cache, 'P169', trim=True) # founder - add_attribute(attributes, result, 'P112') + add_attribute(attributes, id_cache, 'P112') # legal form (company/organization) - add_attribute(attributes, result, 'P1454') + add_attribute(attributes, id_cache, 'P1454') # operator - add_attribute(attributes, result, 'P137') + add_attribute(attributes, id_cache, 'P137') # crew members (tripulation) - add_attribute(attributes, result, 'P1029') + add_attribute(attributes, id_cache, 'P1029') # taxon - add_attribute(attributes, result, 'P225') + add_attribute(attributes, id_cache, 'P225') # chemical formula - add_attribute(attributes, result, 'P274') + add_attribute(attributes, id_cache, 'P274') # winner (sports/contests) - add_attribute(attributes, result, 'P1346') + add_attribute(attributes, id_cache, 'P1346') # number of deaths - add_attribute(attributes, result, 'P1120') + add_attribute(attributes, id_cache, 'P1120') # currency code - add_attribute(attributes, result, 'P498') + add_attribute(attributes, id_cache, 'P498') - image = add_image(result) + image = add_image(id_cache) if len(attributes) == 0 and len(urls) == 2 and len(description) == 0: results.append({ @@ -310,43 +323,42 @@ def getDetail(jsonresponse, wikidata_id, language, locale): # only returns first match -def add_image(result): +def add_image(id_cache): # P15: route map, P242: locator map, P154: logo, P18: image, P242: map, P41: flag, P2716: collage, P2910: icon property_ids = ['P15', 'P242', 'P154', 'P18', 'P242', 'P41', 'P2716', 'P2910'] for property_id in property_ids: - image = result.xpath(property_xpath.replace('{propertyid}', property_id)) - if image: - image_name = image[0].xpath(value_xpath) + image = id_cache.get(property_id, None) + if image is not None: + image_name = eval_xpath(image, media_xpath) image_src = url_image.replace('{filename}', 
extract_text(image_name[0])) return image_src # setting trim will only returned high ranked rows OR the first row -def add_attribute(attributes, result, property_id, default_label=None, date=False, trim=False): - attribute = result.xpath(property_xpath.replace('{propertyid}', property_id)) - if attribute: +def add_attribute(attributes, id_cache, property_id, default_label=None, date=False, trim=False): + attribute = id_cache.get(property_id, None) + if attribute is not None: if default_label: label = default_label else: - label = extract_text(attribute[0].xpath(label_xpath)) + label = extract_text(eval_xpath(attribute, label_xpath)) label = label[0].upper() + label[1:] if date: trim = True # remove calendar name - calendar_name = attribute[0].xpath(calendar_name_xpath) + calendar_name = eval_xpath(attribute, calendar_name_xpath) for calendar in calendar_name: calendar.getparent().remove(calendar) concat_values = "" values = [] first_value = None - for row in attribute[0].xpath(property_row_xpath): - if not first_value or not trim or row.xpath(preferred_rank_xpath): - - value = row.xpath(value_xpath) + for row in eval_xpath(attribute, property_row_xpath): + if not first_value or not trim or eval_xpath(row, preferred_rank_xpath): + value = eval_xpath(row, value_xpath) if not value: continue value = extract_text(value) @@ -369,18 +381,18 @@ def add_attribute(attributes, result, property_id, default_label=None, date=Fals # requires property_id unless it's a wiki link (defined in link_type) -def add_url(urls, result, property_id=None, default_label=None, url_prefix=None, results=None, link_type=None): +def add_url(urls, result, id_cache, property_id=None, default_label=None, url_prefix=None, results=None, + link_type=None): links = [] # wiki links don't have property in wikidata page if link_type and 'wiki' in link_type: links.append(get_wikilink(result, link_type)) else: - dom_element = result.xpath(property_xpath.replace('{propertyid}', property_id)) - if dom_element: 
- dom_element = dom_element[0] + dom_element = id_cache.get(property_id, None) + if dom_element is not None: if not default_label: - label = extract_text(dom_element.xpath(label_xpath)) + label = extract_text(eval_xpath(dom_element, label_xpath)) label = label[0].upper() + label[1:] if link_type == 'geo': @@ -390,7 +402,7 @@ def add_url(urls, result, property_id=None, default_label=None, url_prefix=None, links.append(get_imdblink(dom_element, url_prefix)) else: - url_results = dom_element.xpath(url_xpath) + url_results = eval_xpath(dom_element, url_xpath) for link in url_results: if link is not None: if url_prefix: @@ -410,7 +422,7 @@ def add_url(urls, result, property_id=None, default_label=None, url_prefix=None, def get_imdblink(result, url_prefix): - imdb_id = result.xpath(value_xpath) + imdb_id = eval_xpath(result, value_xpath) if imdb_id: imdb_id = extract_text(imdb_id) id_prefix = imdb_id[:2] @@ -430,7 +442,7 @@ def get_imdblink(result, url_prefix): def get_geolink(result): - coordinates = result.xpath(value_xpath) + coordinates = eval_xpath(result, value_xpath) if not coordinates: return None coordinates = extract_text(coordinates[0]) @@ -477,7 +489,7 @@ def get_geolink(result): def get_wikilink(result, wikiid): - url = result.xpath(wikilink_xpath.replace('{wikiid}', wikiid)) + url = eval_xpath(result, wikilink_xpath.replace('{wikiid}', wikiid)) if not url: return None url = url[0] diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index db2fdc000..a216ba886 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -13,6 +13,7 @@ from json import loads from lxml.html import fromstring from searx.url_utils import quote, urlencode +from searx.utils import match_language # search-url base_url = u'https://{language}.wikipedia.org/' @@ -20,7 +21,8 @@ search_url = base_url + u'w/api.php?'\ 'action=query'\ '&format=json'\ '&{query}'\ - '&prop=extracts|pageimages'\ + '&prop=extracts|pageimages|pageprops'\ + 
'&ppprop=disambiguation'\ '&exintro'\ '&explaintext'\ '&pithumbsize=300'\ @@ -30,13 +32,10 @@ supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias' # set language in base_url def url_lang(lang): - lang = lang.split('-')[0] - if lang == 'all' or lang not in supported_languages: - language = 'en' - else: - language = lang - - return language + lang_pre = lang.split('-')[0] + if lang_pre == 'all' or lang_pre not in supported_languages and lang_pre not in language_aliases: + return 'en' + return match_language(lang, supported_languages, language_aliases).split('-')[0] # do search-request @@ -81,12 +80,15 @@ def response(resp): # wikipedia article's unique id # first valid id is assumed to be the requested article + if 'pages' not in search_result['query']: + return results + for article_id in search_result['query']['pages']: page = search_result['query']['pages'][article_id] if int(article_id) > 0: break - if int(article_id) < 0: + if int(article_id) < 0 or 'disambiguation' in page.get('pageprops', {}): return [] title = page.get('title') @@ -98,6 +100,7 @@ def response(resp): extract = page.get('extract') summary = extract_first_paragraph(extract, title, image) + summary = summary.replace('() ', '') # link to wikipedia article wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \ diff --git a/searx/engines/wolframalpha_api.py b/searx/engines/wolframalpha_api.py index 595c6b7de..1c58c4a9b 100644 --- a/searx/engines/wolframalpha_api.py +++ b/searx/engines/wolframalpha_api.py @@ -65,7 +65,7 @@ def replace_pua_chars(text): def response(resp): results = [] - search_results = etree.XML(resp.text) + search_results = etree.XML(resp.content) # return empty array if there are no results if search_results.xpath(failure_xpath): diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index 2cbbc5adc..387c9fa17 100644 --- a/searx/engines/wolframalpha_noapi.py +++ 
b/searx/engines/wolframalpha_noapi.py @@ -55,7 +55,7 @@ def obtain_token(): return token -def init(): +def init(engine_settings=None): obtain_token() diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py index 508803240..f1154b16d 100644 --- a/searx/engines/www1x.py +++ b/searx/engines/www1x.py @@ -11,8 +11,8 @@ """ from lxml import html -import re from searx.url_utils import urlencode, urljoin +from searx.engines.xpath import extract_text # engine dependent config categories = ['images'] @@ -34,41 +34,18 @@ def request(query, params): def response(resp): results = [] - # get links from result-text - regex = re.compile('(</a>|<a)') - results_parts = re.split(regex, resp.text) - - cur_element = '' - - # iterate over link parts - for result_part in results_parts: + dom = html.fromstring(resp.text) + for res in dom.xpath('//div[@class="List-item MainListing"]'): # processed start and end of link - if result_part == '<a': - cur_element = result_part - continue - elif result_part != '</a>': - cur_element += result_part - continue - - cur_element += result_part - - # fix xml-error - cur_element = cur_element.replace('"></a>', '"/></a>') - - dom = html.fromstring(cur_element) - link = dom.xpath('//a')[0] + link = res.xpath('//a')[0] url = urljoin(base_url, link.attrib.get('href')) - title = link.attrib.get('title', '') + title = extract_text(link) - thumbnail_src = urljoin(base_url, link.xpath('.//img')[0].attrib['src']) + thumbnail_src = urljoin(base_url, res.xpath('.//img')[0].attrib['src']) # TODO: get image with higher resolution img_src = thumbnail_src - # check if url is showing to a photo - if '/photo/' not in url: - continue - # append result results.append({'url': url, 'title': title, diff --git a/searx/engines/www500px.py b/searx/engines/www500px.py deleted file mode 100644 index 7a2015ae9..000000000 --- a/searx/engines/www500px.py +++ /dev/null @@ -1,73 +0,0 @@ -""" - 500px (Images) - - @website https://500px.com - @provide-api yes 
(https://developers.500px.com/) - - @using-api no - @results HTML - @stable no (HTML can change) - @parse url, title, thumbnail, img_src, content - - @todo rewrite to api -""" - -from json import loads -from searx.url_utils import urlencode, urljoin - -# engine dependent config -categories = ['images'] -paging = True - -# search-url -base_url = 'https://500px.com' -search_url = 'https://api.500px.com/v1/photos/search?type=photos'\ - '&{query}'\ - '&image_size%5B%5D=4'\ - '&image_size%5B%5D=20'\ - '&image_size%5B%5D=21'\ - '&image_size%5B%5D=1080'\ - '&image_size%5B%5D=1600'\ - '&image_size%5B%5D=2048'\ - '&include_states=true'\ - '&formats=jpeg%2Clytro'\ - '&include_tags=true'\ - '&exclude_nude=true'\ - '&page={pageno}'\ - '&rpp=50'\ - '&sdk_key=b68e60cff4c929bedea36ca978830c5caca790c3' - - -# do search-request -def request(query, params): - params['url'] = search_url.format(pageno=params['pageno'], - query=urlencode({'term': query})) - - return params - - -# get response from search-request -def response(resp): - results = [] - - response_json = loads(resp.text) - - # parse results - for result in response_json['photos']: - url = urljoin(base_url, result['url']) - title = result['name'] - # last index is the biggest resolution - img_src = result['image_url'][-1] - thumbnail_src = result['image_url'][0] - content = result['description'] or '' - - # append result - results.append({'url': url, - 'title': title, - 'img_src': img_src, - 'content': content, - 'thumbnail_src': thumbnail_src, - 'template': 'images.html'}) - - # return results - return results diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py index c8c56da44..b75896cc7 100644 --- a/searx/engines/xpath.py +++ b/searx/engines/xpath.py @@ -1,12 +1,13 @@ from lxml import html from lxml.etree import _ElementStringResult, _ElementUnicodeResult -from searx.utils import html_to_text +from searx.utils import html_to_text, eval_xpath from searx.url_utils import unquote, urlencode, urljoin, urlparse 
search_url = None url_xpath = None content_xpath = None title_xpath = None +thumbnail_xpath = False paging = False suggestion_xpath = '' results_xpath = '' @@ -40,7 +41,9 @@ def extract_text(xpath_results): return ''.join(xpath_results) else: # it's a element - text = html.tostring(xpath_results, encoding='unicode', method='text', with_tail=False) + text = html.tostring( + xpath_results, encoding='unicode', method='text', with_tail=False + ) text = text.strip().replace('\n', ' ') return ' '.join(text.split()) @@ -53,7 +56,7 @@ def extract_url(xpath_results, search_url): if url.startswith('//'): # add http or https to this kind of url //example.com/ parsed_search_url = urlparse(search_url) - url = u'{0}:{1}'.format(parsed_search_url.scheme, url) + url = u'{0}:{1}'.format(parsed_search_url.scheme or 'http', url) elif url.startswith('/'): # fix relative url to the search engine url = urljoin(search_url, url) @@ -101,22 +104,30 @@ def response(resp): results = [] dom = html.fromstring(resp.text) if results_xpath: - for result in dom.xpath(results_xpath): - url = extract_url(result.xpath(url_xpath), search_url) - title = extract_text(result.xpath(title_xpath)) - content = extract_text(result.xpath(content_xpath)) - results.append({'url': url, 'title': title, 'content': content}) + for result in eval_xpath(dom, results_xpath): + url = extract_url(eval_xpath(result, url_xpath), search_url) + title = extract_text(eval_xpath(result, title_xpath)) + content = extract_text(eval_xpath(result, content_xpath)) + tmp_result = {'url': url, 'title': title, 'content': content} + + # add thumbnail if available + if thumbnail_xpath: + thumbnail_xpath_result = eval_xpath(result, thumbnail_xpath) + if len(thumbnail_xpath_result) > 0: + tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url) + + results.append(tmp_result) else: for url, title, content in zip( (extract_url(x, search_url) for - x in dom.xpath(url_xpath)), - map(extract_text, dom.xpath(title_xpath)), - 
map(extract_text, dom.xpath(content_xpath)) + x in eval_xpath(dom, url_xpath)), + map(extract_text, eval_xpath(dom, title_xpath)), + map(extract_text, eval_xpath(dom, content_xpath)) ): results.append({'url': url, 'title': title, 'content': content}) if not suggestion_xpath: return results - for suggestion in dom.xpath(suggestion_xpath): + for suggestion in eval_xpath(dom, suggestion_xpath): results.append({'suggestion': extract_text(suggestion)}) return results diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py index a62a1296e..25bc83687 100644 --- a/searx/engines/yacy.py +++ b/searx/engines/yacy.py @@ -76,8 +76,17 @@ def response(resp): for result in search_results[0].get('items', []): # parse image results if result.get('image'): + + result_url = '' + if 'url' in result: + result_url = result['url'] + elif 'link' in result: + result_url = result['link'] + else: + continue + # append result - results.append({'url': result['url'], + results.append({'url': result_url, 'title': result['title'], 'content': '', 'img_src': result['image'], diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index 5387aaf54..36c1a11f8 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -14,6 +14,7 @@ from lxml import html from searx.engines.xpath import extract_text, extract_url from searx.url_utils import unquote, urlencode +from searx.utils import match_language, eval_xpath # engine dependent config categories = ['general'] @@ -39,6 +40,8 @@ time_range_dict = {'day': ['1d', 'd'], 'week': ['1w', 'w'], 'month': ['1m', 'm']} +language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'} + # remove yahoo-specific tracking-url def parse_url(url_string): @@ -73,13 +76,13 @@ def _get_url(query, offset, language, time_range): def _get_language(params): if params['language'] == 'all': return 'en' - elif params['language'][:2] == 'zh': - if params['language'] == 'zh' or params['language'] == 'zh-CH': - return 'szh' - else: - return 'tzh' - else: - 
return params['language'].split('-')[0] + + language = match_language(params['language'], supported_languages, language_aliases) + if language not in language_aliases.values(): + language = language.split('-')[0] + language = language.replace('-', '_').lower() + + return language # do search-request @@ -106,21 +109,21 @@ def response(resp): dom = html.fromstring(resp.text) try: - results_num = int(dom.xpath('//div[@class="compPagination"]/span[last()]/text()')[0] + results_num = int(eval_xpath(dom, '//div[@class="compPagination"]/span[last()]/text()')[0] .split()[0].replace(',', '')) results.append({'number_of_results': results_num}) except: pass # parse results - for result in dom.xpath(results_xpath): + for result in eval_xpath(dom, results_xpath): try: - url = parse_url(extract_url(result.xpath(url_xpath), search_url)) - title = extract_text(result.xpath(title_xpath)[0]) + url = parse_url(extract_url(eval_xpath(result, url_xpath), search_url)) + title = extract_text(eval_xpath(result, title_xpath)[0]) except: continue - content = extract_text(result.xpath(content_xpath)[0]) + content = extract_text(eval_xpath(result, content_xpath)[0]) # append result results.append({'url': url, @@ -128,7 +131,7 @@ def response(resp): 'content': content}) # if no suggestion found, return results - suggestions = dom.xpath(suggestion_xpath) + suggestions = eval_xpath(dom, suggestion_xpath) if not suggestions: return results @@ -145,9 +148,13 @@ def response(resp): def _fetch_supported_languages(resp): supported_languages = [] dom = html.fromstring(resp.text) - options = dom.xpath('//div[@id="yschlang"]/span/label/input') + options = eval_xpath(dom, '//div[@id="yschlang"]/span/label/input') for option in options: - code = option.xpath('./@value')[0][5:].replace('_', '-') + code_parts = eval_xpath(option, './@value')[0][5:].split('_') + if len(code_parts) == 2: + code = code_parts[0] + '-' + code_parts[1].upper() + else: + code = code_parts[0] supported_languages.append(code) return 
supported_languages diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py index ae54a4acd..9f6a4159b 100644 --- a/searx/engines/yahoo_news.py +++ b/searx/engines/yahoo_news.py @@ -13,9 +13,12 @@ import re from datetime import datetime, timedelta from lxml import html from searx.engines.xpath import extract_text, extract_url -from searx.engines.yahoo import parse_url, _fetch_supported_languages, supported_languages_url +from searx.engines.yahoo import ( + parse_url, _fetch_supported_languages, supported_languages_url, language_aliases +) from dateutil import parser from searx.url_utils import urlencode +from searx.utils import match_language # engine dependent config categories = ['news'] @@ -41,7 +44,7 @@ def request(query, params): if params['language'] == 'all': language = 'en' else: - language = params['language'].split('_')[0] + language = match_language(params['language'], supported_languages, language_aliases).split('-')[0] params['url'] = search_url.format(offset=offset, query=urlencode({'p': query}), diff --git a/searx/engines/youtube_api.py b/searx/engines/youtube_api.py index 6de18aa2c..bc4c0d58e 100644 --- a/searx/engines/youtube_api.py +++ b/searx/engines/youtube_api.py @@ -23,7 +23,7 @@ base_url = 'https://www.googleapis.com/youtube/v3/search' search_url = base_url + '?part=snippet&{query}&maxResults=20&key={api_key}' embedded_url = '<iframe width="540" height="304" ' +\ - 'data-src="//www.youtube-nocookie.com/embed/{videoid}" ' +\ + 'data-src="https://www.youtube-nocookie.com/embed/{videoid}" ' +\ 'frameborder="0" allowfullscreen></iframe>' base_youtube_url = 'https://www.youtube.com/watch?v=' diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py index 9f01841f6..49d0ae604 100644 --- a/searx/engines/youtube_noapi.py +++ b/searx/engines/youtube_noapi.py @@ -8,7 +8,8 @@ # @stable no # @parse url, title, content, publishedDate, thumbnail, embedded -from lxml import html +from functools import reduce +from json 
import loads from searx.engines.xpath import extract_text from searx.utils import list_get from searx.url_utils import quote_plus @@ -29,25 +30,11 @@ time_range_dict = {'day': 'Ag', 'year': 'BQ'} embedded_url = '<iframe width="540" height="304" ' +\ - 'data-src="//www.youtube-nocookie.com/embed/{videoid}" ' +\ + 'data-src="https://www.youtube-nocookie.com/embed/{videoid}" ' +\ 'frameborder="0" allowfullscreen></iframe>' base_youtube_url = 'https://www.youtube.com/watch?v=' -# specific xpath variables -results_xpath = "//ol/li/div[contains(@class, 'yt-lockup yt-lockup-tile yt-lockup-video vve-check')]" -url_xpath = './/h3/a/@href' -title_xpath = './/div[@class="yt-lockup-content"]/h3/a' -content_xpath = './/div[@class="yt-lockup-content"]/div[@class="yt-lockup-description yt-ui-ellipsis yt-ui-ellipsis-2"]' - - -# returns extract_text on the first result selected by the xpath or None -def extract_text_from_dom(result, xpath): - r = result.xpath(xpath) - if len(r) > 0: - return extract_text(r[0]) - return None - # do search-request def request(query, params): @@ -63,27 +50,41 @@ def request(query, params): def response(resp): results = [] - dom = html.fromstring(resp.text) - - # parse results - for result in dom.xpath(results_xpath): - videoid = list_get(result.xpath('@data-context-item-id'), 0) - if videoid is not None: - url = base_youtube_url + videoid - thumbnail = 'https://i.ytimg.com/vi/' + videoid + '/hqdefault.jpg' - - title = extract_text_from_dom(result, title_xpath) or videoid - content = extract_text_from_dom(result, content_xpath) - - embedded = embedded_url.format(videoid=videoid) - - # append result - results.append({'url': url, - 'title': title, - 'content': content, - 'template': 'videos.html', - 'embedded': embedded, - 'thumbnail': thumbnail}) + results_data = resp.text[resp.text.find('ytInitialData'):] + results_data = results_data[results_data.find('{'):results_data.find(';\n')] + + results_json = loads(results_data) if results_data else {} + 
sections = results_json.get('contents', {})\ + .get('twoColumnSearchResultsRenderer', {})\ + .get('primaryContents', {})\ + .get('sectionListRenderer', {})\ + .get('contents', []) + + for section in sections: + for video_container in section.get('itemSectionRenderer', {}).get('contents', []): + video = video_container.get('videoRenderer', {}) + videoid = video.get('videoId') + if videoid is not None: + url = base_youtube_url + videoid + thumbnail = 'https://i.ytimg.com/vi/' + videoid + '/hqdefault.jpg' + title = get_text_from_json(video.get('title', {})) + content = get_text_from_json(video.get('descriptionSnippet', {})) + embedded = embedded_url.format(videoid=videoid) + + # append result + results.append({'url': url, + 'title': title, + 'content': content, + 'template': 'videos.html', + 'embedded': embedded, + 'thumbnail': thumbnail}) # return results return results + + +def get_text_from_json(element): + if 'runs' in element: + return reduce(lambda a, b: a + b.get('text', ''), element.get('runs'), '') + else: + return element.get('simpleText', '') |