diff options
Diffstat (limited to 'searx/engines')
82 files changed, 8439 insertions, 0 deletions
diff --git a/searx/engines/1337x.py b/searx/engines/1337x.py new file mode 100644 index 000000000..0de04bd95 --- /dev/null +++ b/searx/engines/1337x.py @@ -0,0 +1,39 @@ +from lxml import html +from searx.engines.xpath import extract_text +from searx.utils import get_torrent_size +from searx.url_utils import quote, urljoin + +url = 'https://1337x.to/' +search_url = url + 'search/{search_term}/{pageno}/' +categories = ['videos'] +paging = True + + +def request(query, params): + params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno']) + + return params + + +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + for result in dom.xpath('//table[contains(@class, "table-list")]/tbody//tr'): + href = urljoin(url, result.xpath('./td[contains(@class, "name")]/a[2]/@href')[0]) + title = extract_text(result.xpath('./td[contains(@class, "name")]/a[2]')) + seed = extract_text(result.xpath('.//td[contains(@class, "seeds")]')) + leech = extract_text(result.xpath('.//td[contains(@class, "leeches")]')) + filesize_info = extract_text(result.xpath('.//td[contains(@class, "size")]/text()')) + filesize, filesize_multiplier = filesize_info.split() + filesize = get_torrent_size(filesize, filesize_multiplier) + + results.append({'url': href, + 'title': title, + 'seed': seed, + 'leech': leech, + 'filesize': filesize, + 'template': 'torrent.html'}) + + return results diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py new file mode 100644 index 000000000..2393f52b6 --- /dev/null +++ b/searx/engines/__init__.py @@ -0,0 +1,269 @@ + +''' +searx is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. 
+ +searx is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with searx. If not, see < http://www.gnu.org/licenses/ >. + +(C) 2013- by Adam Tauber, <asciimoo@gmail.com> +''' + +import sys +import threading +from os.path import realpath, dirname +from io import open +from babel.localedata import locale_identifiers +from flask_babel import gettext +from operator import itemgetter +from json import loads +from requests import get +from searx import settings +from searx import logger +from searx.utils import load_module, match_language, get_engine_from_settings + + +logger = logger.getChild('engines') + +engine_dir = dirname(realpath(__file__)) + +engines = {} + +categories = {'general': []} + +languages = loads(open(engine_dir + '/../data/engines_languages.json', 'r', encoding='utf-8').read()) +babel_langs = [lang_parts[0] + '-' + lang_parts[-1] if len(lang_parts) > 1 else lang_parts[0] + for lang_parts in (lang_code.split('_') for lang_code in locale_identifiers())] + +engine_shortcuts = {} +engine_default_args = {'paging': False, + 'categories': ['general'], + 'language_support': True, + 'supported_languages': [], + 'safesearch': False, + 'timeout': settings['outgoing']['request_timeout'], + 'shortcut': '-', + 'disabled': False, + 'suspend_end_time': 0, + 'continuous_errors': 0, + 'time_range_support': False, + 'offline': False} + + +def load_engine(engine_data): + engine_name = engine_data['name'] + if '_' in engine_name: + logger.error('Engine name contains underscore: "{}"'.format(engine_name)) + sys.exit(1) + + if engine_name.lower() != engine_name: + logger.warn('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name)) + engine_name = engine_name.lower() + engine_data['name'] = 
engine_name + + engine_module = engine_data['engine'] + + try: + engine = load_module(engine_module + '.py', engine_dir) + except: + logger.exception('Cannot load engine "{}"'.format(engine_module)) + return None + + for param_name in engine_data: + if param_name == 'engine': + continue + if param_name == 'categories': + if engine_data['categories'] == 'none': + engine.categories = [] + else: + engine.categories = list(map(str.strip, engine_data['categories'].split(','))) + continue + setattr(engine, param_name, engine_data[param_name]) + + for arg_name, arg_value in engine_default_args.items(): + if not hasattr(engine, arg_name): + setattr(engine, arg_name, arg_value) + + # checking required variables + for engine_attr in dir(engine): + if engine_attr.startswith('_'): + continue + if engine_attr == 'inactive' and getattr(engine, engine_attr) is True: + return None + if getattr(engine, engine_attr) is None: + logger.error('Missing engine config attribute: "{0}.{1}"' + .format(engine.name, engine_attr)) + sys.exit(1) + + # assign supported languages from json file + if engine_data['name'] in languages: + setattr(engine, 'supported_languages', languages[engine_data['name']]) + + # find custom aliases for non standard language codes + if hasattr(engine, 'supported_languages'): + if hasattr(engine, 'language_aliases'): + language_aliases = getattr(engine, 'language_aliases') + else: + language_aliases = {} + + for engine_lang in getattr(engine, 'supported_languages'): + iso_lang = match_language(engine_lang, babel_langs, fallback=None) + if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \ + iso_lang not in getattr(engine, 'supported_languages'): + language_aliases[iso_lang] = engine_lang + + setattr(engine, 'language_aliases', language_aliases) + + # assign language fetching method if auxiliary method exists + if hasattr(engine, '_fetch_supported_languages'): + setattr(engine, 'fetch_supported_languages', + lambda: 
engine._fetch_supported_languages(get(engine.supported_languages_url))) + + engine.stats = { + 'result_count': 0, + 'search_count': 0, + 'engine_time': 0, + 'engine_time_count': 0, + 'score_count': 0, + 'errors': 0 + } + + if not engine.offline: + engine.stats['page_load_time'] = 0 + engine.stats['page_load_count'] = 0 + + for category_name in engine.categories: + categories.setdefault(category_name, []).append(engine) + + if engine.shortcut in engine_shortcuts: + logger.error('Engine config error: ambigious shortcut: {0}'.format(engine.shortcut)) + sys.exit(1) + + engine_shortcuts[engine.shortcut] = engine.name + + return engine + + +def to_percentage(stats, maxvalue): + for engine_stat in stats: + if maxvalue: + engine_stat['percentage'] = int(engine_stat['avg'] / maxvalue * 100) + else: + engine_stat['percentage'] = 0 + return stats + + +def get_engines_stats(): + # TODO refactor + pageloads = [] + engine_times = [] + results = [] + scores = [] + errors = [] + scores_per_result = [] + + max_pageload = max_engine_times = max_results = max_score = max_errors = max_score_per_result = 0 # noqa + for engine in engines.values(): + if engine.stats['search_count'] == 0: + continue + results_num = \ + engine.stats['result_count'] / float(engine.stats['search_count']) + + if engine.stats['engine_time_count'] != 0: + this_engine_time = engine.stats['engine_time'] / float(engine.stats['engine_time_count']) # noqa + else: + this_engine_time = 0 + + if results_num: + score = engine.stats['score_count'] / float(engine.stats['search_count']) # noqa + score_per_result = score / results_num + else: + score = score_per_result = 0.0 + + if not engine.offline: + load_times = 0 + if engine.stats['page_load_count'] != 0: + load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count']) # noqa + max_pageload = max(load_times, max_pageload) + pageloads.append({'avg': load_times, 'name': engine.name}) + + max_engine_times = max(this_engine_time, max_engine_times) + 
max_results = max(results_num, max_results) + max_score = max(score, max_score) + max_score_per_result = max(score_per_result, max_score_per_result) + max_errors = max(max_errors, engine.stats['errors']) + + engine_times.append({'avg': this_engine_time, 'name': engine.name}) + results.append({'avg': results_num, 'name': engine.name}) + scores.append({'avg': score, 'name': engine.name}) + errors.append({'avg': engine.stats['errors'], 'name': engine.name}) + scores_per_result.append({ + 'avg': score_per_result, + 'name': engine.name + }) + + pageloads = to_percentage(pageloads, max_pageload) + engine_times = to_percentage(engine_times, max_engine_times) + results = to_percentage(results, max_results) + scores = to_percentage(scores, max_score) + scores_per_result = to_percentage(scores_per_result, max_score_per_result) + erros = to_percentage(errors, max_errors) + + return [ + ( + gettext('Engine time (sec)'), + sorted(engine_times, key=itemgetter('avg')) + ), + ( + gettext('Page loads (sec)'), + sorted(pageloads, key=itemgetter('avg')) + ), + ( + gettext('Number of results'), + sorted(results, key=itemgetter('avg'), reverse=True) + ), + ( + gettext('Scores'), + sorted(scores, key=itemgetter('avg'), reverse=True) + ), + ( + gettext('Scores per result'), + sorted(scores_per_result, key=itemgetter('avg'), reverse=True) + ), + ( + gettext('Errors'), + sorted(errors, key=itemgetter('avg'), reverse=True) + ), + ] + + +def load_engines(engine_list): + global engines + engines.clear() + for engine_data in engine_list: + engine = load_engine(engine_data) + if engine is not None: + engines[engine.name] = engine + return engines + + +def initialize_engines(engine_list): + load_engines(engine_list) + + def engine_init(engine_name, init_fn): + init_fn(get_engine_from_settings(engine_name)) + logger.debug('%s engine: Initialized', engine_name) + + for engine_name, engine in engines.items(): + if hasattr(engine, 'init'): + init_fn = getattr(engine, 'init') + if init_fn: + 
logger.debug('%s engine: Starting background initialization', engine_name) + threading.Thread(target=engine_init, args=(engine_name, init_fn)).start() diff --git a/searx/engines/acgsou.py b/searx/engines/acgsou.py new file mode 100644 index 000000000..cca28f0db --- /dev/null +++ b/searx/engines/acgsou.py @@ -0,0 +1,75 @@ +""" + Acgsou (Japanese Animation/Music/Comics Bittorrent tracker) + + @website https://www.acgsou.com/ + @provide-api no + @using-api no + @results HTML + @stable no (HTML can change) + @parse url, title, content, seed, leech, torrentfile +""" + +from lxml import html +from searx.engines.xpath import extract_text +from searx.url_utils import urlencode +from searx.utils import get_torrent_size, int_or_zero + +# engine dependent config +categories = ['files', 'images', 'videos', 'music'] +paging = True + +# search-url +base_url = 'http://www.acgsou.com/' +search_url = base_url + 'search.php?{query}&page={offset}' +# xpath queries +xpath_results = '//table[contains(@class, "list_style table_fixed")]//tr[not(th)]' +xpath_category = './/td[2]/a[1]' +xpath_title = './/td[3]/a[last()]' +xpath_torrent_links = './/td[3]/a' +xpath_filesize = './/td[4]/text()' + + +def request(query, params): + query = urlencode({'keyword': query}) + params['url'] = search_url.format(query=query, offset=params['pageno']) + return params + + +def response(resp): + results = [] + dom = html.fromstring(resp.text) + for result in dom.xpath(xpath_results): + # defaults + filesize = 0 + magnet_link = "magnet:?xt=urn:btih:{}&tr=http://tracker.acgsou.com:2710/announce" + torrent_link = "" + + try: + category = extract_text(result.xpath(xpath_category)[0]) + except: + pass + + page_a = result.xpath(xpath_title)[0] + title = extract_text(page_a) + href = base_url + page_a.attrib.get('href') + + magnet_link = magnet_link.format(page_a.attrib.get('href')[5:-5]) + + try: + filesize_info = result.xpath(xpath_filesize)[0] + filesize = filesize_info[:-2] + filesize_multiplier = 
filesize_info[-2:] + filesize = get_torrent_size(filesize, filesize_multiplier) + except: + pass + # I didn't add download/seed/leech count since as I figured out they are generated randomly everytime + content = u'Category: "{category}".' + content = content.format(category=category) + + results.append({'url': href, + 'title': title, + 'content': content, + 'filesize': filesize, + 'magnetlink': magnet_link, + 'template': 'torrent.html'}) + return results diff --git a/searx/engines/apkmirror.py b/searx/engines/apkmirror.py new file mode 100644 index 000000000..f2ee12b29 --- /dev/null +++ b/searx/engines/apkmirror.py @@ -0,0 +1,61 @@ +""" + APK Mirror + + @website https://www.apkmirror.com + + @using-api no + @results HTML + @stable no (HTML can change) + @parse url, title, thumbnail_src +""" + +from lxml import html +from searx.engines.xpath import extract_text +from searx.url_utils import urlencode + +# engine dependent config +categories = ['it'] +paging = True + +# I am not 100% certain about this, as apkmirror appears to be a wordpress site, +# which might support time_range searching. If you want to implement it, go ahead. 
+time_range_support = False + +# search-url +base_url = 'https://www.apkmirror.com' +search_url = base_url + '/?post_type=app_release&searchtype=apk&page={pageno}&{query}' + + +# do search-request +def request(query, params): + + params['url'] = search_url.format(pageno=params['pageno'], + query=urlencode({'s': query})) + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + # parse results + for result in dom.xpath('.//div[@id="content"]/div[@class="listWidget"]/div[@class="appRow"]'): + + link = result.xpath('.//h5/a')[0] + url = base_url + link.attrib.get('href') + '#downloads' + title = extract_text(link) + thumbnail_src = base_url + result.xpath('.//img')[0].attrib.get('src').replace('&w=32&h=32', '&w=64&h=64') + + res = { + 'url': url, + 'title': title, + 'thumbnail_src': thumbnail_src + } + + # append result + results.append(res) + + # return results + return results diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py new file mode 100644 index 000000000..dce862f55 --- /dev/null +++ b/searx/engines/archlinux.py @@ -0,0 +1,142 @@ +# -*- coding: utf-8 -*- + +""" + Arch Linux Wiki + + @website https://wiki.archlinux.org + @provide-api no (Mediawiki provides API, but Arch Wiki blocks access to it + @using-api no + @results HTML + @stable no (HTML can change) + @parse url, title +""" + +from lxml import html +from searx.engines.xpath import extract_text +from searx.url_utils import urlencode, urljoin + +# engine dependent config +categories = ['it'] +language_support = True +paging = True +base_url = 'https://wiki.archlinux.org' + +# xpath queries +xpath_results = '//ul[@class="mw-search-results"]/li' +xpath_link = './/div[@class="mw-search-result-heading"]/a' + + +# cut 'en' from 'en-US', 'de' from 'de-CH', and so on +def locale_to_lang_code(locale): + if locale.find('-') >= 0: + locale = locale.split('-')[0] + return locale + + +# wikis for some languages were moved 
off from the main site, we need to make +# requests to correct URLs to be able to get results in those languages +lang_urls = { + 'all': { + 'base': 'https://wiki.archlinux.org', + 'search': '/index.php?title=Special:Search&offset={offset}&{query}' + }, + 'de': { + 'base': 'https://wiki.archlinux.de', + 'search': '/index.php?title=Spezial:Suche&offset={offset}&{query}' + }, + 'fr': { + 'base': 'https://wiki.archlinux.fr', + 'search': '/index.php?title=Spécial:Recherche&offset={offset}&{query}' + }, + 'ja': { + 'base': 'https://wiki.archlinuxjp.org', + 'search': '/index.php?title=特別:検索&offset={offset}&{query}' + }, + 'ro': { + 'base': 'http://wiki.archlinux.ro', + 'search': '/index.php?title=Special:Căutare&offset={offset}&{query}' + }, + 'tr': { + 'base': 'http://archtr.org/wiki', + 'search': '/index.php?title=Özel:Ara&offset={offset}&{query}' + } +} + + +# get base & search URLs for selected language +def get_lang_urls(language): + if language in lang_urls: + return lang_urls[language] + return lang_urls['all'] + + +# Language names to build search requests for +# those languages which are hosted on the main site. +main_langs = { + 'ar': 'العربية', + 'bg': 'Български', + 'cs': 'Česky', + 'da': 'Dansk', + 'el': 'Ελληνικά', + 'es': 'Español', + 'he': 'עברית', + 'hr': 'Hrvatski', + 'hu': 'Magyar', + 'it': 'Italiano', + 'ko': '한국어', + 'lt': 'Lietuviškai', + 'nl': 'Nederlands', + 'pl': 'Polski', + 'pt': 'Português', + 'ru': 'Русский', + 'sl': 'Slovenský', + 'th': 'ไทย', + 'uk': 'Українська', + 'zh': '简体中文' +} +supported_languages = dict(lang_urls, **main_langs) + + +# do search-request +def request(query, params): + # translate the locale (e.g. 
'en-US') to language code ('en') + language = locale_to_lang_code(params['language']) + + # if our language is hosted on the main site, we need to add its name + # to the query in order to narrow the results to that language + if language in main_langs: + query += b' (' + main_langs[language] + b')' + + # prepare the request parameters + query = urlencode({'search': query}) + offset = (params['pageno'] - 1) * 20 + + # get request URLs for our language of choice + urls = get_lang_urls(language) + search_url = urls['base'] + urls['search'] + + params['url'] = search_url.format(query=query, offset=offset) + + return params + + +# get response from search-request +def response(resp): + # get the base URL for the language in which request was made + language = locale_to_lang_code(resp.search_params['language']) + base_url = get_lang_urls(language)['base'] + + results = [] + + dom = html.fromstring(resp.text) + + # parse results + for result in dom.xpath(xpath_results): + link = result.xpath(xpath_link)[0] + href = urljoin(base_url, link.attrib.get('href')) + title = extract_text(link) + + results.append({'url': href, + 'title': title}) + + return results diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py new file mode 100644 index 000000000..e3c871d17 --- /dev/null +++ b/searx/engines/arxiv.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python + +""" + ArXiV (Scientific preprints) + @website https://arxiv.org + @provide-api yes (export.arxiv.org/api/query) + @using-api yes + @results XML-RSS + @stable yes + @parse url, title, publishedDate, content + More info on api: https://arxiv.org/help/api/user-manual +""" + +from lxml import html +from datetime import datetime +from searx.url_utils import urlencode + + +categories = ['science'] +paging = True + +base_url = 'http://export.arxiv.org/api/query?search_query=all:'\ + + '{query}&start={offset}&max_results={number_of_results}' + +# engine dependent config +number_of_results = 10 + + +def request(query, params): + # basic 
search + offset = (params['pageno'] - 1) * number_of_results + + string_args = dict(query=query.decode('utf-8'), + offset=offset, + number_of_results=number_of_results) + + params['url'] = base_url.format(**string_args) + + return params + + +def response(resp): + results = [] + + dom = html.fromstring(resp.content) + search_results = dom.xpath('//entry') + + for entry in search_results: + title = entry.xpath('.//title')[0].text + + url = entry.xpath('.//id')[0].text + + content_string = '{doi_content}{abstract_content}' + + abstract = entry.xpath('.//summary')[0].text + + # If a doi is available, add it to the snipppet + try: + doi_content = entry.xpath('.//link[@title="doi"]')[0].text + content = content_string.format(doi_content=doi_content, abstract_content=abstract) + except: + content = content_string.format(doi_content="", abstract_content=abstract) + + if len(content) > 300: + content = content[0:300] + "..." + # TODO: center snippet on query term + + publishedDate = datetime.strptime(entry.xpath('.//published')[0].text, '%Y-%m-%dT%H:%M:%SZ') + + res_dict = {'url': url, + 'title': title, + 'publishedDate': publishedDate, + 'content': content} + + results.append(res_dict) + + return results diff --git a/searx/engines/base.py b/searx/engines/base.py new file mode 100755 index 000000000..f1b1cf671 --- /dev/null +++ b/searx/engines/base.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python + +""" + BASE (Scholar publications) + + @website https://base-search.net + @provide-api yes with authorization (https://api.base-search.net/) + + @using-api yes + @results XML + @stable ? 
+ @parse url, title, publishedDate, content + More info on api: http://base-search.net/about/download/base_interface.pdf +""" + +from lxml import etree +from datetime import datetime +import re +from searx.url_utils import urlencode +from searx.utils import searx_useragent + + +categories = ['science'] + +base_url = 'https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi'\ + + '?func=PerformSearch&{query}&boost=oa&hits={hits}&offset={offset}' + +# engine dependent config +paging = True +number_of_results = 10 + +# shortcuts for advanced search +shorcut_dict = { + # user-friendly keywords + 'format:': 'dcformat:', + 'author:': 'dccreator:', + 'collection:': 'dccollection:', + 'hdate:': 'dchdate:', + 'contributor:': 'dccontributor:', + 'coverage:': 'dccoverage:', + 'date:': 'dcdate:', + 'abstract:': 'dcdescription:', + 'urls:': 'dcidentifier:', + 'language:': 'dclanguage:', + 'publisher:': 'dcpublisher:', + 'relation:': 'dcrelation:', + 'rights:': 'dcrights:', + 'source:': 'dcsource:', + 'subject:': 'dcsubject:', + 'title:': 'dctitle:', + 'type:': 'dcdctype:' +} + + +def request(query, params): + # replace shortcuts with API advanced search keywords + for key in shorcut_dict.keys(): + query = re.sub(key, shorcut_dict[key], str(query)) + + # basic search + offset = (params['pageno'] - 1) * number_of_results + + string_args = dict(query=urlencode({'query': query}), + offset=offset, + hits=number_of_results) + + params['url'] = base_url.format(**string_args) + + params['headers']['User-Agent'] = searx_useragent() + return params + + +def response(resp): + results = [] + + search_results = etree.XML(resp.content) + + for entry in search_results.xpath('./result/doc'): + content = "No description available" + + date = datetime.now() # needed in case no dcdate is available for an item + for item in entry: + if item.attrib["name"] == "dchdate": + harvestDate = item.text + + elif item.attrib["name"] == "dcdate": + date = item.text + + elif item.attrib["name"] == 
"dctitle": + title = item.text + + elif item.attrib["name"] == "dclink": + url = item.text + + elif item.attrib["name"] == "dcdescription": + content = item.text[:300] + if len(item.text) > 300: + content += "..." + +# dates returned by the BASE API are not several formats + publishedDate = None + for date_format in ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d', '%Y-%m', '%Y']: + try: + publishedDate = datetime.strptime(date, date_format) + break + except: + pass + + if publishedDate is not None: + res_dict = {'url': url, + 'title': title, + 'publishedDate': publishedDate, + 'content': content} + else: + res_dict = {'url': url, + 'title': title, + 'content': content} + + results.append(res_dict) + + return results diff --git a/searx/engines/bing.py b/searx/engines/bing.py new file mode 100644 index 000000000..ed0b87dbd --- /dev/null +++ b/searx/engines/bing.py @@ -0,0 +1,123 @@ +""" + Bing (Web) + + @website https://www.bing.com + @provide-api yes (http://datamarket.azure.com/dataset/bing/search), + max. 
5000 query/month + + @using-api no (because of query limit) + @results HTML (using search portal) + @stable no (HTML can change) + @parse url, title, content + + @todo publishedDate +""" + +import re +from lxml import html +from searx import logger, utils +from searx.engines.xpath import extract_text +from searx.url_utils import urlencode +from searx.utils import match_language, gen_useragent, eval_xpath + +logger = logger.getChild('bing engine') + +# engine dependent config +categories = ['general'] +paging = True +language_support = True +supported_languages_url = 'https://www.bing.com/account/general' +language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'} + +# search-url +base_url = 'https://www.bing.com/' +search_string = 'search?{query}&first={offset}' + + +def _get_offset_from_pageno(pageno): + return (pageno - 1) * 10 + 1 + + +# do search-request +def request(query, params): + offset = _get_offset_from_pageno(params.get('pageno', 0)) + + if params['language'] == 'all': + lang = 'EN' + else: + lang = match_language(params['language'], supported_languages, language_aliases) + + query = u'language:{} {}'.format(lang.split('-')[0].upper(), query.decode('utf-8')).encode('utf-8') + + search_path = search_string.format( + query=urlencode({'q': query}), + offset=offset) + + params['url'] = base_url + search_path + + return params + + +# get response from search-request +def response(resp): + results = [] + result_len = 0 + + dom = html.fromstring(resp.text) + # parse results + for result in eval_xpath(dom, '//div[@class="sa_cc"]'): + link = eval_xpath(result, './/h3/a')[0] + url = link.attrib.get('href') + title = extract_text(link) + content = extract_text(eval_xpath(result, './/p')) + + # append result + results.append({'url': url, + 'title': title, + 'content': content}) + + # parse results again if nothing is found yet + for result in eval_xpath(dom, '//li[@class="b_algo"]'): + link = eval_xpath(result, './/h2/a')[0] + url = 
link.attrib.get('href') + title = extract_text(link) + content = extract_text(eval_xpath(result, './/p')) + + # append result + results.append({'url': url, + 'title': title, + 'content': content}) + + try: + result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]/text()')) + result_len_container = utils.to_string(result_len_container) + if "-" in result_len_container: + # Remove the part "from-to" for paginated request ... + result_len_container = result_len_container[result_len_container.find("-") * 2 + 2:] + + result_len_container = re.sub('[^0-9]', '', result_len_container) + if len(result_len_container) > 0: + result_len = int(result_len_container) + except Exception as e: + logger.debug('result error :\n%s', e) + pass + + if _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len: + return [] + + results.append({'number_of_results': result_len}) + return results + + +# get supported languages from their site +def _fetch_supported_languages(resp): + supported_languages = [] + dom = html.fromstring(resp.text) + options = eval_xpath(dom, '//div[@id="limit-languages"]//input') + for option in options: + code = eval_xpath(option, './@id')[0].replace('_', '-') + if code == 'nb': + code = 'no' + supported_languages.append(code) + + return supported_languages diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py new file mode 100644 index 000000000..44e2c3bbc --- /dev/null +++ b/searx/engines/bing_images.py @@ -0,0 +1,124 @@ +""" + Bing (Images) + + @website https://www.bing.com/images + @provide-api yes (http://datamarket.azure.com/dataset/bing/search), + max. 
5000 query/month + + @using-api no (because of query limit) + @results HTML (using search portal) + @stable no (HTML can change) + @parse url, title, img_src + +""" + +from lxml import html +from json import loads +import re +from searx.url_utils import urlencode +from searx.utils import match_language + +# engine dependent config +categories = ['images'] +paging = True +safesearch = True +time_range_support = True +language_support = True +supported_languages_url = 'https://www.bing.com/account/general' +number_of_results = 28 + +# search-url +base_url = 'https://www.bing.com/' +search_string = 'images/search'\ + '?{query}'\ + '&count={count}'\ + '&first={first}'\ + '&FORM=IBASEP' +time_range_string = '&qft=+filterui:age-lt{interval}' +time_range_dict = {'day': '1440', + 'week': '10080', + 'month': '43200', + 'year': '525600'} + +# safesearch definitions +safesearch_types = {2: 'STRICT', + 1: 'DEMOTE', + 0: 'OFF'} + + +# do search-request +def request(query, params): + offset = ((params['pageno'] - 1) * number_of_results) + 1 + + search_path = search_string.format( + query=urlencode({'q': query}), + count=number_of_results, + first=offset) + + language = match_language(params['language'], supported_languages, language_aliases).lower() + + params['cookies']['SRCHHPGUSR'] = \ + 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') + + params['cookies']['_EDGE_S'] = 'mkt=' + language +\ + '&ui=' + language + '&F=1' + + params['url'] = base_url + search_path + if params['time_range'] in time_range_dict: + params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']]) + + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + # parse results + for result in dom.xpath('//div[@class="imgpt"]'): + + img_format = result.xpath('./div[contains(@class, "img_info")]/span/text()')[0] + # Microsoft seems to experiment with this code so don't make the path too specific, 
+ # just catch the text section for the first anchor in img_info assuming this to be + # the originating site. + source = result.xpath('./div[contains(@class, "img_info")]//a/text()')[0] + + try: + m = loads(result.xpath('./a/@m')[0]) + + # strip 'Unicode private use area' highlighting, they render to Tux + # the Linux penguin and a standing diamond on my machine... + title = m.get('t', '').replace(u'\ue000', '').replace(u'\ue001', '') + results.append({'template': 'images.html', + 'url': m['purl'], + 'thumbnail_src': m['turl'], + 'img_src': m['murl'], + 'content': '', + 'title': title, + 'source': source, + 'img_format': img_format}) + except: + continue + + return results + + +# get supported languages from their site +def _fetch_supported_languages(resp): + supported_languages = [] + dom = html.fromstring(resp.text) + + regions_xpath = '//div[@id="region-section-content"]' \ + + '//ul[@class="b_vList"]/li/a/@href' + + regions = dom.xpath(regions_xpath) + for region in regions: + code = re.search('setmkt=[^\&]+', region).group()[7:] + if code == 'nb-NO': + code = 'no-NO' + + supported_languages.append(code) + + return supported_languages diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py new file mode 100644 index 000000000..669130c42 --- /dev/null +++ b/searx/engines/bing_news.py @@ -0,0 +1,127 @@ +""" + Bing (News) + + @website https://www.bing.com/news + @provide-api yes (http://datamarket.azure.com/dataset/bing/search), + max. 
5000 query/month + + @using-api no (because of query limit) + @results RSS (using search portal) + @stable yes (except perhaps for the images) + @parse url, title, content, publishedDate, thumbnail +""" + +from datetime import datetime +from dateutil import parser +from lxml import etree +from searx.utils import list_get, match_language +from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases +from searx.url_utils import urlencode, urlparse, parse_qsl + +# engine dependent config +categories = ['news'] +paging = True +language_support = True +time_range_support = True + +# search-url +base_url = 'https://www.bing.com/' +search_string = 'news/search?{query}&first={offset}&format=RSS' +search_string_with_time = 'news/search?{query}&first={offset}&qft=interval%3d"{interval}"&format=RSS' +time_range_dict = {'day': '7', + 'week': '8', + 'month': '9'} + + +# remove click +def url_cleanup(url_string): + parsed_url = urlparse(url_string) + if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx': + query = dict(parse_qsl(parsed_url.query)) + return query.get('url', None) + return url_string + + +# replace the http://*bing4.com/th?id=... by https://www.bing.com/th?id=... 
+def image_url_cleanup(url_string): + parsed_url = urlparse(url_string) + if parsed_url.netloc.endswith('bing4.com') and parsed_url.path == '/th': + query = dict(parse_qsl(parsed_url.query)) + return "https://www.bing.com/th?id=" + query.get('id') + return url_string + + +def _get_url(query, language, offset, time_range): + if time_range in time_range_dict: + search_path = search_string_with_time.format( + query=urlencode({'q': query, 'setmkt': language}), + offset=offset, + interval=time_range_dict[time_range]) + else: + search_path = search_string.format( + query=urlencode({'q': query, 'setmkt': language}), + offset=offset) + return base_url + search_path + + +# do search-request +def request(query, params): + if params['time_range'] and params['time_range'] not in time_range_dict: + return params + + offset = (params['pageno'] - 1) * 10 + 1 + + if params['language'] == 'all': + language = 'en-US' + else: + language = match_language(params['language'], supported_languages, language_aliases) + + params['url'] = _get_url(query, language, offset, params['time_range']) + + return params + + +# get response from search-request +def response(resp): + results = [] + + rss = etree.fromstring(resp.content) + + ns = rss.nsmap + + # parse results + for item in rss.xpath('./channel/item'): + # url / title / content + url = url_cleanup(item.xpath('./link/text()')[0]) + title = list_get(item.xpath('./title/text()'), 0, url) + content = list_get(item.xpath('./description/text()'), 0, '') + + # publishedDate + publishedDate = list_get(item.xpath('./pubDate/text()'), 0) + try: + publishedDate = parser.parse(publishedDate, dayfirst=False) + except TypeError: + publishedDate = datetime.now() + except ValueError: + publishedDate = datetime.now() + + # thumbnail + thumbnail = list_get(item.xpath('./News:Image/text()', namespaces=ns), 0) + if thumbnail is not None: + thumbnail = image_url_cleanup(thumbnail) + + # append result + if thumbnail is not None: + results.append({'url': url, 
+ 'title': title, + 'publishedDate': publishedDate, + 'content': content, + 'img_src': thumbnail}) + else: + results.append({'url': url, + 'title': title, + 'publishedDate': publishedDate, + 'content': content}) + + # return results + return results diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py new file mode 100644 index 000000000..f1e636819 --- /dev/null +++ b/searx/engines/bing_videos.py @@ -0,0 +1,94 @@ +""" + Bing (Videos) + + @website https://www.bing.com/videos + @provide-api yes (http://datamarket.azure.com/dataset/bing/search) + + @using-api no + @results HTML + @stable no + @parse url, title, content, thumbnail +""" + +from json import loads +from lxml import html +from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url +from searx.url_utils import urlencode +from searx.utils import match_language + + +categories = ['videos'] +paging = True +safesearch = True +time_range_support = True +number_of_results = 28 +language_support = True + +base_url = 'https://www.bing.com/' +search_string = 'videos/search'\ + '?{query}'\ + '&count={count}'\ + '&first={first}'\ + '&scope=video'\ + '&FORM=QBLH' +time_range_string = '&qft=+filterui:videoage-lt{interval}' +time_range_dict = {'day': '1440', + 'week': '10080', + 'month': '43200', + 'year': '525600'} + +# safesearch definitions +safesearch_types = {2: 'STRICT', + 1: 'DEMOTE', + 0: 'OFF'} + + +# do search-request +def request(query, params): + offset = ((params['pageno'] - 1) * number_of_results) + 1 + + search_path = search_string.format( + query=urlencode({'q': query}), + count=number_of_results, + first=offset) + + # safesearch cookie + params['cookies']['SRCHHPGUSR'] = \ + 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') + + # language cookie + language = match_language(params['language'], supported_languages, language_aliases).lower() + params['cookies']['_EDGE_S'] = 'mkt=' + language + '&F=1' + + # query and paging + params['url'] = 
# get response from search-request
def response(resp):
    """Parse a Bing Videos HTML results page into searx video results.

    Each result's metadata is embedded as a JSON blob in the @vrhm
    attribute; malformed entries are skipped instead of aborting the
    whole page.
    """
    results = []

    dom = html.fromstring(resp.text)

    for result in dom.xpath('//div[@class="dg_u"]'):
        try:
            metadata = loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0])
            info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip()
            content = '{0} - {1}'.format(metadata['du'], info)
            thumbnail = '{0}th?id={1}'.format(base_url, metadata['thid'])
            results.append({'url': metadata['murl'],
                            'thumbnail': thumbnail,
                            'title': metadata.get('vt', ''),
                            'content': content,
                            'template': 'videos.html'})
        # was a bare except: — only skip the errors a malformed result can
        # actually produce (missing node, missing JSON key, invalid JSON)
        except (IndexError, KeyError, ValueError):
            continue

    return results
array if nothing is found + if not search_res: + return [] + + # parse results + for result in search_res: + link = result.xpath('.//div[@class="torrent_name"]//a')[0] + href = urljoin(url, link.attrib.get('href')) + title = extract_text(link) + + excerpt = result.xpath('.//div[@class="torrent_excerpt"]')[0] + content = html.tostring(excerpt, encoding='unicode', method='text', with_tail=False) + # it is better to emit <br/> instead of |, but html tags are verboten + content = content.strip().replace('\n', ' | ') + content = ' '.join(content.split()) + + filesize = result.xpath('.//span[@class="torrent_size"]/text()')[0].split()[0] + filesize_multiplier = result.xpath('.//span[@class="torrent_size"]/text()')[0].split()[1] + files = (result.xpath('.//span[@class="torrent_files"]/text()') or ['1'])[0] + + # convert filesize to byte if possible + filesize = get_torrent_size(filesize, filesize_multiplier) + + # convert files to int if possible + try: + files = int(files) + except: + files = None + + magnetlink = result.xpath('.//div[@class="torrent_magnet"]//a')[0].attrib['href'] + + # append result + results.append({'url': href, + 'title': title, + 'content': content, + 'filesize': filesize, + 'files': files, + 'magnetlink': magnetlink, + 'template': 'torrent.html'}) + + # return results sorted by seeder + return results diff --git a/searx/engines/currency_convert.py b/searx/engines/currency_convert.py new file mode 100644 index 000000000..8eab8f673 --- /dev/null +++ b/searx/engines/currency_convert.py @@ -0,0 +1,99 @@ +import json +import re +import os +import sys +import unicodedata + +from io import open +from datetime import datetime + +if sys.version_info[0] == 3: + unicode = str + +categories = [] +url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}' +weight = 100 + +parser_re = re.compile(b'.*?(\\d+(?:\\.\\d+)?) 
def response(resp):
    """Parse the DuckDuckGo currency-spice reply into one 'answer' result.

    The reply is a JS callback wrapper; the slice below strips the first
    line and the trailing characters to recover the embedded JSON.
    """
    # NOTE(review): the '- 2' assumes exactly two junk characters precede the
    # final newline of the callback wrapper — verify against a live response
    json_resp = resp.text[resp.text.find('\n') + 1:resp.text.rfind('\n') - 2]
    results = []
    try:
        conversion_rate = float(json.loads(json_resp)['conversion']['converted-amount'])
    # was a bare except: — catch only invalid JSON, unexpected structure,
    # or a non-numeric amount, and return no answer instead of crashing
    except (ValueError, KeyError, TypeError):
        return results

    answer = '{0} {1} = {2} {3}, 1 {1} ({5}) = {4} {3} ({6})'.format(
        resp.search_params['amount'],
        resp.search_params['from'],
        resp.search_params['amount'] * conversion_rate,
        resp.search_params['to'],
        conversion_rate,
        resp.search_params['from_name'],
        resp.search_params['to_name'],
    )

    url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}'.format(
        resp.search_params['from'].upper(), resp.search_params['to'])

    results.append({'answer': answer, 'url': url})

    return results
# do search-request
def request(query, params):
    """Fill in the Dailymotion API URL for *query*.

    The special language value 'all' falls back to en-US; anything else
    is resolved against the engine's supported language list.
    """
    if params['language'] == 'all':
        region = 'en-US'
    else:
        region = match_language(params['language'], supported_languages)

    params['url'] = search_url.format(
        pageno=params['pageno'],
        query=urlencode({'search': query, 'localization': region}))

    return params
+ content = html_to_text(res['description']) + thumbnail = res['thumbnail_360_url'] + publishedDate = datetime.fromtimestamp(res['created_time'], None) + embedded = embedded_url.format(videoid=res['id']) + + # http to https + thumbnail = thumbnail.replace("http://", "https://") + + results.append({'template': 'videos.html', + 'url': url, + 'title': title, + 'content': content, + 'publishedDate': publishedDate, + 'embedded': embedded, + 'thumbnail': thumbnail}) + + # return results + return results + + +# get supported languages from their site +def _fetch_supported_languages(resp): + supported_languages = {} + + response_json = loads(resp.text) + + for language in response_json['list']: + supported_languages[language['code']] = {} + + name = language['native_name'] + if name: + supported_languages[language['code']]['name'] = name + english_name = language['name'] + if english_name: + supported_languages[language['code']]['english_name'] = english_name + + return supported_languages diff --git a/searx/engines/deezer.py b/searx/engines/deezer.py new file mode 100644 index 000000000..af63478fb --- /dev/null +++ b/searx/engines/deezer.py @@ -0,0 +1,67 @@ +""" + Deezer (Music) + + @website https://deezer.com + @provide-api yes (http://developers.deezer.com/api/) + + @using-api yes + @results JSON + @stable yes + @parse url, title, content, embedded +""" + +from json import loads +from searx.url_utils import urlencode + +# engine dependent config +categories = ['music'] +paging = True + +# search-url +url = 'https://api.deezer.com/' +search_url = url + 'search?{query}&index={offset}' + +embedded_url = '<iframe scrolling="no" frameborder="0" allowTransparency="true" ' +\ + 'data-src="https://www.deezer.com/plugins/player?type=tracks&id={audioid}" ' +\ + 'width="540" height="80"></iframe>' + + +# do search-request +def request(query, params): + offset = (params['pageno'] - 1) * 25 + + params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset) + + 
# get response from search-request
def response(resp):
    """Turn the Deezer JSON search response into searx results.

    Only entries of type 'track' are kept; plain-http track links are
    upgraded to https.
    """
    results = []
    data = loads(resp.text)

    for item in data.get('data', []):
        if item['type'] != 'track':
            continue

        link = item['link']
        if link.startswith('http://'):
            link = 'https' + link[4:]

        description = u'{} - {} - {}'.format(item['artist']['name'],
                                             item['album']['title'],
                                             item['title'])

        # append result
        results.append({'url': link,
                        'title': item['title'],
                        'embedded': embedded_url.format(audioid=item['id']),
                        'content': description})

    return results
response(resp): + results = [] + + # return empty array if a redirection code is returned + if resp.status_code == 302: + return [] + + dom = html.fromstring(resp.text) + + # parse results + for row in dom.xpath('//div[contains(@data-hook, "content_row")]'): + for result in row.xpath('./div'): + link = result.xpath('.//a[@data-hook="deviation_link"]')[0] + url = link.attrib.get('href') + title = link.attrib.get('title') + thumbnail_src = result.xpath('.//img')[0].attrib.get('src') + img_src = thumbnail_src + + # http to https, remove domain sharding + thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src) + thumbnail_src = re.sub(r"http://", "https://", thumbnail_src) + + url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url) + + # append result + results.append({'url': url, + 'title': title, + 'img_src': img_src, + 'thumbnail_src': thumbnail_src, + 'template': 'images.html'}) + + # return results + return results diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py new file mode 100644 index 000000000..423af0971 --- /dev/null +++ b/searx/engines/dictzone.py @@ -0,0 +1,68 @@ +""" + Dictzone + + @website https://dictzone.com/ + @provide-api no + @using-api no + @results HTML (using search portal) + @stable no (HTML can change) + @parse url, title, content +""" + +import re +from lxml import html +from searx.utils import is_valid_lang, eval_xpath +from searx.url_utils import urljoin + +categories = ['general'] +url = u'https://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}' +weight = 100 + +parser_re = re.compile(b'.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I) +results_xpath = './/table[@id="r"]/tr' + + +def request(query, params): + m = parser_re.match(query) + if not m: + return params + + from_lang, to_lang, query = m.groups() + + from_lang = is_valid_lang(from_lang) + to_lang = is_valid_lang(to_lang) + + if not from_lang or not to_lang: + return params + + params['url'] = 
def response(resp):
    """Extract translation rows from the Dictzone result table.

    Each table row yields one result whose content is the
    semicolon-joined list of translations found in that row.
    """
    results = []

    dom = html.fromstring(resp.text)

    # [1:] skips the table header row
    for k, result in enumerate(eval_xpath(dom, results_xpath)[1:]):
        try:
            from_result, to_results_raw = eval_xpath(result, './td')
        # was a bare except: — only a row without exactly two cells can
        # make the unpacking fail
        except ValueError:
            continue

        to_results = []
        for to_result in eval_xpath(to_results_raw, './p/a'):
            t = to_result.text_content()
            if t.strip():
                to_results.append(t)

        results.append({
            'url': urljoin(resp.url, '?%d' % k),
            'title': from_result.text_content(),
            'content': '; '.join(to_results)
        })

    return results
# do search-request
def request(query, params):
    """Build the Digg search URL and set the random auth cookie Digg expects."""
    start = (params['pageno'] - 1) * 20

    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      position=start)

    # 22-character random id drawn from Digg's frontend cookie alphabet
    auid = ''.join(random.choice(digg_cookie_chars) for _ in range(22))
    params['cookies']['frontend.auid'] = auid

    return params
# do search-request
def request(query, params):
    """Point the request at the DokuWiki OpenSearch endpoint for *query*."""
    target = search_url.format(query=urlencode({'id': query}))
    params['url'] = '{}{}'.format(base_url, target)
    return params
"dt": + res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1] + title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title')) + elif r.tag == "dd": + content = extract_text(eval_xpath(r, '.')) + + # append result + results.append({'title': title, + 'content': content, + 'url': base_url + res_url}) + except: + continue + + if not res_url: + continue + + # return results + return results diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py new file mode 100644 index 000000000..0d2c0af2d --- /dev/null +++ b/searx/engines/duckduckgo.py @@ -0,0 +1,143 @@ +""" + DuckDuckGo (Web) + + @website https://duckduckgo.com/ + @provide-api yes (https://duckduckgo.com/api), + but not all results from search-site + + @using-api no + @results HTML (using search portal) + @stable no (HTML can change) + @parse url, title, content + + @todo rewrite to api +""" + +from lxml.html import fromstring +from json import loads +from searx.engines.xpath import extract_text +from searx.poolrequests import get +from searx.url_utils import urlencode +from searx.utils import match_language, eval_xpath + +# engine dependent config +categories = ['general'] +paging = True +language_support = True +supported_languages_url = 'https://duckduckgo.com/util/u172.js' +time_range_support = True + +language_aliases = { + 'ar-SA': 'ar-XA', + 'es-419': 'es-XL', + 'ja': 'jp-JP', + 'ko': 'kr-KR', + 'sl-SI': 'sl-SL', + 'zh-TW': 'tzh-TW', + 'zh-HK': 'tzh-HK' +} + +# search-url +url = 'https://duckduckgo.com/html?{query}&s={offset}&dc={dc_param}' +time_range_url = '&df={range}' + +time_range_dict = {'day': 'd', + 'week': 'w', + 'month': 'm'} + +# specific xpath variables +result_xpath = '//div[@class="result results_links results_links_deep web-result "]' # noqa +url_xpath = './/a[@class="result__a"]/@href' +title_xpath = './/a[@class="result__a"]' +content_xpath = './/a[@class="result__snippet"]' + + +# match query's language to a region code that duckduckgo will accept +def 
# get response from search-request
def response(resp):
    """Parse the DuckDuckGo HTML results page.

    At most 30 results are taken; entries without a link are skipped.
    """
    results = []

    doc = fromstring(resp.text)

    # parse results
    for i, r in enumerate(eval_xpath(doc, result_xpath)):
        if i >= 30:
            break
        try:
            res_url = eval_xpath(r, url_xpath)[-1]
        # was a bare except: — only an empty xpath result can fail here
        except IndexError:
            continue

        if not res_url:
            continue

        title = extract_text(eval_xpath(r, title_xpath))
        content = extract_text(eval_xpath(r, content_xpath))

        # append result
        results.append({'title': title,
                        'content': content,
                        'url': res_url})

    return results
from their site +def _fetch_supported_languages(resp): + + # response is a js file with regions as an embedded object + response_page = resp.text + response_page = response_page[response_page.find('regions:{') + 8:] + response_page = response_page[:response_page.find('}') + 1] + + regions_json = loads(response_page) + supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys()) + + return list(supported_languages) diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py new file mode 100644 index 000000000..79d10c303 --- /dev/null +++ b/searx/engines/duckduckgo_definitions.py @@ -0,0 +1,171 @@ +""" +DuckDuckGo (definitions) + +- `Instant Answer API`_ +- `DuckDuckGo query`_ + +.. _Instant Answer API: https://duckduckgo.com/api +.. _DuckDuckGo query: https://api.duckduckgo.com/?q=DuckDuckGo&format=json&pretty=1 + +""" + +import json +from lxml import html +from re import compile +from searx.engines.xpath import extract_text +from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases +from searx.url_utils import urlencode +from searx.utils import html_to_text, match_language + +url = 'https://api.duckduckgo.com/'\ + + '?{query}&format=json&pretty=0&no_redirect=1&d=1' + +http_regex = compile(r'^http:') + + +def result_to_text(url, text, htmlResult): + # TODO : remove result ending with "Meaning" or "Category" + dom = html.fromstring(htmlResult) + a = dom.xpath('//a') + if len(a) >= 1: + return extract_text(a[0]) + else: + return text + + +def request(query, params): + params['url'] = url.format(query=urlencode({'q': query})) + language = match_language(params['language'], supported_languages, language_aliases) + language = language.split('-')[0] + params['headers']['Accept-Language'] = language + return params + + +def response(resp): + results = [] + + search_res = json.loads(resp.text) + + content = '' + heading = search_res.get('Heading', '') + attributes 
= [] + urls = [] + infobox_id = None + relatedTopics = [] + + # add answer if there is one + answer = search_res.get('Answer', '') + if answer: + if search_res.get('AnswerType', '') not in ['calc']: + results.append({'answer': html_to_text(answer)}) + + # add infobox + if 'Definition' in search_res: + content = content + search_res.get('Definition', '') + + if 'Abstract' in search_res: + content = content + search_res.get('Abstract', '') + + # image + image = search_res.get('Image', '') + image = None if image == '' else image + + # attributes + if 'Infobox' in search_res: + infobox = search_res.get('Infobox', None) + if 'content' in infobox: + for info in infobox.get('content'): + attributes.append({'label': info.get('label'), + 'value': info.get('value')}) + + # urls + for ddg_result in search_res.get('Results', []): + if 'FirstURL' in ddg_result: + firstURL = ddg_result.get('FirstURL', '') + text = ddg_result.get('Text', '') + urls.append({'title': text, 'url': firstURL}) + results.append({'title': heading, 'url': firstURL}) + + # related topics + for ddg_result in search_res.get('RelatedTopics', []): + if 'FirstURL' in ddg_result: + suggestion = result_to_text(ddg_result.get('FirstURL', None), + ddg_result.get('Text', None), + ddg_result.get('Result', None)) + if suggestion != heading: + results.append({'suggestion': suggestion}) + elif 'Topics' in ddg_result: + suggestions = [] + relatedTopics.append({'name': ddg_result.get('Name', ''), + 'suggestions': suggestions}) + for topic_result in ddg_result.get('Topics', []): + suggestion = result_to_text(topic_result.get('FirstURL', None), + topic_result.get('Text', None), + topic_result.get('Result', None)) + if suggestion != heading: + suggestions.append(suggestion) + + # abstract + abstractURL = search_res.get('AbstractURL', '') + if abstractURL != '': + # add as result ? 
problem always in english + infobox_id = abstractURL + urls.append({'title': search_res.get('AbstractSource'), + 'url': abstractURL}) + + # definition + definitionURL = search_res.get('DefinitionURL', '') + if definitionURL != '': + # add as result ? as answer ? problem always in english + infobox_id = definitionURL + urls.append({'title': search_res.get('DefinitionSource'), + 'url': definitionURL}) + + # to merge with wikidata's infobox + if infobox_id: + infobox_id = http_regex.sub('https:', infobox_id) + + # entity + entity = search_res.get('Entity', None) + # TODO continent / country / department / location / waterfall / + # mountain range : + # link to map search, get weather, near by locations + # TODO musician : link to music search + # TODO concert tour : ?? + # TODO film / actor / television / media franchise : + # links to IMDB / rottentomatoes (or scrap result) + # TODO music : link tu musicbrainz / last.fm + # TODO book : ?? + # TODO artist / playwright : ?? + # TODO compagny : ?? + # TODO software / os : ?? + # TODO software engineer : ?? + # TODO prepared food : ?? + # TODO website : ?? + # TODO performing art : ?? + # TODO prepared food : ?? + # TODO programming language : ?? + # TODO file format : ?? 
+ + if len(heading) > 0: + # TODO get infobox.meta.value where .label='article_title' + if image is None and len(attributes) == 0 and len(urls) == 1 and\ + len(relatedTopics) == 0 and len(content) == 0: + results.append({ + 'url': urls[0]['url'], + 'title': heading, + 'content': content + }) + else: + results.append({ + 'infobox': heading, + 'id': infobox_id, + 'entity': entity, + 'content': content, + 'img_src': image, + 'attributes': attributes, + 'urls': urls, + 'relatedTopics': relatedTopics + }) + + return results diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py new file mode 100644 index 000000000..89924b71c --- /dev/null +++ b/searx/engines/duckduckgo_images.py @@ -0,0 +1,97 @@ +""" + DuckDuckGo (Images) + + @website https://duckduckgo.com/ + @provide-api yes (https://duckduckgo.com/api), + but images are not supported + + @using-api no + @results JSON (site requires js to get images) + @stable no (JSON can change) + @parse url, title, img_src + + @todo avoid extra request +""" + +from json import loads +from searx.engines.xpath import extract_text +from searx.engines.duckduckgo import ( + _fetch_supported_languages, supported_languages_url, + get_region_code, language_aliases +) +from searx.poolrequests import get +from searx.url_utils import urlencode + +# engine dependent config +categories = ['images'] +paging = True +language_support = True +safesearch = True + +# search-url +images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}' +site_url = 'https://duckduckgo.com/?{query}&iar=images&iax=1&ia=images' + + +# run query in site to get vqd number needed for requesting images +# TODO: find a way to get this number without an extra request (is it a hash of the query?) 
# run query in site to get vqd number needed for requesting images
# TODO: find a way to get this number without an extra request (is it a hash of the query?)
def get_vqd(query, headers):
    """Fetch the 'vqd' token that the image endpoint requires.

    Performs one extra GET on the regular result page and scrapes the
    token out of the inline javascript (it appears as ``vqd='...'``).

    Raises Exception when the marker is absent (layout change or the
    request was blocked).
    """
    query_url = site_url.format(query=urlencode({'q': query}))
    res = get(query_url, headers=headers)
    content = res.text
    marker = "vqd='"
    start = content.find(marker)
    if start == -1:
        raise Exception('Request failed')
    token = content[start + len(marker):]
    return token[:token.find("'")]


# do search-request
def request(query, params):
    """Build the i.js image-search URL; needs a vqd token first."""
    # to avoid running actual external requests when testing
    if 'is_test' not in params:
        vqd = get_vqd(query, params['headers'])
    else:
        vqd = '12345'

    offset = (params['pageno'] - 1) * 50

    # duckduckgo's 'p' parameter is searx's safesearch level shifted by one
    safesearch = params['safesearch'] - 1

    region_code = get_region_code(params['language'], lang_list=supported_languages)
    if region_code:
        params['url'] = images_url.format(
            query=urlencode({'q': query, 'l': region_code}), offset=offset, safesearch=safesearch, vqd=vqd)
    else:
        params['url'] = images_url.format(
            query=urlencode({'q': query}), offset=offset, safesearch=safesearch, vqd=vqd)

    return params


# get response from search-request
def response(resp):
    """Parse the i.js JSON payload into searx image results."""
    results = []

    try:
        res_json = loads(resp.text)
    except ValueError:
        # was a bare 'except:'; only a JSON decoding error is expected here
        raise Exception('Cannot parse results')

    # parse results
    for result in res_json['results']:
        results.append({'template': 'images.html',
                        'title': result['title'],
                        'content': '',
                        'thumbnail_src': result['thumbnail'],
                        'img_src': result['image'],
                        'url': result['url']})

    return results
categories = ['general']
paging = True
language_support = False

# search-url
base_url = 'https://www.duden.de/'
search_url = base_url + 'suchen/dudenonline/{query}?search_api_fulltext=&page={offset}'


def request(query, params):
    '''pre-request callback
    params<dict>:
        method  : POST/GET
        headers : {}
        data    : {} # if method == POST
        url     : ''
        category: 'search category'
        pageno  : 1 # number of the requested page
    '''
    # Duden serves the first result page from a shorter URL without any
    # 'page' parameter, so offset 0 uses a different template.
    offset = (params['pageno'] - 1)
    if offset == 0:
        search_url_fmt = base_url + 'suchen/dudenonline/{query}'
        params['url'] = search_url_fmt.format(query=quote(query))
    else:
        params['url'] = search_url.format(offset=offset, query=quote(query))
    return params


def response(resp):
    '''post-response callback
    resp: requests response object
    '''
    results = []

    dom = html.fromstring(resp.text)

    try:
        number_of_results_string = \
            re.sub('[^0-9]', '',
                   eval_xpath(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0])
        results.append({'number_of_results': int(number_of_results_string)})
    except (IndexError, ValueError):
        # count element missing or non-numeric; was a bare 'except:' which
        # also swallowed SystemExit/KeyboardInterrupt
        logger.debug("Couldn't read number of results.")

    for result in eval_xpath(dom, '//section[not(contains(@class, "essay"))]'):
        try:
            url = eval_xpath(result, './/h2/a')[0].get('href')
            url = urljoin(base_url, url)
            title = eval_xpath(result, 'string(.//h2/a)').strip()
            content = extract_text(eval_xpath(result, './/p'))
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})
        except Exception:
            # skip a malformed result block instead of aborting the page
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    return results
# engine dependent config
categories = ['general', 'news']
paging = True
language_support = True
number_of_results = 10

# search-url
url = 'http://www.faroo.com/'
search_url = url + 'instant.json?{query}' \
                   '&start={offset}' \
                   '&length={number_of_results}' \
                   '&l={language}' \
                   '&src={categorie}' \
                   '&i=false' \
                   '&c=false'

search_category = {'general': 'web',
                   'news': 'news'}


# do search-request
def request(query, params):
    """Build the Faroo instant-search URL.

    Faroo only serves English, German and Chinese results; any other
    interface language falls back to English.
    """
    offset = (params['pageno'] - 1) * number_of_results + 1
    src = search_category.get(params['category'], 'web')

    if params['language'] == 'all':
        lang = 'en'
    else:
        lang = params['language'].split('-')[0]
    if lang not in ('en', 'de', 'zh'):
        lang = 'en'

    params['url'] = search_url.format(offset=offset,
                                      number_of_results=number_of_results,
                                      query=urlencode({'q': query}),
                                      language=lang,
                                      categorie=src)

    # Faroo expects a Referer pointing back at its own site
    params['headers']['Referer'] = url

    return params
# engine dependent config
categories = ['files']
paging = True

# search-url
base_url = 'https://search.f-droid.org/'
search_url = base_url + '?{query}'


# do search-request
def request(query, params):
    """Build the F-Droid search URL for the requested page."""
    qs = urlencode({'q': query, 'page': params['pageno'], 'lang': ''})
    params['url'] = search_url.format(query=qs)
    return params


# get response from search-request
def response(resp):
    """Scrape app link, name, summary/license and icon from the result page."""
    results = []

    dom = html.fromstring(resp.text)

    for app in dom.xpath('//a[@class="package-header"]'):
        link = app.xpath('./@href')[0]
        name = extract_text(app.xpath('./div/h4[@class="package-name"]/text()'))
        summary = extract_text(app.xpath('./div/div/span[@class="package-summary"]')).strip()
        license_name = extract_text(app.xpath('./div/div/span[@class="package-license"]')).strip()
        icon = app.xpath('./img[@class="package-icon"]/@src')[0]

        results.append({'url': link,
                        'title': name,
                        'content': summary + ' - ' + license_name,
                        'img_src': icon})

    return results
categories = ['images']

nb_per_page = 15
paging = True
api_key = None


url = 'https://api.flickr.com/services/rest/?method=flickr.photos.search' +\
    '&api_key={api_key}&{text}&sort=relevance' +\
    '&extras=description%2C+owner_name%2C+url_o%2C+url_n%2C+url_z' +\
    '&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}'
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'


def build_flickr_url(user_id, photo_id):
    """Compose the public photo-page URL from owner and photo ids."""
    return photo_url.format(userid=user_id, photoid=photo_id)


def request(query, params):
    """Fill in the Flickr REST API search URL for the requested page."""
    params['url'] = url.format(text=urlencode({'text': query}),
                               api_key=api_key,
                               nb_per_page=nb_per_page,
                               page=params['pageno'])
    return params
img_src = photo['url_z'] + else: + continue + +# For a bigger thumbnail, keep only the url_z, not the url_n + if 'url_n' in photo: + thumbnail_src = photo['url_n'] + elif 'url_z' in photo: + thumbnail_src = photo['url_z'] + else: + thumbnail_src = img_src + + url = build_flickr_url(photo['owner'], photo['id']) + + # append result + results.append({'url': url, + 'title': photo['title'], + 'img_src': img_src, + 'thumbnail_src': thumbnail_src, + 'content': photo['description']['_content'], + 'author': photo['ownername'], + 'template': 'images.html'}) + + # return results + return results diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py new file mode 100644 index 000000000..198ac2cff --- /dev/null +++ b/searx/engines/flickr_noapi.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python + +""" + Flickr (Images) + + @website https://www.flickr.com + @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html) + + @using-api no + @results HTML + @stable no + @parse url, title, thumbnail, img_src +""" + +from json import loads +from time import time +import re +from searx.engines import logger +from searx.url_utils import urlencode +from searx.utils import ecma_unescape, html_to_text + +logger = logger.getChild('flickr-noapi') + +categories = ['images'] + +url = 'https://www.flickr.com/' +search_url = url + 'search?{query}&page={page}' +time_range_url = '&min_upload_date={start}&max_upload_date={end}' +photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}' +modelexport_re = re.compile(r"^\s*modelExport:\s*({.*}),$", re.M) +image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's') + +paging = True +time_range_support = True +time_range_dict = {'day': 60 * 60 * 24, + 'week': 60 * 60 * 24 * 7, + 'month': 60 * 60 * 24 * 7 * 4, + 'year': 60 * 60 * 24 * 7 * 52} + + +def build_flickr_url(user_id, photo_id): + return photo_url.format(userid=user_id, photoid=photo_id) + + +def _get_time_range_url(time_range): + if time_range 
in time_range_dict: + return time_range_url.format(start=time(), end=str(int(time()) - time_range_dict[time_range])) + return '' + + +def request(query, params): + params['url'] = (search_url.format(query=urlencode({'text': query}), page=params['pageno']) + + _get_time_range_url(params['time_range'])) + return params + + +def response(resp): + results = [] + + matches = modelexport_re.search(resp.text) + + if matches is None: + return results + + match = matches.group(1) + model_export = loads(match) + + if 'legend' not in model_export: + return results + + legend = model_export['legend'] + + # handle empty page + if not legend or not legend[0]: + return results + + for index in legend: + photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][int(index[4])] + author = ecma_unescape(photo.get('realname', '')) + source = ecma_unescape(photo.get('username', '')) + ' @ Flickr' + title = ecma_unescape(photo.get('title', '')) + content = html_to_text(ecma_unescape(photo.get('description', ''))) + img_src = None + # From the biggest to the lowest format + for image_size in image_sizes: + if image_size in photo['sizes']: + img_src = photo['sizes'][image_size]['url'] + img_format = 'jpg ' \ + + str(photo['sizes'][image_size]['width']) \ + + 'x' \ + + str(photo['sizes'][image_size]['height']) + break + + if not img_src: + logger.debug('cannot find valid image size: {0}'.format(repr(photo))) + continue + + # For a bigger thumbnail, keep only the url_z, not the url_n + if 'n' in photo['sizes']: + thumbnail_src = photo['sizes']['n']['url'] + elif 'z' in photo['sizes']: + thumbnail_src = photo['sizes']['z']['url'] + else: + thumbnail_src = img_src + + if 'ownerNsid' not in photo: + # should not happen, disowned photo? 
# search-url
base_url = 'https://framalibre.org/'
search_url = base_url + 'recherche-par-crit-res?{query}&page={offset}'


# do search-request
def request(query, params):
    """Build the FramaLibre search URL (result pages are 0-indexed)."""
    page_index = params['pageno'] - 1
    params['url'] = search_url.format(query=urlencode({'keys': query}),
                                      offset=page_index)
    return params
categories = ['images']

BASE = 'https://frinkiac.com/'
SEARCH_URL = '{base}api/search?{query}'
RESULT_URL = '{base}?{query}'
THUMB_URL = '{base}img/{episode}/{timestamp}/medium.jpg'
IMAGE_URL = '{base}img/{episode}/{timestamp}.jpg'


def request(query, params):
    """Point *params* at the Frinkiac caption-search API."""
    params['url'] = SEARCH_URL.format(base=BASE, query=urlencode({'q': query}))
    return params


def response(resp):
    """Map the JSON hit list onto searx image results."""
    results = []
    for hit in loads(resp.text):
        episode = hit['Episode']
        timestamp = hit['Timestamp']
        caption_query = urlencode({'p': 'caption', 'e': episode, 't': timestamp})
        results.append({
            'template': 'images.html',
            'url': RESULT_URL.format(base=BASE, query=caption_query),
            'title': episode,
            'content': '',
            'thumbnail_src': THUMB_URL.format(base=BASE, episode=episode, timestamp=timestamp),
            'img_src': IMAGE_URL.format(base=BASE, episode=episode, timestamp=timestamp),
        })
    return results
def request(query, params):
    """Build the Genius multi-index search API URL."""
    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      index='multi',
                                      page_size=page_size,
                                      pageno=params['pageno'])
    return params


def parse_lyric(hit):
    """Map a 'lyric'/'song' hit onto a searx video-template result.

    The first highlight (if any) becomes the snippet; a truthy
    ``lyrics_updated_at`` timestamp becomes ``publishedDate``.
    """
    try:
        content = hit['highlights'][0]['value']
    except (KeyError, IndexError, TypeError):
        # no highlight available for this hit; was a bare 'except:'
        content = None
    timestamp = hit['result']['lyrics_updated_at']
    result = {'url': hit['result']['url'],
              'title': hit['result']['full_title'],
              'content': content,
              'thumbnail': hit['result']['song_art_image_thumbnail_url'],
              'template': 'videos.html'}
    if timestamp:
        result.update({'publishedDate': datetime.fromtimestamp(timestamp)})
    return result


def parse_artist(hit):
    """Map an 'artist' hit onto a searx video-template result."""
    return {'url': hit['result']['url'],
            'title': hit['result']['name'],
            'content': None,
            'thumbnail': hit['result']['image_url'],
            'template': 'videos.html'}


def parse_album(hit):
    """Map an 'album' hit; the release year (when present) becomes the snippet."""
    result = {'url': hit['result']['url'],
              'title': hit['result']['full_title'],
              'thumbnail': hit['result']['cover_art_url'],
              'template': 'videos.html'}
    try:
        year = hit['result']['release_date_components']['year']
    except (KeyError, TypeError):
        # release date missing or null; was a bare 'except:'
        pass
    else:
        if year:
            result.update({'content': 'Released: {}'.format(year)})
    return result

parse = {'lyric': parse_lyric, 'song': parse_lyric, 'artist': parse_artist, 'album': parse_album}
# cut 'en' from 'en-US', 'de' from 'de-CH', and so on
def locale_to_lang_code(locale):
    """Reduce a locale such as 'en-US' to its bare language code ('en')."""
    return locale.split('-')[0] if '-' in locale else locale


# get base & search URLs for selected language
def get_lang_urls(language):
    """Return the URL-template dict for *language* ('en' vs everything else)."""
    return lang_urls['en'] if language == 'en' else lang_urls['others']
# do search-request
def request(query, params):
    """Build the Gentoo-wiki search request for the user's language.

    Languages hosted on the main wiki are narrowed by appending the
    language's native name (from ``main_langs``) to the query; other
    languages use the translation-filtered URL template.
    """
    # translate the locale (e.g. 'en-US') to language code ('en')
    lang = locale_to_lang_code(params['language'])

    if lang in main_langs:
        query += b' (' + main_langs[lang].encode('utf-8') + b')'

    # prepare the request parameters
    query_args = urlencode({'search': query})
    offset = (params['pageno'] - 1) * 20

    # get request URLs for our language of choice
    urls = get_lang_urls(lang)
    params['url'] = (urls['base'] + urls['search']).format(query=query_args,
                                                           offset=offset,
                                                           language=lang)

    return params
# engine dependent config
categories = ['general']
paging = True
number_of_results = 10
language_support = True
safesearch = True

# search-url
base_url = 'https://gigablast.com/'
search_string = 'search?{query}' \
                '&n={number_of_results}' \
                '&c=main' \
                '&s={offset}' \
                '&format=json' \
                '&qh=0' \
                '&qlang={lang}' \
                '&ff={safesearch}' \
                '&rxiec={rxieu}' \
                '&ulse={ulse}' \
                '&rand={rxikd}' \
                '&dbez={dbez}'


# do search-request
def request(query, params):
    """Build the Gigablast JSON search URL.

    rxiec/ulse/rand/dbez are anti-bot tokens; the site accepts arbitrary
    numbers, so random/time-based values are used.
    """
    offset = (params['pageno'] - 1) * number_of_results

    if params['language'] == 'all':
        lang = 'xx'
    else:
        lang = params['language'].replace('-', '_').lower()
        # NOTE(review): this splits on '-' although '-' was just replaced
        # by '_', so non-'zh...' locales keep their full 'xx_yy' form --
        # preserved as-is (style-only rewrite); confirm intended behaviour.
        if lang.split('-')[0] != 'zh':
            lang = lang.split('-')[0]

    safe = 1 if params['safesearch'] >= 1 else 0

    params['url'] = base_url + search_string.format(
        query=urlencode({'q': query}),
        offset=offset,
        number_of_results=number_of_results,
        rxikd=int(time() * 1000),
        rxieu=random.randint(1000000000, 9999999999),
        ulse=random.randint(100000000, 999999999),
        lang=lang,
        safesearch=safe,
        dbez=random.randint(100000000, 999999999))

    return params
# search-url
search_url = 'https://api.github.com/search/repositories?sort=stars&order=desc&{query}'  # noqa

accept_header = 'application/vnd.github.preview.text-match+json'


# do search-request
def request(query, params):
    """Build the GitHub repository-search API request."""
    params['url'] = search_url.format(query=urlencode({'q': query}))
    params['headers']['Accept'] = accept_header
    return params


# get response from search-request
def response(resp):
    """Turn the GitHub JSON payload into searx results."""
    data = loads(resp.text)

    # a payload without 'items' is an error answer or an empty one
    if 'items' not in data:
        return []

    results = []
    for repo in data['items']:
        description = repo['description'][:500] if repo['description'] else ''
        results.append({'url': repo['html_url'],
                        'title': repo['name'],
                        'content': description})

    return results
b/searx/engines/google.py new file mode 100644 index 000000000..eed3a044e --- /dev/null +++ b/searx/engines/google.py @@ -0,0 +1,391 @@ +# Google (Web) +# +# @website https://www.google.com +# @provide-api yes (https://developers.google.com/custom-search/) +# +# @using-api no +# @results HTML +# @stable no (HTML can change) +# @parse url, title, content, suggestion + +import re +from flask_babel import gettext +from lxml import html, etree +from searx.engines.xpath import extract_text, extract_url +from searx import logger +from searx.url_utils import urlencode, urlparse, parse_qsl +from searx.utils import match_language, eval_xpath + +logger = logger.getChild('google engine') + + +# engine dependent config +categories = ['general'] +paging = True +language_support = True +use_locale_domain = True +time_range_support = True + +# based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests +default_hostname = 'www.google.com' + +country_to_hostname = { + 'BG': 'www.google.bg', # Bulgaria + 'CZ': 'www.google.cz', # Czech Republic + 'DE': 'www.google.de', # Germany + 'DK': 'www.google.dk', # Denmark + 'AT': 'www.google.at', # Austria + 'CH': 'www.google.ch', # Switzerland + 'GR': 'www.google.gr', # Greece + 'AU': 'www.google.com.au', # Australia + 'CA': 'www.google.ca', # Canada + 'GB': 'www.google.co.uk', # United Kingdom + 'ID': 'www.google.co.id', # Indonesia + 'IE': 'www.google.ie', # Ireland + 'IN': 'www.google.co.in', # India + 'MY': 'www.google.com.my', # Malaysia + 'NZ': 'www.google.co.nz', # New Zealand + 'PH': 'www.google.com.ph', # Philippines + 'SG': 'www.google.com.sg', # Singapore + # 'US': 'www.google.us', # United States, redirect to .com + 'ZA': 'www.google.co.za', # South Africa + 'AR': 'www.google.com.ar', # Argentina + 'CL': 'www.google.cl', # Chile + 'ES': 'www.google.es', # Spain + 'MX': 'www.google.com.mx', # Mexico + 'EE': 'www.google.ee', # Estonia + 'FI': 'www.google.fi', # Finland + 'BE': 'www.google.be', # Belgium + 'FR': 
'www.google.fr', # France + 'IL': 'www.google.co.il', # Israel + 'HR': 'www.google.hr', # Croatia + 'HU': 'www.google.hu', # Hungary + 'IT': 'www.google.it', # Italy + 'JP': 'www.google.co.jp', # Japan + 'KR': 'www.google.co.kr', # South Korea + 'LT': 'www.google.lt', # Lithuania + 'LV': 'www.google.lv', # Latvia + 'NO': 'www.google.no', # Norway + 'NL': 'www.google.nl', # Netherlands + 'PL': 'www.google.pl', # Poland + 'BR': 'www.google.com.br', # Brazil + 'PT': 'www.google.pt', # Portugal + 'RO': 'www.google.ro', # Romania + 'RU': 'www.google.ru', # Russia + 'SK': 'www.google.sk', # Slovakia + 'SI': 'www.google.si', # Slovenia + 'SE': 'www.google.se', # Sweden + 'TH': 'www.google.co.th', # Thailand + 'TR': 'www.google.com.tr', # Turkey + 'UA': 'www.google.com.ua', # Ukraine + # 'CN': 'www.google.cn', # China, only from China ? + 'HK': 'www.google.com.hk', # Hong Kong + 'TW': 'www.google.com.tw' # Taiwan +} + +# osm +url_map = 'https://www.openstreetmap.org/'\ + + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M' + +# search-url +search_path = '/search' +search_url = ('https://{hostname}' + + search_path + + '?{query}&start={offset}&gws_rd=cr&gbv=1&lr={lang}&hl={lang_short}&ei=x') + +time_range_search = "&tbs=qdr:{range}" +time_range_dict = {'day': 'd', + 'week': 'w', + 'month': 'm', + 'year': 'y'} + +# other URLs +map_hostname_start = 'maps.google.' 
+maps_path = '/maps' +redirect_path = '/url' +images_path = '/images' +supported_languages_url = 'https://www.google.com/preferences?#languages' + +# specific xpath variables +results_xpath = '//div[contains(@class, "ZINbbc")]' +url_xpath = './/div[@class="kCrYT"][1]/a/@href' +title_xpath = './/div[@class="kCrYT"][1]/a/div[1]' +content_xpath = './/div[@class="kCrYT"][2]//div[contains(@class, "BNeawe")]//div[contains(@class, "BNeawe")]' +suggestion_xpath = '//div[contains(@class, "ZINbbc")][last()]//div[@class="rVLSBd"]/a//div[contains(@class, "BNeawe")]' +spelling_suggestion_xpath = '//div[@id="scc"]//a' + +# map : detail location +map_address_xpath = './/div[@class="s"]//table//td[2]/span/text()' +map_phone_xpath = './/div[@class="s"]//table//td[2]/span/span' +map_website_url_xpath = 'h3[2]/a/@href' +map_website_title_xpath = 'h3[2]' + +# map : near the location +map_near = 'table[@class="ts"]//tr' +map_near_title = './/h4' +map_near_url = './/h4/a/@href' +map_near_phone = './/span[@class="nobr"]' + +# images +images_xpath = './/div/a' +image_url_xpath = './@href' +image_img_src_xpath = './img/@src' + +# property names +# FIXME : no translation +property_address = "Address" +property_phone = "Phone number" + + +# remove google-specific tracking-url +def parse_url(url_string, google_hostname): + # sanity check + if url_string is None: + return url_string + + # normal case + parsed_url = urlparse(url_string) + if (parsed_url.netloc in [google_hostname, ''] + and parsed_url.path == redirect_path): + query = dict(parse_qsl(parsed_url.query)) + return query['q'] + else: + return url_string + + +# returns extract_text on the first result selected by the xpath or None +def extract_text_from_dom(result, xpath): + r = eval_xpath(result, xpath) + if len(r) > 0: + return extract_text(r[0]) + return None + + +# do search-request +def request(query, params): + offset = (params['pageno'] - 1) * 10 + + if params['language'] == 'all' or params['language'] == 'en-US': + language = 
'en-GB' + else: + language = match_language(params['language'], supported_languages, language_aliases) + + language_array = language.split('-') + if params['language'].find('-') > 0: + country = params['language'].split('-')[1] + elif len(language_array) == 2: + country = language_array[1] + else: + country = 'US' + + url_lang = 'lang_' + language + + if use_locale_domain: + google_hostname = country_to_hostname.get(country.upper(), default_hostname) + else: + google_hostname = default_hostname + + # original format: ID=3e2b6616cee08557:TM=5556667580:C=r:IP=4.1.12.5-:S=23ASdf0soFgF2d34dfgf-_22JJOmHdfgg + params['cookies']['GOOGLE_ABUSE_EXEMPTION'] = 'x' + params['url'] = search_url.format(offset=offset, + query=urlencode({'q': query}), + hostname=google_hostname, + lang=url_lang, + lang_short=language) + if params['time_range'] in time_range_dict: + params['url'] += time_range_search.format(range=time_range_dict[params['time_range']]) + + params['headers']['Accept-Language'] = language + ',' + language + '-' + country + params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' + + params['google_hostname'] = google_hostname + + return params + + +# get response from search-request +def response(resp): + results = [] + + # detect google sorry + resp_url = urlparse(resp.url) + if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect': + raise RuntimeWarning('sorry.google.com') + + if resp_url.path.startswith('/sorry'): + raise RuntimeWarning(gettext('CAPTCHA required')) + + # which hostname ? 
+ google_hostname = resp.search_params.get('google_hostname') + google_url = "https://" + google_hostname + + # convert the text to dom + dom = html.fromstring(resp.text) + + instant_answer = eval_xpath(dom, '//div[@id="_vBb"]//text()') + if instant_answer: + results.append({'answer': u' '.join(instant_answer)}) + try: + results_num = int(eval_xpath(dom, '//div[@id="resultStats"]//text()')[0] + .split()[1].replace(',', '')) + results.append({'number_of_results': results_num}) + except: + pass + + # parse results + for result in eval_xpath(dom, results_xpath): + try: + title = extract_text(eval_xpath(result, title_xpath)[0]) + url = parse_url(extract_url(eval_xpath(result, url_xpath), google_url), google_hostname) + parsed_url = urlparse(url, google_hostname) + + # map result + if parsed_url.netloc == google_hostname: + # TODO fix inside links + continue + # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start): + # print "yooooo"*30 + # x = eval_xpath(result, map_near) + # if len(x) > 0: + # # map : near the location + # results = results + parse_map_near(parsed_url, x, google_hostname) + # else: + # # map : detail about a location + # results = results + parse_map_detail(parsed_url, result, google_hostname) + # # google news + # elif parsed_url.path == search_path: + # # skipping news results + # pass + + # # images result + # elif parsed_url.path == images_path: + # # only thumbnail image provided, + # # so skipping image results + # # results = results + parse_images(result, google_hostname) + # pass + + else: + # normal result + content = extract_text_from_dom(result, content_xpath) + if content is None: + continue + + # append result + results.append({'url': url, + 'title': title, + 'content': content + }) + except: + logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True)) + continue + + # parse suggestion + for suggestion in eval_xpath(dom, suggestion_xpath): + # append suggestion + 
results.append({'suggestion': extract_text(suggestion)}) + + for correction in eval_xpath(dom, spelling_suggestion_xpath): + results.append({'correction': extract_text(correction)}) + + # return results + return results + + +def parse_images(result, google_hostname): + results = [] + for image in eval_xpath(result, images_xpath): + url = parse_url(extract_text(eval_xpath(image, image_url_xpath)[0]), google_hostname) + img_src = extract_text(eval_xpath(image, image_img_src_xpath)[0]) + + # append result + results.append({'url': url, + 'title': '', + 'content': '', + 'img_src': img_src, + 'template': 'images.html' + }) + + return results + + +def parse_map_near(parsed_url, x, google_hostname): + results = [] + + for result in x: + title = extract_text_from_dom(result, map_near_title) + url = parse_url(extract_text_from_dom(result, map_near_url), google_hostname) + attributes = [] + phone = extract_text_from_dom(result, map_near_phone) + add_attributes(attributes, property_phone, phone, 'tel:' + phone) + results.append({'title': title, + 'url': url, + 'content': attributes_to_html(attributes) + }) + + return results + + +def parse_map_detail(parsed_url, result, google_hostname): + results = [] + + # try to parse the geoloc + m = re.search(r'@([0-9\.]+),([0-9\.]+),([0-9]+)', parsed_url.path) + if m is None: + m = re.search(r'll\=([0-9\.]+),([0-9\.]+)\&z\=([0-9]+)', parsed_url.query) + + if m is not None: + # geoloc found (ignored) + lon = float(m.group(2)) # noqa + lat = float(m.group(1)) # noqa + zoom = int(m.group(3)) # noqa + + # attributes + attributes = [] + address = extract_text_from_dom(result, map_address_xpath) + phone = extract_text_from_dom(result, map_phone_xpath) + add_attributes(attributes, property_address, address, 'geo:' + str(lat) + ',' + str(lon)) + add_attributes(attributes, property_phone, phone, 'tel:' + phone) + + # title / content / url + website_title = extract_text_from_dom(result, map_website_title_xpath) + content = 
extract_text_from_dom(result, content_xpath) + website_url = parse_url(extract_text_from_dom(result, map_website_url_xpath), google_hostname) + + # add a result if there is a website + if website_url is not None: + results.append({'title': website_title, + 'content': (content + '<br />' if content is not None else '') + + attributes_to_html(attributes), + 'url': website_url + }) + + return results + + +def add_attributes(attributes, name, value, url): + if value is not None and len(value) > 0: + attributes.append({'label': name, 'value': value, 'url': url}) + + +def attributes_to_html(attributes): + retval = '<table class="table table-striped">' + for a in attributes: + value = a.get('value') + if 'url' in a: + value = '<a href="' + a.get('url') + '">' + value + '</a>' + retval = retval + '<tr><th>' + a.get('label') + '</th><td>' + value + '</td></tr>' + retval = retval + '</table>' + return retval + + +# get supported languages from their site +def _fetch_supported_languages(resp): + supported_languages = {} + dom = html.fromstring(resp.text) + options = eval_xpath(dom, '//*[@id="langSec"]//input[@name="lr"]') + for option in options: + code = eval_xpath(option, './@value')[0].split('_')[-1] + name = eval_xpath(option, './@data-name')[0].title() + supported_languages[code] = {"name": name} + + return supported_languages diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py new file mode 100644 index 000000000..636913114 --- /dev/null +++ b/searx/engines/google_images.py @@ -0,0 +1,97 @@ +""" + Google (Images) + + @website https://www.google.com + @provide-api yes (https://developers.google.com/custom-search/) + + @using-api no + @results HTML chunks with JSON inside + @stable no + @parse url, title, img_src +""" + +from datetime import date, timedelta +from json import loads +from lxml import html +from searx.url_utils import urlencode + +# engine dependent config +categories = ['images'] +paging = True +safesearch = True +time_range_support 
= True +number_of_results = 100 + +search_url = 'https://www.google.com/search'\ + '?{query}'\ + '&tbm=isch'\ + '&yv=2'\ + '&{search_options}' +time_range_attr = "qdr:{range}" +time_range_custom_attr = "cdr:1,cd_min:{start},cd_max{end}" +time_range_dict = {'day': 'd', + 'week': 'w', + 'month': 'm'} + + +# do search-request +def request(query, params): + search_options = { + 'ijn': params['pageno'] - 1, + 'start': (params['pageno'] - 1) * number_of_results + } + + if params['time_range'] in time_range_dict: + search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']]) + elif params['time_range'] == 'year': + now = date.today() + then = now - timedelta(days=365) + start = then.strftime('%m/%d/%Y') + end = now.strftime('%m/%d/%Y') + search_options['tbs'] = time_range_custom_attr.format(start=start, end=end) + + if safesearch and params['safesearch']: + search_options['safe'] = 'on' + + params['url'] = search_url.format(query=urlencode({'q': query}), + search_options=urlencode(search_options)) + + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + # parse results + for result in dom.xpath('//div[contains(@class, "rg_meta")]/text()'): + + try: + metadata = loads(result) + + img_format = metadata.get('ity', '') + img_width = metadata.get('ow', '') + img_height = metadata.get('oh', '') + if img_width and img_height: + img_format += " {0}x{1}".format(img_width, img_height) + + source = metadata.get('st', '') + source_url = metadata.get('isu', '') + if source_url: + source += " ({0})".format(source_url) + + results.append({'url': metadata['ru'], + 'title': metadata['pt'], + 'content': metadata.get('s', ''), + 'source': source, + 'img_format': img_format, + 'thumbnail_src': metadata['tu'], + 'img_src': metadata['ou'], + 'template': 'images.html'}) + + except: + continue + + return results diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py new 
file mode 100644 index 000000000..9c837b45b --- /dev/null +++ b/searx/engines/google_news.py @@ -0,0 +1,86 @@ +""" + Google (News) + + @website https://news.google.com + @provide-api no + + @using-api no + @results HTML + @stable no + @parse url, title, content, publishedDate +""" + +from lxml import html +from searx.engines.google import _fetch_supported_languages, supported_languages_url +from searx.url_utils import urlencode +from searx.utils import match_language + +# search-url +categories = ['news'] +paging = True +language_support = True +safesearch = True +time_range_support = True +number_of_results = 10 + +search_url = 'https://www.google.com/search'\ + '?{query}'\ + '&tbm=nws'\ + '&gws_rd=cr'\ + '&{search_options}' +time_range_attr = "qdr:{range}" +time_range_dict = {'day': 'd', + 'week': 'w', + 'month': 'm', + 'year': 'y'} + + +# do search-request +def request(query, params): + + search_options = { + 'start': (params['pageno'] - 1) * number_of_results + } + + if params['time_range'] in time_range_dict: + search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']]) + + if safesearch and params['safesearch']: + search_options['safe'] = 'on' + + params['url'] = search_url.format(query=urlencode({'q': query}), + search_options=urlencode(search_options)) + + if params['language'] != 'all': + language = match_language(params['language'], supported_languages, language_aliases).split('-')[0] + if language: + params['url'] += '&lr=lang_' + language + + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + # parse results + for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'): + try: + r = { + 'url': result.xpath('.//a[@class="l lLrAF"]')[0].attrib.get("href"), + 'title': ''.join(result.xpath('.//a[@class="l lLrAF"]//text()')), + 'content': ''.join(result.xpath('.//div[@class="st"]//text()')), + } + except: + continue + + imgs = 
result.xpath('.//img/@src') + if len(imgs) and not imgs[0].startswith('data'): + r['img_src'] = imgs[0] + + results.append(r) + + # return results + return results diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py new file mode 100644 index 000000000..fd6b2e3be --- /dev/null +++ b/searx/engines/google_videos.py @@ -0,0 +1,97 @@ +""" + Google (Videos) + + @website https://www.google.com + @provide-api yes (https://developers.google.com/custom-search/) + + @using-api no + @results HTML + @stable no + @parse url, title, content, thumbnail +""" + +from datetime import date, timedelta +from json import loads +from lxml import html +from searx.engines.xpath import extract_text +from searx.url_utils import urlencode +import re + +# engine dependent config +categories = ['videos'] +paging = True +safesearch = True +time_range_support = True +number_of_results = 10 + +search_url = 'https://www.google.com/search'\ + '?q={query}'\ + '&tbm=vid'\ + '&{search_options}' +time_range_attr = "qdr:{range}" +time_range_custom_attr = "cdr:1,cd_min:{start},cd_max{end}" +time_range_dict = {'day': 'd', + 'week': 'w', + 'month': 'm'} + + +# do search-request +def request(query, params): + search_options = { + 'ijn': params['pageno'] - 1, + 'start': (params['pageno'] - 1) * number_of_results + } + + if params['time_range'] in time_range_dict: + search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']]) + elif params['time_range'] == 'year': + now = date.today() + then = now - timedelta(days=365) + start = then.strftime('%m/%d/%Y') + end = now.strftime('%m/%d/%Y') + search_options['tbs'] = time_range_custom_attr.format(start=start, end=end) + + if safesearch and params['safesearch']: + search_options['safe'] = 'on' + + params['url'] = search_url.format(query=urlencode({'q': query}), + search_options=urlencode(search_options)) + + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = 
html.fromstring(resp.text) + + # parse results + for result in dom.xpath('//div[@class="g"]'): + + title = extract_text(result.xpath('.//h3')) + url = result.xpath('.//div[@class="r"]/a/@href')[0] + content = extract_text(result.xpath('.//span[@class="st"]')) + + # get thumbnails + script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text) + ids = result.xpath('.//div[@class="s"]//img/@id') + if len(ids) > 0: + thumbnails_data = \ + re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + ids[0], + script) + tmp = [] + if len(thumbnails_data) != 0: + tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0]) + thumbnail = '' + if len(tmp) != 0: + thumbnail = tmp[-1] + + # append result + results.append({'url': url, + 'title': title, + 'content': content, + 'thumbnail': thumbnail, + 'template': 'videos.html'}) + + return results diff --git a/searx/engines/ina.py b/searx/engines/ina.py new file mode 100644 index 000000000..37a05f099 --- /dev/null +++ b/searx/engines/ina.py @@ -0,0 +1,87 @@ +# INA (Videos) +# +# @website https://www.ina.fr/ +# @provide-api no +# +# @using-api no +# @results HTML (using search portal) +# @stable no (HTML can change) +# @parse url, title, content, publishedDate, thumbnail +# +# @todo set content-parameter with correct data +# @todo embedded (needs some md5 from video page) + +from json import loads +from lxml import html +from dateutil import parser +from searx.engines.xpath import extract_text +from searx.url_utils import urlencode + +try: + from HTMLParser import HTMLParser +except: + from html.parser import HTMLParser + +# engine dependent config +categories = ['videos'] +paging = True +page_size = 48 + +# search-url +base_url = 'https://www.ina.fr' +search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}' + +# specific xpath variables +results_xpath = 
'//div[contains(@class,"search-results--list")]/div[@class="media"]' +url_xpath = './/a/@href' +title_xpath = './/h3[@class="h3--title media-heading"]' +thumbnail_xpath = './/img/@src' +publishedDate_xpath = './/span[@class="broadcast"]' +content_xpath = './/p[@class="media-body__summary"]' + + +# do search-request +def request(query, params): + params['url'] = search_url.format(ps=page_size, + start=params['pageno'] * page_size, + query=urlencode({'q': query})) + + return params + + +# get response from search-request +def response(resp): + results = [] + + # we get html in a JSON container... + response = loads(resp.text) + if "content" not in response: + return [] + dom = html.fromstring(response["content"]) + p = HTMLParser() + + # parse results + for result in dom.xpath(results_xpath): + videoid = result.xpath(url_xpath)[0] + url = base_url + videoid + title = p.unescape(extract_text(result.xpath(title_xpath))) + thumbnail = extract_text(result.xpath(thumbnail_xpath)[0]) + if thumbnail[0] == '/': + thumbnail = base_url + thumbnail + d = extract_text(result.xpath(publishedDate_xpath)[0]) + d = d.split('/') + # force ISO date to avoid wrong parsing + d = "%s-%s-%s" % (d[2], d[1], d[0]) + publishedDate = parser.parse(d) + content = extract_text(result.xpath(content_xpath)) + + # append result + results.append({'url': url, + 'title': title, + 'content': content, + 'template': 'videos.html', + 'publishedDate': publishedDate, + 'thumbnail': thumbnail}) + + # return results + return results diff --git a/searx/engines/invidious.py b/searx/engines/invidious.py new file mode 100644 index 000000000..8d81691fc --- /dev/null +++ b/searx/engines/invidious.py @@ -0,0 +1,100 @@ +# Invidious (Videos) +# +# @website https://invidio.us/ +# @provide-api yes (https://github.com/omarroth/invidious/wiki/API) +# +# @using-api yes +# @results JSON +# @stable yes +# @parse url, title, content, publishedDate, thumbnail, embedded + +from searx.url_utils import quote_plus +from dateutil 
import parser +import time + +# engine dependent config +categories = ["videos", "music"] +paging = True +language_support = True +time_range_support = True + +# search-url +base_url = "https://invidio.us/" + + +# do search-request +def request(query, params): + time_range_dict = { + "day": "today", + "week": "week", + "month": "month", + "year": "year", + } + search_url = base_url + "api/v1/search?q={query}" + params["url"] = search_url.format( + query=quote_plus(query) + ) + "&page={pageno}".format(pageno=params["pageno"]) + + if params["time_range"] in time_range_dict: + params["url"] += "&date={timerange}".format( + timerange=time_range_dict[params["time_range"]] + ) + + if params["language"] != "all": + lang = params["language"].split("-") + if len(lang) == 2: + params["url"] += "&range={lrange}".format(lrange=lang[1]) + + return params + + +# get response from search-request +def response(resp): + results = [] + + search_results = resp.json() + embedded_url = ( + '<iframe width="540" height="304" ' + + 'data-src="' + + base_url + + 'embed/{videoid}" ' + + 'frameborder="0" allowfullscreen></iframe>' + ) + + base_invidious_url = base_url + "watch?v=" + + for result in search_results: + rtype = result.get("type", None) + if rtype == "video": + videoid = result.get("videoId", None) + if not videoid: + continue + + url = base_invidious_url + videoid + embedded = embedded_url.format(videoid=videoid) + thumbs = result.get("videoThumbnails", []) + thumb = next( + (th for th in thumbs if th["quality"] == "sddefault"), None + ) + if thumb: + thumbnail = thumb.get("url", "") + else: + thumbnail = "" + + publishedDate = parser.parse( + time.ctime(result.get("published", 0)) + ) + + results.append( + { + "url": url, + "title": result.get("title", ""), + "content": result.get("description", ""), + "template": "videos.html", + "publishedDate": publishedDate, + "embedded": embedded, + "thumbnail": thumbnail, + } + ) + + return results diff --git 
a/searx/engines/json_engine.py b/searx/engines/json_engine.py new file mode 100644 index 000000000..785b0c490 --- /dev/null +++ b/searx/engines/json_engine.py @@ -0,0 +1,136 @@ +from collections import Iterable +from json import loads +from sys import version_info +from searx.url_utils import urlencode +from searx.utils import to_string + +if version_info[0] == 3: + unicode = str + +search_url = None +url_query = None +content_query = None +title_query = None +paging = False +suggestion_query = '' +results_query = '' + +# parameters for engines with paging support +# +# number of results on each page +# (only needed if the site requires not a page number, but an offset) +page_size = 1 +# number of the first page (usually 0 or 1) +first_page_num = 1 + + +def iterate(iterable): + if type(iterable) == dict: + it = iterable.items() + + else: + it = enumerate(iterable) + for index, value in it: + yield str(index), value + + +def is_iterable(obj): + if type(obj) == str: + return False + if type(obj) == unicode: + return False + return isinstance(obj, Iterable) + + +def parse(query): + q = [] + for part in query.split('/'): + if part == '': + continue + else: + q.append(part) + return q + + +def do_query(data, q): + ret = [] + if not q: + return ret + + qkey = q[0] + + for key, value in iterate(data): + + if len(q) == 1: + if key == qkey: + ret.append(value) + elif is_iterable(value): + ret.extend(do_query(value, q)) + else: + if not is_iterable(value): + continue + if key == qkey: + ret.extend(do_query(value, q[1:])) + else: + ret.extend(do_query(value, q)) + return ret + + +def query(data, query_string): + q = parse(query_string) + + return do_query(data, q) + + +def request(query, params): + query = urlencode({'q': query})[2:] + + fp = {'query': query} + if paging and search_url.find('{pageno}') >= 0: + fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num + + params['url'] = search_url.format(**fp) + params['query'] = query + + return params + + +def 
response(resp): + results = [] + json = loads(resp.text) + if results_query: + rs = query(json, results_query) + if not len(rs): + return results + for result in rs[0]: + try: + url = query(result, url_query)[0] + title = query(result, title_query)[0] + except: + continue + try: + content = query(result, content_query)[0] + except: + content = "" + results.append({ + 'url': to_string(url), + 'title': to_string(title), + 'content': to_string(content), + }) + else: + for url, title, content in zip( + query(json, url_query), + query(json, title_query), + query(json, content_query) + ): + results.append({ + 'url': to_string(url), + 'title': to_string(title), + 'content': to_string(content), + }) + + if not suggestion_query: + return results + for suggestion in query(json, suggestion_query): + results.append({'suggestion': suggestion}) + return results diff --git a/searx/engines/kickass.py b/searx/engines/kickass.py new file mode 100644 index 000000000..5e897c96f --- /dev/null +++ b/searx/engines/kickass.py @@ -0,0 +1,92 @@ +""" + Kickass Torrent (Videos, Music, Files) + + @website https://kickass.so + @provide-api no (nothing found) + + @using-api no + @results HTML (using search portal) + @stable yes (HTML can change) + @parse url, title, content, seed, leech, magnetlink +""" + +from lxml import html +from operator import itemgetter +from searx.engines.xpath import extract_text +from searx.utils import get_torrent_size, convert_str_to_int +from searx.url_utils import quote, urljoin + +# engine dependent config +categories = ['videos', 'music', 'files'] +paging = True + +# search-url +url = 'https://kickass.cd/' +search_url = url + 'search/{search_term}/{pageno}/' + +# specific xpath variables +magnet_xpath = './/a[@title="Torrent magnet link"]' +torrent_xpath = './/a[@title="Download torrent file"]' +content_xpath = './/span[@class="font11px lightgrey block"]' + + +# do search-request +def request(query, params): + params['url'] = 
search_url.format(search_term=quote(query), + pageno=params['pageno']) + + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + search_res = dom.xpath('//table[@class="data"]//tr') + + # return empty array if nothing is found + if not search_res: + return [] + + # parse results + for result in search_res[1:]: + link = result.xpath('.//a[@class="cellMainLink"]')[0] + href = urljoin(url, link.attrib['href']) + title = extract_text(link) + content = extract_text(result.xpath(content_xpath)) + seed = extract_text(result.xpath('.//td[contains(@class, "green")]')) + leech = extract_text(result.xpath('.//td[contains(@class, "red")]')) + filesize_info = extract_text(result.xpath('.//td[contains(@class, "nobr")]')) + files = extract_text(result.xpath('.//td[contains(@class, "center")][2]')) + + seed = convert_str_to_int(seed) + leech = convert_str_to_int(leech) + + filesize, filesize_multiplier = filesize_info.split() + filesize = get_torrent_size(filesize, filesize_multiplier) + if files.isdigit(): + files = int(files) + else: + files = None + + magnetlink = result.xpath(magnet_xpath)[0].attrib['href'] + + torrentfile = result.xpath(torrent_xpath)[0].attrib['href'] + torrentfileurl = quote(torrentfile, safe="%/:=&?~#+!$,;'@()*") + + # append result + results.append({'url': href, + 'title': title, + 'content': content, + 'seed': seed, + 'leech': leech, + 'filesize': filesize, + 'files': files, + 'magnetlink': magnetlink, + 'torrentfile': torrentfileurl, + 'template': 'torrent.html'}) + + # return results sorted by seeder + return sorted(results, key=itemgetter('seed'), reverse=True) diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py new file mode 100644 index 000000000..0607ac93b --- /dev/null +++ b/searx/engines/mediawiki.py @@ -0,0 +1,90 @@ +""" + general mediawiki-engine (Web) + + @website websites built on mediawiki (https://www.mediawiki.org) + @provide-api yes 
(http://www.mediawiki.org/wiki/API:Search) + + @using-api yes + @results JSON + @stable yes + @parse url, title + + @todo content +""" + +from json import loads +from string import Formatter +from searx.url_utils import urlencode, quote + +# engine dependent config +categories = ['general'] +language_support = True +paging = True +number_of_results = 1 +search_type = 'nearmatch' # possible values: title, text, nearmatch + +# search-url +base_url = 'https://{language}.wikipedia.org/' +search_postfix = 'w/api.php?action=query'\ + '&list=search'\ + '&{query}'\ + '&format=json'\ + '&sroffset={offset}'\ + '&srlimit={limit}'\ + '&srwhat={searchtype}' + + +# do search-request +def request(query, params): + offset = (params['pageno'] - 1) * number_of_results + + string_args = dict(query=urlencode({'srsearch': query}), + offset=offset, + limit=number_of_results, + searchtype=search_type) + + format_strings = list(Formatter().parse(base_url)) + + if params['language'] == 'all': + language = 'en' + else: + language = params['language'].split('-')[0] + + # format_string [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)] + if any(x[1] == 'language' for x in format_strings): + string_args['language'] = language + + # write search-language back to params, required in response + params['language'] = language + + search_url = base_url + search_postfix + + params['url'] = search_url.format(**string_args) + + return params + + +# get response from search-request +def response(resp): + results = [] + + search_results = loads(resp.text) + + # return empty array if there are no results + if not search_results.get('query', {}).get('search'): + return [] + + # parse results + for result in search_results['query']['search']: + if result.get('snippet', '').startswith('#REDIRECT'): + continue + url = base_url.format(language=resp.search_params['language']) +\ + 'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8')) + + # append result + 
results.append({'url': url, + 'title': result['title'], + 'content': ''}) + + # return results + return results diff --git a/searx/engines/microsoft_academic.py b/searx/engines/microsoft_academic.py new file mode 100644 index 000000000..9387b08d0 --- /dev/null +++ b/searx/engines/microsoft_academic.py @@ -0,0 +1,75 @@ +""" +Microsoft Academic (Science) + +@website https://academic.microsoft.com +@provide-api yes +@using-api no +@results JSON +@stable no +@parse url, title, content +""" + +from datetime import datetime +from json import loads +from uuid import uuid4 + +from searx.url_utils import urlencode +from searx.utils import html_to_text + +categories = ['images'] +paging = True +result_url = 'https://academic.microsoft.com/api/search/GetEntityResults?{query}' + + +def request(query, params): + correlation_id = uuid4() + msacademic = uuid4() + time_now = datetime.now() + + params['url'] = result_url.format(query=urlencode({'correlationId': correlation_id})) + params['cookies']['msacademic'] = str(msacademic) + params['cookies']['ai_user'] = 'vhd0H|{now}'.format(now=str(time_now)) + params['method'] = 'POST' + params['data'] = { + 'Query': '@{query}@'.format(query=query), + 'Limit': 10, + 'Offset': params['pageno'] - 1, + 'Filters': '', + 'OrderBy': '', + 'SortAscending': False, + } + + return params + + +def response(resp): + results = [] + response_data = loads(resp.text) + + for result in response_data['results']: + url = _get_url(result) + title = result['e']['dn'] + content = _get_content(result) + results.append({ + 'url': url, + 'title': html_to_text(title), + 'content': html_to_text(content), + }) + + return results + + +def _get_url(result): + if 's' in result['e']: + return result['e']['s'][0]['u'] + return 'https://academic.microsoft.com/#/detail/{pid}'.format(pid=result['id']) + + +def _get_content(result): + if 'd' in result['e']: + content = result['e']['d'] + if len(content) > 300: + return content[:300] + '...' 
+ return content + + return '' diff --git a/searx/engines/mixcloud.py b/searx/engines/mixcloud.py new file mode 100644 index 000000000..470c007ea --- /dev/null +++ b/searx/engines/mixcloud.py @@ -0,0 +1,61 @@ +""" + Mixcloud (Music) + + @website https://http://www.mixcloud.com/ + @provide-api yes (http://www.mixcloud.com/developers/ + + @using-api yes + @results JSON + @stable yes + @parse url, title, content, embedded, publishedDate +""" + +from json import loads +from dateutil import parser +from searx.url_utils import urlencode + +# engine dependent config +categories = ['music'] +paging = True + +# search-url +url = 'https://api.mixcloud.com/' +search_url = url + 'search/?{query}&type=cloudcast&limit=10&offset={offset}' + +embedded_url = '<iframe scrolling="no" frameborder="0" allowTransparency="true" ' +\ + 'data-src="https://www.mixcloud.com/widget/iframe/?feed={url}" width="300" height="300"></iframe>' + + +# do search-request +def request(query, params): + offset = (params['pageno'] - 1) * 10 + + params['url'] = search_url.format(query=urlencode({'q': query}), + offset=offset) + + return params + + +# get response from search-request +def response(resp): + results = [] + + search_res = loads(resp.text) + + # parse results + for result in search_res.get('data', []): + title = result['name'] + url = result['url'] + content = result['user']['name'] + embedded = embedded_url.format(url=url) + publishedDate = parser.parse(result['created_time']) + + # append result + results.append({'url': url, + 'title': title, + 'embedded': embedded, + 'publishedDate': publishedDate, + 'content': content}) + + # return results + return results diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py new file mode 100644 index 000000000..c57979a5f --- /dev/null +++ b/searx/engines/nyaa.py @@ -0,0 +1,108 @@ +""" + Nyaa.si (Anime Bittorrent tracker) + + @website https://nyaa.si/ + @provide-api no + @using-api no + @results HTML + @stable no (HTML can change) + @parse url, 
# get response from search-request
def response(resp):
    """Parse a nyaa.si search result page into torrent result dicts.

    Fix: ``category`` previously was only bound inside the ``try`` block,
    so a failed category lookup on the first row raised NameError when
    formatting the content string (and reused a stale value on later rows).
    It now defaults to an empty string. Bare ``except:`` clauses are
    narrowed to the exceptions the guarded statements can raise.
    """
    results = []

    dom = html.fromstring(resp.text)

    for result in dom.xpath(xpath_results):
        # defaults for fields that may be missing from a row
        filesize = 0
        magnet_link = ""
        torrent_link = ""
        category = ""

        # category in which our torrent belongs; keep the default when the
        # cell is absent so the content string below still formats
        try:
            category = result.xpath(xpath_category)[0].attrib.get('title')
        except IndexError:
            pass

        # torrent title
        page_a = result.xpath(xpath_title)[0]
        title = extract_text(page_a)

        # link to the torrent's detail page
        href = base_url + page_a.attrib.get('href')

        # each row carries a .torrent link and/or a magnet link
        for link in result.xpath(xpath_torrent_links):
            url = link.attrib.get('href')
            if 'magnet' in url:
                # link to the magnet
                magnet_link = url
            else:
                # link to the torrent file
                torrent_link = url

        # seeder / leecher / download counters
        seed = int_or_zero(result.xpath(xpath_seeds))
        leech = int_or_zero(result.xpath(xpath_leeches))
        downloads = int_or_zero(result.xpath(xpath_downloads))

        # torrent size, e.g. "1.2 GiB"; ignore rows with a malformed cell
        try:
            filesize_info = result.xpath(xpath_filesize)[0]
            filesize, filesize_multiplier = filesize_info.split()
            filesize = get_torrent_size(filesize, filesize_multiplier)
        except (IndexError, ValueError, KeyError):
            pass

        # content string contains all information not included into template
        content = 'Category: "{category}". Downloaded {downloads} times.'
        content = content.format(category=category, downloads=downloads)

        results.append({'url': href,
                        'title': title,
                        'content': content,
                        'seed': seed,
                        'leech': leech,
                        'filesize': filesize,
                        'torrentfile': torrent_link,
                        'magnetlink': magnet_link,
                        'template': 'torrent.html'})

    return results
def construct_body(result):
    """Build ``[title, content, img_src]`` for a PDBe search hit.

    ``content`` is a citation snippet (``None`` when citation fields are
    missing); ``img_src`` is the deposited-structure preview image URL
    (``None`` when ``pdb_id`` is missing).

    Fix: the original computed ``img_src`` twice — once inside the
    citation ``try`` and once afterwards. The copy inside the first block
    meant a missing ``pdb_id`` also discarded a successfully formatted
    citation. The two concerns are now independent.
    """
    # set title
    title = result['title']

    # citation snippet template
    content = """{title} - {authors} {journal} ({volume}) {page} ({year})"""

    # fill the template; any missing citation field yields content = None
    try:
        if result['journal']:
            content = content.format(
                title=result['citation_title'],
                authors=result['entry_author_list'][0],
                journal=result['journal'],
                volume=result['journal_volume'],
                page=result['journal_page'],
                year=result['citation_year'])
        else:
            # unpublished entry: no journal metadata available
            content = content.format(
                title=result['citation_title'],
                authors=result['entry_author_list'][0],
                journal='', volume='', page='',
                year=result['release_year'])
    except KeyError:
        content = None

    # construct url for preview image, independently of the citation
    try:
        img_src = pdbe_preview_url.format(pdb_id=result['pdb_id'])
    except KeyError:
        img_src = None

    return [title, content, img_src]
# get response from search-request
def response(resp):
    """Parse a photon/Komoot geocoding reply into searx map results.

    Builds one 'map.html' result per GeoJSON feature, deriving the
    OpenStreetMap URL, a bounding box and (when available) a structured
    address from the feature's properties.
    """
    results = []
    # NOTE(review): local name shadows the stdlib `json` module
    json = loads(resp.text)

    # parse results
    for r in json.get('features', {}):

        properties = r.get('properties')

        if not properties:
            continue

        # get title
        title = properties.get('name')

        # map photon's single-letter osm_type to the full OSM object kind
        if properties.get('osm_type') == 'N':
            osm_type = 'node'
        elif properties.get('osm_type') == 'W':
            osm_type = 'way'
        elif properties.get('osm_type') == 'R':
            osm_type = 'relation'
        else:
            # skip entries with an unknown/invalid osm-type
            continue

        url = result_base_url.format(osm_type=osm_type,
                                     osm_id=properties.get('osm_id'))

        osm = {'type': osm_type,
               'id': properties.get('osm_id')}

        geojson = r.get('geometry')

        if properties.get('extent'):
            # reorder the extent indices into the boundingbox layout used
            # below -- assumed [west, south, east, north] input, TODO confirm
            boundingbox = [properties.get('extent')[3],
                           properties.get('extent')[1],
                           properties.get('extent')[0],
                           properties.get('extent')[2]]
        else:
            # TODO: better boundingbox calculation
            # fall back to a degenerate box on the point itself
            boundingbox = [geojson['coordinates'][1],
                           geojson['coordinates'][1],
                           geojson['coordinates'][0],
                           geojson['coordinates'][0]]

        # address calculation
        address = {}

        # only POI-like objects get a named address entry
        if properties.get('osm_key') == 'amenity' or\
           properties.get('osm_key') == 'shop' or\
           properties.get('osm_key') == 'tourism' or\
           properties.get('osm_key') == 'leisure':
            address = {'name': properties.get('name')}

        # add the rest of the address data only when a name was found
        if address.get('name'):
            address.update({'house_number': properties.get('housenumber'),
                            'road': properties.get('street'),
                            'locality': properties.get('city',
                                        properties.get('town',          # noqa
                                        properties.get('village'))),    # noqa
                            'postcode': properties.get('postcode'),
                            'country': properties.get('country')})
        else:
            address = None

        # append result
        results.append({'template': 'map.html',
                        'title': title,
                        'content': '',
                        'longitude': geojson['coordinates'][0],
                        'latitude': geojson['coordinates'][1],
                        'boundingbox': boundingbox,
                        'geojson': geojson,
                        'address': address,
                        'osm': osm,
                        'url': url})

    # return results
    return results
# do search-request
def request(query, params):
    """Build the Piratebay search URL; pages are zero-indexed upstream."""
    category_code = search_types.get(params['category'], '0')
    params['url'] = search_url.format(
        search_term=quote(query),
        search_type=category_code,
        pageno=params['pageno'] - 1,
    )
    return params
def response(resp):
    """Parse the PubMed esearch reply, fetch the matching article notices
    via efetch, and build title/content/date result dicts.

    NOTE(review): performs a second, blocking HTTP request (efetch) from
    inside response parsing.
    """
    results = []

    # First retrieve notice of each result
    pubmed_retrieve_api_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'\
        + 'db=pubmed&retmode=xml&id={pmids_string}'

    pmids_results = etree.XML(resp.content)
    pmids = pmids_results.xpath('//eSearchResult/IdList/Id')
    pmids_string = ''

    # build the comma-separated id list (trailing comma is tolerated by
    # the efetch endpoint)
    for item in pmids:
        pmids_string += item.text + ','

    retrieve_notice_args = dict(pmids_string=pmids_string)

    retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)

    # second request: full XML notices for all matched ids at once
    search_results_xml = get(retrieve_url_encoded).content
    search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation')

    for entry in search_results:
        title = entry.xpath('.//Article/ArticleTitle')[0].text

        pmid = entry.xpath('.//PMID')[0].text
        url = pubmed_url + pmid

        # the abstract is optional; fall back to a translated placeholder
        try:
            content = entry.xpath('.//Abstract/AbstractText')[0].text
        except:
            content = gettext('No abstract is available for this publication.')

        # If a doi is available, add it to the snippet
        try:
            doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text
            content = 'DOI: {doi} Abstract: {content}'.format(doi=doi, content=content)
        except:
            pass

        # truncate overly long snippets
        if len(content) > 300:
            content = content[0:300] + "..."
        # TODO: center snippet on query term

        res_dict = {'url': url,
                    'title': title,
                    'content': content}

        # publication date is optional; ignore malformed or missing dates
        try:
            publishedDate = datetime.strptime(entry.xpath('.//DateCreated/Year')[0].text
                                              + '-' + entry.xpath('.//DateCreated/Month')[0].text
                                              + '-' + entry.xpath('.//DateCreated/Day')[0].text, '%Y-%m-%d')
            res_dict['publishedDate'] = publishedDate
        except:
            pass

        results.append(res_dict)

    return results
# get response from search-request
def response(resp):
    """Parse a Qwant API reply.

    The result template depends on which category keyword this engine
    instance was configured with (web / images / social / news).
    """
    results = []

    search_results = loads(resp.text)

    # return empty array if there are no results
    if 'data' not in search_results:
        return []

    data = search_results.get('data', {})

    res = data.get('result', {})

    # parse results
    for result in res.get('items', {}):

        title = html_to_text(result['title'])
        res_url = result['url']
        content = html_to_text(result['desc'])

        # plain web results
        if category_to_keyword.get(categories[0], '') == 'web':
            results.append({'title': title,
                            'content': content,
                            'url': res_url})

        # image results carry thumbnail + full-size media URLs
        elif category_to_keyword.get(categories[0], '') == 'images':
            thumbnail_src = result['thumbnail']
            img_src = result['media']
            results.append({'template': 'images.html',
                            'url': res_url,
                            'title': title,
                            'content': '',
                            'thumbnail_src': thumbnail_src,
                            'img_src': img_src})

        # social results: timestamped, optional image
        elif category_to_keyword.get(categories[0], '') == 'social':
            published_date = datetime.fromtimestamp(result['date'], None)
            img_src = result.get('img', None)
            results.append({'url': res_url,
                            'title': title,
                            'publishedDate': published_date,
                            'content': content,
                            'img_src': img_src})

        # news results: timestamped, image taken from the first media entry
        elif category_to_keyword.get(categories[0], '') == 'news':
            published_date = datetime.fromtimestamp(result['date'], None)
            media = result.get('media', [])
            if len(media) > 0:
                img_src = media[0].get('pict', {}).get('url', None)
            else:
                img_src = None
            results.append({'url': res_url,
                            'title': title,
                            'publishedDate': published_date,
                            'content': content,
                            'img_src': img_src})

    return results
# do search-request
def request(query, params):
    """Build the Reddit JSON search URL for *query*."""
    encoded = urlencode({'q': query, 'limit': page_size})
    params['url'] = search_url.format(query=encoded)
    return params
# do search-request
def request(query, params):
    """Prepare a JSON POST request against the ScanR structures API."""
    payload = {"query": query,
               "searchField": "ALL",
               "sortDirection": "ASC",
               "sortOrder": "RELEVANCY",
               "page": params['pageno'],
               "pageSize": page_size}

    params['url'] = search_url
    params['method'] = 'POST'
    params['headers']['Content-type'] = "application/json"
    params['data'] = dumps(payload)
    return params
# get response from search-request
def response(resp):
    """Parse the searchcode code-search JSON reply into 'code.html' results.

    Cleanups: drop the pointless ``"" +`` string prefix, build the line
    mapping with a dict comprehension, and compute the file extension once
    instead of twice.
    """
    results = []

    search_results = loads(resp.text)

    # parse results
    for result in search_results.get('results', []):
        href = result['url']
        title = result['name'] + " - " + result['filename']
        repo = result['repo']

        # line numbers arrive as strings; key on ints so sorting is numeric
        lines = {int(line): code for line, code in result['lines'].items()}

        # map the file extension to a highlighter language name, with
        # special cases for endings that do not match the language
        extension = result['filename'].split('.')[-1].lower()
        code_language = code_endings.get(extension, extension)

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': '',
                        'repository': repo,
                        'codelines': sorted(lines.items()),
                        'code_language': code_language,
                        'template': 'code.html'})

    # return results
    return results
# get response from search-request
def response(resp):
    """Merge results, answers, infoboxes, suggestions and the result count
    from a remote searx JSON reply into a single flat result list."""
    data = loads(resp.text)
    merged = data['results']

    merged.extend(data['answers'])
    merged.extend(data['infoboxes'])

    for suggestion in data['suggestions']:
        merged.append({'suggestion': suggestion})

    merged.append({'number_of_results': data['number_of_results']})

    return merged
html.fromstring(resp.text) + result_rows = dom.xpath(torrent_xpath) + + try: + script_element = dom.xpath(script_xpath)[0] + json_string = script_element.text[script_element.text.find('{'):] + torrents_json = loads(json_string) + except: + return [] + + # parse results + for torrent_row, torrent_json in zip(result_rows, torrents_json['data']['list']): + title = torrent_json['name'] + seed = int(torrent_json['seeds']) + leech = int(torrent_json['peers']) + size = int(torrent_json['size']) + torrent_hash = torrent_json['hash'] + + torrentfile = torrent_file_url.format(torrent_hash=torrent_hash) + magnetlink = 'magnet:?xt=urn:btih:{}'.format(torrent_hash) + + age = extract_text(torrent_row.xpath(age_xpath)) + link = torrent_row.xpath(link_xpath)[0] + + href = urljoin(url, link) + + # append result + results.append({'url': href, + 'title': title, + 'content': age, + 'seed': seed, + 'leech': leech, + 'filesize': size, + 'torrentfile': torrentfile, + 'magnetlink': magnetlink, + 'template': 'torrent.html'}) + + # return results sorted by seeder + return sorted(results, key=itemgetter('seed'), reverse=True) diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py new file mode 100644 index 000000000..284689bf6 --- /dev/null +++ b/searx/engines/soundcloud.py @@ -0,0 +1,111 @@ +""" + Soundcloud (Music) + + @website https://soundcloud.com + @provide-api yes (https://developers.soundcloud.com/) + + @using-api yes + @results JSON + @stable yes + @parse url, title, content, publishedDate, embedded +""" + +import re +from json import loads +from lxml import html +from dateutil import parser +from searx import logger +from searx.poolrequests import get as http_get +from searx.url_utils import quote_plus, urlencode + +try: + from cStringIO import StringIO +except: + from io import StringIO + +# engine dependent config +categories = ['music'] +paging = True + +# search-url +# missing attribute: user_id, app_version, app_locale +url = 'https://api-v2.soundcloud.com/' 
def get_client_id():
    """Scrape a guest ``client_id`` from the SoundCloud homepage.

    Fetches soundcloud.com, collects the ``src`` URLs of its asset
    ``<script>`` tags and downloads each script, searching the body for a
    ``client_id:"..."`` literal (``cid_re``).  Returns the first id found;
    logs a warning and returns an empty string when none of the scripts
    match or the page cannot be fetched.
    """
    response = http_get("https://soundcloud.com")

    if response.ok:
        tree = html.fromstring(response.content)
        # script_tags has been moved from /assets/app/ to /assets/ path.  I
        # found client_id in https://a-v2.sndcdn.com/assets/49-a0c01933-3.js
        script_tags = tree.xpath("//script[contains(@src, '/assets/')]")
        app_js_urls = [script_tag.get('src') for script_tag in script_tags if script_tag is not None]

        # extracts valid app_js urls from soundcloud.com content
        for app_js_url in app_js_urls:
            # gets app_js and searches for the clientid
            response = http_get(app_js_url)
            if response.ok:
                cids = cid_re.search(response.content.decode("utf-8"))
                if cids is not None and len(cids.groups()):
                    return cids.groups()[0]
    logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!")
    return ""
# get response from search-request
def response(resp):
    """Build searx results from Spotify's track-search JSON.

    Only items of type ``track`` are kept; each result carries the track
    URL, a "artist - album - track" content line and an embeddable player
    iframe.  A reply without ``tracks``/``items`` yields an empty list.
    """
    results = []

    search_res = loads(resp.text)

    # default to a *list* (the original used {}), matching the actual type
    # of the 'items' section
    for result in search_res.get('tracks', {}).get('items', []):
        if result['type'] == 'track':
            title = result['name']
            # local name chosen so the module-level `url` constant is not shadowed
            track_url = result['external_urls']['spotify']
            content = u'{} - {} - {}'.format(
                result['artists'][0]['name'],
                result['album']['name'],
                result['name'])

            embedded = embedded_url.format(audioid=result['id'])

            # append result
            results.append({'url': track_url,
                            'title': title,
                            'embedded': embedded,
                            'content': content})

    # return results
    return results
# get response from search-request
def response(resp):
    """Extract question summaries from a Stack Overflow search page.

    Each summary contributes a result with the absolute question URL,
    the link text as title and the excerpt as content.
    """
    dom = html.fromstring(resp.text)

    items = []
    for summary in dom.xpath(results_xpath):
        anchor = summary.xpath(link_xpath)[0]
        items.append({
            'url': urljoin(url, anchor.attrib.get('href')),
            'title': extract_text(anchor),
            'content': extract_text(summary.xpath(content_xpath)),
        })

    return items
# get response from search-request
def response(resp):
    """Parse Startpage's HTML result page.

    Besides url/title/content, tries to recover a publication date from
    content strings that begin with either an absolute date
    ("2 Sep 2014 ... ") or a relative one ("5 days ago ... "); the date
    prefix is then stripped from the content.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath(dom, results_xpath):
        links = eval_xpath(result, link_xpath)
        if not links:
            continue
        link = links[0]
        url = link.attrib.get('href')

        # block google-ad url's
        if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
            continue

        # block startpage search url's
        if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
            continue

        title = extract_text(link)

        if eval_xpath(result, content_xpath):
            content = extract_text(eval_xpath(result, content_xpath))
        else:
            content = ''

        published_date = None

        # check if search result starts with something like: "2 Sep 2014 ... "
        if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
            # date_pos points just past "... "; the date text ends 5 chars
            # earlier (before " ... ")
            date_pos = content.find('...') + 4
            date_string = content[0:date_pos - 5]
            published_date = parser.parse(date_string, dayfirst=True)

            # fix content string
            content = content[date_pos:]

        # check if search result starts with something like: "5 days ago ... "
        elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
            date_pos = content.find('...') + 4
            date_string = content[0:date_pos - 5]

            # calculate datetime
            published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))

            # fix content string
            content = content[date_pos:]

        if published_date:
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content,
                            'publishedDate': published_date})
        else:
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})

    # return results
    return results
# get response from search-request
def response(resp):
    """Parse Tokyo Toshokan's listing table.

    Results come in row pairs: the first row holds the torrent/magnet
    links and title, the second holds a description cell ("Size: ... |
    Date: ... | Comment: ...") and the seed/leech stats.
    """
    results = []

    dom = html.fromstring(resp.text)
    rows = dom.xpath('//table[@class="listing"]//tr[contains(@class, "category_0")]')

    # check if there are no results or page layout was changed so we cannot parse it
    # currently there are two rows for each result, so total count must be even
    if len(rows) == 0 or len(rows) % 2 != 0:
        return []

    # regular expression for parsing torrent size strings
    size_re = re.compile(r'Size:\s*([\d.]+)(TB|GB|MB|B)', re.IGNORECASE)

    # processing the results, two rows at a time
    for i in range(0, len(rows), 2):
        # parse the first row
        name_row = rows[i]

        links = name_row.xpath('./td[@class="desc-top"]/a')
        torrent = {
            'template': 'torrent.html',
            'url': links[-1].attrib.get('href'),
            'title': extract_text(links[-1])
        }
        # I have not yet seen any torrents without magnet links, but
        # it's better to be prepared to stumble upon one some day
        if len(links) == 2:
            magnet = links[0].attrib.get('href')
            if magnet.startswith('magnet'):
                # okay, we have a valid magnet link, let's add it to the result
                torrent['magnetlink'] = magnet

        # no more info in the first row, start parsing the second one
        info_row = rows[i + 1]
        desc = extract_text(info_row.xpath('./td[@class="desc-bot"]')[0])
        for item in desc.split('|'):
            item = item.strip()
            if item.startswith('Size:'):
                try:
                    # ('1.228', 'GB'); size_re.match may return None and
                    # get_torrent_size may reject the number, so catch
                    # exactly those failures instead of a bare except
                    groups = size_re.match(item).groups()
                    torrent['filesize'] = get_torrent_size(groups[0], groups[1])
                except (AttributeError, ValueError):
                    pass
            elif item.startswith('Date:'):
                try:
                    # Date: 2016-02-21 21:44 UTC
                    date = datetime.strptime(item, 'Date: %Y-%m-%d %H:%M UTC')
                    torrent['publishedDate'] = date
                except ValueError:
                    # malformed date string: skip the attribute, keep the result
                    pass
            elif item.startswith('Comment:'):
                torrent['content'] = item
        stats = info_row.xpath('./td[@class="stats"]/span')
        # has the layout not changed yet?
        if len(stats) == 3:
            torrent['seed'] = int_or_zero(extract_text(stats[0]))
            torrent['leech'] = int_or_zero(extract_text(stats[1]))

        results.append(torrent)

    return results
'seed': seed, + 'leech': leech, + 'template': 'torrent.html' + } + + # let's try to calculate the torrent size + try: + filesize_info = result.xpath('./dd/span[3]/text()')[0] + filesize, filesize_multiplier = filesize_info.split() + filesize = get_torrent_size(filesize, filesize_multiplier) + + params['filesize'] = filesize + except: + pass + + # does our link contain a valid SHA1 sum? + if re.compile('[0-9a-fA-F]{40}').match(link): + # add a magnet link to the result + params['magnetlink'] = 'magnet:?xt=urn:btih:' + link + + # extract and convert creation date + try: + date_ts = result.xpath('./dd/span[2]')[0].attrib.get('title') + date = datetime.fromtimestamp(float(date_ts)) + params['publishedDate'] = date + except: + pass + + results.append(params) + + return results diff --git a/searx/engines/translated.py b/searx/engines/translated.py new file mode 100644 index 000000000..5c7b17033 --- /dev/null +++ b/searx/engines/translated.py @@ -0,0 +1,68 @@ +""" + MyMemory Translated + + @website https://mymemory.translated.net/ + @provide-api yes (https://mymemory.translated.net/doc/spec.php) + @using-api yes + @results JSON + @stable yes + @parse url, title, content +""" +import re +from sys import version_info +from searx.utils import is_valid_lang + +if version_info[0] == 3: + unicode = str + +categories = ['general'] +url = u'http://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}{key}' +web_url = u'http://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}' +weight = 100 + +parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) (.{2,})$', re.I) +api_key = '' + + +def request(query, params): + m = parser_re.match(unicode(query, 'utf8')) + if not m: + return params + + from_lang, to_lang, query = m.groups() + + from_lang = is_valid_lang(from_lang) + to_lang = is_valid_lang(to_lang) + + if not from_lang or not to_lang: + return params + + if api_key: + key_form = '&key=' + api_key + else: + key_form = '' + params['url'] = 
def response(resp):
    """Wrap MyMemory's translated text in a single searx result.

    Reuses the language tuples and query stored on the request via
    ``resp.search_params`` to build the web URL and the title.
    """
    search_params = resp.search_params
    from_lang = search_params['from_lang']
    to_lang = search_params['to_lang']
    query = search_params['query']

    result = {
        'url': web_url.format(from_lang=from_lang[2],
                              to_lang=to_lang[2],
                              query=query),
        'title': '[{0}-{1}] {2}'.format(from_lang[1], to_lang[1], query),
        'content': resp.json()['responseData']['translatedText'],
    }
    return [result]
# get response from search-request
def response(resp):
    """Scrape tweets from Twitter's HTML search page.

    Tweets whose markup is missing any mandatory part (link, text or
    avatar) are skipped; the publication date is attached only when a
    timestamp element is present.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for tweet in dom.xpath(results_xpath):
        try:
            anchor = tweet.xpath(link_xpath)[0]
            text = extract_text(tweet.xpath(content_xpath)[0])
            avatar = tweet.xpath(avatar_xpath)[0].replace('_bigger', '_normal')
        except Exception:
            # incomplete tweet markup: skip this entry
            continue

        entry = {'url': urljoin(base_url, anchor.attrib.get('href')),
                 'title': extract_text(tweet.xpath(title_xpath)),
                 'content': text,
                 'img_src': avatar}

        pubdate = tweet.xpath(timestamp_xpath)
        if len(pubdate) > 0:
            timestamp = float(pubdate[0].attrib.get('data-time'))
            entry['publishedDate'] = datetime.fromtimestamp(timestamp, None)

        # append result
        results.append(entry)

    # return results
    return results
def clean_url(url):
    """Return *url* with Unsplash's tracking parameters stripped.

    Drops the ``ixid`` and ``s`` query parameters and rebuilds the URL,
    leaving every other component untouched.
    """
    parts = urlparse(url)

    kept = []
    for key, value in parse_qsl(parts.query):
        if key not in ('ixid', 's'):
            kept.append((key, value))

    return urlunparse((parts.scheme,
                       parts.netloc,
                       parts.path,
                       parts.params,
                       urlencode(kept),
                       parts.fragment))
# get response from search-request
def response(resp):
    """Parse Vimeo's search page.

    Vimeo embeds its result data as a JSON object inside the HTML; the
    object is located by searching for the literal '{"filtered"' and cut
    at the following ';\\n'.  NOTE(review): this substring slicing is
    brittle — if either marker is absent, ``find`` returns -1 and
    ``loads`` will raise on the bad slice; confirm against live pages.
    """
    results = []
    data_start_pos = resp.text.find('{"filtered"')
    data_end_pos = resp.text.find(';\n', data_start_pos + 1)
    data = loads(resp.text[data_start_pos:data_end_pos])

    # parse results
    for result in data['filtered']['data']:
        # each entry is wrapped under a key named after its type
        result = result[result['type']]
        videoid = result['uri'].split('/')[-1]
        url = base_url + videoid
        title = result['name']
        # last entry in 'sizes' is the largest available thumbnail
        thumbnail = result['pictures']['sizes'][-1]['link']
        publishedDate = parser.parse(result['created_time'])
        embedded = embedded_url.format(videoid=videoid)

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': '',
                        'template': 'videos.html',
                        'publishedDate': publishedDate,
                        'embedded': embedded,
                        'thumbnail': thumbnail})

    # return results
    return results
def get_id_cache(result):
    """Map Wikidata property ids (e.g. ``'P856'``) to their DOM elements.

    Walks every element carrying an ``id`` attribute and keeps those whose
    id starts with ``'P'`` so later attribute/url lookups don't re-walk
    the tree.
    """
    id_cache = {}
    for element in eval_xpath(result, div_ids_xpath):
        # renamed from `id` to avoid shadowing the builtin
        element_id = element.get('id')
        if element_id.startswith('P'):
            id_cache[element_id] = element
    return id_cache
def response(resp):
    """Parse the Wikidata search page and fetch entity details.

    Extracts up to ``result_count`` entity links from the HTML search
    results, then issues one synchronous detail request per entity and
    delegates parsing to ``getDetail``.
    """
    results = []
    htmlparser = etree.HTMLParser()
    html = fromstring(resp.content.decode("utf-8"), parser=htmlparser)
    search_results = eval_xpath(html, wikidata_ids_xpath)

    # map the UI language to a supported Wikidata language; 'all' falls
    # back to English
    if resp.search_params['language'].split('-')[0] == 'all':
        language = 'en'
    else:
        language = match_language(resp.search_params['language'], supported_languages, language_aliases).split('-')[0]

    # TODO: make requests asynchronous to avoid timeout when result_count > 1
    for search_result in search_results[:result_count]:
        # the entity id is the last path component of the result link
        wikidata_id = search_result.split('/')[-1]
        url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language}))
        htmlresponse = get(url)
        jsonresponse = loads(htmlresponse.content.decode("utf-8"))
        results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'], htmlparser)

    return results
language != 'en': + wikipedia_en_link = get_wikilink(result, 'enwiki') + if wikipedia_en_link: + wikipedia_link_count += 1 + urls.append({'title': 'Wikipedia (en)', + 'url': wikipedia_en_link}) + + # TODO: get_wiki_firstlanguage + # if wikipedia_link_count == 0: + + # more wikis + add_url(urls, result, id_cache, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage') + add_url(urls, result, id_cache, default_label='Wikiquote (' + language + ')', link_type=language + 'wikiquote') + add_url(urls, result, id_cache, default_label='Wikimedia Commons', link_type='commonswiki') + + add_url(urls, result, id_cache, 'P625', 'OpenStreetMap', link_type='geo') + + # musicbrainz + add_url(urls, result, id_cache, 'P434', 'MusicBrainz', 'http://musicbrainz.org/artist/') + add_url(urls, result, id_cache, 'P435', 'MusicBrainz', 'http://musicbrainz.org/work/') + add_url(urls, result, id_cache, 'P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/') + add_url(urls, result, id_cache, 'P966', 'MusicBrainz', 'http://musicbrainz.org/label/') + + # IMDb + add_url(urls, result, id_cache, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb') + # source code repository + add_url(urls, result, id_cache, 'P1324') + # blog + add_url(urls, result, id_cache, 'P1581') + # social media links + add_url(urls, result, id_cache, 'P2397', 'YouTube', 'https://www.youtube.com/channel/') + add_url(urls, result, id_cache, 'P1651', 'YouTube', 'https://www.youtube.com/watch?v=') + add_url(urls, result, id_cache, 'P2002', 'Twitter', 'https://twitter.com/') + add_url(urls, result, id_cache, 'P2013', 'Facebook', 'https://facebook.com/') + add_url(urls, result, id_cache, 'P2003', 'Instagram', 'https://instagram.com/') + + urls.append({'title': 'Wikidata', + 'url': 'https://www.wikidata.org/wiki/' + + wikidata_id + '?uselang=' + language}) + + # INFOBOX ATTRIBUTES (ROWS) + + # DATES + # inception date + add_attribute(attributes, id_cache, 'P571', date=True) + # dissolution 
date + add_attribute(attributes, id_cache, 'P576', date=True) + # start date + add_attribute(attributes, id_cache, 'P580', date=True) + # end date + add_attribute(attributes, id_cache, 'P582', date=True) + # date of birth + add_attribute(attributes, id_cache, 'P569', date=True) + # date of death + add_attribute(attributes, id_cache, 'P570', date=True) + # date of spacecraft launch + add_attribute(attributes, id_cache, 'P619', date=True) + # date of spacecraft landing + add_attribute(attributes, id_cache, 'P620', date=True) + + # nationality + add_attribute(attributes, id_cache, 'P27') + # country of origin + add_attribute(attributes, id_cache, 'P495') + # country + add_attribute(attributes, id_cache, 'P17') + # headquarters + add_attribute(attributes, id_cache, 'Q180') + + # PLACES + # capital + add_attribute(attributes, id_cache, 'P36', trim=True) + # head of state + add_attribute(attributes, id_cache, 'P35', trim=True) + # head of government + add_attribute(attributes, id_cache, 'P6', trim=True) + # type of government + add_attribute(attributes, id_cache, 'P122') + # official language + add_attribute(attributes, id_cache, 'P37') + # population + add_attribute(attributes, id_cache, 'P1082', trim=True) + # area + add_attribute(attributes, id_cache, 'P2046') + # currency + add_attribute(attributes, id_cache, 'P38', trim=True) + # heigth (building) + add_attribute(attributes, id_cache, 'P2048') + + # MEDIA + # platform (videogames) + add_attribute(attributes, id_cache, 'P400') + # author + add_attribute(attributes, id_cache, 'P50') + # creator + add_attribute(attributes, id_cache, 'P170') + # director + add_attribute(attributes, id_cache, 'P57') + # performer + add_attribute(attributes, id_cache, 'P175') + # developer + add_attribute(attributes, id_cache, 'P178') + # producer + add_attribute(attributes, id_cache, 'P162') + # manufacturer + add_attribute(attributes, id_cache, 'P176') + # screenwriter + add_attribute(attributes, id_cache, 'P58') + # production company 
+ add_attribute(attributes, id_cache, 'P272') + # record label + add_attribute(attributes, id_cache, 'P264') + # publisher + add_attribute(attributes, id_cache, 'P123') + # original network + add_attribute(attributes, id_cache, 'P449') + # distributor + add_attribute(attributes, id_cache, 'P750') + # composer + add_attribute(attributes, id_cache, 'P86') + # publication date + add_attribute(attributes, id_cache, 'P577', date=True) + # genre + add_attribute(attributes, id_cache, 'P136') + # original language + add_attribute(attributes, id_cache, 'P364') + # isbn + add_attribute(attributes, id_cache, 'Q33057') + # software license + add_attribute(attributes, id_cache, 'P275') + # programming language + add_attribute(attributes, id_cache, 'P277') + # version + add_attribute(attributes, id_cache, 'P348', trim=True) + # narrative location + add_attribute(attributes, id_cache, 'P840') + + # LANGUAGES + # number of speakers + add_attribute(attributes, id_cache, 'P1098') + # writing system + add_attribute(attributes, id_cache, 'P282') + # regulatory body + add_attribute(attributes, id_cache, 'P1018') + # language code + add_attribute(attributes, id_cache, 'P218') + + # OTHER + # ceo + add_attribute(attributes, id_cache, 'P169', trim=True) + # founder + add_attribute(attributes, id_cache, 'P112') + # legal form (company/organization) + add_attribute(attributes, id_cache, 'P1454') + # operator + add_attribute(attributes, id_cache, 'P137') + # crew members (tripulation) + add_attribute(attributes, id_cache, 'P1029') + # taxon + add_attribute(attributes, id_cache, 'P225') + # chemical formula + add_attribute(attributes, id_cache, 'P274') + # winner (sports/contests) + add_attribute(attributes, id_cache, 'P1346') + # number of deaths + add_attribute(attributes, id_cache, 'P1120') + # currency code + add_attribute(attributes, id_cache, 'P498') + + image = add_image(id_cache) + + if len(attributes) == 0 and len(urls) == 2 and len(description) == 0: + results.append({ + 'url': 
# only returns first match
def add_image(id_cache):
    """Return the image url for the first image-like property found, or None."""
    # P15: route map, P242: locator map, P154: logo, P18: image, P41: flag,
    # P2716: collage, P2910: icon
    property_ids = ['P15', 'P242', 'P154', 'P18', 'P242', 'P41', 'P2716', 'P2910']

    for property_id in property_ids:
        image = id_cache.get(property_id, None)
        if image is not None:
            image_name = eval_xpath(image, media_xpath)
            # NOTE(review): url_image is expected to contain a '{filename}'
            # placeholder (Wikidata Special:FilePath url template) -- confirm
            # against the module-top definition
            image_src = url_image.replace('{filename}', extract_text(image_name[0]))
            return image_src


# setting trim will only return high ranked rows OR the first row
def add_attribute(attributes, id_cache, property_id, default_label=None, date=False, trim=False):
    """Append one {'label': ..., 'value': ...} entry for *property_id* to *attributes*.

    When *date* is set the calendar-name suffix is stripped and trimming is
    forced; when *trim* is set only preferred-rank rows (or, failing that,
    the very first row) are kept.
    """
    attribute = id_cache.get(property_id, None)
    if attribute is not None:

        if default_label:
            label = default_label
        else:
            label = extract_text(eval_xpath(attribute, label_xpath))
            label = label[0].upper() + label[1:]

        if date:
            trim = True
            # remove calendar name
            calendar_name = eval_xpath(attribute, calendar_name_xpath)
            for calendar in calendar_name:
                calendar.getparent().remove(calendar)

        concat_values = ""
        values = []
        first_value = None
        for row in eval_xpath(attribute, property_row_xpath):
            if not first_value or not trim or eval_xpath(row, preferred_rank_xpath):
                value = eval_xpath(row, value_xpath)
                if not value:
                    continue
                value = extract_text(value)

                # save first value in case no ranked row is found
                if trim and not first_value:
                    first_value = value
                else:
                    # to avoid duplicate values
                    if value not in values:
                        concat_values += value + ", "
                        values.append(value)

        if trim and not values:
            attributes.append({'label': label,
                               'value': first_value})
        else:
            attributes.append({'label': label,
                               'value': concat_values[:-2]})


# requires property_id unless it's a wiki link (defined in link_type)
def add_url(urls, result, id_cache, property_id=None, default_label=None, url_prefix=None, results=None,
            link_type=None):
    """Resolve the link(s) of one property and append them to *urls*
    (and, when given, to *results*) as {'title': ..., 'url': ...} dicts."""
    links = []

    # wiki links don't have property in wikidata page
    if link_type and 'wiki' in link_type:
        links.append(get_wikilink(result, link_type))
    else:
        dom_element = id_cache.get(property_id, None)
        if dom_element is not None:
            if not default_label:
                label = extract_text(eval_xpath(dom_element, label_xpath))
                label = label[0].upper() + label[1:]

            if link_type == 'geo':
                links.append(get_geolink(dom_element))

            elif link_type == 'imdb':
                links.append(get_imdblink(dom_element, url_prefix))

            else:
                url_results = eval_xpath(dom_element, url_xpath)
                for link in url_results:
                    if link is not None:
                        if url_prefix:
                            link = url_prefix + extract_text(link)
                        else:
                            link = extract_text(link)
                        links.append(link)

    # append urls, skipping entries that could not be resolved
    for url in links:
        if url is not None:
            urls.append({'title': default_label or label,
                         'url': url})
            if results is not None:
                results.append({'title': default_label or label,
                                'url': url})


def get_imdblink(result, url_prefix):
    """Build a full IMDB url from the stored id (e.g. 'tt0111161'); the
    two-letter prefix encodes the IMDB object type."""
    imdb_id = eval_xpath(result, value_xpath)
    if imdb_id:
        imdb_id = extract_text(imdb_id)
        sections = {'tt': 'title/',
                    'nm': 'name/',
                    'ch': 'character/',
                    'co': 'company/',
                    'ev': 'event/'}
        section = sections.get(imdb_id[:2])
        url = url_prefix + section + imdb_id if section else None
        return url


def dms_to_decimal(coordinate, negative_hemisphere):
    """Convert one degrees/minutes'/seconds" string (e.g. u"48°51'24\"N") to
    decimal degrees; negated when *negative_hemisphere* ('S' or 'W') occurs."""
    degree_mark = coordinate.find(u'°')
    value = int(coordinate[:degree_mark])
    minute_mark = coordinate.find('\'')
    if minute_mark >= 0:
        value += int(coordinate[degree_mark + 1:minute_mark] or 0) / 60.0
    second_mark = coordinate.find('"')
    if second_mark >= 0:
        value += float(coordinate[minute_mark + 1:second_mark] or 0) / 3600.0
    if coordinate.find(negative_hemisphere) >= 0:
        value *= -1
    return value


def get_geolink(result):
    """Build a map url from a wikidata coordinate value like
    u"48°51'24\"N, 2°21'03\"E"; returns None when no coordinate is stored."""
    coordinates = eval_xpath(result, value_xpath)
    if not coordinates:
        return None
    coordinates = extract_text(coordinates[0])
    latitude, longitude = coordinates.split(',')

    lat = dms_to_decimal(latitude, 'S')
    lon = dms_to_decimal(longitude, 'W')

    # TODO: get precision
    precision = 0.0002
    # there is no zoom information, deduce from precision (error prone)
    # samples :
    # 13 --> 5
    # 1 --> 6
    # 0.016666666666667 --> 9
    # 0.00027777777777778 --> 19
    # wolframalpha :
    # quadratic fit { {13, 5}, {1, 6}, {0.0166666, 9}, {0.0002777777,19}}
    # 14.1186-8.8322 x+0.625447 x^2
    if precision < 0.0003:
        zoom = 19
    else:
        zoom = int(15 - precision * 8.8322 + precision * precision * 0.625447)

    url = url_map\
        .replace('{latitude}', str(lat))\
        .replace('{longitude}', str(lon))\
        .replace('{zoom}', str(zoom))

    return url


def get_wikilink(result, wikiid):
    """Return the https url of the interwiki link for *wikiid*, or None."""
    url = eval_xpath(result, wikilink_xpath.replace('{wikiid}', wikiid))
    if not url:
        return None
    url = url[0]
    # normalize protocol-relative / plain-http links to https
    if url.startswith('http://'):
        url = url.replace('http://', 'https://')
    elif url.startswith('//'):
        url = 'https:' + url
    return url
# search-url
base_url = u'https://{language}.wikipedia.org/'
search_url = base_url + u'w/api.php?'\
    'action=query'\
    '&format=json'\
    '&{query}'\
    '&prop=extracts|pageimages'\
    '&exintro'\
    '&explaintext'\
    '&pithumbsize=300'\
    '&redirects'
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'


# set language in base_url
def url_lang(lang):
    """Map a searx language code to the wikipedia subdomain language."""
    lang_pre = lang.split('-')[0]
    if lang_pre == 'all' or lang_pre not in supported_languages and lang_pre not in language_aliases:
        return 'en'
    return match_language(lang, supported_languages, language_aliases).split('-')[0]


# do search-request
def request(query, params):
    """Build the MediaWiki API request url; requests both the lowercase and
    Title-cased spelling because the API title match is case sensitive."""
    if query.islower():
        # NOTE(review): assumes *query* arrives as bytes (python 2 style);
        # decode() would fail on a python 3 str -- confirm caller contract
        query = u'{0}|{1}'.format(query.decode('utf-8'), query.decode('utf-8').title()).encode('utf-8')

    params['url'] = search_url.format(query=urlencode({'titles': query}),
                                      language=url_lang(params['language']))

    return params


# get first meaningful paragraph
# this should filter out disambiguation pages and notes above first paragraph
# "magic numbers" were obtained by fine tuning
def extract_first_paragraph(content, title, image):
    """Return the first paragraph of *content* that looks like article text,
    or None when nothing suitable is found within the first few paragraphs."""
    # robustness: the API may return no extract at all
    if not content:
        return None

    first_paragraph = None

    failed_attempts = 0
    for paragraph in content.split('\n'):

        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
        length = len(paragraph)

        # long paragraphs, or shorter ones that start with the article title,
        # are considered real content
        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
            first_paragraph = paragraph
            break

        failed_attempts += 1
        if failed_attempts > 3:
            return None

    return first_paragraph


# get response from search-request
def response(resp):
    """Parse the API answer into a plain link result plus an infobox."""
    results = []

    search_result = loads(resp.text)

    # wikipedia article's unique id
    # first valid id is assumed to be the requested article
    for article_id in search_result['query']['pages']:
        page = search_result['query']['pages'][article_id]
        if int(article_id) > 0:
            break

    # negative ids mark missing / invalid pages
    if int(article_id) < 0:
        return []

    title = page.get('title')

    image = page.get('thumbnail')
    if image:
        image = image.get('source')

    extract = page.get('extract')

    summary = extract_first_paragraph(extract, title, image)

    # link to wikipedia article
    wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))

    results.append({'url': wikipedia_link, 'title': title})

    results.append({'infobox': title,
                    'id': wikipedia_link,
                    'content': summary,
                    'img_src': image,
                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})

    return results


# get supported languages from their site
def _fetch_supported_languages(resp):
    """Scrape the List_of_Wikipedias page for languages with enough articles."""
    supported_languages = {}
    dom = fromstring(resp.text)
    tables = dom.xpath('//table[contains(@class,"sortable")]')
    for table in tables:
        # exclude header row
        trs = table.xpath('.//tr')[1:]
        for tr in trs:
            td = tr.xpath('./td')
            code = td[3].xpath('./a')[0].text
            name = td[2].xpath('./a')[0].text
            english_name = td[1].xpath('./a')[0].text
            articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
            # exclude languages with too few articles
            if articles >= 100:
                supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}

    return supported_languages
input_xpath = '//pod[starts-with(attribute::id, "Input")]/subpod/plaintext'
pods_xpath = '//pod'
subpods_xpath = './subpod'
pod_primary_xpath = './@primary'
pod_id_xpath = './@id'
pod_title_xpath = './@title'
plaintext_xpath = './plaintext'
image_xpath = './img'
img_src_xpath = './@src'
img_alt_xpath = './@alt'

# pods to display as image in infobox
# these pods do return a plaintext, but they look better and are more useful as images
image_pods = {'VisualRepresentation',
              'Illustration'}


# do search-request
def request(query, params):
    """Build the API request; the Referer header doubles as the site link
    shown in the results."""
    params['url'] = search_url.format(query=urlencode({'input': query}), api_key=api_key)
    params['headers']['Referer'] = site_url.format(query=urlencode({'i': query}))

    return params


# replace private user area characters to make text legible
def replace_pua_chars(text):
    """Replace Wolfram|Alpha's private-use-area glyphs with readable unicode."""
    pua_chars = {u'\uf522': u'\u2192',  # right arrow
                 u'\uf7b1': u'\u2115',  # set of natural numbers
                 u'\uf7b4': u'\u211a',  # set of rational numbers
                 u'\uf7b5': u'\u211d',  # set of real numbers
                 u'\uf7bd': u'\u2124',  # set of integer numbers
                 u'\uf74c': 'd',        # differential
                 u'\uf74d': u'\u212f',  # euler's number
                 u'\uf74e': 'i',        # imaginary number
                 u'\uf7d9': '='}        # equals sign

    for k, v in pua_chars.items():
        text = text.replace(k, v)

    return text


# get response from search-request
def response(resp):
    """Parse the XML answer into an infobox result plus a plain link result."""
    results = []

    search_results = etree.XML(resp.content)

    # return empty array if there are no results
    if search_results.xpath(failure_xpath):
        return []

    try:
        infobox_title = search_results.xpath(input_xpath)[0].text
    except IndexError:  # answer contains no input pod
        infobox_title = ""

    pods = search_results.xpath(pods_xpath)
    result_chunks = []
    result_content = ""
    for pod in pods:
        pod_id = pod.xpath(pod_id_xpath)[0]
        pod_title = pod.xpath(pod_title_xpath)[0]
        pod_is_result = pod.xpath(pod_primary_xpath)

        subpods = pod.xpath(subpods_xpath)
        if not subpods:
            continue

        # Appends either a text or an image, depending on which one is more suitable
        for subpod in subpods:
            # robustness: a subpod may lack a <plaintext> child
            plaintext_nodes = subpod.xpath(plaintext_xpath)
            content = plaintext_nodes[0].text if plaintext_nodes else None
            image = subpod.xpath(image_xpath)

            if content and pod_id not in image_pods:

                if pod_is_result or not result_content:
                    if pod_id != "Input":
                        result_content = "%s: %s" % (pod_title, content)

                # if no input pod was found, title is first plaintext pod
                if not infobox_title:
                    infobox_title = content

                content = replace_pua_chars(content)
                result_chunks.append({'label': pod_title, 'value': content})

            elif image:
                result_chunks.append({'label': pod_title,
                                      'image': {'src': image[0].xpath(img_src_xpath)[0],
                                                'alt': image[0].xpath(img_alt_xpath)[0]}})

    if not result_chunks:
        return []

    title = "Wolfram|Alpha (%s)" % infobox_title

    # append infobox
    results.append({'infobox': infobox_title,
                    'attributes': result_chunks,
                    'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}]})

    # append link to site
    results.append({'url': resp.request.headers['Referer'],
                    'title': title,
                    'content': result_content})

    return results
search_url = url + 'input/json.jsp'\
    '?async=false'\
    '&banners=raw'\
    '&debuggingdata=false'\
    '&format=image,plaintext,imagemap,minput,moutput'\
    '&formattimeout=2'\
    '&{query}'\
    '&output=JSON'\
    '&parsetimeout=2'\
    '&proxycode={token}'\
    '&scantimeout=0.5'\
    '&sponsorcategories=true'\
    '&statemethod=deploybutton'

referer_url = url + 'input/?{query}'

# cached anti-bot token; wolframalpha seems to reset it every hour
token = {'value': '',
         'last_updated': None}

# pods to display as image in infobox
# these pods do return a plaintext, but they look better and are more useful as images
image_pods = {'VisualRepresentation',
              'Illustration',
              'Symbol'}


# seems, wolframalpha resets its token in every hour
def obtain_token():
    """Fetch a fresh proxy token; deliberately best-effort -- on any failure
    the stale token is kept."""
    # align the timestamp to the current hour, matching the rotation period
    update_time = time() - (time() % 3600)
    try:
        token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0)
        token['value'] = loads(token_response.text)['code']
        token['last_updated'] = update_time
    except Exception:  # narrow from bare except; best-effort is intentional
        pass
    return token


def init(engine_settings=None):
    """Engine init hook: warm up the token cache."""
    obtain_token()


# do search-request
def request(query, params):
    """Build the JSON request url; refreshes the token when it is older
    than an hour."""
    if time() - (token['last_updated'] or 0) > 3600:
        obtain_token()
    params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value'])
    params['headers']['Referer'] = referer_url.format(query=urlencode({'i': query}))

    return params


# get response from search-request
def response(resp):
    """Parse the JSON answer into an infobox result plus a plain link result."""
    results = []

    resp_json = loads(resp.text)

    if not resp_json['queryresult']['success']:
        return []

    # TODO handle resp_json['queryresult']['assumptions']
    result_chunks = []
    infobox_title = ""
    result_content = ""
    for pod in resp_json['queryresult']['pods']:
        pod_id = pod.get('id', '')
        pod_title = pod.get('title', '')
        pod_is_result = pod.get('primary', None)

        if 'subpods' not in pod:
            continue

        if pod_id == 'Input' or not infobox_title:
            infobox_title = pod['subpods'][0]['plaintext']

        for subpod in pod['subpods']:
            if subpod['plaintext'] != '' and pod_id not in image_pods:
                # append unless it's not an actual answer
                if subpod['plaintext'] != '(requires interactivity)':
                    result_chunks.append({'label': pod_title, 'value': subpod['plaintext']})

                if pod_is_result or not result_content:
                    if pod_id != "Input":
                        result_content = pod_title + ': ' + subpod['plaintext']

            elif 'img' in subpod:
                result_chunks.append({'label': pod_title, 'image': subpod['img']})

    if not result_chunks:
        return []

    results.append({'infobox': infobox_title,
                    'attributes': result_chunks,
                    'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}]})

    results.append({'url': resp.request.headers['Referer'],
                    'title': 'Wolfram|Alpha (' + infobox_title + ')',
                    'content': result_content})

    return results


"""
 1x (Images)

 @website http://1x.com/
 @provide-api no

 @using-api no
 @results HTML
 @stable no (HTML can change)
 @parse url, title, thumbnail, img_src, content
"""

from lxml import html
from searx.url_utils import urlencode, urljoin
from searx.engines.xpath import extract_text

# engine dependent config
categories = ['images']
paging = False

# search-url
base_url = 'https://1x.com'
search_url = base_url + '/backend/search.php?{query}'


# do search-request
def request(query, params):
    """Build the 1x.com backend search url."""
    params['url'] = search_url.format(query=urlencode({'q': query}))

    return params


# get response from search-request
def response(resp):
    """Parse the HTML listing into image results."""
    results = []

    dom = html.fromstring(resp.text)
    for res in dom.xpath('//div[@class="List-item MainListing"]'):
        # BUGFIX: use a relative xpath; '//a' ignores the context node and
        # would return the document's very first link for every result
        link = res.xpath('.//a')[0]

        url = urljoin(base_url, link.attrib.get('href'))
        title = extract_text(link)

        thumbnail_src = urljoin(base_url, res.xpath('.//img')[0].attrib['src'])
        # TODO: get image with higher resolution
        img_src = thumbnail_src

        # append result
        results.append({'url': url,
                        'title': title,
                        'img_src': img_src,
                        'content': '',
                        'thumbnail_src': thumbnail_src,
                        'template': 'images.html'})

    # return results
    return results
from lxml import html
from lxml.etree import _ElementStringResult, _ElementUnicodeResult
from searx.utils import html_to_text, eval_xpath
from searx.url_utils import unquote, urlencode, urljoin, urlparse

# engine settings, filled in from settings.yml per configured xpath engine
search_url = None
url_xpath = None
content_xpath = None
title_xpath = None
thumbnail_xpath = False
paging = False
suggestion_xpath = ''
results_xpath = ''

# parameters for engines with paging support
#
# number of results on each page
# (only needed if the site requires not a page number, but an offset)
page_size = 1
# number of the first page (usually 0 or 1)
first_page_num = 1


'''
if xpath_results is list, extract the text from each result and concat the list
if xpath_results is a xml element, extract all the text node from it
    ( text_content() method from lxml )
if xpath_results is a string element, then it's already done
'''


def extract_text(xpath_results):
    """Get plain, whitespace-normalized text out of an xpath result."""
    if isinstance(xpath_results, list):
        # it's a list of results: concat everything using a recursive call
        result = ''
        for e in xpath_results:
            result = result + extract_text(e)
        return result.strip()
    elif isinstance(xpath_results, (_ElementStringResult, _ElementUnicodeResult)):
        # it's a string
        return ''.join(xpath_results)
    else:
        # it's an element
        text = html.tostring(
            xpath_results, encoding='unicode', method='text', with_tail=False
        )
        text = text.strip().replace('\n', ' ')
        return ' '.join(text.split())


def extract_url(xpath_results, search_url):
    """Extract an absolute, normalized url from an xpath result; raises when
    the result set is empty."""
    if xpath_results == []:
        raise Exception('Empty url resultset')
    url = extract_text(xpath_results)

    if url.startswith('//'):
        # add http or https to this kind of url //example.com/
        parsed_search_url = urlparse(search_url)
        url = u'{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
    elif url.startswith('/'):
        # fix relative url to the search engine
        url = urljoin(search_url, url)

    # normalize url
    url = normalize_url(url)

    return url


def normalize_url(url):
    """Ensure the url has a netloc and a path; unwrap yahoo redirect urls."""
    parsed_url = urlparse(url)

    # add a / at this end of the url if there is no path
    if not parsed_url.netloc:
        raise Exception('Cannot parse url')
    if not parsed_url.path:
        url += '/'

    # FIXME : hack for yahoo
    if parsed_url.hostname == 'search.yahoo.com'\
       and parsed_url.path.startswith('/r'):
        p = parsed_url.path
        mark = p.find('/**')
        if mark != -1:
            # NOTE(review): .decode() implies a python 2 bytes result from
            # unquote -- would fail on python 3 str; confirm runtime
            return unquote(p[mark + 3:]).decode('utf-8')

    return url


def request(query, params):
    """Fill params['url'] from the configured search_url template."""
    query = urlencode({'q': query})[2:]

    fp = {'query': query}
    if paging and search_url.find('{pageno}') >= 0:
        fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num

    params['url'] = search_url.format(**fp)
    params['query'] = query

    return params


def response(resp):
    """Parse results (and optional thumbnails/suggestions) with the
    configured xpath expressions."""
    results = []
    dom = html.fromstring(resp.text)
    if results_xpath:
        for result in eval_xpath(dom, results_xpath):
            url = extract_url(eval_xpath(result, url_xpath), search_url)
            title = extract_text(eval_xpath(result, title_xpath))
            content = extract_text(eval_xpath(result, content_xpath))
            tmp_result = {'url': url, 'title': title, 'content': content}

            # add thumbnail if available
            if thumbnail_xpath:
                thumbnail_xpath_result = eval_xpath(result, thumbnail_xpath)
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)

            results.append(tmp_result)
    else:
        # no common result container: zip the three independent node lists
        for url, title, content in zip(
            (extract_url(x, search_url) for
             x in eval_xpath(dom, url_xpath)),
            map(extract_text, eval_xpath(dom, title_xpath)),
            map(extract_text, eval_xpath(dom, content_xpath))
        ):
            results.append({'url': url, 'title': title, 'content': content})

    if not suggestion_xpath:
        return results
    for suggestion in eval_xpath(dom, suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})
    return results
# Yacy (Web, Images, Videos, Music, Files)
#
# @website http://yacy.net
# @provide-api yes
# (http://www.yacy-websuche.de/wiki/index.php/Dev:APIyacysearch)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse (general) url, title, content, publishedDate
# @parse (images) url, title, img_src
#
# @todo parse video, audio and file results

from json import loads
from dateutil import parser
from searx.url_utils import urlencode

from searx.utils import html_to_text

# engine dependent config
categories = ['general', 'images']  # TODO , 'music', 'videos', 'files'
paging = True
language_support = True
number_of_results = 5

# search-url
base_url = 'http://localhost:8090'
search_url = '/yacysearch.json?{query}'\
    '&startRecord={offset}'\
    '&maximumRecords={limit}'\
    '&contentdom={search_type}'\
    '&resource=global'

# yacy specific type-definitions
search_types = {'general': 'text',
                'images': 'image',
                'files': 'app',
                'music': 'audio',
                'videos': 'video'}


# do search-request
def request(query, params):
    """Build the yacysearch.json request url."""
    offset = (params['pageno'] - 1) * number_of_results
    # BUGFIX: fall back to 'text' -- '0' is not a valid yacy contentdom value
    search_type = search_types.get(params.get('category'), 'text')

    params['url'] = base_url +\
        search_url.format(query=urlencode({'query': query}),
                          offset=offset,
                          limit=number_of_results,
                          search_type=search_type)

    # add language tag if specified
    if params['language'] != 'all':
        params['url'] += '&lr=lang_' + params['language'].split('-')[0]

    return params


# get response from search-request
def response(resp):
    """Parse the JSON answer into general and image results."""
    results = []

    raw_search_results = loads(resp.text)

    # return empty array if there are no results
    if not raw_search_results:
        return []

    search_results = raw_search_results.get('channels', [])

    if len(search_results) == 0:
        return []

    for result in search_results[0].get('items', []):
        # parse image results
        if result.get('image'):

            result_url = ''
            if 'url' in result:
                result_url = result['url']
            elif 'link' in result:
                result_url = result['link']
            else:
                continue

            # append result
            results.append({'url': result_url,
                            'title': result['title'],
                            'content': '',
                            'img_src': result['image'],
                            'template': 'images.html'})

        # parse general results
        else:
            publishedDate = parser.parse(result['pubDate'])

            # append result
            results.append({'url': result['link'],
                            'title': result['title'],
                            'content': html_to_text(result['description']),
                            'publishedDate': publishedDate})

    # TODO parse video, audio and file results

    # return results
    return results


"""
 Yahoo (Web)

 @website https://search.yahoo.com/web
 @provide-api yes (https://developer.yahoo.com/boss/search/),
              $0.80/1000 queries

 @using-api no (because pricing)
 @results HTML (using search portal)
 @stable no (HTML can change)
 @parse url, title, content, suggestion
"""

from lxml import html
from searx.engines.xpath import extract_text, extract_url
from searx.url_utils import unquote, urlencode
from searx.utils import match_language, eval_xpath

# engine dependent config
categories = ['general']
paging = True
language_support = True
time_range_support = True

# search-url
base_url = 'https://search.yahoo.com/'
search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time'

supported_languages_url = 'https://search.yahoo.com/web/advanced'

# specific xpath variables
results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]"
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
content_xpath = './/div[@class="compText aAbs"]'
suggestion_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' AlsoTry ')]//a"

time_range_dict = {'day': ['1d', 'd'],
                   'week': ['1w', 'w'],
                   'month': ['1m', 'm']}

language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}


# remove yahoo-specific tracking-url
def parse_url(url_string):
    """Unwrap the real target url from a yahoo redirect url; returns the
    input unchanged when it is already a plain url."""
    endings = ['/RS', '/RK']
    endpositions = []
    start = url_string.find('http', url_string.find('/RU=') + 1)

    for ending in endings:
        endpos = url_string.rfind(ending)
        if endpos > -1:
            endpositions.append(endpos)

    if start == 0 or len(endpositions) == 0:
        return url_string
    else:
        end = min(endpositions)
        return unquote(url_string[start:end])


def _get_url(query, offset, language, time_range):
    """Return the full search url, with time-range parameters when set."""
    if time_range in time_range_dict:
        return base_url + search_url_with_time.format(offset=offset,
                                                      query=urlencode({'p': query}),
                                                      lang=language,
                                                      age=time_range_dict[time_range][0],
                                                      btf=time_range_dict[time_range][1])
    return base_url + search_url.format(offset=offset,
                                        query=urlencode({'p': query}),
                                        lang=language)


def _get_language(params):
    """Map searx's language parameter to yahoo's language tag."""
    if params['language'] == 'all':
        return 'en'

    language = match_language(params['language'], supported_languages, language_aliases)
    if language not in language_aliases.values():
        language = language.split('-')[0]
    language = language.replace('-', '_').lower()

    return language


# do search-request
def request(query, params):
    """Build the search request url and cookie."""
    if params['time_range'] and params['time_range'] not in time_range_dict:
        return params

    offset = (params['pageno'] - 1) * 10 + 1
    language = _get_language(params)

    params['url'] = _get_url(query, offset, language, params['time_range'])

    # TODO required?
    params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
        .format(lang=language)

    return params


# get response from search-request
def response(resp):
    """Parse result rows, the result count and suggestions."""
    results = []

    dom = html.fromstring(resp.text)

    try:
        results_num = int(eval_xpath(dom, '//div[@class="compPagination"]/span[last()]/text()')[0]
                          .split()[0].replace(',', ''))
        results.append({'number_of_results': results_num})
    except (IndexError, ValueError):  # pagination block missing or not numeric
        pass

    # parse results
    for result in eval_xpath(dom, results_xpath):
        try:
            url = parse_url(extract_url(eval_xpath(result, url_xpath), search_url))
            title = extract_text(eval_xpath(result, title_xpath)[0])
        except Exception:  # malformed row: skip it (narrowed from bare except)
            continue

        content = extract_text(eval_xpath(result, content_xpath)[0])

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    # if no suggestion found, return results
    suggestions = eval_xpath(dom, suggestion_xpath)
    if not suggestions:
        return results

    # parse suggestion
    for suggestion in suggestions:
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results


# get supported languages from their site
def _fetch_supported_languages(resp):
    """Scrape the advanced-search page for the supported language codes."""
    supported_languages = []
    dom = html.fromstring(resp.text)
    options = eval_xpath(dom, '//div[@id="yschlang"]/span/label/input')
    for option in options:
        code_parts = eval_xpath(option, './@value')[0][5:].split('_')
        if len(code_parts) == 2:
            code = code_parts[0] + '-' + code_parts[1].upper()
        else:
            code = code_parts[0]
        supported_languages.append(code)

    return supported_languages
# @stable no (HTML can change)
# @parse url, title, content, publishedDate

import re
from datetime import datetime, timedelta
from lxml import html
from searx.engines.xpath import extract_text, extract_url
from searx.engines.yahoo import (
    parse_url, _fetch_supported_languages, supported_languages_url, language_aliases
)
from dateutil import parser
from searx.url_utils import urlencode
from searx.utils import match_language

# engine dependent config
categories = ['news']
paging = True
language_support = True

# search-url
search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&{lang}=uh3_news_web_gs_1&pz=10&xargs=0&vl=lang_{lang}'  # noqa

# specific xpath variables
results_xpath = '//ol[contains(@class,"searchCenterMiddle")]//li'
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
content_xpath = './/div[@class="compText"]'
publishedDate_xpath = './/span[contains(@class,"tri")]'
suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a'


# do search-request
def request(query, params):
    """Build the news search url and cookie."""
    offset = (params['pageno'] - 1) * 10 + 1

    if params['language'] == 'all':
        language = 'en'
    else:
        language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]

    params['url'] = search_url.format(offset=offset,
                                      query=urlencode({'p': query}),
                                      lang=language)

    # TODO required?
    params['cookies']['sB'] = '"v=1&vm=p&fl=1&vl=lang_{lang}&sh=1&pn=10&rw=new'\
        .format(lang=language)
    return params


def sanitize_url(url):
    """Strip the ';_ylt=' tracking suffix from yahoo-hosted urls."""
    if ".yahoo.com/" in url:
        return re.sub(u"\\;\\_ylt\\=.+$", "", url)
    else:
        return url


# get response from search-request
def response(resp):
    """Parse news rows, converting relative dates ('3 hours ago') to
    datetimes."""
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        urls = result.xpath(url_xpath)
        if len(urls) != 1:
            continue
        url = sanitize_url(parse_url(extract_url(urls, search_url)))
        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])

        # parse publishedDate
        publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])

        # still useful ?
        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))
        elif re.match("^[0-9]+ days? ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(days=int(re.match(r'\d+', publishedDate).group()))
        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))\
                - timedelta(minutes=int(timeNumbers[1]))
        else:
            try:
                publishedDate = parser.parse(publishedDate)
            except Exception:  # unparsable date text: fall back to 'now'
                publishedDate = datetime.now()

        # dateutil fills a missing year with 1900; assume the current year
        if publishedDate.year == 1900:
            publishedDate = publishedDate.replace(year=datetime.now().year)

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'publishedDate': publishedDate})

    # return results
    return results
+ @using-api no + @results HTML (using search portal) + @stable no (HTML can change) + @parse url, title, content +""" + +from lxml import html +from searx import logger +from searx.url_utils import urlencode + +logger = logger.getChild('yandex engine') + +# engine dependent config +categories = ['general'] +paging = True +language_support = True # TODO + +default_tld = 'com' +language_map = {'ru': 'ru', + 'ua': 'ua', + 'be': 'by', + 'kk': 'kz', + 'tr': 'com.tr'} + +# search-url +base_url = 'https://yandex.{tld}/' +search_url = 'search/?{query}&p={page}' + +results_xpath = '//li[@class="serp-item"]' +url_xpath = './/h2/a/@href' +title_xpath = './/h2/a//text()' +content_xpath = './/div[@class="text-container typo typo_text_m typo_line_m organic__text"]//text()' + + +def request(query, params): + lang = params['language'].split('-')[0] + host = base_url.format(tld=language_map.get(lang) or default_tld) + params['url'] = host + search_url.format(page=params['pageno'] - 1, + query=urlencode({'text': query})) + return params + + +# get response from search-request +def response(resp): + dom = html.fromstring(resp.text) + results = [] + + for result in dom.xpath(results_xpath): + try: + res = {'url': result.xpath(url_xpath)[0], + 'title': ''.join(result.xpath(title_xpath)), + 'content': ''.join(result.xpath(content_xpath))} + except: + logger.exception('yandex parse crash') + continue + + results.append(res) + + return results diff --git a/searx/engines/youtube_api.py b/searx/engines/youtube_api.py new file mode 100644 index 000000000..bc4c0d58e --- /dev/null +++ b/searx/engines/youtube_api.py @@ -0,0 +1,83 @@ +# Youtube (Videos) +# +# @website https://www.youtube.com/ +# @provide-api yes (https://developers.google.com/apis-explorer/#p/youtube/v3/youtube.search.list) +# +# @using-api yes +# @results JSON +# @stable yes +# @parse url, title, content, publishedDate, thumbnail, embedded + +from json import loads +from dateutil import parser +from searx.url_utils import 
urlencode + +# engine dependent config +categories = ['videos', 'music'] +paging = False +language_support = True +api_key = None + +# search-url +base_url = 'https://www.googleapis.com/youtube/v3/search' +search_url = base_url + '?part=snippet&{query}&maxResults=20&key={api_key}' + +embedded_url = '<iframe width="540" height="304" ' +\ + 'data-src="https://www.youtube-nocookie.com/embed/{videoid}" ' +\ + 'frameborder="0" allowfullscreen></iframe>' + +base_youtube_url = 'https://www.youtube.com/watch?v=' + + +# do search-request +def request(query, params): + params['url'] = search_url.format(query=urlencode({'q': query}), + api_key=api_key) + + # add language tag if specified + if params['language'] != 'all': + params['url'] += '&relevanceLanguage=' + params['language'].split('-')[0] + + return params + + +# get response from search-request +def response(resp): + results = [] + + search_results = loads(resp.text) + + # return empty array if there are no results + if 'items' not in search_results: + return [] + + # parse results + for result in search_results['items']: + videoid = result['id']['videoId'] + + title = result['snippet']['title'] + content = '' + thumbnail = '' + + pubdate = result['snippet']['publishedAt'] + publishedDate = parser.parse(pubdate) + + thumbnail = result['snippet']['thumbnails']['high']['url'] + + content = result['snippet']['description'] + + url = base_youtube_url + videoid + + embedded = embedded_url.format(videoid=videoid) + + # append result + results.append({'url': url, + 'title': title, + 'content': content, + 'template': 'videos.html', + 'publishedDate': publishedDate, + 'embedded': embedded, + 'thumbnail': thumbnail}) + + # return results + return results diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py new file mode 100644 index 000000000..49d0ae604 --- /dev/null +++ b/searx/engines/youtube_noapi.py @@ -0,0 +1,90 @@ +# Youtube (Videos) +# +# @website https://www.youtube.com/ +# @provide-api yes 
# (https://developers.google.com/apis-explorer/#p/youtube/v3/youtube.search.list)
#
# @using-api no
# @results HTML
# @stable no
# @parse url, title, content, publishedDate, thumbnail, embedded

from functools import reduce
from json import loads
from searx.engines.xpath import extract_text
from searx.utils import list_get
from searx.url_utils import quote_plus

# engine dependent config
categories = ['videos', 'music']
paging = True
language_support = False
time_range_support = True

# search-url
base_url = 'https://www.youtube.com/results'
search_url = base_url + '?search_query={query}&page={page}'
time_range_url = '&sp=EgII{time_range}%253D%253D'
time_range_dict = {'day': 'Ag',
                   'week': 'Aw',
                   'month': 'BA',
                   'year': 'BQ'}

embedded_url = '<iframe width="540" height="304" ' +\
    'data-src="https://www.youtube-nocookie.com/embed/{videoid}" ' +\
    'frameborder="0" allowfullscreen></iframe>'

base_youtube_url = 'https://www.youtube.com/watch?v='


# do search-request
def request(query, params):
    """Build the YouTube HTML-results URL, optionally time-range filtered."""
    params['url'] = search_url.format(query=quote_plus(query),
                                      page=params['pageno'])
    if params['time_range'] in time_range_dict:
        params['url'] += time_range_url.format(time_range=time_range_dict[params['time_range']])

    return params


# get response from search-request
def response(resp):
    """Extract video results from the inline ``ytInitialData`` JSON blob."""
    results = []

    # slice out the JSON object assigned to ytInitialData in the page source
    raw = resp.text[resp.text.find('ytInitialData'):]
    raw = raw[raw.find('{'):raw.find(';\n')]

    data = loads(raw) if raw else {}
    sections = data.get('contents', {})\
        .get('twoColumnSearchResultsRenderer', {})\
        .get('primaryContents', {})\
        .get('sectionListRenderer', {})\
        .get('contents', [])

    for section in sections:
        for item in section.get('itemSectionRenderer', {}).get('contents', []):
            video = item.get('videoRenderer', {})
            videoid = video.get('videoId')
            # containers without a videoRenderer (ads, shelves) carry no id
            if videoid is None:
                continue

            results.append({'url': base_youtube_url + videoid,
                            'title': get_text_from_json(video.get('title', {})),
                            'content': get_text_from_json(video.get('descriptionSnippet', {})),
                            'template': 'videos.html',
                            'embedded': embedded_url.format(videoid=videoid),
                            'thumbnail': 'https://i.ytimg.com/vi/' + videoid + '/hqdefault.jpg'})

    return results


def get_text_from_json(element):
    """Flatten a YouTube JSON text object ('runs' list or 'simpleText')."""
    if 'runs' in element:
        return ''.join(run.get('text', '') for run in element['runs'])
    return element.get('simpleText', '')