summaryrefslogtreecommitdiff
path: root/searx/engines
diff options
context:
space:
mode:
Diffstat (limited to 'searx/engines')
-rw-r--r--searx/engines/1337x.py39
-rw-r--r--searx/engines/__init__.py269
-rw-r--r--searx/engines/acgsou.py75
-rw-r--r--searx/engines/apkmirror.py61
-rw-r--r--searx/engines/archlinux.py142
-rw-r--r--searx/engines/arxiv.py77
-rwxr-xr-xsearx/engines/base.py121
-rw-r--r--searx/engines/bing.py123
-rw-r--r--searx/engines/bing_images.py124
-rw-r--r--searx/engines/bing_news.py127
-rw-r--r--searx/engines/bing_videos.py94
-rw-r--r--searx/engines/btdigg.py85
-rw-r--r--searx/engines/currency_convert.py99
-rw-r--r--searx/engines/dailymotion.py98
-rw-r--r--searx/engines/deezer.py67
-rw-r--r--searx/engines/deviantart.py81
-rw-r--r--searx/engines/dictzone.py68
-rw-r--r--searx/engines/digbt.py62
-rw-r--r--searx/engines/digg.py69
-rw-r--r--searx/engines/doku.py85
-rw-r--r--searx/engines/duckduckgo.py143
-rw-r--r--searx/engines/duckduckgo_definitions.py171
-rw-r--r--searx/engines/duckduckgo_images.py97
-rw-r--r--searx/engines/duden.py80
-rw-r--r--searx/engines/dummy.py16
-rw-r--r--searx/engines/faroo.py96
-rw-r--r--searx/engines/fdroid.py50
-rw-r--r--searx/engines/filecrop.py88
-rw-r--r--searx/engines/flickr.py90
-rw-r--r--searx/engines/flickr_noapi.py122
-rw-r--r--searx/engines/framalibre.py72
-rw-r--r--searx/engines/frinkiac.py44
-rw-r--r--searx/engines/genius.py88
-rw-r--r--searx/engines/gentoo.py128
-rw-r--r--searx/engines/gigablast.py114
-rw-r--r--searx/engines/github.py60
-rw-r--r--searx/engines/google.py391
-rw-r--r--searx/engines/google_images.py97
-rw-r--r--searx/engines/google_news.py86
-rw-r--r--searx/engines/google_videos.py97
-rw-r--r--searx/engines/ina.py87
-rw-r--r--searx/engines/invidious.py100
-rw-r--r--searx/engines/json_engine.py136
-rw-r--r--searx/engines/kickass.py92
-rw-r--r--searx/engines/mediawiki.py90
-rw-r--r--searx/engines/microsoft_academic.py75
-rw-r--r--searx/engines/mixcloud.py61
-rw-r--r--searx/engines/nyaa.py108
-rw-r--r--searx/engines/openstreetmap.py95
-rw-r--r--searx/engines/pdbe.py112
-rw-r--r--searx/engines/photon.py131
-rw-r--r--searx/engines/piratebay.py96
-rw-r--r--searx/engines/pubmed.py101
-rw-r--r--searx/engines/qwant.py133
-rw-r--r--searx/engines/reddit.py76
-rw-r--r--searx/engines/scanr_structures.py76
-rw-r--r--searx/engines/searchcode_code.py69
-rw-r--r--searx/engines/searchcode_doc.py49
-rw-r--r--searx/engines/searx_engine.py57
-rw-r--r--searx/engines/seedpeer.py78
-rw-r--r--searx/engines/soundcloud.py111
-rw-r--r--searx/engines/spotify.py62
-rw-r--r--searx/engines/stackoverflow.py57
-rw-r--r--searx/engines/startpage.py131
-rw-r--r--searx/engines/tokyotoshokan.py99
-rw-r--r--searx/engines/torrentz.py98
-rw-r--r--searx/engines/translated.py68
-rw-r--r--searx/engines/twitter.py87
-rw-r--r--searx/engines/unsplash.py52
-rw-r--r--searx/engines/vimeo.py67
-rw-r--r--searx/engines/wikidata.py500
-rw-r--r--searx/engines/wikipedia.py133
-rw-r--r--searx/engines/wolframalpha_api.py129
-rw-r--r--searx/engines/wolframalpha_noapi.py121
-rw-r--r--searx/engines/www1x.py58
-rw-r--r--searx/engines/xpath.py133
-rw-r--r--searx/engines/yacy.py108
-rw-r--r--searx/engines/yahoo.py160
-rw-r--r--searx/engines/yahoo_news.py110
-rw-r--r--searx/engines/yandex.py64
-rw-r--r--searx/engines/youtube_api.py83
-rw-r--r--searx/engines/youtube_noapi.py90
82 files changed, 8439 insertions, 0 deletions
diff --git a/searx/engines/1337x.py b/searx/engines/1337x.py
new file mode 100644
index 000000000..0de04bd95
--- /dev/null
+++ b/searx/engines/1337x.py
@@ -0,0 +1,39 @@
+from lxml import html
+from searx.engines.xpath import extract_text
+from searx.utils import get_torrent_size
+from searx.url_utils import quote, urljoin
+
+url = 'https://1337x.to/'
+search_url = url + 'search/{search_term}/{pageno}/'
+categories = ['videos']
+paging = True
+
+
def request(query, params):
    """Store the 1337x search URL for ``query`` in ``params['url']``.

    The query is passed through ``quote`` and combined with the page number
    taken from ``params['pageno']``. The same ``params`` dict is returned.
    """
    page_url = search_url.format(search_term=quote(query),
                                 pageno=params['pageno'])
    params['url'] = page_url
    return params
+
+
def response(resp):
    """Parse a 1337x result page into a list of torrent result dicts.

    Every row of the ``table-list`` table yields one dict with the keys
    ``url``, ``title``, ``seed``, ``leech``, ``filesize`` and ``template``.
    Rows without a title link are skipped instead of aborting the whole
    page, and a size cell that is not of the form "<number> <unit>" yields
    ``filesize = None`` instead of raising.
    """
    results = []

    dom = html.fromstring(resp.text)

    for result in dom.xpath('//table[contains(@class, "table-list")]/tbody//tr'):
        hrefs = result.xpath('./td[contains(@class, "name")]/a[2]/@href')
        if not hrefs:
            # header/ad rows have no second anchor in the name cell
            continue

        href = urljoin(url, hrefs[0])
        title = extract_text(result.xpath('./td[contains(@class, "name")]/a[2]'))
        seed = extract_text(result.xpath('.//td[contains(@class, "seeds")]'))
        leech = extract_text(result.xpath('.//td[contains(@class, "leeches")]'))
        filesize_info = extract_text(result.xpath('.//td[contains(@class, "size")]/text()'))

        # expected shape is "<number> <unit>"; anything else means "unknown size"
        size_parts = (filesize_info or '').split()
        if len(size_parts) == 2:
            filesize = get_torrent_size(size_parts[0], size_parts[1])
        else:
            filesize = None

        results.append({'url': href,
                        'title': title,
                        'seed': seed,
                        'leech': leech,
                        'filesize': filesize,
                        'template': 'torrent.html'})

    return results
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
new file mode 100644
index 000000000..2393f52b6
--- /dev/null
+++ b/searx/engines/__init__.py
@@ -0,0 +1,269 @@
+
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+
+(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
+'''
+
+import sys
+import threading
+from os.path import realpath, dirname
+from io import open
+from babel.localedata import locale_identifiers
+from flask_babel import gettext
+from operator import itemgetter
+from json import loads
+from requests import get
+from searx import settings
+from searx import logger
+from searx.utils import load_module, match_language, get_engine_from_settings
+
+
+logger = logger.getChild('engines')
+
+engine_dir = dirname(realpath(__file__))
+
+engines = {}
+
+categories = {'general': []}
+
+languages = loads(open(engine_dir + '/../data/engines_languages.json', 'r', encoding='utf-8').read())
+babel_langs = [lang_parts[0] + '-' + lang_parts[-1] if len(lang_parts) > 1 else lang_parts[0]
+ for lang_parts in (lang_code.split('_') for lang_code in locale_identifiers())]
+
+engine_shortcuts = {}
+engine_default_args = {'paging': False,
+ 'categories': ['general'],
+ 'language_support': True,
+ 'supported_languages': [],
+ 'safesearch': False,
+ 'timeout': settings['outgoing']['request_timeout'],
+ 'shortcut': '-',
+ 'disabled': False,
+ 'suspend_end_time': 0,
+ 'continuous_errors': 0,
+ 'time_range_support': False,
+ 'offline': False}
+
+
def load_engine(engine_data):
    """Load, configure and register one engine module.

    ``engine_data`` is the engine's settings dict; it must contain ``name``
    and ``engine`` (the module file name without ``.py``). Every other key
    is copied onto the loaded module as an attribute, with ``categories``
    split on commas (or emptied for the value ``'none'``).

    Returns the configured engine module, or ``None`` when the module cannot
    be loaded or declares ``inactive = True``. The process exits via
    ``sys.exit(1)`` when the name contains an underscore, a public attribute
    is ``None``, or the shortcut is already taken.

    Side effects: lower-cases ``engine_data['name']`` in place, and updates
    the module-level ``categories`` and ``engine_shortcuts`` registries.
    """
    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error('Engine name contains underscore: "{}"'.format(engine_name))
        sys.exit(1)

    if engine_name.lower() != engine_name:
        # Logger.warn is a deprecated alias; warning() is the supported spelling
        logger.warning('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    engine_module = engine_data['engine']

    try:
        engine = load_module(engine_module + '.py', engine_dir)
    except Exception:
        # a broken engine must not take the others down, but SystemExit and
        # KeyboardInterrupt are deliberately allowed to propagate
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    # copy the settings onto the module
    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if engine_data['categories'] == 'none':
                engine.categories = []
            else:
                engine.categories = list(map(str.strip, engine_data['categories'].split(',')))
            continue
        setattr(engine, param_name, engine_data[param_name])

    # fill in defaults for everything the module/settings did not define
    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if engine_attr == 'inactive' and getattr(engine, engine_attr) is True:
            return None
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'
                         .format(engine.name, engine_attr))
            sys.exit(1)

    # assign supported languages from json file
    if engine_data['name'] in languages:
        setattr(engine, 'supported_languages', languages[engine_data['name']])

    # find custom aliases for non standard language codes
    if hasattr(engine, 'supported_languages'):
        if hasattr(engine, 'language_aliases'):
            language_aliases = getattr(engine, 'language_aliases')
        else:
            language_aliases = {}

        for engine_lang in getattr(engine, 'supported_languages'):
            iso_lang = match_language(engine_lang, babel_langs, fallback=None)
            if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \
               iso_lang not in getattr(engine, 'supported_languages'):
                language_aliases[iso_lang] = engine_lang

        setattr(engine, 'language_aliases', language_aliases)

    # assign language fetching method if auxiliary method exists
    if hasattr(engine, '_fetch_supported_languages'):
        setattr(engine, 'fetch_supported_languages',
                lambda: engine._fetch_supported_languages(get(engine.supported_languages_url)))

    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'engine_time': 0,
        'engine_time_count': 0,
        'score_count': 0,
        'errors': 0
    }

    # page load counters are only kept for engines that are not offline
    if not engine.offline:
        engine.stats['page_load_time'] = 0
        engine.stats['page_load_count'] = 0

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambigious shortcut: {0}'.format(engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
+
+
def to_percentage(stats, maxvalue):
    """Add a ``percentage`` key to every dict in ``stats``.

    The percentage is the entry's ``avg`` relative to ``maxvalue``, truncated
    to an int; it is 0 for every entry when ``maxvalue`` is falsy. The dicts
    are modified in place and the same list is returned.
    """
    for entry in stats:
        entry['percentage'] = int(entry['avg'] / maxvalue * 100) if maxvalue else 0
    return stats
+
+
def get_engines_stats():
    """Build the per-engine statistics tables.

    Engines with a ``search_count`` of 0 are left out. For the rest, averages
    are derived from ``engine.stats`` and each entry gets a ``percentage``
    relative to the largest value in its table.

    Returns a list of ``(translated label, entries)`` tuples in the order:
    engine time, page loads, number of results, scores, scores per result,
    errors. Each entry is a dict with ``avg``, ``name`` and ``percentage``.
    The two time tables are sorted ascending, the others descending.
    """
    # TODO refactor
    pageloads = []
    engine_times = []
    results = []
    scores = []
    errors = []
    scores_per_result = []

    max_pageload = max_engine_times = max_results = max_score = max_errors = max_score_per_result = 0  # noqa
    for engine in engines.values():
        if engine.stats['search_count'] == 0:
            continue
        results_num = \
            engine.stats['result_count'] / float(engine.stats['search_count'])

        if engine.stats['engine_time_count'] != 0:
            this_engine_time = engine.stats['engine_time'] / float(engine.stats['engine_time_count'])  # noqa
        else:
            this_engine_time = 0

        if results_num:
            score = engine.stats['score_count'] / float(engine.stats['search_count'])  # noqa
            score_per_result = score / results_num
        else:
            score = score_per_result = 0.0

        # page load statistics only exist for engines that are not offline
        if not engine.offline:
            load_times = 0
            if engine.stats['page_load_count'] != 0:
                load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count'])  # noqa
            max_pageload = max(load_times, max_pageload)
            pageloads.append({'avg': load_times, 'name': engine.name})

        max_engine_times = max(this_engine_time, max_engine_times)
        max_results = max(results_num, max_results)
        max_score = max(score, max_score)
        max_score_per_result = max(score_per_result, max_score_per_result)
        max_errors = max(max_errors, engine.stats['errors'])

        engine_times.append({'avg': this_engine_time, 'name': engine.name})
        results.append({'avg': results_num, 'name': engine.name})
        scores.append({'avg': score, 'name': engine.name})
        errors.append({'avg': engine.stats['errors'], 'name': engine.name})
        scores_per_result.append({
            'avg': score_per_result,
            'name': engine.name
        })

    pageloads = to_percentage(pageloads, max_pageload)
    engine_times = to_percentage(engine_times, max_engine_times)
    results = to_percentage(results, max_results)
    scores = to_percentage(scores, max_score)
    scores_per_result = to_percentage(scores_per_result, max_score_per_result)
    # was assigned to a misspelled, never-read name ("erros")
    errors = to_percentage(errors, max_errors)

    return [
        (
            gettext('Engine time (sec)'),
            sorted(engine_times, key=itemgetter('avg'))
        ),
        (
            gettext('Page loads (sec)'),
            sorted(pageloads, key=itemgetter('avg'))
        ),
        (
            gettext('Number of results'),
            sorted(results, key=itemgetter('avg'), reverse=True)
        ),
        (
            gettext('Scores'),
            sorted(scores, key=itemgetter('avg'), reverse=True)
        ),
        (
            gettext('Scores per result'),
            sorted(scores_per_result, key=itemgetter('avg'), reverse=True)
        ),
        (
            gettext('Errors'),
            sorted(errors, key=itemgetter('avg'), reverse=True)
        ),
    ]
+
+
def load_engines(engine_list):
    """Rebuild the module-level ``engines`` registry from ``engine_list``.

    The registry is emptied first; every settings entry is then passed to
    ``load_engine`` and stored under the engine's name unless loading
    returned ``None``. Returns the registry dict itself.
    """
    engines.clear()
    for settings_entry in engine_list:
        loaded = load_engine(settings_entry)
        if loaded is None:
            continue
        engines[loaded.name] = loaded
    return engines
+
+
def initialize_engines(engine_list):
    """Load all engines, then start each engine's ``init`` hook in a thread.

    For every loaded engine that has a truthy ``init`` attribute, a new
    thread is started which calls the hook with the value of
    ``get_engine_from_settings(engine_name)``. The threads are not joined.
    """
    load_engines(engine_list)

    def _run_init(name, hook):
        # executed inside the per-engine thread
        hook(get_engine_from_settings(name))
        logger.debug('%s engine: Initialized', name)

    for engine_name, engine in engines.items():
        init_fn = getattr(engine, 'init', None)
        if not init_fn:
            continue
        logger.debug('%s engine: Starting background initialization', engine_name)
        worker = threading.Thread(target=_run_init, args=(engine_name, init_fn))
        worker.start()
diff --git a/searx/engines/acgsou.py b/searx/engines/acgsou.py
new file mode 100644
index 000000000..cca28f0db
--- /dev/null
+++ b/searx/engines/acgsou.py
@@ -0,0 +1,75 @@
+"""
+ Acgsou (Japanese Animation/Music/Comics Bittorrent tracker)
+
+ @website https://www.acgsou.com/
+ @provide-api no
+ @using-api no
+ @results HTML
+ @stable no (HTML can change)
+ @parse url, title, content, seed, leech, torrentfile
+"""
+
+from lxml import html
+from searx.engines.xpath import extract_text
+from searx.url_utils import urlencode
+from searx.utils import get_torrent_size, int_or_zero
+
+# engine dependent config
+categories = ['files', 'images', 'videos', 'music']
+paging = True
+
+# search-url
+base_url = 'http://www.acgsou.com/'
+search_url = base_url + 'search.php?{query}&page={offset}'
+# xpath queries
+xpath_results = '//table[contains(@class, "list_style table_fixed")]//tr[not(th)]'
+xpath_category = './/td[2]/a[1]'
+xpath_title = './/td[3]/a[last()]'
+xpath_torrent_links = './/td[3]/a'
+xpath_filesize = './/td[4]/text()'
+
+
def request(query, params):
    """Store the Acgsou search URL for ``query`` in ``params['url']``.

    The query is url-encoded as the ``keyword`` parameter and
    ``params['pageno']`` is used as the page. Returns ``params``.
    """
    params['url'] = search_url.format(query=urlencode({'keyword': query}),
                                      offset=params['pageno'])
    return params
+
+
def response(resp):
    """Parse an Acgsou result page into a list of torrent result dicts.

    Each result row yields a dict with ``url``, ``title``, ``content``
    (the category), ``filesize``, ``magnetlink`` and ``template``.
    Rows without a title link are skipped. A missing category becomes an
    empty string and an unparsable size stays at the default 0, so a single
    odd row can neither crash the page nor inherit a previous row's values.
    """
    results = []
    dom = html.fromstring(resp.text)
    for result in dom.xpath(xpath_results):
        # per-row defaults
        filesize = 0
        category = ''
        magnet_link = "magnet:?xt=urn:btih:{}&tr=http://tracker.acgsou.com:2710/announce"

        category_nodes = result.xpath(xpath_category)
        if category_nodes:
            category = extract_text(category_nodes[0])

        title_nodes = result.xpath(xpath_title)
        if not title_nodes:
            continue
        page_a = title_nodes[0]
        title = extract_text(page_a)
        href = base_url + page_a.attrib.get('href')

        # the href is sliced [5:-5] to obtain the value used as the btih hash
        magnet_link = magnet_link.format(page_a.attrib.get('href')[5:-5])

        try:
            filesize_info = result.xpath(xpath_filesize)[0]
            # last two characters are the unit, the rest is the number
            filesize = get_torrent_size(filesize_info[:-2], filesize_info[-2:])
        except Exception:
            # the size is optional; keep the default when the cell is
            # missing or cannot be converted
            filesize = 0

        # I didn't add download/seed/leech count since as I figured out they
        # are generated randomly every time
        content = u'Category: "{category}".'
        content = content.format(category=category)

        results.append({'url': href,
                        'title': title,
                        'content': content,
                        'filesize': filesize,
                        'magnetlink': magnet_link,
                        'template': 'torrent.html'})
    return results
diff --git a/searx/engines/apkmirror.py b/searx/engines/apkmirror.py
new file mode 100644
index 000000000..f2ee12b29
--- /dev/null
+++ b/searx/engines/apkmirror.py
@@ -0,0 +1,61 @@
+"""
+ APK Mirror
+
+ @website https://www.apkmirror.com
+
+ @using-api no
+ @results HTML
+ @stable no (HTML can change)
+ @parse url, title, thumbnail_src
+"""
+
+from lxml import html
+from searx.engines.xpath import extract_text
+from searx.url_utils import urlencode
+
+# engine dependent config
+categories = ['it']
+paging = True
+
+# I am not 100% certain about this, as apkmirror appears to be a wordpress site,
+# which might support time_range searching. If you want to implement it, go ahead.
+time_range_support = False
+
+# search-url
+base_url = 'https://www.apkmirror.com'
+search_url = base_url + '/?post_type=app_release&searchtype=apk&page={pageno}&{query}'
+
+
+# do search-request
def request(query, params):
    """Store the APK Mirror search URL for ``query`` in ``params['url']``.

    Uses ``params['pageno']`` as the page and url-encodes the query as the
    ``s`` parameter. Returns ``params``.
    """
    page = params['pageno']
    encoded_query = urlencode({'s': query})
    params['url'] = search_url.format(pageno=page, query=encoded_query)
    return params
+
+
+# get response from search-request
def response(resp):
    """Parse an APK Mirror result page into a list of result dicts.

    Every ``appRow`` element yields a dict with ``url`` (the app link plus
    ``#downloads``), ``title`` and ``thumbnail_src`` (the row's first image,
    with the 32px size parameters replaced by 64px ones).
    """
    dom = html.fromstring(resp.text)
    results = []

    row_xpath = './/div[@id="content"]/div[@class="listWidget"]/div[@class="appRow"]'
    for app_row in dom.xpath(row_xpath):
        anchor = app_row.xpath('.//h5/a')[0]
        app_url = base_url + anchor.attrib.get('href') + '#downloads'
        app_title = extract_text(anchor)

        icon_src = app_row.xpath('.//img')[0].attrib.get('src')
        icon_url = base_url + icon_src.replace('&w=32&h=32', '&w=64&h=64')

        results.append({'url': app_url,
                        'title': app_title,
                        'thumbnail_src': icon_url})

    return results
diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py
new file mode 100644
index 000000000..dce862f55
--- /dev/null
+++ b/searx/engines/archlinux.py
@@ -0,0 +1,142 @@
+# -*- coding: utf-8 -*-
+
+"""
+ Arch Linux Wiki
+
+ @website https://wiki.archlinux.org
+ @provide-api no (Mediawiki provides API, but Arch Wiki blocks access to it)
+ @using-api no
+ @results HTML
+ @stable no (HTML can change)
+ @parse url, title
+"""
+
+from lxml import html
+from searx.engines.xpath import extract_text
+from searx.url_utils import urlencode, urljoin
+
+# engine dependent config
+categories = ['it']
+language_support = True
+paging = True
+base_url = 'https://wiki.archlinux.org'
+
+# xpath queries
+xpath_results = '//ul[@class="mw-search-results"]/li'
+xpath_link = './/div[@class="mw-search-result-heading"]/a'
+
+
+# cut 'en' from 'en-US', 'de' from 'de-CH', and so on
def locale_to_lang_code(locale):
    """Return the part of ``locale`` before its first ``-``.

    A locale without a dash is returned unchanged, e.g. ``'en-US'`` gives
    ``'en'`` and ``'de'`` stays ``'de'``.
    """
    return locale.split('-')[0] if '-' in locale else locale
+
+
+# wikis for some languages were moved off from the main site, we need to make
+# requests to correct URLs to be able to get results in those languages
+lang_urls = {
+ 'all': {
+ 'base': 'https://wiki.archlinux.org',
+ 'search': '/index.php?title=Special:Search&offset={offset}&{query}'
+ },
+ 'de': {
+ 'base': 'https://wiki.archlinux.de',
+ 'search': '/index.php?title=Spezial:Suche&offset={offset}&{query}'
+ },
+ 'fr': {
+ 'base': 'https://wiki.archlinux.fr',
+ 'search': '/index.php?title=Spécial:Recherche&offset={offset}&{query}'
+ },
+ 'ja': {
+ 'base': 'https://wiki.archlinuxjp.org',
+ 'search': '/index.php?title=特別:検索&offset={offset}&{query}'
+ },
+ 'ro': {
+ 'base': 'http://wiki.archlinux.ro',
+ 'search': '/index.php?title=Special:Căutare&offset={offset}&{query}'
+ },
+ 'tr': {
+ 'base': 'http://archtr.org/wiki',
+ 'search': '/index.php?title=Özel:Ara&offset={offset}&{query}'
+ }
+}
+
+
+# get base & search URLs for selected language
def get_lang_urls(language):
    """Return the ``lang_urls`` entry for ``language``.

    Languages without their own entry get the ``'all'`` entry.
    """
    try:
        return lang_urls[language]
    except KeyError:
        return lang_urls['all']
+
+
+# Language names to build search requests for
+# those languages which are hosted on the main site.
+main_langs = {
+ 'ar': 'العربية',
+ 'bg': 'Български',
+ 'cs': 'Česky',
+ 'da': 'Dansk',
+ 'el': 'Ελληνικά',
+ 'es': 'Español',
+ 'he': 'עברית',
+ 'hr': 'Hrvatski',
+ 'hu': 'Magyar',
+ 'it': 'Italiano',
+ 'ko': '한국어',
+ 'lt': 'Lietuviškai',
+ 'nl': 'Nederlands',
+ 'pl': 'Polski',
+ 'pt': 'Português',
+ 'ru': 'Русский',
+ 'sl': 'Slovenský',
+ 'th': 'ไทย',
+ 'uk': 'Українська',
+ 'zh': '简体中文'
+}
+supported_languages = dict(lang_urls, **main_langs)
+
+
+# do search-request
def request(query, params):
    """Store the Arch Wiki search URL for ``query`` in ``params['url']``.

    ``query`` is handled as bytes (byte literals are appended to it). For
    languages listed in ``main_langs`` the language name is appended to the
    query in parentheses. The offset is ``(pageno - 1) * 20`` and the
    base/search URL pair comes from ``get_lang_urls``. Returns ``params``.
    """
    # translate the locale (e.g. 'en-US') to language code ('en')
    language = locale_to_lang_code(params['language'])

    # if our language is hosted on the main site, we need to add its name
    # to the query in order to narrow the results to that language
    if language in main_langs:
        lang_name = main_langs[language]
        # the names are text on Python 3; bytes + str raises TypeError there,
        # so encode first (no-op on Python 2, where the literals are bytes)
        if not isinstance(lang_name, bytes):
            lang_name = lang_name.encode('utf-8')
        query += b' (' + lang_name + b')'

    # prepare the request parameters
    query = urlencode({'search': query})
    offset = (params['pageno'] - 1) * 20

    # get request URLs for our language of choice
    urls = get_lang_urls(language)
    search_url = urls['base'] + urls['search']

    params['url'] = search_url.format(query=query, offset=offset)

    return params
+
+
+# get response from search-request
def response(resp):
    """Parse an Arch Wiki search page into a list of ``url``/``title`` dicts.

    Result links are resolved against the base URL of the wiki that serves
    the language stored in ``resp.search_params['language']``.
    """
    # the request language decides which wiki the links are relative to
    lang_code = locale_to_lang_code(resp.search_params['language'])
    wiki_base = get_lang_urls(lang_code)['base']

    results = []
    dom = html.fromstring(resp.text)

    for entry in dom.xpath(xpath_results):
        anchor = entry.xpath(xpath_link)[0]
        target = urljoin(wiki_base, anchor.attrib.get('href'))
        results.append({'url': target,
                        'title': extract_text(anchor)})

    return results
diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py
new file mode 100644
index 000000000..e3c871d17
--- /dev/null
+++ b/searx/engines/arxiv.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+
+"""
+ ArXiV (Scientific preprints)
+ @website https://arxiv.org
+ @provide-api yes (export.arxiv.org/api/query)
+ @using-api yes
+ @results XML-RSS
+ @stable yes
+ @parse url, title, publishedDate, content
+ More info on api: https://arxiv.org/help/api/user-manual
+"""
+
+from lxml import html
+from datetime import datetime
+from searx.url_utils import urlencode
+
+
+categories = ['science']
+paging = True
+
+base_url = 'http://export.arxiv.org/api/query?search_query=all:'\
+ + '{query}&start={offset}&max_results={number_of_results}'
+
+# engine dependent config
+number_of_results = 10
+
+
def request(query, params):
    """Store the arXiv API query URL for ``query`` in ``params['url']``.

    ``query`` is decoded from UTF-8 bytes. The start offset is
    ``(pageno - 1) * number_of_results``. Returns ``params``.
    """
    start = (params['pageno'] - 1) * number_of_results
    params['url'] = base_url.format(query=query.decode('utf-8'),
                                    offset=start,
                                    number_of_results=number_of_results)
    return params
+
+
def response(resp):
    """Parse an arXiv API feed into a list of result dicts.

    Every ``entry`` element yields a dict with ``url`` (the entry id),
    ``title``, ``publishedDate`` (parsed from ``published``) and ``content``.
    The content is the text of the ``doi`` link, if any, followed by the
    summary, cut to 300 characters plus ``...``. Missing DOI or summary
    text contributes an empty string rather than the literal "None".
    """
    results = []

    dom = html.fromstring(resp.content)
    search_results = dom.xpath('//entry')

    for entry in search_results:
        title = entry.xpath('.//title')[0].text

        url = entry.xpath('.//id')[0].text

        abstract = entry.xpath('.//summary')[0].text or ""

        # If a doi is available, add it to the snippet
        doi_links = entry.xpath('.//link[@title="doi"]')
        doi_content = (doi_links[0].text or "") if doi_links else ""

        content = '{doi_content}{abstract_content}'.format(doi_content=doi_content,
                                                            abstract_content=abstract)

        if len(content) > 300:
            content = content[0:300] + "..."
        # TODO: center snippet on query term

        publishedDate = datetime.strptime(entry.xpath('.//published')[0].text, '%Y-%m-%dT%H:%M:%SZ')

        res_dict = {'url': url,
                    'title': title,
                    'publishedDate': publishedDate,
                    'content': content}

        results.append(res_dict)

    return results
diff --git a/searx/engines/base.py b/searx/engines/base.py
new file mode 100755
index 000000000..f1b1cf671
--- /dev/null
+++ b/searx/engines/base.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+
+"""
+ BASE (Scholar publications)
+
+ @website https://base-search.net
+ @provide-api yes with authorization (https://api.base-search.net/)
+
+ @using-api yes
+ @results XML
+ @stable ?
+ @parse url, title, publishedDate, content
+ More info on api: http://base-search.net/about/download/base_interface.pdf
+"""
+
+from lxml import etree
+from datetime import datetime
+import re
+from searx.url_utils import urlencode
+from searx.utils import searx_useragent
+
+
+categories = ['science']
+
+base_url = 'https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi'\
+ + '?func=PerformSearch&{query}&boost=oa&hits={hits}&offset={offset}'
+
+# engine dependent config
+paging = True
+number_of_results = 10
+
+# shortcuts for advanced search
+shorcut_dict = {
+ # user-friendly keywords
+ 'format:': 'dcformat:',
+ 'author:': 'dccreator:',
+ 'collection:': 'dccollection:',
+ 'hdate:': 'dchdate:',
+ 'contributor:': 'dccontributor:',
+ 'coverage:': 'dccoverage:',
+ 'date:': 'dcdate:',
+ 'abstract:': 'dcdescription:',
+ 'urls:': 'dcidentifier:',
+ 'language:': 'dclanguage:',
+ 'publisher:': 'dcpublisher:',
+ 'relation:': 'dcrelation:',
+ 'rights:': 'dcrights:',
+ 'source:': 'dcsource:',
+ 'subject:': 'dcsubject:',
+ 'title:': 'dctitle:',
+ 'type:': 'dcdctype:'
+}
+
+
def request(query, params):
    """Store the BASE API search URL for ``query`` in ``params['url']``.

    User-friendly field prefixes (``author:``, ``title:`` ...) are replaced
    by the API field names from ``shorcut_dict``. The replacement is done in
    one pass and only at word boundaries, so ``hdate:`` is not rewritten a
    second time by the ``date:`` rule and already-prefixed fields such as
    ``dcdate:`` are left alone. Also sets the ``User-Agent`` header.
    Returns ``params``.
    """
    # str() on a bytes query would produce "b'...'" on Python 3; decode it.
    # (On Python 2 bytes is str, so the query is left as it is.)
    if isinstance(query, bytes) and not isinstance(query, str):
        query = query.decode('utf-8')
    query = str(query)

    # replace shortcuts with API advanced search keywords
    shortcut_pattern = r'\b(?:' + '|'.join(re.escape(key) for key in shorcut_dict) + ')'
    query = re.sub(shortcut_pattern, lambda match: shorcut_dict[match.group(0)], query)

    # basic search
    offset = (params['pageno'] - 1) * number_of_results

    string_args = dict(query=urlencode({'query': query}),
                       offset=offset,
                       hits=number_of_results)

    params['url'] = base_url.format(**string_args)

    params['headers']['User-Agent'] = searx_useragent()
    return params
+
+
def response(resp):
    """Parse a BASE API XML response into a list of result dicts.

    Every ``./result/doc`` element yields a dict with ``url`` (``dclink``),
    ``title`` (``dctitle``) and ``content`` (``dcdescription`` cut to 300
    characters, or a placeholder). ``publishedDate`` is added only when
    ``dcdate`` matches one of the known date formats. Documents without a
    link or title are skipped, so values never leak from one document into
    the next.
    """
    results = []

    search_results = etree.XML(resp.content)

    for entry in search_results.xpath('./result/doc'):
        # per-document defaults
        content = "No description available"
        url = None
        title = None
        date = None

        for item in entry:
            field_name = item.attrib.get("name")

            if field_name == "dcdate":
                date = item.text

            elif field_name == "dctitle":
                title = item.text

            elif field_name == "dclink":
                url = item.text

            elif field_name == "dcdescription" and item.text:
                content = item.text[:300]
                if len(item.text) > 300:
                    content += "..."

        if url is None or title is None:
            continue

        # dates returned by the BASE API come in several formats
        publishedDate = None
        if date:
            for date_format in ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d', '%Y-%m', '%Y']:
                try:
                    publishedDate = datetime.strptime(date, date_format)
                    break
                except ValueError:
                    continue

        res_dict = {'url': url,
                    'title': title,
                    'content': content}
        if publishedDate is not None:
            res_dict['publishedDate'] = publishedDate

        results.append(res_dict)

    return results
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
new file mode 100644
index 000000000..ed0b87dbd
--- /dev/null
+++ b/searx/engines/bing.py
@@ -0,0 +1,123 @@
+"""
+ Bing (Web)
+
+ @website https://www.bing.com
+ @provide-api yes (http://datamarket.azure.com/dataset/bing/search),
+ max. 5000 query/month
+
+ @using-api no (because of query limit)
+ @results HTML (using search portal)
+ @stable no (HTML can change)
+ @parse url, title, content
+
+ @todo publishedDate
+"""
+
+import re
+from lxml import html
+from searx import logger, utils
+from searx.engines.xpath import extract_text
+from searx.url_utils import urlencode
+from searx.utils import match_language, gen_useragent, eval_xpath
+
+logger = logger.getChild('bing engine')
+
+# engine dependent config
+categories = ['general']
+paging = True
+language_support = True
+supported_languages_url = 'https://www.bing.com/account/general'
+language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}
+
+# search-url
+base_url = 'https://www.bing.com/'
+search_string = 'search?{query}&first={offset}'
+
+
+def _get_offset_from_pageno(pageno):
+ return (pageno - 1) * 10 + 1
+
+
+# do search-request
def request(query, params):
    """Store the Bing web search URL for ``query`` in ``params['url']``.

    The query (UTF-8 bytes) is prefixed with ``language:XX``, where ``XX``
    is the upper-cased language part of the matched language, or ``EN``
    when the requested language is ``'all'``. Returns ``params``.
    """
    first_result = _get_offset_from_pageno(params.get('pageno', 0))

    requested_language = params['language']
    lang = 'EN' if requested_language == 'all' else \
        match_language(requested_language, supported_languages, language_aliases)

    language_prefix = lang.split('-')[0].upper()
    decoded_query = query.decode('utf-8')
    query = u'language:{} {}'.format(language_prefix, decoded_query).encode('utf-8')

    params['url'] = base_url + search_string.format(query=urlencode({'q': query}),
                                                    offset=first_result)
    return params
+
+
+# get response from search-request
def response(resp):
    """Parse a Bing result page into a list of result dicts.

    Results are collected from both known page layouts (``sa_cc`` blocks
    first, then ``b_algo`` items), each as ``url``/``title``/``content``.
    A final ``{'number_of_results': N}`` entry carries the total parsed from
    the ``sb_count`` element (0 when it cannot be parsed).

    An empty list is returned when the requested page starts beyond the
    reported total. That check only applies when a total was found, so a
    page without the counter no longer loses all of its results.
    """
    results = []
    result_len = 0

    dom = html.fromstring(resp.text)

    # (result container, title link) xpaths of the two page layouts
    layouts = (('//div[@class="sa_cc"]', './/h3/a'),
               ('//li[@class="b_algo"]', './/h2/a'))
    for container_xpath, link_xpath in layouts:
        for result in eval_xpath(dom, container_xpath):
            link = eval_xpath(result, link_xpath)[0]
            url = link.attrib.get('href')
            title = extract_text(link)
            content = extract_text(eval_xpath(result, './/p'))

            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})

    try:
        result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]/text()'))
        result_len_container = utils.to_string(result_len_container)
        if "-" in result_len_container:
            # Remove the part "from-to" for paginated request ...
            result_len_container = result_len_container[result_len_container.find("-") * 2 + 2:]

        result_len_container = re.sub('[^0-9]', '', result_len_container)
        if len(result_len_container) > 0:
            result_len = int(result_len_container)
    except Exception as e:
        # the total is optional; keep result_len = 0 when it cannot be read
        logger.debug('result error :\n%s', e)

    # result_len == 0 means "unknown", not "no results"
    if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
        return []

    results.append({'number_of_results': result_len})
    return results
+
+
+# get supported languages from their site
def _fetch_supported_languages(resp):
    """Return the language codes listed on Bing's settings page.

    Codes are the ``id`` attributes of the inputs inside the
    ``limit-languages`` block, with ``_`` replaced by ``-`` and ``nb``
    mapped to ``no``.
    """
    dom = html.fromstring(resp.text)
    codes = []
    for language_input in eval_xpath(dom, '//div[@id="limit-languages"]//input'):
        code = eval_xpath(language_input, './@id')[0].replace('_', '-')
        codes.append('no' if code == 'nb' else code)
    return codes
diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
new file mode 100644
index 000000000..44e2c3bbc
--- /dev/null
+++ b/searx/engines/bing_images.py
@@ -0,0 +1,124 @@
+"""
+ Bing (Images)
+
+ @website https://www.bing.com/images
+ @provide-api yes (http://datamarket.azure.com/dataset/bing/search),
+ max. 5000 query/month
+
+ @using-api no (because of query limit)
+ @results HTML (using search portal)
+ @stable no (HTML can change)
+ @parse url, title, img_src
+
+"""
+
+from lxml import html
+from json import loads
+import re
+from searx.url_utils import urlencode
+from searx.utils import match_language
+
+# engine dependent config
+categories = ['images']
+paging = True
+safesearch = True
+time_range_support = True
+language_support = True
+supported_languages_url = 'https://www.bing.com/account/general'
+number_of_results = 28
+
+# search-url
+base_url = 'https://www.bing.com/'
+search_string = 'images/search'\
+ '?{query}'\
+ '&count={count}'\
+ '&first={first}'\
+ '&FORM=IBASEP'
+time_range_string = '&qft=+filterui:age-lt{interval}'
+time_range_dict = {'day': '1440',
+ 'week': '10080',
+ 'month': '43200',
+ 'year': '525600'}
+
+# safesearch definitions
+safesearch_types = {2: 'STRICT',
+ 1: 'DEMOTE',
+ 0: 'OFF'}
+
+
+# do search-request
def request(query, params):
    """Prepare a Bing image search request in ``params``.

    Sets ``params['url']`` (with an age filter when ``params['time_range']``
    is a known range) and the ``SRCHHPGUSR`` and ``_EDGE_S`` cookies
    carrying the safe-search level and the lower-cased matched language.
    Returns ``params``.
    """
    first_image = ((params['pageno'] - 1) * number_of_results) + 1

    search_path = search_string.format(query=urlencode({'q': query}),
                                       count=number_of_results,
                                       first=first_image)

    language = match_language(params['language'], supported_languages, language_aliases).lower()

    cookies = params['cookies']
    cookies['SRCHHPGUSR'] = 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
    cookies['_EDGE_S'] = 'mkt={0}&ui={0}&F=1'.format(language)

    params['url'] = base_url + search_path
    interval = time_range_dict.get(params['time_range'])
    if interval is not None:
        params['url'] += time_range_string.format(interval=interval)

    return params
+
+
+# get response from search-request
def response(resp):
    """Parse a Bing image result page into a list of image result dicts.

    Every ``imgpt`` block yields a dict for the ``images.html`` template,
    built from the JSON in the anchor's ``m`` attribute plus the format and
    source texts of the ``img_info`` block. Blocks that lack any of these
    parts are skipped; a single incomplete block no longer aborts the page.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath('//div[@class="imgpt"]'):
        try:
            img_format = result.xpath('./div[contains(@class, "img_info")]/span/text()')[0]
            # Microsoft seems to experiment with this code so don't make the path too specific,
            # just catch the text section for the first anchor in img_info assuming this to be
            # the originating site.
            source = result.xpath('./div[contains(@class, "img_info")]//a/text()')[0]

            m = loads(result.xpath('./a/@m')[0])

            # strip 'Unicode private use area' highlighting, they render to Tux
            # the Linux penguin and a standing diamond on my machine...
            title = m.get('t', '').replace(u'\ue000', '').replace(u'\ue001', '')
            results.append({'template': 'images.html',
                            'url': m['purl'],
                            'thumbnail_src': m['turl'],
                            'img_src': m['murl'],
                            'content': '',
                            'title': title,
                            'source': source,
                            'img_format': img_format})
        except (IndexError, KeyError, ValueError, TypeError, AttributeError):
            # missing node, malformed JSON or unexpected JSON shape
            continue

    return results
+
+
+# get supported languages from their site
def _fetch_supported_languages(resp):
    """Return the market codes linked from Bing's region settings section.

    The code is the value of the ``setmkt`` parameter of every region link;
    ``nb-NO`` is mapped to ``no-NO``. Links without a ``setmkt`` parameter
    are ignored.
    """
    supported_languages = []
    dom = html.fromstring(resp.text)

    regions_xpath = '//div[@id="region-section-content"]' \
                    + '//ul[@class="b_vList"]/li/a/@href'

    for region in dom.xpath(regions_xpath):
        # raw string: the old pattern used the invalid escape "\&"
        match = re.search(r'setmkt=([^&]+)', region)
        if match is None:
            continue

        code = match.group(1)
        if code == 'nb-NO':
            code = 'no-NO'

        supported_languages.append(code)

    return supported_languages
diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py
new file mode 100644
index 000000000..669130c42
--- /dev/null
+++ b/searx/engines/bing_news.py
@@ -0,0 +1,127 @@
+"""
+ Bing (News)
+
+ @website https://www.bing.com/news
+ @provide-api yes (http://datamarket.azure.com/dataset/bing/search),
+ max. 5000 query/month
+
+ @using-api no (because of query limit)
+ @results RSS (using search portal)
+ @stable yes (except perhaps for the images)
+ @parse url, title, content, publishedDate, thumbnail
+"""
+
+from datetime import datetime
+from dateutil import parser
+from lxml import etree
+from searx.utils import list_get, match_language
+from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
+from searx.url_utils import urlencode, urlparse, parse_qsl
+
+# engine dependent config
+categories = ['news']
+paging = True
+language_support = True
+time_range_support = True
+
+# search-url
+base_url = 'https://www.bing.com/'
+search_string = 'news/search?{query}&first={offset}&format=RSS'
+search_string_with_time = 'news/search?{query}&first={offset}&qft=interval%3d"{interval}"&format=RSS'
+time_range_dict = {'day': '7',
+ 'week': '8',
+ 'month': '9'}
+
+
+# remove click
def url_cleanup(url_string):
    """Return the real target of a Bing news click-tracking link.

    For ``https://www.bing.com/news/apiclick.aspx?...&url=<target>`` the
    decoded ``url`` parameter is returned.  Any other URL, and a tracking
    link that carries no ``url`` parameter, is returned unchanged so callers
    never receive ``None``.
    """
    parsed_url = urlparse(url_string)
    if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx':
        query = dict(parse_qsl(parsed_url.query))
        return query.get('url') or url_string
    return url_string
+
+
+# replace the http://*bing4.com/th?id=... by https://www.bing.com/th?id=...
def image_url_cleanup(url_string):
    """Rewrite a ``*bing4.com/th?id=...`` thumbnail URL to ``https://www.bing.com/th?id=...``.

    URLs on other hosts or paths, and thumbnail URLs without an ``id``
    parameter, are returned unchanged.
    """
    parsed_url = urlparse(url_string)
    if parsed_url.netloc.endswith('bing4.com') and parsed_url.path == '/th':
        image_id = dict(parse_qsl(parsed_url.query)).get('id')
        if image_id:
            return "https://www.bing.com/th?id=" + image_id
    return url_string
+
+
def _get_url(query, language, offset, time_range):
    """Build the Bing news RSS URL, adding the interval filter for known time ranges."""
    encoded_query = urlencode({'q': query, 'setmkt': language})

    if time_range not in time_range_dict:
        return base_url + search_string.format(query=encoded_query, offset=offset)

    return base_url + search_string_with_time.format(
        query=encoded_query,
        offset=offset,
        interval=time_range_dict[time_range])
+
+
+# do search-request
def request(query, params):
    """Fill ``params['url']`` for a Bing news search.

    Returns ``params`` untouched when a time range is requested that this
    engine has no interval code for.
    """
    time_range = params['time_range']
    if time_range and time_range not in time_range_dict:
        return params

    # the ``first`` parameter is 1-based, ten results per page
    first = 10 * (params['pageno'] - 1) + 1

    requested = params['language']
    if requested == 'all':
        language = 'en-US'
    else:
        language = match_language(requested, supported_languages, language_aliases)

    params['url'] = _get_url(query, language, first, time_range)
    return params
+
+
+# get response from search-request
def response(resp):
    """Parse the Bing news RSS feed into result dicts.

    Items without a usable link are skipped.  An unparsable or missing
    ``pubDate`` falls back to the current time.  ``img_src`` is only set
    when the item carries a ``News:Image`` element.
    """
    results = []

    rss = etree.fromstring(resp.content)

    # drop the ``None`` key lxml uses for a default namespace: XPath does not
    # accept an empty prefix in the ``namespaces`` mapping
    ns = dict((prefix, uri) for prefix, uri in rss.nsmap.items() if prefix)

    # parse results
    for item in rss.xpath('./channel/item'):
        # url / title / content
        link = list_get(item.xpath('./link/text()'), 0)
        url = url_cleanup(link) if link else None
        if not url:
            continue
        title = list_get(item.xpath('./title/text()'), 0, url)
        content = list_get(item.xpath('./description/text()'), 0, '')

        # publishedDate
        publishedDate = list_get(item.xpath('./pubDate/text()'), 0)
        try:
            publishedDate = parser.parse(publishedDate, dayfirst=False)
        except (TypeError, ValueError):
            publishedDate = datetime.now()

        entry = {'url': url,
                 'title': title,
                 'publishedDate': publishedDate,
                 'content': content}

        # thumbnail (an undeclared ``News`` prefix would make the XPath fail)
        thumbnail = None
        if 'News' in ns:
            thumbnail = list_get(item.xpath('./News:Image/text()', namespaces=ns), 0)
        if thumbnail is not None:
            entry['img_src'] = image_url_cleanup(thumbnail)

        results.append(entry)

    return results
diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py
new file mode 100644
index 000000000..f1e636819
--- /dev/null
+++ b/searx/engines/bing_videos.py
@@ -0,0 +1,94 @@
+"""
+ Bing (Videos)
+
+ @website https://www.bing.com/videos
+ @provide-api yes (http://datamarket.azure.com/dataset/bing/search)
+
+ @using-api no
+ @results HTML
+ @stable no
+ @parse url, title, content, thumbnail
+"""
+
+from json import loads
+from lxml import html
+from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url
+from searx.url_utils import urlencode
+from searx.utils import match_language
+
+
+categories = ['videos']
+paging = True
+safesearch = True
+time_range_support = True
+number_of_results = 28
+language_support = True
+
+base_url = 'https://www.bing.com/'
+search_string = 'videos/search'\
+ '?{query}'\
+ '&count={count}'\
+ '&first={first}'\
+ '&scope=video'\
+ '&FORM=QBLH'
+time_range_string = '&qft=+filterui:videoage-lt{interval}'
+time_range_dict = {'day': '1440',
+ 'week': '10080',
+ 'month': '43200',
+ 'year': '525600'}
+
+# safesearch definitions
+safesearch_types = {2: 'STRICT',
+ 1: 'DEMOTE',
+ 0: 'OFF'}
+
+
+# do search-request
def request(query, params):
    """Fill URL and cookies in ``params`` for a Bing video search."""
    first_result = (params['pageno'] - 1) * number_of_results + 1

    # query and paging
    request_url = base_url + search_string.format(
        query=urlencode({'q': query}),
        count=number_of_results,
        first=first_result)

    cookies = params['cookies']

    # safesearch cookie
    adult_filter = safesearch_types.get(params['safesearch'], 'DEMOTE')
    cookies['SRCHHPGUSR'] = 'ADLT=' + adult_filter

    # language cookie
    market = match_language(params['language'], supported_languages, language_aliases).lower()
    cookies['_EDGE_S'] = 'mkt=' + market + '&F=1'

    params['url'] = request_url

    # time range
    time_range = params['time_range']
    if time_range in time_range_dict:
        params['url'] += time_range_string.format(interval=time_range_dict[time_range])

    return params
+
+
+# get response from search-request
def response(resp):
    """Parse a Bing video result page into video results.

    Each ``div.dg_u`` node is expected to contain a ``vrhdata`` element whose
    ``vrhm`` attribute is a JSON blob (``murl``, ``du``, ``thid``, optional
    ``vt``).  Nodes that do not match are skipped.
    """
    results = []

    dom = html.fromstring(resp.text)

    for result in dom.xpath('//div[@class="dg_u"]'):
        try:
            metadata = loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0])
            info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip()
            content = '{0} - {1}'.format(metadata['du'], info)
            thumbnail = '{0}th?id={1}'.format(base_url, metadata['thid'])
            results.append({'url': metadata['murl'],
                            'thumbnail': thumbnail,
                            'title': metadata.get('vt', ''),
                            'content': content,
                            'template': 'videos.html'})
        except (IndexError, KeyError, ValueError, TypeError):
            # missing node, malformed JSON or missing key: skip this entry
            continue

    return results
diff --git a/searx/engines/btdigg.py b/searx/engines/btdigg.py
new file mode 100644
index 000000000..82eedc24b
--- /dev/null
+++ b/searx/engines/btdigg.py
@@ -0,0 +1,85 @@
+"""
+ BTDigg (Videos, Music, Files)
+
+ @website https://btdig.com
+ @provide-api yes (on demand)
+
+ @using-api no
+ @results HTML (using search portal)
+ @stable no (HTML can change)
+ @parse url, title, content, seed, leech, magnetlink
+"""
+
+from lxml import html
+from operator import itemgetter
+from searx.engines.xpath import extract_text
+from searx.url_utils import quote, urljoin
+from searx.utils import get_torrent_size
+
+# engine dependent config
+categories = ['videos', 'music', 'files']
+paging = True
+
+# search-url
+url = 'https://btdig.com'
+search_url = url + '/search?q={search_term}&p={pageno}'
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` for a BTDigg search; the site counts pages from zero."""
    encoded_term = quote(query)
    page_index = params['pageno'] - 1
    params['url'] = search_url.format(search_term=encoded_term, pageno=page_index)
    return params
+
+
+# get response from search-request
def response(resp):
    """Parse a BTDigg result page into torrent results.

    Entries missing the name link, excerpt, size or magnet link are skipped
    rather than aborting the whole page.  Results are returned in page order
    (no sorting is applied).
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath('//div[@class="one_result"]'):
        try:
            link = result.xpath('.//div[@class="torrent_name"]//a')[0]
            excerpt = result.xpath('.//div[@class="torrent_excerpt"]')[0]
            # size text is "<number> <unit>"; evaluate the XPath only once
            size_parts = result.xpath('.//span[@class="torrent_size"]/text()')[0].split()
            filesize, filesize_multiplier = size_parts[0], size_parts[1]
            magnetlink = result.xpath('.//div[@class="torrent_magnet"]//a')[0].attrib['href']
        except (IndexError, KeyError):
            continue

        href = urljoin(url, link.attrib.get('href'))
        title = extract_text(link)

        content = html.tostring(excerpt, encoding='unicode', method='text', with_tail=False)
        # it is better to emit <br/> instead of |, but html tags are verboten
        content = content.strip().replace('\n', ' | ')
        content = ' '.join(content.split())

        files = (result.xpath('.//span[@class="torrent_files"]/text()') or ['1'])[0]

        # convert filesize to byte if possible
        filesize = get_torrent_size(filesize, filesize_multiplier)

        # convert files to int if possible
        try:
            files = int(files)
        except (TypeError, ValueError):
            files = None

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': content,
                        'filesize': filesize,
                        'files': files,
                        'magnetlink': magnetlink,
                        'template': 'torrent.html'})

    return results
diff --git a/searx/engines/currency_convert.py b/searx/engines/currency_convert.py
new file mode 100644
index 000000000..8eab8f673
--- /dev/null
+++ b/searx/engines/currency_convert.py
@@ -0,0 +1,99 @@
+import json
+import re
+import os
+import sys
+import unicodedata
+
+from io import open
+from datetime import datetime
+
+if sys.version_info[0] == 3:
+ unicode = str
+
+categories = []
+url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}'
+weight = 100
+
+parser_re = re.compile(b'.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)
+
+db = 1
+
+
def normalize_name(name):
    """Turn a UTF-8 encoded currency name into its lookup form.

    Lower-cases, replaces ``-`` with a space, strips trailing ``s``
    characters, collapses runs of spaces and applies NFKD normalization.
    """
    text = name.decode('utf-8').lower()
    text = text.replace('-', ' ').rstrip('s')
    collapsed = re.sub(' +', ' ', text)
    return unicodedata.normalize('NFKD', collapsed).lower()
+
+
def name_to_iso4217(name):
    """Return the first entry stored in ``db['names']`` for ``name``.

    The name is normalized first; when it is unknown the normalized name
    itself is returned.
    """
    key = normalize_name(name)
    candidates = db['names'].get(key, [key])
    return candidates[0]
+
+
def iso4217_to_name(iso4217, language):
    """Look up the ``language`` entry stored for ``iso4217`` in ``db['iso4217']``.

    Falls back to ``iso4217`` itself when the code or the language is unknown.
    """
    translations = db['iso4217'].get(iso4217, {})
    return translations.get(language, iso4217)
+
+
def request(query, params):
    """Prepare a currency conversion request from ``<amount> <from> in|to <to>``.

    When the query does not match that shape ``params`` is returned without
    a URL.  Otherwise the URL and the parsed amount, currencies and English
    currency names are stored in ``params`` for use by ``response``.
    """
    m = parser_re.match(query)
    if not m:
        # wrong query
        return params

    amount, from_currency, to_currency = m.groups()
    amount = float(amount)
    from_currency = name_to_iso4217(from_currency.strip())
    to_currency = name_to_iso4217(to_currency.strip())

    params['url'] = url.format(from_currency, to_currency)
    params['amount'] = amount
    params['from'] = from_currency
    params['to'] = to_currency
    params['from_name'] = iso4217_to_name(from_currency, 'en')
    params['to_name'] = iso4217_to_name(to_currency, 'en')

    return params
+
+
def response(resp):
    """Build the conversion answer from the DuckDuckGo currency response.

    The payload is JSON wrapped in a callback; the first line and the
    closing characters of the last line are cut away before parsing.  An
    empty list is returned when the payload cannot be parsed or lacks the
    converted amount.
    """
    json_resp = resp.text[resp.text.find('\n') + 1:resp.text.rfind('\n') - 2]
    results = []
    try:
        conversion_rate = float(json.loads(json_resp)['conversion']['converted-amount'])
    except (ValueError, KeyError, TypeError):
        # invalid JSON, unexpected structure or non-numeric amount
        return results

    answer = '{0} {1} = {2} {3}, 1 {1} ({5}) = {4} {3} ({6})'.format(
        resp.search_params['amount'],
        resp.search_params['from'],
        resp.search_params['amount'] * conversion_rate,
        resp.search_params['to'],
        conversion_rate,
        resp.search_params['from_name'],
        resp.search_params['to_name'],
    )

    url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}'.format(
        resp.search_params['from'].upper(), resp.search_params['to'])

    results.append({'answer': answer, 'url': url})

    return results
+
+
def load():
    """Load ``../data/currencies.json`` (relative to this module) into the global ``db``."""
    global db

    current_dir = os.path.dirname(os.path.realpath(__file__))
    currencies_path = os.path.join(current_dir, '..', 'data', 'currencies.json')

    # the context manager guarantees the file handle is closed
    with open(currencies_path, 'r', encoding='utf-8') as currencies_file:
        db = json.load(currencies_file)
+
+
+load()
diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py
new file mode 100644
index 000000000..1038e64bf
--- /dev/null
+++ b/searx/engines/dailymotion.py
@@ -0,0 +1,98 @@
+"""
+ Dailymotion (Videos)
+
+ @website https://www.dailymotion.com
+ @provide-api yes (http://www.dailymotion.com/developer)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, thumbnail, publishedDate, embedded
+
+ @todo set content-parameter with correct data
+"""
+
+from json import loads
+from datetime import datetime
+from searx.url_utils import urlencode
+from searx.utils import match_language, html_to_text
+
+# engine dependent config
+categories = ['videos']
+paging = True
+language_support = True
+
+# search-url
+# see http://www.dailymotion.com/doc/api/obj-video.html
+search_url = 'https://api.dailymotion.com/videos?fields=created_time,title,description,duration,url,thumbnail_360_url,id&sort=relevance&limit=5&page={pageno}&{query}' # noqa
+embedded_url = '<iframe frameborder="0" width="540" height="304" ' +\
+ 'data-src="https://www.dailymotion.com/embed/video/{videoid}" allowfullscreen></iframe>'
+
+supported_languages_url = 'https://api.dailymotion.com/languages'
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` for a Dailymotion API search in the matched locale."""
    requested = params['language']
    if requested == 'all':
        locale = 'en-US'
    else:
        locale = match_language(requested, supported_languages)

    api_args = urlencode({'search': query, 'localization': locale})
    params['url'] = search_url.format(query=api_args, pageno=params['pageno'])

    return params
+
+
+# get response from search-request
def response(resp):
    """Convert the Dailymotion API JSON answer into video results.

    Returns an empty list when the answer has no ``list`` key.
    """
    search_res = loads(resp.text)

    # nothing to parse without a result list
    if 'list' not in search_res:
        return []

    results = []
    for video in search_res['list']:
        video_title = video['title']
        video_url = video['url']
        description = html_to_text(video['description'])
        # http to https
        thumbnail = video['thumbnail_360_url']
        published = datetime.fromtimestamp(video['created_time'], None)
        player = embedded_url.format(videoid=video['id'])

        results.append({'template': 'videos.html',
                        'url': video_url,
                        'title': video_title,
                        'content': description,
                        'publishedDate': published,
                        'embedded': player,
                        'thumbnail': thumbnail.replace("http://", "https://")})

    return results
+
+
+# get supported languages from their site
+def _fetch_supported_languages(resp):
+ supported_languages = {}
+
+ response_json = loads(resp.text)
+
+ for language in response_json['list']:
+ supported_languages[language['code']] = {}
+
+ name = language['native_name']
+ if name:
+ supported_languages[language['code']]['name'] = name
+ english_name = language['name']
+ if english_name:
+ supported_languages[language['code']]['english_name'] = english_name
+
+ return supported_languages
diff --git a/searx/engines/deezer.py b/searx/engines/deezer.py
new file mode 100644
index 000000000..af63478fb
--- /dev/null
+++ b/searx/engines/deezer.py
@@ -0,0 +1,67 @@
+"""
+ Deezer (Music)
+
+ @website https://deezer.com
+ @provide-api yes (http://developers.deezer.com/api/)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, content, embedded
+"""
+
+from json import loads
+from searx.url_utils import urlencode
+
+# engine dependent config
+categories = ['music']
+paging = True
+
+# search-url
+url = 'https://api.deezer.com/'
+search_url = url + 'search?{query}&index={offset}'
+
+embedded_url = '<iframe scrolling="no" frameborder="0" allowTransparency="true" ' +\
+ 'data-src="https://www.deezer.com/plugins/player?type=tracks&id={audioid}" ' +\
+ 'width="540" height="80"></iframe>'
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` for a Deezer search; the index advances 25 per page."""
    start_index = 25 * (params['pageno'] - 1)
    encoded_query = urlencode({'q': query})
    params['url'] = search_url.format(query=encoded_query, offset=start_index)
    return params
+
+
+# get response from search-request
def response(resp):
    """Convert Deezer API JSON into results; only entries of type ``track`` are kept."""
    results = []

    for entry in loads(resp.text).get('data', []):
        if entry['type'] != 'track':
            continue

        title = entry['title']
        link = entry['link']

        # upgrade plain http links to https
        if link.startswith('http://'):
            link = 'https' + link[4:]

        description = u'{} - {} - {}'.format(
            entry['artist']['name'],
            entry['album']['title'],
            entry['title'])

        player = embedded_url.format(audioid=entry['id'])

        results.append({'url': link,
                        'title': title,
                        'embedded': player,
                        'content': description})

    return results
diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py
new file mode 100644
index 000000000..a0e27e622
--- /dev/null
+++ b/searx/engines/deviantart.py
@@ -0,0 +1,81 @@
+"""
+ Deviantart (Images)
+
+ @website https://www.deviantart.com/
+ @provide-api yes (https://www.deviantart.com/developers/) (RSS)
+
+ @using-api no (TODO, rewrite to api)
+ @results HTML
+ @stable no (HTML can change)
+ @parse url, title, thumbnail_src, img_src
+
+ @todo rewrite to api
+"""
+
+from lxml import html
+import re
+from searx.engines.xpath import extract_text
+from searx.url_utils import urlencode
+
+# engine dependent config
+categories = ['images']
+paging = True
+time_range_support = True
+
+# search-url
+base_url = 'https://www.deviantart.com/'
+search_url = base_url + 'search?page={page}&{query}'
+time_range_url = '&order={range}'
+
+time_range_dict = {'day': 11,
+ 'week': 14,
+ 'month': 15}
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` for a DeviantArt search.

    Returns ``params`` untouched when an unsupported time range is requested.
    """
    time_range = params['time_range']
    if time_range and time_range not in time_range_dict:
        return params

    target = search_url.format(page=params['pageno'],
                               query=urlencode({'q': query}))
    if time_range in time_range_dict:
        target += time_range_url.format(range=time_range_dict[time_range])

    params['url'] = target
    return params
+
+
+# get response from search-request
def response(resp):
    """Parse a DeviantArt search page into image results.

    Returns an empty list on a 302 redirect.  Cells without a deviation
    link, an image, an ``href`` or a ``src`` are skipped.
    """
    results = []

    # return empty array if a redirection code is returned
    if resp.status_code == 302:
        return []

    dom = html.fromstring(resp.text)

    # parse results
    for row in dom.xpath('//div[contains(@data-hook, "content_row")]'):
        for result in row.xpath('./div'):
            links = result.xpath('.//a[@data-hook="deviation_link"]')
            images = result.xpath('.//img')
            if not links or not images:
                continue

            url = links[0].attrib.get('href')
            title = links[0].attrib.get('title')
            thumbnail_src = images[0].attrib.get('src')
            if not url or not thumbnail_src:
                continue
            img_src = thumbnail_src

            # http to https, remove domain sharding
            # (the dot after the shard number is escaped so only a literal "." matches)
            thumbnail_src = re.sub(r"https?://(th|fc)\d+\.", "https://th01.", thumbnail_src)
            thumbnail_src = re.sub(r"http://", "https://", thumbnail_src)

            url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url)

            # append result
            results.append({'url': url,
                            'title': title,
                            'img_src': img_src,
                            'thumbnail_src': thumbnail_src,
                            'template': 'images.html'})

    return results
diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py
new file mode 100644
index 000000000..423af0971
--- /dev/null
+++ b/searx/engines/dictzone.py
@@ -0,0 +1,68 @@
+"""
+ Dictzone
+
+ @website https://dictzone.com/
+ @provide-api no
+ @using-api no
+ @results HTML (using search portal)
+ @stable no (HTML can change)
+ @parse url, title, content
+"""
+
+import re
+from lxml import html
+from searx.utils import is_valid_lang, eval_xpath
+from searx.url_utils import urljoin
+
+categories = ['general']
+url = u'https://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}'
+weight = 100
+
+parser_re = re.compile(b'.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I)
+results_xpath = './/table[@id="r"]/tr'
+
+
def request(query, params):
    """Set ``params['url']`` for a ``<from>-<to> <word>`` dictionary query.

    ``params`` is returned without a URL when the query has another shape or
    either language is not recognised.
    """
    match = parser_re.match(query)
    if match is None:
        return params

    source_name, target_name, term = match.groups()

    from_lang = is_valid_lang(source_name)
    to_lang = is_valid_lang(target_name)
    if not (from_lang and to_lang):
        return params

    params['url'] = url.format(from_lang=from_lang[2],
                               to_lang=to_lang[2],
                               query=term.decode('utf-8'))
    return params
+
+
def response(resp):
    """Parse the Dictzone result table into one result per source term.

    The first table row is skipped.  Rows that do not consist of exactly two
    cells (source term, translations) are ignored.  Each result URL gets a
    ``?<index>`` suffix.
    """
    results = []

    dom = html.fromstring(resp.text)

    for k, result in enumerate(eval_xpath(dom, results_xpath)[1:]):
        try:
            from_result, to_results_raw = eval_xpath(result, './td')
        except ValueError:
            # not exactly two cells in this row
            continue

        to_results = []
        for to_result in eval_xpath(to_results_raw, './p/a'):
            t = to_result.text_content()
            if t.strip():
                to_results.append(t)

        results.append({
            'url': urljoin(resp.url, '?%d' % k),
            'title': from_result.text_content(),
            'content': '; '.join(to_results)
        })

    return results
diff --git a/searx/engines/digbt.py b/searx/engines/digbt.py
new file mode 100644
index 000000000..ff2f94593
--- /dev/null
+++ b/searx/engines/digbt.py
@@ -0,0 +1,62 @@
+"""
+ DigBT (Videos, Music, Files)
+
+ @website https://digbt.org
+ @provide-api no
+
+ @using-api no
+ @results HTML (using search portal)
+ @stable no (HTML can change)
+ @parse url, title, content, magnetlink
+"""
+
+from sys import version_info
+from lxml import html
+from searx.engines.xpath import extract_text
+from searx.utils import get_torrent_size
+from searx.url_utils import urljoin
+
+if version_info[0] == 3:
+ unicode = str
+
+categories = ['videos', 'music', 'files']
+paging = True
+
+URL = 'https://digbt.org'
+SEARCH_URL = URL + '/search/{query}-time-{pageno}'
+FILESIZE = 3
+FILESIZE_MULTIPLIER = 4
+
+
def request(query, params):
    """Set ``params['url']`` for a DigBT search.

    The query is percent-encoded before it is placed in the URL path, so
    spaces and reserved characters cannot break the path and a bytes query
    is not rendered as ``b'...'`` on Python 3.
    """
    # imported locally because this module only imports urljoin at file level
    from searx.url_utils import quote

    params['url'] = SEARCH_URL.format(query=quote(query), pageno=params['pageno'])

    return params
+
+
def response(resp):
    """Parse a DigBT result page into torrent results.

    Entries lacking a title link, a magnet link or the size fields in the
    ``tail`` block are skipped instead of aborting the whole page.
    """
    dom = html.fromstring(resp.text)
    results = list()

    for result in dom.xpath('.//td[@class="x-item"]'):
        try:
            url = urljoin(URL, result.xpath('.//a[@title]/@href')[0])
            files_data = extract_text(result.xpath('.//div[@class="tail"]')).split()
            size_value = files_data[FILESIZE]
            size_unit = files_data[FILESIZE_MULTIPLIER]
            magnetlink = result.xpath('.//div[@class="tail"]//a[@class="title"]/@href')[0]
        except IndexError:
            continue

        title = extract_text(result.xpath('.//a[@title]'))
        content = extract_text(result.xpath('.//div[@class="files"]'))
        filesize = get_torrent_size(size_value, size_unit)

        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'filesize': filesize,
                        'magnetlink': magnetlink,
                        'seed': 'N/A',
                        'leech': 'N/A',
                        'template': 'torrent.html'})

    return results
diff --git a/searx/engines/digg.py b/searx/engines/digg.py
new file mode 100644
index 000000000..073410eb0
--- /dev/null
+++ b/searx/engines/digg.py
@@ -0,0 +1,69 @@
+"""
+ Digg (News, Social media)
+
+ @website https://digg.com/
+ @provide-api no
+
+ @using-api no
+ @results HTML (using search portal)
+ @stable no (HTML can change)
+ @parse url, title, content, publishedDate, thumbnail
+"""
+
+import random
+import string
+from dateutil import parser
+from json import loads
+from lxml import html
+from searx.url_utils import urlencode
+from datetime import datetime
+
+# engine dependent config
+categories = ['news', 'social media']
+paging = True
+
+# search-url
+base_url = 'https://digg.com/'
+search_url = base_url + 'api/search/?{query}&from={position}&size=20&format=html'
+
+# specific xpath variables
+results_xpath = '//article'
+link_xpath = './/small[@class="time"]//a'
+title_xpath = './/h2//a//text()'
+content_xpath = './/p//text()'
+pubdate_xpath = './/time'
+
+digg_cookie_chars = string.ascii_uppercase + string.ascii_lowercase +\
+ string.digits + "+_"
+
+
+# do search-request
def request(query, params):
    """Set URL and a random 22-character ``frontend.auid`` cookie for a Digg search."""
    position = 20 * (params['pageno'] - 1)
    params['url'] = search_url.format(position=position,
                                      query=urlencode({'q': query}))

    auid_chars = [random.choice(digg_cookie_chars) for _ in range(22)]
    params['cookies']['frontend.auid'] = ''.join(auid_chars)
    return params
+
+
+# get response from search-request
def response(resp):
    """Convert the ``mapped`` entries of the Digg JSON answer into results."""
    entries = loads(resp.text)['mapped']

    results = []
    for entry in entries:
        created = datetime.strptime(entry['created']['ISO'], "%Y-%m-%d %H:%M:%S")
        item = {'url': entry['url'],
                'title': entry['title'],
                'content': entry['excerpt'],
                'template': 'videos.html',
                'publishedDate': created,
                'thumbnail': entry['images']['thumbImage']}
        results.append(item)

    return results
diff --git a/searx/engines/doku.py b/searx/engines/doku.py
new file mode 100644
index 000000000..d20e66026
--- /dev/null
+++ b/searx/engines/doku.py
@@ -0,0 +1,85 @@
+# Doku Wiki
+#
+# @website https://www.dokuwiki.org/
+# @provide-api yes
+# (https://www.dokuwiki.org/devel:xmlrpc)
+#
+# @using-api no
+# @results HTML
+# @stable yes
+# @parse (general) url, title, content
+
+from lxml.html import fromstring
+from searx.engines.xpath import extract_text
+from searx.utils import eval_xpath
+from searx.url_utils import urlencode
+
+# engine dependent config
+categories = ['general'] # TODO , 'images', 'music', 'videos', 'files'
+paging = False
+language_support = False
+number_of_results = 5
+
+# search-url
+# Doku is OpenSearch compatible
+base_url = 'http://localhost:8090'
+search_url = '/?do=search'\
+ '&{query}'
+# TODO '&startRecord={offset}'\
+# TODO '&maximumRecords={limit}'\
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` for a DokuWiki search; the query goes in the ``id`` parameter."""
    search_path = search_url.format(query=urlencode({'id': query}))
    params['url'] = base_url + search_path
    return params
+
+
+# get response from search-request
def response(resp):
    """Parse a DokuWiki search page into results.

    Two sections are read: the "quick hits" list (title and link only) and
    the full-text result list, where a ``<dt>`` holds the link and the
    following ``<dd>`` holds the snippet.
    """
    results = []

    doc = fromstring(resp.text)

    # parse results
    # Quickhits
    for r in eval_xpath(doc, '//div[@class="search_quickresult"]/ul/li'):
        try:
            res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
        except IndexError:
            continue

        if not res_url:
            continue

        title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))

        # append result
        results.append({'title': title,
                        'content': "",
                        'url': base_url + res_url})

    # Search results
    # start with no pending link so a <dd> is never paired with a link from
    # the quick hits or from an earlier, unrelated <dt>
    res_url = None
    title = None
    for r in eval_xpath(doc, '//dl[@class="search_results"]/*'):
        if r.tag == "dt":
            hrefs = eval_xpath(r, './/a[@class="wikilink1"]/@href')
            res_url = hrefs[-1] if hrefs else None
            title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
        elif r.tag == "dd" and res_url:
            content = extract_text(eval_xpath(r, '.'))

            # append result
            results.append({'title': title,
                            'content': content,
                            'url': base_url + res_url})

    # return results
    return results
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
new file mode 100644
index 000000000..0d2c0af2d
--- /dev/null
+++ b/searx/engines/duckduckgo.py
@@ -0,0 +1,143 @@
+"""
+ DuckDuckGo (Web)
+
+ @website https://duckduckgo.com/
+ @provide-api yes (https://duckduckgo.com/api),
+ but not all results from search-site
+
+ @using-api no
+ @results HTML (using search portal)
+ @stable no (HTML can change)
+ @parse url, title, content
+
+ @todo rewrite to api
+"""
+
+from lxml.html import fromstring
+from json import loads
+from searx.engines.xpath import extract_text
+from searx.poolrequests import get
+from searx.url_utils import urlencode
+from searx.utils import match_language, eval_xpath
+
+# engine dependent config
+categories = ['general']
+paging = True
+language_support = True
+supported_languages_url = 'https://duckduckgo.com/util/u172.js'
+time_range_support = True
+
+language_aliases = {
+ 'ar-SA': 'ar-XA',
+ 'es-419': 'es-XL',
+ 'ja': 'jp-JP',
+ 'ko': 'kr-KR',
+ 'sl-SI': 'sl-SL',
+ 'zh-TW': 'tzh-TW',
+ 'zh-HK': 'tzh-HK'
+}
+
+# search-url
+url = 'https://duckduckgo.com/html?{query}&s={offset}&dc={dc_param}'
+time_range_url = '&df={range}'
+
+time_range_dict = {'day': 'd',
+ 'week': 'w',
+ 'month': 'm'}
+
+# specific xpath variables
+result_xpath = '//div[@class="result results_links results_links_deep web-result "]' # noqa
+url_xpath = './/a[@class="result__a"]/@href'
+title_xpath = './/a[@class="result__a"]'
+content_xpath = './/a[@class="result__snippet"]'
+
+
+# match query's language to a region code that duckduckgo will accept
def get_region_code(lang, lang_list=None):
    """Translate a language code into DuckDuckGo's ``<country>-<language>`` region code.

    Returns ``None`` for ``'all'``.  ``lang_list`` is the list of supported
    language codes; it defaults to an empty list.  Unmatched languages fall
    back to ``wt-WT``.
    """
    if lang == 'all':
        return None

    # a fresh list per call instead of a shared mutable default argument
    if lang_list is None:
        lang_list = []

    lang_code = match_language(lang, lang_list, language_aliases, 'wt-WT')
    lang_parts = lang_code.split('-')

    # country code goes first
    return lang_parts[1].lower() + '-' + lang_parts[0].lower()
+
+
+def request(query, params):
+ # Build the DuckDuckGo HTML search request: page 1 is a plain URL, later
+ # pages are sent as a POST form.
+ # A time range that is set but has no entry in time_range_dict leaves
+ # params unchanged (no URL is set).
+ if params['time_range'] not in (None, 'None', '') and params['time_range'] not in time_range_dict:
+ return params
+
+ # 30 results per page
+ offset = (params['pageno'] - 1) * 30
+
+ # region_code is None when the language is 'all'
+ region_code = get_region_code(params['language'], supported_languages)
+ params['url'] = 'https://duckduckgo.com/html/'
+ if params['pageno'] > 1:
+ # follow-up pages: query, offset and filters travel in the form data
+ params['method'] = 'POST'
+ params['data']['q'] = query
+ params['data']['s'] = offset
+ params['data']['dc'] = 30
+ params['data']['nextParams'] = ''
+ params['data']['v'] = 'l'
+ params['data']['o'] = 'json'
+ params['data']['api'] = '/d.js'
+ if params['time_range'] in time_range_dict:
+ params['data']['df'] = time_range_dict[params['time_range']]
+ if region_code:
+ params['data']['kl'] = region_code
+ else:
+ # first page: everything is encoded into the URL; 'kl' is only added
+ # when a region code is available
+ if region_code:
+ params['url'] = url.format(
+ query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
+ else:
+ params['url'] = url.format(
+ query=urlencode({'q': query}), offset=offset, dc_param=offset)
+
+ # NOTE(review): indentation is lost in this rendering; presumably this
+ # suffix belongs to the first-page branch only, since the POST branch
+ # already sends 'df' in the form data - confirm against the real file
+ if params['time_range'] in time_range_dict:
+ params['url'] += time_range_url.format(range=time_range_dict[params['time_range']])
+
+ return params
+
+
+# get response from search-request
def response(resp):
    """Parse a DuckDuckGo HTML result page into at most 30 results.

    Result nodes without a link are skipped.
    """
    results = []

    doc = fromstring(resp.text)

    # parse results
    for i, r in enumerate(eval_xpath(doc, result_xpath)):
        if i >= 30:
            break
        try:
            res_url = eval_xpath(r, url_xpath)[-1]
        except IndexError:
            # no link inside this result node
            continue

        if not res_url:
            continue

        title = extract_text(eval_xpath(r, title_xpath))
        content = extract_text(eval_xpath(r, content_xpath))

        # append result
        results.append({'title': title,
                        'content': content,
                        'url': res_url})

    # return results
    return results
+
+
+# get supported languages from their site
+def _fetch_supported_languages(resp):
+
+ # response is a js file with regions as an embedded object
+ response_page = resp.text
+ response_page = response_page[response_page.find('regions:{') + 8:]
+ response_page = response_page[:response_page.find('}') + 1]
+
+ regions_json = loads(response_page)
+ supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())
+
+ return list(supported_languages)
diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py
new file mode 100644
index 000000000..79d10c303
--- /dev/null
+++ b/searx/engines/duckduckgo_definitions.py
@@ -0,0 +1,171 @@
+"""
+DuckDuckGo (definitions)
+
+- `Instant Answer API`_
+- `DuckDuckGo query`_
+
+.. _Instant Answer API: https://duckduckgo.com/api
+.. _DuckDuckGo query: https://api.duckduckgo.com/?q=DuckDuckGo&format=json&pretty=1
+
+"""
+
+import json
+from lxml import html
+from re import compile
+from searx.engines.xpath import extract_text
+from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
+from searx.url_utils import urlencode
+from searx.utils import html_to_text, match_language
+
+url = 'https://api.duckduckgo.com/'\
+ + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
+
+http_regex = compile(r'^http:')
+
+
def result_to_text(url, text, htmlResult):
    """Return the text of the first link in ``htmlResult``.

    Falls back to ``text`` when ``htmlResult`` is empty/None or contains no
    ``<a>`` element.  ``url`` is accepted for interface compatibility but is
    not used.
    """
    # TODO : remove result ending with "Meaning" or "Category"
    if not htmlResult:
        # callers pass .get('Result', None); parsing None or '' would raise
        return text

    dom = html.fromstring(htmlResult)
    links = dom.xpath('//a')
    if links:
        return extract_text(links[0])
    return text
+
+
def request(query, params):
    """Fill ``params`` for an Instant Answer API call.

    Sets the request URL and an ``Accept-Language`` header holding only the
    language part (text before the first '-') of the matched locale.
    """
    encoded_query = urlencode({'q': query})
    params['url'] = url.format(query=encoded_query)

    matched = match_language(params['language'], supported_languages, language_aliases)
    params['headers']['Accept-Language'] = matched.split('-')[0]

    return params
+
+
def response(resp):
    """Convert a DuckDuckGo Instant Answer JSON document into searx results.

    Produces, in this order: an optional ``answer``, one plain result per
    entry of ``Results``, ``suggestion`` entries taken from flat
    ``RelatedTopics``, and finally (when a heading exists) either a plain
    result or an ``infobox`` result.

    :param resp: response object; only ``resp.text`` is read.
    :returns: list of result dicts.
    """
    results = []

    search_res = json.loads(resp.text)

    content = ''
    heading = search_res.get('Heading', '')
    attributes = []
    urls = []
    infobox_id = None
    relatedTopics = []

    # add answer if there is one
    answer = search_res.get('Answer', '')
    if answer:
        if search_res.get('AnswerType', '') not in ['calc']:
            results.append({'answer': html_to_text(answer)})

    # add infobox ('or' guards against explicit null values)
    if 'Definition' in search_res:
        content = content + (search_res.get('Definition') or '')

    if 'Abstract' in search_res:
        content = content + (search_res.get('Abstract') or '')

    # image
    image = search_res.get('Image', '')
    image = None if image == '' else image

    # attributes
    infobox = search_res.get('Infobox')
    # the field may be a dict, an empty string or null; only a dict has content
    if isinstance(infobox, dict):
        for info in infobox.get('content') or []:
            attributes.append({'label': info.get('label'),
                               'value': info.get('value')})

    # urls
    for ddg_result in search_res.get('Results', []):
        if 'FirstURL' in ddg_result:
            firstURL = ddg_result.get('FirstURL', '')
            text = ddg_result.get('Text', '')
            urls.append({'title': text, 'url': firstURL})
            results.append({'title': heading, 'url': firstURL})

    # related topics
    for ddg_result in search_res.get('RelatedTopics', []):
        if 'FirstURL' in ddg_result:
            suggestion = result_to_text(ddg_result.get('FirstURL', None),
                                        ddg_result.get('Text', None),
                                        ddg_result.get('Result', None))
            if suggestion != heading:
                results.append({'suggestion': suggestion})
        elif 'Topics' in ddg_result:
            # grouped topics: the list is filled below, after being attached
            suggestions = []
            relatedTopics.append({'name': ddg_result.get('Name', ''),
                                  'suggestions': suggestions})
            for topic_result in ddg_result.get('Topics', []):
                suggestion = result_to_text(topic_result.get('FirstURL', None),
                                            topic_result.get('Text', None),
                                            topic_result.get('Result', None))
                if suggestion != heading:
                    suggestions.append(suggestion)

    # abstract
    abstractURL = search_res.get('AbstractURL', '')
    if abstractURL != '':
        # add as result ? problem always in english
        infobox_id = abstractURL
        urls.append({'title': search_res.get('AbstractSource'),
                     'url': abstractURL})

    # definition (takes precedence over the abstract URL as infobox id)
    definitionURL = search_res.get('DefinitionURL', '')
    if definitionURL != '':
        # add as result ? as answer ? problem always in english
        infobox_id = definitionURL
        urls.append({'title': search_res.get('DefinitionSource'),
                     'url': definitionURL})

    # to merge with wikidata's infobox
    if infobox_id:
        infobox_id = http_regex.sub('https:', infobox_id)

    # entity
    entity = search_res.get('Entity', None)
    # TODO continent / country / department / location / waterfall /
    #      mountain range :
    #      link to map search, get weather, near by locations
    # TODO musician : link to music search
    # TODO concert tour : ??
    # TODO film / actor / television / media franchise :
    #      links to IMDB / rottentomatoes (or scrape result)
    # TODO music : link to musicbrainz / last.fm
    # TODO book : ??
    # TODO artist / playwright : ??
    # TODO company : ??
    # TODO software / os : ??
    # TODO software engineer : ??
    # TODO prepared food : ??
    # TODO website : ??
    # TODO performing art : ??
    # TODO programming language : ??
    # TODO file format : ??

    if len(heading) > 0:
        # TODO get infobox.meta.value where .label='article_title'
        if image is None and len(attributes) == 0 and len(urls) == 1 and\
           len(relatedTopics) == 0 and len(content) == 0:
            # nothing but a single link: emit a plain result, not an infobox
            results.append({
                'url': urls[0]['url'],
                'title': heading,
                'content': content
            })
        else:
            results.append({
                'infobox': heading,
                'id': infobox_id,
                'entity': entity,
                'content': content,
                'img_src': image,
                'attributes': attributes,
                'urls': urls,
                'relatedTopics': relatedTopics
            })

    return results
diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py
new file mode 100644
index 000000000..89924b71c
--- /dev/null
+++ b/searx/engines/duckduckgo_images.py
@@ -0,0 +1,97 @@
+"""
+ DuckDuckGo (Images)
+
+ @website https://duckduckgo.com/
+ @provide-api yes (https://duckduckgo.com/api),
+ but images are not supported
+
+ @using-api no
+ @results JSON (site requires js to get images)
+ @stable no (JSON can change)
+ @parse url, title, img_src
+
+ @todo avoid extra request
+"""
+
+from json import loads
+from searx.engines.xpath import extract_text
+from searx.engines.duckduckgo import (
+ _fetch_supported_languages, supported_languages_url,
+ get_region_code, language_aliases
+)
+from searx.poolrequests import get
+from searx.url_utils import urlencode
+
+# engine dependent config
+categories = ['images']
+paging = True
+language_support = True
+safesearch = True
+
+# search-url
+images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}'
+site_url = 'https://duckduckgo.com/?{query}&iar=images&iax=1&ia=images'
+
+
+# run query in site to get vqd number needed for requesting images
+# TODO: find a way to get this number without an extra request (is it a hash of the query?)
def get_vqd(query, headers):
    """Fetch the DuckDuckGo site page for ``query`` and return its vqd token.

    The token is the text between ``vqd='`` and the next single quote.

    :raises Exception: ('Request failed') when the page holds no ``vqd='``.
    """
    marker = 'vqd=\''
    page = get(site_url.format(query=urlencode({'q': query})), headers=headers).text

    start = page.find(marker)
    if start == -1:
        raise Exception('Request failed')

    remainder = page[start + 5:]
    return remainder[:remainder.find('\'')]
+
+
+# do search-request
def request(query, params):
    """Build the image-search URL, including the vqd token and paging offset."""
    # to avoid running actual external requests when testing
    if 'is_test' in params:
        vqd = '12345'
    else:
        vqd = get_vqd(query, params['headers'])

    offset = (params['pageno'] - 1) * 50
    safesearch = params['safesearch'] - 1

    # the region is only sent when one could be derived from the language
    query_args = {'q': query}
    region_code = get_region_code(params['language'], lang_list=supported_languages)
    if region_code:
        query_args['l'] = region_code

    params['url'] = images_url.format(
        query=urlencode(query_args), offset=offset, safesearch=safesearch, vqd=vqd)

    return params
+
+
+# get response from search-request
def response(resp):
    """Turn the image-search JSON payload into searx image results.

    :param resp: response object; only ``resp.text`` is read.
    :returns: list of dicts using the ``images.html`` template.
    :raises Exception: ('Cannot parse results') when the body is not JSON.
    """
    try:
        res_json = loads(resp.text)
    except (ValueError, TypeError):
        # narrowed from a bare except; the raised exception is unchanged
        raise Exception('Cannot parse results')

    # parse results
    return [{'template': 'images.html',
             'title': result['title'],
             'content': '',
             'thumbnail_src': result['thumbnail'],
             'img_src': result['image'],
             'url': result['url']}
            for result in res_json['results']]
diff --git a/searx/engines/duden.py b/searx/engines/duden.py
new file mode 100644
index 000000000..cf2f1a278
--- /dev/null
+++ b/searx/engines/duden.py
@@ -0,0 +1,80 @@
+"""
+ Duden
+ @website https://www.duden.de
+ @provide-api no
+ @using-api no
+ @results HTML (using search portal)
+ @stable no (HTML can change)
+ @parse url, title, content
+"""
+
+from lxml import html, etree
+import re
+from searx.engines.xpath import extract_text
+from searx.utils import eval_xpath
+from searx.url_utils import quote, urljoin
+from searx import logger
+
categories = ['general']
paging = True
language_support = False

# search-url
base_url = 'https://www.duden.de/'
search_url = base_url + 'suchen/dudenonline/{query}?search_api_fulltext=&page={offset}'


def request(query, params):
    '''pre-request callback

    Sets ``params['url']``.  The first page uses the bare search path; later
    pages add the zero-based ``page`` parameter (pageno - 1).

    params<dict>:
      method  : POST/GET
      headers : {}
      data    : {} # if method == POST
      url     : ''
      category: 'search category'
      pageno  : 1 # number of the requested page
    '''
    page_index = params['pageno'] - 1
    quoted_query = quote(query)

    if page_index == 0:
        first_page_fmt = base_url + 'suchen/dudenonline/{query}'
        params['url'] = first_page_fmt.format(query=quoted_query)
    else:
        params['url'] = search_url.format(offset=page_index, query=quoted_query)

    return params
+
+
def response(resp):
    '''post-response callback

    Parses the Duden search page.  The total number of results is added when
    it can be read; each non-"essay" ``<section>`` becomes one result.
    Parsing problems are logged at debug level and skipped.

    resp: requests response object
    '''
    results = []

    dom = html.fromstring(resp.text)

    try:
        number_of_results_string =\
            re.sub('[^0-9]', '',
                   eval_xpath(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0])

        results.append({'number_of_results': int(number_of_results_string)})

    except Exception:
        # best effort: the counter is optional (IndexError / ValueError are
        # the expected failures); don't trap SystemExit/KeyboardInterrupt
        logger.debug("Couldn't read number of results.")

    for result in eval_xpath(dom, '//section[not(contains(@class, "essay"))]'):
        try:
            url = eval_xpath(result, './/h2/a')[0].get('href')
            url = urljoin(base_url, url)
            title = eval_xpath(result, 'string(.//h2/a)').strip()
            content = extract_text(eval_xpath(result, './/p'))
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})
        except Exception:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    return results
diff --git a/searx/engines/dummy.py b/searx/engines/dummy.py
new file mode 100644
index 000000000..50b56ef78
--- /dev/null
+++ b/searx/engines/dummy.py
@@ -0,0 +1,16 @@
+"""
+ Dummy
+
+ @results empty array
+ @stable yes
+"""
+
+
+# do search-request
def request(query, params):
    """Return ``params`` untouched; the dummy engine sends no request."""
    return params
+
+
+# get response from search-request
def response(resp):
    """Return an empty result list regardless of ``resp``."""
    return list()
diff --git a/searx/engines/faroo.py b/searx/engines/faroo.py
new file mode 100644
index 000000000..a36ecf778
--- /dev/null
+++ b/searx/engines/faroo.py
@@ -0,0 +1,96 @@
+"""
+ Faroo (Web, News)
+
+ @website http://www.faroo.com
+ @provide-api yes (http://www.faroo.com/hp/api/api.html), require API-key
+
+ @using-api no
+ @results JSON
+ @stable yes
+ @parse url, title, content, publishedDate, img_src
+"""
+
+from json import loads
+import datetime
+from searx.utils import searx_useragent
+from searx.url_utils import urlencode
+
+# engine dependent config
categories = ['general', 'news']
paging = True
language_support = True
number_of_results = 10

# search-url
url = 'http://www.faroo.com/'
search_url = url + 'instant.json?{query}'\
    '&start={offset}'\
    '&length={number_of_results}'\
    '&l={language}'\
    '&src={categorie}'\
    '&i=false'\
    '&c=false'

search_category = {'general': 'web',
                   'news': 'news'}


# do search-request
def request(query, params):
    """Build the Faroo request URL and set the ``Referer`` header.

    ``start`` is 1-based.  Only 'en', 'de' and 'zh' are passed through as
    language; everything else (including 'all') is sent as 'en'.  Unknown
    categories map to 'web'.
    """
    offset = (params['pageno'] - 1) * number_of_results + 1
    categorie = search_category.get(params['category'], 'web')

    requested = params['language']
    language = 'en' if requested == 'all' else requested.split('-')[0]

    # if language is not supported, put it in english
    if language not in ('en', 'de', 'zh'):
        language = 'en'

    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      offset=offset,
                                      number_of_results=number_of_results,
                                      language=language,
                                      categorie=categorie)

    params['headers']['Referer'] = url

    return params
+
+
+# get response from search-request
def response(resp):
    """Convert a Faroo JSON response into searx results.

    :param resp: response object; ``status_code`` and ``text`` are read.
    :returns: list of result dicts; news items get a ``publishedDate`` and
        items with an image URL use the ``videos.html`` template.
    :raises Exception: when the HTTP status is 429 (rate limit exceeded).
    """
    # HTTP-Code 429: rate limit exceeded
    if resp.status_code == 429:
        raise Exception("rate limit has been exceeded!")

    search_res = loads(resp.text)

    results = []

    # an absent or empty 'results' entry simply yields no results
    for result in search_res.get('results') or []:
        result_json = {'url': result['url'], 'title': result['title'],
                       'content': result['kwic']}

        # 'news' and 'iurl' are treated as optional fields
        if result.get('news'):
            # 'date' is divided by 1000 before conversion (milliseconds)
            result_json['publishedDate'] = \
                datetime.datetime.fromtimestamp(result['date'] / 1000.0)

        # append image result if image url is set
        if result.get('iurl'):
            result_json['template'] = 'videos.html'
            result_json['thumbnail'] = result['iurl']

        results.append(result_json)

    # return results
    return results
diff --git a/searx/engines/fdroid.py b/searx/engines/fdroid.py
new file mode 100644
index 000000000..4066dc716
--- /dev/null
+++ b/searx/engines/fdroid.py
@@ -0,0 +1,50 @@
+"""
+ F-Droid (a repository of FOSS applications for Android)
+
+ @website https://f-droid.org/
+ @provide-api no
+ @using-api no
+ @results HTML
+ @stable no (HTML can change)
+ @parse url, title, content
+"""
+
+from lxml import html
+from searx.engines.xpath import extract_text
+from searx.url_utils import urlencode
+
+# engine dependent config
categories = ['files']
paging = True

# search-url
base_url = 'https://search.f-droid.org/'
search_url = base_url + '?{query}'


# do search-request
def request(query, params):
    """Set ``params['url']`` to the F-Droid search URL for the requested page."""
    query_args = {'q': query, 'page': params['pageno'], 'lang': ''}
    params['url'] = search_url.format(query=urlencode(query_args))
    return params
+
+
+# get response from search-request
def response(resp):
    """Parse the F-Droid search page; one result per ``package-header`` link.

    The content is built as "<summary> - <license>".
    """
    dom = html.fromstring(resp.text)
    results = []

    for app in dom.xpath('//a[@class="package-header"]'):
        app_url = app.xpath('./@href')[0]
        app_title = extract_text(app.xpath('./div/h4[@class="package-name"]/text()'))

        summary = extract_text(app.xpath('./div/div/span[@class="package-summary"]')).strip()
        license_text = extract_text(app.xpath('./div/div/span[@class="package-license"]')).strip()

        app_img_src = app.xpath('./img[@class="package-icon"]/@src')[0]

        results.append({'url': app_url,
                        'title': app_title,
                        'content': summary + ' - ' + license_text,
                        'img_src': app_img_src})

    return results
diff --git a/searx/engines/filecrop.py b/searx/engines/filecrop.py
new file mode 100644
index 000000000..ed57a6bf3
--- /dev/null
+++ b/searx/engines/filecrop.py
@@ -0,0 +1,88 @@
+from searx.url_utils import urlencode
+
+try:
+ from HTMLParser import HTMLParser
+except:
+ from html.parser import HTMLParser
+
url = 'http://www.filecrop.com/'
# `url` already ends with '/', so the path must not start with another one
search_url = url + 'search.php?{query}&size_i=0&size_f=100000000&engine_r=1&engine_d=1&engine_e=1&engine_4=1&engine_m=1&pos={index}'  # noqa

paging = True


def _first_attr(attrs, name):
    """Return the value of the first attribute called ``name``.

    ``attrs`` is the list of (name, value) pairs HTMLParser passes to
    ``handle_starttag``.  Raises IndexError when the attribute is absent.
    """
    return [value for key, value in attrs if key == name][0]


class FilecropResultParser(HTMLParser):
    """Stateful HTML parser collecting Filecrop results into ``self.results``.

    A result starts at a ``<tr>`` whose bgcolor is #edeff5 or #ffffff and is
    completed after two ``</tr>`` tags.  While a result is open, the
    ``<label>`` title becomes the result title, links provide the URL, and
    all text nodes are appended to the content.
    """

    def __init__(self):
        # explicit base call (not super()) keeps the py2 fallback import working
        HTMLParser.__init__(self)
        self.__start_processing = False

        self.results = []
        self.result = {}

        self.tr_counter = 0
        self.data_counter = 0

    def handle_starttag(self, tag, attrs):
        """Open a result on a coloured row and pick up title/content/url."""
        if tag == 'tr':
            if ('bgcolor', '#edeff5') in attrs or\
               ('bgcolor', '#ffffff') in attrs:
                self.__start_processing = True

        if not self.__start_processing:
            return

        if tag == 'label':
            self.result['title'] = _first_attr(attrs, 'title')
        elif tag == 'a' and ('rel', 'nofollow') in attrs\
                and ('class', 'sourcelink') in attrs:
            # source links contribute their title to the content text
            if 'content' in self.result:
                self.result['content'] += _first_attr(attrs, 'title')
            else:
                self.result['content'] = _first_attr(attrs, 'title')
            self.result['content'] += ' '
        elif tag == 'a':
            self.result['url'] = url + _first_attr(attrs, 'href')

    def handle_endtag(self, tag):
        """Close the current result after its second ``</tr>``."""
        if self.__start_processing is False:
            return

        if tag == 'tr':
            self.tr_counter += 1

            if self.tr_counter == 2:
                self.__start_processing = False
                self.tr_counter = 0
                self.data_counter = 0
                self.results.append(self.result)
                self.result = {}

    def handle_data(self, data):
        """Append text nodes of an open result to its content."""
        if not self.__start_processing:
            return

        if 'content' in self.result:
            self.result['content'] += data + ' '
        else:
            self.result['content'] = data + ' '

        self.data_counter += 1


def request(query, params):
    """Set ``params['url']``; ``pos`` is 1-based and advances 30 per page."""
    index = 1 + (params['pageno'] - 1) * 30
    params['url'] = search_url.format(query=urlencode({'w': query}), index=index)
    return params


def response(resp):
    """Feed the page into a FilecropResultParser and return its results."""
    parser = FilecropResultParser()
    parser.feed(resp.text)

    return parser.results
diff --git a/searx/engines/flickr.py b/searx/engines/flickr.py
new file mode 100644
index 000000000..de1769370
--- /dev/null
+++ b/searx/engines/flickr.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+
+"""
+ Flickr (Images)
+
+ @website https://www.flickr.com
+ @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, thumbnail, img_src
+ More info on api-key : https://www.flickr.com/services/apps/create/
+"""
+
+from json import loads
+from searx.url_utils import urlencode
+
categories = ['images']

nb_per_page = 15
paging = True
# None by default; see the api-key link in the module docstring
api_key = None


url = ('https://api.flickr.com/services/rest/?method=flickr.photos.search'
       '&api_key={api_key}&{text}&sort=relevance'
       '&extras=description%2C+owner_name%2C+url_o%2C+url_n%2C+url_z'
       '&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}')
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'


def build_flickr_url(user_id, photo_id):
    """Return the public Flickr page URL of a photo."""
    return photo_url.format(userid=user_id, photoid=photo_id)


def request(query, params):
    """Set ``params['url']`` to the flickr.photos.search API call for the page."""
    params['url'] = url.format(text=urlencode({'text': query}),
                               api_key=api_key,
                               nb_per_page=nb_per_page,
                               page=params['pageno'])
    return params
+
+
def response(resp):
    """Convert a flickr.photos.search JSON reply into image results.

    Photos offering neither ``url_o`` nor ``url_z`` are skipped.
    """
    search_results = loads(resp.text)

    # return empty array if there are no results
    if 'photos' not in search_results:
        return []
    if 'photo' not in search_results['photos']:
        return []

    results = []

    # parse results
    for photo in search_results['photos']['photo']:
        # full-size image: original first, then the 'z' size
        for size_key in ('url_o', 'url_z'):
            if size_key in photo:
                img_src = photo[size_key]
                break
        else:
            continue

        # For a bigger thumbnail, keep only the url_z, not the url_n
        for size_key in ('url_n', 'url_z'):
            if size_key in photo:
                thumbnail_src = photo[size_key]
                break
        else:
            thumbnail_src = img_src

        photo_page = build_flickr_url(photo['owner'], photo['id'])

        # append result
        results.append({'url': photo_page,
                        'title': photo['title'],
                        'img_src': img_src,
                        'thumbnail_src': thumbnail_src,
                        'content': photo['description']['_content'],
                        'author': photo['ownername'],
                        'template': 'images.html'})

    # return results
    return results
diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py
new file mode 100644
index 000000000..198ac2cff
--- /dev/null
+++ b/searx/engines/flickr_noapi.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+
+"""
+ Flickr (Images)
+
+ @website https://www.flickr.com
+ @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html)
+
+ @using-api no
+ @results HTML
+ @stable no
+ @parse url, title, thumbnail, img_src
+"""
+
+from json import loads
+from time import time
+import re
+from searx.engines import logger
+from searx.url_utils import urlencode
+from searx.utils import ecma_unescape, html_to_text
+
+logger = logger.getChild('flickr-noapi')
+
+categories = ['images']
+
+url = 'https://www.flickr.com/'
+search_url = url + 'search?{query}&page={page}'
+time_range_url = '&min_upload_date={start}&max_upload_date={end}'
+photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
+modelexport_re = re.compile(r"^\s*modelExport:\s*({.*}),$", re.M)
+image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's')
+
+paging = True
+time_range_support = True
+time_range_dict = {'day': 60 * 60 * 24,
+ 'week': 60 * 60 * 24 * 7,
+ 'month': 60 * 60 * 24 * 7 * 4,
+ 'year': 60 * 60 * 24 * 7 * 52}
+
+
+def build_flickr_url(user_id, photo_id):
+ return photo_url.format(userid=user_id, photoid=photo_id)
+
+
+def _get_time_range_url(time_range):
+ if time_range in time_range_dict:
+ return time_range_url.format(start=time(), end=str(int(time()) - time_range_dict[time_range]))
+ return ''
+
+
def request(query, params):
    """Set ``params['url']`` to the Flickr search page plus the time filter."""
    page_url = search_url.format(query=urlencode({'text': query}), page=params['pageno'])
    params['url'] = page_url + _get_time_range_url(params['time_range'])
    return params
+
+
def response(resp):
    """Extract image results from the ``modelExport`` JSON embedded in the page.

    Each ``legend`` entry is a five-element path into ``model_export['main']``
    that locates one photo.  Photos without a usable size are logged and
    skipped.
    """
    results = []

    matches = modelexport_re.search(resp.text)
    if matches is None:
        return results

    model_export = loads(matches.group(1))
    if 'legend' not in model_export:
        return results

    legend = model_export['legend']

    # handle empty page
    if not legend or not legend[0]:
        return results

    for index in legend:
        photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][int(index[4])]
        author = ecma_unescape(photo.get('realname', ''))
        source = ecma_unescape(photo.get('username', '')) + ' @ Flickr'
        title = ecma_unescape(photo.get('title', ''))
        content = html_to_text(ecma_unescape(photo.get('description', '')))

        # From the biggest to the lowest format: take the first size present
        sizes = photo['sizes']
        best_key = next((key for key in image_sizes if key in sizes), None)

        img_src = None
        if best_key is not None:
            best = sizes[best_key]
            img_src = best['url']
            img_format = 'jpg ' + str(best['width']) + 'x' + str(best['height'])

        if not img_src:
            logger.debug('cannot find valid image size: {0}'.format(repr(photo)))
            continue

        # For a bigger thumbnail, keep only the url_z, not the url_n
        if 'n' in sizes:
            thumbnail_src = sizes['n']['url']
        elif 'z' in sizes:
            thumbnail_src = sizes['z']['url']
        else:
            thumbnail_src = img_src

        if 'ownerNsid' in photo:
            page_url = build_flickr_url(photo['ownerNsid'], photo['id'])
        else:
            # should not happen, disowned photo? Show it anyway
            page_url = img_src

        results.append({'url': page_url,
                        'title': title,
                        'img_src': img_src,
                        'thumbnail_src': thumbnail_src,
                        'content': content,
                        'author': author,
                        'source': source,
                        'img_format': img_format,
                        'template': 'images.html'})

    return results
diff --git a/searx/engines/framalibre.py b/searx/engines/framalibre.py
new file mode 100644
index 000000000..f3441fa5f
--- /dev/null
+++ b/searx/engines/framalibre.py
@@ -0,0 +1,72 @@
+"""
+ FramaLibre (It)
+
+ @website https://framalibre.org/
+ @provide-api no
+
+ @using-api no
+ @results HTML
+ @stable no (HTML can change)
+ @parse url, title, content, thumbnail, img_src
+"""
+
+try:
+ from cgi import escape
+except:
+ from html import escape
+from lxml import html
+from searx.engines.xpath import extract_text
+from searx.url_utils import urljoin, urlencode
+
+# engine dependent config
categories = ['it']
paging = True

# search-url
base_url = 'https://framalibre.org/'
search_url = base_url + 'recherche-par-crit-res?{query}&page={offset}'

# specific xpath variables
results_xpath = '//div[@class="nodes-list-row"]/div[contains(@typeof,"sioc:Item")]'
link_xpath = './/h3[@class="node-title"]/a[@href]'
thumbnail_xpath = './/img[@class="media-object img-responsive"]/@src'
content_xpath = './/div[@class="content"]//p'


# do search-request
def request(query, params):
    """Set ``params['url']``; the ``page`` parameter is zero-based."""
    page_index = params['pageno'] - 1
    encoded_query = urlencode({'keys': query})
    params['url'] = search_url.format(query=encoded_query, offset=page_index)

    return params
+
+
+# get response from search-request
def response(resp):
    """Parse the FramaLibre result page into searx results.

    Title and content are HTML-escaped.  Thumbnail paths beginning with '/'
    are resolved against ``base_url``; ``img_src`` is None when a result has
    no thumbnail.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        link = result.xpath(link_xpath)[0]
        href = urljoin(base_url, link.attrib.get('href'))
        # there's also a span (class="rdf-meta element-hidden" property="dc:title")'s content property for this...
        title = escape(extract_text(link))
        thumbnail_tags = result.xpath(thumbnail_xpath)
        thumbnail = None
        if len(thumbnail_tags) > 0:
            thumbnail = extract_text(thumbnail_tags[0])
            # startswith() is safe on an empty string, and urljoin avoids the
            # double slash that base_url + '/path' would produce
            if thumbnail.startswith('/'):
                thumbnail = urljoin(base_url, thumbnail)
        content = escape(extract_text(result.xpath(content_xpath)))

        # append result
        results.append({'url': href,
                        'title': title,
                        'img_src': thumbnail,
                        'content': content})

    # return results
    return results
diff --git a/searx/engines/frinkiac.py b/searx/engines/frinkiac.py
new file mode 100644
index 000000000..a67b42dbe
--- /dev/null
+++ b/searx/engines/frinkiac.py
@@ -0,0 +1,44 @@
+"""
+Frinkiac (Images)
+
+@website https://www.frinkiac.com
+@provide-api no
+@using-api no
+@results JSON
+@stable no
+@parse url, title, img_src
+"""
+
+from json import loads
+from searx.url_utils import urlencode
+
categories = ['images']

BASE = 'https://frinkiac.com/'
SEARCH_URL = '{base}api/search?{query}'
RESULT_URL = '{base}?{query}'
THUMB_URL = '{base}img/{episode}/{timestamp}/medium.jpg'
IMAGE_URL = '{base}img/{episode}/{timestamp}.jpg'


def request(query, params):
    """Set ``params['url']`` to the Frinkiac search API URL for ``query``."""
    encoded_query = urlencode({'q': query})
    params['url'] = SEARCH_URL.format(base=BASE, query=encoded_query)
    return params


def response(resp):
    """Convert the JSON list of frames into image results.

    Every frame is identified by its ``Episode`` and ``Timestamp``; the
    episode also serves as the result title.
    """
    def frame_to_result(frame):
        episode = frame['Episode']
        timestamp = frame['Timestamp']
        caption_query = urlencode({'p': 'caption', 'e': episode, 't': timestamp})
        return {'template': 'images.html',
                'url': RESULT_URL.format(base=BASE, query=caption_query),
                'title': episode,
                'content': '',
                'thumbnail_src': THUMB_URL.format(base=BASE, episode=episode, timestamp=timestamp),
                'img_src': IMAGE_URL.format(base=BASE, episode=episode, timestamp=timestamp)}

    return [frame_to_result(frame) for frame in loads(resp.text)]
diff --git a/searx/engines/genius.py b/searx/engines/genius.py
new file mode 100644
index 000000000..b265e9d76
--- /dev/null
+++ b/searx/engines/genius.py
@@ -0,0 +1,88 @@
+"""
+Genius
+
+ @website https://www.genius.com/
+ @provide-api yes (https://docs.genius.com/)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, content, thumbnail, publishedDate
+"""
+
+from json import loads
+from searx.url_utils import urlencode
+from datetime import datetime
+
+# engine dependent config
categories = ['music']
paging = True
language_support = False
page_size = 5

url = 'https://genius.com/api/'
search_url = url + 'search/{index}?{query}&page={pageno}&per_page={page_size}'


def request(query, params):
    """Set ``params['url']`` to a Genius 'multi' search for the requested page."""
    url_args = {'query': urlencode({'q': query}),
                'index': 'multi',
                'page_size': page_size,
                'pageno': params['pageno']}
    params['url'] = search_url.format(**url_args)
    return params
+
+
def parse_lyric(hit):
    """Build a result dict from a 'lyric' or 'song' hit.

    The content is the first highlight value, or None when no highlight is
    available.  ``publishedDate`` is only set for a truthy
    ``lyrics_updated_at`` timestamp.
    """
    try:
        content = hit['highlights'][0]['value']
    except (KeyError, IndexError, TypeError):
        # missing, empty or null highlights
        content = None

    song = hit['result']
    # a missing timestamp is treated like an empty one
    timestamp = song.get('lyrics_updated_at')

    result = {'url': song['url'],
              'title': song['full_title'],
              'content': content,
              'thumbnail': song['song_art_image_thumbnail_url'],
              'template': 'videos.html'}
    if timestamp:
        result.update({'publishedDate': datetime.fromtimestamp(timestamp)})
    return result
+
+
def parse_artist(hit):
    """Build a result dict (without content) from an 'artist' hit."""
    artist = hit['result']
    return {'url': artist['url'],
            'title': artist['name'],
            'content': None,
            'thumbnail': artist['image_url'],
            'template': 'videos.html'}
+
+
def parse_album(hit):
    """Build a result dict from an 'album' hit.

    A ``content`` of "Released: <year>" is added only when a truthy release
    year is present.
    """
    album = hit['result']
    result = {'url': album['url'],
              'title': album['full_title'],
              'thumbnail': album['cover_art_url'],
              # 'thumbnail': hit['result']['cover_art_thumbnail_url'],
              'template': 'videos.html'}
    try:
        year = album['release_date_components']['year']
    except (KeyError, TypeError):
        # no release date components (absent or null): leave content unset
        pass
    else:
        if year:
            result.update({'content': 'Released: {}'.format(year)})
    return result
+
# hit type -> parser; 'lyric' and 'song' hits share one parser
parse = {'lyric': parse_lyric, 'song': parse_lyric, 'artist': parse_artist, 'album': parse_album}


def response(resp):
    """Flatten the hits of all sections and parse those with a known type.

    Hits whose type is missing or has no parser are skipped.
    """
    payload = loads(resp.text)
    results = []
    for section in payload['response']['sections']:
        for hit in section['hits']:
            parser = parse.get(hit.get('type'))
            if parser is None:
                continue
            results.append(parser(hit))
    return results
diff --git a/searx/engines/gentoo.py b/searx/engines/gentoo.py
new file mode 100644
index 000000000..a7a966cc9
--- /dev/null
+++ b/searx/engines/gentoo.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+
+"""
+ Gentoo Wiki
+
+ @website https://wiki.gentoo.org
+ @provide-api yes
+ @using-api no
+ @results HTML
+ @stable no (HTML can change)
+ @parse url, title
+"""
+
+from lxml import html
+from searx.engines.xpath import extract_text
+from searx.url_utils import urlencode, urljoin
+
+# engine dependent config
+categories = ['it']
+language_support = True
+paging = True
+base_url = 'https://wiki.gentoo.org'
+
+# xpath queries
+xpath_results = '//ul[@class="mw-search-results"]/li'
+xpath_link = './/div[@class="mw-search-result-heading"]/a'
+
+
+# cut 'en' from 'en-US', 'de' from 'de-CH', and so on
def locale_to_lang_code(locale):
    """Return the part of ``locale`` before the first '-' (all of it if none)."""
    return locale.split('-')[0]
+
+
+# wikis for some languages were moved off from the main site, we need to make
+# requests to correct URLs to be able to get results in those languages
lang_urls = {
    'en': {
        'base': 'https://wiki.gentoo.org',
        'search': '/index.php?title=Special:Search&offset={offset}&{query}'
    },
    'others': {
        'base': 'https://wiki.gentoo.org',
        # adjacent literals instead of a backslash inside the string, so no
        # text from the continuation line leaks into the URL
        'search': '/index.php?title=Special:Search&offset={offset}&{query}'
                  '&profile=translation&languagefilter={language}'
    }
}


# get base & search URLs for selected language
def get_lang_urls(language):
    """Return the 'en' URL set for English and the 'others' set otherwise."""
    if language != 'en':
        return lang_urls['others']
    return lang_urls['en']
+
+
+# Language names to build search requests for
+# those languages which are hosted on the main site.
+main_langs = {
+ 'ar': 'العربية',
+ 'bg': 'Български',
+ 'cs': 'Česky',
+ 'da': 'Dansk',
+ 'el': 'Ελληνικά',
+ 'es': 'Español',
+ 'he': 'עברית',
+ 'hr': 'Hrvatski',
+ 'hu': 'Magyar',
+ 'it': 'Italiano',
+ 'ko': '한국어',
+ 'lt': 'Lietuviškai',
+ 'nl': 'Nederlands',
+ 'pl': 'Polski',
+ 'pt': 'Português',
+ 'ru': 'Русский',
+ 'sl': 'Slovenský',
+ 'th': 'ไทย',
+ 'uk': 'Українська',
+ 'zh': '简体中文'
+}
+supported_languages = dict(lang_urls, **main_langs)
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` to the Gentoo wiki search URL.

    For languages listed in ``main_langs`` the language's name is appended to
    the query in parentheses; ``query`` is handled as bytes here.  Paging
    advances the offset by 20 per page.
    """
    # translate the locale (e.g. 'en-US') to language code ('en')
    language = locale_to_lang_code(params['language'])

    # if our language is hosted on the main site, we need to add its name
    # to the query in order to narrow the results to that language
    if language in main_langs:
        query += b' (' + (main_langs[language]).encode('utf-8') + b')'

    # get request URLs for our language of choice
    urls = get_lang_urls(language)

    params['url'] = (urls['base'] + urls['search']).format(
        query=urlencode({'search': query}),
        offset=(params['pageno'] - 1) * 20,
        language=language)

    return params
+
+
+# get response from search-request
def response(resp):
    """Parse the wiki search result list into ``url`` / ``title`` results.

    Links are resolved against the base URL chosen for the language of the
    originating request (``resp.search_params['language']``).
    """
    # get the base URL for the language in which request was made
    language = locale_to_lang_code(resp.search_params['language'])
    wiki_base = get_lang_urls(language)['base']

    dom = html.fromstring(resp.text)

    # parse results
    results = []
    for item in dom.xpath(xpath_results):
        anchor = item.xpath(xpath_link)[0]
        results.append({'url': urljoin(wiki_base, anchor.attrib.get('href')),
                        'title': extract_text(anchor)})

    return results
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
new file mode 100644
index 000000000..a84f3f69d
--- /dev/null
+++ b/searx/engines/gigablast.py
@@ -0,0 +1,114 @@
+"""
+ Gigablast (Web)
+
+ @website https://gigablast.com
+ @provide-api yes (https://gigablast.com/api.html)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, content
+"""
+
+import random
+from json import loads
+from time import time
+from lxml.html import fromstring
+from searx.url_utils import urlencode
+from searx.utils import eval_xpath
+
+# engine dependent config
# engine dependent config
categories = ['general']
paging = True
number_of_results = 10
language_support = True
safesearch = True

# search-url: request() fills every placeholder of search_string
base_url = 'https://gigablast.com/'
search_string = '&'.join((
    'search?{query}',
    'n={number_of_results}',
    'c=main',
    's={offset}',
    'format=json',
    'qh=0',
    'qlang={lang}',
    'ff={safesearch}',
    'rxiec={rxieu}',
    'ulse={ulse}',
    'rand={rxikd}',
    'dbez={dbez}',
))

# specific xpath variables (not referenced by request()/response() here)
results_xpath = '//response//result'
url_xpath = './/url'
title_xpath = './/title'
content_xpath = './/sum'

# page whose language menu _fetch_supported_languages() parses
supported_languages_url = 'https://gigablast.com/search?&rxikd=1'
+
+
+# do search-request
def request(query, params):
    """Build the Gigablast search URL and store it in ``params['url']``.

    Reads ``pageno``, ``language`` and ``safesearch`` from ``params`` and
    returns the updated ``params``.
    """
    offset = (params['pageno'] - 1) * number_of_results

    if params['language'] == 'all':
        language = 'xx'
    else:
        # 'en-US' -> 'en_us'
        language = params['language'].replace('-', '_').lower()
        # Only Chinese keeps its region part.  The separator is '_' at this
        # point, so split on '_' (splitting on '-' never matched and left
        # codes such as 'en_us' untouched).
        primary = language.split('_')[0]
        if primary != 'zh':
            language = primary

    family_filter = 1 if params['safesearch'] >= 1 else 0

    # rxieu is some kind of hash from the search query, but accepts random atm
    search_path = search_string.format(query=urlencode({'q': query}),
                                       offset=offset,
                                       number_of_results=number_of_results,
                                       rxikd=int(time() * 1000),
                                       rxieu=random.randint(1000000000, 9999999999),
                                       ulse=random.randint(100000000, 999999999),
                                       lang=language,
                                       safesearch=family_filter,
                                       dbez=random.randint(100000000, 999999999))

    params['url'] = base_url + search_path

    return params
+
+
+# get response from search-request
def response(resp):
    """Convert the JSON answer into a list of url/title/content dicts."""
    payload = loads(resp.text)

    # each hit's summary ('sum') becomes the result content
    return [{'url': hit['url'],
             'title': hit['title'],
             'content': hit['sum']}
            for hit in payload['results']]
+
+
+# get supported languages from their site
def _fetch_supported_languages(resp):
    """Collect language codes from the links of the ``menu2`` span.

    Codes of the form ``xx_yy`` are returned as ``xx-YY``; anything else is
    reduced to the part before the first underscore.
    """
    codes = []
    tree = fromstring(resp.text)

    for anchor in eval_xpath(tree, '//span[@id="menu2"]/a'):
        pieces = eval_xpath(anchor, './@href')[0].split('lang%3A')
        # only links carrying exactly one 'lang%3A' marker name a language
        if len(pieces) != 2:
            continue

        parts = pieces[1].split('_')
        if len(parts) == 2:
            codes.append(parts[0] + '-' + parts[1].upper())
        else:
            codes.append(parts[0])

    return codes
diff --git a/searx/engines/github.py b/searx/engines/github.py
new file mode 100644
index 000000000..eaa00da4f
--- /dev/null
+++ b/searx/engines/github.py
@@ -0,0 +1,60 @@
+"""
+ Github (It)
+
+ @website https://github.com/
+ @provide-api yes (https://developer.github.com/v3/)
+
+ @using-api yes
+ @results JSON
+ @stable yes (using api)
+ @parse url, title, content
+"""
+
+from json import loads
+from searx.url_utils import urlencode
+
+# engine dependent config
# engine dependent config
categories = ['it']

# search-url: repositories ordered by star count, best first
search_url = ('https://api.github.com/search/repositories'
              '?sort=stars&order=desc&{query}')

# value request() sends in the Accept header
accept_header = 'application/vnd.github.preview.text-match+json'
+
+
+# do search-request
def request(query, params):
    """Set the search URL and the Accept header on ``params``."""
    encoded_query = urlencode({'q': query})
    params['url'] = search_url.format(query=encoded_query)

    headers = params['headers']
    headers['Accept'] = accept_header

    return params
+
+
+# get response from search-request
def response(resp):
    """Map the repositories of the JSON answer to result dicts.

    Returns an empty list when the answer carries no ``items`` key.  A
    description is truncated to 500 characters; a missing one becomes ''.
    """
    payload = loads(resp.text)

    # check if items are received
    if 'items' not in payload:
        return []

    def to_result(repo):
        title = repo['name']
        url = repo['html_url']
        description = repo['description']
        return {'url': url,
                'title': title,
                'content': description[:500] if description else ''}

    return [to_result(repo) for repo in payload['items']]
diff --git a/searx/engines/google.py b/searx/engines/google.py
new file mode 100644
index 000000000..eed3a044e
--- /dev/null
+++ b/searx/engines/google.py
@@ -0,0 +1,391 @@
+# Google (Web)
+#
+# @website https://www.google.com
+# @provide-api yes (https://developers.google.com/custom-search/)
+#
+# @using-api no
+# @results HTML
+# @stable no (HTML can change)
+# @parse url, title, content, suggestion
+
+import re
+from flask_babel import gettext
+from lxml import html, etree
+from searx.engines.xpath import extract_text, extract_url
+from searx import logger
+from searx.url_utils import urlencode, urlparse, parse_qsl
+from searx.utils import match_language, eval_xpath
+
+logger = logger.getChild('google engine')
+
+
+# engine dependent config
# engine dependent config
categories = ['general']
paging = True
language_support = True
use_locale_domain = True
time_range_support = True

# based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
default_hostname = 'www.google.com'

# country code (upper case) -> Google hostname serving that country
country_to_hostname = dict(
    BG='www.google.bg',  # Bulgaria
    CZ='www.google.cz',  # Czech Republic
    DE='www.google.de',  # Germany
    DK='www.google.dk',  # Denmark
    AT='www.google.at',  # Austria
    CH='www.google.ch',  # Switzerland
    GR='www.google.gr',  # Greece
    AU='www.google.com.au',  # Australia
    CA='www.google.ca',  # Canada
    GB='www.google.co.uk',  # United Kingdom
    ID='www.google.co.id',  # Indonesia
    IE='www.google.ie',  # Ireland
    IN='www.google.co.in',  # India
    MY='www.google.com.my',  # Malaysia
    NZ='www.google.co.nz',  # New Zealand
    PH='www.google.com.ph',  # Philippines
    SG='www.google.com.sg',  # Singapore
    # US='www.google.us',  # United States, redirect to .com
    ZA='www.google.co.za',  # South Africa
    AR='www.google.com.ar',  # Argentina
    CL='www.google.cl',  # Chile
    ES='www.google.es',  # Spain
    MX='www.google.com.mx',  # Mexico
    EE='www.google.ee',  # Estonia
    FI='www.google.fi',  # Finland
    BE='www.google.be',  # Belgium
    FR='www.google.fr',  # France
    IL='www.google.co.il',  # Israel
    HR='www.google.hr',  # Croatia
    HU='www.google.hu',  # Hungary
    IT='www.google.it',  # Italy
    JP='www.google.co.jp',  # Japan
    KR='www.google.co.kr',  # South Korea
    LT='www.google.lt',  # Lithuania
    LV='www.google.lv',  # Latvia
    NO='www.google.no',  # Norway
    NL='www.google.nl',  # Netherlands
    PL='www.google.pl',  # Poland
    BR='www.google.com.br',  # Brazil
    PT='www.google.pt',  # Portugal
    RO='www.google.ro',  # Romania
    RU='www.google.ru',  # Russia
    SK='www.google.sk',  # Slovakia
    SI='www.google.si',  # Slovenia
    SE='www.google.se',  # Sweden
    TH='www.google.co.th',  # Thailand
    TR='www.google.com.tr',  # Turkey
    UA='www.google.com.ua',  # Ukraine
    # CN='www.google.cn',  # China, only from China ?
    HK='www.google.com.hk',  # Hong Kong
    TW='www.google.com.tw',  # Taiwan
)
+
# osm
url_map = ('https://www.openstreetmap.org/'
           '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M')

# search-url
search_path = '/search'
search_url = ''.join((
    'https://{hostname}',
    search_path,
    '?{query}&start={offset}&gws_rd=cr&gbv=1&lr={lang}&hl={lang_short}&ei=x',
))

# suffix appended to the search URL when a time range is requested
time_range_search = "&tbs=qdr:{range}"
time_range_dict = {
    'day': 'd',
    'week': 'w',
    'month': 'm',
    'year': 'y',
}

# other URLs
map_hostname_start = 'maps.google.'
maps_path = '/maps'
redirect_path = '/url'
images_path = '/images'
supported_languages_url = 'https://www.google.com/preferences?#languages'

# specific xpath variables
results_xpath = '//div[contains(@class, "ZINbbc")]'
url_xpath = './/div[@class="kCrYT"][1]/a/@href'
title_xpath = './/div[@class="kCrYT"][1]/a/div[1]'
content_xpath = './/div[@class="kCrYT"][2]//div[contains(@class, "BNeawe")]//div[contains(@class, "BNeawe")]'
suggestion_xpath = '//div[contains(@class, "ZINbbc")][last()]//div[@class="rVLSBd"]/a//div[contains(@class, "BNeawe")]'
spelling_suggestion_xpath = '//div[@id="scc"]//a'

# map : detail location
map_address_xpath = './/div[@class="s"]//table//td[2]/span/text()'
map_phone_xpath = './/div[@class="s"]//table//td[2]/span/span'
map_website_url_xpath = 'h3[2]/a/@href'
map_website_title_xpath = 'h3[2]'

# map : near the location
map_near = 'table[@class="ts"]//tr'
map_near_title = './/h4'
map_near_url = './/h4/a/@href'
map_near_phone = './/span[@class="nobr"]'

# images
images_xpath = './/div/a'
image_url_xpath = './@href'
image_img_src_xpath = './img/@src'

# property names
# FIXME : no translation
property_address = "Address"
property_phone = "Phone number"
+
+
+# remove google-specific tracking-url
def parse_url(url_string, google_hostname):
    """Strip Google's redirect wrapper from a result link.

    A link whose path is ``redirect_path`` and whose host is either empty or
    ``google_hostname`` is replaced by the value of its ``q`` parameter (a
    missing ``q`` raises KeyError).  Anything else, including None, is
    returned unchanged.
    """
    # sanity check
    if url_string is None:
        return url_string

    parts = urlparse(url_string)
    is_google_host = parts.netloc in [google_hostname, '']
    if not (is_google_host and parts.path == redirect_path):
        return url_string

    # redirect link: the real target travels in the 'q' parameter
    arguments = dict(parse_qsl(parts.query))
    return arguments['q']
+
+
+# returns extract_text on the first result selected by the xpath or None
def extract_text_from_dom(result, xpath):
    """Return the text of the first node matched by ``xpath``, else None."""
    matches = eval_xpath(result, xpath)
    return extract_text(matches[0]) if len(matches) > 0 else None
+
+
+# do search-request
def request(query, params):
    """Prepare URL, cookie and headers for a Google web search.

    Besides ``params['url']`` this sets the ``GOOGLE_ABUSE_EXEMPTION``
    cookie, the ``Accept``/``Accept-Language`` headers and
    ``params['google_hostname']`` (read back by response()).
    """
    offset = (params['pageno'] - 1) * 10
    requested = params['language']

    if requested in ('all', 'en-US'):
        language = 'en-GB'
    else:
        language = match_language(requested, supported_languages, language_aliases)

    # country: the region of the requested locale wins, then the region of
    # the matched language, then 'US'
    matched_parts = language.split('-')
    if requested.find('-') > 0:
        country = requested.split('-')[1]
    elif len(matched_parts) == 2:
        country = matched_parts[1]
    else:
        country = 'US'

    google_hostname = (country_to_hostname.get(country.upper(), default_hostname)
                       if use_locale_domain else default_hostname)

    # original format: ID=3e2b6616cee08557:TM=5556667580:C=r:IP=4.1.12.5-:S=23ASdf0soFgF2d34dfgf-_22JJOmHdfgg
    params['cookies']['GOOGLE_ABUSE_EXEMPTION'] = 'x'

    params['url'] = search_url.format(offset=offset,
                                      query=urlencode({'q': query}),
                                      hostname=google_hostname,
                                      lang='lang_' + language,
                                      lang_short=language)
    time_range = params['time_range']
    if time_range in time_range_dict:
        params['url'] += time_range_search.format(range=time_range_dict[time_range])

    headers = params['headers']
    headers['Accept-Language'] = language + ',' + language + '-' + country
    headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'

    params['google_hostname'] = google_hostname

    return params
+
+
+# get response from search-request
def response(resp):
    """Parse a Google result page.

    Returns a list that may contain an instant answer, the number of
    results, the regular results (url/title/content), suggestions and
    spelling corrections.

    Raises RuntimeWarning when Google answered with its "sorry"/CAPTCHA page.
    """
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which hostname ? (stored in the search params by request())
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = eval_xpath(dom, '//div[@id="_vBb"]//text()')
    if instant_answer:
        results.append({'answer': u' '.join(instant_answer)})

    # The result counter is optional: a missing or unparsable counter is
    # ignored, but SystemExit/KeyboardInterrupt are no longer swallowed.
    try:
        stats = eval_xpath(dom, '//div[@id="resultStats"]//text()')[0]
        results_num = int(stats.split()[1].replace(',', ''))
        results.append({'number_of_results': results_num})
    except Exception:
        pass

    # parse results
    for result in eval_xpath(dom, results_xpath):
        try:
            title = extract_text(eval_xpath(result, title_xpath)[0])
            url = parse_url(extract_url(eval_xpath(result, url_xpath), google_url), google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # Links pointing back to Google itself (maps, news, images) are
            # skipped; the parse_map_* / parse_images helpers are not called
            # from here.
            # TODO fix inside links
            if parsed_url.netloc == google_hostname:
                continue

            # normal result
            content = extract_text_from_dom(result, content_xpath)
            if content is None:
                continue

            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})
        except Exception:
            # one unparsable result must not discard the whole page
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in eval_xpath(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
+
+
def parse_images(result, google_hostname):
    """Build one ``images.html`` result per link found by ``images_xpath``."""

    def to_result(image):
        target = parse_url(extract_text(eval_xpath(image, image_url_xpath)[0]), google_hostname)
        source = extract_text(eval_xpath(image, image_img_src_xpath)[0])
        # title and content are left empty; only the image data is filled in
        return {'url': target,
                'title': '',
                'content': '',
                'img_src': source,
                'template': 'images.html'}

    return [to_result(image) for image in eval_xpath(result, images_xpath)]
+
+
def parse_map_near(parsed_url, x, google_hostname):
    """Build one result per "near the location" row in ``x``.

    ``parsed_url`` is accepted for signature compatibility and not used.
    The phone number, when present, is rendered as a ``tel:`` link inside
    the content table.
    """
    results = []

    for result in x:
        title = extract_text_from_dom(result, map_near_title)
        url = parse_url(extract_text_from_dom(result, map_near_url), google_hostname)
        attributes = []
        phone = extract_text_from_dom(result, map_near_phone)
        # extract_text_from_dom() returns None when the row has no phone
        # node; building 'tel:' + None would raise TypeError
        if phone is not None:
            add_attributes(attributes, property_phone, phone, 'tel:' + phone)
        results.append({'title': title,
                        'url': url,
                        'content': attributes_to_html(attributes)
                        })

    return results
+
+
def parse_map_detail(parsed_url, result, google_hostname):
    """Build the result for a single map location.

    The coordinates are read from the URL path (``@lat,lon,zoom``) or, as a
    fallback, from its ``ll=...&z=...`` query.  Without coordinates, or
    without a website link, an empty list is returned.
    """
    results = []

    # try to parse the geoloc
    m = re.search(r'@([0-9\.]+),([0-9\.]+),([0-9]+)', parsed_url.path)
    if m is None:
        m = re.search(r'll\=([0-9\.]+),([0-9\.]+)\&z\=([0-9]+)', parsed_url.query)

    if m is None:
        return results

    # geoloc found; the zoom level (group 3) is not needed
    lat = float(m.group(1))
    lon = float(m.group(2))

    # attributes
    attributes = []
    address = extract_text_from_dom(result, map_address_xpath)
    phone = extract_text_from_dom(result, map_phone_xpath)
    add_attributes(attributes, property_address, address, 'geo:' + str(lat) + ',' + str(lon))
    # extract_text_from_dom() returns None when there is no phone node;
    # building 'tel:' + None would raise TypeError
    if phone is not None:
        add_attributes(attributes, property_phone, phone, 'tel:' + phone)

    # title / content / url
    website_title = extract_text_from_dom(result, map_website_title_xpath)
    content = extract_text_from_dom(result, content_xpath)
    website_url = parse_url(extract_text_from_dom(result, map_website_url_xpath), google_hostname)

    # add a result if there is a website
    if website_url is not None:
        results.append({'title': website_title,
                        'content': (content + '<br />' if content is not None else '')
                        + attributes_to_html(attributes),
                        'url': website_url
                        })

    return results
+
+
def add_attributes(attributes, name, value, url):
    """Append a label/value/url entry to ``attributes`` unless ``value`` is
    None or empty."""
    if value is None or len(value) == 0:
        return
    entry = {'label': name, 'value': value, 'url': url}
    attributes.append(entry)
+
+
def attributes_to_html(attributes):
    """Render the attribute entries as a striped two-column HTML table.

    Each entry becomes one row: the label as header cell and the value as
    data cell, wrapped in a link when the entry has a ``url`` key.  Values
    are inserted as they are, without HTML escaping.
    """
    rows = []
    for entry in attributes:
        cell = entry.get('value')
        if 'url' in entry:
            cell = '<a href="' + entry.get('url') + '">' + cell + '</a>'
        rows.append('<tr><th>' + entry.get('label') + '</th><td>' + cell + '</td></tr>')

    return '<table class="table table-striped">' + ''.join(rows) + '</table>'
+
+
+# get supported languages from their site
def _fetch_supported_languages(resp):
    """Read the ``lr`` inputs of the ``langSec`` section of the page.

    Returns ``{code: {"name": name}}`` where ``code`` is the part of the
    input's value after the last underscore and ``name`` is its title-cased
    ``data-name`` attribute.
    """
    tree = html.fromstring(resp.text)
    languages = {}

    for entry in eval_xpath(tree, '//*[@id="langSec"]//input[@name="lr"]'):
        value = eval_xpath(entry, './@value')[0]
        label = eval_xpath(entry, './@data-name')[0]
        languages[value.split('_')[-1]] = {"name": label.title()}

    return languages
diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py
new file mode 100644
index 000000000..636913114
--- /dev/null
+++ b/searx/engines/google_images.py
@@ -0,0 +1,97 @@
+"""
+ Google (Images)
+
+ @website https://www.google.com
+ @provide-api yes (https://developers.google.com/custom-search/)
+
+ @using-api no
+ @results HTML chunks with JSON inside
+ @stable no
+ @parse url, title, img_src
+"""
+
+from datetime import date, timedelta
+from json import loads
+from lxml import html
+from searx.url_utils import urlencode
+
+# engine dependent config
# engine dependent config
categories = ['images']
paging = True
safesearch = True
time_range_support = True
number_of_results = 100

search_url = 'https://www.google.com/search'\
    '?{query}'\
    '&tbm=isch'\
    '&yv=2'\
    '&{search_options}'
time_range_attr = "qdr:{range}"
# custom date range, used by request() for 'year'; both bounds take the
# 'name:value' form (the colon after cd_max was missing)
time_range_custom_attr = "cdr:1,cd_min:{start},cd_max:{end}"
time_range_dict = {'day': 'd',
                   'week': 'w',
                   'month': 'm'}
+
+
+# do search-request
def request(query, params):
    """Build the image search URL and store it in ``params['url']``."""
    page_index = params['pageno'] - 1
    search_options = {
        'ijn': page_index,
        'start': page_index * number_of_results
    }

    time_range = params['time_range']
    if time_range in time_range_dict:
        search_options['tbs'] = time_range_attr.format(range=time_range_dict[time_range])
    elif time_range == 'year':
        # 'year' has no short code here: send the last 365 days as a
        # custom date range
        today = date.today()
        year_ago = today - timedelta(days=365)
        search_options['tbs'] = time_range_custom_attr.format(
            start=year_ago.strftime('%m/%d/%Y'),
            end=today.strftime('%m/%d/%Y'))

    if safesearch and params['safesearch']:
        search_options['safe'] = 'on'

    encoded_query = urlencode({'q': query})
    params['url'] = search_url.format(query=encoded_query,
                                      search_options=urlencode(search_options))

    return params
+
+
+# get response from search-request
def response(resp):
    """Extract image results from the JSON blobs embedded in the page.

    Every ``rg_meta`` div holds one JSON document describing an image.
    Documents that cannot be decoded or lack a required key are skipped.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath('//div[contains(@class, "rg_meta")]/text()'):

        try:
            metadata = loads(result)

            img_format = metadata.get('ity', '')
            img_width = metadata.get('ow', '')
            img_height = metadata.get('oh', '')
            if img_width and img_height:
                img_format += " {0}x{1}".format(img_width, img_height)

            source = metadata.get('st', '')
            source_url = metadata.get('isu', '')
            if source_url:
                source += " ({0})".format(source_url)

            results.append({'url': metadata['ru'],
                            'title': metadata['pt'],
                            'content': metadata.get('s', ''),
                            'source': source,
                            'img_format': img_format,
                            'thumbnail_src': metadata['tu'],
                            'img_src': metadata['ou'],
                            'template': 'images.html'})

        except (ValueError, KeyError, TypeError, AttributeError):
            # invalid JSON, a missing key or an unexpected value type:
            # skip this image only (a bare except would also trap
            # SystemExit/KeyboardInterrupt)
            continue

    return results
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
new file mode 100644
index 000000000..9c837b45b
--- /dev/null
+++ b/searx/engines/google_news.py
@@ -0,0 +1,86 @@
+"""
+ Google (News)
+
+ @website https://news.google.com
+ @provide-api no
+
+ @using-api no
+ @results HTML
+ @stable no
+ @parse url, title, content, publishedDate
+"""
+
+from lxml import html
+from searx.engines.google import _fetch_supported_languages, supported_languages_url
+from searx.url_utils import urlencode
+from searx.utils import match_language
+
+# search-url
# engine dependent config
categories = ['news']
paging = True
language_support = True
safesearch = True
time_range_support = True
number_of_results = 10

# search-url
search_url = ''.join((
    'https://www.google.com/search',
    '?{query}',
    '&tbm=nws',
    '&gws_rd=cr',
    '&{search_options}',
))
time_range_attr = "qdr:{range}"
time_range_dict = {
    'day': 'd',
    'week': 'w',
    'month': 'm',
    'year': 'y',
}
+
+
+# do search-request
def request(query, params):
    """Build the news search URL and store it in ``params['url']``."""
    search_options = {
        'start': (params['pageno'] - 1) * number_of_results
    }

    time_range = params['time_range']
    if time_range in time_range_dict:
        search_options['tbs'] = time_range_attr.format(range=time_range_dict[time_range])

    if safesearch and params['safesearch']:
        search_options['safe'] = 'on'

    encoded_query = urlencode({'q': query})
    params['url'] = search_url.format(query=encoded_query,
                                      search_options=urlencode(search_options))

    # no language restriction requested
    if params['language'] == 'all':
        return params

    # restrict by language only (any region part is dropped)
    matched = match_language(params['language'], supported_languages, language_aliases)
    language = matched.split('-')[0]
    if language:
        params['url'] += '&lr=lang_' + language

    return params
+
+
+# get response from search-request
def response(resp):
    """Parse the news result page into url/title/content dicts.

    Results without a headline link are skipped.  ``img_src`` is added when
    the result contains an image whose source is not a ``data`` URI.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'):
        # Check for the headline link explicitly instead of catching every
        # exception with a bare except.
        headline = result.xpath('.//a[@class="l lLrAF"]')
        if not headline:
            continue

        r = {
            'url': headline[0].attrib.get("href"),
            'title': ''.join(result.xpath('.//a[@class="l lLrAF"]//text()')),
            'content': ''.join(result.xpath('.//div[@class="st"]//text()')),
        }

        imgs = result.xpath('.//img/@src')
        if len(imgs) and not imgs[0].startswith('data'):
            r['img_src'] = imgs[0]

        results.append(r)

    # return results
    return results
diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py
new file mode 100644
index 000000000..fd6b2e3be
--- /dev/null
+++ b/searx/engines/google_videos.py
@@ -0,0 +1,97 @@
+"""
+ Google (Videos)
+
+ @website https://www.google.com
+ @provide-api yes (https://developers.google.com/custom-search/)
+
+ @using-api no
+ @results HTML
+ @stable no
+ @parse url, title, content, thumbnail
+"""
+
+from datetime import date, timedelta
+from json import loads
+from lxml import html
+from searx.engines.xpath import extract_text
+from searx.url_utils import urlencode
+import re
+
+# engine dependent config
# engine dependent config
categories = ['videos']
paging = True
safesearch = True
time_range_support = True
number_of_results = 10

# request() substitutes an already urlencoded 'q=...' pair for {query}, so
# the template must not add a 'q=' of its own ('?q={query}' produced
# '?q=q=...').
search_url = 'https://www.google.com/search'\
    '?{query}'\
    '&tbm=vid'\
    '&{search_options}'
time_range_attr = "qdr:{range}"
# custom date range, used by request() for 'year'; both bounds take the
# 'name:value' form (the colon after cd_max was missing)
time_range_custom_attr = "cdr:1,cd_min:{start},cd_max:{end}"
time_range_dict = {'day': 'd',
                   'week': 'w',
                   'month': 'm'}
+
+
+# do search-request
def request(query, params):
    """Build the video search URL and store it in ``params['url']``."""
    page_index = params['pageno'] - 1
    search_options = {
        'ijn': page_index,
        'start': page_index * number_of_results
    }

    time_range = params['time_range']
    if time_range in time_range_dict:
        search_options['tbs'] = time_range_attr.format(range=time_range_dict[time_range])
    elif time_range == 'year':
        # 'year' has no short code here: send the last 365 days as a
        # custom date range
        today = date.today()
        year_ago = today - timedelta(days=365)
        search_options['tbs'] = time_range_custom_attr.format(
            start=year_ago.strftime('%m/%d/%Y'),
            end=today.strftime('%m/%d/%Y'))

    if safesearch and params['safesearch']:
        search_options['safe'] = 'on'

    encoded_query = urlencode({'q': query})
    params['url'] = search_url.format(query=encoded_query,
                                      search_options=urlencode(search_options))

    return params
+
+
+# get response from search-request
def response(resp):
    """Parse the video result page into ``videos.html`` results.

    Thumbnails are not in the result markup itself: they are looked up, by
    the id of the result's image, inside the page script that contains
    ``_setImagesSrc``.  A result without a thumbnail gets ''.
    """
    results = []

    dom = html.fromstring(resp.text)

    # The script holding the thumbnail data is the same for every result, so
    # look it up once; a page without it simply yields no thumbnails.
    scripts = dom.xpath('//script[contains(., "_setImagesSrc")]')
    script = str(scripts[0].text) if scripts else ''

    # parse results
    for result in dom.xpath('//div[@class="g"]'):

        title = extract_text(result.xpath('.//h3'))
        url = result.xpath('.//div[@class="r"]/a/@href')[0]
        content = extract_text(result.xpath('.//span[@class="st"]'))

        # get thumbnails; start from '' so that a result without an image id
        # neither raises NameError nor reuses the previous result's thumbnail
        thumbnail = ''
        ids = result.xpath('.//div[@class="s"]//img/@id')
        if len(ids) > 0:
            thumbnails_data = \
                re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + ids[0],
                           script)
            tmp = []
            if len(thumbnails_data) != 0:
                tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
            if len(tmp) != 0:
                thumbnail = tmp[-1]

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'thumbnail': thumbnail,
                        'template': 'videos.html'})

    return results
diff --git a/searx/engines/ina.py b/searx/engines/ina.py
new file mode 100644
index 000000000..37a05f099
--- /dev/null
+++ b/searx/engines/ina.py
@@ -0,0 +1,87 @@
+# INA (Videos)
+#
+# @website https://www.ina.fr/
+# @provide-api no
+#
+# @using-api no
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content, publishedDate, thumbnail
+#
+# @todo set content-parameter with correct data
+# @todo embedded (needs some md5 from video page)
+
+from json import loads
+from lxml import html
+from dateutil import parser
+from searx.engines.xpath import extract_text
+from searx.url_utils import urlencode
+
+try:
+ from HTMLParser import HTMLParser
+except:
+ from html.parser import HTMLParser
+
+# engine dependent config
# engine dependent config
categories = ['videos']
paging = True
page_size = 48

# search-url
base_url = 'https://www.ina.fr'
search_url = ''.join((
    base_url,
    '/layout/set/ajax/recherche/result',
    '?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}',
))

# specific xpath variables
results_xpath = '//div[contains(@class,"search-results--list")]/div[@class="media"]'
url_xpath = './/a/@href'
title_xpath = './/h3[@class="h3--title media-heading"]'
thumbnail_xpath = './/img/@src'
publishedDate_xpath = './/span[@class="broadcast"]'
content_xpath = './/p[@class="media-body__summary"]'
+
+
+# do search-request
def request(query, params):
    """Store the search URL for the requested page in ``params['url']``."""
    # NOTE(review): page 1 is requested with b=page_size rather than b=0 --
    # confirm that this is the offset the endpoint expects.
    start = params['pageno'] * page_size
    encoded_query = urlencode({'q': query})

    params['url'] = search_url.format(ps=page_size,
                                      start=start,
                                      query=encoded_query)
    return params
+
+
+# get response from search-request
def response(resp):
    """Parse the JSON-wrapped HTML answer into ``videos.html`` results.

    Returns an empty list when the JSON document has no ``content`` key.
    """
    # HTMLParser.unescape() no longer exists on Python 3.9+; use
    # html.unescape() where available and fall back to the parser method on
    # interpreters without it.  (The module-level name ``html`` is lxml's
    # module, hence the function-scope import.)
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    results = []

    # we get html in a JSON container...
    payload = loads(resp.text)
    if "content" not in payload:
        return []
    dom = html.fromstring(payload["content"])

    # parse results
    for result in dom.xpath(results_xpath):
        videoid = result.xpath(url_xpath)[0]
        url = base_url + videoid
        title = unescape(extract_text(result.xpath(title_xpath)))
        thumbnail = extract_text(result.xpath(thumbnail_xpath)[0])
        # site-relative thumbnail -> absolute URL; startswith() also copes
        # with an empty string, where thumbnail[0] raised IndexError
        if thumbnail.startswith('/'):
            thumbnail = base_url + thumbnail
        d = extract_text(result.xpath(publishedDate_xpath)[0])
        d = d.split('/')
        # force ISO date to avoid wrong parsing
        d = "%s-%s-%s" % (d[2], d[1], d[0])
        publishedDate = parser.parse(d)
        content = extract_text(result.xpath(content_xpath))

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'template': 'videos.html',
                        'publishedDate': publishedDate,
                        'thumbnail': thumbnail})

    # return results
    return results
diff --git a/searx/engines/invidious.py b/searx/engines/invidious.py
new file mode 100644
index 000000000..8d81691fc
--- /dev/null
+++ b/searx/engines/invidious.py
@@ -0,0 +1,100 @@
+# Invidious (Videos)
+#
+# @website https://invidio.us/
+# @provide-api yes (https://github.com/omarroth/invidious/wiki/API)
+#
+# @using-api yes
+# @results JSON
+# @stable yes
+# @parse url, title, content, publishedDate, thumbnail, embedded
+
+from searx.url_utils import quote_plus
+from dateutil import parser
+import time
+
+# engine dependent config
# engine dependent config
categories = [
    "videos",
    "music",
]
paging = True
language_support = True
time_range_support = True

# search-url: root of the instance; API, watch and embed URLs are built on it
base_url = "https://invidio.us/"
+
+
+# do search-request
def request(query, params):
    """Build the Invidious API search URL and store it in ``params['url']``.

    Adds the page number, an optional ``date`` filter for the time range
    and, for locales that carry a country part ('xx-YY'), a ``region``
    filter.
    """
    time_range_dict = {
        "day": "today",
        "week": "week",
        "month": "month",
        "year": "year",
    }
    search_url = base_url + "api/v1/search?q={query}"
    params["url"] = search_url.format(
        query=quote_plus(query)
    ) + "&page={pageno}".format(pageno=params["pageno"])

    if params["time_range"] in time_range_dict:
        params["url"] += "&date={timerange}".format(
            timerange=time_range_dict[params["time_range"]]
        )

    if params["language"] != "all":
        lang = params["language"].split("-")
        if len(lang) == 2:
            # The search API takes the country code as `region`; the
            # parameter was previously sent under the name `range`.
            params["url"] += "&region={region}".format(region=lang[1])

    return params
+
+
+# get response from search-request
def response(resp):
    """Convert the JSON answer into ``videos.html`` results.

    Only entries of type ``video`` that carry a ``videoId`` are used.
    """
    results = []

    entries = resp.json()

    # iframe markup for the embedded player; {videoid} is filled per result
    embed_template = (
        '<iframe width="540" height="304" data-src="'
        + base_url
        + 'embed/{videoid}" frameborder="0" allowfullscreen></iframe>'
    )
    watch_prefix = base_url + "watch?v="

    for entry in entries:
        if entry.get("type", None) != "video":
            continue

        videoid = entry.get("videoId", None)
        if not videoid:
            continue

        url = watch_prefix + videoid
        embedded = embed_template.format(videoid=videoid)

        # use the 'sddefault' thumbnail when the entry offers one
        candidates = entry.get("videoThumbnails", [])
        chosen = next(
            (th for th in candidates if th["quality"] == "sddefault"), None
        )
        thumbnail = chosen.get("url", "") if chosen else ""

        # 'published' is turned into a datetime via its ctime() text form
        published_text = time.ctime(entry.get("published", 0))
        publishedDate = parser.parse(published_text)

        results.append(
            {
                "url": url,
                "title": entry.get("title", ""),
                "content": entry.get("description", ""),
                "template": "videos.html",
                "publishedDate": publishedDate,
                "embedded": embedded,
                "thumbnail": thumbnail,
            }
        )

    return results
diff --git a/searx/engines/json_engine.py b/searx/engines/json_engine.py
new file mode 100644
index 000000000..785b0c490
--- /dev/null
+++ b/searx/engines/json_engine.py
@@ -0,0 +1,136 @@
+from collections import Iterable
+from json import loads
+from sys import version_info
+from searx.url_utils import urlencode
+from searx.utils import to_string
+
# Python 3 has no separate `unicode` type; alias it so that is_iterable()
# can test against it on both major versions.
if version_info[0] == 3:
    unicode = str

# URL template and query paths, all unset or empty at import time
search_url = None
url_query = None
content_query = None
title_query = None
paging = False
suggestion_query = ''
results_query = ''

# parameters for engines with paging support
#
# number of results on each page
# (only needed if the site requires not a page number, but an offset)
page_size = 1
# number of the first page (usually 0 or 1)
first_page_num = 1
+
+
def iterate(iterable):
    """Yield ``(key, value)`` pairs of a mapping or a sequence.

    Keys are always strings: dict keys are passed through ``str()`` and
    sequence positions become '0', '1', ...
    """
    # isinstance() rather than an exact type comparison, so that dict
    # subclasses (e.g. OrderedDict) are walked by key instead of being
    # enumerated like a list of their keys
    if isinstance(iterable, dict):
        pairs = iterable.items()
    else:
        pairs = enumerate(iterable)

    for index, value in pairs:
        yield str(index), value
+
+
def is_iterable(obj):
    """Return True for iterable containers, False for text and scalars.

    Strings are iterable in Python but are leaf values for the query code,
    so ``str``/``unicode`` (and their subclasses) are reported as not
    iterable.
    """
    if isinstance(obj, (str, unicode)):
        return False
    return isinstance(obj, Iterable)
+
+
def parse(query):
    """Split a '/'-separated query path into its non-empty parts."""
    return [part for part in query.split('/') if part != '']
+
+
def do_query(data, q):
    """Collect every value in ``data`` reachable through the key path ``q``.

    The path may start at any depth: containers whose key does not match
    the current path element are searched with the whole path again.
    Returns a list of matches (empty for an empty path).
    """
    if not q:
        return []

    head, tail = q[0], q[1:]
    on_last_key = len(q) == 1
    found = []

    for key, value in iterate(data):
        matches = key == head

        # end of the path reached: the value itself is the match
        if on_last_key and matches:
            found.append(value)
            continue

        # only containers can hold further matches
        if not is_iterable(value):
            continue

        # a matching key consumes one path element, anything else is
        # searched with the unchanged path
        found.extend(do_query(value, tail if matches else q))

    return found
+
+
def query(data, query_string):
    """Return all values in ``data`` matching the '/'-separated path."""
    return do_query(data, parse(query_string))
+
+
def request(query, params):
    """Fill the configured URL template and store it in ``params['url']``.

    The url-encoded query is also stored in ``params['query']``.
    """
    # urlencode() yields 'q=<encoded>'; drop the leading 'q='
    encoded = urlencode({'q': query})[2:]

    fields = {'query': encoded}
    if paging and '{pageno}' in search_url:
        fields['pageno'] = (params['pageno'] - 1) * page_size + first_page_num

    params['url'] = search_url.format(**fields)
    params['query'] = encoded

    return params
+
+
def response(resp):
    """Extract results (and optional suggestions) from a JSON answer.

    With ``results_query`` set, the first match of that path is taken as
    the list of results and url/title/content are looked up inside each
    entry; entries without url or title are skipped and a missing content
    becomes "".  Without it, the url/title/content matches over the whole
    document are zipped together.
    """
    results = []
    json = loads(resp.text)
    if results_query:
        rs = query(json, results_query)
        if not len(rs):
            return results
        for result in rs[0]:
            # Lookups fail in several ways (no match, unset query path,
            # non-container entry); stay best-effort, but catch Exception
            # so that SystemExit/KeyboardInterrupt are not swallowed.
            try:
                url = query(result, url_query)[0]
                title = query(result, title_query)[0]
            except Exception:
                continue
            try:
                content = query(result, content_query)[0]
            except Exception:
                content = ""
            results.append({
                'url': to_string(url),
                'title': to_string(title),
                'content': to_string(content),
            })
    else:
        for url, title, content in zip(
            query(json, url_query),
            query(json, title_query),
            query(json, content_query)
        ):
            results.append({
                'url': to_string(url),
                'title': to_string(title),
                'content': to_string(content),
            })

    if not suggestion_query:
        return results
    for suggestion in query(json, suggestion_query):
        results.append({'suggestion': suggestion})
    return results
diff --git a/searx/engines/kickass.py b/searx/engines/kickass.py
new file mode 100644
index 000000000..5e897c96f
--- /dev/null
+++ b/searx/engines/kickass.py
@@ -0,0 +1,92 @@
+"""
+ Kickass Torrent (Videos, Music, Files)
+
+ @website https://kickass.so
+ @provide-api no (nothing found)
+
+ @using-api no
+ @results HTML (using search portal)
+ @stable yes (HTML can change)
+ @parse url, title, content, seed, leech, magnetlink
+"""
+
+from lxml import html
+from operator import itemgetter
+from searx.engines.xpath import extract_text
+from searx.utils import get_torrent_size, convert_str_to_int
+from searx.url_utils import quote, urljoin
+
+# engine dependent config
+categories = ['videos', 'music', 'files']
+paging = True
+
+# search-url
+url = 'https://kickass.cd/'
+search_url = url + 'search/{search_term}/{pageno}/'
+
+# specific xpath variables
+magnet_xpath = './/a[@title="Torrent magnet link"]'
+torrent_xpath = './/a[@title="Download torrent file"]'
+content_xpath = './/span[@class="font11px lightgrey block"]'
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` to the search page for ``query`` and the requested page."""
    search_term = quote(query)
    page_number = params['pageno']
    params['url'] = search_url.format(search_term=search_term, pageno=page_number)

    return params
+
+
+# get response from search-request
def response(resp):
    """Parse the HTML result table into torrent results sorted by seeders.

    Rows that do not contain a main torrent link are skipped, and a missing
    file size, magnet link or torrent-file link yields ``None`` for that
    field instead of aborting the whole page.
    """
    results = []

    dom = html.fromstring(resp.text)

    search_res = dom.xpath('//table[@class="data"]//tr')

    # return empty array if nothing is found
    if not search_res:
        return []

    # parse results (the first table row is not treated as a result)
    for result in search_res[1:]:
        links = result.xpath('.//a[@class="cellMainLink"]')
        if not links:
            # not a torrent entry: skip it rather than failing on [0]
            continue
        link = links[0]
        href = urljoin(url, link.attrib['href'])
        title = extract_text(link)
        content = extract_text(result.xpath(content_xpath))
        seed = extract_text(result.xpath('.//td[contains(@class, "green")]'))
        leech = extract_text(result.xpath('.//td[contains(@class, "red")]'))
        filesize_info = extract_text(result.xpath('.//td[contains(@class, "nobr")]'))
        files = extract_text(result.xpath('.//td[contains(@class, "center")][2]'))

        seed = convert_str_to_int(seed)
        leech = convert_str_to_int(leech)

        # the size cell is expected to read '<number> <unit>'
        try:
            filesize, filesize_multiplier = filesize_info.split()
        except ValueError:
            filesize = None
        else:
            filesize = get_torrent_size(filesize, filesize_multiplier)

        if files.isdigit():
            files = int(files)
        else:
            files = None

        magnet_links = result.xpath(magnet_xpath)
        magnetlink = magnet_links[0].attrib['href'] if magnet_links else None

        torrent_links = result.xpath(torrent_xpath)
        if torrent_links:
            torrentfile = torrent_links[0].attrib['href']
            torrentfileurl = quote(torrentfile, safe="%/:=&?~#+!$,;'@()*")
        else:
            torrentfileurl = None

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': content,
                        'seed': seed,
                        'leech': leech,
                        'filesize': filesize,
                        'files': files,
                        'magnetlink': magnetlink,
                        'torrentfile': torrentfileurl,
                        'template': 'torrent.html'})

    # return results sorted by seeder
    return sorted(results, key=itemgetter('seed'), reverse=True)
diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py
new file mode 100644
index 000000000..0607ac93b
--- /dev/null
+++ b/searx/engines/mediawiki.py
@@ -0,0 +1,90 @@
+"""
+ general mediawiki-engine (Web)
+
+ @website websites built on mediawiki (https://www.mediawiki.org)
+ @provide-api yes (http://www.mediawiki.org/wiki/API:Search)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title
+
+ @todo content
+"""
+
+from json import loads
+from string import Formatter
+from searx.url_utils import urlencode, quote
+
+# engine dependent config
+categories = ['general']
+language_support = True
+paging = True
+number_of_results = 1
+search_type = 'nearmatch' # possible values: title, text, nearmatch
+
+# search-url
+base_url = 'https://{language}.wikipedia.org/'
+search_postfix = 'w/api.php?action=query'\
+ '&list=search'\
+ '&{query}'\
+ '&format=json'\
+ '&sroffset={offset}'\
+ '&srlimit={limit}'\
+ '&srwhat={searchtype}'
+
+
+# do search-request
def request(query, params):
    """Build the MediaWiki API search URL and store it in ``params['url']``.

    The region-less language code is written back to ``params['language']``
    because ``response`` needs it to build result links.
    """
    string_args = {
        'query': urlencode({'srsearch': query}),
        'offset': (params['pageno'] - 1) * number_of_results,
        'limit': number_of_results,
        'searchtype': search_type,
    }

    # 'all' falls back to English; otherwise drop the region ('en-US' -> 'en')
    requested = params['language']
    language = 'en' if requested == 'all' else requested.split('-')[0]

    # only pass 'language' when base_url really contains a {language} field,
    # e.g. Formatter().parse gives
    # [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)]
    field_names = [parsed[1] for parsed in Formatter().parse(base_url)]
    if 'language' in field_names:
        string_args['language'] = language

    # write search-language back to params, required in response
    params['language'] = language

    params['url'] = (base_url + search_postfix).format(**string_args)

    return params
+
+
+# get response from search-request
def response(resp):
    """Turn the API's search hits into url/title results (content stays empty)."""
    search_results = loads(resp.text)

    # return empty array if there are no results
    hits = search_results.get('query', {}).get('search')
    if not hits:
        return []

    results = []
    for hit in hits:
        # redirect stubs are not useful as results
        if hit.get('snippet', '').startswith('#REDIRECT'):
            continue

        wiki_root = base_url.format(language=resp.search_params['language'])
        page_name = quote(hit['title'].replace(' ', '_').encode('utf-8'))

        results.append({'url': wiki_root + 'wiki/' + page_name,
                        'title': hit['title'],
                        'content': ''})

    return results
diff --git a/searx/engines/microsoft_academic.py b/searx/engines/microsoft_academic.py
new file mode 100644
index 000000000..9387b08d0
--- /dev/null
+++ b/searx/engines/microsoft_academic.py
@@ -0,0 +1,75 @@
+"""
+Microsoft Academic (Science)
+
+@website https://academic.microsoft.com
+@provide-api yes
+@using-api no
+@results JSON
+@stable no
+@parse url, title, content
+"""
+
+from datetime import datetime
+from json import loads
+from uuid import uuid4
+
+from searx.url_utils import urlencode
+from searx.utils import html_to_text
+
+categories = ['images']
+paging = True
+result_url = 'https://academic.microsoft.com/api/search/GetEntityResults?{query}'
+
+
def request(query, params):
    """Prepare the POST request (url, cookies and form data) for a search."""
    correlation_id = uuid4()
    cookie_id = uuid4()
    now = datetime.now()

    url_query = urlencode({'correlationId': correlation_id})
    params['url'] = result_url.format(query=url_query)

    params['cookies']['msacademic'] = str(cookie_id)
    params['cookies']['ai_user'] = 'vhd0H|{now}'.format(now=str(now))

    params['method'] = 'POST'
    # NOTE(review): 'Offset' is sent as a zero-based page number while 'Limit'
    # is 10 -- confirm whether the API expects a page index or a row offset
    form_data = {
        'Query': '@{query}@'.format(query=query),
        'Limit': 10,
        'Offset': params['pageno'] - 1,
        'Filters': '',
        'OrderBy': '',
        'SortAscending': False,
    }
    params['data'] = form_data

    return params
+
+
def response(resp):
    """Convert the entries under the JSON ``'results'`` key into result dicts."""
    entries = loads(resp.text)['results']

    results = []
    for entry in entries:
        entry_url = _get_url(entry)
        raw_title = entry['e']['dn']
        raw_content = _get_content(entry)

        # title and content are passed through html_to_text before display
        results.append({
            'url': entry_url,
            'title': html_to_text(raw_title),
            'content': html_to_text(raw_content),
        })

    return results
+
+
+def _get_url(result):
+ if 's' in result['e']:
+ return result['e']['s'][0]['u']
+ return 'https://academic.microsoft.com/#/detail/{pid}'.format(pid=result['id'])
+
+
+def _get_content(result):
+ if 'd' in result['e']:
+ content = result['e']['d']
+ if len(content) > 300:
+ return content[:300] + '...'
+ return content
+
+ return ''
diff --git a/searx/engines/mixcloud.py b/searx/engines/mixcloud.py
new file mode 100644
index 000000000..470c007ea
--- /dev/null
+++ b/searx/engines/mixcloud.py
@@ -0,0 +1,61 @@
+"""
+ Mixcloud (Music)
+
+ @website https://www.mixcloud.com/
+ @provide-api yes (http://www.mixcloud.com/developers/)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, content, embedded, publishedDate
+"""
+
+from json import loads
+from dateutil import parser
+from searx.url_utils import urlencode
+
+# engine dependent config
+categories = ['music']
+paging = True
+
+# search-url
+url = 'https://api.mixcloud.com/'
+search_url = url + 'search/?{query}&type=cloudcast&limit=10&offset={offset}'
+
+embedded_url = '<iframe scrolling="no" frameborder="0" allowTransparency="true" ' +\
+ 'data-src="https://www.mixcloud.com/widget/iframe/?feed={url}" width="300" height="300"></iframe>'
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` to the API search URL for ``query``."""
    # ten results per page, matching limit=10 in search_url
    page_index = params['pageno'] - 1
    encoded_query = urlencode({'q': query})

    params['url'] = search_url.format(query=encoded_query, offset=page_index * 10)

    return params
+
+
+# get response from search-request
def response(resp):
    """Convert the items under the JSON ``'data'`` key into music results."""
    search_res = loads(resp.text)

    results = []
    for item in search_res.get('data', []):
        title = item['name']
        item_url = item['url']
        uploader = item['user']['name']

        results.append({'url': item_url,
                        'title': title,
                        # embeddable player markup for this item
                        'embedded': embedded_url.format(url=item_url),
                        'publishedDate': parser.parse(item['created_time']),
                        # the uploader's name doubles as the content line
                        'content': uploader})

    return results
diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py
new file mode 100644
index 000000000..c57979a5f
--- /dev/null
+++ b/searx/engines/nyaa.py
@@ -0,0 +1,108 @@
+"""
+ Nyaa.si (Anime Bittorrent tracker)
+
+ @website https://nyaa.si/
+ @provide-api no
+ @using-api no
+ @results HTML
+ @stable no (HTML can change)
+ @parse url, title, content, seed, leech, torrentfile
+"""
+
+from lxml import html
+from searx.engines.xpath import extract_text
+from searx.url_utils import urlencode
+from searx.utils import get_torrent_size, int_or_zero
+
+# engine dependent config
+categories = ['files', 'images', 'videos', 'music']
+paging = True
+
+# search-url
+base_url = 'https://nyaa.si/'
+search_url = base_url + '?page=search&{query}&offset={offset}'
+
+# xpath queries
+xpath_results = '//table[contains(@class, "torrent-list")]//tr[not(th)]'
+xpath_category = './/td[1]/a[1]'
+xpath_title = './/td[2]/a[last()]'
+xpath_torrent_links = './/td[3]/a'
+xpath_filesize = './/td[4]/text()'
+xpath_seeds = './/td[6]/text()'
+xpath_leeches = './/td[7]/text()'
+xpath_downloads = './/td[8]/text()'
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` to the search URL for ``query``."""
    encoded_term = urlencode({'term': query})
    # the page number is passed straight through as the 'offset' value
    params['url'] = search_url.format(query=encoded_term, offset=params['pageno'])
    return params
+
+
+# get response from search-request
def response(resp):
    """Parse the torrent table into a list of torrent results.

    Optional fields (category, file size, magnet/torrent links) fall back to
    per-row defaults when they cannot be extracted, so one incomplete row
    neither fails the page nor inherits values from the previous row.
    """
    results = []

    dom = html.fromstring(resp.text)

    for result in dom.xpath(xpath_results):
        # defaults, reset for every row
        category = ""
        filesize = 0
        magnet_link = ""
        torrent_link = ""

        # category in which our torrent belongs
        try:
            category = result.xpath(xpath_category)[0].attrib.get('title')
        except IndexError:
            pass

        # torrent title
        page_a = result.xpath(xpath_title)[0]
        title = extract_text(page_a)

        # link to the page
        href = base_url + page_a.attrib.get('href')

        for link in result.xpath(xpath_torrent_links):
            url = link.attrib.get('href')
            if not url:
                # anchor without href: nothing to classify
                continue
            if 'magnet' in url:
                # link to the magnet
                magnet_link = url
            else:
                # link to the torrent file
                torrent_link = url

        # seed count
        seed = int_or_zero(result.xpath(xpath_seeds))

        # leech count
        leech = int_or_zero(result.xpath(xpath_leeches))

        # torrent downloads count
        downloads = int_or_zero(result.xpath(xpath_downloads))

        # let's try to calculate the torrent size; work on temporaries so a
        # failure half-way leaves the default instead of the raw size string
        try:
            size_value, size_unit = result.xpath(xpath_filesize)[0].split()
            filesize = get_torrent_size(size_value, size_unit)
        except Exception:
            # best effort: keep the default size
            pass

        # content string contains all information not included into template
        content = 'Category: "{category}". Downloaded {downloads} times.'
        content = content.format(category=category, downloads=downloads)

        results.append({'url': href,
                        'title': title,
                        'content': content,
                        'seed': seed,
                        'leech': leech,
                        'filesize': filesize,
                        'torrentfile': torrent_link,
                        'magnetlink': magnet_link,
                        'template': 'torrent.html'})

    return results
diff --git a/searx/engines/openstreetmap.py b/searx/engines/openstreetmap.py
new file mode 100644
index 000000000..733ba6203
--- /dev/null
+++ b/searx/engines/openstreetmap.py
@@ -0,0 +1,95 @@
+"""
+ OpenStreetMap (Map)
+
+ @website https://openstreetmap.org/
+ @provide-api yes (http://wiki.openstreetmap.org/wiki/Nominatim)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title
+"""
+
+from json import loads
+
+# engine dependent config
+categories = ['map']
+paging = False
+
+# search-url
+base_url = 'https://nominatim.openstreetmap.org/'
+search_string = 'search/{query}?format=json&polygon_geojson=1&addressdetails=1'
+result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}'
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` to the search URL for ``query``."""
    # NOTE(review): the query is placed into the URL path as-is (no quoting
    # happens in this function) -- confirm callers pass URL-safe text
    search_path = search_string.format(query=query)
    params['url'] = base_url + search_path

    return params
+
+
+# get response from search-request
def response(resp):
    """Convert the JSON list of places into ``map.html`` results.

    Entries without ``display_name`` are skipped.  An address block is only
    attached for amenity/shop/tourism/leisure places for which a name can be
    found in the entry's ``address`` data; otherwise ``address`` is ``None``.
    """
    results = []
    places = loads(resp.text)

    # place classes for which a named address is built
    named_classes = ('amenity', 'shop', 'tourism', 'leisure')

    # parse results
    for r in places:
        if 'display_name' not in r:
            continue

        title = r['display_name'] or u''
        osm_type = r.get('osm_type', r.get('type'))
        url = result_base_url.format(osm_type=osm_type,
                                     osm_id=r['osm_id'])

        osm = {'type': osm_type,
               'id': r['osm_id']}

        geojson = r.get('geojson')

        # if no geojson is found and osm_type is a node, add geojson Point
        if not geojson and osm_type == 'node':
            geojson = {u'type': u'Point', u'coordinates': [r['lon'], r['lat']]}

        # entries may lack address details (or the class key); treat both as
        # "no address" instead of raising
        address_raw = r.get('address') or {}
        address = None

        if r.get('class') in named_classes:
            # get name
            name = address_raw.get('address29') or address_raw.get(r.get('type'))

            # add rest of address data, if something is already found
            if name:
                address = {'name': name,
                           'house_number': address_raw.get('house_number'),
                           'road': address_raw.get('road'),
                           'locality': address_raw.get('city',
                                       address_raw.get('town',                                                    # noqa
                                       address_raw.get('village'))),                                              # noqa
                           'postcode': address_raw.get('postcode'),
                           'country': address_raw.get('country'),
                           'country_code': address_raw.get('country_code')}

        # append result
        results.append({'template': 'map.html',
                        'title': title,
                        'content': '',
                        'longitude': r['lon'],
                        'latitude': r['lat'],
                        'boundingbox': r['boundingbox'],
                        'geojson': geojson,
                        'address': address,
                        'osm': osm,
                        'url': url})

    # return results
    return results
diff --git a/searx/engines/pdbe.py b/searx/engines/pdbe.py
new file mode 100644
index 000000000..2db92868a
--- /dev/null
+++ b/searx/engines/pdbe.py
@@ -0,0 +1,112 @@
+"""
+ PDBe (Protein Data Bank in Europe)
+
+ @website https://www.ebi.ac.uk/pdbe
+ @provide-api yes (https://www.ebi.ac.uk/pdbe/api/doc/search.html),
+ unlimited
+ @using-api yes
+ @results python dictionary (from json)
+ @stable yes
+ @parse url, title, content, img_src
+"""
+
+from json import loads
+from flask_babel import gettext
+
+categories = ['science']
+
+hide_obsolete = False
+
+# status codes of unpublished entries
+pdb_unpublished_codes = ['HPUB', 'HOLD', 'PROC', 'WAIT', 'AUTH', 'AUCO', 'REPL', 'POLC', 'REFI', 'TRSF', 'WDRN']
+# url for api query
+pdbe_solr_url = 'https://www.ebi.ac.uk/pdbe/search/pdb/select?'
+# base url for results
+pdbe_entry_url = 'https://www.ebi.ac.uk/pdbe/entry/pdb/{pdb_id}'
+# link to preview image of structure
+pdbe_preview_url = 'https://www.ebi.ac.uk/pdbe/static/entry/{pdb_id}_deposited_chain_front_image-200x200.png'
+
+
def request(query, params):
    """Prepare a POST request to the PDBe search endpoint for ``query``."""
    form_data = {
        'q': query,
        'wt': "json"  # request response in parsable format
    }

    params['url'] = pdbe_solr_url
    params['method'] = 'POST'
    params['data'] = form_data
    return params
+
+
def construct_body(result):
    """Build ``[title, content, img_src]`` for one PDBe entry.

    ``content`` is a one-line citation; it is ``None`` when a field needed
    for it is missing or the author list is empty.  ``img_src`` is the
    preview-image URL, or ``None`` when the entry has no ``pdb_id``.
    """
    # set title
    title = result['title']

    # construct content body
    content = """{title} - {authors} {journal} ({volume}) {page} ({year})"""

    # replace placeholders with actual content
    try:
        if result['journal']:
            content = content.format(
                title=result['citation_title'],
                authors=result['entry_author_list'][0], journal=result['journal'], volume=result['journal_volume'],
                page=result['journal_page'], year=result['citation_year'])
        else:
            # no journal: fall back to the release year
            content = content.format(
                title=result['citation_title'],
                authors=result['entry_author_list'][0], journal='', volume='', page='', year=result['release_year'])
    except (KeyError, IndexError):
        content = None

    # construct url for preview image
    try:
        img_src = pdbe_preview_url.format(pdb_id=result['pdb_id'])
    except KeyError:
        img_src = None

    return [title, content, img_src]
+
+
def response(resp):
    """Convert the Solr documents of the response into result dicts.

    Unpublished entries are always dropped.  Obsolete entries (status
    ``'OBS'``) are dropped when ``hide_obsolete`` is set; otherwise they are
    listed with a marker in the title and a pointer to the superseding entry.
    """
    results = []
    docs = loads(resp.text)['response']['docs']

    # parse results
    for result in docs:
        if result['status'] in pdb_unpublished_codes:
            continue

        # catch obsolete entries and mark them accordingly
        if result['status'] == 'OBS':
            # hide_obsolete only affects obsolete entries, not every result
            if hide_obsolete:
                continue

            # expand title to add some sort of warning message
            title = gettext('{title} (OBSOLETE)').format(title=result['title'])
            try:
                superseded_url = pdbe_entry_url.format(pdb_id=result['superseded_by'])
            except KeyError:
                # no successor known: nothing useful to show
                continue

            # since we can't construct a proper body from the response, we'll make up our own
            msg_superseded = gettext("This entry has been superseded by")
            content = '{msg_superseded}: {url} ({pdb_id})'.format(
                msg_superseded=msg_superseded,
                url=superseded_url,
                pdb_id=result['superseded_by'])

            # obsoleted entries don't have preview images
            img_src = None
        else:
            title, content, img_src = construct_body(result)

        results.append({
            'url': pdbe_entry_url.format(pdb_id=result['pdb_id']),
            'title': title,
            'content': content,
            'img_src': img_src
        })

    return results
diff --git a/searx/engines/photon.py b/searx/engines/photon.py
new file mode 100644
index 000000000..15236f680
--- /dev/null
+++ b/searx/engines/photon.py
@@ -0,0 +1,131 @@
+"""
+ Photon (Map)
+
+ @website https://photon.komoot.de
+ @provide-api yes (https://photon.komoot.de/)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title
+"""
+
+from json import loads
+from searx.utils import searx_useragent
+from searx.url_utils import urlencode
+
+# engine dependent config
+categories = ['map']
+paging = False
+language_support = True
+number_of_results = 10
+
+# search-url
+base_url = 'https://photon.komoot.de/'
+search_string = 'api/?{query}&limit={limit}'
+result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}'
+
+# list of supported languages
+supported_languages = ['de', 'en', 'fr', 'it']
+
+
+# do search-request
def request(query, params):
    """Build the Photon search URL and set the searx User-Agent header.

    A ``lang`` parameter is appended when the requested language, stripped of
    its region, is one of ``supported_languages``.  Both ``en-US`` and
    ``en_US`` style codes are accepted.
    """
    params['url'] = base_url +\
        search_string.format(query=urlencode({'q': query}),
                             limit=number_of_results)

    if params['language'] != 'all':
        # normalise the separator first so either form reduces to 'en'
        language = params['language'].replace('_', '-').split('-')[0]
        if language in supported_languages:
            params['url'] = params['url'] + "&lang=" + language

    # using searx User-Agent
    params['headers']['User-Agent'] = searx_useragent()

    return params
+
+
+# get response from search-request
def response(resp):
    """Convert the features of the JSON response into ``map.html`` results.

    Features without properties or with an unknown ``osm_type`` are skipped.
    """
    results = []
    json = loads(resp.text)

    for feature in json.get('features', {}):
        properties = feature.get('properties')
        if not properties:
            continue

        title = properties.get('name')

        # translate the one-letter osm type into its long form
        type_code = properties.get('osm_type')
        if type_code == 'N':
            osm_type = 'node'
        elif type_code == 'W':
            osm_type = 'way'
        elif type_code == 'R':
            osm_type = 'relation'
        else:
            # skip features with an invalid osm-type
            continue

        osm_id = properties.get('osm_id')
        url = result_base_url.format(osm_type=osm_type, osm_id=osm_id)
        osm = {'type': osm_type,
               'id': osm_id}

        geojson = feature.get('geometry')

        extent = properties.get('extent')
        if extent:
            # reorder the extent values into the order used for 'boundingbox'
            boundingbox = [extent[3], extent[1], extent[0], extent[2]]
        else:
            # TODO: better boundingbox calculation
            # no extent: collapse the box onto the feature's own coordinates
            boundingbox = [geojson['coordinates'][1],
                           geojson['coordinates'][1],
                           geojson['coordinates'][0],
                           geojson['coordinates'][0]]

        # an address is only attached to named amenity/shop/tourism/leisure features
        address = None
        if properties.get('osm_key') in ('amenity', 'shop', 'tourism', 'leisure') and title:
            address = {'name': title,
                       'house_number': properties.get('housenumber'),
                       'road': properties.get('street'),
                       'locality': properties.get('city',
                                   properties.get('town',                                                    # noqa
                                   properties.get('village'))),                                              # noqa
                       'postcode': properties.get('postcode'),
                       'country': properties.get('country')}

        results.append({'template': 'map.html',
                        'title': title,
                        'content': '',
                        'longitude': geojson['coordinates'][0],
                        'latitude': geojson['coordinates'][1],
                        'boundingbox': boundingbox,
                        'geojson': geojson,
                        'address': address,
                        'osm': osm,
                        'url': url})

    return results
diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py
new file mode 100644
index 000000000..2f3f22a97
--- /dev/null
+++ b/searx/engines/piratebay.py
@@ -0,0 +1,96 @@
+# Piratebay (Videos, Music, Files)
+#
+# @website https://thepiratebay.se
+# @provide-api no (nothing found)
+#
+# @using-api no
+# @results HTML (using search portal)
+# @stable yes (HTML can change)
+# @parse url, title, content, seed, leech, magnetlink
+
+from lxml import html
+from operator import itemgetter
+from searx.engines.xpath import extract_text
+from searx.url_utils import quote, urljoin
+
+# engine dependent config
+categories = ['videos', 'music', 'files']
+paging = True
+
+# search-url
+url = 'https://thepiratebay.org/'
+search_url = url + 'search/{search_term}/{pageno}/99/{search_type}'
+
+# piratebay specific type-definitions
+search_types = {'files': '0',
+ 'music': '100',
+ 'videos': '200'}
+
+# specific xpath variables
+magnet_xpath = './/a[@title="Download this torrent using magnet"]'
+torrent_xpath = './/a[@title="Download this torrent"]'
+content_xpath = './/font[@class="detDesc"]'
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` to the search page for ``query``."""
    # categories without a dedicated type fall back to '0'
    type_code = search_types.get(params['category'], '0')

    # the site's page index is zero-based, searx's pageno starts at 1
    page_index = params['pageno'] - 1

    params['url'] = search_url.format(search_term=quote(query),
                                      search_type=type_code,
                                      pageno=page_index)

    return params
+
+
+# get response from search-request
def response(resp):
    """Parse the search result table into torrent results sorted by seeders."""
    dom = html.fromstring(resp.text)

    rows = dom.xpath('//table[@id="searchResult"]//tr')

    # return empty array if nothing is found
    if not rows:
        return []

    results = []

    # the first table row is not treated as a result
    for row in rows[1:]:
        name_link = row.xpath('.//div[@class="detName"]//a')[0]
        href = urljoin(url, name_link.attrib.get('href'))
        title = extract_text(name_link)
        content = extract_text(row.xpath(content_xpath))

        # the first two right-aligned cells carry seeders and leechers
        seed, leech = row.xpath('.//td[@align="right"]/text()')[:2]

        # non-numeric counts are reported as 0
        seed = int(seed) if seed.isdigit() else 0
        leech = int(leech) if leech.isdigit() else 0

        magnet_anchor = row.xpath(magnet_xpath)[0]

        # the torrent-file link is optional
        torrent_anchors = row.xpath(torrent_xpath)
        torrentfile_link = torrent_anchors[0].attrib.get('href') if torrent_anchors else None

        results.append({'url': href,
                        'title': title,
                        'content': content,
                        'seed': seed,
                        'leech': leech,
                        'magnetlink': magnet_anchor.attrib.get('href'),
                        'torrentfile': torrentfile_link,
                        'template': 'torrent.html'})

    # most-seeded first
    results.sort(key=itemgetter('seed'), reverse=True)
    return results
diff --git a/searx/engines/pubmed.py b/searx/engines/pubmed.py
new file mode 100644
index 000000000..055f09226
--- /dev/null
+++ b/searx/engines/pubmed.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+"""
+ PubMed (Scholar publications)
+ @website https://www.ncbi.nlm.nih.gov/pubmed/
+ @provide-api yes (https://www.ncbi.nlm.nih.gov/home/develop/api/)
+ @using-api yes
+ @results XML
+ @stable yes
+ @parse url, title, publishedDate, content
+ More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/
+"""
+
+from flask_babel import gettext
+from lxml import etree
+from datetime import datetime
+from searx.url_utils import urlencode
+from searx.poolrequests import get
+
+
+categories = ['science']
+
+base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'\
+ + '?db=pubmed&{query}&retstart={offset}&retmax={hits}'
+
+# engine dependent config
+number_of_results = 10
+pubmed_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
+
+
def request(query, params):
    """Set ``params['url']`` to the esearch URL for ``query`` and the requested page."""
    # basic search: translate the page number into a result offset
    first_hit = (params['pageno'] - 1) * number_of_results

    params['url'] = base_url.format(query=urlencode({'term': query}),
                                    offset=first_hit,
                                    hits=number_of_results)

    return params
+
+
def response(resp):
    """Resolve the PubMed ids of the search response into full results.

    The search response only carries ids, so a second request (efetch) is
    made to retrieve title, abstract, DOI and creation date of each article.
    No second request is made when the search returned no ids.
    """
    results = []

    # First retrieve notice of each result
    pubmed_retrieve_api_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'\
        + 'db=pubmed&retmode=xml&id={pmids_string}'

    pmids_results = etree.XML(resp.content)
    pmids = pmids_results.xpath('//eSearchResult/IdList/Id')

    # nothing found: spare the follow-up request
    if not pmids:
        return results

    pmids_string = ','.join(item.text for item in pmids)

    retrieve_url_encoded = pubmed_retrieve_api_url.format(pmids_string=pmids_string)

    search_results_xml = get(retrieve_url_encoded).content
    search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation')

    for entry in search_results:
        title = entry.xpath('.//Article/ArticleTitle')[0].text

        pmid = entry.xpath('.//PMID')[0].text
        url = pubmed_url + pmid

        # a missing abstract element or one without leading text both count
        # as "no abstract"
        abstract_nodes = entry.xpath('.//Abstract/AbstractText')
        content = abstract_nodes[0].text if abstract_nodes else None
        if not content:
            content = gettext('No abstract is available for this publication.')

        # If a doi is available, add it to the snippet
        doi_nodes = entry.xpath('.//ELocationID[@EIdType="doi"]')
        if doi_nodes:
            content = 'DOI: {doi} Abstract: {content}'.format(doi=doi_nodes[0].text, content=content)

        if len(content) > 300:
            content = content[0:300] + "..."
        # TODO: center snippet on query term

        res_dict = {'url': url,
                    'title': title,
                    'content': content}

        # the publication date is optional: skip it when a part is missing
        # (IndexError), empty (TypeError) or not a valid date (ValueError)
        try:
            date_string = '-'.join(entry.xpath('.//DateCreated/' + part)[0].text
                                   for part in ('Year', 'Month', 'Day'))
            res_dict['publishedDate'] = datetime.strptime(date_string, '%Y-%m-%d')
        except (IndexError, TypeError, ValueError):
            pass

        results.append(res_dict)

    return results
diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py
new file mode 100644
index 000000000..de12955c6
--- /dev/null
+++ b/searx/engines/qwant.py
@@ -0,0 +1,133 @@
+"""
+ Qwant (Web, Images, News, Social)
+
+ @website https://qwant.com/
+ @provide-api not officially (https://api.qwant.com/api/search/)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, content
+"""
+
+from datetime import datetime
+from json import loads
+from searx.utils import html_to_text
+from searx.url_utils import urlencode
+from searx.utils import match_language
+
+# engine dependent config
+categories = None
+paging = True
+language_support = True
+supported_languages_url = 'https://qwant.com/region'
+
+category_to_keyword = {'general': 'web',
+ 'images': 'images',
+ 'news': 'news',
+ 'social media': 'social'}
+
+# search-url
+url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}&t={keyword}&uiv=4'
+
+
+# do search-request
def request(query, params):
    """Build the Qwant API URL for the engine's first configured category."""
    offset = (params['pageno'] - 1) * 10

    # map the first configured category to Qwant's keyword, defaulting to 'web'
    category = categories[0]
    if category and category in category_to_keyword:
        keyword = category_to_keyword[category]
    else:
        keyword = 'web'

    params['url'] = url.format(keyword=keyword,
                               query=urlencode({'q': query}),
                               offset=offset)

    # add language tag, written as lower-case with '_' (e.g. 'en_us')
    if params['language'] != 'all':
        language = match_language(params['language'], supported_languages, language_aliases)
        params['url'] += '&locale=' + language.replace('-', '_').lower()

    return params
+
+
+# get response from search-request
def response(resp):
    """Convert the JSON items into results shaped for the engine's category.

    The result layout (plain, image, social, news) is chosen from the first
    configured category; items of any other category are dropped.
    """
    search_results = loads(resp.text)

    # return empty array if there are no results
    if 'data' not in search_results:
        return []

    items = search_results.get('data', {}).get('result', {}).get('items', {})

    results = []
    for item in items:
        title = html_to_text(item['title'])
        res_url = item['url']
        content = html_to_text(item['desc'])

        keyword = category_to_keyword.get(categories[0], '')

        if keyword == 'web':
            results.append({'title': title,
                            'content': content,
                            'url': res_url})

        elif keyword == 'images':
            thumbnail_src = item['thumbnail']
            img_src = item['media']
            results.append({'template': 'images.html',
                            'url': res_url,
                            'title': title,
                            'content': '',
                            'thumbnail_src': thumbnail_src,
                            'img_src': img_src})

        elif keyword == 'social':
            published_date = datetime.fromtimestamp(item['date'], None)
            img_src = item.get('img', None)
            results.append({'url': res_url,
                            'title': title,
                            'publishedDate': published_date,
                            'content': content,
                            'img_src': img_src})

        elif keyword == 'news':
            published_date = datetime.fromtimestamp(item['date'], None)
            # use the picture of the first media entry, if there is one
            media = item.get('media', [])
            img_src = media[0].get('pict', {}).get('url', None) if len(media) > 0 else None
            results.append({'url': res_url,
                            'title': title,
                            'publishedDate': published_date,
                            'content': content,
                            'img_src': img_src})

    return results
+
+
+# get supported languages from their site
def _fetch_supported_languages(resp):
    """Extract the ``<language>-<country>`` codes embedded in the region page."""
    # list of regions is embedded in page as a js object: cut the text down to
    # the object that follows 'regionalisation', ending before the next ');'
    page = resp.text
    page = page[page.find('regionalisation'):]
    page = page[page.find('{'):page.find(');')]

    regions_json = loads(page)

    supported_languages = []
    for lang in regions_json['languages'].values():
        # 'nb' is reported as 'no'
        code = 'no' if lang['code'] == 'nb' else lang['code']
        for country in lang['countries']:
            supported_languages.append(code + '-' + country)

    return supported_languages
diff --git a/searx/engines/reddit.py b/searx/engines/reddit.py
new file mode 100644
index 000000000..d19724906
--- /dev/null
+++ b/searx/engines/reddit.py
@@ -0,0 +1,76 @@
+"""
+ Reddit
+
+ @website https://www.reddit.com/
+ @provide-api yes (https://www.reddit.com/dev/api)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, content, thumbnail, publishedDate
+"""
+
+import json
+from datetime import datetime
+from searx.url_utils import urlencode, urljoin, urlparse
+
+# engine dependent config
+categories = ['general', 'images', 'news', 'social media']
+page_size = 25
+
+# search-url
+base_url = 'https://www.reddit.com/'
+search_url = base_url + 'search.json?{query}'
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` to the reddit search URL for ``query``."""
    url_query = urlencode({'q': query, 'limit': page_size})
    params['url'] = search_url.format(query=url_query)

    return params
+
+
+# get response from search-request
def response(resp):
    """Convert reddit posts into results, image posts first.

    A post whose ``thumbnail`` field parses as a URL with both host and path
    becomes an image result; every other post becomes a text result.
    """
    search_results = json.loads(resp.text)

    # return empty array if there are no results
    if 'data' not in search_results:
        return []

    image_posts = []
    text_posts = []

    for post in search_results.get('data', {}).get('children', []):
        data = post['data']

        # fields shared by both kinds of result
        entry = {
            'url': urljoin(base_url, data['permalink']),
            'title': data['title']
        }

        # if thumbnail field contains a valid URL, we need to change template
        thumbnail = data['thumbnail']
        parsed = urlparse(thumbnail)
        netloc, path = parsed[1], parsed[2]

        if netloc != '' and path != '':
            entry['img_src'] = data['url']
            entry['thumbnail_src'] = thumbnail
            entry['template'] = 'images.html'
            image_posts.append(entry)
            continue

        created = datetime.fromtimestamp(data['created_utc'])
        content = data['selftext']
        # keep the text preview short
        if len(content) > 500:
            content = content[:500] + '...'
        entry['content'] = content
        entry['publishedDate'] = created
        text_posts.append(entry)

    # show images first and text results second
    return image_posts + text_posts
diff --git a/searx/engines/scanr_structures.py b/searx/engines/scanr_structures.py
new file mode 100644
index 000000000..72fd2b3c9
--- /dev/null
+++ b/searx/engines/scanr_structures.py
@@ -0,0 +1,76 @@
+"""
+ ScanR Structures (Science)
+
+ @website https://scanr.enseignementsup-recherche.gouv.fr
+ @provide-api yes (https://scanr.enseignementsup-recherche.gouv.fr/api/swagger-ui.html)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, content, img_src
+"""
+
+from json import loads, dumps
+from searx.utils import html_to_text
+
# engine settings
categories = ['science']
paging = True
page_size = 20

# site root and the structure search endpoint below it
url = 'https://scanr.enseignementsup-recherche.gouv.fr/'
search_url = ''.join((url, 'api/structures/search'))
+
+
+# do search-request
def request(query, params):
    """Prepare a JSON POST against the ScanR structure search.

    The body asks for page ``params['pageno']`` with ``page_size`` entries,
    searching all fields and sorting by relevancy.  Returns ``params``.
    """
    payload = {"query": query,
               "searchField": "ALL",
               "sortDirection": "ASC",
               "sortOrder": "RELEVANCY",
               "page": params['pageno'],
               "pageSize": page_size}

    params['url'] = search_url
    params['method'] = 'POST'
    params['headers']['Content-type'] = "application/json"
    params['data'] = dumps(payload)

    return params
+
+
+# get response from search-request
def response(resp):
    """Turn a ScanR search reply into result dicts.

    Returns an empty list when the reply reports fewer than one hit.
    Entries without an ``id`` are skipped.  The optional ``logo`` is
    exposed as ``img_src`` and the first highlight, when present, as
    ``content``.
    """
    results = []

    search_res = loads(resp.text)

    # return empty array if there are no results
    if search_res.get('total', 0) < 1:
        return []

    # parse results
    for result in search_res['results']:
        if 'id' not in result:
            continue

        # an empty or missing logo means "no image"; a site-relative path
        # is made absolute without doubling the slash that ends ``url``
        thumbnail = result.get('logo') or None
        if thumbnail and thumbnail.startswith('/'):
            thumbnail = url.rstrip('/') + thumbnail

        # always hand html_to_text a string, even when nothing was highlighted
        highlights = result.get('highlights') or []
        content = highlights[0]['value'] if highlights else ''

        # append result
        results.append({'url': url + 'structure/' + result['id'],
                        'title': result['label'],
                        # 'thumbnail': thumbnail,
                        'img_src': thumbnail,
                        'content': html_to_text(content)})

    # return results
    return results
diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py
new file mode 100644
index 000000000..789e8e7a9
--- /dev/null
+++ b/searx/engines/searchcode_code.py
@@ -0,0 +1,69 @@
+"""
+ Searchcode (It)
+
+ @website https://searchcode.com/
+ @provide-api yes (https://searchcode.com/api/)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, content
+"""
+
+from json import loads
+from searx.url_utils import urlencode
+
+
# engine settings
categories = ['it']
paging = True

# API root and the code search endpoint
url = 'https://searchcode.com/'
search_url = '{}api/codesearch_I/?{{query}}&p={{pageno}}'.format(url)

# file endings whose language name differs from the ending itself
code_endings = dict(cs='c#', h='c', hpp='cpp', cxx='cpp')
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` for a searchcode code search.

    The remote page index is ``params['pageno'] - 1``.  Returns ``params``.
    """
    encoded_query = urlencode({'q': query})
    remote_page = params['pageno'] - 1
    params['url'] = search_url.format(query=encoded_query, pageno=remote_page)
    return params
+
+
+# get response from search-request
def response(resp):
    """Build ``code.html`` results from a searchcode code-search reply.

    Each result carries the matching lines as a list of
    ``(line_number, code)`` pairs sorted by line number, and a language
    name derived from the file ending (mapped through ``code_endings``).
    """
    results = []

    reply = loads(resp.text)

    for hit in reply.get('results', []):
        href = hit['url']
        title = "" + hit['name'] + " - " + hit['filename']
        repo = hit['repo']

        # JSON keys are strings; convert them so sorting is numeric
        numbered = dict((int(number), text) for number, text in hit['lines'].items())

        ending = hit['filename'].split('.')[-1].lower()
        code_language = code_endings.get(ending, ending)

        results.append({'url': href,
                        'title': title,
                        'content': '',
                        'repository': repo,
                        'codelines': sorted(numbered.items()),
                        'code_language': code_language,
                        'template': 'code.html'})

    return results
diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py
new file mode 100644
index 000000000..4b8e9a84a
--- /dev/null
+++ b/searx/engines/searchcode_doc.py
@@ -0,0 +1,49 @@
+"""
+ Searchcode (It)
+
+ @website https://searchcode.com/
+ @provide-api yes (https://searchcode.com/api/)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, content
+"""
+
+from json import loads
+from searx.url_utils import urlencode
+
# engine settings
categories = ['it']
paging = True

# API root and the documentation search endpoint
url = 'https://searchcode.com/'
search_url = '{}api/search_IV/?{{query}}&p={{pageno}}'.format(url)
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` for a searchcode documentation search.

    The remote page index is ``params['pageno'] - 1``.  Returns ``params``.
    """
    encoded_query = urlencode({'q': query})
    remote_page = params['pageno'] - 1
    params['url'] = search_url.format(query=encoded_query, pageno=remote_page)
    return params
+
+
+# get response from search-request
def response(resp):
    """Map every entry of the reply's ``results`` list to a result dict.

    The title is ``[type] namespace name`` and the content is the entry's
    description.  A reply without ``results`` yields an empty list.
    """
    reply = loads(resp.text)

    return [{'url': entry['url'],
             'title': "[{}] {} {}".format(entry['type'], entry['namespace'], entry['name']),
             'content': entry['description']}
            for entry in reply.get('results', [])]
diff --git a/searx/engines/searx_engine.py b/searx/engines/searx_engine.py
new file mode 100644
index 000000000..d4c85bdc5
--- /dev/null
+++ b/searx/engines/searx_engine.py
@@ -0,0 +1,57 @@
+"""
+ Searx (all)
+
+ @website https://github.com/asciimoo/searx
+ @provide-api yes (https://asciimoo.github.io/searx/dev/search_api.html)
+
+ @using-api yes
+ @results JSON
+ @stable yes (using api)
+ @parse url, title, content
+"""
+
+from json import loads
+from searx.engines import categories as searx_categories
+
+
# instances to query and the counter request() uses to rotate through them
instance_urls = []
instance_index = 0

# this engine offers every category known to the local instance
categories = searx_categories.keys()
+
+
+# do search-request
def request(query, params):
    """Forward the search as a POST to the next configured instance.

    Instances are picked round-robin from ``instance_urls`` using the
    module-level ``instance_index``, which is advanced on every call.
    Returns ``params``.
    """
    global instance_index

    target = instance_urls[instance_index % len(instance_urls)]
    params['url'] = target
    params['method'] = 'POST'
    instance_index += 1

    form = {'q': query}
    for field in ('pageno', 'language', 'time_range', 'category'):
        form[field] = params[field]
    form['format'] = 'json'
    params['data'] = form

    return params
+
+
+# get response from search-request
def response(resp):
    """Flatten a remote searx JSON reply into one result list.

    Order: regular results, answers, infoboxes, one ``{'suggestion': ...}``
    dict per suggestion and finally a ``{'number_of_results': ...}`` dict.
    """
    reply = loads(resp.text)

    merged = reply['results']
    merged.extend(reply['answers'])
    merged.extend(reply['infoboxes'])
    merged.extend({'suggestion': suggestion} for suggestion in reply['suggestions'])
    merged.append({'number_of_results': reply['number_of_results']})

    return merged
diff --git a/searx/engines/seedpeer.py b/searx/engines/seedpeer.py
new file mode 100644
index 000000000..f9b1f99c8
--- /dev/null
+++ b/searx/engines/seedpeer.py
@@ -0,0 +1,78 @@
+# Seedpeer (Videos, Music, Files)
+#
+# @website https://seedpeer.me
+# @provide-api no (nothing found)
+#
+# @using-api no
+# @results HTML (using search portal)
+# @stable yes (HTML can change)
+# @parse url, title, content, seed, leech, magnetlink
+
+from lxml import html
+from json import loads
+from operator import itemgetter
+from searx.url_utils import quote, urljoin
+from searx.engines.xpath import extract_text
+
+
# site root plus the search-page and torrent-file URL templates
url = 'https://seedpeer.me/'
search_url = ''.join([url, 'search/{search_term}?page={page_no}'])
torrent_file_url = ''.join([url, 'torrent/{torrent_hash}'])

# xpath expressions applied to the result page
script_xpath = '//script[@type="text/javascript"][not(@src)]'
torrent_xpath = '(//table)[2]/tbody/tr'
link_xpath = '(./td)[1]/a/@href'
age_xpath = '(./td)[2]'
size_xpath = '(./td)[3]'
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` to the Seedpeer search page for ``query``.

    The query is percent-quoted into the path; the page number is passed
    through unchanged.  Returns ``params``.
    """
    term = quote(query)
    params['url'] = search_url.format(search_term=term, page_no=params['pageno'])
    return params
+
+
+# get response from search-request
def response(resp):
    """Parse a Seedpeer result page into ``torrent.html`` results.

    Torrent metadata is read from the JSON object embedded in the first
    inline script; age and detail link come from the matching table row.
    Returns an empty list when that JSON cannot be located or decoded,
    otherwise the results sorted by seeders in descending order.
    """
    results = []
    dom = html.fromstring(resp.text)
    result_rows = dom.xpath(torrent_xpath)

    try:
        script_element = dom.xpath(script_xpath)[0]
        # the JSON payload starts at the first opening brace of the script
        json_string = script_element.text[script_element.text.find('{'):]
        torrents_json = loads(json_string)
    except Exception:
        # best effort: an unexpected page layout means "no results", but
        # SystemExit / KeyboardInterrupt must not be swallowed here
        return []

    # parse results
    for torrent_row, torrent_json in zip(result_rows, torrents_json['data']['list']):
        title = torrent_json['name']
        seed = int(torrent_json['seeds'])
        leech = int(torrent_json['peers'])
        size = int(torrent_json['size'])
        torrent_hash = torrent_json['hash']

        torrentfile = torrent_file_url.format(torrent_hash=torrent_hash)
        magnetlink = 'magnet:?xt=urn:btih:{}'.format(torrent_hash)

        age = extract_text(torrent_row.xpath(age_xpath))
        link = torrent_row.xpath(link_xpath)[0]

        href = urljoin(url, link)

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': age,
                        'seed': seed,
                        'leech': leech,
                        'filesize': size,
                        'torrentfile': torrentfile,
                        'magnetlink': magnetlink,
                        'template': 'torrent.html'})

    # return results sorted by seeder
    return sorted(results, key=itemgetter('seed'), reverse=True)
diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py
new file mode 100644
index 000000000..284689bf6
--- /dev/null
+++ b/searx/engines/soundcloud.py
@@ -0,0 +1,111 @@
+"""
+ Soundcloud (Music)
+
+ @website https://soundcloud.com
+ @provide-api yes (https://developers.soundcloud.com/)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, content, publishedDate, embedded
+"""
+
+import re
+from json import loads
+from lxml import html
+from dateutil import parser
+from searx import logger
+from searx.poolrequests import get as http_get
+from searx.url_utils import quote_plus, urlencode
+
+try:
+ from cStringIO import StringIO
+except:
+ from io import StringIO
+
# engine settings
categories = ['music']
paging = True

# API v2 search endpoint
# not sent: user_id, app_version, app_locale
url = 'https://api-v2.soundcloud.com/'
search_url = url + ('search?{query}'
                    '&variant_ids='
                    '&facet=model'
                    '&limit=20'
                    '&offset={offset}'
                    '&linked_partitioning=1'
                    '&client_id={client_id}')  # noqa

# player markup; {uri} receives the quoted track URI
embedded_url = ('<iframe width="100%" height="166" '
                'scrolling="no" frameborder="no" '
                'data-src="https://w.soundcloud.com/player/?url={uri}"></iframe>')

# pattern that captures the client id inside a javascript bundle
cid_re = re.compile(r'client_id:"([^"]*)"', re.I | re.U)
guest_client_id = ''
+
+
def get_client_id():
    """Scrape a guest ``client_id`` from SoundCloud's javascript bundles.

    Loads the SoundCloud start page, then fetches each script whose ``src``
    contains ``/assets/`` and returns the first ``client_id:"..."`` value
    found.  Logs a warning and returns an empty string when none is found.
    """
    landing = http_get("https://soundcloud.com")

    if landing.ok:
        tree = html.fromstring(landing.content)
        # the bundles live below /assets/ (formerly /assets/app/)
        script_tags = tree.xpath("//script[contains(@src, '/assets/')]")
        bundle_urls = []
        for script_tag in script_tags:
            if script_tag is not None:
                bundle_urls.append(script_tag.get('src'))

        for bundle_url in bundle_urls:
            bundle = http_get(bundle_url)
            if not bundle.ok:
                continue
            found = cid_re.search(bundle.content.decode("utf-8"))
            if found is not None and len(found.groups()):
                return found.groups()[0]

    logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!")
    return ""
+
+
def init(engine_settings=None):
    """Fetch a guest client id and store it in ``guest_client_id``.

    ``engine_settings`` is accepted but not used.
    """
    global guest_client_id
    fetched_id = get_client_id()
    guest_client_id = fetched_id
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` for a SoundCloud API search.

    Pages hold 20 entries, so page ``n`` starts at offset ``(n - 1) * 20``.
    The stored ``guest_client_id`` is sent along.  Returns ``params``.
    """
    first_entry = 20 * (params['pageno'] - 1)
    encoded_query = urlencode({'q': query})

    params['url'] = search_url.format(query=encoded_query,
                                      offset=first_entry,
                                      client_id=guest_client_id)
    return params
+
+
+# get response from search-request
def response(resp):
    """Build results from the ``collection`` of a SoundCloud search reply.

    Only entries of kind ``track`` or ``playlist`` are used.  Each result
    carries the permalink, title, last-modified date, an embeddable player
    and the description (an empty string when the API sends none).
    """
    results = []

    search_res = loads(resp.text)

    # parse results
    for result in search_res.get('collection', []):
        if result['kind'] not in ('track', 'playlist'):
            continue

        title = result['title']
        # a null description would otherwise surface as content=None
        content = result['description'] or ''
        publishedDate = parser.parse(result['last_modified'])
        uri = quote_plus(result['uri'])
        embedded = embedded_url.format(uri=uri)

        # append result
        results.append({'url': result['permalink_url'],
                        'title': title,
                        'publishedDate': publishedDate,
                        'embedded': embedded,
                        'content': content})

    # return results
    return results
diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py
new file mode 100644
index 000000000..aed756be3
--- /dev/null
+++ b/searx/engines/spotify.py
@@ -0,0 +1,62 @@
+"""
+ Spotify (Music)
+
+ @website https://spotify.com
+ @provide-api yes (https://developer.spotify.com/web-api/search-item/)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, content, embedded
+"""
+
+from json import loads
+from searx.url_utils import urlencode
+
# engine settings
categories = ['music']
paging = True

# API root and the track search endpoint
url = 'https://api.spotify.com/'
search_url = '{}v1/search?{{query}}&type=track&offset={{offset}}'.format(url)

# player markup; {audioid} receives the track id
embedded_url = ('<iframe data-src="https://embed.spotify.com/?uri=spotify:track:{audioid}"'
                ' width="300" height="80" frameborder="0" allowtransparency="true"></iframe>')
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` for a Spotify track search.

    Page ``n`` is requested with offset ``(n - 1) * 20``.  Returns ``params``.
    """
    first_entry = 20 * (params['pageno'] - 1)
    encoded_query = urlencode({'q': query})
    params['url'] = search_url.format(query=encoded_query, offset=first_entry)
    return params
+
+
+# get response from search-request
def response(resp):
    """Build results from the ``tracks.items`` list of a Spotify reply.

    Only items of type ``track`` are used.  The content line reads
    ``<first artist> - <album> - <track>`` and every result carries an
    embeddable player.
    """
    results = []

    reply = loads(resp.text)

    for item in reply.get('tracks', {}).get('items', {}):
        if item['type'] != 'track':
            continue

        title = item['name']
        track_url = item['external_urls']['spotify']
        first_artist = item['artists'][0]['name']
        album = item['album']['name']
        content = u'{} - {} - {}'.format(first_artist, album, item['name'])
        embedded = embedded_url.format(audioid=item['id'])

        results.append({'url': track_url,
                        'title': title,
                        'embedded': embedded,
                        'content': content})

    return results
diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py
new file mode 100644
index 000000000..25875aa15
--- /dev/null
+++ b/searx/engines/stackoverflow.py
@@ -0,0 +1,57 @@
+"""
+ Stackoverflow (It)
+
+ @website https://stackoverflow.com/
+ @provide-api not clear (https://api.stackexchange.com/docs/advanced-search)
+
+ @using-api no
+ @results HTML
+ @stable no (HTML can change)
+ @parse url, title, content
+"""
+
+from lxml import html
+from searx.engines.xpath import extract_text
+from searx.url_utils import urlencode, urljoin
+
# engine settings
categories = ['it']
paging = True

# site root and the search page template
url = 'https://stackoverflow.com/'
search_url = '{}search?{{query}}&page={{pageno}}'.format(url)

# xpath expressions: one node per hit, its title link and its excerpt
results_xpath = '//div[contains(@class,"question-summary")]'
link_xpath = './/div[@class="result-link"]//a|.//div[@class="summary"]//h3//a'
content_xpath = './/div[@class="excerpt"]'
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` to the Stack Overflow search page.

    The page number is passed through unchanged.  Returns ``params``.
    """
    encoded_query = urlencode({'q': query})
    params['url'] = search_url.format(query=encoded_query, pageno=params['pageno'])
    return params
+
+
+# get response from search-request
def response(resp):
    """Extract url, title and excerpt from a Stack Overflow search page.

    Result nodes that contain no title link are skipped instead of
    aborting the whole page.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        links = result.xpath(link_xpath)
        if not links:
            # no title link in this node: nothing to point the result at
            continue
        link = links[0]
        href = urljoin(url, link.attrib.get('href'))
        title = extract_text(link)
        content = extract_text(result.xpath(content_xpath))

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': content})

    # return results
    return results
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
new file mode 100644
index 000000000..76567396f
--- /dev/null
+++ b/searx/engines/startpage.py
@@ -0,0 +1,131 @@
+# Startpage (Web)
+#
+# @website https://startpage.com
+# @provide-api no (nothing found)
+#
+# @using-api no
+# @results HTML
+# @stable no (HTML can change)
+# @parse url, title, content
+#
+# @todo paging
+
+from lxml import html
+from dateutil import parser
+from datetime import datetime, timedelta
+import re
+from searx.engines.xpath import extract_text
+from searx.languages import language_codes
+from searx.utils import eval_xpath
+
# engine settings
categories = ['general']
# there is a mechanism to block "bot" searches
# (probably the parameter qid); working around it would require
# storing qids between multiple search calls

paging = True
language_support = True

# site root and the search form target
base_url = 'https://startpage.com/'
search_url = '{}do/search'.format(base_url)

# xpath expressions
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# not ads: div[@class="result"] are the direct children of div[@id="results"]
results_xpath = '//div[@class="w-gl__result"]'
link_xpath = './/a[@class="w-gl__result-title"]'
content_xpath = './/p[@class="w-gl__description"]'
+
+
+# do search-request
def request(query, params):
    """Prepare the POST form for a Startpage web search.

    When a language other than ``all`` is requested, its name is looked
    up in ``language_codes`` (falling back to ``english``) and sent as
    both ``language`` and ``lui``.  Returns ``params``.
    """
    params['url'] = search_url
    params['method'] = 'POST'
    form = {
        'query': query,
        'page': params['pageno'],
        'cat': 'web',
        'cmd': 'process_search',
        'engine0': 'v1all',
    }
    params['data'] = form

    if params['language'] != 'all':
        # the last matching entry wins, exactly like a loop without break
        names = [lang for lc, _, _, lang in language_codes if lc == params['language']]
        language = names[-1] if names else 'english'
        form['language'] = language
        form['lui'] = language

    return params
+
+
+# get response from search-request
def response(resp):
    """Extract web results from a Startpage result page.

    Google ad redirects and links back into Startpage's own search are
    dropped.  A leading ``"2 Sep 2014 ... "`` or ``"5 days ago ... "`` in
    the description is cut off and turned into ``publishedDate``.
    """
    results = []

    dom = html.fromstring(resp.text)

    for node in eval_xpath(dom, results_xpath):
        anchors = eval_xpath(node, link_xpath)
        if not anchors:
            continue
        anchor = anchors[0]
        url = anchor.attrib.get('href')

        # block google-ad urls
        if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
            continue

        # block startpage search urls
        if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
            continue

        title = extract_text(anchor)

        snippet = eval_xpath(node, content_xpath)
        content = extract_text(snippet) if snippet else ''

        published_date = None

        # description starts with an absolute date such as "2 Sep 2014 ... "
        if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
            text_start = content.find('...') + 4
            date_text = content[0:text_start - 5]
            published_date = parser.parse(date_text, dayfirst=True)
            content = content[text_start:]

        # description starts with a relative date such as "5 days ago ... "
        elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
            text_start = content.find('...') + 4
            date_text = content[0:text_start - 5]
            days_ago = int(re.match(r'\d+', date_text).group())
            published_date = datetime.now() - timedelta(days=days_ago)
            content = content[text_start:]

        entry = {'url': url,
                 'title': title,
                 'content': content}
        if published_date:
            entry['publishedDate'] = published_date
        results.append(entry)

    return results
diff --git a/searx/engines/tokyotoshokan.py b/searx/engines/tokyotoshokan.py
new file mode 100644
index 000000000..773212043
--- /dev/null
+++ b/searx/engines/tokyotoshokan.py
@@ -0,0 +1,99 @@
+"""
+ Tokyo Toshokan (A BitTorrent Library for Japanese Media)
+
+ @website https://www.tokyotosho.info/
+ @provide-api no
+ @using-api no
+ @results HTML
+ @stable no (HTML can change)
+ @parse url, title, publishedDate, seed, leech,
+ filesize, magnetlink, content
+"""
+
+import re
+from lxml import html
+from searx.engines.xpath import extract_text
+from datetime import datetime
+from searx.url_utils import urlencode
+from searx.utils import get_torrent_size, int_or_zero
+
# engine settings
categories = ['files', 'videos', 'music']
paging = True

# site root and the search page template
base_url = 'https://www.tokyotosho.info/'
search_url = '{}search.php?{{query}}'.format(base_url)
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` to the Tokyo Toshokan search page.

    ``page`` and ``terms`` are sent as query-string arguments.
    Returns ``params``.
    """
    query_string = urlencode({'page': params['pageno'], 'terms': query})
    params['url'] = search_url.format(query=query_string)
    return params
+
+
+# get response from search-request
def response(resp):
    """Parse the Tokyo Toshokan listing table into ``torrent.html`` results.

    Every torrent spans two table rows: the first holds the links (an
    optional magnet link followed by the detail link), the second holds
    the ``Size:`` / ``Date:`` / ``Comment:`` description and the
    seed/leech statistics.  Returns an empty list when no rows are found
    or the row count is odd.
    """
    results = []

    dom = html.fromstring(resp.text)
    rows = dom.xpath('//table[@class="listing"]//tr[contains(@class, "category_0")]')

    # check if there are no results or page layout was changed so we cannot parse it
    # currently there are two rows for each result, so total count must be even
    if len(rows) == 0 or len(rows) % 2 != 0:
        return []

    # regular expression for parsing torrent size strings
    size_re = re.compile(r'Size:\s*([\d.]+)(TB|GB|MB|B)', re.IGNORECASE)

    # processing the results, two rows at a time
    for i in range(0, len(rows), 2):
        # parse the first row
        name_row = rows[i]

        links = name_row.xpath('./td[@class="desc-top"]/a')
        params = {
            'template': 'torrent.html',
            'url': links[-1].attrib.get('href'),
            'title': extract_text(links[-1])
        }
        # torrents without a magnet link have not been seen yet, but the
        # first link is only trusted when it really is a magnet URI
        if len(links) == 2:
            magnet = links[0].attrib.get('href')
            if magnet and magnet.startswith('magnet'):
                params['magnetlink'] = magnet

        # no more info in the first row, start parsing the second one
        info_row = rows[i + 1]
        desc = extract_text(info_row.xpath('./td[@class="desc-bot"]')[0])
        for item in desc.split('|'):
            item = item.strip()
            if item.startswith('Size:'):
                # e.g. ('1.228', 'GB'); an unknown unit simply yields no size
                size_match = size_re.match(item)
                if size_match:
                    try:
                        params['filesize'] = get_torrent_size(*size_match.groups())
                    except Exception:
                        # best effort: a size that cannot be converted is omitted
                        pass
            elif item.startswith('Date:'):
                try:
                    # Date: 2016-02-21 21:44 UTC
                    params['publishedDate'] = datetime.strptime(item, 'Date: %Y-%m-%d %H:%M UTC')
                except ValueError:
                    # unexpected date format: leave the date out
                    pass
            elif item.startswith('Comment:'):
                params['content'] = item
        stats = info_row.xpath('./td[@class="stats"]/span')
        # has the layout not changed yet?
        if len(stats) == 3:
            params['seed'] = int_or_zero(extract_text(stats[0]))
            params['leech'] = int_or_zero(extract_text(stats[1]))

        results.append(params)

    return results
diff --git a/searx/engines/torrentz.py b/searx/engines/torrentz.py
new file mode 100644
index 000000000..fd4164a66
--- /dev/null
+++ b/searx/engines/torrentz.py
@@ -0,0 +1,98 @@
+"""
+ Torrentz2.eu (BitTorrent meta-search engine)
+
+ @website https://torrentz2.eu/
+ @provide-api no
+
+ @using-api no
+ @results HTML
+ @stable no (HTML can change, although unlikely,
+ see https://torrentz.eu/torrentz.btsearch)
+ @parse url, title, publishedDate, seed, leech, filesize, magnetlink
+"""
+
+import re
+from lxml import html
+from datetime import datetime
+from searx.engines.xpath import extract_text
+from searx.url_utils import urlencode
+from searx.utils import get_torrent_size
+
# engine settings
categories = ['files', 'videos', 'music']
paging = True

# site root and the search page template
# example: https://torrentz2.eu/search?f=EXAMPLE&p=6
base_url = 'https://torrentz2.eu/'
search_url = '{}search?{{query}}'.format(base_url)
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` to the Torrentz2 search page.

    The remote page index ``p`` is ``params['pageno'] - 1``.
    Returns ``params``.
    """
    remote_page = params['pageno'] - 1
    query_string = urlencode({'f': query, 'p': remote_page})
    params['url'] = search_url.format(query=query_string)
    return params
+
+
+# get response from search-request
def response(resp):
    """Parse a Torrentz2 result page into ``torrent.html`` results.

    Rows without exactly one link in their ``dt`` cell are skipped.
    Seed/leech counts default to 0; size, magnet link and publication
    date are added only when they can be extracted.
    """
    results = []

    dom = html.fromstring(resp.text)

    # a link made up of exactly 40 hex digits is taken to be a SHA1 info-hash
    sha1_re = re.compile(r'[0-9a-fA-F]{40}\Z')

    for result in dom.xpath('//div[@class="results"]/dl'):
        name_cell = result.xpath('./dt')[0]
        title = extract_text(name_cell)

        # skip rows that do not contain a link to a torrent
        links = name_cell.xpath('./a')
        if len(links) != 1:
            continue

        # extract url and remove a slash in the beginning
        link = links[0].attrib.get('href').lstrip('/')

        seed = 0
        leech = 0
        try:
            seed = int(result.xpath('./dd/span[4]/text()')[0].replace(',', ''))
            leech = int(result.xpath('./dd/span[5]/text()')[0].replace(',', ''))
        except (IndexError, ValueError):
            # missing or non-numeric counters keep their default
            pass

        params = {
            'url': base_url + link,
            'title': title,
            'seed': seed,
            'leech': leech,
            'template': 'torrent.html'
        }

        # let's try to calculate the torrent size
        try:
            filesize_info = result.xpath('./dd/span[3]/text()')[0]
            filesize, filesize_multiplier = filesize_info.split()
            params['filesize'] = get_torrent_size(filesize, filesize_multiplier)
        except Exception:
            # best effort: leave the size out when it cannot be parsed
            pass

        # does our link contain a valid SHA1 sum?
        if sha1_re.match(link):
            # add a magnet link to the result
            params['magnetlink'] = 'magnet:?xt=urn:btih:' + link

        # extract and convert creation date
        try:
            date_ts = result.xpath('./dd/span[2]')[0].attrib.get('title')
            params['publishedDate'] = datetime.fromtimestamp(float(date_ts))
        except Exception:
            # best effort: leave the date out when it cannot be parsed
            pass

        results.append(params)

    return results
diff --git a/searx/engines/translated.py b/searx/engines/translated.py
new file mode 100644
index 000000000..5c7b17033
--- /dev/null
+++ b/searx/engines/translated.py
@@ -0,0 +1,68 @@
+"""
+ MyMemory Translated
+
+ @website https://mymemory.translated.net/
+ @provide-api yes (https://mymemory.translated.net/doc/spec.php)
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, content
+"""
+import re
+from sys import version_info
+from searx.utils import is_valid_lang
+
+if version_info[0] == 3:
+ unicode = str
+
# engine settings
categories = ['general']
weight = 100
api_key = ''

# API endpoint and the matching human-readable page
url = u'http://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}{key}'
web_url = u'http://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}'

# query shape: "<from>-<to> <text>" with at least two characters of text
parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) (.{2,})$', re.I)
+
+
def request(query, params):
    """Prepare a MyMemory translation request for ``<from>-<to> <text>``.

    ``params`` is returned untouched (no URL set) when the query does not
    match that shape or either language is rejected by ``is_valid_lang``.
    Otherwise the URL is set and the text and both language entries are
    stored in ``params`` for ``response``.
    """
    parsed = parser_re.match(unicode(query, 'utf8'))
    if not parsed:
        return params

    source, target, query = parsed.groups()

    source = is_valid_lang(source)
    target = is_valid_lang(target)

    if not (source and target):
        return params

    # the key argument is only appended when a key is configured
    key_form = '&key=' + api_key if api_key else ''

    params['url'] = url.format(from_lang=source[1],
                               to_lang=target[1],
                               query=query,
                               key=key_form)
    params['query'] = query
    params['from_lang'] = source
    params['to_lang'] = target

    return params
+
+
def response(resp):
    """Wrap the translated text of a MyMemory reply in a single result.

    Languages and text are read from ``resp.search_params`` as stored by
    ``request``; the result links to the matching MyMemory web page.
    """
    search = resp.search_params
    source = search['from_lang']
    target = search['to_lang']
    text = search['query']

    link = web_url.format(from_lang=source[2], to_lang=target[2], query=text)
    title = '[{0}-{1}] {2}'.format(source[1], target[1], text)
    translation = resp.json()['responseData']['translatedText']

    return [{'url': link, 'title': title, 'content': translation}]
diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py
new file mode 100644
index 000000000..d2a8d2088
--- /dev/null
+++ b/searx/engines/twitter.py
@@ -0,0 +1,87 @@
+"""
+ Twitter (Social media)
+
+ @website https://twitter.com/
+ @provide-api yes (https://dev.twitter.com/docs/using-search)
+
+ @using-api no
+ @results HTML (using search portal)
+ @stable no (HTML can change)
+ @parse url, title, content
+
+ @todo publishedDate
+"""
+
+from lxml import html
+from datetime import datetime
+from searx.engines.xpath import extract_text
+from searx.url_utils import urlencode, urljoin
+
# engine settings
categories = ['social media']
language_support = True

# site root and the search page prefix
base_url = 'https://twitter.com/'
search_url = '{}search?'.format(base_url)

# xpath expressions: one node per tweet and the parts read from it
results_xpath = '//li[@data-item-type="tweet"]'
avatar_xpath = './/img[contains(@class, "avatar")]/@src'
link_xpath = './/small[@class="time"]//a'
title_xpath = './/span[contains(@class, "username")]'
content_xpath = './/p[contains(@class, "tweet-text")]'
timestamp_xpath = './/span[contains(@class,"_timestamp")]'
+
+
+# do search-request
def request(query, params):
    """Set the Twitter search URL and the ``lang`` cookie.

    The cookie holds the part of ``params['language']`` before the first
    ``-``, or ``en`` when the language is ``all``.  Returns ``params``.
    """
    params['url'] = search_url + urlencode({'q': query})

    requested = params['language']
    params['cookies']['lang'] = 'en' if requested == 'all' else requested.split('-')[0]

    return params
+
+
+# get response from search-request
def response(resp):
    """Extract tweets from a Twitter search page.

    Tweets lacking a link, text or avatar are skipped.  ``publishedDate``
    is added only when the tweet carries a timestamp element.
    """
    results = []

    dom = html.fromstring(resp.text)

    for tweet in dom.xpath(results_xpath):
        try:
            link = tweet.xpath(link_xpath)[0]
            content = extract_text(tweet.xpath(content_xpath)[0])
            # request the small avatar variant
            img_src = tweet.xpath(avatar_xpath)[0].replace('_bigger', '_normal')
        except Exception:
            continue

        entry = {'url': urljoin(base_url, link.attrib.get('href')),
                 'title': extract_text(tweet.xpath(title_xpath)),
                 'content': content,
                 'img_src': img_src}

        stamps = tweet.xpath(timestamp_xpath)
        if len(stamps) > 0:
            timestamp = float(stamps[0].attrib.get('data-time'))
            entry['publishedDate'] = datetime.fromtimestamp(timestamp, None)

        results.append(entry)

    return results
diff --git a/searx/engines/unsplash.py b/searx/engines/unsplash.py
new file mode 100644
index 000000000..2e8d6fdfc
--- /dev/null
+++ b/searx/engines/unsplash.py
@@ -0,0 +1,52 @@
+"""
+ Unsplash
+
+ @website https://unsplash.com
+ @provide-api yes (https://unsplash.com/developers)
+
+ @using-api no
+ @results JSON (using search portal's infiniscroll API)
+ @stable no (JSON format could change any time)
+ @parse url, title, img_src, thumbnail_src
+"""
+
+from searx.url_utils import urlencode, urlparse, urlunparse, parse_qsl
+from json import loads
+
# engine settings
categories = ['images']
paging = True
page_size = 20

# site root and the photo search endpoint prefix
url = 'https://unsplash.com/'
search_url = '{}napi/search/photos?'.format(url)
+
+
def clean_url(url):
    """Return ``url`` with the ``ixid`` and ``s`` query arguments removed.

    All other parts of the URL, including the remaining query arguments
    and the fragment, are kept.
    """
    parts = urlparse(url)
    dropped = ['ixid', 's']
    kept = []
    for name, value in parse_qsl(parts.query):
        if name not in dropped:
            kept.append((name, value))

    return urlunparse(parts._replace(query=urlencode(kept)))
+
+
def request(query, params):
    """Set ``params['url']`` for an Unsplash photo search.

    Sends the query, the page number and ``page_size`` as ``per_page``.
    Returns ``params``.
    """
    arguments = {'query': query, 'page': params['pageno'], 'per_page': page_size}
    params['url'] = search_url + urlencode(arguments)
    return params
+
+
def response(resp):
    """Build ``images.html`` results from an Unsplash search reply.

    Page, thumbnail and raw image URLs are passed through ``clean_url``.
    A reply without ``results`` yields an empty list.
    """
    json_data = loads(resp.text)

    if 'results' not in json_data:
        return []

    return [{'template': 'images.html',
             'url': clean_url(photo['links']['html']),
             'thumbnail_src': clean_url(photo['urls']['thumb']),
             'img_src': clean_url(photo['urls']['raw']),
             'title': photo['description'],
             'content': ''}
            for photo in json_data['results']]
diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py
new file mode 100644
index 000000000..a92271019
--- /dev/null
+++ b/searx/engines/vimeo.py
@@ -0,0 +1,67 @@
+# Vimeo (Videos)
+#
+# @website https://vimeo.com/
+# @provide-api yes (http://developer.vimeo.com/api),
+# they have a maximum count of queries/hour
+#
+# @using-api no (TODO, rewrite to api)
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, publishedDate, thumbnail, embedded
+#
+# @todo rewrite to api
+# @todo set content-parameter with correct data
+
+from json import loads
+from dateutil import parser
+from searx.url_utils import urlencode
+
# engine dependent config
categories = ['videos']
paging = True

# search-url
# base_url already ends with a slash, so the search path must not start
# with one (otherwise the request goes to https://vimeo.com//search/...)
base_url = 'https://vimeo.com/'
search_url = base_url + 'search/page:{pageno}?{query}'

# player markup; {videoid} receives the numeric video id
embedded_url = '<iframe data-src="https://player.vimeo.com/video/{videoid}" ' +\
    'width="540" height="304" frameborder="0" ' +\
    'webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>'
+
+
+# do search-request
def request(query, params):
    """Set ``params['url']`` to the Vimeo search page.

    The page number is passed through unchanged.  Returns ``params``.
    """
    encoded_query = urlencode({'q': query})
    params['url'] = search_url.format(pageno=params['pageno'], query=encoded_query)
    return params
+
+
+# get response from search-request
def response(resp):
    """Extract videos from the JSON object embedded in a Vimeo search page.

    The object is located by its leading ``{"filtered"`` and the next
    ``;`` followed by a newline.  Returns an empty list when either
    marker is missing.
    """
    results = []
    data_start_pos = resp.text.find('{"filtered"')
    if data_start_pos < 0:
        # no embedded result object on this page
        return []
    data_end_pos = resp.text.find(';\n', data_start_pos + 1)
    if data_end_pos < 0:
        # the object is not terminated as expected; do not guess its end
        return []
    data = loads(resp.text[data_start_pos:data_end_pos])

    # parse results
    for result in data['filtered']['data']:
        # the payload sits under a key named after the entry's type
        result = result[result['type']]
        videoid = result['uri'].split('/')[-1]
        url = base_url + videoid
        title = result['name']
        # the last entry of the size list is used as thumbnail
        thumbnail = result['pictures']['sizes'][-1]['link']
        publishedDate = parser.parse(result['created_time'])
        embedded = embedded_url.format(videoid=videoid)

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': '',
                        'template': 'videos.html',
                        'publishedDate': publishedDate,
                        'embedded': embedded,
                        'thumbnail': thumbnail})

    # return results
    return results
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
new file mode 100644
index 000000000..e913b3915
--- /dev/null
+++ b/searx/engines/wikidata.py
@@ -0,0 +1,500 @@
+# -*- coding: utf-8 -*-
+"""
+ Wikidata
+
+ @website https://wikidata.org
+ @provide-api yes (https://wikidata.org/w/api.php)
+
+ @using-api partially (most things require scraping)
+ @results JSON, HTML
+ @stable no (html can change)
+ @parse url, infobox
+"""
+
+from searx import logger
+from searx.poolrequests import get
+from searx.engines.xpath import extract_text
+from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
+from searx.url_utils import urlencode
+from searx.utils import match_language, eval_xpath
+
+from json import loads
+from lxml.html import fromstring
+from lxml import etree
+
+# child logger for this engine
+logger = logger.getChild('wikidata')
+# number of search hits for which response() fetches a detail page
+result_count = 1
+
+# urls
+wikidata_host = 'https://www.wikidata.org'
+# search page; {query} is the urlencoded 'search' parameter (see request())
+url_search = wikidata_host \
+ + '/w/index.php?{query}&ns0=1'
+
+wikidata_api = wikidata_host + '/w/api.php'
+# detail request; {query} carries the urlencoded 'page' and 'uselang' parameters (see response())
+url_detail = wikidata_api\
+ + '?action=parse&format=json&{query}'\
+ + '&redirects=1&prop=text%7Cdisplaytitle%7Cparsewarnings'\
+ + '&disableeditsection=1&preview=1&sectionpreview=1&disabletoc=1&utf8=1&formatversion=2'
+
+# templates filled with str.replace() in get_geolink() / add_image()
+url_map = 'https://www.openstreetmap.org/'\
+ + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
+url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400'
+
+# xpaths
+# every div carrying an id attribute (filtered to 'P...' ids in get_id_cache())
+div_ids_xpath = '//div[@id]'
+# hrefs of the hits on the search result page
+wikidata_ids_xpath = '//ul[@class="mw-search-results"]/li//a/@href'
+title_xpath = '//*[contains(@class,"wikibase-title-label")]'
+description_xpath = '//div[contains(@class,"wikibase-entitytermsview-heading-description")]'
+label_xpath = './/div[contains(@class,"wikibase-statementgroupview-property-label")]/a'
+url_xpath = './/a[contains(@class,"external free") or contains(@class, "wb-external-id")]'
+# {wikiid} is substituted in get_wikilink()
+wikilink_xpath = './/ul[contains(@class,"wikibase-sitelinklistview-listview")]'\
+ + '/li[contains(@data-wb-siteid,"{wikiid}")]//a/@href'
+property_row_xpath = './/div[contains(@class,"wikibase-statementview")]'
+preferred_rank_xpath = './/span[contains(@class,"wikibase-rankselector-preferred")]'
+value_xpath = './/div[contains(@class,"wikibase-statementview-mainsnak")]'\
+ + '/*/div[contains(@class,"wikibase-snakview-value")]'
+# elements matching this are removed from title and body in getDetail()
+language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator")]'
+# elements matching this are removed from date attributes in add_attribute()
+calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
+media_xpath = value_xpath + '//div[contains(@class,"commons-media-caption")]//a'
+
+
+def get_id_cache(result):
+    """Map the id of every div in *result* whose id starts with 'P' to that div element."""
+    return {
+        element.get('id'): element
+        for element in eval_xpath(result, div_ids_xpath)
+        if element.get('id').startswith('P')
+    }
+
+
+def request(query, params):
+    """Store the Wikidata search-page URL for *query* in params['url'] and return params."""
+    search_arguments = urlencode({'search': query})
+    params['url'] = url_search.format(query=search_arguments)
+    return params
+
+
+def response(resp):
+    """Turn the Wikidata search page in *resp* into results.
+
+    For the first ``result_count`` hits the detail page is fetched through
+    the API and handed to getDetail(); the collected results are returned.
+    """
+    htmlparser = etree.HTMLParser()
+    html = fromstring(resp.content.decode("utf-8"), parser=htmlparser)
+    search_results = eval_xpath(html, wikidata_ids_xpath)
+
+    locale = resp.search_params['language']
+    if locale.split('-')[0] == 'all':
+        language = 'en'
+    else:
+        language = match_language(locale, supported_languages, language_aliases).split('-')[0]
+
+    results = []
+    # TODO: make requests asynchronous to avoid timeout when result_count > 1
+    for search_result in search_results[:result_count]:
+        # the entity id is the last path segment of the hit's href
+        wikidata_id = search_result.split('/')[-1]
+        detail_query = urlencode({'page': wikidata_id, 'uselang': language})
+        htmlresponse = get(url_detail.format(query=detail_query))
+        jsonresponse = loads(htmlresponse.content.decode("utf-8"))
+        results.extend(getDetail(jsonresponse, wikidata_id, language, locale, htmlparser))
+
+    return results
+
+
+# Properties turned into links after the wiki links, in display order:
+# (property_id, default_label, url_prefix, link_type)
+_DETAIL_URL_PROPERTIES = (
+    ('P625', 'OpenStreetMap', None, 'geo'),
+    # musicbrainz
+    ('P434', 'MusicBrainz', 'http://musicbrainz.org/artist/', None),
+    ('P435', 'MusicBrainz', 'http://musicbrainz.org/work/', None),
+    ('P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/', None),
+    ('P966', 'MusicBrainz', 'http://musicbrainz.org/label/', None),
+    # IMDb
+    ('P345', 'IMDb', 'https://www.imdb.com/', 'imdb'),
+    # source code repository
+    ('P1324', None, None, None),
+    # blog
+    ('P1581', None, None, None),
+    # social media links
+    ('P2397', 'YouTube', 'https://www.youtube.com/channel/', None),
+    ('P1651', 'YouTube', 'https://www.youtube.com/watch?v=', None),
+    ('P2002', 'Twitter', 'https://twitter.com/', None),
+    ('P2013', 'Facebook', 'https://facebook.com/', None),
+    ('P2003', 'Instagram', 'https://instagram.com/', None),
+)
+
+# Infobox rows, in display order: (property_id, date, trim)
+_DETAIL_ATTRIBUTE_PROPERTIES = (
+    # DATES
+    ('P571', True, False),    # inception date
+    ('P576', True, False),    # dissolution date
+    ('P580', True, False),    # start date
+    ('P582', True, False),    # end date
+    ('P569', True, False),    # date of birth
+    ('P570', True, False),    # date of death
+    ('P619', True, False),    # date of spacecraft launch
+    ('P620', True, False),    # date of spacecraft landing
+
+    ('P27', False, False),    # nationality
+    ('P495', False, False),   # country of origin
+    ('P17', False, False),    # country
+    # headquarters location; was 'Q180', an item id that can never be found
+    # because get_id_cache() only indexes ids starting with 'P'
+    ('P159', False, False),
+
+    # PLACES
+    ('P36', False, True),     # capital
+    ('P35', False, True),     # head of state
+    ('P6', False, True),      # head of government
+    ('P122', False, False),   # type of government
+    ('P37', False, False),    # official language
+    ('P1082', False, True),   # population
+    ('P2046', False, False),  # area
+    ('P38', False, True),     # currency
+    ('P2048', False, False),  # height (building)
+
+    # MEDIA
+    ('P400', False, False),   # platform (videogames)
+    ('P50', False, False),    # author
+    ('P170', False, False),   # creator
+    ('P57', False, False),    # director
+    ('P175', False, False),   # performer
+    ('P178', False, False),   # developer
+    ('P162', False, False),   # producer
+    ('P176', False, False),   # manufacturer
+    ('P58', False, False),    # screenwriter
+    ('P272', False, False),   # production company
+    ('P264', False, False),   # record label
+    ('P123', False, False),   # publisher
+    ('P449', False, False),   # original network
+    ('P750', False, False),   # distributor
+    ('P86', False, False),    # composer
+    ('P577', True, False),    # publication date
+    ('P136', False, False),   # genre
+    ('P364', False, False),   # original language
+    # ISBN-13; was 'Q33057', an item id (same problem as 'Q180' above)
+    ('P212', False, False),
+    ('P275', False, False),   # software license
+    ('P277', False, False),   # programming language
+    ('P348', False, True),    # version
+    ('P840', False, False),   # narrative location
+
+    # LANGUAGES
+    ('P1098', False, False),  # number of speakers
+    ('P282', False, False),   # writing system
+    ('P1018', False, False),  # regulatory body
+    ('P218', False, False),   # language code
+
+    # OTHER
+    ('P169', False, True),    # ceo
+    ('P112', False, False),   # founder
+    ('P1454', False, False),  # legal form (company/organization)
+    ('P137', False, False),   # operator
+    ('P1029', False, False),  # crew members (tripulation)
+    ('P225', False, False),   # taxon
+    ('P274', False, False),   # chemical formula
+    ('P1346', False, False),  # winner (sports/contests)
+    ('P1120', False, False),  # number of deaths
+    ('P498', False, False),   # currency code
+)
+
+
+def getDetail(jsonresponse, wikidata_id, language, locale, htmlparser):
+    """Build the results for one Wikidata entity from its parsed API response.
+
+    :param jsonresponse: decoded JSON of the ``action=parse`` API call
+    :param wikidata_id: entity id, used for the final 'Wikidata' link
+    :param language: language code used for labels and wiki site ids
+    :param locale: accepted for interface compatibility, not used here
+    :param htmlparser: lxml parser used for the HTML fragments
+    :returns: a list of results.  It is empty when title or text is missing.
+        Otherwise it holds any official-website links (P856) followed by
+        either one plain result (no attributes, exactly two urls, empty
+        description) or one infobox.
+    """
+    results = []
+    urls = []
+    attributes = []
+
+    title = jsonresponse.get('parse', {}).get('displaytitle', {})
+    result = jsonresponse.get('parse', {}).get('text', {})
+
+    if not title or not result:
+        return results
+
+    # drop the language-fallback indicators before extracting any text
+    title = fromstring(title, parser=htmlparser)
+    for elem in eval_xpath(title, language_fallback_xpath):
+        elem.getparent().remove(elem)
+    title = extract_text(eval_xpath(title, title_xpath))
+
+    result = fromstring(result, parser=htmlparser)
+    for elem in eval_xpath(result, language_fallback_xpath):
+        elem.getparent().remove(elem)
+
+    description = extract_text(eval_xpath(result, description_xpath))
+
+    id_cache = get_id_cache(result)
+
+    # URLS
+
+    # official website (also added to the plain results)
+    add_url(urls, result, id_cache, 'P856', results=results)
+
+    # wikipedia
+    wikipedia_link = get_wikilink(result, language + 'wiki')
+    if wikipedia_link:
+        urls.append({'title': 'Wikipedia (' + language + ')',
+                     'url': wikipedia_link})
+
+    if language != 'en':
+        wikipedia_en_link = get_wikilink(result, 'enwiki')
+        if wikipedia_en_link:
+            urls.append({'title': 'Wikipedia (en)',
+                         'url': wikipedia_en_link})
+
+    # TODO: get_wiki_firstlanguage when no wikipedia link was found
+
+    # more wikis
+    add_url(urls, result, id_cache, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage')
+    add_url(urls, result, id_cache, default_label='Wikiquote (' + language + ')', link_type=language + 'wikiquote')
+    add_url(urls, result, id_cache, default_label='Wikimedia Commons', link_type='commonswiki')
+
+    for property_id, default_label, url_prefix, link_type in _DETAIL_URL_PROPERTIES:
+        add_url(urls, result, id_cache, property_id, default_label, url_prefix, link_type=link_type)
+
+    urls.append({'title': 'Wikidata',
+                 'url': 'https://www.wikidata.org/wiki/'
+                 + wikidata_id + '?uselang=' + language})
+
+    # INFOBOX ATTRIBUTES (ROWS)
+    for property_id, is_date, trim in _DETAIL_ATTRIBUTE_PROPERTIES:
+        add_attribute(attributes, id_cache, property_id, date=is_date, trim=trim)
+
+    image = add_image(id_cache)
+
+    if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
+        # nothing worth an infobox: emit a plain result for the first url
+        results.append({
+            'url': urls[0]['url'],
+            'title': title,
+            'content': description
+        })
+    else:
+        results.append({
+            'infobox': title,
+            'id': wikipedia_link,
+            'content': description,
+            'img_src': image,
+            'attributes': attributes,
+            'urls': urls
+        })
+
+    return results
+
+
+# only returns first match
+def add_image(id_cache):
+    """Return the image URL for the first usable image-like property, or None.
+
+    Properties are tried in the order listed below.  A property that is
+    present but has no media link is skipped; previously it raised
+    IndexError.
+    """
+    # P15: route map, P242: locator map, P154: logo, P18: image, P41: flag, P2716: collage, P2910: icon
+    property_ids = ['P15', 'P242', 'P154', 'P18', 'P41', 'P2716', 'P2910']
+
+    for property_id in property_ids:
+        image = id_cache.get(property_id, None)
+        if image is None:
+            continue
+        image_name = eval_xpath(image, media_xpath)
+        if not image_name:
+            # property present but without a media link: try the next one
+            continue
+        return url_image.replace('{filename}', extract_text(image_name[0]))
+
+    return None
+
+
+# setting trim will only return preferred-rank rows OR, if there are none, the first row
+def add_attribute(attributes, id_cache, property_id, default_label=None, date=False, trim=False):
+    """Append a {'label', 'value'} row for *property_id* to *attributes*.
+
+    Nothing is appended when the property is not in *id_cache*.
+
+    :param default_label: label to use instead of the one found in the page
+    :param date: remove calendar-name elements from the property; implies trim
+    :param trim: keep only preferred-rank rows, or the first row with a value
+        when no row is preferred; without it all distinct values are joined
+        with ', '
+    """
+    attribute = id_cache.get(property_id, None)
+    if attribute is None:
+        return
+
+    if default_label:
+        label = default_label
+    else:
+        label = extract_text(eval_xpath(attribute, label_xpath))
+        # slicing instead of label[0] keeps an empty label from raising
+        label = label[:1].upper() + label[1:]
+
+    if date:
+        trim = True
+        # remove calendar name
+        for calendar in eval_xpath(attribute, calendar_name_xpath):
+            calendar.getparent().remove(calendar)
+
+    values = []
+    first_value = None
+    for row in eval_xpath(attribute, property_row_xpath):
+        # once a fallback value exists, trimmed output only looks at preferred rows
+        if trim and first_value and not eval_xpath(row, preferred_rank_xpath):
+            continue
+
+        value = eval_xpath(row, value_xpath)
+        if not value:
+            continue
+        value = extract_text(value)
+
+        if trim and not first_value:
+            # save first value in case no ranked row is found
+            first_value = value
+            # a preferred first row must also be listed, otherwise it is
+            # lost as soon as a later preferred row is found
+            if not eval_xpath(row, preferred_rank_xpath):
+                continue
+
+        # to avoid duplicate values
+        if value not in values:
+            values.append(value)
+
+    if trim and not values:
+        attributes.append({'label': label,
+                           'value': first_value})
+    else:
+        attributes.append({'label': label,
+                           'value': ', '.join(values)})
+
+
+# requires property_id unless it's a wiki link (defined in link_type)
+def add_url(urls, result, id_cache, property_id=None, default_label=None, url_prefix=None, results=None,
+            link_type=None):
+    """Append {'title', 'url'} entries for one property or wiki site link.
+
+    :param urls: list that receives every link found
+    :param result: parsed entity page, used for wiki site links
+    :param id_cache: property id -> element mapping from get_id_cache()
+    :param property_id: property to read; not needed for wiki site links
+    :param default_label: title to use; otherwise the property label from the page
+    :param url_prefix: prepended to each extracted link text
+    :param results: optional second list that receives a copy of every entry
+    :param link_type: a site id containing 'wiki', or 'geo' / 'imdb' for the
+        dedicated link builders
+    """
+    links = []
+    # always bound, so the append loop below cannot hit an undefined name
+    label = default_label
+
+    # wiki links don't have property in wikidata page
+    if link_type and 'wiki' in link_type:
+        links.append(get_wikilink(result, link_type))
+    else:
+        dom_element = id_cache.get(property_id, None)
+        if dom_element is not None:
+            if not label:
+                label = extract_text(eval_xpath(dom_element, label_xpath))
+                # slicing instead of label[0] keeps an empty label from raising
+                label = label[:1].upper() + label[1:]
+
+            if link_type == 'geo':
+                links.append(get_geolink(dom_element))
+            elif link_type == 'imdb':
+                links.append(get_imdblink(dom_element, url_prefix))
+            else:
+                for link in eval_xpath(dom_element, url_xpath):
+                    if link is not None:
+                        link = extract_text(link)
+                        if url_prefix:
+                            link = url_prefix + link
+                        links.append(link)
+
+    # append urls
+    for url in links:
+        if url is not None:
+            urls.append({'title': label,
+                         'url': url})
+            if results is not None:
+                results.append({'title': label,
+                                'url': url})
+
+
+def get_imdblink(result, url_prefix):
+    """Return the IMDb URL for the id found in *result*, or None.
+
+    The first two characters of the id select the site section.  None is
+    returned for an unknown prefix and also when no id is found; the latter
+    used to raise UnboundLocalError.
+    """
+    sections = {'tt': 'title/',
+                'nm': 'name/',
+                'ch': 'character/',
+                'co': 'company/',
+                'ev': 'event/'}
+
+    imdb_id = eval_xpath(result, value_xpath)
+    if not imdb_id:
+        return None
+
+    imdb_id = extract_text(imdb_id)
+    section = sections.get(imdb_id[:2])
+    if section is None:
+        return None
+    return url_prefix + section + imdb_id
+
+
+def _dms_to_decimal(coordinate, negative_hemisphere):
+    """Convert a degrees/minutes/seconds string to decimal degrees.
+
+    The value is negated when *coordinate* contains *negative_hemisphere*.
+    """
+    degree_pos = coordinate.find(u'°')
+    minute_pos = coordinate.find('\'')
+    second_pos = coordinate.find('"')
+
+    decimal = int(coordinate[:degree_pos])
+    if minute_pos >= 0:
+        decimal += int(coordinate[degree_pos + 1:minute_pos] or 0) / 60.0
+    if second_pos >= 0:
+        decimal += float(coordinate[minute_pos + 1:second_pos] or 0) / 3600.0
+    if coordinate.find(negative_hemisphere) >= 0:
+        decimal *= -1
+    return decimal
+
+
+def get_geolink(result):
+    """Return an OpenStreetMap URL for the coordinates in *result*, or None.
+
+    None is returned when no value is found or when the text does not split
+    into exactly a latitude and a longitude at a comma.
+    """
+    coordinates = eval_xpath(result, value_xpath)
+    if not coordinates:
+        return None
+    parts = extract_text(coordinates[0]).split(',')
+    if len(parts) != 2:
+        # not a "latitude, longitude" pair
+        return None
+    latitude, longitude = parts
+
+    # convert to decimal
+    lat = _dms_to_decimal(latitude, 'S')
+    lon = _dms_to_decimal(longitude, 'W')
+
+    # TODO: get precision
+    precision = 0.0002
+    # there is no zoom information, deduce from precision (error prone)
+    # samples :
+    # 13 --> 5
+    # 1 --> 6
+    # 0.016666666666667 --> 9
+    # 0.00027777777777778 --> 19
+    # wolframalpha :
+    # quadratic fit { {13, 5}, {1, 6}, {0.0166666, 9}, {0.0002777777,19}}
+    # 14.1186-8.8322 x+0.625447 x^2
+    if precision < 0.0003:
+        zoom = 19
+    else:
+        zoom = int(15 - precision * 8.8322 + precision * precision * 0.625447)
+
+    url = url_map\
+        .replace('{latitude}', str(lat))\
+        .replace('{longitude}', str(lon))\
+        .replace('{zoom}', str(zoom))
+
+    return url
+
+
+def get_wikilink(result, wikiid):
+    """Return the first site link of *result* for *wikiid* as an https URL, or None."""
+    matches = eval_xpath(result, wikilink_xpath.replace('{wikiid}', wikiid))
+    if not matches:
+        return None
+
+    link = matches[0]
+    if link.startswith('http://'):
+        return link.replace('http://', 'https://')
+    if link.startswith('//'):
+        # protocol-relative link
+        return 'https:' + link
+    return link
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
new file mode 100644
index 000000000..4dae735d1
--- /dev/null
+++ b/searx/engines/wikipedia.py
@@ -0,0 +1,133 @@
+"""
+ Wikipedia (Web)
+
+ @website https://{language}.wikipedia.org
+ @provide-api yes
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, infobox
+"""
+
+from json import loads
+from lxml.html import fromstring
+from searx.url_utils import quote, urlencode
+from searx.utils import match_language
+
+# search-url
+# {language} is filled with the code returned by url_lang()
+base_url = u'https://{language}.wikipedia.org/'
+# API query; {query} is the urlencoded 'titles' parameter built in request()
+search_url = base_url + u'w/api.php?'\
+ 'action=query'\
+ '&format=json'\
+ '&{query}'\
+ '&prop=extracts|pageimages'\
+ '&exintro'\
+ '&explaintext'\
+ '&pithumbsize=300'\
+ '&redirects'
+# page parsed by _fetch_supported_languages()
+supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
+
+
+# set language in base_url
+def url_lang(lang):
+    """Return the language code to use for *lang*; 'en' for 'all' or unknown languages."""
+    lang_pre = lang.split('-')[0]
+    if lang_pre == 'all':
+        return 'en'
+    if not (lang_pre in supported_languages or lang_pre in language_aliases):
+        return 'en'
+    return match_language(lang, supported_languages, language_aliases).split('-')[0]
+
+
+# do search-request
+def request(query, params):
+    """Store the Wikipedia API URL for *query* in params['url'] and return params.
+
+    An all-lowercase query is extended to 'query|Title Cased Query' so both
+    variants are requested.
+    """
+    if query.islower():
+        text = query.decode('utf-8')
+        query = u'{0}|{1}'.format(text, text.title()).encode('utf-8')
+
+    titles = urlencode({'titles': query})
+    params['url'] = search_url.format(query=titles,
+                                      language=url_lang(params['language']))
+
+    return params
+
+
+# get first meaningful paragraph
+# this should filter out disambiguation pages and notes above first paragraph
+# "magic numbers" were obtained by fine tuning
+def extract_first_paragraph(content, title, image):
+    """Return the first meaningful paragraph of *content*, or None.
+
+    A paragraph qualifies when it has at least 200 characters, or when the
+    title occurs within its first ``len(title) + 35`` characters and either
+    an image exists or it has at least 150 characters.  The search gives up
+    after four non-matching paragraphs.
+
+    :param content: plain-text extract; None or '' (a page without an
+        extract) yields None instead of raising AttributeError
+    :param title: article title
+    :param image: thumbnail URL or None
+    """
+    if not content:
+        return None
+
+    lowered_title = title.lower()
+    search_end = len(title) + 35
+
+    failed_attempts = 0
+    for paragraph in content.split('\n'):
+        starts_with_title = paragraph.lower().find(lowered_title, 0, search_end)
+        length = len(paragraph)
+
+        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
+            return paragraph
+
+        failed_attempts += 1
+        if failed_attempts > 3:
+            return None
+
+    return None
+
+
+# get response from search-request
+def response(resp):
+    """Build a link result and an infobox from the Wikipedia API response.
+
+    Returns an empty list when the response holds no page with a positive
+    id.  That includes a missing 'query' key and an empty 'pages' mapping,
+    which used to raise KeyError and NameError respectively.
+    """
+    results = []
+
+    search_result = loads(resp.text)
+    pages = search_result.get('query', {}).get('pages', {})
+
+    # wikipedia article's unique id
+    # first valid id is assumed to be the requested article
+    page = None
+    for article_id in pages:
+        if int(article_id) > 0:
+            page = pages[article_id]
+            break
+
+    if page is None:
+        return []
+
+    title = page.get('title')
+
+    image = page.get('thumbnail')
+    if image:
+        image = image.get('source')
+
+    # a page can come without an extract; pass '' so paragraph extraction does not fail
+    extract = page.get('extract') or ''
+
+    summary = extract_first_paragraph(extract, title, image)
+
+    # link to wikipedia article
+    wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
+        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))
+
+    results.append({'url': wikipedia_link, 'title': title})
+
+    results.append({'infobox': title,
+                    'id': wikipedia_link,
+                    'content': summary,
+                    'img_src': image,
+                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
+
+    return results
+
+
+# get supported languages from their site
+def _fetch_supported_languages(resp):
+    """Parse the language tables in *resp* into {code: {name, english_name, articles}}.
+
+    Only rows with at least 100 articles are kept.
+    """
+    languages = {}
+    dom = fromstring(resp.text)
+    for table in dom.xpath('//table[contains(@class,"sortable")]'):
+        # exclude header row
+        for row in table.xpath('.//tr')[1:]:
+            cells = row.xpath('./td')
+            code = cells[3].xpath('./a')[0].text
+            name = cells[2].xpath('./a')[0].text
+            english_name = cells[1].xpath('./a')[0].text
+            articles = int(cells[4].xpath('./a/b')[0].text.replace(',', ''))
+            # exclude languages with too few articles
+            if articles < 100:
+                continue
+            languages[code] = {"name": name, "english_name": english_name, "articles": articles}
+
+    return languages
diff --git a/searx/engines/wolframalpha_api.py b/searx/engines/wolframalpha_api.py
new file mode 100644
index 000000000..1c58c4a9b
--- /dev/null
+++ b/searx/engines/wolframalpha_api.py
@@ -0,0 +1,129 @@
+# Wolfram Alpha (Science)
+#
+# @website https://www.wolframalpha.com
+# @provide-api yes (https://api.wolframalpha.com/v2/)
+#
+# @using-api yes
+# @results XML
+# @stable yes
+# @parse url, infobox
+
+from lxml import etree
+from searx.url_utils import urlencode
+
+# search-url
+# API endpoint; {api_key} and {query} are filled in request()
+search_url = 'https://api.wolframalpha.com/v2/query?appid={api_key}&{query}'
+# website URL for the same query; request() sends it as Referer and response() uses it as result link
+site_url = 'https://www.wolframalpha.com/input/?{query}'
+api_key = '' # defined in settings.yml
+
+# xpath variables
+# matches when the query result reports success="false"
+failure_xpath = '/queryresult[attribute::success="false"]'
+# plaintext of the pod whose id starts with "Input"
+input_xpath = '//pod[starts-with(attribute::id, "Input")]/subpod/plaintext'
+pods_xpath = '//pod'
+subpods_xpath = './subpod'
+pod_primary_xpath = './@primary'
+pod_id_xpath = './@id'
+pod_title_xpath = './@title'
+plaintext_xpath = './plaintext'
+image_xpath = './img'
+img_src_xpath = './@src'
+img_alt_xpath = './@alt'
+
+# pods to display as image in infobox
+# these pods do return a plaintext, but they look better and are more useful as images
+image_pods = {'VisualRepresentation',
+ 'Illustration'}
+
+
+# do search-request
+def request(query, params):
+    """Store the API URL in params['url'] and the site URL in the Referer header.
+
+    Returns the same *params* dict.
+    """
+    api_query = urlencode({'input': query})
+    params['url'] = search_url.format(query=api_query, api_key=api_key)
+
+    site_query = urlencode({'i': query})
+    params['headers']['Referer'] = site_url.format(query=site_query)
+
+    return params
+
+
+# replace private user area characters to make text legible
+def replace_pua_chars(text):
+    """Return *text* with the private-use-area characters below replaced by legible ones."""
+    # (private use area character, replacement)
+    substitutions = (
+        (u'\uf522', u'\u2192'),  # right arrow
+        (u'\uf7b1', u'\u2115'),  # set of natural numbers
+        (u'\uf7b4', u'\u211a'),  # set of rational numbers
+        (u'\uf7b5', u'\u211d'),  # set of real numbers
+        (u'\uf7bd', u'\u2124'),  # set of integer numbers
+        (u'\uf74c', 'd'),  # differential
+        (u'\uf74d', u'\u212f'),  # euler's number
+        (u'\uf74e', 'i'),  # imaginary number
+        (u'\uf7d9', '='),  # equals sign
+    )
+
+    for pua_char, replacement in substitutions:
+        text = text.replace(pua_char, replacement)
+
+    return text
+
+
+# get response from search-request
+def response(resp):
+    """Build an infobox and a link result from the Wolfram|Alpha XML response.
+
+    Returns an empty list when the query failed or no pod produced a text
+    or image chunk.
+    """
+    results = []
+
+    search_results = etree.XML(resp.content)
+
+    # return empty array if there are no results
+    if search_results.xpath(failure_xpath):
+        return []
+
+    # explicit emptiness check instead of a bare except around [0]
+    input_nodes = search_results.xpath(input_xpath)
+    infobox_title = (input_nodes[0].text if input_nodes else None) or ""
+
+    pods = search_results.xpath(pods_xpath)
+    result_chunks = []
+    result_content = ""
+    for pod in pods:
+        pod_id = pod.xpath(pod_id_xpath)[0]
+        pod_title = pod.xpath(pod_title_xpath)[0]
+        pod_is_result = pod.xpath(pod_primary_xpath)
+
+        subpods = pod.xpath(subpods_xpath)
+        if not subpods:
+            continue
+
+        # Appends either a text or an image, depending on which one is more suitable
+        for subpod in subpods:
+            # a subpod without <plaintext> is treated like one with empty text
+            plaintext = subpod.xpath(plaintext_xpath)
+            content = plaintext[0].text if plaintext else None
+            image = subpod.xpath(image_xpath)
+
+            if content and pod_id not in image_pods:
+                # make the text legible before it is used anywhere, so the
+                # summary and the title get the same cleanup as the chunks
+                content = replace_pua_chars(content)
+
+                if pod_is_result or not result_content:
+                    if pod_id != "Input":
+                        result_content = "%s: %s" % (pod_title, content)
+
+                # if no input pod was found, title is first plaintext pod
+                if not infobox_title:
+                    infobox_title = content
+
+                result_chunks.append({'label': pod_title, 'value': content})
+
+            elif image:
+                result_chunks.append({'label': pod_title,
+                                      'image': {'src': image[0].xpath(img_src_xpath)[0],
+                                                'alt': image[0].xpath(img_alt_xpath)[0]}})
+
+    if not result_chunks:
+        return []
+
+    title = "Wolfram|Alpha (%s)" % infobox_title
+
+    # append infobox
+    results.append({'infobox': infobox_title,
+                    'attributes': result_chunks,
+                    'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}]})
+
+    # append link to site
+    results.append({'url': resp.request.headers['Referer'],
+                    'title': title,
+                    'content': result_content})
+
+    return results
diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py
new file mode 100644
index 000000000..387c9fa17
--- /dev/null
+++ b/searx/engines/wolframalpha_noapi.py
@@ -0,0 +1,121 @@
+# Wolfram|Alpha (Science)
+#
+# @website https://www.wolframalpha.com/
+# @provide-api yes (https://api.wolframalpha.com/v2/)
+#
+# @using-api no
+# @results JSON
+# @stable no
+# @parse url, infobox
+
+from json import loads
+from time import time
+
+from searx.poolrequests import get as http_get
+from searx.url_utils import urlencode
+
+# search-url
+url = 'https://www.wolframalpha.com/'
+
+# JSON endpoint; {query} and {token} are filled in request()
+search_url = url + 'input/json.jsp'\
+ '?async=false'\
+ '&banners=raw'\
+ '&debuggingdata=false'\
+ '&format=image,plaintext,imagemap,minput,moutput'\
+ '&formattimeout=2'\
+ '&{query}'\
+ '&output=JSON'\
+ '&parsetimeout=2'\
+ '&proxycode={token}'\
+ '&scantimeout=0.5'\
+ '&sponsorcategories=true'\
+ '&statemethod=deploybutton'
+
+# website URL for the same query; sent as Referer and used as result link
+referer_url = url + 'input/?{query}'
+
+# module-wide token state, updated in place by obtain_token()
+token = {'value': '',
+ 'last_updated': None}
+
+# pods to display as image in infobox
+# these pods do return a plaintext, but they look better and are more useful as images
+image_pods = {'VisualRepresentation',
+ 'Illustration',
+ 'Symbol'}
+
+
+# it seems that wolframalpha resets its token every hour
+def obtain_token():
+    """Fetch a fresh token and store it in the module-level ``token`` dict.
+
+    'last_updated' is set to the start of the current hour.  Fetching is best
+    effort: on any ordinary error the previous token is kept.  The ``token``
+    dict is returned either way.
+    """
+    now = time()
+    # round down to the full hour
+    update_time = now - (now % 3600)
+    try:
+        token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0)
+        token['value'] = loads(token_response.text)['code']
+        token['last_updated'] = update_time
+    except Exception:
+        # deliberately ignored (network error, unexpected payload, ...);
+        # unlike a bare except this lets KeyboardInterrupt/SystemExit through
+        pass
+    return token
+
+
+def init(engine_settings=None):
+ # fetch an initial token; the engine_settings argument is not used here
+ obtain_token()
+
+
+# do search-request
+def request(query, params):
+    """Store the JSON endpoint URL and the Referer header for *query* in params.
+
+    The token is fetched again first when the stored one is older than an
+    hour or was never obtained.
+    """
+    last_updated = token['last_updated'] or 0
+    # obtain token if last update was more than an hour
+    if time() - last_updated > 3600:
+        obtain_token()
+
+    api_query = urlencode({'input': query})
+    params['url'] = search_url.format(query=api_query, token=token['value'])
+
+    site_query = urlencode({'i': query})
+    params['headers']['Referer'] = referer_url.format(query=site_query)
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+ # Build an infobox and a link result from the JSON answer in resp.text.
+ # Returns [] when the query was not successful or no chunk was collected.
+ results = []
+
+ resp_json = loads(resp.text)
+
+ if not resp_json['queryresult']['success']:
+ return []
+
+ # TODO handle resp_json['queryresult']['assumptions']
+ result_chunks = []
+ infobox_title = ""
+ result_content = ""
+ for pod in resp_json['queryresult']['pods']:
+ pod_id = pod.get('id', '')
+ pod_title = pod.get('title', '')
+ pod_is_result = pod.get('primary', None)
+
+ if 'subpods' not in pod:
+ continue
+
+ # the 'Input' pod provides the title; until then the first pod with subpods does
+ if pod_id == 'Input' or not infobox_title:
+ infobox_title = pod['subpods'][0]['plaintext']
+
+ for subpod in pod['subpods']:
+ # text chunk, unless the pod is one of those shown as image
+ if subpod['plaintext'] != '' and pod_id not in image_pods:
+ # append unless it's not an actual answer
+ if subpod['plaintext'] != '(requires interactivity)':
+ result_chunks.append({'label': pod_title, 'value': subpod['plaintext']})
+
+ # summary text: a primary pod, or the first usable pod while none is set
+ if pod_is_result or not result_content:
+ if pod_id != "Input":
+ result_content = pod_title + ': ' + subpod['plaintext']
+
+ elif 'img' in subpod:
+ result_chunks.append({'label': pod_title, 'image': subpod['img']})
+
+ if not result_chunks:
+ return []
+
+ # infobox entry; the Referer set in request() serves as link to the site
+ results.append({'infobox': infobox_title,
+ 'attributes': result_chunks,
+ 'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}]})
+
+ # plain link result
+ results.append({'url': resp.request.headers['Referer'],
+ 'title': 'Wolfram|Alpha (' + infobox_title + ')',
+ 'content': result_content})
+
+ return results
diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py
new file mode 100644
index 000000000..f1154b16d
--- /dev/null
+++ b/searx/engines/www1x.py
@@ -0,0 +1,58 @@
+"""
+ 1x (Images)
+
+ @website http://1x.com/
+ @provide-api no
+
+ @using-api no
+ @results HTML
+ @stable no (HTML can change)
+ @parse url, title, thumbnail, img_src, content
+"""
+
+from lxml import html
+from searx.url_utils import urlencode, urljoin
+from searx.engines.xpath import extract_text
+
+# engine dependent config
+categories = ['images']
+paging = False
+
+# search-url
+# base_url is also used to resolve the links and image sources found in results
+base_url = 'https://1x.com'
+# {query} is the urlencoded 'q' parameter built in request()
+search_url = base_url + '/backend/search.php?{query}'
+
+
+# do search-request
+def request(query, params):
+    """Store the 1x search URL for *query* in params['url'] and return params."""
+    encoded_query = urlencode({'q': query})
+    params['url'] = search_url.format(query=encoded_query)
+    return params
+
+
+# get response from search-request
+def response(resp):
+    """Parse the 1x result listing into image results.
+
+    Items without a link or without an image are skipped.
+    """
+    results = []
+
+    dom = html.fromstring(resp.text)
+    for res in dom.xpath('//div[@class="List-item MainListing"]'):
+        # './/a' is relative to this item; the former '//a' selected the
+        # first link of the whole document for every item
+        links = res.xpath('.//a')
+        images = res.xpath('.//img')
+        if not links or not images:
+            continue
+        link = links[0]
+
+        url = urljoin(base_url, link.attrib.get('href'))
+        title = extract_text(link)
+
+        thumbnail_src = urljoin(base_url, images[0].attrib['src'])
+        # TODO: get image with higher resolution
+        img_src = thumbnail_src
+
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'img_src': img_src,
+                        'content': '',
+                        'thumbnail_src': thumbnail_src,
+                        'template': 'images.html'})
+
+    # return results
+    return results
diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py
new file mode 100644
index 000000000..b75896cc7
--- /dev/null
+++ b/searx/engines/xpath.py
@@ -0,0 +1,133 @@
+from lxml import html
+from lxml.etree import _ElementStringResult, _ElementUnicodeResult
+from searx.utils import html_to_text, eval_xpath
+from searx.url_utils import unquote, urlencode, urljoin, urlparse
+
+# Module-level settings read by request() and response() below.
+# search_url is a format string with {query} and, when paging, {pageno}
+search_url = None
+url_xpath = None
+content_xpath = None
+title_xpath = None
+# optional: when set, response() adds 'img_src' to results that have a match
+thumbnail_xpath = False
+paging = False
+# optional: when set, response() appends 'suggestion' entries
+suggestion_xpath = ''
+# optional: when set, the url/title/content xpaths are evaluated per matched result node
+results_xpath = ''
+
+# parameters for engines with paging support
+#
+# number of results on each page
+# (only needed if the site requires not a page number, but an offset)
+page_size = 1
+# number of the first page (usually 0 or 1)
+first_page_num = 1
+
+
+# the string below describes extract_text()
+'''
+if xpath_results is list, extract the text from each result and concat the list
+if xpath_results is a xml element, extract all the text node from it
+ ( text_content() method from lxml )
+if xpath_results is a string element, then it's already done
+'''
+
+
+def extract_text(xpath_results):
+    """Return the text of an xpath result.
+
+    * list: the texts of all items concatenated, then stripped
+    * lxml string result: its characters unchanged
+    * anything else is treated as an element: its text without the tail,
+      with every run of whitespace collapsed to a single space
+    """
+    if isinstance(xpath_results, list):
+        # it's list of result : concat everything using recursive call
+        return ''.join(extract_text(e) for e in xpath_results).strip()
+
+    if isinstance(xpath_results, (_ElementStringResult, _ElementUnicodeResult)):
+        # it's a string
+        return ''.join(xpath_results)
+
+    # it's a element
+    text = html.tostring(
+        xpath_results, encoding='unicode', method='text', with_tail=False
+    )
+    # split() without arguments already drops leading/trailing whitespace
+    # and treats newlines like any other whitespace
+    return ' '.join(text.split())
+
+
+def extract_url(xpath_results, search_url):
+    """Return the normalized absolute URL contained in *xpath_results*.
+
+    Protocol-relative URLs take the scheme of *search_url* ('http' if it has
+    none); URLs starting with '/' are joined onto *search_url*.  Raises
+    Exception for an empty result list.
+    """
+    if xpath_results == []:
+        raise Exception('Empty url resultset')
+
+    url = extract_text(xpath_results)
+    if url.startswith('//'):
+        # add http or https to this kind of url //example.com/
+        scheme = urlparse(search_url).scheme or 'http'
+        url = u'{0}:{1}'.format(scheme, url)
+    elif url.startswith('/'):
+        # fix relative url to the search engine
+        url = urljoin(search_url, url)
+
+    return normalize_url(url)
+
+
+def normalize_url(url):
+    """Return *url* with a trailing '/' added when it has no path.
+
+    Yahoo redirect URLs (host 'search.yahoo.com', path starting with '/r'
+    and containing '/**') are replaced by the unquoted target after the
+    marker.  Raises Exception when *url* has no network location.
+    """
+    parsed_url = urlparse(url)
+
+    if not parsed_url.netloc:
+        raise Exception('Cannot parse url')
+    # add a / at this end of the url if there is no path
+    if not parsed_url.path:
+        url += '/'
+
+    # FIXME : hack for yahoo
+    if parsed_url.hostname == 'search.yahoo.com'\
+            and parsed_url.path.startswith('/r'):
+        p = parsed_url.path
+        mark = p.find('/**')
+        if mark != -1:
+            target = unquote(p[mark + 3:])
+            # only byte strings need decoding; calling .decode() on a
+            # text string fails on Python 3
+            if isinstance(target, bytes):
+                target = target.decode('utf-8')
+            return target
+
+    return url
+
+
+def request(query, params):
+    """Fill params['url'] from search_url and store the encoded query in params['query'].
+
+    When paging is enabled and search_url contains '{pageno}', the page
+    value is ``(pageno - 1) * page_size + first_page_num``.
+    """
+    # urlencode yields 'q=<encoded>'; drop the leading 'q='
+    encoded_query = urlencode({'q': query})[2:]
+
+    format_args = {'query': encoded_query}
+    if paging and search_url.find('{pageno}') >= 0:
+        format_args['pageno'] = (params['pageno'] - 1) * page_size + first_page_num
+
+    params['url'] = search_url.format(**format_args)
+    params['query'] = encoded_query
+
+    return params
+
+
+def response(resp):
+    """Extract url/title/content results (and suggestions) from the HTML in *resp*.
+
+    With results_xpath set, the other xpaths are evaluated per result node;
+    otherwise they are evaluated on the whole document and zipped together.
+    """
+    dom = html.fromstring(resp.text)
+    results = []
+
+    if results_xpath:
+        for node in eval_xpath(dom, results_xpath):
+            entry = {
+                'url': extract_url(eval_xpath(node, url_xpath), search_url),
+                'title': extract_text(eval_xpath(node, title_xpath)),
+                'content': extract_text(eval_xpath(node, content_xpath)),
+            }
+
+            # add thumbnail if available
+            if thumbnail_xpath:
+                thumbnails = eval_xpath(node, thumbnail_xpath)
+                if len(thumbnails) > 0:
+                    entry['img_src'] = extract_url(thumbnails, search_url)
+
+            results.append(entry)
+    else:
+        urls = (extract_url(x, search_url) for
+                x in eval_xpath(dom, url_xpath))
+        titles = map(extract_text, eval_xpath(dom, title_xpath))
+        contents = map(extract_text, eval_xpath(dom, content_xpath))
+        for url, title, content in zip(urls, titles, contents):
+            results.append({'url': url, 'title': title, 'content': content})
+
+    if suggestion_xpath:
+        for suggestion in eval_xpath(dom, suggestion_xpath):
+            results.append({'suggestion': extract_text(suggestion)})
+
+    return results
diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py
new file mode 100644
index 000000000..25bc83687
--- /dev/null
+++ b/searx/engines/yacy.py
@@ -0,0 +1,108 @@
+# Yacy (Web, Images, Videos, Music, Files)
+#
+# @website http://yacy.net
+# @provide-api yes
+# (http://www.yacy-websuche.de/wiki/index.php/Dev:APIyacysearch)
+#
+# @using-api yes
+# @results JSON
+# @stable yes
+# @parse (general) url, title, content, publishedDate
+# @parse (images) url, title, img_src
+#
+# @todo parse video, audio and file results
+
+from json import loads
+from dateutil import parser
+from searx.url_utils import urlencode
+
+from searx.utils import html_to_text
+
# engine dependent config
# TODO: add 'music', 'videos' and 'files' once those result types are parsed
categories = ['general', 'images']
paging = True
language_support = True
# used both as page size (maximumRecords) and to compute startRecord
number_of_results = 5

# search-url
base_url = 'http://localhost:8090'
search_url = (
    '/yacysearch.json?{query}'
    '&startRecord={offset}'
    '&maximumRecords={limit}'
    '&contentdom={search_type}'
    '&resource=global'
)

# yacy specific type-definitions (searx category -> contentdom value)
search_types = {
    'general': 'text',
    'images': 'image',
    'files': 'app',
    'music': 'audio',
    'videos': 'video',
}
+
+
+# do search-request
def request(query, params):
    """Build the yacy search url for ``query`` and store it in ``params['url']``."""
    start_record = (params['pageno'] - 1) * number_of_results
    # categories without a yacy mapping fall back to the literal '0'
    content_domain = search_types.get(params.get('category'), '0')

    path = search_url.format(
        query=urlencode({'query': query}),
        offset=start_record,
        limit=number_of_results,
        search_type=content_domain,
    )
    params['url'] = base_url + path

    # add language tag if specified
    language = params['language']
    if language != 'all':
        params['url'] += '&lr=lang_' + language.split('-')[0]

    return params
+
+
+# get response from search-request
def response(resp):
    """Turn the JSON body in ``resp.text`` into a list of result dicts.

    Only the first entry of ``channels`` is read.  Items carrying an
    ``image`` value become image results; every other item is handled as a
    general result with a parsed ``pubDate``.
    """
    payload = loads(resp.text)

    # nothing usable in the answer
    if not payload:
        return []

    channels = payload.get('channels', [])
    if len(channels) == 0:
        return []

    results = []
    for item in channels[0].get('items', []):
        if not item.get('image'):
            # general result
            published = parser.parse(item['pubDate'])
            results.append({'url': item['link'],
                            'title': item['title'],
                            'content': html_to_text(item['description']),
                            'publishedDate': published})
            continue

        # image result: prefer 'url', fall back to 'link', skip when neither exists
        if 'url' in item:
            target = item['url']
        elif 'link' in item:
            target = item['link']
        else:
            continue

        results.append({'url': target,
                        'title': item['title'],
                        'content': '',
                        'img_src': item['image'],
                        'template': 'images.html'})

    # TODO parse video, audio and file results

    return results
diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py
new file mode 100644
index 000000000..36c1a11f8
--- /dev/null
+++ b/searx/engines/yahoo.py
@@ -0,0 +1,160 @@
+"""
+ Yahoo (Web)
+
+ @website https://search.yahoo.com/web
+ @provide-api yes (https://developer.yahoo.com/boss/search/),
+ $0.80/1000 queries
+
+ @using-api no (because pricing)
+ @results HTML (using search portal)
+ @stable no (HTML can change)
+ @parse url, title, content, suggestion
+"""
+
+from lxml import html
+from searx.engines.xpath import extract_text, extract_url
+from searx.url_utils import unquote, urlencode
+from searx.utils import match_language, eval_xpath
+
# engine dependent config
categories = ['general']
paging = True
language_support = True
time_range_support = True

# search-url
base_url = 'https://search.yahoo.com/'
search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
# same url with the time-range parameters appended
search_url_with_time = search_url + '&age={age}&btf={btf}&fr2=time'

supported_languages_url = 'https://search.yahoo.com/web/advanced'

# specific xpath variables
results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]"
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
content_xpath = './/div[@class="compText aAbs"]'
suggestion_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' AlsoTry ')]//a"

# searx time range -> [value for 'age', value for 'btf']
time_range_dict = {
    'day': ['1d', 'd'],
    'week': ['1w', 'w'],
    'month': ['1m', 'm'],
}

language_aliases = {
    'zh-CN': 'zh-CHS',
    'zh-TW': 'zh-CHT',
    'zh-HK': 'zh-CHT',
}
+
+
+# remove yahoo-specific tracking-url
def parse_url(url_string):
    """Strip Yahoo's tracking wrapper from ``url_string``.

    The wrapped target is the part that begins at the first ``http`` after
    the ``/RU=`` marker and ends at the earliest of the last ``/RS`` / ``/RK``
    markers; it is returned unquoted.  When the string does not look like a
    tracking url (no embedded ``http`` target or no end marker) it is
    returned unchanged.
    """
    start = url_string.find('http', url_string.find('/RU=') + 1)

    end_positions = [
        position
        for position in (url_string.rfind(ending) for ending in ('/RS', '/RK'))
        if position > -1
    ]

    # start == 0: the url itself is already the target.
    # start == -1: no http target was found at all; slicing from -1 would
    # return a meaningless fragment, so hand the input back untouched.
    if start <= 0 or not end_positions:
        return url_string

    return unquote(url_string[start:min(end_positions)])
+
+
def _get_url(query, offset, language, time_range):
    """Return the full search url, using the time-filtered template when
    ``time_range`` is one of the keys of ``time_range_dict``."""
    format_args = {
        'offset': offset,
        'query': urlencode({'p': query}),
        'lang': language,
    }
    template = search_url

    if time_range in time_range_dict:
        age, btf = time_range_dict[time_range]
        format_args['age'] = age
        format_args['btf'] = btf
        template = search_url_with_time

    return base_url + template.format(**format_args)
+
+
+def _get_language(params):
+ if params['language'] == 'all':
+ return 'en'
+
+ language = match_language(params['language'], supported_languages, language_aliases)
+ if language not in language_aliases.values():
+ language = language.split('-')[0]
+ language = language.replace('-', '_').lower()
+
+ return language
+
+
+# do search-request
def request(query, params):
    """Populate ``params`` with the Yahoo search url and language cookie.

    ``params`` is returned untouched when a time range is requested that
    ``time_range_dict`` does not know.
    """
    time_range = params['time_range']
    if time_range and time_range not in time_range_dict:
        return params

    # the 'b' parameter is 1-based, ten results per page
    offset = (params['pageno'] - 1) * 10 + 1
    language = _get_language(params)

    params['url'] = _get_url(query, offset, language, time_range)

    # TODO required?
    cookie_template = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'
    params['cookies']['sB'] = cookie_template.format(lang=language)

    return params
+
+
+# get response from search-request
def response(resp):
    """Parse a Yahoo result page into a list of searx result dicts.

    The list may start with a ``number_of_results`` entry, followed by one
    ``url``/``title``/``content`` dict per result and one ``suggestion`` dict
    per "also try" link.  Results whose url or title cannot be extracted are
    skipped; a result without a content node gets an empty ``content``.
    """
    results = []

    dom = html.fromstring(resp.text)

    # the hit counter is optional: ignore it when missing or not a number
    try:
        counter = eval_xpath(dom, '//div[@class="compPagination"]/span[last()]/text()')[0]
        results_num = int(counter.split()[0].replace(',', ''))
        results.append({'number_of_results': results_num})
    except Exception:  # best effort, but let KeyboardInterrupt/SystemExit through
        pass

    # parse results
    for result in eval_xpath(dom, results_xpath):
        try:
            url = parse_url(extract_url(eval_xpath(result, url_xpath), search_url))
            title = extract_text(eval_xpath(result, title_xpath)[0])
        except Exception:
            # malformed entry: skip it rather than fail the whole page
            continue

        # a missing snippet must not abort the whole page with an IndexError
        content_nodes = eval_xpath(result, content_xpath)
        content = extract_text(content_nodes[0]) if content_nodes else ''

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    # parse suggestions (no-op when there are none)
    for suggestion in eval_xpath(dom, suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})

    return results
+
+
+# get supported languages from their site
def _fetch_supported_languages(resp):
    """Collect language codes from the ``yschlang`` inputs found in ``resp.text``.

    Each input value loses its first five characters (presumably a ``lang_``
    prefix - verify against the page) and is split on ``_``; a two-part code
    becomes ``xx-YY``, anything else keeps only its first part.
    """
    dom = html.fromstring(resp.text)
    codes = []

    for option in eval_xpath(dom, '//div[@id="yschlang"]/span/label/input'):
        parts = eval_xpath(option, './@value')[0][5:].split('_')
        if len(parts) == 2:
            codes.append(parts[0] + '-' + parts[1].upper())
        else:
            codes.append(parts[0])

    return codes
diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py
new file mode 100644
index 000000000..9f6a4159b
--- /dev/null
+++ b/searx/engines/yahoo_news.py
@@ -0,0 +1,110 @@
+# Yahoo (News)
+#
+# @website https://news.yahoo.com
+# @provide-api yes (https://developer.yahoo.com/boss/search/)
+# $0.80/1000 queries
+#
+# @using-api no (because pricing)
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content, publishedDate
+
+import re
+from datetime import datetime, timedelta
+from lxml import html
+from searx.engines.xpath import extract_text, extract_url
+from searx.engines.yahoo import (
+ parse_url, _fetch_supported_languages, supported_languages_url, language_aliases
+)
+from dateutil import parser
+from searx.url_utils import urlencode
+from searx.utils import match_language
+
# engine dependent config
categories = ['news']
paging = True
language_support = True

# search-url
# NOTE(review): '&{lang}=uh3_news_web_gs_1' uses the language code as a
# parameter name, which looks unintended - confirm before changing.
search_url = (
    'https://news.search.yahoo.com/search'
    '?{query}&b={offset}&{lang}=uh3_news_web_gs_1&pz=10&xargs=0&vl=lang_{lang}'
)

# specific xpath variables
results_xpath = '//ol[contains(@class,"searchCenterMiddle")]//li'
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
content_xpath = './/div[@class="compText"]'
publishedDate_xpath = './/span[contains(@class,"tri")]'
suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a'
+
+
+# do search-request
def request(query, params):
    """Populate ``params`` with the Yahoo News search url and language cookie."""
    # the 'b' parameter is 1-based, ten results per page
    offset = (params['pageno'] - 1) * 10 + 1

    requested = params['language']
    if requested == 'all':
        language = 'en'
    else:
        matched = match_language(requested, supported_languages, language_aliases)
        language = matched.split('-')[0]

    params['url'] = search_url.format(offset=offset,
                                      query=urlencode({'p': query}),
                                      lang=language)

    # TODO required?
    # NOTE(review): the value starts with an unbalanced '"' - confirm whether intended
    cookie_template = '"v=1&vm=p&fl=1&vl=lang_{lang}&sh=1&pn=10&rw=new'
    params['cookies']['sB'] = cookie_template.format(lang=language)
    return params
+
+
def sanitize_url(url):
    """Drop everything from ``;_ylt=`` onwards for urls containing ``.yahoo.com/``."""
    if ".yahoo.com/" not in url:
        return url
    return re.sub(u"\\;\\_ylt\\=.+$", "", url)
+
+
+# get response from search-request
def _parse_published_date(text):
    """Convert the date label of a news result into a ``datetime``.

    Relative labels ("N minutes ago", "N days ago", "N hours, M minutes ago")
    are subtracted from the current time; anything else is handed to
    ``dateutil``.  Labels that cannot be parsed yield the current time.
    """
    now = datetime.now()

    if re.match(r"^[0-9]+ minute(s|) ago$", text):
        return now - timedelta(minutes=int(re.match(r'\d+', text).group()))

    if re.match(r"^[0-9]+ days? ago$", text):
        return now - timedelta(days=int(re.match(r'\d+', text).group()))

    if re.match(r"^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", text):
        numbers = re.findall(r'\d+', text)
        return now - timedelta(hours=int(numbers[0])) - timedelta(minutes=int(numbers[1]))

    try:
        published = parser.parse(text)
    except Exception:  # best effort, but let KeyboardInterrupt/SystemExit through
        return now

    # a year of 1900 is treated as "no year given": use the current one
    if published.year == 1900:
        published = published.replace(year=now.year)
    return published


def response(resp):
    """Parse a Yahoo News result page into a list of searx result dicts.

    Entries that do not contain exactly one link are skipped.  Each result
    carries ``url``, ``title``, ``content`` and ``publishedDate``.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        urls = result.xpath(url_xpath)
        if len(urls) != 1:
            continue

        url = sanitize_url(parse_url(extract_url(urls, search_url)))
        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])

        # parse publishedDate
        date_label = extract_text(result.xpath(publishedDate_xpath)[0])
        publishedDate = _parse_published_date(date_label)

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'publishedDate': publishedDate})

    # return results
    return results
diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py
new file mode 100644
index 000000000..1c789f6cb
--- /dev/null
+++ b/searx/engines/yandex.py
@@ -0,0 +1,64 @@
+"""
+ Yandex (Web)
+
+ @website https://yandex.ru/
+ @provide-api ?
+ @using-api no
+ @results HTML (using search portal)
+ @stable no (HTML can change)
+ @parse url, title, content
+"""
+
+from lxml import html
+from searx import logger
+from searx.url_utils import urlencode
+
logger = logger.getChild('yandex engine')

# engine dependent config
categories = ['general']
paging = True
language_support = True  # TODO

# top-level domain used when the language has no entry in language_map
default_tld = 'com'
# language code -> yandex top-level domain
language_map = {
    'ru': 'ru',
    'ua': 'ua',
    'be': 'by',
    'kk': 'kz',
    'tr': 'com.tr',
}

# search-url
base_url = 'https://yandex.{tld}/'
search_url = 'search/?{query}&p={page}'

# xpath expressions, evaluated relative to each result node
results_xpath = '//li[@class="serp-item"]'
url_xpath = './/h2/a/@href'
title_xpath = './/h2/a//text()'
content_xpath = './/div[@class="text-container typo typo_text_m typo_line_m organic__text"]//text()'
+
+
def request(query, params):
    """Store the Yandex search url for ``query`` in ``params['url']``.

    The host's top-level domain is picked from ``language_map`` using the
    primary language subtag, falling back to ``default_tld``.  The ``p``
    parameter is the zero-based page number.
    """
    language_code = params['language'].split('-')[0]
    tld = language_map.get(language_code) or default_tld

    path = search_url.format(page=params['pageno'] - 1,
                             query=urlencode({'text': query}))
    params['url'] = base_url.format(tld=tld) + path
    return params
+
+
+# get response from search-request
def response(resp):
    """Parse a Yandex result page into ``url``/``title``/``content`` dicts.

    Entries that cannot be parsed (e.g. no link found) are logged and skipped.
    """
    dom = html.fromstring(resp.text)
    results = []

    for result in dom.xpath(results_xpath):
        try:
            res = {'url': result.xpath(url_xpath)[0],
                   'title': ''.join(result.xpath(title_xpath)),
                   'content': ''.join(result.xpath(content_xpath))}
        except Exception:
            # not a bare except: KeyboardInterrupt/SystemExit must still propagate
            logger.exception('yandex parse crash')
            continue

        results.append(res)

    return results
diff --git a/searx/engines/youtube_api.py b/searx/engines/youtube_api.py
new file mode 100644
index 000000000..bc4c0d58e
--- /dev/null
+++ b/searx/engines/youtube_api.py
@@ -0,0 +1,83 @@
+# Youtube (Videos)
+#
+# @website https://www.youtube.com/
+# @provide-api yes (https://developers.google.com/apis-explorer/#p/youtube/v3/youtube.search.list)
+#
+# @using-api yes
+# @results JSON
+# @stable yes
+# @parse url, title, content, publishedDate, thumbnail, embedded
+
+from json import loads
+from dateutil import parser
+from searx.url_utils import urlencode
+
# engine dependent config
categories = ['videos', 'music']
paging = False
language_support = True
# formatted into the request url as the 'key' parameter
api_key = None

# search-url
base_url = 'https://www.googleapis.com/youtube/v3/search'
search_url = base_url + '?part=snippet&{query}&maxResults=20&key={api_key}'

# iframe snippet; {videoid} is filled in per result
embedded_url = (
    '<iframe width="540" height="304" '
    'data-src="https://www.youtube-nocookie.com/embed/{videoid}" '
    'frameborder="0" allowfullscreen></iframe>'
)

base_youtube_url = 'https://www.youtube.com/watch?v='
+
+
+# do search-request
def request(query, params):
    """Store the YouTube API search url for ``query`` in ``params['url']``."""
    encoded_query = urlencode({'q': query})
    params['url'] = search_url.format(query=encoded_query, api_key=api_key)

    # add language tag if specified
    language = params['language']
    if language != 'all':
        params['url'] += '&relevanceLanguage=' + language.split('-')[0]

    return params
+
+
+# get response from search-request
def response(resp):
    """Turn the JSON search answer in ``resp.text`` into video result dicts.

    Returns an empty list when the answer has no ``items``.  Items without an
    ``id.videoId`` are skipped.
    """
    search_results = loads(resp.text)

    # return empty array if there are no results
    if 'items' not in search_results:
        return []

    results = []

    # parse results
    for result in search_results['items']:
        # The request does not restrict the result type, so items may be
        # channels or playlists; those carry no videoId and used to abort the
        # whole response with a KeyError.
        videoid = result.get('id', {}).get('videoId')
        if videoid is None:
            continue

        snippet = result['snippet']

        # append result
        results.append({'url': base_youtube_url + videoid,
                        'title': snippet['title'],
                        'content': snippet['description'],
                        'template': 'videos.html',
                        'publishedDate': parser.parse(snippet['publishedAt']),
                        'embedded': embedded_url.format(videoid=videoid),
                        'thumbnail': snippet['thumbnails']['high']['url']})

    # return results
    return results
diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py
new file mode 100644
index 000000000..49d0ae604
--- /dev/null
+++ b/searx/engines/youtube_noapi.py
@@ -0,0 +1,90 @@
+# Youtube (Videos)
+#
+# @website https://www.youtube.com/
+# @provide-api yes (https://developers.google.com/apis-explorer/#p/youtube/v3/youtube.search.list)
+#
+# @using-api no
+# @results HTML
+# @stable no
+# @parse url, title, content, publishedDate, thumbnail, embedded
+
+from functools import reduce
+from json import loads
+from searx.engines.xpath import extract_text
+from searx.utils import list_get
+from searx.url_utils import quote_plus
+
# engine dependent config
categories = ['videos', 'music']
paging = True
language_support = False
time_range_support = True

# search-url
base_url = 'https://www.youtube.com/results'
search_url = base_url + '?search_query={query}&page={page}'
# appended to the search url when a supported time range is requested
time_range_url = '&sp=EgII{time_range}%253D%253D'
time_range_dict = {
    'day': 'Ag',
    'week': 'Aw',
    'month': 'BA',
    'year': 'BQ',
}

# iframe snippet; {videoid} is filled in per result
embedded_url = (
    '<iframe width="540" height="304" '
    'data-src="https://www.youtube-nocookie.com/embed/{videoid}" '
    'frameborder="0" allowfullscreen></iframe>'
)

base_youtube_url = 'https://www.youtube.com/watch?v='
+
+
+# do search-request
def request(query, params):
    """Store the YouTube results-page url for ``query`` in ``params['url']``.

    The time-range suffix is appended only for ranges listed in
    ``time_range_dict``.
    """
    params['url'] = search_url.format(query=quote_plus(query),
                                      page=params['pageno'])

    requested_range = params['time_range']
    if requested_range in time_range_dict:
        range_code = time_range_dict[requested_range]
        params['url'] += time_range_url.format(time_range=range_code)

    return params
+
+
+# get response from search-request
def response(resp):
    """Extract video results from the ``ytInitialData`` JSON embedded in ``resp.text``.

    The JSON object is taken from the first ``{`` after ``ytInitialData`` up
    to the first ``;`` followed by a newline.  Only containers holding a
    ``videoRenderer`` with a ``videoId`` produce a result.
    """
    # cut the page down to the embedded JSON object
    blob = resp.text[resp.text.find('ytInitialData'):]
    blob = blob[blob.find('{'):blob.find(';\n')]

    data = loads(blob) if blob else {}

    # walk down to the list of result sections, tolerating missing levels
    node = data.get('contents', {})
    for key in ('twoColumnSearchResultsRenderer', 'primaryContents', 'sectionListRenderer'):
        node = node.get(key, {})
    sections = node.get('contents', [])

    results = []
    for section in sections:
        containers = section.get('itemSectionRenderer', {}).get('contents', [])
        for container in containers:
            video = container.get('videoRenderer', {})
            videoid = video.get('videoId')
            if videoid is None:
                continue

            # append result
            results.append({
                'url': base_youtube_url + videoid,
                'title': get_text_from_json(video.get('title', {})),
                'content': get_text_from_json(video.get('descriptionSnippet', {})),
                'template': 'videos.html',
                'embedded': embedded_url.format(videoid=videoid),
                'thumbnail': 'https://i.ytimg.com/vi/' + videoid + '/hqdefault.jpg',
            })

    # return results
    return results
+
+
def get_text_from_json(element):
    """Return the text of ``element``.

    When ``element`` has a ``runs`` list, the ``text`` values of its entries
    are concatenated (entries without ``text`` contribute nothing).
    Otherwise ``simpleText`` is returned, or ``''`` when that is missing too.
    """
    if 'runs' not in element:
        return element.get('simpleText', '')
    return ''.join(run.get('text', '') for run in element.get('runs'))