summary refs log tree commit diff
path: root/searx/engines
diff options
context:
space:
mode:
Diffstat (limited to 'searx/engines')
-rw-r--r-- searx/engines/__init__.py 98
-rw-r--r-- searx/engines/archlinux.py 3
-rwxr-xr-x searx/engines/base.py 3
-rw-r--r-- searx/engines/bing.py 17
-rw-r--r-- searx/engines/bing_images.py 8
-rw-r--r-- searx/engines/bing_news.py 27
-rw-r--r-- searx/engines/btdigg.py 21
-rw-r--r-- searx/engines/dailymotion.py 3
-rw-r--r-- searx/engines/deezer.py 9
-rw-r--r-- searx/engines/deviantart.py 3
-rw-r--r-- searx/engines/dictzone.py 68
-rw-r--r-- searx/engines/digbt.py 58
-rw-r--r-- searx/engines/digg.py 3
-rw-r--r-- searx/engines/duckduckgo.py 3
-rw-r--r-- searx/engines/fdroid.py 3
-rw-r--r-- searx/engines/flickr.py 14
-rw-r--r-- searx/engines/flickr_noapi.py 25
-rw-r--r-- searx/engines/gigablast.py 5
-rw-r--r-- searx/engines/github.py 3
-rw-r--r-- searx/engines/google.py 8
-rw-r--r-- searx/engines/google_images.py 10
-rw-r--r-- searx/engines/google_news.py 83
-rw-r--r-- searx/engines/json_engine.py 2
-rw-r--r-- searx/engines/kickass.py 49
-rw-r--r-- searx/engines/nyaa.py 6
-rw-r--r-- searx/engines/openstreetmap.py 2
-rw-r--r-- searx/engines/pdbe.py 109
-rw-r--r-- searx/engines/piratebay.py 3
-rw-r--r-- searx/engines/qwant.py 10
-rw-r--r-- searx/engines/reddit.py 3
-rw-r--r-- searx/engines/searchcode_code.py 5
-rw-r--r-- searx/engines/searchcode_doc.py 17
-rw-r--r-- searx/engines/seedpeer.py 77
-rw-r--r-- searx/engines/spotify.py 9
-rw-r--r-- searx/engines/stackoverflow.py 5
-rw-r--r-- searx/engines/startpage.py 5
-rw-r--r-- searx/engines/subtitleseeker.py 5
-rw-r--r-- searx/engines/swisscows.py 9
-rw-r--r-- searx/engines/tokyotoshokan.py 1
-rw-r--r-- searx/engines/torrentz.py 1
-rw-r--r-- searx/engines/translated.py 64
-rw-r--r-- searx/engines/vimeo.py 32
-rw-r--r-- searx/engines/wolframalpha_api.py 16
-rw-r--r-- searx/engines/wolframalpha_noapi.py 15
-rw-r--r-- searx/engines/www500px.py 42
-rw-r--r-- searx/engines/xpath.py 2
-rw-r--r-- searx/engines/yacy.py 4
-rw-r--r-- searx/engines/yahoo.py 3
-rw-r--r-- searx/engines/yandex.py 9
-rw-r--r-- searx/engines/youtube_noapi.py 8
50 files changed, 688 insertions, 300 deletions
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index 782b622b0..87b1b0eb4 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -16,13 +16,13 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''
-from os.path import realpath, dirname, splitext, join
+from os.path import realpath, dirname
import sys
-from imp import load_source
from flask_babel import gettext
from operator import itemgetter
from searx import settings
from searx import logger
+from searx.utils import load_module
logger = logger.getChild('engines')
@@ -32,6 +32,7 @@ engine_dir = dirname(realpath(__file__))
engines = {}
categories = {'general': []}
+_initialized = False
engine_shortcuts = {}
engine_default_args = {'paging': False,
@@ -46,22 +47,18 @@ engine_default_args = {'paging': False,
'time_range_support': False}
-def load_module(filename):
- modname = splitext(filename)[0]
- if modname in sys.modules:
- del sys.modules[modname]
- filepath = join(engine_dir, filename)
- module = load_source(modname, filepath)
- module.name = modname
- return module
+def load_engine(engine_data):
+
+ if '_' in engine_data['name']:
+ logger.error('Engine name contains underscore: "{}"'.format(engine_data['name']))
+ sys.exit(1)
+ engine_module = engine_data['engine']
-def load_engine(engine_data):
- engine_name = engine_data['engine']
try:
- engine = load_module(engine_name + '.py')
+ engine = load_module(engine_module + '.py', engine_dir)
except:
- logger.exception('Cannot load engine "{}"'.format(engine_name))
+ logger.exception('Cannot load engine "{}"'.format(engine_module))
return None
for param_name in engine_data:
@@ -93,6 +90,9 @@ def load_engine(engine_data):
'result_count': 0,
'search_count': 0,
'page_load_time': 0,
+ 'page_load_count': 0,
+ 'engine_time': 0,
+ 'engine_time_count': 0,
'score_count': 0,
'errors': 0
}
@@ -109,32 +109,56 @@ def load_engine(engine_data):
return engine
+def to_percentage(stats, maxvalue):
+ for engine_stat in stats:
+ if maxvalue:
+ engine_stat['percentage'] = int(engine_stat['avg'] / maxvalue * 100)
+ else:
+ engine_stat['percentage'] = 0
+ return stats
+
+
def get_engines_stats():
# TODO refactor
pageloads = []
+ engine_times = []
results = []
scores = []
errors = []
scores_per_result = []
- max_pageload = max_results = max_score = max_errors = max_score_per_result = 0 # noqa
+ max_pageload = max_engine_times = max_results = max_score = max_errors = max_score_per_result = 0 # noqa
for engine in engines.values():
if engine.stats['search_count'] == 0:
continue
results_num = \
engine.stats['result_count'] / float(engine.stats['search_count'])
- load_times = engine.stats['page_load_time'] / float(engine.stats['search_count']) # noqa
+
+ if engine.stats['page_load_count'] != 0:
+ load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count']) # noqa
+ else:
+ load_times = 0
+
+ if engine.stats['engine_time_count'] != 0:
+ this_engine_time = engine.stats['engine_time'] / float(engine.stats['engine_time_count']) # noqa
+ else:
+ this_engine_time = 0
+
if results_num:
score = engine.stats['score_count'] / float(engine.stats['search_count']) # noqa
score_per_result = score / results_num
else:
score = score_per_result = 0.0
- max_results = max(results_num, max_results)
+
max_pageload = max(load_times, max_pageload)
+ max_engine_times = max(this_engine_time, max_engine_times)
+ max_results = max(results_num, max_results)
max_score = max(score, max_score)
max_score_per_result = max(score_per_result, max_score_per_result)
max_errors = max(max_errors, engine.stats['errors'])
+
pageloads.append({'avg': load_times, 'name': engine.name})
+ engine_times.append({'avg': this_engine_time, 'name': engine.name})
results.append({'avg': results_num, 'name': engine.name})
scores.append({'avg': score, 'name': engine.name})
errors.append({'avg': engine.stats['errors'], 'name': engine.name})
@@ -143,39 +167,19 @@ def get_engines_stats():
'name': engine.name
})
- for engine in pageloads:
- if max_pageload:
- engine['percentage'] = int(engine['avg'] / max_pageload * 100)
- else:
- engine['percentage'] = 0
-
- for engine in results:
- if max_results:
- engine['percentage'] = int(engine['avg'] / max_results * 100)
- else:
- engine['percentage'] = 0
-
- for engine in scores:
- if max_score:
- engine['percentage'] = int(engine['avg'] / max_score * 100)
- else:
- engine['percentage'] = 0
-
- for engine in scores_per_result:
- if max_score_per_result:
- engine['percentage'] = int(engine['avg']
- / max_score_per_result * 100)
- else:
- engine['percentage'] = 0
-
- for engine in errors:
- if max_errors:
- engine['percentage'] = int(float(engine['avg']) / max_errors * 100)
- else:
- engine['percentage'] = 0
+ pageloads = to_percentage(pageloads, max_pageload)
+ engine_times = to_percentage(engine_times, max_engine_times)
+ results = to_percentage(results, max_results)
+ scores = to_percentage(scores, max_score)
+ scores_per_result = to_percentage(scores_per_result, max_score_per_result)
+ errors = to_percentage(errors, max_errors)
return [
(
+ gettext('Engine time (sec)'),
+ sorted(engine_times, key=itemgetter('avg'))
+ ),
+ (
gettext('Page loads (sec)'),
sorted(pageloads, key=itemgetter('avg'))
),
diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py
index b846934f7..5ba512766 100644
--- a/searx/engines/archlinux.py
+++ b/searx/engines/archlinux.py
@@ -12,7 +12,6 @@
"""
from urlparse import urljoin
-from cgi import escape
from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text
@@ -135,7 +134,7 @@ def response(resp):
for result in dom.xpath(xpath_results):
link = result.xpath(xpath_link)[0]
href = urljoin(base_url, link.attrib.get('href'))
- title = escape(extract_text(link))
+ title = extract_text(link)
results.append({'url': href,
'title': title})
diff --git a/searx/engines/base.py b/searx/engines/base.py
index 66491d395..a552453ce 100755
--- a/searx/engines/base.py
+++ b/searx/engines/base.py
@@ -16,7 +16,6 @@
from lxml import etree
from urllib import urlencode
from searx.utils import searx_useragent
-from cgi import escape
from datetime import datetime
import re
@@ -94,7 +93,7 @@ def response(resp):
url = item.text
elif item.attrib["name"] == "dcdescription":
- content = escape(item.text[:300])
+ content = item.text[:300]
if len(item.text) > 300:
content += "..."
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index 6bdfd378b..58db61251 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -14,7 +14,6 @@
"""
from urllib import urlencode
-from cgi import escape
from lxml import html
from searx.engines.xpath import extract_text
@@ -32,18 +31,14 @@ search_string = 'search?{query}&first={offset}'
def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1
- if params['language'] == 'all':
- language = 'en-US'
- else:
- language = params['language'].replace('_', '-')
+ if params['language'] != 'all':
+ query = u'language:{} {}'.format(params['language'].split('_')[0].upper(),
+ query.decode('utf-8')).encode('utf-8')
search_path = search_string.format(
- query=urlencode({'q': query, 'setmkt': language}),
+ query=urlencode({'q': query}),
offset=offset)
- params['cookies']['SRCHHPGUSR'] = \
- 'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
-
params['url'] = base_url + search_path
return params
@@ -65,7 +60,7 @@ def response(resp):
link = result.xpath('.//h3/a')[0]
url = link.attrib.get('href')
title = extract_text(link)
- content = escape(extract_text(result.xpath('.//p')))
+ content = extract_text(result.xpath('.//p'))
# append result
results.append({'url': url,
@@ -77,7 +72,7 @@ def response(resp):
link = result.xpath('.//h2/a')[0]
url = link.attrib.get('href')
title = extract_text(link)
- content = escape(extract_text(result.xpath('.//p')))
+ content = extract_text(result.xpath('.//p'))
# append result
results.append({'url': url,
diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
index 384520392..4dd362cb3 100644
--- a/searx/engines/bing_images.py
+++ b/searx/engines/bing_images.py
@@ -24,11 +24,17 @@ import re
categories = ['images']
paging = True
safesearch = True
+time_range_support = True
# search-url
base_url = 'https://www.bing.com/'
search_string = 'images/search?{query}&count=10&first={offset}'
+time_range_string = '&qft=+filterui:age-lt{interval}'
thumb_url = "https://www.bing.com/th?id={ihk}"
+time_range_dict = {'day': '1440',
+ 'week': '10080',
+ 'month': '43200',
+ 'year': '525600'}
# safesearch definitions
safesearch_types = {2: 'STRICT',
@@ -58,6 +64,8 @@ def request(query, params):
'&ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
params['url'] = base_url + search_path
+ if params['time_range'] in time_range_dict:
+ params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
return params
diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py
index a2397c48e..4e7c33129 100644
--- a/searx/engines/bing_news.py
+++ b/searx/engines/bing_news.py
@@ -22,10 +22,15 @@ from searx.utils import list_get
categories = ['news']
paging = True
language_support = True
+time_range_support = True
# search-url
base_url = 'https://www.bing.com/'
search_string = 'news/search?{query}&first={offset}&format=RSS'
+search_string_with_time = 'news/search?{query}&first={offset}&qft=interval%3d"{interval}"&format=RSS'
+time_range_dict = {'day': '7',
+ 'week': '8',
+ 'month': '9'}
# remove click
@@ -46,8 +51,24 @@ def image_url_cleanup(url_string):
return url_string
+def _get_url(query, language, offset, time_range):
+ if time_range in time_range_dict:
+ search_path = search_string_with_time.format(
+ query=urlencode({'q': query, 'setmkt': language}),
+ offset=offset,
+ interval=time_range_dict[time_range])
+ else:
+ search_path = search_string.format(
+ query=urlencode({'q': query, 'setmkt': language}),
+ offset=offset)
+ return base_url + search_path
+
+
# do search-request
def request(query, params):
+ if params['time_range'] and params['time_range'] not in time_range_dict:
+ return params
+
offset = (params['pageno'] - 1) * 10 + 1
if params['language'] == 'all':
@@ -55,11 +76,7 @@ def request(query, params):
else:
language = params['language'].replace('_', '-')
- search_path = search_string.format(
- query=urlencode({'q': query, 'setmkt': language}),
- offset=offset)
-
- params['url'] = base_url + search_path
+ params['url'] = _get_url(query, language, offset, params['time_range'])
return params
diff --git a/searx/engines/btdigg.py b/searx/engines/btdigg.py
index c2b22f003..33c8355de 100644
--- a/searx/engines/btdigg.py
+++ b/searx/engines/btdigg.py
@@ -11,11 +11,11 @@
"""
from urlparse import urljoin
-from cgi import escape
from urllib import quote
from lxml import html
from operator import itemgetter
from searx.engines.xpath import extract_text
+from searx.utils import get_torrent_size
# engine dependent config
categories = ['videos', 'music', 'files']
@@ -50,8 +50,8 @@ def response(resp):
for result in search_res:
link = result.xpath('.//td[@class="torrent_name"]//a')[0]
href = urljoin(url, link.attrib.get('href'))
- title = escape(extract_text(link))
- content = escape(extract_text(result.xpath('.//pre[@class="snippet"]')[0]))
+ title = extract_text(link)
+ content = extract_text(result.xpath('.//pre[@class="snippet"]')[0])
content = "<br />".join(content.split("\n"))
filesize = result.xpath('.//span[@class="attr_val"]/text()')[0].split()[0]
@@ -68,20 +68,7 @@ def response(resp):
leech = 0
# convert filesize to byte if possible
- try:
- filesize = float(filesize)
-
- # convert filesize to byte
- if filesize_multiplier == 'TB':
- filesize = int(filesize * 1024 * 1024 * 1024 * 1024)
- elif filesize_multiplier == 'GB':
- filesize = int(filesize * 1024 * 1024 * 1024)
- elif filesize_multiplier == 'MB':
- filesize = int(filesize * 1024 * 1024)
- elif filesize_multiplier == 'KB':
- filesize = int(filesize * 1024)
- except:
- filesize = None
+ filesize = get_torrent_size(filesize, filesize_multiplier)
# convert files to int if possible
if files.isdigit():
diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py
index 4eb894725..317f34f59 100644
--- a/searx/engines/dailymotion.py
+++ b/searx/engines/dailymotion.py
@@ -14,7 +14,6 @@
from urllib import urlencode
from json import loads
-from cgi import escape
from datetime import datetime
# engine dependent config
@@ -57,7 +56,7 @@ def response(resp):
for res in search_res['list']:
title = res['title']
url = res['url']
- content = escape(res['description'])
+ content = res['description']
thumbnail = res['thumbnail_360_url']
publishedDate = datetime.fromtimestamp(res['created_time'], None)
embedded = embedded_url.format(videoid=res['id'])
diff --git a/searx/engines/deezer.py b/searx/engines/deezer.py
index 0530bc072..3db1af3d2 100644
--- a/searx/engines/deezer.py
+++ b/searx/engines/deezer.py
@@ -51,10 +51,11 @@ def response(resp):
if url.startswith('http://'):
url = 'https' + url[4:]
- content = result['artist']['name'] +\
- " &bull; " +\
- result['album']['title'] +\
- " &bull; " + result['title']
+ content = u'{} - {} - {}'.format(
+ result['artist']['name'],
+ result['album']['title'],
+ result['title'])
+
embedded = embedded_url.format(audioid=result['id'])
# append result
diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py
index d893fc7fe..a24b75b8a 100644
--- a/searx/engines/deviantart.py
+++ b/searx/engines/deviantart.py
@@ -34,6 +34,9 @@ time_range_dict = {'day': 11,
# do search-request
def request(query, params):
+ if params['time_range'] and params['time_range'] not in time_range_dict:
+ return params
+
offset = (params['pageno'] - 1) * 24
params['url'] = search_url.format(offset=offset,
diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py
new file mode 100644
index 000000000..20a9a8980
--- /dev/null
+++ b/searx/engines/dictzone.py
@@ -0,0 +1,68 @@
+"""
+ Dictzone
+
+ @website https://dictzone.com/
+ @provide-api no
+ @using-api no
+ @results HTML (using search portal)
+ @stable no (HTML can change)
+ @parse url, title, content
+"""
+
+import re
+from urlparse import urljoin
+from lxml import html
+from searx.utils import is_valid_lang
+
+categories = ['general']
+url = u'http://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}'
+weight = 100
+
+parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I)
+results_xpath = './/table[@id="r"]/tr'
+
+
+def request(query, params):
+ m = parser_re.match(unicode(query, 'utf8'))
+ if not m:
+ return params
+
+ from_lang, to_lang, query = m.groups()
+
+ from_lang = is_valid_lang(from_lang)
+ to_lang = is_valid_lang(to_lang)
+
+ if not from_lang or not to_lang:
+ return params
+
+ params['url'] = url.format(from_lang=from_lang[2],
+ to_lang=to_lang[2],
+ query=query)
+
+ return params
+
+
+def response(resp):
+ results = []
+
+ dom = html.fromstring(resp.text)
+
+ for k, result in enumerate(dom.xpath(results_xpath)[1:]):
+ try:
+ from_result, to_results_raw = result.xpath('./td')
+ except:
+ continue
+
+ to_results = []
+ for to_result in to_results_raw.xpath('./p/a'):
+ t = to_result.text_content()
+ if t.strip():
+ to_results.append(to_result.text_content())
+
+ results.append({
+ 'url': urljoin(resp.url, '?%d' % k),
+ 'title': from_result.text_content(),
+ 'content': '; '.join(to_results)
+ })
+
+ return results
diff --git a/searx/engines/digbt.py b/searx/engines/digbt.py
new file mode 100644
index 000000000..b55d7747a
--- /dev/null
+++ b/searx/engines/digbt.py
@@ -0,0 +1,58 @@
+"""
+ DigBT (Videos, Music, Files)
+
+ @website https://digbt.org
+ @provide-api no
+
+ @using-api no
+ @results HTML (using search portal)
+ @stable no (HTML can change)
+ @parse url, title, content, magnetlink
+"""
+
+from urlparse import urljoin
+from lxml import html
+from searx.engines.xpath import extract_text
+from searx.utils import get_torrent_size
+
+categories = ['videos', 'music', 'files']
+paging = True
+
+URL = 'https://digbt.org'
+SEARCH_URL = URL + '/search/{query}-time-{pageno}'
+FILESIZE = 3
+FILESIZE_MULTIPLIER = 4
+
+
+def request(query, params):
+ params['url'] = SEARCH_URL.format(query=query, pageno=params['pageno'])
+
+ return params
+
+
+def response(resp):
+ dom = html.fromstring(resp.content)
+ search_res = dom.xpath('.//td[@class="x-item"]')
+
+ if not search_res:
+ return list()
+
+ results = list()
+ for result in search_res:
+ url = urljoin(URL, result.xpath('.//a[@title]/@href')[0])
+ title = extract_text(result.xpath('.//a[@title]'))
+ content = extract_text(result.xpath('.//div[@class="files"]'))
+ files_data = extract_text(result.xpath('.//div[@class="tail"]')).split()
+ filesize = get_torrent_size(files_data[FILESIZE], files_data[FILESIZE_MULTIPLIER])
+ magnetlink = result.xpath('.//div[@class="tail"]//a[@class="title"]/@href')[0]
+
+ results.append({'url': url,
+ 'title': title,
+ 'content': content,
+ 'filesize': filesize,
+ 'magnetlink': magnetlink,
+ 'seed': 'N/A',
+ 'leech': 'N/A',
+ 'template': 'torrent.html'})
+
+ return results
diff --git a/searx/engines/digg.py b/searx/engines/digg.py
index a10b38bb6..238b466a0 100644
--- a/searx/engines/digg.py
+++ b/searx/engines/digg.py
@@ -13,7 +13,6 @@
from urllib import quote_plus
from json import loads
from lxml import html
-from cgi import escape
from dateutil import parser
# engine dependent config
@@ -56,7 +55,7 @@ def response(resp):
url = result.attrib.get('data-contenturl')
thumbnail = result.xpath('.//img')[0].attrib.get('src')
title = ''.join(result.xpath(title_xpath))
- content = escape(''.join(result.xpath(content_xpath)))
+ content = ''.join(result.xpath(content_xpath))
pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
publishedDate = parser.parse(pubdate)
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index 2153492e9..9959a52e6 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -41,6 +41,9 @@ content_xpath = './/a[@class="result__snippet"]'
# do search-request
def request(query, params):
+ if params['time_range'] and params['time_range'] not in time_range_dict:
+ return params
+
offset = (params['pageno'] - 1) * 30
if params['language'] == 'all':
diff --git a/searx/engines/fdroid.py b/searx/engines/fdroid.py
index 0b16773e3..6d470a4eb 100644
--- a/searx/engines/fdroid.py
+++ b/searx/engines/fdroid.py
@@ -9,7 +9,6 @@
@parse url, title, content
"""
-from cgi import escape
from urllib import urlencode
from searx.engines.xpath import extract_text
from lxml import html
@@ -43,7 +42,7 @@ def response(resp):
img_src = app.xpath('.//img/@src')[0]
content = extract_text(app.xpath('./p')[0])
- content = escape(content.replace(title, '', 1).strip())
+ content = content.replace(title, '', 1).strip()
results.append({'url': url,
'title': title,
diff --git a/searx/engines/flickr.py b/searx/engines/flickr.py
index 68d45bc17..5ce1160e9 100644
--- a/searx/engines/flickr.py
+++ b/searx/engines/flickr.py
@@ -77,21 +77,13 @@ def response(resp):
url = build_flickr_url(photo['owner'], photo['id'])
- title = photo['title']
-
- content = '<span class="photo-author">' +\
- photo['ownername'] +\
- '</span><br />' +\
- '<span class="description">' +\
- photo['description']['_content'] +\
- '</span>'
-
# append result
results.append({'url': url,
- 'title': title,
+ 'title': photo['title'],
'img_src': img_src,
'thumbnail_src': thumbnail_src,
- 'content': content,
+ 'content': photo['description']['_content'],
+ 'author': photo['ownername'],
'template': 'images.html'})
# return results
diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py
index 87b912eb3..3c0ec7b70 100644
--- a/searx/engines/flickr_noapi.py
+++ b/searx/engines/flickr_noapi.py
@@ -14,6 +14,7 @@
from urllib import urlencode
from json import loads
+from time import time
import re
from searx.engines import logger
@@ -24,21 +25,32 @@ categories = ['images']
url = 'https://www.flickr.com/'
search_url = url + 'search?{query}&page={page}'
+time_range_url = '&min_upload_date={start}&max_upload_date={end}'
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
regex = re.compile(r"\"search-photos-lite-models\",\"photos\":(.*}),\"totalItems\":", re.DOTALL)
image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's')
paging = True
+time_range_support = True
+time_range_dict = {'day': 60 * 60 * 24,
+ 'week': 60 * 60 * 24 * 7,
+ 'month': 60 * 60 * 24 * 7 * 4,
+ 'year': 60 * 60 * 24 * 7 * 52}
def build_flickr_url(user_id, photo_id):
return photo_url.format(userid=user_id, photoid=photo_id)
-def request(query, params):
- params['url'] = search_url.format(query=urlencode({'text': query}),
- page=params['pageno'])
+def _get_time_range_url(time_range):
+ if time_range in time_range_dict:
+ return time_range_url.format(start=time(), end=str(int(time()) - time_range_dict[time_range]))
+ return ''
+
+def request(query, params):
+ params['url'] = (search_url.format(query=urlencode({'text': query}), page=params['pageno'])
+ + _get_time_range_url(params['time_range']))
return params
@@ -91,16 +103,15 @@ def response(resp):
title = photo.get('title', '')
- content = '<span class="photo-author">' +\
- photo['username'] +\
- '</span><br />'
+ author = photo['username']
# append result
results.append({'url': url,
'title': title,
'img_src': img_src,
'thumbnail_src': thumbnail_src,
- 'content': content,
+ 'content': '',
+ 'author': author,
'template': 'images.html'})
return results
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index 6e4e24b68..5430eb3ba 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -10,7 +10,6 @@
@parse url, title, content
"""
-from cgi import escape
from json import loads
from random import randint
from time import time
@@ -78,8 +77,8 @@ def response(resp):
for result in response_json['results']:
# append result
results.append({'url': result['url'],
- 'title': escape(result['title']),
- 'content': escape(result['sum'])})
+ 'title': result['title'],
+ 'content': result['sum']})
# return results
return results
diff --git a/searx/engines/github.py b/searx/engines/github.py
index cc1fc470c..7adef3be9 100644
--- a/searx/engines/github.py
+++ b/searx/engines/github.py
@@ -12,7 +12,6 @@
from urllib import urlencode
from json import loads
-from cgi import escape
# engine dependent config
categories = ['it']
@@ -48,7 +47,7 @@ def response(resp):
url = res['html_url']
if res['description']:
- content = escape(res['description'][:500])
+ content = res['description'][:500]
else:
content = ''
diff --git a/searx/engines/google.py b/searx/engines/google.py
index ea93bc94f..a02b6940e 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -9,7 +9,6 @@
# @parse url, title, content, suggestion
import re
-from cgi import escape
from urllib import urlencode
from urlparse import urlparse, parse_qsl
from lxml import html, etree
@@ -96,7 +95,8 @@ search_url = ('https://{hostname}' +
time_range_search = "&tbs=qdr:{range}"
time_range_dict = {'day': 'd',
'week': 'w',
- 'month': 'm'}
+ 'month': 'm',
+ 'year': 'y'}
# other URLs
map_hostname_start = 'maps.google.'
@@ -155,7 +155,7 @@ def parse_url(url_string, google_hostname):
def extract_text_from_dom(result, xpath):
r = result.xpath(xpath)
if len(r) > 0:
- return escape(extract_text(r[0]))
+ return extract_text(r[0])
return None
@@ -264,7 +264,7 @@ def response(resp):
# parse suggestion
for suggestion in dom.xpath(suggestion_xpath):
# append suggestion
- results.append({'suggestion': escape(extract_text(suggestion))})
+ results.append({'suggestion': extract_text(suggestion)})
# return results
return results
diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py
index 77bdc13b2..9a3c71c7e 100644
--- a/searx/engines/google_images.py
+++ b/searx/engines/google_images.py
@@ -10,10 +10,12 @@
@parse url, title, img_src
"""
+from datetime import date, timedelta
from urllib import urlencode
from json import loads
from lxml import html
+
# engine dependent config
categories = ['images']
paging = True
@@ -29,6 +31,7 @@ search_url = 'https://www.google.com/search'\
'&yv=2'\
'&{search_options}'
time_range_attr = "qdr:{range}"
+time_range_custom_attr = "cdr:1,cd_min:{start},cd_max:{end}"
time_range_dict = {'day': 'd',
'week': 'w',
'month': 'm'}
@@ -36,7 +39,6 @@ time_range_dict = {'day': 'd',
# do search-request
def request(query, params):
-
search_options = {
'ijn': params['pageno'] - 1,
'start': (params['pageno'] - 1) * number_of_results
@@ -44,6 +46,12 @@ def request(query, params):
if params['time_range'] in time_range_dict:
search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']])
+ elif params['time_range'] == 'year':
+ now = date.today()
+ then = now - timedelta(days=365)
+ start = then.strftime('%m/%d/%Y')
+ end = now.strftime('%m/%d/%Y')
+ search_options['tbs'] = time_range_custom_attr.format(start=start, end=end)
if safesearch and params['safesearch']:
search_options['safe'] = 'on'
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index 95d15cfb9..37253c6a7 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -1,41 +1,57 @@
"""
Google (News)
- @website https://www.google.com
- @provide-api yes (https://developers.google.com/web-search/docs/),
- deprecated!
+ @website https://news.google.com
+ @provide-api no
- @using-api yes
- @results JSON
- @stable yes (but deprecated)
+ @using-api no
+ @results HTML
+ @stable no
@parse url, title, content, publishedDate
"""
+from lxml import html
from urllib import urlencode
-from json import loads
-from dateutil import parser
# search-url
categories = ['news']
paging = True
language_support = True
-
-# engine dependent config
-url = 'https://ajax.googleapis.com/'
-search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={lang}'
+safesearch = True
+time_range_support = True
+number_of_results = 10
+
+search_url = 'https://www.google.com/search'\
+ '?{query}'\
+ '&tbm=nws'\
+ '&gws_rd=cr'\
+ '&{search_options}'
+time_range_attr = "qdr:{range}"
+time_range_dict = {'day': 'd',
+ 'week': 'w',
+ 'month': 'm',
+ 'year': 'y'}
# do search-request
def request(query, params):
- offset = (params['pageno'] - 1) * 8
- language = 'en-US'
- if params['language'] != 'all':
- language = params['language'].replace('_', '-')
+ search_options = {
+ 'start': (params['pageno'] - 1) * number_of_results
+ }
+
+ if params['time_range'] in time_range_dict:
+ search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']])
+
+ if safesearch and params['safesearch']:
+ search_options['safe'] = 'on'
- params['url'] = search_url.format(offset=offset,
- query=urlencode({'q': query}),
- lang=language)
+ params['url'] = search_url.format(query=urlencode({'q': query}),
+ search_options=urlencode(search_options))
+
+ if params['language'] != 'all':
+ language_array = params['language'].lower().split('_')
+ params['url'] += '&lr=lang_' + language_array[0]
return params
@@ -44,24 +60,21 @@ def request(query, params):
def response(resp):
results = []
- search_res = loads(resp.text)
-
- # return empty array if there are no results
- if not search_res.get('responseData', {}).get('results'):
- return []
+ dom = html.fromstring(resp.text)
# parse results
- for result in search_res['responseData']['results']:
- # parse publishedDate
- publishedDate = parser.parse(result['publishedDate'])
- if 'url' not in result:
- continue
-
- # append result
- results.append({'url': result['unescapedUrl'],
- 'title': result['titleNoFormatting'],
- 'publishedDate': publishedDate,
- 'content': result['content']})
+ for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'):
+ r = {
+ 'url': result.xpath('.//div[@class="_cnc"]//a/@href')[0],
+ 'title': ''.join(result.xpath('.//div[@class="_cnc"]//h3//text()')),
+ 'content': ''.join(result.xpath('.//div[@class="st"]//text()')),
+ }
+
+ img = result.xpath('.//img/@src')[0]
+ if img and not img.startswith('data'):
+ r['img_src'] = img
+
+ results.append(r)
# return results
return results
diff --git a/searx/engines/json_engine.py b/searx/engines/json_engine.py
index a824c38e5..4604c3cac 100644
--- a/searx/engines/json_engine.py
+++ b/searx/engines/json_engine.py
@@ -81,7 +81,7 @@ def request(query, params):
fp = {'query': query}
if paging and search_url.find('{pageno}') >= 0:
- fp['pageno'] = (params['pageno'] + first_page_num - 1) * page_size
+ fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num
params['url'] = search_url.format(**fp)
params['query'] = query
diff --git a/searx/engines/kickass.py b/searx/engines/kickass.py
index 4c5d24008..059fa2a66 100644
--- a/searx/engines/kickass.py
+++ b/searx/engines/kickass.py
@@ -11,18 +11,18 @@
"""
from urlparse import urljoin
-from cgi import escape
from urllib import quote
from lxml import html
from operator import itemgetter
from searx.engines.xpath import extract_text
+from searx.utils import get_torrent_size, convert_str_to_int
# engine dependent config
categories = ['videos', 'music', 'files']
paging = True
# search-url
-url = 'https://kickass.to/'
+url = 'https://kickass.cd/'
search_url = url + 'search/{search_term}/{pageno}/'
# specific xpath variables
@@ -56,42 +56,17 @@ def response(resp):
link = result.xpath('.//a[@class="cellMainLink"]')[0]
href = urljoin(url, link.attrib['href'])
title = extract_text(link)
- content = escape(extract_text(result.xpath(content_xpath)))
- seed = result.xpath('.//td[contains(@class, "green")]/text()')[0]
- leech = result.xpath('.//td[contains(@class, "red")]/text()')[0]
- filesize = result.xpath('.//td[contains(@class, "nobr")]/text()')[0]
- filesize_multiplier = result.xpath('.//td[contains(@class, "nobr")]//span/text()')[0]
- files = result.xpath('.//td[contains(@class, "center")][2]/text()')[0]
-
- # convert seed to int if possible
- if seed.isdigit():
- seed = int(seed)
- else:
- seed = 0
+ content = extract_text(result.xpath(content_xpath))
+ seed = extract_text(result.xpath('.//td[contains(@class, "green")]'))
+ leech = extract_text(result.xpath('.//td[contains(@class, "red")]'))
+ filesize_info = extract_text(result.xpath('.//td[contains(@class, "nobr")]'))
+ files = extract_text(result.xpath('.//td[contains(@class, "center")][2]'))
- # convert leech to int if possible
- if leech.isdigit():
- leech = int(leech)
- else:
- leech = 0
-
- # convert filesize to byte if possible
- try:
- filesize = float(filesize)
-
- # convert filesize to byte
- if filesize_multiplier == 'TB':
- filesize = int(filesize * 1024 * 1024 * 1024 * 1024)
- elif filesize_multiplier == 'GB':
- filesize = int(filesize * 1024 * 1024 * 1024)
- elif filesize_multiplier == 'MB':
- filesize = int(filesize * 1024 * 1024)
- elif filesize_multiplier == 'KB':
- filesize = int(filesize * 1024)
- except:
- filesize = None
-
- # convert files to int if possible
+ seed = convert_str_to_int(seed)
+ leech = convert_str_to_int(leech)
+
+ filesize, filesize_multiplier = filesize_info.split()
+ filesize = get_torrent_size(filesize, filesize_multiplier)
if files.isdigit():
files = int(files)
else:
diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py
index cda8231f7..4ca5b3171 100644
--- a/searx/engines/nyaa.py
+++ b/searx/engines/nyaa.py
@@ -9,7 +9,6 @@
@parse url, title, content, seed, leech, torrentfile
"""
-from cgi import escape
from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text
@@ -78,7 +77,7 @@ def response(resp):
# torrent title
page_a = result.xpath(xpath_title)[0]
- title = escape(extract_text(page_a))
+ title = extract_text(page_a)
# link to the page
href = page_a.attrib.get('href')
@@ -90,7 +89,7 @@ def response(resp):
try:
file_size, suffix = result.xpath(xpath_filesize)[0].split(' ')
file_size = int(float(file_size) * get_filesize_mul(suffix))
- except Exception as e:
+ except:
file_size = None
# seed count
@@ -105,7 +104,6 @@ def response(resp):
# content string contains all information not included into template
content = 'Category: "{category}". Downloaded {downloads} times.'
content = content.format(category=category, downloads=downloads)
- content = escape(content)
results.append({'url': href,
'title': title,
diff --git a/searx/engines/openstreetmap.py b/searx/engines/openstreetmap.py
index 38baaada9..01ca7d42d 100644
--- a/searx/engines/openstreetmap.py
+++ b/searx/engines/openstreetmap.py
@@ -43,7 +43,7 @@ def response(resp):
if 'display_name' not in r:
continue
- title = r['display_name']
+ title = r['display_name'] or u''
osm_type = r.get('osm_type', r.get('type'))
url = result_base_url.format(osm_type=osm_type,
osm_id=r['osm_id'])
diff --git a/searx/engines/pdbe.py b/searx/engines/pdbe.py
new file mode 100644
index 000000000..f784e106f
--- /dev/null
+++ b/searx/engines/pdbe.py
@@ -0,0 +1,109 @@
+"""
+ PDBe (Protein Data Bank in Europe)
+
+ @website https://www.ebi.ac.uk/pdbe
+ @provide-api yes (https://www.ebi.ac.uk/pdbe/api/doc/search.html),
+ unlimited
+ @using-api yes
+ @results python dictionary (from json)
+ @stable yes
+ @parse url, title, content, img_src
+"""
+
+from json import loads
+from flask_babel import gettext
+
+categories = ['science']
+
+hide_obsolete = False
+
+# status codes of unpublished entries
+pdb_unpublished_codes = ['HPUB', 'HOLD', 'PROC', 'WAIT', 'AUTH', 'AUCO', 'REPL', 'POLC', 'REFI', 'TRSF', 'WDRN']
+# url for api query
+pdbe_solr_url = 'https://www.ebi.ac.uk/pdbe/search/pdb/select?'
+# base url for results
+pdbe_entry_url = 'https://www.ebi.ac.uk/pdbe/entry/pdb/{pdb_id}'
+# link to preview image of structure
+pdbe_preview_url = 'https://www.ebi.ac.uk/pdbe/static/entry/{pdb_id}_deposited_chain_front_image-200x200.png'
+
+
def request(query, params):
    """Prepare the POST request sent to the PDBe Solr search endpoint.

    The search term is passed as form data together with ``wt=json`` and
    the updated ``params`` dict is returned.
    """
    # request response in parsable format
    form_data = {'q': query, 'wt': "json"}

    params['url'] = pdbe_solr_url
    params['method'] = 'POST'
    params['data'] = form_data
    return params
+
+
def construct_body(result):
    """Build title, content and preview image URL for one PDBe entry.

    ``result`` is a single document of the PDBe search response.

    Returns the list ``[title, content, img_src]``.  ``content`` is ``None``
    when a citation field is missing or the author list is empty;
    ``img_src`` is ``None`` when the entry has no ``pdb_id``.

    Raises ``KeyError`` when the entry has no ``title``.
    """
    # set title
    title = result['title']

    # construct content body
    template = """{title}<br />{authors} {journal} <strong>{volume}</strong>&nbsp;{page} ({year})"""

    # replace placeholders with actual content
    try:
        if result['journal']:
            content = template.format(
                title=result['citation_title'],
                authors=result['entry_author_list'][0],
                journal=result['journal'],
                volume=result['journal_volume'],
                page=result['journal_page'],
                year=result['citation_year'])
        else:
            # no journal information: fall back to the release year
            content = template.format(
                title=result['citation_title'],
                authors=result['entry_author_list'][0],
                journal='',
                volume='',
                page='',
                year=result['release_year'])
    except (KeyError, IndexError):
        # IndexError covers an empty author list
        content = None

    # construct url for preview image; kept separate from the content so
    # that a missing pdb_id does not throw away an otherwise valid content
    try:
        img_src = pdbe_preview_url.format(pdb_id=result['pdb_id'])
    except KeyError:
        img_src = None

    return [title, content, img_src]
+
+
def response(resp):
    """Convert the PDBe JSON response into a list of result dicts.

    Entries whose status is one of ``pdb_unpublished_codes`` are always
    skipped.  Obsolete entries (status ``OBS``) are skipped when
    ``hide_obsolete`` is set; otherwise they are listed with a marker in the
    title and a link to the superseding entry.  All other entries are
    rendered through ``construct_body``.

    Each result dict has the keys ``url``, ``title``, ``content`` and
    ``img_src``.
    """
    results = []
    docs = loads(resp.text)['response']['docs']

    # parse results
    for result in docs:
        status = result['status']

        # unpublished entries are never shown
        if status in pdb_unpublished_codes:
            continue

        if status == 'OBS':
            # only obsolete entries are affected by the hide_obsolete switch;
            # regular entries must still be returned when it is enabled
            if hide_obsolete:
                continue

            # expand title to add some sort of warning message
            title = gettext('{title}&nbsp;(OBSOLETE)').format(title=result['title'])
            superseded_url = pdbe_entry_url.format(pdb_id=result['superseded_by'])

            # since we can't construct a proper body from the response, we'll make up our own
            msg_superseded = gettext("This entry has been superseded by")
            content = '<em>{msg_superseded} <a href="{url}">{pdb_id}</a></em>'.format(
                msg_superseded=msg_superseded,
                url=superseded_url,
                pdb_id=result['superseded_by'])

            # obsoleted entries don't have preview images
            img_src = None
        else:
            title, content, img_src = construct_body(result)

        results.append({
            'url': pdbe_entry_url.format(pdb_id=result['pdb_id']),
            'title': title,
            'content': content,
            'img_src': img_src
        })

    return results
diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py
index 55446b410..ca21a3bb2 100644
--- a/searx/engines/piratebay.py
+++ b/searx/engines/piratebay.py
@@ -9,7 +9,6 @@
# @parse url, title, content, seed, leech, magnetlink
from urlparse import urljoin
-from cgi import escape
from urllib import quote
from lxml import html
from operator import itemgetter
@@ -62,7 +61,7 @@ def response(resp):
link = result.xpath('.//div[@class="detName"]//a')[0]
href = urljoin(url, link.attrib.get('href'))
title = extract_text(link)
- content = escape(extract_text(result.xpath(content_xpath)))
+ content = extract_text(result.xpath(content_xpath))
seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]
# convert seed to int if possible
diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py
index 872bd4e95..d8b084292 100644
--- a/searx/engines/qwant.py
+++ b/searx/engines/qwant.py
@@ -10,9 +10,11 @@
@parse url, title, content
"""
-from urllib import urlencode
-from json import loads
from datetime import datetime
+from json import loads
+from urllib import urlencode
+
+from searx.utils import html_to_text
# engine dependent config
categories = None
@@ -66,9 +68,9 @@ def response(resp):
# parse results
for result in res.get('items', {}):
- title = result['title']
+ title = html_to_text(result['title'])
res_url = result['url']
- content = result['desc']
+ content = html_to_text(result['desc'])
if category_to_keyword.get(categories[0], '') == 'web':
results.append({'title': title,
diff --git a/searx/engines/reddit.py b/searx/engines/reddit.py
index 3ca7e44f6..b29792a3a 100644
--- a/searx/engines/reddit.py
+++ b/searx/engines/reddit.py
@@ -11,7 +11,6 @@
"""
import json
-from cgi import escape
from urllib import urlencode
from urlparse import urlparse, urljoin
from datetime import datetime
@@ -68,7 +67,7 @@ def response(resp):
img_results.append(params)
else:
created = datetime.fromtimestamp(data['created_utc'])
- content = escape(data['selftext'])
+ content = data['selftext']
if len(content) > 500:
content = content[:500] + '...'
params['content'] = content
diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py
index de8cd43be..be7a6d385 100644
--- a/searx/engines/searchcode_code.py
+++ b/searx/engines/searchcode_code.py
@@ -34,11 +34,6 @@ def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}),
pageno=params['pageno'] - 1)
- # Disable SSL verification
- # error: (60) SSL certificate problem: unable to get local issuer
- # certificate
- params['verify'] = False
-
return params
diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py
index f24fe6f90..99e10be62 100644
--- a/searx/engines/searchcode_doc.py
+++ b/searx/engines/searchcode_doc.py
@@ -27,11 +27,6 @@ def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}),
pageno=params['pageno'] - 1)
- # Disable SSL verification
- # error: (60) SSL certificate problem: unable to get local issuer
- # certificate
- params['verify'] = False
-
return params
@@ -44,20 +39,12 @@ def response(resp):
# parse results
for result in search_results.get('results', []):
href = result['url']
- title = "[" + result['type'] + "] " +\
- result['namespace'] +\
- " " + result['name']
- content = '<span class="highlight">[' +\
- result['type'] + "] " +\
- result['name'] + " " +\
- result['synopsis'] +\
- "</span><br />" +\
- result['description']
+ title = "[{}] {} {}".format(result['type'], result['namespace'], result['name'])
# append result
results.append({'url': href,
'title': title,
- 'content': content})
+ 'content': result['description']})
# return results
return results
diff --git a/searx/engines/seedpeer.py b/searx/engines/seedpeer.py
new file mode 100644
index 000000000..e1309a9b5
--- /dev/null
+++ b/searx/engines/seedpeer.py
@@ -0,0 +1,77 @@
+# Seedpeer (Videos, Music, Files)
+#
+# @website http://seedpeer.eu
+# @provide-api no (nothing found)
+#
+# @using-api no
+# @results HTML (using search portal)
+# @stable yes (HTML can change)
+# @parse url, title, content, seed, leech, magnetlink
+
+from urlparse import urljoin
+from urllib import quote
+from lxml import html
+from operator import itemgetter
+from searx.engines.xpath import extract_text
+
+
+url = 'http://www.seedpeer.eu/'
+search_url = url + 'search/{search_term}/7/{page_no}.html'
+# specific xpath variables
+torrent_xpath = '//*[@id="body"]/center/center/table[2]/tr/td/a'
+alternative_torrent_xpath = '//*[@id="body"]/center/center/table[1]/tr/td/a'
+title_xpath = '//*[@id="body"]/center/center/table[2]/tr/td/a/text()'
+alternative_title_xpath = '//*[@id="body"]/center/center/table/tr/td/a'
+seeds_xpath = '//*[@id="body"]/center/center/table[2]/tr/td[4]/font/text()'
+alternative_seeds_xpath = '//*[@id="body"]/center/center/table/tr/td[4]/font/text()'
+peers_xpath = '//*[@id="body"]/center/center/table[2]/tr/td[5]/font/text()'
+alternative_peers_xpath = '//*[@id="body"]/center/center/table/tr/td[5]/font/text()'
+age_xpath = '//*[@id="body"]/center/center/table[2]/tr/td[2]/text()'
+alternative_age_xpath = '//*[@id="body"]/center/center/table/tr/td[2]/text()'
+size_xpath = '//*[@id="body"]/center/center/table[2]/tr/td[3]/text()'
+alternative_size_xpath = '//*[@id="body"]/center/center/table/tr/td[3]/text()'
+
+
+# do search-request
def request(query, params):
    """Store the Seedpeer search URL for the query and page in ``params``."""
    encoded_term = quote(query)
    # the URL takes pageno - 1 (presumably a zero-based page index)
    page_index = params['pageno'] - 1
    params['url'] = search_url.format(search_term=encoded_term, page_no=page_index)
    return params
+
+
+# get response from search-request
def _seedpeer_count(value):
    """Convert a scraped seed/peer count to ``int``, falling back to 0."""
    text = value.strip()
    return int(text) if text.isdigit() else 0


def _seedpeer_title(node):
    """Return the title for either an element or an xpath ``text()`` result."""
    text_content = getattr(node, 'text_content', None)
    return text_content() if text_content is not None else node


def response(resp):
    """Parse a Seedpeer result page into torrent results.

    Returns a list of result dicts (``url``, ``title``, ``content``,
    ``seed``, ``leech``, ``template``) sorted by seed count, highest first,
    or an empty list when no torrent link is found.
    """
    results = []
    dom = html.fromstring(resp.text)

    torrent_links = dom.xpath(torrent_xpath)
    if torrent_links:
        seeds = dom.xpath(seeds_xpath)
        peers = dom.xpath(peers_xpath)
        titles = dom.xpath(title_xpath)
        sizes = dom.xpath(size_xpath)
        ages = dom.xpath(age_xpath)
    else:  # under ~5 results uses a different xpath
        torrent_links = dom.xpath(alternative_torrent_xpath)
        seeds = dom.xpath(alternative_seeds_xpath)
        peers = dom.xpath(alternative_peers_xpath)
        titles = dom.xpath(alternative_title_xpath)
        sizes = dom.xpath(alternative_size_xpath)
        ages = dom.xpath(alternative_age_xpath)

    # return empty array if nothing is found
    if not torrent_links:
        return []

    # parse results; zip stops at the shortest column, so a page where the
    # xpaths match a different number of nodes no longer raises IndexError
    rows = zip(torrent_links, titles, sizes, ages, seeds, peers)
    for link, title, size, age, seed, peer in rows:
        results.append({'url': urljoin(url, link.attrib.get('href')),
                        # the primary title xpath yields text nodes, the
                        # alternative one yields elements
                        'title': _seedpeer_title(title),
                        'content': u'{}, {}'.format(size, age),
                        # numeric counts so that sorting is not lexicographic
                        'seed': _seedpeer_count(seed),
                        'leech': _seedpeer_count(peer),
                        'template': 'torrent.html'})

    # return results sorted by seeder
    return sorted(results, key=itemgetter('seed'), reverse=True)
diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py
index f75796e83..249ba91ef 100644
--- a/searx/engines/spotify.py
+++ b/searx/engines/spotify.py
@@ -46,10 +46,11 @@ def response(resp):
if result['type'] == 'track':
title = result['name']
url = result['external_urls']['spotify']
- content = result['artists'][0]['name'] +\
- " &bull; " +\
- result['album']['name'] +\
- " &bull; " + result['name']
+ content = u'{} - {} - {}'.format(
+ result['artists'][0]['name'],
+ result['album']['name'],
+ result['name'])
+
embedded = embedded_url.format(audioid=result['id'])
# append result
diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py
index fdd3711a9..5e7ab2901 100644
--- a/searx/engines/stackoverflow.py
+++ b/searx/engines/stackoverflow.py
@@ -11,7 +11,6 @@
"""
from urlparse import urljoin
-from cgi import escape
from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text
@@ -48,8 +47,8 @@ def response(resp):
for result in dom.xpath(results_xpath):
link = result.xpath(link_xpath)[0]
href = urljoin(url, link.attrib.get('href'))
- title = escape(extract_text(link))
- content = escape(extract_text(result.xpath(content_xpath)))
+ title = extract_text(link)
+ content = extract_text(result.xpath(content_xpath))
# append result
results.append({'url': href,
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index d8b702c4d..6f6eae1cf 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -11,7 +11,6 @@
# @todo paging
from lxml import html
-from cgi import escape
from dateutil import parser
from datetime import datetime, timedelta
import re
@@ -79,10 +78,10 @@ def response(resp):
if re.match(r"^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url):
continue
- title = escape(extract_text(link))
+ title = extract_text(link)
if result.xpath('./p[@class="desc clk"]'):
- content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
+ content = extract_text(result.xpath('./p[@class="desc clk"]'))
else:
content = ''
diff --git a/searx/engines/subtitleseeker.py b/searx/engines/subtitleseeker.py
index 47d27d0b2..daba68be7 100644
--- a/searx/engines/subtitleseeker.py
+++ b/searx/engines/subtitleseeker.py
@@ -10,7 +10,6 @@
@parse url, title, content
"""
-from cgi import escape
from urllib import quote_plus
from lxml import html
from searx.languages import language_codes
@@ -59,7 +58,7 @@ def response(resp):
elif search_lang:
href = href + search_lang + '/'
- title = escape(extract_text(link))
+ title = extract_text(link)
content = extract_text(result.xpath('.//div[contains(@class,"red")]'))
content = content + " - "
@@ -75,7 +74,7 @@ def response(resp):
# append result
results.append({'url': href,
'title': title,
- 'content': escape(content)})
+ 'content': content})
# return results
return results
diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py
index 1a94ed64e..72184e428 100644
--- a/searx/engines/swisscows.py
+++ b/searx/engines/swisscows.py
@@ -10,7 +10,6 @@
@parse url, title, content
"""
-from cgi import escape
from json import loads
from urllib import urlencode, unquote
import re
@@ -78,7 +77,7 @@ def response(resp):
# append result
results.append({'url': result['SourceUrl'],
- 'title': escape(result['Title']),
+ 'title': result['Title'],
'content': '',
'img_src': img_url,
'template': 'images.html'})
@@ -90,8 +89,8 @@ def response(resp):
# append result
results.append({'url': result_url,
- 'title': escape(result_title),
- 'content': escape(result_content)})
+ 'title': result_title,
+ 'content': result_content})
# parse images
for result in json.get('Images', []):
@@ -100,7 +99,7 @@ def response(resp):
# append result
results.append({'url': result['SourceUrl'],
- 'title': escape(result['Title']),
+ 'title': result['Title'],
'content': '',
'img_src': img_url,
'template': 'images.html'})
diff --git a/searx/engines/tokyotoshokan.py b/searx/engines/tokyotoshokan.py
index e2990e153..52b2cbe07 100644
--- a/searx/engines/tokyotoshokan.py
+++ b/searx/engines/tokyotoshokan.py
@@ -11,7 +11,6 @@
"""
import re
-from cgi import escape
from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text
diff --git a/searx/engines/torrentz.py b/searx/engines/torrentz.py
index 92fbe7013..f9c832651 100644
--- a/searx/engines/torrentz.py
+++ b/searx/engines/torrentz.py
@@ -12,7 +12,6 @@
"""
import re
-from cgi import escape
from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text
diff --git a/searx/engines/translated.py b/searx/engines/translated.py
new file mode 100644
index 000000000..e78db0d8e
--- /dev/null
+++ b/searx/engines/translated.py
@@ -0,0 +1,64 @@
+"""
+ MyMemory Translated
+
+ @website https://mymemory.translated.net/
+ @provide-api yes (https://mymemory.translated.net/doc/spec.php)
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, content
+"""
+import re
+from searx.utils import is_valid_lang
+
+categories = ['general']
+url = u'http://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}{key}'
+web_url = u'http://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}'
+weight = 100
+
+parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) (.{2,})$', re.I)
+api_key = ''
+
+
def request(query, params):
    """Parse a ``<from>-<to> <text>`` query and build the MyMemory API URL.

    ``params`` is returned unchanged when the query does not match
    ``parser_re`` or when ``is_valid_lang`` rejects either language.
    Otherwise the API URL, the text to translate and both validated language
    values are stored in ``params``.
    """
    match = parser_re.match(unicode(query, 'utf8'))
    if not match:
        return params

    source, target, query = match.groups()

    from_lang = is_valid_lang(source)
    to_lang = is_valid_lang(target)
    if not (from_lang and to_lang):
        return params

    # the API key is optional; without one no key parameter is appended
    key_form = '&key=' + api_key if api_key else ''

    params['url'] = url.format(from_lang=from_lang[1],
                               to_lang=to_lang[1],
                               query=query,
                               key=key_form)
    params['query'] = query
    params['from_lang'] = from_lang
    params['to_lang'] = to_lang

    return params
+
+
def response(resp):
    """Build the single translation result from a MyMemory API response.

    Reads the query and the two language values stored by ``request()`` in
    ``resp.search_params`` and returns a one-element list holding a dict
    with ``url``, ``title`` and ``content`` (the translated text).
    """
    search_params = resp.search_params
    from_lang = search_params['from_lang']
    to_lang = search_params['to_lang']
    query = search_params['query']

    return [{
        'url': web_url.format(from_lang=from_lang[2],
                              to_lang=to_lang[2],
                              query=query),
        # unicode template: the query is a unicode string and may contain
        # non-ASCII characters, which a byte-string template cannot format
        # under Python 2
        'title': u'[{0}-{1}] {2}'.format(from_lang[1], to_lang[1], query),
        'content': resp.json()['responseData']['translatedText']
    }]
diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py
index 517ac1c44..5d5310544 100644
--- a/searx/engines/vimeo.py
+++ b/searx/engines/vimeo.py
@@ -12,10 +12,8 @@
# @todo rewrite to api
# @todo set content-parameter with correct data
+from json import loads
from urllib import urlencode
-from lxml import html
-from HTMLParser import HTMLParser
-from searx.engines.xpath import extract_text
from dateutil import parser
# engine dependent config
@@ -23,17 +21,10 @@ categories = ['videos']
paging = True
# search-url
-base_url = 'https://vimeo.com'
+base_url = 'https://vimeo.com/'
search_url = base_url + '/search/page:{pageno}?{query}'
-# specific xpath variables
-results_xpath = '//div[contains(@class,"results_grid")]/ul/li'
-url_xpath = './/a/@href'
-title_xpath = './/span[@class="title"]'
-thumbnail_xpath = './/img[@class="js-clip_thumbnail_image"]/@src'
-publishedDate_xpath = './/time/attribute::datetime'
-
-embedded_url = '<iframe data-src="//player.vimeo.com/video{videoid}" ' +\
+embedded_url = '<iframe data-src="//player.vimeo.com/video/{videoid}" ' +\
'width="540" height="304" frameborder="0" ' +\
'webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>'
@@ -49,17 +40,18 @@ def request(query, params):
# get response from search-request
def response(resp):
results = []
-
- dom = html.fromstring(resp.text)
- p = HTMLParser()
+ data_start_pos = resp.text.find('{"filtered"')
+ data_end_pos = resp.text.find(';\n', data_start_pos + 1)
+ data = loads(resp.text[data_start_pos:data_end_pos])
# parse results
- for result in dom.xpath(results_xpath):
- videoid = result.xpath(url_xpath)[0]
+ for result in data['filtered']['data']:
+ result = result[result['type']]
+ videoid = result['uri'].split('/')[-1]
url = base_url + videoid
- title = p.unescape(extract_text(result.xpath(title_xpath)))
- thumbnail = extract_text(result.xpath(thumbnail_xpath)[0])
- publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))
+ title = result['name']
+ thumbnail = result['pictures']['sizes'][-1]['link']
+ publishedDate = parser.parse(result['created_time'])
embedded = embedded_url.format(videoid=videoid)
# append result
diff --git a/searx/engines/wolframalpha_api.py b/searx/engines/wolframalpha_api.py
index 4526c825f..e743c8f56 100644
--- a/searx/engines/wolframalpha_api.py
+++ b/searx/engines/wolframalpha_api.py
@@ -18,10 +18,10 @@ api_key = '' # defined in settings.yml
# xpath variables
failure_xpath = '/queryresult[attribute::success="false"]'
-answer_xpath = '//pod[attribute::primary="true"]/subpod/plaintext'
input_xpath = '//pod[starts-with(attribute::id, "Input")]/subpod/plaintext'
pods_xpath = '//pod'
subpods_xpath = './subpod'
+pod_primary_xpath = './@primary'
pod_id_xpath = './@id'
pod_title_xpath = './@title'
plaintext_xpath = './plaintext'
@@ -75,13 +75,15 @@ def response(resp):
try:
infobox_title = search_results.xpath(input_xpath)[0].text
except:
- infobox_title = None
+ infobox_title = ""
pods = search_results.xpath(pods_xpath)
result_chunks = []
+ result_content = ""
for pod in pods:
pod_id = pod.xpath(pod_id_xpath)[0]
pod_title = pod.xpath(pod_title_xpath)[0]
+ pod_is_result = pod.xpath(pod_primary_xpath)
subpods = pod.xpath(subpods_xpath)
if not subpods:
@@ -94,6 +96,10 @@ def response(resp):
if content and pod_id not in image_pods:
+ if pod_is_result or not result_content:
+ if pod_id != "Input":
+ result_content = "%s: %s" % (pod_title, content)
+
# if no input pod was found, title is first plaintext pod
if not infobox_title:
infobox_title = content
@@ -109,6 +115,8 @@ def response(resp):
if not result_chunks:
return []
+ title = "Wolfram|Alpha (%s)" % infobox_title
+
# append infobox
results.append({'infobox': infobox_title,
'attributes': result_chunks,
@@ -116,7 +124,7 @@ def response(resp):
# append link to site
results.append({'url': resp.request.headers['Referer'].decode('utf8'),
- 'title': 'Wolfram|Alpha',
- 'content': infobox_title})
+ 'title': title,
+ 'content': result_content})
return results
diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py
index 3a8180f04..1534501b3 100644
--- a/searx/engines/wolframalpha_noapi.py
+++ b/searx/engines/wolframalpha_noapi.py
@@ -11,6 +11,7 @@
from json import loads
from time import time
from urllib import urlencode
+from lxml.etree import XML
from searx.poolrequests import get as http_get
@@ -34,7 +35,7 @@ search_url = url + 'input/json.jsp'\
referer_url = url + 'input/?{query}'
token = {'value': '',
- 'last_updated': 0}
+ 'last_updated': None}
# pods to display as image in infobox
# these pods do return plaintext, but they look better and are more useful as images
@@ -80,10 +81,12 @@ def response(resp):
# TODO handle resp_json['queryresult']['assumptions']
result_chunks = []
- infobox_title = None
+ infobox_title = ""
+ result_content = ""
for pod in resp_json['queryresult']['pods']:
pod_id = pod.get('id', '')
pod_title = pod.get('title', '')
+ pod_is_result = pod.get('primary', None)
if 'subpods' not in pod:
continue
@@ -97,6 +100,10 @@ def response(resp):
if subpod['plaintext'] != '(requires interactivity)':
result_chunks.append({'label': pod_title, 'value': subpod['plaintext']})
+ if pod_is_result or not result_content:
+ if pod_id != "Input":
+ result_content = pod_title + ': ' + subpod['plaintext']
+
elif 'img' in subpod:
result_chunks.append({'label': pod_title, 'image': subpod['img']})
@@ -108,7 +115,7 @@ def response(resp):
'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer'].decode('utf8')}]})
results.append({'url': resp.request.headers['Referer'].decode('utf8'),
- 'title': 'Wolfram|Alpha',
- 'content': infobox_title})
+ 'title': 'Wolfram|Alpha (' + infobox_title + ')',
+ 'content': result_content})
return results
diff --git a/searx/engines/www500px.py b/searx/engines/www500px.py
index f1bc6c583..546521ba3 100644
--- a/searx/engines/www500px.py
+++ b/searx/engines/www500px.py
@@ -12,12 +12,9 @@
@todo rewrite to api
"""
-
+from json import loads
from urllib import urlencode
from urlparse import urljoin
-from lxml import html
-import re
-from searx.engines.xpath import extract_text
# engine dependent config
categories = ['images']
@@ -25,13 +22,27 @@ paging = True
# search-url
base_url = 'https://500px.com'
-search_url = base_url + '/search?search?page={pageno}&type=photos&{query}'
+search_url = 'https://api.500px.com/v1/photos/search?type=photos'\
+ '&{query}'\
+ '&image_size%5B%5D=4'\
+ '&image_size%5B%5D=20'\
+ '&image_size%5B%5D=21'\
+ '&image_size%5B%5D=1080'\
+ '&image_size%5B%5D=1600'\
+ '&image_size%5B%5D=2048'\
+ '&include_states=true'\
+ '&formats=jpeg%2Clytro'\
+ '&include_tags=true'\
+ '&exclude_nude=true'\
+ '&page={pageno}'\
+ '&rpp=50'\
+ '&sdk_key=b68e60cff4c929bedea36ca978830c5caca790c3'
# do search-request
def request(query, params):
params['url'] = search_url.format(pageno=params['pageno'],
- query=urlencode({'q': query}))
+ query=urlencode({'term': query}))
return params
@@ -40,19 +51,16 @@ def request(query, params):
def response(resp):
results = []
- dom = html.fromstring(resp.text)
- regex = re.compile(r'3\.jpg.*$')
+ response_json = loads(resp.text)
# parse results
- for result in dom.xpath('//div[@class="photo"]'):
- link = result.xpath('.//a')[0]
- url = urljoin(base_url, link.attrib.get('href'))
- title = extract_text(result.xpath('.//div[@class="title"]'))
- thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
- # To have a bigger thumbnail, uncomment the next line
- # thumbnail_src = regex.sub('4.jpg', thumbnail_src)
- content = extract_text(result.xpath('.//div[@class="info"]'))
- img_src = regex.sub('2048.jpg', thumbnail_src)
+ for result in response_json['photos']:
+ url = urljoin(base_url, result['url'])
+ title = result['name']
+ # last index is the biggest resolution
+ img_src = result['image_url'][-1]
+ thumbnail_src = result['image_url'][0]
+ content = result['description'] or ''
# append result
results.append({'url': url,
diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py
index e701c02bf..e5c0c5bea 100644
--- a/searx/engines/xpath.py
+++ b/searx/engines/xpath.py
@@ -87,7 +87,7 @@ def request(query, params):
fp = {'query': query}
if paging and search_url.find('{pageno}') >= 0:
- fp['pageno'] = (params['pageno'] + first_page_num - 1) * page_size
+ fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num
params['url'] = search_url.format(**fp)
params['query'] = query
diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py
index c2f1bc7ef..92cf881c0 100644
--- a/searx/engines/yacy.py
+++ b/searx/engines/yacy.py
@@ -16,6 +16,8 @@ from json import loads
from urllib import urlencode
from dateutil import parser
+from searx.utils import html_to_text
+
# engine dependent config
categories = ['general', 'images'] # TODO , 'music', 'videos', 'files'
paging = True
@@ -88,7 +90,7 @@ def response(resp):
# append result
results.append({'url': result['link'],
'title': result['title'],
- 'content': result['description'],
+ 'content': html_to_text(result['description']),
'publishedDate': publishedDate})
# TODO parse video, audio and file results
diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py
index 8e24a283e..2bb34b83d 100644
--- a/searx/engines/yahoo.py
+++ b/searx/engines/yahoo.py
@@ -77,6 +77,9 @@ def _get_language(params):
# do search-request
def request(query, params):
+ if params['time_range'] and params['time_range'] not in time_range_dict:
+ return params
+
offset = (params['pageno'] - 1) * 10 + 1
language = _get_language(params)
diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py
index be3ec36ce..b83a747f9 100644
--- a/searx/engines/yandex.py
+++ b/searx/engines/yandex.py
@@ -9,7 +9,6 @@
@parse url, title, content
"""
-from cgi import escape
from urllib import urlencode
from lxml import html
from searx.search import logger
@@ -30,10 +29,10 @@ language_map = {'ru': 'ru',
base_url = 'https://yandex.{tld}/'
search_url = 'search/?{query}&p={page}'
-results_xpath = '//div[@class="serp-item serp-item_plain_yes clearfix i-bem"]'
+results_xpath = '//li[@class="serp-item"]'
url_xpath = './/h2/a/@href'
title_xpath = './/h2/a//text()'
-content_xpath = './/div[@class="serp-item__text"]//text()'
+content_xpath = './/div[@class="text-container typo typo_text_m typo_line_m organic__text"]//text()'
def request(query, params):
@@ -52,8 +51,8 @@ def response(resp):
for result in dom.xpath(results_xpath):
try:
res = {'url': result.xpath(url_xpath)[0],
- 'title': escape(''.join(result.xpath(title_xpath))),
- 'content': escape(''.join(result.xpath(content_xpath)))}
+ 'title': ''.join(result.xpath(title_xpath)),
+ 'content': ''.join(result.xpath(content_xpath))}
except:
logger.exception('yandex parse crash')
continue
diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py
index 401fca4c9..9b7ca64c8 100644
--- a/searx/engines/youtube_noapi.py
+++ b/searx/engines/youtube_noapi.py
@@ -17,10 +17,16 @@ from searx.utils import list_get
categories = ['videos', 'music']
paging = True
language_support = False
+time_range_support = True
# search-url
base_url = 'https://www.youtube.com/results'
search_url = base_url + '?search_query={query}&page={page}'
+time_range_url = '&sp=EgII{time_range}%253D%253D'
+time_range_dict = {'day': 'Ag',
+ 'week': 'Aw',
+ 'month': 'BA',
+ 'year': 'BQ'}
embedded_url = '<iframe width="540" height="304" ' +\
'data-src="//www.youtube-nocookie.com/embed/{videoid}" ' +\
@@ -47,6 +53,8 @@ def extract_text_from_dom(result, xpath):
def request(query, params):
params['url'] = search_url.format(query=quote_plus(query),
page=params['pageno'])
+ if params['time_range'] in time_range_dict:
+ params['url'] += time_range_url.format(time_range=time_range_dict[params['time_range']])
return params