diff options
Diffstat (limited to 'searx/engines')
50 files changed, 688 insertions, 300 deletions
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 782b622b0..87b1b0eb4 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -16,13 +16,13 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >. (C) 2013- by Adam Tauber, <asciimoo@gmail.com> ''' -from os.path import realpath, dirname, splitext, join +from os.path import realpath, dirname import sys -from imp import load_source from flask_babel import gettext from operator import itemgetter from searx import settings from searx import logger +from searx.utils import load_module logger = logger.getChild('engines') @@ -32,6 +32,7 @@ engine_dir = dirname(realpath(__file__)) engines = {} categories = {'general': []} +_initialized = False engine_shortcuts = {} engine_default_args = {'paging': False, @@ -46,22 +47,18 @@ engine_default_args = {'paging': False, 'time_range_support': False} -def load_module(filename): - modname = splitext(filename)[0] - if modname in sys.modules: - del sys.modules[modname] - filepath = join(engine_dir, filename) - module = load_source(modname, filepath) - module.name = modname - return module +def load_engine(engine_data): + + if '_' in engine_data['name']: + logger.error('Engine name conains underscore: "{}"'.format(engine_data['name'])) + sys.exit(1) + engine_module = engine_data['engine'] -def load_engine(engine_data): - engine_name = engine_data['engine'] try: - engine = load_module(engine_name + '.py') + engine = load_module(engine_module + '.py', engine_dir) except: - logger.exception('Cannot load engine "{}"'.format(engine_name)) + logger.exception('Cannot load engine "{}"'.format(engine_module)) return None for param_name in engine_data: @@ -93,6 +90,9 @@ def load_engine(engine_data): 'result_count': 0, 'search_count': 0, 'page_load_time': 0, + 'page_load_count': 0, + 'engine_time': 0, + 'engine_time_count': 0, 'score_count': 0, 'errors': 0 } @@ -109,32 +109,56 @@ def load_engine(engine_data): return engine +def to_percentage(stats, maxvalue): + for engine_stat in stats: + if maxvalue: + engine_stat['percentage'] = int(engine_stat['avg'] / maxvalue * 100) + else: + engine_stat['percentage'] = 0 + return stats + + def get_engines_stats(): # TODO refactor pageloads = [] + engine_times = [] results = [] scores = [] errors = [] scores_per_result = [] - max_pageload = max_results = max_score = max_errors = max_score_per_result = 0 # noqa + max_pageload = max_engine_times = max_results = max_score = max_errors = max_score_per_result = 0 # noqa for engine in engines.values(): if engine.stats['search_count'] == 0: continue results_num = \ engine.stats['result_count'] / float(engine.stats['search_count']) - load_times = engine.stats['page_load_time'] / float(engine.stats['search_count']) # noqa + + if engine.stats['page_load_count'] != 0: + load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count']) # noqa + else: + load_times = 0 + + if engine.stats['engine_time_count'] != 0: + this_engine_time = engine.stats['engine_time'] / float(engine.stats['engine_time_count']) # noqa + else: + this_engine_time = 0 + if results_num: score = engine.stats['score_count'] / float(engine.stats['search_count']) # noqa score_per_result = score / results_num else: score = score_per_result = 0.0 - max_results = max(results_num, max_results) + max_pageload = max(load_times, max_pageload) + max_engine_times = max(this_engine_time, max_engine_times) + max_results = max(results_num, max_results) max_score = max(score, max_score) max_score_per_result = max(score_per_result, max_score_per_result) max_errors = max(max_errors, engine.stats['errors']) + pageloads.append({'avg': load_times, 'name': engine.name}) + engine_times.append({'avg': this_engine_time, 'name': engine.name}) results.append({'avg': results_num, 'name': engine.name}) scores.append({'avg': score, 'name': engine.name}) errors.append({'avg': engine.stats['errors'], 'name': engine.name}) @@ -143,39 +167,19 @@ def get_engines_stats(): 'name': engine.name }) - for engine in pageloads: - if max_pageload: - engine['percentage'] = int(engine['avg'] / max_pageload * 100) - else: - engine['percentage'] = 0 - - for engine in results: - if max_results: - engine['percentage'] = int(engine['avg'] / max_results * 100) - else: - engine['percentage'] = 0 - - for engine in scores: - if max_score: - engine['percentage'] = int(engine['avg'] / max_score * 100) - else: - engine['percentage'] = 0 - - for engine in scores_per_result: - if max_score_per_result: - engine['percentage'] = int(engine['avg'] - / max_score_per_result * 100) - else: - engine['percentage'] = 0 - - for engine in errors: - if max_errors: - engine['percentage'] = int(float(engine['avg']) / max_errors * 100) - else: - engine['percentage'] = 0 + pageloads = to_percentage(pageloads, max_pageload) + engine_times = to_percentage(engine_times, max_engine_times) + results = to_percentage(results, max_results) + scores = to_percentage(scores, max_score) + scores_per_result = to_percentage(scores_per_result, max_score_per_result) + erros = to_percentage(errors, max_errors) return [ ( + gettext('Engine time (sec)'), + sorted(engine_times, key=itemgetter('avg')) + ), + ( gettext('Page loads (sec)'), sorted(pageloads, key=itemgetter('avg')) ), diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py index b846934f7..5ba512766 100644 --- a/searx/engines/archlinux.py +++ b/searx/engines/archlinux.py @@ -12,7 +12,6 @@ """ from urlparse import urljoin -from cgi import escape from urllib import urlencode from lxml import html from searx.engines.xpath import extract_text @@ -135,7 +134,7 @@ def response(resp): for result in dom.xpath(xpath_results): link = result.xpath(xpath_link)[0] href = urljoin(base_url, link.attrib.get('href')) - title = escape(extract_text(link)) + title = extract_text(link) results.append({'url': href, 'title': title}) diff --git a/searx/engines/base.py b/searx/engines/base.py index 66491d395..a552453ce 100755 --- a/searx/engines/base.py +++ b/searx/engines/base.py @@ -16,7 +16,6 @@ from lxml import etree from urllib import urlencode from searx.utils import searx_useragent -from cgi import escape from datetime import datetime import re @@ -94,7 +93,7 @@ def response(resp): url = item.text elif item.attrib["name"] == "dcdescription": - content = escape(item.text[:300]) + content = item.text[:300] if len(item.text) > 300: content += "..." diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 6bdfd378b..58db61251 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -14,7 +14,6 @@ """ from urllib import urlencode -from cgi import escape from lxml import html from searx.engines.xpath import extract_text @@ -32,18 +31,14 @@ search_string = 'search?{query}&first={offset}' def request(query, params): offset = (params['pageno'] - 1) * 10 + 1 - if params['language'] == 'all': - language = 'en-US' - else: - language = params['language'].replace('_', '-') + if params['language'] != 'all': + query = u'language:{} {}'.format(params['language'].split('_')[0].upper(), + query.decode('utf-8')).encode('utf-8') search_path = search_string.format( - query=urlencode({'q': query, 'setmkt': language}), + query=urlencode({'q': query}), offset=offset) - params['cookies']['SRCHHPGUSR'] = \ - 'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] - params['url'] = base_url + search_path return params @@ -65,7 +60,7 @@ def response(resp): link = result.xpath('.//h3/a')[0] url = link.attrib.get('href') title = extract_text(link) - content = escape(extract_text(result.xpath('.//p'))) + content = extract_text(result.xpath('.//p')) # append result results.append({'url': url, @@ -77,7 +72,7 @@ def response(resp): link = result.xpath('.//h2/a')[0] url = link.attrib.get('href') title = extract_text(link) - content = escape(extract_text(result.xpath('.//p'))) + content = extract_text(result.xpath('.//p')) # append result results.append({'url': url, diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py index 384520392..4dd362cb3 100644 --- a/searx/engines/bing_images.py +++ b/searx/engines/bing_images.py @@ -24,11 +24,17 @@ import re categories = ['images'] paging = True safesearch = True +time_range_support = True # search-url base_url = 'https://www.bing.com/' search_string = 'images/search?{query}&count=10&first={offset}' +time_range_string = '&qft=+filterui:age-lt{interval}' thumb_url = "https://www.bing.com/th?id={ihk}" +time_range_dict = {'day': '1440', + 'week': '10080', + 'month': '43200', + 'year': '525600'} # safesearch definitions safesearch_types = {2: 'STRICT', @@ -58,6 +64,8 @@ def request(query, params): '&ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') params['url'] = base_url + search_path + if params['time_range'] in time_range_dict: + params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']]) return params diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index a2397c48e..4e7c33129 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -22,10 +22,15 @@ from searx.utils import list_get categories = ['news'] paging = True language_support = True +time_range_support = True # search-url base_url = 'https://www.bing.com/' search_string = 'news/search?{query}&first={offset}&format=RSS' +search_string_with_time = 'news/search?{query}&first={offset}&qft=interval%3d"{interval}"&format=RSS' +time_range_dict = {'day': '7', + 'week': '8', + 'month': '9'} # remove click @@ -46,8 +51,24 @@ def image_url_cleanup(url_string): return url_string +def _get_url(query, language, offset, time_range): + if time_range in time_range_dict: + search_path = search_string_with_time.format( + query=urlencode({'q': query, 'setmkt': language}), + offset=offset, + interval=time_range_dict[time_range]) + else: + search_path = search_string.format( + query=urlencode({'q': query, 'setmkt': language}), + offset=offset) + return base_url + search_path + + # do search-request def request(query, params): + if params['time_range'] and params['time_range'] not in time_range_dict: + return params + offset = (params['pageno'] - 1) * 10 + 1 if params['language'] == 'all': @@ -55,11 +76,7 @@ def request(query, params): else: language = params['language'].replace('_', '-') - search_path = search_string.format( - query=urlencode({'q': query, 'setmkt': language}), - offset=offset) - - params['url'] = base_url + search_path + params['url'] = _get_url(query, language, offset, params['time_range']) return params diff --git a/searx/engines/btdigg.py b/searx/engines/btdigg.py index c2b22f003..33c8355de 100644 --- a/searx/engines/btdigg.py +++ b/searx/engines/btdigg.py @@ -11,11 +11,11 @@ """ from urlparse import urljoin -from cgi import escape from urllib import quote from lxml import html from operator import itemgetter from searx.engines.xpath import extract_text +from searx.utils import get_torrent_size # engine dependent config categories = ['videos', 'music', 'files'] @@ -50,8 +50,8 @@ def response(resp): for result in search_res: link = result.xpath('.//td[@class="torrent_name"]//a')[0] href = urljoin(url, link.attrib.get('href')) - title = escape(extract_text(link)) - content = escape(extract_text(result.xpath('.//pre[@class="snippet"]')[0])) + title = extract_text(link) + content = extract_text(result.xpath('.//pre[@class="snippet"]')[0]) content = "<br />".join(content.split("\n")) filesize = result.xpath('.//span[@class="attr_val"]/text()')[0].split()[0] @@ -68,20 +68,7 @@ def response(resp): leech = 0 # convert filesize to byte if possible - try: - filesize = float(filesize) - - # convert filesize to byte - if filesize_multiplier == 'TB': - filesize = int(filesize * 1024 * 1024 * 1024 * 1024) - elif filesize_multiplier == 'GB': - filesize = int(filesize * 1024 * 1024 * 1024) - elif filesize_multiplier == 'MB': - filesize = int(filesize * 1024 * 1024) - elif filesize_multiplier == 'KB': - filesize = int(filesize * 1024) - except: - filesize = None + filesize = get_torrent_size(filesize, filesize_multiplier) # convert files to int if possible if files.isdigit(): diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py index 4eb894725..317f34f59 100644 --- a/searx/engines/dailymotion.py +++ b/searx/engines/dailymotion.py @@ -14,7 +14,6 @@ from urllib import urlencode from json import loads -from cgi import escape from datetime import datetime # engine dependent config @@ -57,7 +56,7 @@ def response(resp): for res in search_res['list']: title = res['title'] url = res['url'] - content = escape(res['description']) + content = res['description'] thumbnail = res['thumbnail_360_url'] publishedDate = datetime.fromtimestamp(res['created_time'], None) embedded = embedded_url.format(videoid=res['id']) diff --git a/searx/engines/deezer.py b/searx/engines/deezer.py index 0530bc072..3db1af3d2 100644 --- a/searx/engines/deezer.py +++ b/searx/engines/deezer.py @@ -51,10 +51,11 @@ def response(resp): if url.startswith('http://'): url = 'https' + url[4:] - content = result['artist']['name'] +\ - " • " +\ - result['album']['title'] +\ - " • " + result['title'] + content = u'{} - {} - {}'.format( + result['artist']['name'], + result['album']['title'], + result['title']) + embedded = embedded_url.format(audioid=result['id']) # append result diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py index d893fc7fe..a24b75b8a 100644 --- a/searx/engines/deviantart.py +++ b/searx/engines/deviantart.py @@ -34,6 +34,9 @@ time_range_dict = {'day': 11, # do search-request def request(query, params): + if params['time_range'] and params['time_range'] not in time_range_dict: + return params + offset = (params['pageno'] - 1) * 24 params['url'] = search_url.format(offset=offset, diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py new file mode 100644 index 000000000..20a9a8980 --- /dev/null +++ b/searx/engines/dictzone.py @@ -0,0 +1,68 @@ +""" + Dictzone + + @website https://dictzone.com/ + @provide-api no + @using-api no + @results HTML (using search portal) + @stable no (HTML can change) + @parse url, title, content +""" + +import re +from urlparse import urljoin +from lxml import html +from searx.utils import is_valid_lang + +categories = ['general'] +url = u'http://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}' +weight = 100 + +parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I) +results_xpath = './/table[@id="r"]/tr' + + +def request(query, params): + m = parser_re.match(unicode(query, 'utf8')) + if not m: + return params + + from_lang, to_lang, query = m.groups() + + from_lang = is_valid_lang(from_lang) + to_lang = is_valid_lang(to_lang) + + if not from_lang or not to_lang: + return params + + params['url'] = url.format(from_lang=from_lang[2], + to_lang=to_lang[2], + query=query) + + return params + + +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + for k, result in enumerate(dom.xpath(results_xpath)[1:]): + try: + from_result, to_results_raw = result.xpath('./td') + except: + continue + + to_results = [] + for to_result in to_results_raw.xpath('./p/a'): + t = to_result.text_content() + if t.strip(): + to_results.append(to_result.text_content()) + + results.append({ + 'url': urljoin(resp.url, '?%d' % k), + 'title': from_result.text_content(), + 'content': '; '.join(to_results) + }) + + return results diff --git a/searx/engines/digbt.py b/searx/engines/digbt.py new file mode 100644 index 000000000..b55d7747a --- /dev/null +++ b/searx/engines/digbt.py @@ -0,0 +1,58 @@ +""" + DigBT (Videos, Music, Files) + + @website https://digbt.org + @provide-api no + + @using-api no + @results HTML (using search portal) + @stable no (HTML can change) + @parse url, title, content, magnetlink +""" + +from urlparse import urljoin +from lxml import html +from searx.engines.xpath import extract_text +from searx.utils import get_torrent_size + +categories = ['videos', 'music', 'files'] +paging = True + +URL = 'https://digbt.org' +SEARCH_URL = URL + '/search/{query}-time-{pageno}' +FILESIZE = 3 +FILESIZE_MULTIPLIER = 4 + + +def request(query, params): + params['url'] = SEARCH_URL.format(query=query, pageno=params['pageno']) + + return params + + +def response(resp): + dom = html.fromstring(resp.content) + search_res = dom.xpath('.//td[@class="x-item"]') + + if not search_res: + return list() + + results = list() + for result in search_res: + url = urljoin(URL, result.xpath('.//a[@title]/@href')[0]) + title = extract_text(result.xpath('.//a[@title]')) + content = extract_text(result.xpath('.//div[@class="files"]')) + files_data = extract_text(result.xpath('.//div[@class="tail"]')).split() + filesize = get_torrent_size(files_data[FILESIZE], files_data[FILESIZE_MULTIPLIER]) + magnetlink = result.xpath('.//div[@class="tail"]//a[@class="title"]/@href')[0] + + results.append({'url': url, + 'title': title, + 'content': content, + 'filesize': filesize, + 'magnetlink': magnetlink, + 'seed': 'N/A', + 'leech': 'N/A', + 'template': 'torrent.html'}) + + return results diff --git a/searx/engines/digg.py b/searx/engines/digg.py index a10b38bb6..238b466a0 100644 --- a/searx/engines/digg.py +++ b/searx/engines/digg.py @@ -13,7 +13,6 @@ from urllib import quote_plus from json import loads from lxml import html -from cgi import escape from dateutil import parser # engine dependent config @@ -56,7 +55,7 @@ def response(resp): url = result.attrib.get('data-contenturl') thumbnail = result.xpath('.//img')[0].attrib.get('src') title = ''.join(result.xpath(title_xpath)) - content = escape(''.join(result.xpath(content_xpath))) + content = ''.join(result.xpath(content_xpath)) pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime') publishedDate = parser.parse(pubdate) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 2153492e9..9959a52e6 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -41,6 +41,9 @@ content_xpath = './/a[@class="result__snippet"]' # do search-request def request(query, params): + if params['time_range'] and params['time_range'] not in time_range_dict: + return params + offset = (params['pageno'] - 1) * 30 if params['language'] == 'all': diff --git a/searx/engines/fdroid.py b/searx/engines/fdroid.py index 0b16773e3..6d470a4eb 100644 --- a/searx/engines/fdroid.py +++ b/searx/engines/fdroid.py @@ -9,7 +9,6 @@ @parse url, title, content """ -from cgi import escape from urllib import urlencode from searx.engines.xpath import extract_text from lxml import html @@ -43,7 +42,7 @@ def response(resp): img_src = app.xpath('.//img/@src')[0] content = extract_text(app.xpath('./p')[0]) - content = escape(content.replace(title, '', 1).strip()) + content = content.replace(title, '', 1).strip() results.append({'url': url, 'title': title, diff --git a/searx/engines/flickr.py b/searx/engines/flickr.py index 68d45bc17..5ce1160e9 100644 --- a/searx/engines/flickr.py +++ b/searx/engines/flickr.py @@ -77,21 +77,13 @@ def response(resp): url = build_flickr_url(photo['owner'], photo['id']) - title = photo['title'] - - content = '<span class="photo-author">' +\ - photo['ownername'] +\ - '</span><br />' +\ - '<span class="description">' +\ - photo['description']['_content'] +\ - '</span>' - # append result results.append({'url': url, - 'title': title, + 'title': photo['title'], 'img_src': img_src, 'thumbnail_src': thumbnail_src, - 'content': content, + 'content': photo['description']['_content'], + 'author': photo['ownername'], 'template': 'images.html'}) # return results diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py index 87b912eb3..3c0ec7b70 100644 --- a/searx/engines/flickr_noapi.py +++ b/searx/engines/flickr_noapi.py @@ -14,6 +14,7 @@ from urllib import urlencode from json import loads +from time import time import re from searx.engines import logger @@ -24,21 +25,32 @@ categories = ['images'] url = 'https://www.flickr.com/' search_url = url + 'search?{query}&page={page}' +time_range_url = '&min_upload_date={start}&max_upload_date={end}' photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}' regex = re.compile(r"\"search-photos-lite-models\",\"photos\":(.*}),\"totalItems\":", re.DOTALL) image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's') paging = True +time_range_support = True +time_range_dict = {'day': 60 * 60 * 24, + 'week': 60 * 60 * 24 * 7, + 'month': 60 * 60 * 24 * 7 * 4, + 'year': 60 * 60 * 24 * 7 * 52} def build_flickr_url(user_id, photo_id): return photo_url.format(userid=user_id, photoid=photo_id) -def request(query, params): - params['url'] = search_url.format(query=urlencode({'text': query}), - page=params['pageno']) +def _get_time_range_url(time_range): + if time_range in time_range_dict: + return time_range_url.format(start=time(), end=str(int(time()) - time_range_dict[time_range])) + return '' + +def request(query, params): + params['url'] = (search_url.format(query=urlencode({'text': query}), page=params['pageno']) + + _get_time_range_url(params['time_range'])) return params @@ -91,16 +103,15 @@ def response(resp): title = photo.get('title', '') - content = '<span class="photo-author">' +\ - photo['username'] +\ - '</span><br />' + author = photo['username'] # append result results.append({'url': url, 'title': title, 'img_src': img_src, 'thumbnail_src': thumbnail_src, - 'content': content, + 'content': '', + 'author': author, 'template': 'images.html'}) return results diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index 6e4e24b68..5430eb3ba 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -10,7 +10,6 @@ @parse url, title, content """ -from cgi import escape from json import loads from random import randint from time import time @@ -78,8 +77,8 @@ def response(resp): for result in response_json['results']: # append result results.append({'url': result['url'], - 'title': escape(result['title']), - 'content': escape(result['sum'])}) + 'title': result['title'], + 'content': result['sum']}) # return results return results diff --git a/searx/engines/github.py b/searx/engines/github.py index cc1fc470c..7adef3be9 100644 --- a/searx/engines/github.py +++ b/searx/engines/github.py @@ -12,7 +12,6 @@ from urllib import urlencode from json import loads -from cgi import escape # engine dependent config categories = ['it'] @@ -48,7 +47,7 @@ def response(resp): url = res['html_url'] if res['description']: - content = escape(res['description'][:500]) + content = res['description'][:500] else: content = '' diff --git a/searx/engines/google.py b/searx/engines/google.py index ea93bc94f..a02b6940e 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -9,7 +9,6 @@ # @parse url, title, content, suggestion import re -from cgi import escape from urllib import urlencode from urlparse import urlparse, parse_qsl from lxml import html, etree @@ -96,7 +95,8 @@ search_url = ('https://{hostname}' + time_range_search = "&tbs=qdr:{range}" time_range_dict = {'day': 'd', 'week': 'w', - 'month': 'm'} + 'month': 'm', + 'year': 'y'} # other URLs map_hostname_start = 'maps.google.' @@ -155,7 +155,7 @@ def parse_url(url_string, google_hostname): def extract_text_from_dom(result, xpath): r = result.xpath(xpath) if len(r) > 0: - return escape(extract_text(r[0])) + return extract_text(r[0]) return None @@ -264,7 +264,7 @@ def response(resp): # parse suggestion for suggestion in dom.xpath(suggestion_xpath): # append suggestion - results.append({'suggestion': escape(extract_text(suggestion))}) + results.append({'suggestion': extract_text(suggestion)}) # return results return results diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py index 77bdc13b2..9a3c71c7e 100644 --- a/searx/engines/google_images.py +++ b/searx/engines/google_images.py @@ -10,10 +10,12 @@ @parse url, title, img_src """ +from datetime import date, timedelta from urllib import urlencode from json import loads from lxml import html + # engine dependent config categories = ['images'] paging = True @@ -29,6 +31,7 @@ search_url = 'https://www.google.com/search'\ '&yv=2'\ '&{search_options}' time_range_attr = "qdr:{range}" +time_range_custom_attr = "cdr:1,cd_min:{start},cd_max{end}" time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm'} @@ -36,7 +39,6 @@ time_range_dict = {'day': 'd', # do search-request def request(query, params): - search_options = { 'ijn': params['pageno'] - 1, 'start': (params['pageno'] - 1) * number_of_results @@ -44,6 +46,12 @@ def request(query, params): if params['time_range'] in time_range_dict: search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']]) + elif params['time_range'] == 'year': + now = date.today() + then = now - timedelta(days=365) + start = then.strftime('%m/%d/%Y') + end = now.strftime('%m/%d/%Y') + search_options['tbs'] = time_range_custom_attr.format(start=start, end=end) if safesearch and params['safesearch']: search_options['safe'] = 'on' diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index 95d15cfb9..37253c6a7 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -1,41 +1,57 @@ """ Google (News) - @website https://www.google.com - @provide-api yes (https://developers.google.com/web-search/docs/), - deprecated! + @website https://news.google.com + @provide-api no - @using-api yes - @results JSON - @stable yes (but deprecated) + @using-api no + @results HTML + @stable no @parse url, title, content, publishedDate """ +from lxml import html from urllib import urlencode -from json import loads -from dateutil import parser # search-url categories = ['news'] paging = True language_support = True - -# engine dependent config -url = 'https://ajax.googleapis.com/' -search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={lang}' +safesearch = True +time_range_support = True +number_of_results = 10 + +search_url = 'https://www.google.com/search'\ + '?{query}'\ + '&tbm=nws'\ + '&gws_rd=cr'\ + '&{search_options}' +time_range_attr = "qdr:{range}" +time_range_dict = {'day': 'd', + 'week': 'w', + 'month': 'm', + 'year': 'y'} # do search-request def request(query, params): - offset = (params['pageno'] - 1) * 8 - language = 'en-US' - if params['language'] != 'all': - language = params['language'].replace('_', '-') + search_options = { + 'start': (params['pageno'] - 1) * number_of_results + } + + if params['time_range'] in time_range_dict: + search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']]) + + if safesearch and params['safesearch']: + search_options['safe'] = 'on' - params['url'] = search_url.format(offset=offset, - query=urlencode({'q': query}), - lang=language) + params['url'] = search_url.format(query=urlencode({'q': query}), + search_options=urlencode(search_options)) + + if params['language'] != 'all': + language_array = params['language'].lower().split('_') + params['url'] += '&lr=lang_' + language_array[0] return params @@ -44,24 +60,21 @@ def request(query, params): def response(resp): results = [] - search_res = loads(resp.text) - - # return empty array if there are no results - if not search_res.get('responseData', {}).get('results'): - return [] + dom = html.fromstring(resp.text) # parse results - for result in search_res['responseData']['results']: - # parse publishedDate - publishedDate = parser.parse(result['publishedDate']) - if 'url' not in result: - continue - - # append result - results.append({'url': result['unescapedUrl'], - 'title': result['titleNoFormatting'], - 'publishedDate': publishedDate, - 'content': result['content']}) + for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'): + r = { + 'url': result.xpath('.//div[@class="_cnc"]//a/@href')[0], + 'title': ''.join(result.xpath('.//div[@class="_cnc"]//h3//text()')), + 'content': ''.join(result.xpath('.//div[@class="st"]//text()')), + } + + img = result.xpath('.//img/@src')[0] + if img and not img.startswith('data'): + r['img_src'] = img + + results.append(r) # return results return results diff --git a/searx/engines/json_engine.py b/searx/engines/json_engine.py index a824c38e5..4604c3cac 100644 --- a/searx/engines/json_engine.py +++ b/searx/engines/json_engine.py @@ -81,7 +81,7 @@ def request(query, params): fp = {'query': query} if paging and search_url.find('{pageno}') >= 0: - fp['pageno'] = (params['pageno'] + first_page_num - 1) * page_size + fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num params['url'] = search_url.format(**fp) params['query'] = query diff --git a/searx/engines/kickass.py b/searx/engines/kickass.py index 4c5d24008..059fa2a66 100644 --- a/searx/engines/kickass.py +++ b/searx/engines/kickass.py @@ -11,18 +11,18 @@ """ from urlparse import urljoin -from cgi import escape from urllib import quote from lxml import html from operator import itemgetter from searx.engines.xpath import extract_text +from searx.utils import get_torrent_size, convert_str_to_int # engine dependent config categories = ['videos', 'music', 'files'] paging = True # search-url -url = 'https://kickass.to/' +url = 'https://kickass.cd/' search_url = url + 'search/{search_term}/{pageno}/' # specific xpath variables @@ -56,42 +56,17 @@ def response(resp): link = result.xpath('.//a[@class="cellMainLink"]')[0] href = urljoin(url, link.attrib['href']) title = extract_text(link) - content = escape(extract_text(result.xpath(content_xpath))) - seed = result.xpath('.//td[contains(@class, "green")]/text()')[0] - leech = result.xpath('.//td[contains(@class, "red")]/text()')[0] - filesize = result.xpath('.//td[contains(@class, "nobr")]/text()')[0] - filesize_multiplier = result.xpath('.//td[contains(@class, "nobr")]//span/text()')[0] - files = result.xpath('.//td[contains(@class, "center")][2]/text()')[0] - - # convert seed to int if possible - if seed.isdigit(): - seed = int(seed) - else: - seed = 0 + content = extract_text(result.xpath(content_xpath)) + seed = extract_text(result.xpath('.//td[contains(@class, "green")]')) + leech = extract_text(result.xpath('.//td[contains(@class, "red")]')) + filesize_info = extract_text(result.xpath('.//td[contains(@class, "nobr")]')) + files = extract_text(result.xpath('.//td[contains(@class, "center")][2]')) - # convert leech to int if possible - if leech.isdigit(): - leech = int(leech) - else: - leech = 0 - - # convert filesize to byte if possible - try: - filesize = float(filesize) - - # convert filesize to byte - if filesize_multiplier == 'TB': - filesize = int(filesize * 1024 * 1024 * 1024 * 1024) - elif filesize_multiplier == 'GB': - filesize = int(filesize * 1024 * 1024 * 1024) - elif filesize_multiplier == 'MB': - filesize = int(filesize * 1024 * 1024) - elif filesize_multiplier == 'KB': - filesize = int(filesize * 1024) - except: - filesize = None - - # convert files to int if possible + seed = convert_str_to_int(seed) + leech = convert_str_to_int(leech) + + filesize, filesize_multiplier = filesize_info.split() + filesize = get_torrent_size(filesize, filesize_multiplier) if files.isdigit(): files = int(files) else: diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py index cda8231f7..4ca5b3171 100644 --- a/searx/engines/nyaa.py +++ b/searx/engines/nyaa.py @@ -9,7 +9,6 @@ @parse url, title, content, seed, leech, torrentfile """ -from cgi import escape from urllib import urlencode from lxml import html from searx.engines.xpath import extract_text @@ -78,7 +77,7 @@ def response(resp): # torrent title page_a = result.xpath(xpath_title)[0] - title = escape(extract_text(page_a)) + title = extract_text(page_a) # link to the page href = page_a.attrib.get('href') @@ -90,7 +89,7 @@ def response(resp): try: file_size, suffix = result.xpath(xpath_filesize)[0].split(' ') file_size = int(float(file_size) * get_filesize_mul(suffix)) - except Exception as e: + except: file_size = None # seed count @@ -105,7 +104,6 @@ def response(resp): # content string contains all information not included into template content = 'Category: "{category}". Downloaded {downloads} times.' content = content.format(category=category, downloads=downloads) - content = escape(content) results.append({'url': href, 'title': title, diff --git a/searx/engines/openstreetmap.py b/searx/engines/openstreetmap.py index 38baaada9..01ca7d42d 100644 --- a/searx/engines/openstreetmap.py +++ b/searx/engines/openstreetmap.py @@ -43,7 +43,7 @@ def response(resp): if 'display_name' not in r: continue - title = r['display_name'] + title = r['display_name'] or u'' osm_type = r.get('osm_type', r.get('type')) url = result_base_url.format(osm_type=osm_type, osm_id=r['osm_id']) diff --git a/searx/engines/pdbe.py b/searx/engines/pdbe.py new file mode 100644 index 000000000..f784e106f --- /dev/null +++ b/searx/engines/pdbe.py @@ -0,0 +1,109 @@ +""" + PDBe (Protein Data Bank in Europe) + + @website https://www.ebi.ac.uk/pdbe + @provide-api yes (https://www.ebi.ac.uk/pdbe/api/doc/search.html), + unlimited + @using-api yes + @results python dictionary (from json) + @stable yes + @parse url, title, content, img_src +""" + +from json import loads +from flask_babel import gettext + +categories = ['science'] + +hide_obsolete = False + +# status codes of unpublished entries +pdb_unpublished_codes = ['HPUB', 'HOLD', 'PROC', 'WAIT', 'AUTH', 'AUCO', 'REPL', 'POLC', 'REFI', 'TRSF', 'WDRN'] +# url for api query +pdbe_solr_url = 'https://www.ebi.ac.uk/pdbe/search/pdb/select?' +# base url for results +pdbe_entry_url = 'https://www.ebi.ac.uk/pdbe/entry/pdb/{pdb_id}' +# link to preview image of structure +pdbe_preview_url = 'https://www.ebi.ac.uk/pdbe/static/entry/{pdb_id}_deposited_chain_front_image-200x200.png' + + +def request(query, params): + + params['url'] = pdbe_solr_url + params['method'] = 'POST' + params['data'] = { + 'q': query, + 'wt': "json" # request response in parsable format + } + return params + + +def construct_body(result): + # set title + title = result['title'] + + # construct content body + content = """{title}<br />{authors} {journal} <strong>{volume}</strong> {page} ({year})""" + + # replace placeholders with actual content + try: + if result['journal']: + content = content.format( + title=result['citation_title'], + authors=result['entry_author_list'][0], journal=result['journal'], volume=result['journal_volume'], + page=result['journal_page'], year=result['citation_year']) + else: + content = content.format( + title=result['citation_title'], + authors=result['entry_author_list'][0], journal='', volume='', page='', year=result['release_year']) + img_src = pdbe_preview_url.format(pdb_id=result['pdb_id']) + except (KeyError): + content = None + img_src = None + + # construct url for preview image + try: + img_src = pdbe_preview_url.format(pdb_id=result['pdb_id']) + except (KeyError): + img_src = None + + return [title, content, img_src] + + +def response(resp): + + results = [] + json = loads(resp.text)['response']['docs'] + + # parse results + for result in json: + # catch obsolete entries and mark them accordingly + if result['status'] in pdb_unpublished_codes: + continue + if hide_obsolete: + continue + if result['status'] == 'OBS': + # expand title to add some sort of warning message + title = gettext('{title} (OBSOLETE)').format(title=result['title']) + superseded_url = pdbe_entry_url.format(pdb_id=result['superseded_by']) + + # since we can't construct a proper body from the response, we'll make up our own + msg_superseded = gettext("This entry has been superseded by") + content = '<em>{msg_superseded} \<a href="{url}">{pdb_id}</a></em>'.format( + msg_superseded=msg_superseded, + url=superseded_url, + pdb_id=result['superseded_by'], ) + + # obsoleted entries don't have preview images + img_src = None + else: + title, content, img_src = construct_body(result) + + results.append({ + 'url': pdbe_entry_url.format(pdb_id=result['pdb_id']), + 'title': title, + 'content': content, + 'img_src': img_src + }) + + return results diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py index 55446b410..ca21a3bb2 100644 --- a/searx/engines/piratebay.py +++ b/searx/engines/piratebay.py @@ -9,7 +9,6 @@ # @parse url, title, content, seed, leech, magnetlink from urlparse import urljoin -from cgi import escape from urllib import quote from lxml import html from operator import itemgetter @@ -62,7 +61,7 @@ def response(resp): link = result.xpath('.//div[@class="detName"]//a')[0] href = urljoin(url, link.attrib.get('href')) title = extract_text(link) - content = escape(extract_text(result.xpath(content_xpath))) + content = extract_text(result.xpath(content_xpath)) seed, leech = result.xpath('.//td[@align="right"]/text()')[:2] # convert seed to int if possible diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index 872bd4e95..d8b084292 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -10,9 +10,11 @@ @parse url, title, content """ -from urllib import urlencode -from json import loads from datetime import datetime +from json import loads +from urllib import urlencode + +from searx.utils import html_to_text # engine dependent config categories = None @@ -66,9 +68,9 @@ def response(resp): # parse results for result in res.get('items', {}): - title = result['title'] + title = html_to_text(result['title']) res_url = result['url'] - content = result['desc'] + content = html_to_text(result['desc']) if category_to_keyword.get(categories[0], '') == 'web': results.append({'title': title, diff --git a/searx/engines/reddit.py b/searx/engines/reddit.py index 3ca7e44f6..b29792a3a 100644 --- a/searx/engines/reddit.py +++ b/searx/engines/reddit.py @@ -11,7 +11,6 @@ """ import json -from cgi import escape from urllib import urlencode from urlparse import urlparse, urljoin from datetime import datetime @@ -68,7 +67,7 @@ def response(resp): img_results.append(params) else: created = datetime.fromtimestamp(data['created_utc']) - content = escape(data['selftext']) + content = data['selftext'] if len(content) > 500: content = content[:500] + '...' params['content'] = content diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py index de8cd43be..be7a6d385 100644 --- a/searx/engines/searchcode_code.py +++ b/searx/engines/searchcode_code.py @@ -34,11 +34,6 @@ def request(query, params): params['url'] = search_url.format(query=urlencode({'q': query}), pageno=params['pageno'] - 1) - # Disable SSL verification - # error: (60) SSL certificate problem: unable to get local issuer - # certificate - params['verify'] = False - return params diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py index f24fe6f90..99e10be62 100644 --- a/searx/engines/searchcode_doc.py +++ b/searx/engines/searchcode_doc.py @@ -27,11 +27,6 @@ def request(query, params): params['url'] = search_url.format(query=urlencode({'q': query}), pageno=params['pageno'] - 1) - # Disable SSL verification - # error: (60) SSL certificate problem: unable to get local issuer - # certificate - params['verify'] = False - return params @@ -44,20 +39,12 @@ def response(resp): # parse results for result in search_results.get('results', []): href = result['url'] - title = "[" + result['type'] + "] " +\ - result['namespace'] +\ - " " + result['name'] - content = '<span class="highlight">[' +\ - result['type'] + "] " +\ - result['name'] + " " +\ - result['synopsis'] +\ - "</span><br />" +\ - result['description'] + title = "[{}] {} {}".format(result['type'], result['namespace'], result['name']) # append result results.append({'url': href, 'title': title, - 'content': content}) + 'content': result['description']}) # return results return results diff --git a/searx/engines/seedpeer.py b/searx/engines/seedpeer.py new file mode 100644 index 000000000..e1309a9b5 --- /dev/null +++ b/searx/engines/seedpeer.py @@ -0,0 +1,77 @@ +# Seedpeer (Videos, Music, Files) +# +# @website http://seedpeer.eu +# @provide-api no (nothing found) +# +# @using-api no +# @results HTML (using search portal) +# @stable yes (HTML can change) +# @parse url, title, content, seed, leech, magnetlink + +from urlparse import urljoin +from urllib import quote +from lxml import html +from operator import itemgetter +from searx.engines.xpath import extract_text + + +url = 'http://www.seedpeer.eu/' +search_url = url + 'search/{search_term}/7/{page_no}.html' +# specific xpath variables +torrent_xpath = '//*[@id="body"]/center/center/table[2]/tr/td/a' +alternative_torrent_xpath = '//*[@id="body"]/center/center/table[1]/tr/td/a' +title_xpath = '//*[@id="body"]/center/center/table[2]/tr/td/a/text()' +alternative_title_xpath = '//*[@id="body"]/center/center/table/tr/td/a' +seeds_xpath = '//*[@id="body"]/center/center/table[2]/tr/td[4]/font/text()' +alternative_seeds_xpath = '//*[@id="body"]/center/center/table/tr/td[4]/font/text()' +peers_xpath = '//*[@id="body"]/center/center/table[2]/tr/td[5]/font/text()' +alternative_peers_xpath = '//*[@id="body"]/center/center/table/tr/td[5]/font/text()' +age_xpath = '//*[@id="body"]/center/center/table[2]/tr/td[2]/text()' +alternative_age_xpath = '//*[@id="body"]/center/center/table/tr/td[2]/text()' +size_xpath = '//*[@id="body"]/center/center/table[2]/tr/td[3]/text()' +alternative_size_xpath = '//*[@id="body"]/center/center/table/tr/td[3]/text()' + + +# do search-request +def request(query, params): + params['url'] = search_url.format(search_term=quote(query), + page_no=params['pageno'] - 1) + return params + + +# get response from search-request +def response(resp): + results = [] + dom = html.fromstring(resp.text) + torrent_links = dom.xpath(torrent_xpath) + if len(torrent_links) > 0: + seeds = dom.xpath(seeds_xpath) + peers = dom.xpath(peers_xpath) + titles = dom.xpath(title_xpath) + sizes = dom.xpath(size_xpath) + ages = dom.xpath(age_xpath) + else: # under ~5 results uses a different xpath + torrent_links = dom.xpath(alternative_torrent_xpath) + seeds = dom.xpath(alternative_seeds_xpath) + peers = dom.xpath(alternative_peers_xpath) + titles = dom.xpath(alternative_title_xpath) + sizes = dom.xpath(alternative_size_xpath) + ages = dom.xpath(alternative_age_xpath) + # return empty array if nothing is found + if not torrent_links: + return [] + + # parse results + for index, result in enumerate(torrent_links): + link = result.attrib.get('href') + href = urljoin(url, link) + results.append({'url': href, + 'title': titles[index].text_content(), + 'content': '{}, {}'.format(sizes[index], ages[index]), + 'seed': seeds[index], + 'leech': peers[index], + + 'template': 'torrent.html'}) + + # return results sorted by seeder + return sorted(results, key=itemgetter('seed'), reverse=True) diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py index f75796e83..249ba91ef 100644 --- a/searx/engines/spotify.py +++ b/searx/engines/spotify.py @@ -46,10 +46,11 @@ def response(resp): if result['type'] == 'track': title = result['name'] url = result['external_urls']['spotify'] - content = result['artists'][0]['name'] +\ - " • " +\ - result['album']['name'] +\ - " • " + result['name'] + content = u'{} - {} - {}'.format( + result['artists'][0]['name'], + result['album']['name'], + result['name']) + embedded = embedded_url.format(audioid=result['id']) # append result diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py index fdd3711a9..5e7ab2901 100644 --- a/searx/engines/stackoverflow.py +++ b/searx/engines/stackoverflow.py @@ -11,7 +11,6 @@ """ from urlparse import urljoin -from cgi import escape from urllib import urlencode from lxml import html from searx.engines.xpath import extract_text @@ -48,8 +47,8 @@ def response(resp): for result in dom.xpath(results_xpath): link = result.xpath(link_xpath)[0] href = urljoin(url, link.attrib.get('href')) - title = escape(extract_text(link)) - content = escape(extract_text(result.xpath(content_xpath))) + title = extract_text(link) + content = extract_text(result.xpath(content_xpath)) # append result results.append({'url': href, diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index d8b702c4d..6f6eae1cf 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -11,7 +11,6 @@ # @todo paging from lxml import html -from cgi import escape from dateutil import parser from datetime import datetime, timedelta import re @@ -79,10 +78,10 @@ def response(resp): if re.match(r"^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url): continue - title = escape(extract_text(link)) + title = extract_text(link) if result.xpath('./p[@class="desc clk"]'): - content = escape(extract_text(result.xpath('./p[@class="desc clk"]'))) + content = extract_text(result.xpath('./p[@class="desc clk"]')) else: content = '' diff --git a/searx/engines/subtitleseeker.py b/searx/engines/subtitleseeker.py index 47d27d0b2..daba68be7 100644 --- a/searx/engines/subtitleseeker.py +++ b/searx/engines/subtitleseeker.py @@ -10,7 +10,6 @@ @parse url, title, content """ -from cgi import escape from urllib import quote_plus from lxml import html from searx.languages import language_codes @@ -59,7 +58,7 @@ def response(resp): elif search_lang: href = href + search_lang + '/' - title = escape(extract_text(link)) + title = extract_text(link) content = extract_text(result.xpath('.//div[contains(@class,"red")]')) content = content + " - " @@ -75,7 +74,7 @@ def response(resp): # append result results.append({'url': href, 'title': title, - 'content': escape(content)}) + 'content': content}) # return results return results diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py index 1a94ed64e..72184e428 100644 --- a/searx/engines/swisscows.py +++ b/searx/engines/swisscows.py @@ -10,7 +10,6 @@ @parse url, title, content """ -from cgi import escape from json import loads from urllib import urlencode, unquote import re @@ -78,7 +77,7 @@ def response(resp): # append result results.append({'url': result['SourceUrl'], - 'title': escape(result['Title']), + 'title': result['Title'], 'content': '', 'img_src': img_url, 'template': 'images.html'}) @@ -90,8 +89,8 @@ def response(resp): # append result results.append({'url': result_url, - 'title': escape(result_title), - 'content': escape(result_content)}) + 'title': result_title, + 'content': result_content}) # parse images for result in json.get('Images', []): @@ -100,7 +99,7 @@ def response(resp): # append result results.append({'url': result['SourceUrl'], - 'title': escape(result['Title']), + 'title': result['Title'], 'content': '', 'img_src': img_url, 'template': 'images.html'}) diff --git a/searx/engines/tokyotoshokan.py b/searx/engines/tokyotoshokan.py index e2990e153..52b2cbe07 100644 --- a/searx/engines/tokyotoshokan.py +++ b/searx/engines/tokyotoshokan.py @@ -11,7 +11,6 @@ """ import re -from cgi import escape from urllib import urlencode from lxml import html from searx.engines.xpath import extract_text diff --git a/searx/engines/torrentz.py b/searx/engines/torrentz.py index 92fbe7013..f9c832651 100644 --- a/searx/engines/torrentz.py +++ b/searx/engines/torrentz.py @@ -12,7 +12,6 @@ """ import re -from cgi import escape from urllib import urlencode from lxml import html from searx.engines.xpath import extract_text diff --git a/searx/engines/translated.py b/searx/engines/translated.py new file mode 100644 index 000000000..e78db0d8e --- /dev/null +++ b/searx/engines/translated.py @@ -0,0 +1,64 @@ +""" + MyMemory Translated + + @website https://mymemory.translated.net/ + @provide-api yes (https://mymemory.translated.net/doc/spec.php) + @using-api yes + @results JSON + @stable yes + @parse url, title, content +""" +import re +from searx.utils import is_valid_lang + +categories = ['general'] +url = u'http://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}{key}' +web_url = u'http://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}' +weight = 100 + +parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) (.{2,})$', re.I) +api_key = '' + + +def request(query, params): + m = parser_re.match(unicode(query, 'utf8')) + if not m: + return params + + from_lang, to_lang, query = m.groups() + + from_lang = is_valid_lang(from_lang) + to_lang = is_valid_lang(to_lang) + + if not from_lang or not to_lang: + return params + + if api_key: + key_form = '&key=' + api_key + else: + key_form = '' + params['url'] = url.format(from_lang=from_lang[1], + to_lang=to_lang[1], + query=query, + key=key_form) + params['query'] = query + params['from_lang'] = from_lang + params['to_lang'] = to_lang + + return params + + +def response(resp): + results = [] + results.append({ + 'url': web_url.format( + from_lang=resp.search_params['from_lang'][2], + to_lang=resp.search_params['to_lang'][2], + query=resp.search_params['query']), + 'title': '[{0}-{1}] {2}'.format( + resp.search_params['from_lang'][1], + resp.search_params['to_lang'][1], + resp.search_params['query']), + 'content': resp.json()['responseData']['translatedText'] + }) + return results diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py index 517ac1c44..5d5310544 100644 --- a/searx/engines/vimeo.py +++ b/searx/engines/vimeo.py @@ -12,10 +12,8 @@ # @todo rewrite to api # @todo set content-parameter with correct data +from json import loads from urllib import urlencode -from lxml import html -from HTMLParser import HTMLParser -from searx.engines.xpath import extract_text from dateutil import parser # engine dependent config @@ -23,17 +21,10 @@ categories = ['videos'] paging = True # search-url -base_url = 'https://vimeo.com' +base_url = 'https://vimeo.com/' search_url = base_url + '/search/page:{pageno}?{query}' -# specific xpath variables -results_xpath = '//div[contains(@class,"results_grid")]/ul/li' -url_xpath = './/a/@href' -title_xpath = './/span[@class="title"]' -thumbnail_xpath = './/img[@class="js-clip_thumbnail_image"]/@src' -publishedDate_xpath = './/time/attribute::datetime' - -embedded_url = '<iframe data-src="//player.vimeo.com/video{videoid}" ' +\ +embedded_url = '<iframe data-src="//player.vimeo.com/video/{videoid}" ' +\ 'width="540" height="304" frameborder="0" ' +\ 'webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>' @@ -49,17 +40,18 @@ def request(query, params): # get response from search-request def response(resp): results = [] - - dom = html.fromstring(resp.text) - p = HTMLParser() + data_start_pos = resp.text.find('{"filtered"') + data_end_pos = resp.text.find(';\n', data_start_pos + 1) + data = loads(resp.text[data_start_pos:data_end_pos]) # parse results - for result in dom.xpath(results_xpath): - videoid = result.xpath(url_xpath)[0] + for result in data['filtered']['data']: + result = result[result['type']] + videoid = result['uri'].split('/')[-1] url = base_url + videoid - title = p.unescape(extract_text(result.xpath(title_xpath))) - thumbnail = extract_text(result.xpath(thumbnail_xpath)[0]) - publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0])) + title = result['name'] + thumbnail = result['pictures']['sizes'][-1]['link'] + publishedDate = parser.parse(result['created_time']) embedded = embedded_url.format(videoid=videoid) # append result diff --git a/searx/engines/wolframalpha_api.py b/searx/engines/wolframalpha_api.py index 4526c825f..e743c8f56 100644 --- a/searx/engines/wolframalpha_api.py +++ b/searx/engines/wolframalpha_api.py @@ -18,10 +18,10 @@ api_key = '' # defined in settings.yml # xpath variables failure_xpath = '/queryresult[attribute::success="false"]' -answer_xpath = '//pod[attribute::primary="true"]/subpod/plaintext' input_xpath = '//pod[starts-with(attribute::id, "Input")]/subpod/plaintext' pods_xpath = '//pod' subpods_xpath = './subpod' +pod_primary_xpath = './@primary' pod_id_xpath = './@id' pod_title_xpath = './@title' plaintext_xpath = './plaintext' @@ -75,13 +75,15 @@ def response(resp): try: infobox_title = search_results.xpath(input_xpath)[0].text except: - infobox_title = None + infobox_title = "" pods = search_results.xpath(pods_xpath) result_chunks = [] + result_content = "" for pod in pods: pod_id = pod.xpath(pod_id_xpath)[0] pod_title = pod.xpath(pod_title_xpath)[0] + pod_is_result = pod.xpath(pod_primary_xpath) subpods = pod.xpath(subpods_xpath) if not subpods: @@ -94,6 +96,10 @@ def response(resp): if content and pod_id not in image_pods: + if pod_is_result or not result_content: + if pod_id != "Input": + result_content = "%s: %s" % (pod_title, content) + # if no input pod was found, title is first plaintext pod if not infobox_title: infobox_title = content @@ -109,6 +115,8 @@ def response(resp): if not result_chunks: return [] + title = "Wolfram|Alpha (%s)" % infobox_title + # append infobox results.append({'infobox': infobox_title, 'attributes': result_chunks, @@ -116,7 +124,7 @@ def response(resp): # append link to site results.append({'url': resp.request.headers['Referer'].decode('utf8'), - 'title': 'Wolfram|Alpha', - 'content': infobox_title}) + 'title': title, + 'content': result_content}) return results diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index 3a8180f04..1534501b3 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -11,6 +11,7 @@ from json import loads from time import time from urllib import urlencode +from lxml.etree import XML from searx.poolrequests import get as http_get @@ -34,7 +35,7 @@ search_url = url + 'input/json.jsp'\ referer_url = url + 'input/?{query}' token = {'value': '', - 'last_updated': 0} + 'last_updated': None} # pods to display as image in infobox # this pods do return a plaintext, but they look better and are more useful as images @@ -80,10 +81,12 @@ def response(resp): # TODO handle resp_json['queryresult']['assumptions'] result_chunks = [] - infobox_title = None + infobox_title = "" + result_content = "" for pod in resp_json['queryresult']['pods']: pod_id = pod.get('id', '') pod_title = pod.get('title', '') + pod_is_result = pod.get('primary', None) if 'subpods' not in pod: continue @@ -97,6 +100,10 @@ def response(resp): if subpod['plaintext'] != '(requires interactivity)': result_chunks.append({'label': pod_title, 'value': subpod['plaintext']}) + if pod_is_result or not result_content: + if pod_id != "Input": + result_content = pod_title + ': ' + subpod['plaintext'] + elif 'img' in subpod: result_chunks.append({'label': pod_title, 'image': subpod['img']}) @@ -108,7 +115,7 @@ def response(resp): 'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer'].decode('utf8')}]}) results.append({'url': resp.request.headers['Referer'].decode('utf8'), - 'title': 'Wolfram|Alpha', - 'content': infobox_title}) + 'title': 'Wolfram|Alpha (' + infobox_title + ')', + 'content': result_content}) return results diff --git a/searx/engines/www500px.py b/searx/engines/www500px.py index f1bc6c583..546521ba3 100644 --- a/searx/engines/www500px.py +++ b/searx/engines/www500px.py @@ -12,12 +12,9 @@ @todo rewrite to api """ - +from json import loads from urllib import urlencode from urlparse import urljoin -from lxml import html -import re -from searx.engines.xpath import extract_text # engine dependent config categories = ['images'] @@ -25,13 +22,27 @@ paging = True # search-url base_url = 'https://500px.com' -search_url = base_url + '/search?search?page={pageno}&type=photos&{query}' +search_url = 'https://api.500px.com/v1/photos/search?type=photos'\ + '&{query}'\ + '&image_size%5B%5D=4'\ + '&image_size%5B%5D=20'\ + '&image_size%5B%5D=21'\ + '&image_size%5B%5D=1080'\ + '&image_size%5B%5D=1600'\ + '&image_size%5B%5D=2048'\ + '&include_states=true'\ + '&formats=jpeg%2Clytro'\ + '&include_tags=true'\ + '&exclude_nude=true'\ + '&page={pageno}'\ + '&rpp=50'\ + '&sdk_key=b68e60cff4c929bedea36ca978830c5caca790c3' # do search-request def request(query, params): params['url'] = search_url.format(pageno=params['pageno'], - query=urlencode({'q': query})) + query=urlencode({'term': query})) return params @@ -40,19 +51,16 @@ def request(query, params): def response(resp): results = [] - dom = html.fromstring(resp.text) - regex = re.compile(r'3\.jpg.*$') + response_json = loads(resp.text) # parse results - for result in dom.xpath('//div[@class="photo"]'): - link = result.xpath('.//a')[0] - url = urljoin(base_url, link.attrib.get('href')) - title = extract_text(result.xpath('.//div[@class="title"]')) - thumbnail_src = link.xpath('.//img')[0].attrib.get('src') - # To have a bigger thumbnail, uncomment the next line - # thumbnail_src = regex.sub('4.jpg', thumbnail_src) - content = extract_text(result.xpath('.//div[@class="info"]')) - img_src = regex.sub('2048.jpg', thumbnail_src) + for result in response_json['photos']: + url = urljoin(base_url, result['url']) + title = result['name'] + # last index is the biggest resolution + img_src = result['image_url'][-1] + thumbnail_src = result['image_url'][0] + content = result['description'] or '' # append result results.append({'url': url, diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py index e701c02bf..e5c0c5bea 100644 --- a/searx/engines/xpath.py +++ b/searx/engines/xpath.py @@ -87,7 +87,7 @@ def request(query, params): fp = {'query': query} if paging and search_url.find('{pageno}') >= 0: - fp['pageno'] = (params['pageno'] + first_page_num - 1) * page_size + fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num params['url'] = search_url.format(**fp) params['query'] = query diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py index c2f1bc7ef..92cf881c0 100644 --- a/searx/engines/yacy.py +++ b/searx/engines/yacy.py @@ -16,6 +16,8 @@ from json import loads from urllib import urlencode from dateutil import parser +from searx.utils import html_to_text + # engine dependent config categories = ['general', 'images'] # TODO , 'music', 'videos', 'files' paging = True @@ -88,7 +90,7 @@ def response(resp): # append result results.append({'url': result['link'], 'title': result['title'], - 'content': result['description'], + 'content': html_to_text(result['description']), 'publishedDate': publishedDate}) # TODO parse video, audio and file results diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index 8e24a283e..2bb34b83d 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -77,6 +77,9 @@ def _get_language(params): # do search-request def request(query, params): + if params['time_range'] and params['time_range'] not in time_range_dict: + return params + offset = (params['pageno'] - 1) * 10 + 1 language = _get_language(params) diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py index be3ec36ce..b83a747f9 100644 --- a/searx/engines/yandex.py +++ b/searx/engines/yandex.py @@ -9,7 +9,6 @@ @parse url, title, content """ -from cgi import escape from urllib import urlencode from lxml import html from searx.search import logger @@ -30,10 +29,10 @@ language_map = {'ru': 'ru', base_url = 'https://yandex.{tld}/' search_url = 'search/?{query}&p={page}' -results_xpath = '//div[@class="serp-item serp-item_plain_yes clearfix i-bem"]' +results_xpath = '//li[@class="serp-item"]' url_xpath = './/h2/a/@href' title_xpath = './/h2/a//text()' -content_xpath = './/div[@class="serp-item__text"]//text()' +content_xpath = './/div[@class="text-container typo typo_text_m typo_line_m organic__text"]//text()' def request(query, params): @@ -52,8 +51,8 @@ def response(resp): for result in dom.xpath(results_xpath): try: res = {'url': result.xpath(url_xpath)[0], - 'title': escape(''.join(result.xpath(title_xpath))), - 'content': escape(''.join(result.xpath(content_xpath)))} + 'title': ''.join(result.xpath(title_xpath)), + 'content': ''.join(result.xpath(content_xpath))} except: logger.exception('yandex parse crash') continue diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py index 401fca4c9..9b7ca64c8 100644 --- a/searx/engines/youtube_noapi.py +++ b/searx/engines/youtube_noapi.py @@ -17,10 +17,16 @@ from searx.utils import list_get categories = ['videos', 'music'] paging = True language_support = False +time_range_support = True # search-url base_url = 'https://www.youtube.com/results' search_url = base_url + '?search_query={query}&page={page}' +time_range_url = '&sp=EgII{time_range}%253D%253D' +time_range_dict = {'day': 'Ag', + 'week': 'Aw', + 'month': 'BA', + 'year': 'BQ'} embedded_url = '<iframe width="540" height="304" ' +\ 'data-src="//www.youtube-nocookie.com/embed/{videoid}" ' +\ @@ -47,6 +53,8 @@ def extract_text_from_dom(result, xpath): def request(query, params): params['url'] = search_url.format(query=quote_plus(query), page=params['pageno']) + if params['time_range'] in time_range_dict: + params['url'] += time_range_url.format(time_range=time_range_dict[params['time_range']]) return params |