Diffstat (limited to 'searx/engines')
28 files changed, 1101 insertions, 443 deletions
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index 72e537423..82c9407a2 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -19,19 +19,12 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
 from os.path import realpath, dirname, splitext, join
 import sys
 from imp import load_source
-from itertools import izip_longest, chain
-from operator import itemgetter
-from urlparse import urlparse
-from datetime import datetime
-import grequests
 from flask.ext.babel import gettext
+from operator import itemgetter
 from searx import settings
-from searx.utils import gen_useragent

 engine_dir = dirname(realpath(__file__))

-number_of_searches = 0
-
 engines = {}

 categories = {'general': []}
@@ -114,146 +107,6 @@ for engine_data in settings['engines']:
     engine_shortcuts[engine.shortcut] = engine.name


-def default_request_params():
-    return {
-        'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}}
-
-
-def make_callback(engine_name, results, suggestions, callback, params):
-    # creating a callback wrapper for the search engine results
-    def process_callback(response, **kwargs):
-        cb_res = []
-        response.search_params = params
-        engines[engine_name].stats['page_load_time'] += \
-            (datetime.now() - params['started']).total_seconds()
-        try:
-            search_results = callback(response)
-        except Exception, e:
-            engines[engine_name].stats['errors'] += 1
-            results[engine_name] = cb_res
-            print '[E] Error with engine "{0}":\n\t{1}'.format(
-                engine_name, str(e))
-            return
-        for result in search_results:
-            result['engine'] = engine_name
-            if 'suggestion' in result:
-                # TODO type checks
-                suggestions.add(result['suggestion'])
-                continue
-            cb_res.append(result)
-        results[engine_name] = cb_res
-    return process_callback
-
-
-def score_results(results):
-    flat_res = filter(
-        None, chain.from_iterable(izip_longest(*results.values())))
-    flat_len = len(flat_res)
-    engines_len = len(results)
-    results = []
-    # deduplication + scoring
-    for i, res in enumerate(flat_res):
-        res['parsed_url'] = urlparse(res['url'])
-        res['engines'] = [res['engine']]
-        weight = 1.0
-        if hasattr(engines[res['engine']], 'weight'):
-            weight = float(engines[res['engine']].weight)
-        score = int((flat_len - i) / engines_len) * weight + 1
-        duplicated = False
-        for new_res in results:
-            p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa
-            p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa
-            if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
-               p1 == p2 and\
-               res['parsed_url'].query == new_res['parsed_url'].query and\
-               res.get('template') == new_res.get('template'):
-                duplicated = new_res
-                break
-        if duplicated:
-            if res.get('content') > duplicated.get('content'):
-                duplicated['content'] = res['content']
-            duplicated['score'] += score
-            duplicated['engines'].append(res['engine'])
-            if duplicated['parsed_url'].scheme == 'https':
-                continue
-            elif res['parsed_url'].scheme == 'https':
-                duplicated['url'] = res['parsed_url'].geturl()
-                duplicated['parsed_url'] = res['parsed_url']
-        else:
-            res['score'] = score
-            results.append(res)
-    return sorted(results, key=itemgetter('score'), reverse=True)
-
-
-def search(query, request, selected_engines, pageno=1, lang='all'):
-    global engines, categories, number_of_searches
-    requests = []
-    results = {}
-    suggestions = set()
-    number_of_searches += 1
-    #user_agent = request.headers.get('User-Agent', '')
-    user_agent = gen_useragent()
-
-    for selected_engine in selected_engines:
-        if selected_engine['name'] not in engines:
-            continue
-
-        engine = engines[selected_engine['name']]
-
-        if pageno > 1 and not engine.paging:
-            continue
-
-        if lang != 'all' and not engine.language_support:
-            continue
-
-        request_params = default_request_params()
-        request_params['headers']['User-Agent'] = user_agent
-        request_params['category'] = selected_engine['category']
-        request_params['started'] = datetime.now()
-        request_params['pageno'] = pageno
-        request_params['language'] = lang
-        request_params = engine.request(query.encode('utf-8'), request_params)
-
-        callback = make_callback(
-            selected_engine['name'],
-            results,
-            suggestions,
-            engine.response,
-            request_params
-        )
-
-        request_args = dict(
-            headers=request_params['headers'],
-            hooks=dict(response=callback),
-            cookies=request_params['cookies'],
-            timeout=engine.timeout
-        )
-
-        if request_params['method'] == 'GET':
-            req = grequests.get
-        else:
-            req = grequests.post
-            request_args['data'] = request_params['data']
-
-        # ignoring empty urls
-        if not request_params['url']:
-            continue
-
-        requests.append(req(request_params['url'], **request_args))
-    grequests.map(requests)
-    for engine_name, engine_results in results.items():
-        engines[engine_name].stats['search_count'] += 1
-        engines[engine_name].stats['result_count'] += len(engine_results)
-
-    results = score_results(results)
-
-    for result in results:
-        for res_engine in result['engines']:
-            engines[result['engine']].stats['score_count'] += result['score']
-
-    return results, suggestions
-
-
 def get_engines_stats():
     # TODO refactor
     pageloads = []
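A note on the removal above: after this commit an engine module talks to the core only through request() and response(); the dispatching, scoring and deduplication code deleted here no longer lives in searx/engines. A minimal sketch of that per-engine contract (the URL and result values below are invented for illustration, not part of the diff):

    from urllib import urlencode

    base_url = 'https://example.com/'   # hypothetical search portal

    def request(query, params):
        # fill in the prepared request dict handed over by the searx core
        params['url'] = base_url + 'search?' + urlencode({'q': query})
        return params

    def response(resp):
        # map the raw HTTP response onto searx result dicts
        return [{'url': 'https://example.com/hit',
                 'title': 'example',
                 'content': ''}]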
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index 9712a3103..56c6b36c1 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -1,49 +1,82 @@
+## Bing (Web)
+#
+# @website https://www.bing.com
+# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month
+#
+# @using-api no (because of query limit)
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content
+#
+# @todo publishedDate
+
 from urllib import urlencode
 from cgi import escape
 from lxml import html

-base_url = 'http://www.bing.com/'
-search_string = 'search?{query}&first={offset}'
+# engine dependent config
+categories = ['general']
 paging = True
 language_support = True

+# search-url
+base_url = 'https://www.bing.com/'
+search_string = 'search?{query}&first={offset}'
+

+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en-US'
     else:
         language = params['language'].replace('_', '-')
+
     search_path = search_string.format(
         query=urlencode({'q': query, 'setmkt': language}),
         offset=offset)

     params['cookies']['SRCHHPGUSR'] = \
         'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
-    #if params['category'] == 'images':
-    #    params['url'] = base_url + 'images/' + search_path
+
     params['url'] = base_url + search_path
+
     return params

+# get response from search-request
 def response(resp):
-    global base_url
     results = []
+
     dom = html.fromstring(resp.content)
+
+    # parse results
     for result in dom.xpath('//div[@class="sa_cc"]'):
         link = result.xpath('.//h3/a')[0]
         url = link.attrib.get('href')
         title = ' '.join(link.xpath('.//text()'))
         content = escape(' '.join(result.xpath('.//p//text()')))
-        results.append({'url': url, 'title': title, 'content': content})
+
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
+
+    # return results if something is found
     if results:
         return results

+    # parse results again if nothing is found yet
     for result in dom.xpath('//li[@class="b_algo"]'):
         link = result.xpath('.//h2/a')[0]
         url = link.attrib.get('href')
         title = ' '.join(link.xpath('.//text()'))
         content = escape(' '.join(result.xpath('.//p//text()')))
-        results.append({'url': url, 'title': title, 'content': content})
+
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
+
+    # return results
     return results
diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
new file mode 100644
index 000000000..b3eabba45
--- /dev/null
+++ b/searx/engines/bing_images.py
@@ -0,0 +1,80 @@
+## Bing (Images)
+#
+# @website https://www.bing.com/images
+# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month
+#
+# @using-api no (because of query limit)
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, img_src
+#
+# @todo currently up to 35 images are received per page because bing ignores count=10; the response is limited to 10 images
+
+from urllib import urlencode
+from cgi import escape
+from lxml import html
+from yaml import load
+import re
+
+# engine dependent config
+categories = ['images']
+paging = True
+
+# search-url
+base_url = 'https://www.bing.com/'
+search_string = 'images/search?{query}&count=10&first={offset}'
+
+
+# do search-request
+def request(query, params):
+    offset = (params['pageno'] - 1) * 10 + 1
+
+    # required for cookie
+    language = 'en-US'
+
+    search_path = search_string.format(
+        query=urlencode({'q': query}),
+        offset=offset)
+
+    params['cookies']['SRCHHPGUSR'] = \
+        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
+
+    params['url'] = base_url + search_path
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.content)
+
+    # init regex for yaml-parsing
+    p = re.compile('({|,)([a-z]+):(")')
+
+    # parse results
+    for result in dom.xpath('//div[@class="dg_u"]'):
+        link = result.xpath('./a')[0]
+
+        # parse yaml-data (it is required to add a space, to make it parsable)
+        yaml_data = load(p.sub(r'\1\2: \3', link.attrib.get('m')))
+
+        title = link.attrib.get('t1')
+        #url = 'http://' + link.attrib.get('t3')
+        url = yaml_data.get('surl')
+        img_src = yaml_data.get('imgurl')
+
+        # append result
+        results.append({'template': 'images.html',
+                        'url': url,
+                        'title': title,
+                        'content': '',
+                        'img_src': img_src})
+
+        # TODO stop parsing if 10 images are found
+        if len(results) >= 10:
+            break
+
+    # return results
+    return results
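The regex trick in bing_images.py deserves a standalone illustration: bing's m attribute holds {key:"value"} pairs that only become parsable YAML once a space is inserted after each colon. A self-contained sketch with a made-up payload:

    import re
    from yaml import load

    p = re.compile('({|,)([a-z]+):(")')
    m_attr = '{surl:"http://example.com/page",imgurl:"http://example.com/img.jpg"}'
    yaml_data = load(p.sub(r'\1\2: \3', m_attr))
    print(yaml_data['imgurl'])   # -> http://example.com/img.jpg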
diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py
index 5b48a5450..279f0d698 100644
--- a/searx/engines/bing_news.py
+++ b/searx/engines/bing_news.py
@@ -1,51 +1,100 @@
+## Bing (News)
+#
+# @website https://www.bing.com/news
+# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month
+#
+# @using-api no (because of query limit)
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content, publishedDate
+
 from urllib import urlencode
 from cgi import escape
 from lxml import html
+from datetime import datetime, timedelta
+from dateutil import parser
+import re

+# engine dependent config
 categories = ['news']
-
-base_url = 'http://www.bing.com/'
-search_string = 'news/search?{query}&first={offset}'
 paging = True
 language_support = True

+# search-url
+base_url = 'https://www.bing.com/'
+search_string = 'news/search?{query}&first={offset}'
+

+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en-US'
     else:
         language = params['language'].replace('_', '-')
+
     search_path = search_string.format(
         query=urlencode({'q': query, 'setmkt': language}),
         offset=offset)

     params['cookies']['SRCHHPGUSR'] = \
         'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
-    #if params['category'] == 'images':
-    #    params['url'] = base_url + 'images/' + search_path
+
     params['url'] = base_url + search_path
+
     return params

+# get response from search-request
 def response(resp):
-    global base_url
     results = []
+
     dom = html.fromstring(resp.content)
-    for result in dom.xpath('//div[@class="sa_cc"]'):
-        link = result.xpath('.//h3/a')[0]
+
+    # parse results
+    for result in dom.xpath('//div[@class="sn_r"]'):
+        link = result.xpath('.//div[@class="newstitle"]/a')[0]
         url = link.attrib.get('href')
         title = ' '.join(link.xpath('.//text()'))
-        content = escape(' '.join(result.xpath('.//p//text()')))
-        results.append({'url': url, 'title': title, 'content': content})
+        contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]//text()')
+        if contentXPath != None:
+            content = escape(' '.join(contentXPath))
+
+        # parse publishedDate
+        publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div//span[contains(@class,"sn_ST")]//span[contains(@class,"sn_tm")]//text()')
+        if publishedDateXPath != None:
+            publishedDate = escape(' '.join(publishedDateXPath))

-    if results:
-        return results
+        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
+            timeNumbers = re.findall(r'\d+', publishedDate)
+            publishedDate = datetime.now()\
+                - timedelta(minutes=int(timeNumbers[0]))
+        elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
+            timeNumbers = re.findall(r'\d+', publishedDate)
+            publishedDate = datetime.now()\
+                - timedelta(hours=int(timeNumbers[0]))
+        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
+            timeNumbers = re.findall(r'\d+', publishedDate)
+            publishedDate = datetime.now()\
+                - timedelta(hours=int(timeNumbers[0]))\
+                - timedelta(minutes=int(timeNumbers[1]))
+        elif re.match("^[0-9]+ day(s|) ago$", publishedDate):
+            timeNumbers = re.findall(r'\d+', publishedDate)
+            publishedDate = datetime.now()\
+                - timedelta(days=int(timeNumbers[0]))
+        else:
+            try:
+                # FIXME use params['language'] to parse either mm/dd or dd/mm
+                publishedDate = parser.parse(publishedDate, dayfirst=False)
+            except TypeError:
+                # FIXME
+                publishedDate = datetime.now()
+
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'publishedDate': publishedDate,
+                        'content': content})

-    for result in dom.xpath('//li[@class="b_algo"]'):
-        link = result.xpath('.//h2/a')[0]
-        url = link.attrib.get('href')
-        title = ' '.join(link.xpath('.//text()'))
-        content = escape(' '.join(result.xpath('.//p//text()')))
-        results.append({'url': url, 'title': title, 'content': content})
+    # return results
     return results
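Bing News renders timestamps as relative strings, which the new branches above convert to datetime objects. The same logic in isolation (sample string invented):

    import re
    from datetime import datetime, timedelta

    publishedDate = '25 minutes ago'   # sample scraped string
    if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
        minutes = int(re.findall(r'\d+', publishedDate)[0])
        publishedDate = datetime.now() - timedelta(minutes=minutes)
    print(publishedDate)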
diff --git a/searx/engines/currency_convert.py b/searx/engines/currency_convert.py
index ce6b3b854..561527bce 100644
--- a/searx/engines/currency_convert.py
+++ b/searx/engines/currency_convert.py
@@ -31,7 +31,6 @@ def request(query, params):

 def response(resp):
-    global base_url
     results = []
     try:
         _, conversion_rate, _ = resp.text.split(',', 2)
diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py
index 03e1d7ffc..75c2e5071 100644
--- a/searx/engines/dailymotion.py
+++ b/searx/engines/dailymotion.py
@@ -1,45 +1,66 @@
+## Dailymotion (Videos)
+#
+# @website https://www.dailymotion.com
+# @provide-api yes (http://www.dailymotion.com/developer)
+#
+# @using-api yes
+# @results JSON
+# @stable yes
+# @parse url, title, thumbnail
+#
+# @todo set content-parameter with correct data
+
 from urllib import urlencode
 from json import loads
 from lxml import html

+# engine dependent config
 categories = ['videos']
-locale = 'en_US'
+paging = True
+language_support = True

+# search-url
 # see http://www.dailymotion.com/doc/api/obj-video.html
-search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=25&page={pageno}&{query}'  # noqa
-
-# TODO use video result template
-content_tpl = '<a href="{0}" title="{0}" ><img src="{1}" /></a><br />'
-
-paging = True
+search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=5&page={pageno}&{query}'  # noqa

+# do search-request
 def request(query, params):
+    if params['language'] == 'all':
+        locale = 'en-US'
+    else:
+        locale = params['language']
+
     params['url'] = search_url.format(
         query=urlencode({'search': query, 'localization': locale}),
         pageno=params['pageno'])
+
     return params

+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)
+
+    # return empty array if there are no results
     if not 'list' in search_res:
-        return results
+        return []
+
+    # parse results
     for res in search_res['list']:
         title = res['title']
         url = res['url']
-        if res['thumbnail_360_url']:
-            content = content_tpl.format(url, res['thumbnail_360_url'])
-        else:
-            content = ''
-        if res['description']:
-            description = text_content_from_html(res['description'])
-            content += description[:500]
-        results.append({'url': url, 'title': title, 'content': content})
-    return results
+        #content = res['description']
+        content = ''
+        thumbnail = res['thumbnail_360_url']

+        results.append({'template': 'videos.html',
+                        'url': url,
+                        'title': title,
+                        'content': content,
+                        'thumbnail': thumbnail})

-def text_content_from_html(html_string):
-    desc_html = html.fragment_fromstring(html_string, create_parent=True)
-    return desc_html.text_content()
+    # return results
+    return results
diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py
index d42a25a19..ff5e1d465 100644
--- a/searx/engines/deviantart.py
+++ b/searx/engines/deviantart.py
@@ -1,36 +1,61 @@
+## Deviantart (Images)
+#
+# @website https://www.deviantart.com/
+# @provide-api yes (https://www.deviantart.com/developers/) (RSS)
+#
+# @using-api no (TODO, rewrite to api)
+# @results HTML
+# @stable no (HTML can change)
+# @parse url, title, thumbnail, img_src
+#
+# @todo rewrite to api
+
 from urllib import urlencode
 from urlparse import urljoin
 from lxml import html

+# engine dependent config
 categories = ['images']
+paging = True

+# search-url
 base_url = 'https://www.deviantart.com/'
 search_url = base_url+'search?offset={offset}&{query}'
-paging = True
-

+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 24
+
     params['url'] = search_url.format(offset=offset,
                                       query=urlencode({'q': query}))
+
     return params

+# get response from search-request
 def response(resp):
-    global base_url
     results = []
+
+    # return empty array if a redirection code is returned
     if resp.status_code == 302:
-        return results
+        return []
+
     dom = html.fromstring(resp.text)
+
+    # parse results
     for result in dom.xpath('//div[contains(@class, "tt-a tt-fh")]'):
         link = result.xpath('.//a[contains(@class, "thumb")]')[0]
         url = urljoin(base_url, link.attrib.get('href'))
         title_links = result.xpath('.//span[@class="details"]//a[contains(@class, "t")]')  # noqa
         title = ''.join(title_links[0].xpath('.//text()'))
         img_src = link.xpath('.//img')[0].attrib['src']
+
+        # append result
         results.append({'url': url,
                         'title': title,
                         'img_src': img_src,
                         'template': 'images.html'})
+
+    # return results
     return results
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index 58cbc9872..296dd9b2d 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -1,65 +1,74 @@
+## DuckDuckGo (Web)
+#
+# @website https://duckduckgo.com/
+# @provide-api yes (https://duckduckgo.com/api), but not all results from search-site
+#
+# @using-api no
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content
+#
+# @todo rewrite to api
+# @todo language support (the currently used site does not support language-change)
+
 from urllib import urlencode
 from lxml.html import fromstring
 from searx.utils import html_to_text

+# engine dependent config
+categories = ['general']
+paging = True
+language_support = True
+
+# search-url
 url = 'https://duckduckgo.com/html?{query}&s={offset}'
-locale = 'us-en'

+# specific xpath variables
+result_xpath = '//div[@class="results_links results_links_deep web-result"]'  # noqa
+url_xpath = './/a[@class="large"]/@href'
+title_xpath = './/a[@class="large"]//text()'
+content_xpath = './/div[@class="snippet"]//text()'
+

+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 30
-    q = urlencode({'q': query,
-                   'l': locale})
-    params['url'] = url.format(query=q, offset=offset)
+
+    if params['language'] == 'all':
+        locale = 'en-us'
+    else:
+        locale = params['language'].replace('_','-').lower()
+
+    params['url'] = url.format(
+        query=urlencode({'q': query, 'kl': locale}),
+        offset=offset)
+
     return params

+# get response from search-request
 def response(resp):
-    result_xpath = '//div[@class="results_links results_links_deep web-result"]'  # noqa
-    url_xpath = './/a[@class="large"]/@href'
-    title_xpath = './/a[@class="large"]//text()'
-    content_xpath = './/div[@class="snippet"]//text()'
     results = []

     doc = fromstring(resp.text)

+    # parse results
     for r in doc.xpath(result_xpath):
         try:
             res_url = r.xpath(url_xpath)[-1]
         except:
             continue
+
         if not res_url:
             continue
+
         title = html_to_text(''.join(r.xpath(title_xpath)))
         content = html_to_text(''.join(r.xpath(content_xpath)))
+
+        # append result
         results.append({'title': title,
                         'content': content,
                         'url': res_url})

+    # return results
     return results
-
-
-#from json import loads
-#search_url = url + 'd.js?{query}&p=1&s={offset}'
-#
-#paging = True
-#
-#
-#def request(query, params):
-#    offset = (params['pageno'] - 1) * 30
-#    q = urlencode({'q': query,
-#                   'l': locale})
-#    params['url'] = search_url.format(query=q, offset=offset)
-#    return params
-#
-#
-#def response(resp):
-#    results = []
-#    search_res = loads(resp.text[resp.text.find('[{'):-2])[:-1]
-#    for r in search_res:
-#        if not r.get('t'):
-#            continue
-#        results.append({'title': r['t'],
-#                        'content': html_to_text(r['a']),
-#                        'url': r['u']})
-#    return results
diff --git a/searx/engines/dummy.py b/searx/engines/dummy.py
index 4586760a0..5a2cdf6b5 100644
--- a/searx/engines/dummy.py
+++ b/searx/engines/dummy.py
@@ -1,6 +1,14 @@
+## Dummy
+#
+# @results empty array
+# @stable yes
+
+
+# do search-request
 def request(query, params):
     return params

+
+# get response from search-request
 def response(resp):
     return []
diff --git a/searx/engines/flickr.py b/searx/engines/flickr.py
index 59513b41c..4ec2841dd 100644
--- a/searx/engines/flickr.py
+++ b/searx/engines/flickr.py
@@ -1,48 +1,54 @@
 #!/usr/bin/env python

 from urllib import urlencode
-from json import loads
-#from urlparse import urljoin
+#from json import loads
+from urlparse import urljoin
+from lxml import html
+from time import time

 categories = ['images']

-# url = 'https://secure.flickr.com/'
-# search_url = url+'search/?{query}&page={page}'
-# results_xpath = '//div[@id="thumbnails"]//a[@class="rapidnofollow photo-click" and @data-track="photo-click"]'  # noqa
+url = 'https://secure.flickr.com/'
+search_url = url+'search/?{query}&page={page}'
+results_xpath = '//div[@class="view display-item-tile"]/figure/div'

 paging = True

-# text=[query]
-# TODO clean "extras"
-search_url = 'https://api.flickr.com/services/rest?extras=can_addmeta%2Ccan_comment%2Ccan_download%2Ccan_share%2Ccontact%2Ccount_comments%2Ccount_faves%2Ccount_notes%2Cdate_taken%2Cdate_upload%2Cdescription%2Cicon_urls_deep%2Cisfavorite%2Cispro%2Clicense%2Cmedia%2Cneeds_interstitial%2Cowner_name%2Cowner_datecreate%2Cpath_alias%2Crealname%2Csafety_level%2Csecret_k%2Csecret_h%2Curl_c%2Curl_h%2Curl_k%2Curl_l%2Curl_m%2Curl_n%2Curl_o%2Curl_q%2Curl_s%2Curl_sq%2Curl_t%2Curl_z%2Cvisibility&per_page=50&page={page}&{query}&sort=relevance&method=flickr.photos.search&api_key=ad11b34c341305471e3c410a02e671d0&format=json'  # noqa
-

 def request(query, params):
     params['url'] = search_url.format(query=urlencode({'text': query}),
                                       page=params['pageno'])
-    #params['url'] = search_url.format(query=urlencode({'q': query}),
-    #                                  page=params['pageno'])
+    time_string = str(int(time())-3)
+    params['cookies']['BX'] = '3oqjr6d9nmpgl&b=3&s=dh'
+    params['cookies']['xb'] = '421409'
+    params['cookies']['localization'] = 'en-us'
+    params['cookies']['flrbp'] = time_string +\
+        '-3a8cdb85a427a33efda421fbda347b2eaf765a54'
+    params['cookies']['flrbs'] = time_string +\
+        '-ed142ae8765ee62c9ec92a9513665e0ee1ba6776'
+    params['cookies']['flrb'] = '9'
     return params


 def response(resp):
     results = []
-    images = loads(resp.text[14:-1])["photos"]["photo"]
-    for i in images:
-        results.append({'url': i['url_s'],
-                        'title': i['title'],
-                        'img_src': i['url_s'],
+    dom = html.fromstring(resp.text)
+    for result in dom.xpath(results_xpath):
+        img = result.xpath('.//img')
+
+        if not img:
+            continue
+
+        img = img[0]
+        img_src = 'https:'+img.attrib.get('src')
+
+        if not img_src:
+            continue
+
+        href = urljoin(url, result.xpath('.//a')[0].attrib.get('href'))
+        title = img.attrib.get('alt', '')
+        results.append({'url': href,
+                        'title': title,
+                        'img_src': img_src,
                         'template': 'images.html'})
-    #dom = html.fromstring(resp.text)
-    #for result in dom.xpath(results_xpath):
-    #    href = urljoin(url, result.attrib.get('href'))
-    #    img = result.xpath('.//img')[0]
-    #    title = img.attrib.get('alt', '')
-    #    img_src = img.attrib.get('data-defer-src')
-    #    if not img_src:
-    #        continue
-    #    results.append({'url': href,
-    #                    'title': title,
-    #                    'img_src': img_src,
-    #                    'template': 'images.html'})
     return results
diff --git a/searx/engines/generalfile.py b/searx/engines/generalfile.py
new file mode 100644
index 000000000..11d8b6955
--- /dev/null
+++ b/searx/engines/generalfile.py
@@ -0,0 +1,60 @@
+## General Files (Files)
+#
+# @website http://www.general-files.org
+# @provide-api no (nothing found)
+#
+# @using-api no (because nothing found)
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content
+#
+# @todo detect torrents?
+
+from lxml import html
+
+# engine dependent config
+categories = ['files']
+paging = True
+
+# search-url
+base_url = 'http://www.general-file.com'
+search_url = base_url + '/files-{letter}/{query}/{pageno}'
+
+# specific xpath variables
+result_xpath = '//table[@class="block-file"]'
+title_xpath = './/h2/a//text()'
+url_xpath = './/h2/a/@href'
+content_xpath = './/p//text()'
+
+
+# do search-request
+def request(query, params):
+
+    params['url'] = search_url.format(query=query,
+                                      letter=query[0],
+                                      pageno=params['pageno'])
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.text)
+
+    # parse results
+    for result in dom.xpath(result_xpath):
+        url = result.xpath(url_xpath)[0]
+
+        # skip fast download links
+        if not url.startswith('/'):
+            continue
+
+        # append result
+        results.append({'url': base_url + url,
+                        'title': ''.join(result.xpath(title_xpath)),
+                        'content': ''.join(result.xpath(content_xpath))})
+
+    # return results
+    return results
diff --git a/searx/engines/github.py b/searx/engines/github.py
index be2cfe7c5..53fec029f 100644
--- a/searx/engines/github.py
+++ b/searx/engines/github.py
@@ -1,32 +1,59 @@
+## Github (It)
+#
+# @website https://github.com/
+# @provide-api yes (https://developer.github.com/v3/)
+#
+# @using-api yes
+# @results JSON
+# @stable yes (using api)
+# @parse url, title, content
+
 from urllib import urlencode
 from json import loads
 from cgi import escape

+# engine dependent config
 categories = ['it']

+# search-url
 search_url = 'https://api.github.com/search/repositories?sort=stars&order=desc&{query}'  # noqa

 accept_header = 'application/vnd.github.preview.text-match+json'

+# do search-request
 def request(query, params):
-    global search_url
     params['url'] = search_url.format(query=urlencode({'q': query}))
+
     params['headers']['Accept'] = accept_header
+
     return params

+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)
+
+    # check if items are received
     if not 'items' in search_res:
-        return results
+        return []
+
+    # parse results
     for res in search_res['items']:
         title = res['name']
         url = res['html_url']
+
         if res['description']:
             content = escape(res['description'][:500])
         else:
             content = ''
-        results.append({'url': url, 'title': title, 'content': content})
+
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
+
+    # return results
     return results
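The Accept header added above opts in to GitHub's text-match preview media type. Outside searx the same call could look like this (a sketch using the requests library; the query value is arbitrary):

    import requests
    from urllib import urlencode

    search_url = 'https://api.github.com/search/repositories?sort=stars&order=desc&{query}'
    resp = requests.get(search_url.format(query=urlencode({'q': 'searx'})),
                        headers={'Accept': 'application/vnd.github.preview.text-match+json'})
    for repo in resp.json().get('items', [])[:3]:
        print(repo['html_url'])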
diff --git a/searx/engines/google.py b/searx/engines/google.py
index 2c6a98af3..9dbe8b8f0 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -1,37 +1,115 @@
-#!/usr/bin/env python
+## Google (Web)
+#
+# @website https://www.google.com
+# @provide-api yes (https://developers.google.com/custom-search/)
+#
+# @using-api no
+# @results HTML
+# @stable no (HTML can change)
+# @parse url, title, content, suggestion

 from urllib import urlencode
-from json import loads
+from urlparse import unquote,urlparse,parse_qsl
+from lxml import html
+from searx.engines.xpath import extract_text, extract_url

+# engine dependent config
 categories = ['general']
-
-url = 'https://ajax.googleapis.com/'
-search_url = url + 'ajax/services/search/web?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}'  # noqa
-
 paging = True
 language_support = True

+# search-url
+google_hostname = 'www.google.com'
+search_path = '/search'
+redirect_path = '/url'
+images_path = '/images'
+search_url = 'https://' + google_hostname + search_path + '?{query}&start={offset}&gbv=1'
+
+# specific xpath variables
+results_xpath= '//li[@class="g"]'
+url_xpath = './/h3/a/@href'
+title_xpath = './/h3'
+content_xpath = './/span[@class="st"]'
+suggestion_xpath = '//p[@class="_Bmc"]'
+
+images_xpath = './/div/a'
+image_url_xpath = './@href'
+image_img_src_xpath = './img/@src'

+# remove google-specific tracking-url
+def parse_url(url_string):
+    parsed_url = urlparse(url_string)
+    if parsed_url.netloc in [google_hostname, ''] and parsed_url.path==redirect_path:
+        query = dict(parse_qsl(parsed_url.query))
+        return query['q']
+    else:
+        return url_string
+

+# do search-request
 def request(query, params):
-    offset = (params['pageno'] - 1) * 8
-    language = 'en-US'
-    if params['language'] != 'all':
-        language = params['language'].replace('_', '-')
+    offset = (params['pageno'] - 1) * 10
+
+    if params['language'] == 'all':
+        language = 'en'
+    else:
+        language = params['language'].replace('_','-').lower()
+
     params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'q': query}),
-                                      language=language)
+                                      query=urlencode({'q': query}))
+
+    params['headers']['Accept-Language'] = language
+
     return params

+# get response from search-request
 def response(resp):
     results = []
-    search_res = loads(resp.text)

-    if not search_res.get('responseData', {}).get('results'):
-        return []
+    dom = html.fromstring(resp.text)
+
+    # parse results
+    for result in dom.xpath(results_xpath):
+        title = extract_text(result.xpath(title_xpath)[0])
+        try:
+            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
+            parsed_url = urlparse(url)
+            if parsed_url.netloc==google_hostname and parsed_url.path==search_path:
+                # remove the link to google news
+                continue
+
+            if parsed_url.netloc==google_hostname and parsed_url.path==images_path:
+                # images result
+                results = results + parse_images(result)
+            else:
+                # normal result
+                content = extract_text(result.xpath(content_xpath)[0])
+
+                # append result
+                results.append({'url': url,
+                                'title': title,
+                                'content': content})
+        except:
+            continue
+
+    # parse suggestion
+    for suggestion in dom.xpath(suggestion_xpath):
+        # append suggestion
+        results.append({'suggestion': extract_text(suggestion)})
+
+    # return results
+    return results
+

+def parse_images(result):
+    results = []
+    for image in result.xpath(images_xpath):
+        url = parse_url(extract_text(image.xpath(image_url_xpath)[0]))
+        img_src = extract_text(image.xpath(image_img_src_xpath)[0])
+
+        # append result
+        results.append({'url': url,
+                        'title': '',
+                        'content': '',
+                        'img_src': img_src,
+                        'template': 'images.html'})

-    for result in search_res['responseData']['results']:
-        results.append({'url': result['unescapedUrl'],
-                        'title': result['titleNoFormatting'],
-                        'content': result['content']})
     return results
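parse_url() above unwraps Google's /url?q=<target> redirect links. A standalone check with a hand-built tracking URL:

    from urlparse import urlparse, parse_qsl

    tracking = 'https://www.google.com/url?q=https://example.com/&sa=U'
    parsed = urlparse(tracking)
    if parsed.path == '/url':
        print(dict(parse_qsl(parsed.query))['q'])   # -> https://example.com/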
diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py
index a6837f039..6c99f2801 100644
--- a/searx/engines/google_images.py
+++ b/searx/engines/google_images.py
@@ -1,36 +1,58 @@
-#!/usr/bin/env python
+## Google (Images)
+#
+# @website https://www.google.com
+# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated!
+#
+# @using-api yes
+# @results JSON
+# @stable yes (but deprecated)
+# @parse url, title, img_src

 from urllib import urlencode
 from json import loads

+# engine dependent config
 categories = ['images']
+paging = True

+# search-url
 url = 'https://ajax.googleapis.com/'
 search_url = url + 'ajax/services/search/images?v=1.0&start={offset}&rsz=large&safe=off&filter=off&{query}'  # noqa

+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 8
+
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       offset=offset)
+
     return params

+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)
-    if not search_res.get('responseData'):
-        return []
-    if not search_res['responseData'].get('results'):
+
+    # return empty array if there are no results
+    if not search_res.get('responseData', {}).get('results'):
         return []
+
+    # parse results
     for result in search_res['responseData']['results']:
         href = result['originalContextUrl']
         title = result['title']
         if not result['url']:
             continue
+
+        # append result
         results.append({'url': href,
                         'title': title,
                         'content': '',
                         'img_src': result['url'],
                         'template': 'images.html'})
+
+    # return results
     return results
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index 72b7a0661..becc7e21d 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -1,43 +1,62 @@
-#!/usr/bin/env python
+## Google (News)
+#
+# @website https://www.google.com
+# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated!
+#
+# @using-api yes
+# @results JSON
+# @stable yes (but deprecated)
+# @parse url, title, content, publishedDate

 from urllib import urlencode
 from json import loads
 from dateutil import parser

+# engine dependent config
 categories = ['news']
+paging = True
+language_support = True

+# search-url
 url = 'https://ajax.googleapis.com/'
 search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}'  # noqa

-paging = True
-language_support = True
-
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 8
+
     language = 'en-US'
     if params['language'] != 'all':
         language = params['language'].replace('_', '-')
+
     params['url'] = search_url.format(offset=offset,
                                       query=urlencode({'q': query}),
                                       language=language)
+
     return params

+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)

+    # return empty array if there are no results
     if not search_res.get('responseData', {}).get('results'):
         return []

+    # parse results
     for result in search_res['responseData']['results']:
-
-# Mon, 10 Mar 2014 16:26:15 -0700
+        # parse publishedDate
         publishedDate = parser.parse(result['publishedDate'])

+        # append result
         results.append({'url': result['unescapedUrl'],
                         'title': result['titleNoFormatting'],
                         'publishedDate': publishedDate,
                         'content': result['content']})
+
+    # return results
     return results
diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py
index f8cfb9afa..4a8b0e8b8 100644
--- a/searx/engines/mediawiki.py
+++ b/searx/engines/mediawiki.py
@@ -1,22 +1,78 @@
+## general mediawiki-engine (Web)
+#
+# @website websites built on mediawiki (https://www.mediawiki.org)
+# @provide-api yes (http://www.mediawiki.org/wiki/API:Search)
+#
+# @using-api yes
+# @results JSON
+# @stable yes
+# @parse url, title
+#
+# @todo content
+
 from json import loads
+from string import Formatter
 from urllib import urlencode, quote

-url = 'https://en.wikipedia.org/'
-
-search_url = url + 'w/api.php?action=query&list=search&{query}&srprop=timestamp&format=json&sroffset={offset}'  # noqa
+# engine dependent config
+categories = ['general']
+language_support = True
+paging = True
+number_of_results = 1

-number_of_results = 10
+# search-url
+base_url = 'https://{language}.wikipedia.org/'
+search_url = base_url + 'w/api.php?action=query'\
+    '&list=search'\
+    '&{query}'\
+    '&srprop=timestamp'\
+    '&format=json'\
+    '&sroffset={offset}'\
+    '&srlimit={limit}'

+# do search-request
 def request(query, params):
-    offset = (params['pageno'] - 1) * 10
-    params['url'] = search_url.format(query=urlencode({'srsearch': query}),
-                                      offset=offset)
+    offset = (params['pageno'] - 1) * number_of_results
+
+    string_args = dict(query=urlencode({'srsearch': query}),
+                       offset=offset,
+                       limit=number_of_results)
+
+    format_strings = list(Formatter().parse(base_url))
+
+    if params['language'] == 'all':
+        language = 'en'
+    else:
+        language = params['language'].split('_')[0]
+
+    if len(format_strings) > 1:
+        string_args['language'] = language
+
+    # write search-language back to params, required in response
+    params['language'] = language
+
+    params['url'] = search_url.format(**string_args)
+
     return params

+# get response from search-request
 def response(resp):
+    results = []
+
     search_results = loads(resp.text)
-    res = search_results.get('query', {}).get('search', [])
-    return [{'url': url + 'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8')),  # noqa
-             'title': result['title']} for result in res[:int(number_of_results)]]
+
+    # return empty array if there are no results
+    if not search_results.get('query', {}).get('search'):
+        return []
+
+    # parse results
+    for result in search_results['query']['search']:
+        url = base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8'))
+
+        # append result
+        results.append({'url': url,
+                        'title': result['title'],
+                        'content': ''})
+
+    # return results
+    return results
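string.Formatter is used above to detect whether the configured base_url still carries a {language} placeholder, so the engine also works against single-language mediawiki installations. An isolated illustration (the second URL is a hypothetical non-localized wiki):

    from string import Formatter

    for base_url in ('https://{language}.wikipedia.org/',
                     'https://wiki.example.org/'):
        fields = [f for _, f, _, _ in Formatter().parse(base_url) if f]
        print(base_url + ' -> needs language: ' + str('language' in fields))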
diff --git a/searx/engines/openstreetmap.py b/searx/engines/openstreetmap.py
new file mode 100644
index 000000000..ea7251486
--- /dev/null
+++ b/searx/engines/openstreetmap.py
@@ -0,0 +1,47 @@
+## OpenStreetMap (Map)
+#
+# @website https://openstreetmap.org/
+# @provide-api yes (http://wiki.openstreetmap.org/wiki/Nominatim)
+#
+# @using-api yes
+# @results JSON
+# @stable yes
+# @parse url, title
+
+from json import loads
+
+# engine dependent config
+categories = ['map']
+paging = False
+
+# search-url
+url = 'https://nominatim.openstreetmap.org/search/{query}?format=json'
+
+result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}'
+
+
+# do search-request
+def request(query, params):
+    params['url'] = url.format(query=query)
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+    json = loads(resp.text)
+
+    # parse results
+    for r in json:
+        title = r['display_name']
+        osm_type = r.get('osm_type', r.get('type'))
+        url = result_base_url.format(osm_type=osm_type,
+                                     osm_id=r['osm_id'])
+
+        # append result
+        results.append({'title': title,
+                        'content': '',
+                        'url': url})
+
+    # return results
+    return results
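For reference, the shape of a Nominatim record the loop above consumes, reduced to the fields actually used (a hand-written sample, not live data):

    result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}'

    r = {'display_name': 'Berlin, Deutschland',
         'osm_type': 'relation',
         'osm_id': 62422}
    print(result_base_url.format(osm_type=r.get('osm_type', r.get('type')),
                                 osm_id=r['osm_id']))
    # -> https://openstreetmap.org/relation/62422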
diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py
index bb4886868..9533b629e 100644
--- a/searx/engines/piratebay.py
+++ b/searx/engines/piratebay.py
@@ -1,39 +1,61 @@
+## Piratebay (Videos, Music, Files)
+#
+# @website https://thepiratebay.se
+# @provide-api no (nothing found)
+#
+# @using-api no
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content, seed, leech, magnetlink
+
 from urlparse import urljoin
 from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter

-categories = ['videos', 'music']
+# engine dependent config
+categories = ['videos', 'music', 'files']
+paging = True

+# search-url
 url = 'https://thepiratebay.se/'
 search_url = url + 'search/{search_term}/{pageno}/99/{search_type}'
-search_types = {'videos': '200',
+
+# piratebay specific type-definitions
+search_types = {'files': '0',
                 'music': '100',
-                'files': '0'}
+                'videos': '200'}

+# specific xpath variables
 magnet_xpath = './/a[@title="Download this torrent using magnet"]'
 content_xpath = './/font[@class="detDesc"]//text()'

-paging = True
-
+# do search-request
 def request(query, params):
-    search_type = search_types.get(params['category'], '200')
+    search_type = search_types.get(params['category'], '0')
+
     params['url'] = search_url.format(search_term=quote(query),
                                       search_type=search_type,
                                       pageno=params['pageno'] - 1)
+
     return params

+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
+
     search_res = dom.xpath('//table[@id="searchResult"]//tr')

+    # return empty array if nothing is found
     if not search_res:
-        return results
+        return []

+    # parse results
     for result in search_res[1:]:
         link = result.xpath('.//div[@class="detName"]//a')[0]
         href = urljoin(url, link.attrib.get('href'))
@@ -41,17 +63,21 @@ def response(resp):
         content = escape(' '.join(result.xpath(content_xpath)))
         seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]

+        # convert seed to int if possible
         if seed.isdigit():
             seed = int(seed)
         else:
             seed = 0

+        # convert leech to int if possible
         if leech.isdigit():
             leech = int(leech)
         else:
             leech = 0

         magnetlink = result.xpath(magnet_xpath)[0]
+
+        # append result
         results.append({'url': href,
                         'title': title,
                         'content': content,
@@ -60,4 +86,5 @@ def response(resp):
                         'magnetlink': magnetlink.attrib['href'],
                         'template': 'torrent.html'})

+    # return results sorted by seeder
     return sorted(results, key=itemgetter('seed'), reverse=True)
diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py
index e28fb1600..aebea239f 100644
--- a/searx/engines/soundcloud.py
+++ b/searx/engines/soundcloud.py
@@ -1,31 +1,55 @@
+## Soundcloud (Music)
+#
+# @website https://soundcloud.com
+# @provide-api yes (https://developers.soundcloud.com/)
+#
+# @using-api yes
+# @results JSON
+# @stable yes
+# @parse url, title, content
+
 from json import loads
 from urllib import urlencode

+# engine dependent config
 categories = ['music']
+paging = True

+# api-key
 guest_client_id = 'b45b1aa10f1ac2941910a7f0d10f8e28'

-url = 'https://api.soundcloud.com/'
-search_url = url + 'search?{query}&facet=model&limit=20&offset={offset}&linked_partitioning=1&client_id='+guest_client_id  # noqa
-paging = True
+# search-url
+url = 'https://api.soundcloud.com/'
+search_url = url + 'search?{query}&facet=model&limit=20&offset={offset}&linked_partitioning=1&client_id={client_id}'

+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 20
+
     params['url'] = search_url.format(query=urlencode({'q': query}),
-                                      offset=offset)
+                                      offset=offset,
+                                      client_id=guest_client_id)
+
     return params

+# get response from search-request
 def response(resp):
-    global base_url
     results = []
+
     search_res = loads(resp.text)
+
+    # parse results
     for result in search_res.get('collection', []):
         if result['kind'] in ('track', 'playlist'):
             title = result['title']
             content = result['description']
+
+            # append result
             results.append({'url': result['permalink_url'],
                             'title': title,
                             'content': content})
+
+    # return results
     return results
diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py
index e24b309c1..edbe74a70 100644
--- a/searx/engines/stackoverflow.py
+++ b/searx/engines/stackoverflow.py
@@ -1,30 +1,58 @@
+## Stackoverflow (It)
+#
+# @website https://stackoverflow.com/
+# @provide-api not clear (https://api.stackexchange.com/docs/advanced-search)
+#
+# @using-api no
+# @results HTML
+# @stable no (HTML can change)
+# @parse url, title, content
+
 from urlparse import urljoin
 from cgi import escape
 from urllib import urlencode
 from lxml import html

+# engine dependent config
 categories = ['it']
+paging = True

+# search-url
 url = 'http://stackoverflow.com/'
 search_url = url+'search?{query}&page={pageno}'
-result_xpath = './/div[@class="excerpt"]//text()'

-paging = True
+# specific xpath variables
+results_xpath = '//div[contains(@class,"question-summary")]'
+link_xpath = './/div[@class="result-link"]//a|.//div[@class="summary"]//h3//a'
+title_xpath = './/text()'
+content_xpath = './/div[@class="excerpt"]//text()'

+# do search-request
 def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       pageno=params['pageno'])
+
     return params

+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
-    for result in dom.xpath('//div[@class="question-summary search-result"]'):
-        link = result.xpath('.//div[@class="result-link"]//a')[0]
+
+    # parse results
+    for result in dom.xpath(results_xpath):
+        link = result.xpath(link_xpath)[0]
         href = urljoin(url, link.attrib.get('href'))
-        title = escape(' '.join(link.xpath('.//text()')))
-        content = escape(' '.join(result.xpath(result_xpath)))
-        results.append({'url': href, 'title': title, 'content': content})
+        title = escape(' '.join(link.xpath(title_xpath)))
+        content = escape(' '.join(result.xpath(content_xpath)))
+
+        # append result
+        results.append({'url': href,
+                        'title': title,
+                        'content': content})
+
+    # return results
     return results
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index f5a652317..2adbfb3e4 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -1,46 +1,79 @@
+## Startpage (Web)
+#
+# @website https://startpage.com
+# @provide-api no (nothing found)
+#
+# @using-api no
+# @results HTML
+# @stable no (HTML can change)
+# @parse url, title, content
+#
+# @todo paging
+
 from urllib import urlencode
 from lxml import html
+from cgi import escape
+import re
+
+# engine dependent config
+categories = ['general']
+# there is a mechanism to block "bot" searches (probably the parameter qid);
+# it requires storing qids between multiple search calls
+#paging = False
+language_support = True

-base_url = None
-search_url = None
+# search-url
+base_url = 'https://startpage.com/'
+search_url = base_url + 'do/search'

-# TODO paging
-paging = False
-# TODO complete list of country mapping
-country_map = {'en_US': 'eng',
-               'en_UK': 'uk',
-               'nl_NL': 'ned'}
+# specific xpath variables
+# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
+# not ads: div[@class="result"] are the direct childs of div[@id="results"]
+results_xpath = '//div[@class="result"]'
+link_xpath = './/h3/a'

+# do search-request
 def request(query, params):
+    offset = (params['pageno'] - 1) * 10
     query = urlencode({'q': query})[2:]
+
     params['url'] = search_url
     params['method'] = 'POST'
     params['data'] = {'query': query,
-                      'startat': (params['pageno'] - 1) * 10}  # offset
-    country = country_map.get(params['language'], 'eng')
-    params['cookies']['preferences'] = \
-        'lang_homepageEEEs/air/{country}/N1NsslEEE1N1Nfont_sizeEEEmediumN1Nrecent_results_filterEEE1N1Nlanguage_uiEEEenglishN1Ndisable_open_in_new_windowEEE0N1Ncolor_schemeEEEnewN1Nnum_of_resultsEEE10N1N'.format(country=country)  # noqa
+                      'startat': offset}
+
+    # set language if specified
+    if params['language'] != 'all':
+        params['data']['with_language'] = 'lang_' + params['language'].split('_')[0]
+
     return params

+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.content)
-    # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
-    # not ads: div[@class="result"] are the direct childs of div[@id="results"]
-    for result in dom.xpath('//div[@class="result"]'):
-        link = result.xpath('.//h3/a')[0]
+
+    # parse results
+    for result in dom.xpath(results_xpath):
+        link = result.xpath(link_xpath)[0]
         url = link.attrib.get('href')
-        if url.startswith('http://www.google.')\
-           or url.startswith('https://www.google.'):
+        title = escape(link.text_content())
+
+        # block google-ad url's
+        if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url):
             continue
-        title = link.text_content()

-        content = ''
         if result.xpath('./p[@class="desc"]'):
-            content = result.xpath('./p[@class="desc"]')[0].text_content()
+            content = escape(result.xpath('./p[@class="desc"]')[0].text_content())
+        else:
+            content = ''

-        results.append({'url': url, 'title': title, 'content': content})
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})

+    # return results
     return results
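The startpage engine now submits the query as POST form data instead of URL parameters. An equivalent standalone request might look like this (a sketch using the requests library; whether Startpage answers it without the engine's remaining setup is not guaranteed):

    import requests

    data = {'query': 'searx', 'startat': 0, 'with_language': 'lang_en'}
    resp = requests.post('https://startpage.com/do/search', data=data)
    print(resp.status_code)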
diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py
index 23393ac4d..8de78144e 100644
--- a/searx/engines/twitter.py
+++ b/searx/engines/twitter.py
@@ -1,32 +1,63 @@
+## Twitter (Social media)
+#
+# @website https://twitter.com/
+# @provide-api yes (https://dev.twitter.com/docs/using-search)
+#
+# @using-api no
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content
+#
+# @todo publishedDate
+
 from urlparse import urljoin
 from urllib import urlencode
 from lxml import html
 from cgi import escape

+# engine dependent config
 categories = ['social media']
+language_support = True

+# search-url
 base_url = 'https://twitter.com/'
 search_url = base_url+'search?'
+
+# specific xpath variables
+results_xpath = '//li[@data-item-type="tweet"]'
+link_xpath = './/small[@class="time"]//a'
 title_xpath = './/span[@class="username js-action-profile-name"]//text()'
 content_xpath = './/p[@class="js-tweet-text tweet-text"]//text()'

+# do search-request
 def request(query, params):
-    global search_url
     params['url'] = search_url + urlencode({'q': query})
+
+    # set language if specified
+    if params['language'] != 'all':
+        params['cookies']['lang'] = params['language'].split('_')[0]
+
     return params

+# get response from search-request
 def response(resp):
-    global base_url
     results = []
+
     dom = html.fromstring(resp.text)
-    for tweet in dom.xpath('//li[@data-item-type="tweet"]'):
-        link = tweet.xpath('.//small[@class="time"]//a')[0]
+
+    # parse results
+    for tweet in dom.xpath(results_xpath):
+        link = tweet.xpath(link_xpath)[0]
         url = urljoin(base_url, link.attrib.get('href'))
         title = ''.join(tweet.xpath(title_xpath))
         content = escape(''.join(tweet.xpath(content_xpath)))
+
+        # append result
         results.append({'url': url,
                         'title': title,
                         'content': content})
+
+    # return results
     return results
diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py
index 94a6dd545..2a91e76fa 100644
--- a/searx/engines/vimeo.py
+++ b/searx/engines/vimeo.py
@@ -1,43 +1,58 @@
+## Vimeo (Videos)
+#
+# @website https://vimeo.com/
+# @provide-api yes (http://developer.vimeo.com/api), they have a maximum count of queries/hour
+#
+# @using-api no (TODO, rewrite to api)
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, publishedDate, thumbnail
+#
+# @todo rewrite to api
+# @todo set content-parameter with correct data
+
 from urllib import urlencode
 from HTMLParser import HTMLParser
 from lxml import html
 from searx.engines.xpath import extract_text
 from dateutil import parser

-base_url = 'http://vimeo.com'
-search_url = base_url + '/search?{query}'
-url_xpath = None
-content_xpath = None
-title_xpath = None
-results_xpath = ''
-content_tpl = '<a href="{0}"> <img src="{2}"/> </a>'
-publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'
+# engine dependent config
+categories = ['videos']
+paging = True

-# the cookie set by vimeo contains all the following values,
-# but only __utma seems to be requiered
-cookie = {
-    #'vuid':'918282893.1027205400'
-    # 'ab_bs':'%7B%223%22%3A279%7D'
-    '__utma': '00000000.000#0000000.0000000000.0000000000.0000000000.0'
-    # '__utmb':'18302654.1.10.1388942090'
-    #, '__utmc':'18302654'
-    #, '__utmz':'18#302654.1388942090.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)'  # noqa
-    #, '__utml':'search'
-}
+# search-url
+base_url = 'https://vimeo.com'
+search_url = base_url + '/search/page:{pageno}?{query}'
+
+# specific xpath variables
+url_xpath = './a/@href'
+content_xpath = './a/img/@src'
+title_xpath = './a/div[@class="data"]/p[@class="title"]/text()'
+results_xpath = '//div[@id="browse_content"]/ol/li'
 publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'

+# do search-request
 def request(query, params):
-    params['url'] = search_url.format(query=urlencode({'q': query}))
-    params['cookies'] = cookie
+    params['url'] = search_url.format(pageno=params['pageno'] ,
+                                      query=urlencode({'q': query}))
+
+    # TODO required?
+    params['cookies']['__utma'] = '00000000.000#0000000.0000000000.0000000000.0000000000.0'
+
     return params

+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)

     p = HTMLParser()

+    # parse results
     for result in dom.xpath(results_xpath):
         url = base_url + result.xpath(url_xpath)[0]
         title = p.unescape(extract_text(result.xpath(title_xpath)))
@@ -45,10 +60,13 @@ def response(resp):
         publishedDate = parser.parse(extract_text(
             result.xpath(publishedDate_xpath)[0]))

+        # append result
         results.append({'url': url,
                         'title': title,
-                        'content': content_tpl.format(url, title, thumbnail),
+                        'content': '',
                         'template': 'videos.html',
                         'publishedDate': publishedDate,
                         'thumbnail': thumbnail})
+
+    # return results
     return results
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
deleted file mode 100644
index 1e2a798cc..000000000
--- a/searx/engines/wikipedia.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from json import loads
-from urllib import urlencode, quote
-
-url = 'https://{language}.wikipedia.org/'
-
-search_url = url + 'w/api.php?action=query&list=search&{query}&srprop=timestamp&format=json&sroffset={offset}'  # noqa
-
-number_of_results = 10
-
-language_support = True
-
-
-def request(query, params):
-    offset = (params['pageno'] - 1) * 10
-    if params['language'] == 'all':
-        language = 'en'
-    else:
-        language = params['language'].split('_')[0]
-    params['language'] = language
-    params['url'] = search_url.format(query=urlencode({'srsearch': query}),
-                                      offset=offset,
-                                      language=language)
-    return params
-
-
-def response(resp):
-    search_results = loads(resp.text)
-    res = search_results.get('query', {}).get('search', [])
-    return [{'url': url.format(language=resp.search_params['language']) + 'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8')),  # noqa
-             'title': result['title']} for result in res[:int(number_of_results)]]
diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py
index efdf846ac..2345b24f3 100644
--- a/searx/engines/yacy.py
+++ b/searx/engines/yacy.py
@@ -1,40 +1,89 @@
+## Yacy (Web, Images, Videos, Music, Files)
+#
+# @website http://yacy.net
+# @provide-api yes (http://www.yacy-websuche.de/wiki/index.php/Dev:APIyacysearch)
+#
+# @using-api yes
+# @results JSON
+# @stable yes
+# @parse (general) url, title, content, publishedDate
+# @parse (images) url, title, img_src
+#
+# @todo parse video, audio and file results
+
 from json import loads
 from urllib import urlencode
+from dateutil import parser
+
+# engine dependent config
+categories = ['general', 'images'] #TODO , 'music', 'videos', 'files'
+paging = True
+language_support = True
+number_of_results = 5
+
+# search-url
+base_url = 'http://localhost:8090'
+search_url = '/yacysearch.json?{query}&startRecord={offset}&maximumRecords={limit}&contentdom={search_type}&resource=global'

-url = 'http://localhost:8090'
-search_url = '/yacysearch.json?{query}&maximumRecords=10'
+# yacy specific type-definitions
+search_types = {'general': 'text',
+                'images': 'image',
+                'files': 'app',
+                'music': 'audio',
+                'videos': 'video'}

+# do search-request
 def request(query, params):
-    params['url'] = url + search_url.format(query=urlencode({'query': query}))
+    offset = (params['pageno'] - 1) * number_of_results
+    search_type = search_types.get(params['category'], '0')
+
+    params['url'] = base_url + search_url.format(query=urlencode({'query': query}),
+                                                 offset=offset,
+                                                 limit=number_of_results,
+                                                 search_type=search_type)
+
+    # add language tag if specified
+    if params['language'] != 'all':
+        params['url'] += '&lr=lang_' + params['language'].split('_')[0]
+
     return params

+# get response from search-request
 def response(resp):
+    results = []
+
     raw_search_results = loads(resp.text)

+    # return empty array if there are no results
     if not raw_search_results:
         return []

     search_results = raw_search_results.get('channels', {})[0].get('items', [])

-    results = []
-
-    for result in search_results:
-        tmp_result = {}
-        tmp_result['title'] = result['title']
-        tmp_result['url'] = result['link']
-        tmp_result['content'] = ''
-
-        if result['description']:
-            tmp_result['content'] += result['description'] + "<br/>"
+    if resp.search_params['category'] == 'general':
+        # parse general results
+        for result in search_results:
+            publishedDate = parser.parse(result['pubDate'])

-        if result['pubDate']:
-            tmp_result['content'] += result['pubDate'] + "<br/>"
+            # append result
+            results.append({'url': result['link'],
+                            'title': result['title'],
+                            'content': result['description'],
+                            'publishedDate': publishedDate})

-        if result['size'] != '-1':
-            tmp_result['content'] += result['sizename']
+    elif resp.search_params['category'] == 'images':
+        # parse image results
+        for result in search_results:
+            # append result
+            results.append({'url': result['url'],
+                            'title': result['title'],
+                            'content': '',
+                            'img_src': result['image'],
+                            'template': 'images.html'})

-        results.append(tmp_result)
+    #TODO parse video, audio and file results

+    # return results
     return results
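Putting the yacy pieces together: the URL built for page 2 of an image search against the default localhost instance (a sketch; values taken from the constants in the diff):

    from urllib import urlencode

    base_url = 'http://localhost:8090'
    search_url = '/yacysearch.json?{query}&startRecord={offset}&maximumRecords={limit}&contentdom={search_type}&resource=global'
    print(base_url + search_url.format(query=urlencode({'query': 'searx'}),
                                       offset=5, limit=5, search_type='image'))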
diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py
index f89741839..5e34a2b07 100644
--- a/searx/engines/yahoo.py
+++ b/searx/engines/yahoo.py
@@ -1,64 +1,101 @@
-#!/usr/bin/env python
+## Yahoo (Web)
+#
+# @website https://search.yahoo.com/web
+# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
+#
+# @using-api no (because pricing)
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content, suggestion
 
 from urllib import urlencode
 from urlparse import unquote
 from lxml import html
 from searx.engines.xpath import extract_text, extract_url
 
+# engine dependent config
 categories = ['general']
-search_url = 'http://search.yahoo.com/search?{query}&b={offset}'
+paging = True
+language_support = True
+
+# search-url
+search_url = 'https://search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'
+
+# specific xpath variables
 results_xpath = '//div[@class="res"]'
 url_xpath = './/h3/a/@href'
 title_xpath = './/h3/a'
 content_xpath = './/div[@class="abstr"]'
 suggestion_xpath = '//div[@id="satat"]//a'
 
-paging = True
-
 
+# remove yahoo-specific tracking-url
 def parse_url(url_string):
     endings = ['/RS', '/RK']
     endpositions = []
     start = url_string.find('http', url_string.find('/RU=')+1)
+
     for ending in endings:
         endpos = url_string.rfind(ending)
         if endpos > -1:
             endpositions.append(endpos)
 
-    end = min(endpositions)
-    return unquote(url_string[start:end])
+    if start == 0 or len(endpositions) == 0:
+        return url_string
+    else:
+        end = min(endpositions)
+        return unquote(url_string[start:end])
 
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en'
     else:
         language = params['language'].split('_')[0]
+
     params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}))
+                                      query=urlencode({'p': query}),
+                                      lang=language)
+
+    # TODO required?
     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
         .format(lang=language)
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
 
+    # parse results
     for result in dom.xpath(results_xpath):
         try:
             url = parse_url(extract_url(result.xpath(url_xpath), search_url))
             title = extract_text(result.xpath(title_xpath)[0])
         except:
             continue
+
         content = extract_text(result.xpath(content_xpath)[0])
-        results.append({'url': url, 'title': title, 'content': content})
+
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
+
+    # if no suggestion found, return results
    if not suggestion_xpath:
         return results
 
+    # parse suggestion
     for suggestion in dom.xpath(suggestion_xpath):
+        # append suggestion
         results.append({'suggestion': extract_text(suggestion)})
 
+    # return results
     return results
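parse_url() above unwraps Yahoo's redirect links: the target URL sits percent-encoded after a '/RU=' marker and is terminated by a '/RS' or '/RK' suffix, and the new guard returns the input unchanged when no marker is present (start == 0 means find('/RU=') returned -1 and the string itself already begins with 'http'). A worked example on an invented wrapper URL of that shape (Python 2; urlparse exposes unquote, as the engine's own import shows):

    from urlparse import unquote

    wrapped = ('http://r.search.yahoo.com/_ylt=A0'
               '/RU=http%3a%2f%2fexample.org%2f/RK=0/RS=abcdef')  # invented sample

    start = wrapped.find('http', wrapped.find('/RU=') + 1)
    ends = [pos for pos in (wrapped.rfind('/RS'), wrapped.rfind('/RK')) if pos > -1]
    print unquote(wrapped[start:min(ends)])  # -> http://example.org/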
diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py
index 43da93ede..c07d7e185 100644
--- a/searx/engines/yahoo_news.py
+++ b/searx/engines/yahoo_news.py
@@ -1,4 +1,12 @@
-#!/usr/bin/env python
+## Yahoo (News)
+#
+# @website https://news.yahoo.com
+# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
+#
+# @using-api no (because pricing)
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content, publishedDate
 
 from urllib import urlencode
 from lxml import html
@@ -8,8 +16,15 @@ from datetime import datetime, timedelta
 import re
 from dateutil import parser
 
+# engine dependent config
 categories = ['news']
-search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}'
+paging = True
+language_support = True
+
+# search-url
+search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'
+
+# specific xpath variables
 results_xpath = '//div[@class="res"]'
 url_xpath = './/h3/a/@href'
 title_xpath = './/h3/a'
@@ -17,30 +32,39 @@ content_xpath = './/div[@class="abstr"]'
 publishedDate_xpath = './/span[@class="timestamp"]'
 suggestion_xpath = '//div[@id="satat"]//a'
 
-paging = True
-
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en'
     else:
         language = params['language'].split('_')[0]
+
     params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}))
+                                      query=urlencode({'p': query}),
+                                      lang=language)
+
+    # TODO required?
     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
         .format(lang=language)
 
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
 
+    # parse results
     for result in dom.xpath(results_xpath):
         url = parse_url(extract_url(result.xpath(url_xpath), search_url))
         title = extract_text(result.xpath(title_xpath)[0])
         content = extract_text(result.xpath(content_xpath)[0])
+
+        # parse publishedDate
         publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])
 
         if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
@@ -58,15 +82,11 @@ def response(resp):
         if publishedDate.year == 1900:
             publishedDate = publishedDate.replace(year=datetime.now().year)
 
+        # append result
         results.append({'url': url,
                         'title': title,
                         'content': content,
                         'publishedDate': publishedDate})
 
-    if not suggestion_xpath:
-        return results
-
-    for suggestion in dom.xpath(suggestion_xpath):
-        results.append({'suggestion': extract_text(suggestion)})
-
+    # return results
     return results
diff --git a/searx/engines/youtube.py b/searx/engines/youtube.py
index 895b55918..a3c3980af 100644
--- a/searx/engines/youtube.py
+++ b/searx/engines/youtube.py
@@ -1,42 +1,69 @@
+## Youtube (Videos)
+#
+# @website https://www.youtube.com/
+# @provide-api yes (http://gdata-samples-youtube-search-py.appspot.com/)
+#
+# @using-api yes
+# @results JSON
+# @stable yes
+# @parse url, title, content, publishedDate, thumbnail
+
 from json import loads
 from urllib import urlencode
 from dateutil import parser
 
+# engine dependent config
 categories = ['videos']
-
-search_url = ('https://gdata.youtube.com/feeds/api/videos'
-              '?alt=json&{query}&start-index={index}&max-results=25')  # noqa
-
 paging = True
+language_support = True
+
+# search-url
+base_url = 'https://gdata.youtube.com/feeds/api/videos'
+search_url = base_url + '?alt=json&{query}&start-index={index}&max-results=5'  # noqa
 
 
+# do search-request
 def request(query, params):
-    index = (params['pageno'] - 1) * 25 + 1
+    index = (params['pageno'] - 1) * 5 + 1
+
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       index=index)
+
+    # add language tag if specified
+    if params['language'] != 'all':
+        params['url'] += '&lr=' + params['language'].split('_')[0]
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     search_results = loads(resp.text)
+
+    # return empty array if there are no results
     if 'feed' not in search_results:
-        return results
+        return []
+
     feed = search_results['feed']
 
+    # parse results
     for result in feed['entry']:
         url = [x['href'] for x in result['link'] if x['type'] == 'text/html']
+
         if not url:
             return
 
+        # remove tracking
         url = url[0].replace('feature=youtube_gdata', '')
         if url.endswith('&'):
             url = url[:-1]
+
         title = result['title']['$t']
         content = ''
         thumbnail = ''
 
-#"2013-12-31T15:22:51.000Z"
         pubdate = result['published']['$t']
         publishedDate = parser.parse(pubdate)
 
@@ -49,6 +76,7 @@ def response(resp):
         else:
             content = result['content']['$t']
 
+        # append result
         results.append({'url': url,
                         'title': title,
                         'content': content,
@@ -56,4 +84,5 @@
                         'publishedDate': publishedDate,
                         'thumbnail': thumbnail})
 
+    # return results
     return results
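Date handling differs between the JSON engines and the scraped ones: the youtube and yacy feeds deliver full ISO-8601 timestamps that dateutil.parser accepts directly (the removed comment in youtube.py, "2013-12-31T15:22:51.000Z", shows the format), while Yahoo News only exposes relative ages such as "5 minutes ago", which is why its response() matches them with regexes, builds a datetime via timedelta, and patches in the current year when the parsed date came back with the placeholder year 1900. A short sketch of both paths with invented sample strings (Python 2):

    from datetime import datetime, timedelta
    import re
    from dateutil import parser

    # absolute timestamp, as delivered in the youtube/yacy feeds
    print parser.parse('2013-12-31T15:22:51.000Z')

    # relative age, as scraped from Yahoo News result timestamps
    raw = '5 minutes ago'  # invented sample
    m = re.match(r'^([0-9]+) minute(s|) ago$', raw)
    if m:
        print datetime.now() - timedelta(minutes=int(m.group(1)))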