diff options
| -rw-r--r-- | searx/engines/__init__.py | 6 | ||||
| -rw-r--r-- | searx/engines/flickr_noapi.py | 11 | ||||
| -rw-r--r-- | searx/engines/www1x.py | 82 | ||||
| -rw-r--r-- | searx/engines/yacy.py | 25 | ||||
| -rw-r--r-- | searx/query.py | 10 | ||||
| -rw-r--r-- | searx/search.py | 7 | ||||
| -rw-r--r-- | searx/settings.yml | 12 | ||||
| -rw-r--r-- | searx/tests/engines/test_www1x.py | 57 | ||||
| -rw-r--r-- | searx/tests/test_engines.py | 1 | ||||
| -rw-r--r-- | searx/utils.py | 23 | ||||
| -rw-r--r-- | searx/webapp.py | 47 |
11 files changed, 218 insertions, 63 deletions
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 643b107a5..21a307501 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -69,17 +69,17 @@ def load_engine(engine_data): engine.categories = ['general'] if not hasattr(engine, 'language_support'): - # engine.language_support = False engine.language_support = True if not hasattr(engine, 'timeout'): - # engine.language_support = False engine.timeout = settings['server']['request_timeout'] if not hasattr(engine, 'shortcut'): - # engine.shortcut = ''' engine.shortcut = '' + if not hasattr(engine, 'disabled'): + engine.disabled = False + # checking required variables for engine_attr in dir(engine): if engine_attr.startswith('_'): diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py index 73dff44c4..3a83fdc65 100644 --- a/searx/engines/flickr_noapi.py +++ b/searx/engines/flickr_noapi.py @@ -13,6 +13,10 @@ from urllib import urlencode from json import loads import re +from searx.engines import logger + + +logger = logger.getChild('flickr-noapi') categories = ['images'] @@ -62,10 +66,11 @@ def response(resp): # From the biggest to the lowest format for image_size in image_sizes: if image_size in photo['sizes']: - img_src = photo['sizes'][image_size]['displayUrl'] + img_src = photo['sizes'][image_size]['url'] break if not img_src: + logger.debug('cannot find valid image size: {0}'.format(repr(photo))) continue if 'id' not in photo['owner']: @@ -73,9 +78,9 @@ def response(resp): # For a bigger thumbnail, keep only the url_z, not the url_n if 'n' in photo['sizes']: - thumbnail_src = photo['sizes']['n']['displayUrl'] + thumbnail_src = photo['sizes']['n']['url'] elif 'z' in photo['sizes']: - thumbnail_src = photo['sizes']['z']['displayUrl'] + thumbnail_src = photo['sizes']['z']['url'] else: thumbnail_src = img_src diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py new file mode 100644 index 000000000..a68c105ce --- /dev/null +++ b/searx/engines/www1x.py @@ -0,0 +1,82 @@ +## 1x (Images) +# +# @website http://1x.com/ +# @provide-api no +# +# @using-api no +# @results HTML +# @stable no (HTML can change) +# @parse url, title, thumbnail, img_src, content + + +from urllib import urlencode +from urlparse import urljoin +from lxml import html +import string +import re + +# engine dependent config +categories = ['images'] +paging = False + +# search-url +base_url = 'http://1x.com' +search_url = base_url+'/backend/search.php?{query}' + + +# do search-request +def request(query, params): + params['url'] = search_url.format(query=urlencode({'q': query})) + + return params + + +# get response from search-request +def response(resp): + results = [] + + # get links from result-text + regex = re.compile('(</a>|<a)') + results_parts = re.split(regex, resp.text) + + cur_element = '' + + # iterate over link parts + for result_part in results_parts: + # processed start and end of link + if result_part == '<a': + cur_element = result_part + continue + elif result_part != '</a>': + cur_element += result_part + continue + + cur_element += result_part + + # fix xml-error + cur_element = string.replace(cur_element, '"></a>', '"/></a>') + + dom = html.fromstring(cur_element) + link = dom.xpath('//a')[0] + + url = urljoin(base_url, link.attrib.get('href')) + title = link.attrib.get('title', '') + + thumbnail_src = urljoin(base_url, link.xpath('.//img')[0].attrib['src']) + # TODO: get image with higher resolution + img_src = thumbnail_src + + # check if url is showing to a photo + if '/photo/' not in url: + continue + + # append result + results.append({'url': url, + 'title': title, + 'img_src': img_src, + 'content': '', + 'thumbnail_src': thumbnail_src, + 'template': 'images.html'}) + + # return results + return results diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py index 4c4fac7df..17e2a7aab 100644 --- a/searx/engines/yacy.py +++ b/searx/engines/yacy.py @@ -68,9 +68,18 @@ def response(resp): search_results = raw_search_results.get('channels', {})[0].get('items', []) - if resp.search_params['category'] == 'general': + for result in search_results: + # parse image results + if result.get('image'): + # append result + results.append({'url': result['url'], + 'title': result['title'], + 'content': '', + 'img_src': result['image'], + 'template': 'images.html'}) + # parse general results - for result in search_results: + else: publishedDate = parser.parse(result['pubDate']) # append result @@ -79,17 +88,7 @@ def response(resp): 'content': result['description'], 'publishedDate': publishedDate}) - elif resp.search_params['category'] == 'images': - # parse image results - for result in search_results: - # append result - results.append({'url': result['url'], - 'title': result['title'], - 'content': '', - 'img_src': result['image'], - 'template': 'images.html'}) - - #TODO parse video, audio and file results + #TODO parse video, audio and file results # return results return results diff --git a/searx/query.py b/searx/query.py index e491284f0..567c49e92 100644 --- a/searx/query.py +++ b/searx/query.py @@ -88,18 +88,16 @@ class Query(object): prefix = query_part[1:].replace('_', ' ') # check if prefix is equal with engine shortcut - if prefix in engine_shortcuts\ - and not engine_shortcuts[prefix] in self.blocked_engines: + if prefix in engine_shortcuts: parse_next = True self.engines.append({'category': 'none', 'name': engine_shortcuts[prefix]}) # check if prefix is equal with engine name - elif prefix in engines\ - and prefix not in self.blocked_engines: + elif prefix in engines: parse_next = True self.engines.append({'category': 'none', - 'name': prefix}) + 'name': prefix}) # check if prefix is equal with categorie name elif prefix in categories: @@ -107,7 +105,7 @@ class Query(object): # are declared under that categorie name parse_next = True self.engines.extend({'category': prefix, - 'name': engine.name} + 'name': engine.name} for engine in categories[prefix] if engine not in self.blocked_engines) diff --git a/searx/search.py b/searx/search.py index b6cf84e94..c16346f45 100644 --- a/searx/search.py +++ b/searx/search.py @@ -27,7 +27,7 @@ from searx.engines import ( categories, engines ) from searx.languages import language_codes -from searx.utils import gen_useragent +from searx.utils import gen_useragent, get_blocked_engines from searx.query import Query from searx import logger @@ -320,10 +320,7 @@ class Search(object): self.lang = 'all' # set blocked engines - if request.cookies.get('blocked_engines'): - self.blocked_engines = request.cookies['blocked_engines'].split(',') # noqa - else: - self.blocked_engines = [] + self.blocked_engines = get_blocked_engines(engines, request.cookies) self.results = [] self.suggestions = [] diff --git a/searx/settings.yml b/searx/settings.yml index 2c9441c34..8c9941b36 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -83,6 +83,11 @@ engines: engine : www500px shortcut : px + - name : 1x + engine : www1x + shortcut : 1x + disabled : True + - name : flickr categories : images shortcut : fl @@ -96,6 +101,7 @@ engines: - name : general-file engine : generalfile shortcut : gf + disabled : True - name : github engine : github @@ -121,6 +127,7 @@ engines: content_xpath : //a[@class="subtitle"] categories : files shortcut : gpa + disabled : True - name : google play movies engine : xpath @@ -130,6 +137,7 @@ engines: content_xpath : //a[@class="subtitle"] categories : videos shortcut : gpm + disabled : True - name : google play music engine : xpath @@ -139,6 +147,7 @@ engines: content_xpath : //a[@class="subtitle"] categories : music shortcut : gps + disabled : True - name : mixcloud engine : mixcloud @@ -175,6 +184,7 @@ engines: - name : searchcode code engine : searchcode_code shortcut : scc + disabled : True - name : subtitleseeker engine : subtitleseeker @@ -239,7 +249,7 @@ engines: # shortcut : ya # base_url : 'http://localhost:8090' # number_of_results : 5 -# timeout: 3.0 +# timeout : 3.0 locales: en : English diff --git a/searx/tests/engines/test_www1x.py b/searx/tests/engines/test_www1x.py new file mode 100644 index 000000000..ab4f282c1 --- /dev/null +++ b/searx/tests/engines/test_www1x.py @@ -0,0 +1,57 @@ +from collections import defaultdict +import mock +from searx.engines import www1x +from searx.testing import SearxTestCase + + +class TestWww1xEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + params = www1x.request(query, defaultdict(dict)) + self.assertTrue('url' in params) + self.assertTrue(query in params['url']) + self.assertTrue('1x.com' in params['url']) + + def test_response(self): + self.assertRaises(AttributeError, www1x.response, None) + self.assertRaises(AttributeError, www1x.response, []) + self.assertRaises(AttributeError, www1x.response, '') + self.assertRaises(AttributeError, www1x.response, '[]') + + response = mock.Mock(text='<html></html>') + self.assertEqual(www1x.response(response), []) + html = """ + <?xml version="1.0" encoding="UTF-8"?><!DOCTYPE characters + [ + <!ELEMENT characters (character*) > + <!ELEMENT character (#PCDATA ) > + + <!ENTITY iexcl "¡" > + <!ENTITY cent "¢" > + <!ENTITY pound "£" > + ] + ><root><searchresult><![CDATA[<table border="0" cellpadding="0" cellspacing="0" width="100%"> + <tr> + <td style="min-width: 220px;" valign="top"> + <div style="font-size: 30px; margin: 0px 0px 20px 0px;">Photos</div> + <div> + <a href="/photo/123456" class="dynamiclink"> +<img border="0" class="searchresult" src="/images/user/testimage-123456.jpg" style="width: 125px; height: 120px;"> + </a> + <a title="sjoerd lammers street photography" href="/member/sjoerdlammers" class="dynamiclink"> +<img border="0" class="searchresult" src="/images/profile/60c48b394c677d2fa4d9e7d263aabf44-square.jpg"> + </a> + </div> + </td> + </table> + ]]></searchresult></root> + """ + response = mock.Mock(text=html) + results = www1x.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 1) + self.assertEqual(results[0]['url'], 'http://1x.com/photo/123456') + self.assertEqual(results[0]['thumbnail_src'], 'http://1x.com/images/user/testimage-123456.jpg') + self.assertEqual(results[0]['content'], '') + self.assertEqual(results[0]['template'], 'images.html') diff --git a/searx/tests/test_engines.py b/searx/tests/test_engines.py index e66f7db28..ff8185b1e 100644 --- a/searx/tests/test_engines.py +++ b/searx/tests/test_engines.py @@ -10,6 +10,7 @@ from searx.tests.engines.test_dummy import * # noqa from searx.tests.engines.test_flickr import * # noqa from searx.tests.engines.test_flickr_noapi import * # noqa from searx.tests.engines.test_github import * # noqa +from searx.tests.engines.test_www1x import * # noqa from searx.tests.engines.test_google_images import * # noqa from searx.tests.engines.test_google_news import * # noqa from searx.tests.engines.test_kickass import * # noqa diff --git a/searx/utils.py b/searx/utils.py index ef221ef8e..c0afc94cb 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -17,11 +17,11 @@ from searx import logger logger = logger.getChild('utils') -ua_versions = ('29.0', - '30.0', - '31.0', +ua_versions = ('31.0', '32.0', - '33.0') + '33.0', + '34.0', + '35.0') ua_os = ('Windows NT 6.3; WOW64', 'X11; Linux x86_64', @@ -220,3 +220,18 @@ def dict_subset(d, properties): if k in d: result[k] = d[k] return result + + +def prettify_url(url): + if len(url) > 74: + return u'{0}[...]{1}'.format(url[:35], url[-35:]) + else: + return url + + +def get_blocked_engines(engines, cookies): + if 'blocked_engines' not in cookies: + return [engine for engine in engines if engines[engine].disabled] + + return [engine for engine in cookies.get('blocked_engines', '').split(',') + if engine in engines] diff --git a/searx/webapp.py b/searx/webapp.py index 00761404e..d92302b69 100644 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -42,7 +42,8 @@ from searx.engines import ( ) from searx.utils import ( UnicodeWriter, highlight_content, html_to_text, get_themes, - get_static_files, get_result_templates, gen_useragent, dict_subset + get_static_files, get_result_templates, gen_useragent, dict_subset, + prettify_url, get_blocked_engines ) from searx.version import VERSION_STRING from searx.languages import language_codes @@ -91,7 +92,7 @@ for indice, theme in enumerate(themes): for (dirpath, dirnames, filenames) in os.walk(theme_img_path): global_favicons[indice].extend(filenames) -cookie_max_age = 60 * 60 * 24 * 365 * 23 # 23 years +cookie_max_age = 60 * 60 * 24 * 365 * 5 # 5 years @babel.localeselector @@ -214,17 +215,19 @@ def image_proxify(url): if url.startswith('//'): url = 'https:' + url + url = url.encode('utf-8') + if not settings['server'].get('image_proxy') and not request.cookies.get('image_proxy'): return url - h = hashlib.sha256(url + settings['server']['secret_key']).hexdigest() + h = hashlib.sha256(url + settings['server']['secret_key'].encode('utf-8')).hexdigest() return '{0}?{1}'.format(url_for('image_proxy'), urlencode(dict(url=url, h=h))) def render(template_name, override_theme=None, **kwargs): - blocked_engines = request.cookies.get('blocked_engines', '').split(',') + blocked_engines = get_blocked_engines(engines, request.cookies) autocomplete = request.cookies.get('autocomplete') @@ -330,11 +333,7 @@ def index(): result['title'] = ' '.join(html_to_text(result['title']) .strip().split()) - if len(result['url']) > 74: - url_parts = result['url'][:35], result['url'][-35:] - result['pretty_url'] = u'{0}[...]{1}'.format(*url_parts) - else: - result['pretty_url'] = result['url'] + result['pretty_url'] = prettify_url(result['url']) # TODO, check if timezone is calculated right if 'publishedDate' in result: @@ -413,10 +412,7 @@ def autocompleter(): request_data = request.args # set blocked engines - if request.cookies.get('blocked_engines'): - blocked_engines = request.cookies['blocked_engines'].split(',') # noqa - else: - blocked_engines = [] + blocked_engines = get_blocked_engines(engines, request.cookies) # parse query query = Query(request_data.get('q', '').encode('utf-8'), blocked_engines) @@ -429,34 +425,29 @@ def autocompleter(): # run autocompleter completer = autocomplete_backends.get(request.cookies.get('autocomplete')) - # check if valid autocompleter is selected - if not completer: - return '', 400 - # parse searx specific autocompleter results like !bang raw_results = searx_bang(query) - # normal autocompletion results only appear if max 3. searx results returned - if len(raw_results) <= 3: + # normal autocompletion results only appear if max 3 inner results returned + if len(raw_results) <= 3 and completer: # run autocompletion raw_results.extend(completer(query.getSearchQuery())) # parse results (write :language and !engine back to result string) results = [] for result in raw_results: - result_query = query - result_query.changeSearchQuery(result) + query.changeSearchQuery(result) # add parsed result - results.append(result_query.getFullQuery()) + results.append(query.getFullQuery()) # return autocompleter results if request_data.get('format') == 'x-suggestions': return Response(json.dumps([query.query, results]), mimetype='application/json') - else: - return Response(json.dumps(results), - mimetype='application/json') + + return Response(json.dumps(results), + mimetype='application/json') @app.route('/preferences', methods=['GET', 'POST']) @@ -476,7 +467,7 @@ def preferences(): resp = make_response(redirect(url_for('index'))) if request.method == 'GET': - blocked_engines = request.cookies.get('blocked_engines', '').split(',') + blocked_engines = get_blocked_engines(engines, request.cookies) else: # on save selected_categories = [] locale = None @@ -564,12 +555,12 @@ def preferences(): @app.route('/image_proxy', methods=['GET']) def image_proxy(): - url = request.args.get('url') + url = request.args.get('url').encode('utf-8') if not url: return '', 400 - h = hashlib.sha256(url + settings['server']['secret_key']).hexdigest() + h = hashlib.sha256(url + settings['server']['secret_key'].encode('utf-8')).hexdigest() if h != request.args.get('h'): return '', 400 |