diff options
| -rw-r--r-- | .pylintrc | 2 | ||||
| -rw-r--r-- | docs/admin/installation-searx.rst | 2 | ||||
| -rw-r--r-- | docs/dev/makefile.rst | 2 | ||||
| -rw-r--r-- | searx/engines/elasticsearch.py | 142 | ||||
| -rw-r--r-- | searx/engines/qwant.py | 2 | ||||
| -rw-r--r-- | searx/engines/xpath.py | 1 | ||||
| -rw-r--r-- | searx/search.py | 27 | ||||
| -rw-r--r-- | searx/settings.yml | 20 | ||||
| -rwxr-xr-x | searx/webapp.py | 33 | ||||
| -rw-r--r-- | tests/unit/test_search.py | 14 | ||||
| -rw-r--r-- | tests/unit/test_standalone_searx.py | 3 | ||||
| -rw-r--r-- | tests/unit/test_webapp.py | 25 |
12 files changed, 250 insertions, 23 deletions
@@ -12,7 +12,7 @@ # A comma-separated list of package or module names from where C extensions may # be loaded. Extensions are loading into the active Python interpreter and may # run arbitrary code -extension-pkg-whitelist= +extension-pkg-whitelist=lxml.etree # Add files or directories to the blacklist. They should be base names, not # paths. diff --git a/docs/admin/installation-searx.rst b/docs/admin/installation-searx.rst index f1d486021..a368bfe8c 100644 --- a/docs/admin/installation-searx.rst +++ b/docs/admin/installation-searx.rst @@ -52,7 +52,7 @@ In the same shell create *virtualenv*: :end-before: END create virtualenv To install searx's dependencies, exit the searx *bash* session you opened above -and restart a new. Before install, first check if your *virualenv* was sourced +and restart a new. Before install, first check if your *virtualenv* was sourced from the login (*~/.profile*): .. kernel-include:: $DOCS_BUILD/includes/searx.rst diff --git a/docs/dev/makefile.rst b/docs/dev/makefile.rst index 62cd0a984..699729a28 100644 --- a/docs/dev/makefile.rst +++ b/docs/dev/makefile.rst @@ -68,7 +68,7 @@ Python environment ``source ./local/py3/bin/activate`` -With Makefile we do no longer need to build up the virualenv manually (as +With Makefile we do no longer need to build up the virtualenv manually (as described in the :ref:`devquickstart` guide). Jump into your git working tree and release a ``make pyenv``: diff --git a/searx/engines/elasticsearch.py b/searx/engines/elasticsearch.py new file mode 100644 index 000000000..bad65fb27 --- /dev/null +++ b/searx/engines/elasticsearch.py @@ -0,0 +1,142 @@ +from json import loads, dumps +from lxml import html +from urllib.parse import quote, urljoin +from requests.auth import HTTPBasicAuth +from searx.utils import extract_text, get_torrent_size + + +base_url = 'http://localhost:9200' +username = '' +password = '' +index = '' +search_url = base_url + '/' + index + '/_search' +query_type = 'match' +custom_query_json = {} +show_metadata = False +categories = ['general'] + + +def init(engine_settings): + if 'query_type' in engine_settings and engine_settings['query_type'] not in _available_query_types: + raise ValueError('unsupported query type', engine_settings['query_type']) + + if index == '': + raise ValueError('index cannot be empty') + + +def request(query, params): + if query_type not in _available_query_types: + return params + + if username and password: + params['auth'] = HTTPBasicAuth(username, password) + + params['url'] = search_url + params['method'] = 'GET' + params['data'] = dumps(_available_query_types[query_type](query)) + params['headers']['Content-Type'] = 'application/json' + + return params + + +def _match_query(query): + """ + The standard for full text queries. + searx format: "key:value" e.g. city:berlin + REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html + """ + + try: + key, value = query.split(':') + except: + raise ValueError('query format must be "key:value"') + + return {"query": {"match": {key: {'query': value}}}} + + +def _simple_query_string_query(query): + """ + Accepts query strings, but it is less strict than query_string + The field used can be specified in index.query.default_field in Elasticsearch. + REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html + """ + + return {'query': {'simple_query_string': {'query': query}}} + + +def _term_query(query): + """ + Accepts one term and the name of the field. + searx format: "key:value" e.g. city:berlin + REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-term-query.html + """ + + try: + key, value = query.split(':') + except: + raise ValueError('query format must be key:value') + + return {'query': {'term': {key: value}}} + + +def _terms_query(query): + """ + Accepts multiple terms and the name of the field. + searx format: "key:value1,value2" e.g. city:berlin,paris + REF: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-terms-query.html + """ + + try: + key, values = query.split(':') + except: + raise ValueError('query format must be key:value1,value2') + + return {'query': {'terms': {key: values.split(',')}}} + + +def _custom_query(query): + key, value = query.split(':') + custom_query = custom_query_json + for query_key, query_value in custom_query.items(): + if query_key == '{{KEY}}': + custom_query[key] = custom_query.pop(query_key) + if query_value == '{{VALUE}}': + custom_query[query_key] = value + return custom_query + + +def response(resp): + results = [] + + resp_json = loads(resp.text) + if 'error' in resp_json: + raise Exception(resp_json['error']) + + for result in resp_json['hits']['hits']: + r = {key: str(value) if not key.startswith('_') else value for key, value in result['_source'].items()} + r['template'] = 'key-value.html' + + if show_metadata: + r['metadata'] = {'index': result['_index'], + 'id': result['_id'], + 'score': result['_score']} + + results.append(r) + + return results + + +_available_query_types = { + # Full text queries + # https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html + 'match': _match_query, + 'simple_query_string': _simple_query_string_query, + + # Term-level queries + # https://www.elastic.co/guide/en/elasticsearch/reference/current/term-level-queries.html + 'term': _term_query, + 'terms': _terms_query, + + # Query JSON defined by the instance administrator. + 'custom': _custom_query, +} diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index 98460604c..c909ce11b 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -17,7 +17,7 @@ from searx.utils import html_to_text, match_language # engine dependent config -categories = None +categories = [] paging = True language_support = True supported_languages_url = 'https://qwant.com/region' diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py index 81c2747fb..a569d9160 100644 --- a/searx/engines/xpath.py +++ b/searx/engines/xpath.py @@ -7,6 +7,7 @@ url_xpath = None content_xpath = None title_xpath = None thumbnail_xpath = False +categories = [] paging = False suggestion_xpath = '' results_xpath = '' diff --git a/searx/search.py b/searx/search.py index cd195825a..1cb2a603b 100644 --- a/searx/search.py +++ b/searx/search.py @@ -57,8 +57,11 @@ class EngineRef: self.category = category self.from_bang = from_bang - def __str__(self): - return "(" + self.name + "," + self.category + "," + str(self.from_bang) + ")" + def __repr__(self): + return "EngineRef({!r}, {!r}, {!r})".format(self.name, self.category, self.from_bang) + + def __eq__(self, other): + return self.name == other.name and self.category == other.category and self.from_bang == other.from_bang class SearchQuery: @@ -87,8 +90,21 @@ class SearchQuery: self.timeout_limit = timeout_limit self.external_bang = external_bang - def __str__(self): - return self.query + ";" + str(self.engineref_list) + def __repr__(self): + return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\ + format(self.query, self.engineref_list, self.categories, self.lang, self.safesearch, + self.pageno, self.time_range, self.timeout_limit, self.external_bang) + + def __eq__(self, other): + return self.query == other.query\ + and self.engineref_list == other.engineref_list\ + and self.categories == self.categories\ + and self.lang == other.lang\ + and self.safesearch == other.safesearch\ + and self.pageno == other.pageno\ + and self.time_range == other.time_range\ + and self.timeout_limit == other.timeout_limit\ + and self.external_bang == other.external_bang def send_http_request(engine, request_params): @@ -110,7 +126,8 @@ def send_http_request(engine, request_params): req = requests_lib.get else: req = requests_lib.post - request_args['data'] = request_params['data'] + + request_args['data'] = request_params['data'] # send the request return req(request_params['url'], **request_args) diff --git a/searx/settings.yml b/searx/settings.yml index 54352bbfc..78ae26b97 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -17,6 +17,12 @@ server: image_proxy : False # Proxying image results through searx http_protocol_version : "1.0" # 1.0 and 1.1 are supported method: "POST" # POST queries are more secure as they don't show up in history but may cause problems when using Firefox containers + default_http_headers: + X-Content-Type-Options : nosniff + X-XSS-Protection : 1; mode=block + X-Download-Options : noopen + X-Robots-Tag : noindex, nofollow + Referrer-Policy : no-referrer ui: static_path : "" # Custom static path - leave it blank if you didn't change @@ -225,6 +231,20 @@ engines: shortcut : ew disabled : True +# - name : elasticsearch +# shortcut : es +# engine : elasticsearch +# base_url : http://localhost:9200 +# username : elastic +# password : changeme +# index : my-index +# # available options: match, simple_query_string, term, terms, custom +# query_type : match +# # if query_type is set to custom, provide your query here +# #custom_query_json: {"query":{"match_all": {}}} +# #show_metadata: False +# disabled : True + - name : wikidata engine : wikidata shortcut : wd diff --git a/searx/webapp.py b/searx/webapp.py index 46d547d52..326200cec 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -44,7 +44,7 @@ from urllib.parse import urlencode, urlparse, urljoin, urlsplit from pygments import highlight from pygments.lexers import get_lexer_by_name -from pygments.formatters import HtmlFormatter +from pygments.formatters import HtmlFormatter # pylint: disable=no-name-in-module from werkzeug.middleware.proxy_fix import ProxyFix from flask import ( @@ -111,7 +111,7 @@ app = Flask( app.jinja_env.trim_blocks = True app.jinja_env.lstrip_blocks = True -app.jinja_env.add_extension('jinja2.ext.loopcontrols') +app.jinja_env.add_extension('jinja2.ext.loopcontrols') # pylint: disable=no-member app.secret_key = settings['server']['secret_key'] # see https://flask.palletsprojects.com/en/1.1.x/cli/ @@ -488,6 +488,16 @@ def pre_request(): @app.after_request +def add_default_headers(response): + # set default http headers + for header, value in settings['server'].get('default_http_headers', {}).items(): + if header in response.headers: + continue + response.headers[header] = value + return response + + +@app.after_request def post_request(response): total_time = time() - request.start_time timings_all = ['total;dur=' + str(round(total_time * 1000, 3))] @@ -537,10 +547,12 @@ def index(): # redirect to search if there's a query in the request if request.form.get('q'): - return redirect(url_for('search'), 308) + query = ('?' + request.query_string.decode()) if request.query_string else '' + return redirect(url_for('search') + query, 308) return render( 'index.html', + selected_categories=get_selected_categories(request.preferences, request.form), ) @@ -556,8 +568,8 @@ def search(): if output_format not in ['html', 'csv', 'json', 'rss']: output_format = 'html' - # check if there is query - if request.form.get('q') is None: + # check if there is query (not None and not an empty string) + if not request.form.get('q'): if output_format == 'html': return render( 'index.html', @@ -577,15 +589,12 @@ def search(): result_container = search.search() + except SearxParameterException as e: + logger.exception('search error: SearxParameterException') + return index_error(output_format, e.message), 400 except Exception as e: - # log exception logger.exception('search error') - - # is it an invalid input parameter or something else ? - if (issubclass(e.__class__, SearxParameterException)): - return index_error(output_format, e.message), 400 - else: - return index_error(output_format, gettext('search error')), 500 + return index_error(output_format, gettext('search error')), 500 # results results = result_container.get_ordered_results() diff --git a/tests/unit/test_search.py b/tests/unit/test_search.py index 36135913c..464a9b37d 100644 --- a/tests/unit/test_search.py +++ b/tests/unit/test_search.py @@ -21,6 +21,20 @@ TEST_ENGINES = [ ] +class SearchQueryTestCase(SearxTestCase): + + def test_repr(self): + s = SearchQuery('test', [EngineRef('bing', 'general', False)], ['general'], 'all', 0, 1, '1', 5.0, 'g') + self.assertEqual(repr(s), + "SearchQuery('test', [EngineRef('bing', 'general', False)], ['general'], 'all', 0, 1, '1', 5.0, 'g')") # noqa + + def test_eq(self): + s = SearchQuery('test', [EngineRef('bing', 'general', False)], ['general'], 'all', 0, 1, None, None, None) + t = SearchQuery('test', [EngineRef('google', 'general', False)], ['general'], 'all', 0, 1, None, None, None) + self.assertEqual(s, s) + self.assertNotEqual(s, t) + + class SearchTestCase(SearxTestCase): @classmethod diff --git a/tests/unit/test_standalone_searx.py b/tests/unit/test_standalone_searx.py index 6b8bdac2d..ddf140799 100644 --- a/tests/unit/test_standalone_searx.py +++ b/tests/unit/test_standalone_searx.py @@ -8,6 +8,7 @@ import sys from mock import Mock, patch from nose2.tools import params +from searx.search import SearchQuery from searx.testing import SearxTestCase @@ -94,7 +95,7 @@ class StandaloneSearx(SearxTestCase): args = sas.parse_argument(['rain', ]) search_q = sas.get_search_query(args) self.assertTrue(search_q) - self.assertEqual(str(search_q), 'rain;[]') + self.assertEqual(search_q, SearchQuery('rain', [], ['general'], 'all', 0, 1, None, None, None)) def test_no_parsed_url(self): """test no_parsed_url func""" diff --git a/tests/unit/test_webapp.py b/tests/unit/test_webapp.py index 08a266931..75a968ad8 100644 --- a/tests/unit/test_webapp.py +++ b/tests/unit/test_webapp.py @@ -75,9 +75,32 @@ class ViewsTestCase(SearxTestCase): self.assertEqual(result.status_code, 200) self.assertIn(b'<div class="title"><h1>searx</h1></div>', result.data) - def test_index_html(self): + def test_index_html_post(self): result = self.app.post('/', data={'q': 'test'}) self.assertEqual(result.status_code, 308) + self.assertEqual(result.location, 'http://localhost/search') + + def test_index_html_get(self): + result = self.app.post('/?q=test') + self.assertEqual(result.status_code, 308) + self.assertEqual(result.location, 'http://localhost/search?q=test') + + def test_search_empty_html(self): + result = self.app.post('/search', data={'q': ''}) + self.assertEqual(result.status_code, 200) + self.assertIn(b'<div class="title"><h1>searx</h1></div>', result.data) + + def test_search_empty_json(self): + result = self.app.post('/search', data={'q': '', 'format': 'json'}) + self.assertEqual(result.status_code, 400) + + def test_search_empty_csv(self): + result = self.app.post('/search', data={'q': '', 'format': 'csv'}) + self.assertEqual(result.status_code, 400) + + def test_search_empty_rss(self): + result = self.app.post('/search', data={'q': '', 'format': 'rss'}) + self.assertEqual(result.status_code, 400) def test_search_html(self): result = self.app.post('/search', data={'q': 'test'}) |