summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--searx/engines/command.py184
-rw-r--r--searx/settings.yml71
-rw-r--r--searx/templates/oscar/result_templates/key-value.html2
-rw-r--r--tests/unit/engines/test_command.py241
4 files changed, 497 insertions, 1 deletions
diff --git a/searx/engines/command.py b/searx/engines/command.py
new file mode 100644
index 000000000..b9e672ffa
--- /dev/null
+++ b/searx/engines/command.py
@@ -0,0 +1,184 @@
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+'''
+
+
+from os.path import expanduser, isabs, realpath, commonprefix
+from re import MULTILINE, search as re_search
+from shlex import split as shlex_split
+from subprocess import Popen, PIPE
+from time import time
+from threading import Thread
+
+from searx import logger
+
+
# Engine capability flags (not used inside this module; presumably read by
# the searx engine loader -- TODO confirm).
offline = True
paging = True

# Command line to execute; the '{{QUERY}}' element is replaced by the
# tokenised user query.
command = []
environment_variables = {}
working_dir = realpath('.')
timeout = 4.0

# Optional validation of the query: '' (none), 'path' or 'enum'.
query_type = ''
query_enum = []

# How the command output is cut into records and how each record is parsed.
result_separator = '\n'
delimiter = {}
parse_regex = {}
result_template = 'key-value.html'

# Private state: logger of this engine and the compiled form of parse_regex.
_command_logger = logger.getChild('command')
_compiled_parse_regex = {}
+
+
def init(engine_settings):
    """ Configures the engine from its settings entry

    Validates the parsing options, then stores the command line, the working
    directory, the parsing rules and the environment variables in the module
    globals used by the other functions of this engine.

    :param engine_settings: dict with the mandatory key ``command``, exactly
        one of ``delimiter`` / ``parse_regex`` and optionally ``working_dir``
        and ``environment_variables``
    :raises ValueError: if ``command`` is missing or the parsing options are
        rejected by ``check_parsing_options``
    """
    # Imported locally: the module header only imports MULTILINE and search
    # from ``re``, so the bare name ``re`` is otherwise undefined here.
    import re

    global command, working_dir, delimiter, parse_regex, environment_variables

    check_parsing_options(engine_settings)

    if 'command' not in engine_settings:
        raise ValueError('engine command : missing configuration key: command')

    command = engine_settings['command']

    if 'working_dir' in engine_settings:
        # Always resolve the directory: the path check compares it against
        # realpath() output, so symlinks have to be resolved here as well.
        working_dir = realpath(engine_settings['working_dir'])

    # Exactly one parsing mode is configured (enforced above); reset the other
    # one so that a previous configuration cannot leak into this one.
    if 'delimiter' in engine_settings:
        delimiter = engine_settings['delimiter']
    else:
        delimiter = {}

    if 'parse_regex' in engine_settings:
        parse_regex = engine_settings['parse_regex']
    else:
        parse_regex = {}

    # Drop patterns compiled by an earlier call before compiling the new ones.
    _compiled_parse_regex.clear()
    for result_key, regex in parse_regex.items():
        _compiled_parse_regex[result_key] = re.compile(regex, flags=re.MULTILINE)

    if 'environment_variables' in engine_settings:
        environment_variables = engine_settings['environment_variables']
+
+
def search(query, params):
    """ Runs the configured command for the query and returns the parsed results

    The command is executed by a worker thread which fills the result list.
    This function waits at most ``timeout`` seconds for that thread and then
    returns whatever has been collected so far.

    :param query: the raw query as UTF-8 encoded bytes
    :param params: request parameters, ``params['pageno']`` selects the page
    :returns: list of result dicts, empty if no command line could be built
    """
    argv = _get_command_to_run(query)
    if argv:
        collected = []
        worker = Thread(
            target=_get_results_from_process,
            args=(collected, argv, params['pageno']),
        )
        worker.start()
        worker.join(timeout=timeout)
        return collected

    return []
+
+
def _get_command_to_run(query):
    """ Builds the argument list to execute for the query

    The query is decoded as UTF-8, split with shell-like quoting rules and
    validated, then every ``{{QUERY}}`` element of the configured command is
    replaced by the resulting tokens.

    :param query: the raw query as UTF-8 encoded bytes
    :returns: list of command line arguments
    """
    query_args = shlex_split(query.decode('utf-8'))
    __check_query_params(query_args)

    argv = []
    for token in command:
        if token != '{{QUERY}}':
            argv.append(token)
            continue
        argv.extend(query_args)

    return argv
+
+
def _iter_raw_results(stream):
    """ Yields every record of a binary stream, split on ``result_separator`` """
    leftover = ''
    for line in iter(stream.readline, b''):
        pieces = (leftover + line.decode('utf-8')).split(result_separator)
        # The last piece is always the unfinished record. It is empty when the
        # chunk ends on a separator, which must reset the leftover as well.
        leftover = pieces.pop()
        yield from pieces

    # Output that does not end with a separator still carries a last record.
    if leftover:
        yield leftover


def _get_results_from_process(results, cmd, pageno):
    """ Runs the command and appends the results of the requested page

    The output is read line by line, cut into records and parsed. Records
    which cannot be parsed are skipped and do not count towards paging.

    :param results: list the parsed results are appended to
    :param cmd: argument list of the command to execute
    :param pageno: 1-based number of the requested page
    :returns: ``results``
    :raises RuntimeError: if the command exits with a non-zero return code
    """
    count = 0
    start, end = __get_results_limits(pageno)
    # NOTE(review): stderr is piped but never read, so a command writing a lot
    # to stderr could block -- consider subprocess.DEVNULL.
    with Popen(cmd, stdout=PIPE, stderr=PIPE, env=environment_variables) as process:
        for raw_result in _iter_raw_results(process.stdout):
            result = __parse_single_result(raw_result)
            # The parser signals failure with an empty dict, not with None.
            if not result:
                _command_logger.debug('skipped result: %s', raw_result)
                continue

            if start <= count <= end:
                result['template'] = result_template
                results.append(result)

            count += 1
            if end < count:
                # The page is complete, the remaining output is not needed.
                return results

        return_code = process.wait(timeout=timeout)
        if return_code != 0:
            raise RuntimeError('non-zero return code when running command', cmd, return_code)

    return results
+
+
def __get_results_limits(pageno):
    """ Returns the inclusive (first, last) result indices of a 1-based page of 10 results """
    first = 10 * (pageno - 1)
    return first, first + 9
+
+
def __check_query_params(params):
    """ Validates the tokenised query against the configured ``query_type``

    * ``path``: the last token has to resolve to a location inside
      ``working_dir``
    * ``enum``: every token has to be listed in ``query_enum`` (nothing is
      checked if that list is empty)
    * anything else: no validation

    :param params: list of query tokens
    :raises ValueError: if the query is not allowed
    """
    if not query_type:
        return

    if query_type == 'path':
        # commonpath compares whole path components. The character based
        # commonprefix would accept '/work-secret' as being inside '/work'.
        from os.path import commonpath

        if not params:
            raise ValueError('requested path is missing from the query')

        # NOTE(review): only the last token is validated; earlier tokens may
        # still name paths outside of the working directory.
        allowed_root = realpath(working_dir)
        requested_path = realpath(expanduser(params[-1]))
        if commonpath([requested_path, allowed_root]) != allowed_root:
            raise ValueError('requested path is outside of configured working directory')
    elif query_type == 'enum' and query_enum:
        for param in params:
            if param not in query_enum:
                raise ValueError('submitted query params is not allowed', param, 'allowed params:', query_enum)
+
+
def check_parsing_options(engine_settings):
    """ Checks if delimiter based parsing or regex parsing is configured correctly

    Exactly one of ``delimiter`` and ``parse_regex`` has to be present, and a
    ``delimiter`` entry has to contain both ``chars`` and ``keys``.

    :param engine_settings: settings of the engine
    :raises ValueError: if the parsing options are missing, ambiguous or incomplete
    """
    has_delimiter = 'delimiter' in engine_settings
    has_parse_regex = 'parse_regex' in engine_settings

    if not has_delimiter and not has_parse_regex:
        raise ValueError('failed to init settings for parsing lines: missing delimiter or parse_regex')
    if has_delimiter and has_parse_regex:
        raise ValueError('failed to init settings for parsing lines: too many settings')

    if has_delimiter:
        delimiter_settings = engine_settings['delimiter']
        if 'chars' not in delimiter_settings or 'keys' not in delimiter_settings:
            raise ValueError('failed to init settings for parsing lines: delimiter requires both chars and keys')
+
+
def __parse_single_result(raw_result):
    """ Parses a single record of the command output based on configuration

    With a delimiter the record is split into as many fields as there are
    configured keys. With regexes every compiled pattern contributes the text
    it matches. An empty dict is returned as soon as the record does not fit.

    :param raw_result: text of one record
    :returns: dict mapping result keys to the extracted text
    """
    parsed = {}

    if delimiter:
        keys = delimiter['keys']
        values = raw_result.split(delimiter['chars'], maxsplit=len(keys) - 1)
        if len(values) != len(keys):
            return {}
        parsed.update(zip(keys, values))

    if parse_regex:
        for key, pattern in _compiled_parse_regex.items():
            match = pattern.search(raw_result)
            if not match:
                return {}
            parsed[key] = match.group(0)

    return parsed
diff --git a/searx/settings.yml b/searx/settings.yml
index d6ea53177..9140522c4 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -920,6 +920,77 @@ engines:
# shortcut : uw
# base_url : 'http://doc.ubuntu-fr.org'
+# Be careful when enabling this engine if you are
+# running a public instance. Do not expose any sensitive
+# information. You can restrict access by configuring a list
+# of access tokens under tokens.
+# - name: git grep
+# engine: command
+# command: ['git', 'grep', '{{QUERY}}']
+# shortcut: gg
+# tokens: []
+# disabled: True
+# delimiter:
+# chars: ':'
+# keys: ['filepath', 'code']
+
+# Be careful when enabling this engine if you are
+# running a public instance. Do not expose any sensitive
+# information. You can restrict access by configuring a list
+# of access tokens under tokens.
+# - name: locate
+# engine: command
+# command: ['locate', '{{QUERY}}']
+# shortcut: loc
+# tokens: []
+# disabled: True
+# delimiter:
+# chars: ' '
+# keys: ['line']
+
+# Be careful when enabling this engine if you are
+# running a public instance. Do not expose any sensitive
+# information. You can restrict access by configuring a list
+# of access tokens under tokens.
+# - name: find
+# engine: command
+# command: ['find', '.', '-name', '{{QUERY}}']
+# query_type: path
+# shortcut: fnd
+# tokens: []
+# disabled: True
+# delimiter:
+# chars: ' '
+# keys: ['line']
+
+# Be careful when enabling this engine if you are
+# running a public instance. Do not expose any sensitive
+# information. You can restrict access by configuring a list
+# of access tokens under tokens.
+# - name: pattern search in files
+# engine: command
+# command: ['fgrep', '{{QUERY}}']
+# shortcut: fgr
+# tokens: []
+# disabled: True
+# delimiter:
+# chars: ' '
+# keys: ['line']
+
+# Be careful when enabling this engine if you are
+# running a public instance. Do not expose any sensitive
+# information. You can restrict access by configuring a list
+# of access tokens under tokens.
+# - name: regex search in files
+# engine: command
+# command: ['grep', '{{QUERY}}']
+# shortcut: gr
+# tokens: []
+# disabled: True
+# delimiter:
+# chars: ' '
+# keys: ['line']
+
locales:
en : English
ar : العَرَبِيَّة (Arabic)
diff --git a/searx/templates/oscar/result_templates/key-value.html b/searx/templates/oscar/result_templates/key-value.html
index 67c748e7f..d5c56a189 100644
--- a/searx/templates/oscar/result_templates/key-value.html
+++ b/searx/templates/oscar/result_templates/key-value.html
@@ -6,7 +6,7 @@
{% continue %}
{% endif %}
<tr>
- <td><b>{{ key|upper }}</b>: {{ value }}</td>
+ <td><b>{{ key|upper }}</b>: {{ value|truncate }}</td>
</tr>
{% endfor %}
</table>
diff --git a/tests/unit/engines/test_command.py b/tests/unit/engines/test_command.py
new file mode 100644
index 000000000..0aa1c6201
--- /dev/null
+++ b/tests/unit/engines/test_command.py
@@ -0,0 +1,241 @@
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+
+'''
+
+from sys import version_info
+
+from searx.engines import command as command_engine
+from searx.testing import SearxTestCase
+
+
class TestCommandEngine(SearxTestCase):
    """ Runs the command engine against real commands (seq, echo, ls) """

    def setUp(self):
        super().setUp()
        # The engine keeps its configuration in module globals. Reset all of
        # them so that no test depends on what a previous test configured.
        command_engine.command = []
        command_engine.delimiter = {}
        command_engine.parse_regex = {}
        command_engine._compiled_parse_regex.clear()
        command_engine.query_type = ''
        command_engine.query_enum = []
        command_engine.result_separator = '\n'

    def test_basic_seq_command_engine(self):
        ls_engine = command_engine
        ls_engine.command = ['seq', '{{QUERY}}']
        ls_engine.delimiter = {'chars': ' ', 'keys': ['number']}
        expected_results = [
            {'number': '1', 'template': 'key-value.html'},
            {'number': '2', 'template': 'key-value.html'},
            {'number': '3', 'template': 'key-value.html'},
            {'number': '4', 'template': 'key-value.html'},
            {'number': '5', 'template': 'key-value.html'},
        ]
        results = ls_engine.search('5'.encode('utf-8'), {'pageno': 1})
        self.assertEqual(results, expected_results)

    def test_delimiter_parsing_command_engine(self):
        searx_logs = '''DEBUG:searx.webapp:static directory is /home/n/p/searx/searx/static
DEBUG:searx.webapp:templates directory is /home/n/p/searx/searx/templates
DEBUG:searx.engines:soundcloud engine: Starting background initialization
DEBUG:searx.engines:wolframalpha engine: Starting background initialization
DEBUG:searx.engines:locate engine: Starting background initialization
DEBUG:searx.engines:regex search in files engine: Starting background initialization
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.wolframalpha.com
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): soundcloud.com
DEBUG:searx.engines:find engine: Starting background initialization
DEBUG:searx.engines:pattern search in files engine: Starting background initialization
DEBUG:searx.webapp:starting webserver on 127.0.0.1:8888
WARNING:werkzeug: * Debugger is active!
INFO:werkzeug: * Debugger PIN: 299-578-362'''
        echo_engine = command_engine
        echo_engine.command = ['echo', searx_logs]
        echo_engine.delimiter = {'chars': ':', 'keys': ['level', 'component', 'message']}

        def entry(level, component, message):
            return {
                'component': component,
                'message': message,
                'template': 'key-value.html',
                'level': level,
            }

        # 13 log lines: the first page holds 10 results, the second the rest
        expected_results_by_page = [
            [
                entry('DEBUG', 'searx.webapp', 'static directory is /home/n/p/searx/searx/static'),
                entry('DEBUG', 'searx.webapp', 'templates directory is /home/n/p/searx/searx/templates'),
                entry('DEBUG', 'searx.engines', 'soundcloud engine: Starting background initialization'),
                entry('DEBUG', 'searx.engines', 'wolframalpha engine: Starting background initialization'),
                entry('DEBUG', 'searx.engines', 'locate engine: Starting background initialization'),
                entry('DEBUG', 'searx.engines', 'regex search in files engine: Starting background initialization'),
                entry('DEBUG', 'urllib3.connectionpool', 'Starting new HTTPS connection (1): www.wolframalpha.com'),
                entry('DEBUG', 'urllib3.connectionpool', 'Starting new HTTPS connection (1): soundcloud.com'),
                entry('DEBUG', 'searx.engines', 'find engine: Starting background initialization'),
                entry('DEBUG', 'searx.engines', 'pattern search in files engine: Starting background initialization'),
            ],
            [
                entry('DEBUG', 'searx.webapp', 'starting webserver on 127.0.0.1:8888'),
                entry('WARNING', 'werkzeug', ' * Debugger is active!'),
                entry('INFO', 'werkzeug', ' * Debugger PIN: 299-578-362'),
            ],
        ]

        for pageno, expected_results in enumerate(expected_results_by_page, start=1):
            results = echo_engine.search(''.encode('utf-8'), {'pageno': pageno})
            self.assertEqual(results, expected_results)

    def test_regex_parsing_command_engine(self):
        txt = '''commit 35f9a8c81d162a361b826bbcd4a1081a4fbe76a7
Author: Noémi Ványi <sitbackandwait@gmail.com>
Date: Tue Oct 15 11:31:33 2019 +0200

first interesting message

commit 6c3c206316153ccc422755512bceaa9ab0b14faa
Author: Noémi Ványi <sitbackandwait@gmail.com>
Date: Mon Oct 14 17:10:08 2019 +0200

second interesting message

commit d8594d2689b4d5e0d2f80250223886c3a1805ef5
Author: Noémi Ványi <sitbackandwait@gmail.com>
Date: Mon Oct 14 14:45:05 2019 +0200

third interesting message

commit '''
        git_log_engine = command_engine
        git_log_engine.result_separator = '\n\ncommit '
        # The patterns are only compiled by init(); assigning parse_regex on
        # the module would leave the engine without any compiled regex.
        git_log_engine.init({
            'command': ['echo', txt],
            'parse_regex': {
                'commit': r'\w{40}',
                'author': r'[\w* ]* <\w*@?\w*\.?\w*>',
                'date': 'Date: .*',
                'message': '\n\n.*$',
            },
        })
        expected_results = [
            {
                'commit': '35f9a8c81d162a361b826bbcd4a1081a4fbe76a7',
                'author': ' Noémi Ványi <sitbackandwait@gmail.com>',
                'date': 'Date: Tue Oct 15 11:31:33 2019 +0200',
                'message': '\n\nfirst interesting message',
                'template': 'key-value.html',
            },
            {
                'commit': '6c3c206316153ccc422755512bceaa9ab0b14faa',
                'author': ' Noémi Ványi <sitbackandwait@gmail.com>',
                'date': 'Date: Mon Oct 14 17:10:08 2019 +0200',
                'message': '\n\nsecond interesting message',
                'template': 'key-value.html',
            },
            {
                'commit': 'd8594d2689b4d5e0d2f80250223886c3a1805ef5',
                'author': ' Noémi Ványi <sitbackandwait@gmail.com>',
                'date': 'Date: Mon Oct 14 14:45:05 2019 +0200',
                'message': '\n\nthird interesting message',
                'template': 'key-value.html',
            },
        ]

        results = git_log_engine.search(''.encode('utf-8'), {'pageno': 1})
        self.assertEqual(results, expected_results)

    def test_working_dir_path_query(self):
        ls_engine = command_engine
        ls_engine.command = ['ls', '{{QUERY}}']
        ls_engine.result_separator = '\n'
        ls_engine.delimiter = {'chars': ' ', 'keys': ['file']}
        ls_engine.query_type = 'path'

        results = ls_engine.search('.'.encode(), {'pageno': 1})
        self.assertTrue(results)

        forbidden_paths = [
            '..',
            '../..',
            './..',
            '~',
            '/var',
        ]
        for forbidden_path in forbidden_paths:
            # every listed path has to be rejected, not only '..'
            self.assertRaises(ValueError, ls_engine.search, forbidden_path.encode(), {'pageno': 1})

    def test_enum_queries(self):
        echo_engine = command_engine
        echo_engine.command = ['echo', '{{QUERY}}']
        # Without a parsing rule of its own this test would depend on the
        # delimiter configured by whichever test ran before it.
        echo_engine.delimiter = {'chars': ' ', 'keys': ['line']}
        echo_engine.query_type = 'enum'
        echo_engine.query_enum = ['i-am-allowed-to-say-this', 'and-that']

        for allowed in echo_engine.query_enum:
            results = echo_engine.search(allowed.encode(), {'pageno': 1})
            self.assertTrue(results)

        forbidden_queries = [
            'forbidden',
            'banned',
            'prohibited',
        ]
        for forbidden in forbidden_queries:
            self.assertRaises(ValueError, echo_engine.search, forbidden.encode(), {'pageno': 1})