diff options
| -rw-r--r-- | Dockerfile | 3 | ||||
| -rw-r--r-- | dockerfiles/uwsgi.ini | 3 | ||||
| -rw-r--r-- | requirements.txt | 1 | ||||
| -rw-r--r-- | searx/search/__init__.py | 65 | ||||
| -rw-r--r-- | searx/search/checker/__init__.py | 4 | ||||
| -rw-r--r-- | searx/search/checker/__main__.py | 94 | ||||
| -rw-r--r-- | searx/search/checker/background.py | 123 | ||||
| -rw-r--r-- | searx/search/checker/impl.py | 406 | ||||
| -rw-r--r-- | searx/search/models.py | 69 | ||||
| -rw-r--r-- | searx/search/processors/abstract.py | 12 | ||||
| -rw-r--r-- | searx/search/processors/online.py | 52 | ||||
| -rw-r--r-- | searx/search/processors/online_currency.py | 10 | ||||
| -rw-r--r-- | searx/search/processors/online_dictionary.py | 18 | ||||
| -rw-r--r-- | searx/settings.yml | 40 | ||||
| -rw-r--r-- | searx/shared/__init__.py | 31 | ||||
| -rw-r--r-- | searx/shared/shared_abstract.py | 21 | ||||
| -rw-r--r-- | searx/shared/shared_simple.py | 39 | ||||
| -rw-r--r-- | searx/shared/shared_uwsgi.py | 64 | ||||
| -rwxr-xr-x | searx/webapp.py | 12 | ||||
| -rw-r--r-- | setup.py | 3 | ||||
| -rwxr-xr-x | utils/searx.sh | 6 | ||||
| -rw-r--r-- | utils/templates/etc/uwsgi/apps-archlinux/searx.ini | 5 | ||||
| -rw-r--r-- | utils/templates/etc/uwsgi/apps-available/searx.ini | 3 |
23 files changed, 1014 insertions, 70 deletions
diff --git a/Dockerfile b/Dockerfile index 3894aa968..f251d06ea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,6 +41,8 @@ RUN apk upgrade --no-cache \ openssl-dev \ tar \ git \ + protoc \ + protobuf-dev \ && apk add --no-cache \ ca-certificates \ su-exec \ @@ -53,6 +55,7 @@ RUN apk upgrade --no-cache \ uwsgi \ uwsgi-python3 \ brotli \ + protobuf \ && pip3 install --upgrade pip \ && pip3 install --no-cache -r requirements.txt \ && apk del build-dependencies \ diff --git a/dockerfiles/uwsgi.ini b/dockerfiles/uwsgi.ini index 398a440d9..818a99cc0 100644 --- a/dockerfiles/uwsgi.ini +++ b/dockerfiles/uwsgi.ini @@ -42,3 +42,6 @@ static-map = /static=/usr/local/searx/searx/static static-expires = /* 864000 static-gzip-all = True offload-threads = %k + +# Cache +cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1 diff --git a/requirements.txt b/requirements.txt index ecf8e0c62..776bbc20b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ pygments==2.1.3 python-dateutil==2.8.1 pyyaml==5.3.1 requests[socks]==2.25.1 +pycld3==0.20 diff --git a/searx/search/__init__.py b/searx/search/__init__.py index 0d45f0b7c..f777e8595 100644 --- a/searx/search/__init__.py +++ b/searx/search/__init__.py @@ -28,7 +28,9 @@ from searx.external_bang import get_bang_url from searx.results import ResultContainer from searx import logger from searx.plugins import plugins +from searx.search.models import EngineRef, SearchQuery from searx.search.processors import processors, initialize as initialize_processors +from searx.search.checker import initialize as initialize_checker logger = logger.getChild('search') @@ -45,68 +47,11 @@ else: sys.exit(1) -def initialize(settings_engines=None): +def initialize(settings_engines=None, enable_checker=False): settings_engines = settings_engines or settings['engines'] initialize_processors(settings_engines) - - -class EngineRef: - - __slots__ = 'name', 'category' - - def __init__(self, name: str, category: str): - self.name = name - self.category = category - - def __repr__(self): - return "EngineRef({!r}, {!r})".format(self.name, self.category) - - def __eq__(self, other): - return self.name == other.name and self.category == other.category - - -class SearchQuery: - """container for all the search parameters (query, language, etc...)""" - - __slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\ - 'timeout_limit', 'external_bang' - - def __init__(self, - query: str, - engineref_list: typing.List[EngineRef], - lang: str='all', - safesearch: int=0, - pageno: int=1, - time_range: typing.Optional[str]=None, - timeout_limit: typing.Optional[float]=None, - external_bang: typing.Optional[str]=None): - self.query = query - self.engineref_list = engineref_list - self.lang = lang - self.safesearch = safesearch - self.pageno = pageno - self.time_range = time_range - self.timeout_limit = timeout_limit - self.external_bang = external_bang - - @property - def categories(self): - return list(set(map(lambda engineref: engineref.category, self.engineref_list))) - - def __repr__(self): - return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\ - format(self.query, self.engineref_list, self.lang, self.safesearch, - self.pageno, self.time_range, self.timeout_limit, self.external_bang) - - def __eq__(self, other): - return self.query == other.query\ - and self.engineref_list == other.engineref_list\ - and self.lang == other.lang\ - and self.safesearch == other.safesearch\ - and self.pageno == other.pageno\ - and self.time_range == other.time_range\ - and self.timeout_limit == other.timeout_limit\ - and self.external_bang == other.external_bang + if enable_checker: + initialize_checker() class Search: diff --git a/searx/search/checker/__init__.py b/searx/search/checker/__init__.py new file mode 100644 index 000000000..85b9178df --- /dev/null +++ b/searx/search/checker/__init__.py @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +from .impl import Checker +from .background import initialize, get_result diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py new file mode 100644 index 000000000..75b37e6c5 --- /dev/null +++ b/searx/search/checker/__main__.py @@ -0,0 +1,94 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import sys +import io +import os +import argparse +import logging + +import searx.search +import searx.search.checker +from searx.search import processors +from searx.engines import engine_shortcuts + + +# configure logging +root = logging.getLogger() +handler = logging.StreamHandler(sys.stdout) +for h in root.handlers: + root.removeHandler(h) +root.addHandler(handler) + +# color only for a valid terminal +if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']: + RESET_SEQ = "\033[0m" + COLOR_SEQ = "\033[1;%dm" + BOLD_SEQ = "\033[1m" + BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = map(lambda i: COLOR_SEQ % (30 + i), range(8)) +else: + RESET_SEQ = "" + COLOR_SEQ = "" + BOLD_SEQ = "" + BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", "" + +# equivalent of 'python -u' (unbuffered stdout, stderr) +stdout = io.TextIOWrapper(open(sys.stdout.fileno(), 'wb', 0), write_through=True) +stderr = io.TextIOWrapper(open(sys.stderr.fileno(), 'wb', 0), write_through=True) + + +# iterator of processors +def iter_processor(engine_name_list): + if len(engine_name_list) > 0: + for name in engine_name_list: + name = engine_shortcuts.get(name, name) + processor = processors.get(name) + if processor is not None: + yield name, processor + else: + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RED}Engine does not exist{RESET_SEQ}') + else: + for name, processor in searx.search.processors.items(): + yield name, processor + + +# actual check & display +def run(engine_name_list, verbose): + searx.search.initialize() + for name, processor in iter_processor(engine_name_list): + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n') + if not sys.stdout.isatty(): + stderr.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n') + checker = searx.search.checker.Checker(processor) + checker.run() + if checker.test_results.succesfull: + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{GREEN}OK{RESET_SEQ}\n') + if verbose: + stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n') + else: + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RESET_SEQ}{RED}Error{RESET_SEQ}') + if not verbose: + errors = [test_name + ': ' + error for test_name, error in checker.test_results] + stdout.write(f'{RED}Error {str(errors)}{RESET_SEQ}\n') + else: + stdout.write('\n') + stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n') + for test_name, logs in checker.test_results.logs.items(): + for log in logs: + stdout.write(f' {test_name:15}: {RED}{" ".join(log)}{RESET_SEQ}\n') + + +# call by setup.py +def main(): + parser = argparse.ArgumentParser(description='Check searx engines.') + parser.add_argument('engine_name_list', metavar='engine name', type=str, nargs='*', + help='engines name or shortcut list. Empty for all engines.') + parser.add_argument('--verbose', '-v', + action='store_true', dest='verbose', + help='Display details about the test results', + default=False) + args = parser.parse_args() + run(args.engine_name_list, args.verbose) + + +if __name__ == '__main__': + main() diff --git a/searx/search/checker/background.py b/searx/search/checker/background.py new file mode 100644 index 000000000..e41bff5f5 --- /dev/null +++ b/searx/search/checker/background.py @@ -0,0 +1,123 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import json +import random +import time +import threading +import os +import signal + +from searx import logger, settings, searx_debug +from searx.exceptions import SearxSettingsException +from searx.search.processors import processors +from searx.search.checker import Checker +from searx.shared import schedule, storage + + +CHECKER_RESULT = 'CHECKER_RESULT' +running = threading.Lock() + + +def _get_interval(every, error_msg): + if isinstance(every, int): + every = (every, every) + if not isinstance(every, (tuple, list))\ + or len(every) != 2\ + or not isinstance(every[0], int)\ + or not isinstance(every[1], int): + raise SearxSettingsException(error_msg, None) + return every + + +def _get_every(): + every = settings.get('checker', {}).get('scheduling', {}).get('every', (300, 1800)) + return _get_interval(every, 'checker.scheduling.every is not a int or list') + + +def get_result(): + serialized_result = storage.get_str(CHECKER_RESULT) + if serialized_result is not None: + return json.loads(serialized_result) + + +def _set_result(result, include_timestamp=True): + if include_timestamp: + result['timestamp'] = int(time.time() / 3600) * 3600 + storage.set_str(CHECKER_RESULT, json.dumps(result)) + + +def run(): + if not running.acquire(blocking=False): + return + try: + logger.info('Starting checker') + result = { + 'status': 'ok', + 'engines': {} + } + for name, processor in processors.items(): + logger.debug('Checking %s engine', name) + checker = Checker(processor) + checker.run() + if checker.test_results.succesfull: + result['engines'][name] = {'success': True} + else: + result['engines'][name] = {'success': False, 'errors': checker.test_results.errors} + + _set_result(result) + logger.info('Check done') + except Exception: + _set_result({'status': 'error'}) + logger.exception('Error while running the checker') + finally: + running.release() + + +def _run_with_delay(): + every = _get_every() + delay = random.randint(0, every[1] - every[0]) + logger.debug('Start checker in %i seconds', delay) + time.sleep(delay) + run() + + +def _start_scheduling(): + every = _get_every() + if schedule(every[0], _run_with_delay): + run() + + +def _signal_handler(signum, frame): + t = threading.Thread(target=run) + t.daemon = True + t.start() + + +def initialize(): + logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid()) + signal.signal(signal.SIGUSR1, _signal_handler) + + # disabled by default + _set_result({'status': 'disabled'}) + + # special case when debug is activate + if searx_debug and settings.get('checker', {}).get('off_when_debug', True): + logger.info('debug mode: checker is disabled') + return + + # check value of checker.scheduling.every now + scheduling = settings.get('checker', {}).get('scheduling', None) + if scheduling is None or not scheduling: + logger.info('Checker scheduler is disabled') + return + + # + _set_result({'status': 'unknown'}, include_timestamp=False) + + start_after = scheduling.get('start_after', (300, 1800)) + start_after = _get_interval(start_after, 'checker.scheduling.start_after is not a int or list') + delay = random.randint(start_after[0], start_after[1]) + logger.info('Start checker in %i seconds', delay) + t = threading.Timer(delay, _start_scheduling) + t.daemon = True + t.start() diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py new file mode 100644 index 000000000..71a941f73 --- /dev/null +++ b/searx/search/checker/impl.py @@ -0,0 +1,406 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import typing +import types +import functools +import itertools +from time import time +from urllib.parse import urlparse + +import re +import cld3 +import requests.exceptions + +from searx import poolrequests, logger +from searx.results import ResultContainer +from searx.search.models import SearchQuery, EngineRef +from searx.search.processors import EngineProcessor + + +logger = logger.getChild('searx.search.checker') + +HTML_TAGS = [ + 'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script', + 'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite', + 'code', 'data', 'dfn', 'em', 'i', 'kdb', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small', + 'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr', 'style', 'blockquote', 'dd', 'div', 'dl', 'dt', + 'figcaption', 'figure', 'hr', 'li', 'ol', 'p', 'pre', 'ul', 'button', 'datalist', 'fieldset', 'form', 'input', + 'label', 'legend', 'meter', 'optgroup', 'option', 'output', 'progress', 'select', 'textarea', 'applet', + 'frame', 'frameset' +] + + +def get_check_no_html(): + rep = ['<' + tag + '[^\>]*>' for tag in HTML_TAGS] + rep += ['</' + tag + '>' for tag in HTML_TAGS] + pattern = re.compile('|'.join(rep)) + + def f(text): + return pattern.search(text.lower()) is None + + return f + + +_check_no_html = get_check_no_html() + + +def _is_url(url): + try: + result = urlparse(url) + except ValueError: + return False + if result.scheme not in ('http', 'https'): + return False + return True + + +@functools.lru_cache(maxsize=8192) +def _is_url_image(image_url): + if not isinstance(image_url, str): + return False + + if image_url.startswith('//'): + image_url = 'https:' + image_url + + if image_url.startswith('data:'): + return image_url.startswith('data:image/') + + if not _is_url(image_url): + return False + + retry = 2 + + while retry > 0: + a = time() + try: + poolrequests.set_timeout_for_thread(10.0, time()) + r = poolrequests.get(image_url, timeout=10.0, allow_redirects=True, headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US;q=0.5,en;q=0.3', + 'Accept-Encoding': 'gzip, deflate, br', + 'DNT': '1', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-GPC': '1', + 'Cache-Control': 'max-age=0' + }) + if r.headers["content-type"].startswith('image/'): + return True + return False + except requests.exceptions.Timeout: + logger.error('Timeout for %s: %i', image_url, int(time() - a)) + retry -= 1 + except requests.exceptions.RequestException: + logger.exception('Exception for %s', image_url) + return False + + +def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]: + return { + 'query': search_query.query, + 'lang': search_query.lang, + 'pageno': search_query.pageno, + 'safesearch': search_query.safesearch, + 'time_range': search_query.time_range, + } + + +def _search_query_diff(sq1: SearchQuery, sq2: SearchQuery)\ + -> typing.Tuple[typing.Dict[str, typing.Any], typing.Dict[str, typing.Any]]: + param1 = _search_query_to_dict(sq1) + param2 = _search_query_to_dict(sq2) + common = {} + diff = {} + for k, value1 in param1.items(): + value2 = param2[k] + if value1 == value2: + common[k] = value1 + else: + diff[k] = (value1, value2) + return (common, diff) + + +class TestResults: + + __slots__ = 'errors', 'logs', 'languages' + + def __init__(self): + self.errors: typing.Dict[str, typing.List[str]] = {} + self.logs: typing.Dict[str, typing.List[typing.Any]] = {} + self.languages: typing.Set[str] = set() + + def add_error(self, test, message, *args): + # message to self.errors + errors_for_test = self.errors.setdefault(test, []) + if message not in errors_for_test: + errors_for_test.append(message) + # (message, *args) to self.logs + logs_for_test = self.logs.setdefault(test, []) + if (message, *args) not in logs_for_test: + logs_for_test.append((message, *args)) + + def add_language(self, language): + self.languages.add(language) + + @property + def succesfull(self): + return len(self.errors) == 0 + + def __iter__(self): + for test_name, errors in self.errors.items(): + for error in sorted(errors): + yield (test_name, error) + + +class ResultContainerTests: + + __slots__ = 'test_name', 'search_query', 'result_container', 'languages', 'stop_test', 'test_results' + + def __init__(self, + test_results: TestResults, + test_name: str, + search_query: SearchQuery, + result_container: ResultContainer): + self.test_name = test_name + self.search_query = search_query + self.result_container = result_container + self.languages: typing.Set[str] = set() + self.test_results = test_results + self.stop_test = False + + @property + def result_urls(self): + results = self.result_container.get_ordered_results() + return [result['url'] for result in results] + + def _record_error(self, message: str, *args) -> None: + sq = _search_query_to_dict(self.search_query) + sqstr = ' '.join(['{}={!r}'.format(k, v) for k, v in sq.items()]) + self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')') + + def _add_language(self, text: str) -> typing.Optional[str]: + r = cld3.get_language(str(text)) # pylint: disable=E1101 + if r is not None and r.probability >= 0.98 and r.is_reliable: + self.languages.add(r.language) + self.test_results.add_language(r.language) + return None + + def _check_result(self, result): + if not _check_no_html(result.get('title', '')): + self._record_error('HTML in title', repr(result.get('title', ''))) + if not _check_no_html(result.get('content', '')): + self._record_error('HTML in content', repr(result.get('content', ''))) + + self._add_language(result.get('title', '')) + self._add_language(result.get('content', '')) + + template = result.get('template', 'default.html') + if template == 'default.html': + return + if template == 'code.html': + return + if template == 'torrent.html': + return + if template == 'map.html': + return + if template == 'images.html': + thumbnail_src = result.get('thumbnail_src') + if thumbnail_src is not None: + if not _is_url_image(thumbnail_src): + self._record_error('thumbnail_src URL is invalid', thumbnail_src) + elif not _is_url_image(result.get('img_src')): + self._record_error('img_src URL is invalid', result.get('img_src')) + if template == 'videos.html' and not _is_url_image(result.get('thumbnail')): + self._record_error('thumbnail URL is invalid', result.get('img_src')) + + def _check_results(self, results: list): + for result in results: + self._check_result(result) + + def _check_answers(self, answers): + for answer in answers: + if not _check_no_html(answer): + self._record_error('HTML in answer', answer) + + def _check_infoboxes(self, infoboxes): + for infobox in infoboxes: + if not _check_no_html(infobox.get('content', '')): + self._record_error('HTML in infobox content', infobox.get('content', '')) + self._add_language(infobox.get('content', '')) + for attribute in infobox.get('attributes', {}): + if not _check_no_html(attribute.get('value', '')): + self._record_error('HTML in infobox attribute value', attribute.get('value', '')) + + def check_basic(self): + if len(self.result_container.unresponsive_engines) > 0: + for message in self.result_container.unresponsive_engines: + self._record_error(message[1] + ' ' + (message[2] or '')) + self.stop_test = True + return + + results = self.result_container.get_ordered_results() + if len(results) > 0: + self._check_results(results) + + if len(self.result_container.answers) > 0: + self._check_answers(self.result_container.answers) + + if len(self.result_container.infoboxes) > 0: + self._check_infoboxes(self.result_container.infoboxes) + + def has_infobox(self): + """Check the ResultContainer has at least one infobox""" + if len(self.result_container.infoboxes) == 0: + self._record_error('No infobox') + + def has_answer(self): + """Check the ResultContainer has at least one answer""" + if len(self.result_container.answers) == 0: + self._record_error('No answer') + + def has_language(self, lang): + """Check at least one title or content of the results is written in the `lang`. + + Detected using pycld3, may be not accurate""" + if lang not in self.languages: + self._record_error(lang + ' not found') + + def not_empty(self): + """Check the ResultContainer has at least one answer or infobox or result""" + result_types = set() + results = self.result_container.get_ordered_results() + if len(results) > 0: + result_types.add('results') + + if len(self.result_container.answers) > 0: + result_types.add('answers') + + if len(self.result_container.infoboxes) > 0: + result_types.add('infoboxes') + + if len(result_types) == 0: + self._record_error('No result') + + def one_title_contains(self, title: str): + """Check one of the title contains `title` (case insensitive comparaison)""" + title = title.lower() + for result in self.result_container.get_ordered_results(): + if title in result['title'].lower(): + return + self._record_error(('{!r} not found in the title'.format(title))) + + +class CheckerTests: + + __slots__ = 'test_results', 'test_name', 'result_container_tests_list' + + def __init__(self, + test_results: TestResults, + test_name: str, + result_container_tests_list: typing.List[ResultContainerTests]): + self.test_results = test_results + self.test_name = test_name + self.result_container_tests_list = result_container_tests_list + + def unique_results(self): + """Check the results of each ResultContain is unique""" + urls_list = [rct.result_urls for rct in self.result_container_tests_list] + if len(urls_list[0]) > 0: + # results on the first page + for i, urls_i in enumerate(urls_list): + for j, urls_j in enumerate(urls_list): + if i < j and urls_i == urls_j: + common, diff = _search_query_diff(self.result_container_tests_list[i].search_query, + self.result_container_tests_list[j].search_query) + common_str = ' '.join(['{}={!r}'.format(k, v) for k, v in common.items()]) + diff1_str = ', ' .join(['{}={!r}'.format(k, v1) for (k, (v1, v2)) in diff.items()]) + diff2_str = ', ' .join(['{}={!r}'.format(k, v2) for (k, (v1, v2)) in diff.items()]) + self.test_results.add_error(self.test_name, + 'results are identitical for {} and {} ({})' + .format(diff1_str, diff2_str, common_str)) + + +class Checker: + + __slots__ = 'processor', 'tests', 'test_results' + + def __init__(self, processor: EngineProcessor): + self.processor = processor + self.tests = self.processor.get_tests() + self.test_results = TestResults() + + @property + def engineref_list(self): + engine_name = self.processor.engine_name + engine_category = self.processor.engine.categories[0] + return [EngineRef(engine_name, engine_category)] + + @staticmethod + def search_query_matrix_iterator(engineref_list, matrix): + p = [] + for name, values in matrix.items(): + if isinstance(values, (tuple, list)): + l = [(name, value) for value in values] + else: + l = [(name, values)] + p.append(l) + + for kwargs in itertools.product(*p): + kwargs = {k: v for k, v in kwargs} + query = kwargs['query'] + params = dict(kwargs) + del params['query'] + yield SearchQuery(query, engineref_list, **params) + + def call_test(self, obj, test_description): + if isinstance(test_description, (tuple, list)): + method, args = test_description[0], test_description[1:] + else: + method = test_description + args = () + if isinstance(method, str) and hasattr(obj, method): + getattr(obj, method)(*args) + elif isinstance(method, types.FunctionType): + method(*args) + else: + self.test_results.add_error(obj.test_name, + 'method {!r} ({}) not found for {}' + .format(method, method.__class__.__name__, obj.__class__.__name__)) + + def call_tests(self, obj, test_descriptions): + for test_description in test_descriptions: + self.call_test(obj, test_description) + + def search(self, search_query: SearchQuery) -> ResultContainer: + result_container = ResultContainer() + engineref_category = search_query.engineref_list[0].category + params = self.processor.get_params(search_query, engineref_category) + if params is not None: + self.processor.search(search_query.query, params, result_container, time(), 5) + return result_container + + def get_result_container_tests(self, test_name: str, search_query: SearchQuery) -> ResultContainerTests: + result_container = self.search(search_query) + result_container_check = ResultContainerTests(self.test_results, test_name, search_query, result_container) + result_container_check.check_basic() + return result_container_check + + def run_test(self, test_name): + test_parameters = self.tests[test_name] + search_query_list = list(Checker.search_query_matrix_iterator(self.engineref_list, test_parameters['matrix'])) + rct_list = [self.get_result_container_tests(test_name, search_query) for search_query in search_query_list] + stop_test = False + if 'result_container' in test_parameters: + for rct in rct_list: + stop_test = stop_test or rct.stop_test + if not rct.stop_test: + self.call_tests(rct, test_parameters['result_container']) + if not stop_test: + if 'test' in test_parameters: + checker_tests = CheckerTests(self.test_results, test_name, rct_list) + self.call_tests(checker_tests, test_parameters['test']) + + def run(self): + for test_name in self.tests: + self.run_test(test_name) diff --git a/searx/search/models.py b/searx/search/models.py new file mode 100644 index 000000000..80ceaa223 --- /dev/null +++ b/searx/search/models.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import typing + + +class EngineRef: + + __slots__ = 'name', 'category' + + def __init__(self, name: str, category: str): + self.name = name + self.category = category + + def __repr__(self): + return "EngineRef({!r}, {!r})".format(self.name, self.category) + + def __eq__(self, other): + return self.name == other.name and self.category == other.category + + def __hash__(self): + return hash((self.name, self.category)) + + +class SearchQuery: + """container for all the search parameters (query, language, etc...)""" + + __slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\ + 'timeout_limit', 'external_bang' + + def __init__(self, + query: str, + engineref_list: typing.List[EngineRef], + lang: str='all', + safesearch: int=0, + pageno: int=1, + time_range: typing.Optional[str]=None, + timeout_limit: typing.Optional[float]=None, + external_bang: typing.Optional[str]=None): + self.query = query + self.engineref_list = engineref_list + self.lang = lang + self.safesearch = safesearch + self.pageno = pageno + self.time_range = time_range + self.timeout_limit = timeout_limit + self.external_bang = external_bang + + @property + def categories(self): + return list(set(map(lambda engineref: engineref.category, self.engineref_list))) + + def __repr__(self): + return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\ + format(self.query, self.engineref_list, self.lang, self.safesearch, + self.pageno, self.time_range, self.timeout_limit, self.external_bang) + + def __eq__(self, other): + return self.query == other.query\ + and self.engineref_list == other.engineref_list\ + and self.lang == other.lang\ + and self.safesearch == other.safesearch\ + and self.pageno == other.pageno\ + and self.time_range == other.time_range\ + and self.timeout_limit == other.timeout_limit\ + and self.external_bang == other.external_bang + + def __hash__(self): + return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range, + self.timeout_limit, self.external_bang)) diff --git a/searx/search/processors/abstract.py b/searx/search/processors/abstract.py index cf3fd7236..eb8d296ec 100644 --- a/searx/search/processors/abstract.py +++ b/searx/search/processors/abstract.py @@ -37,3 +37,15 @@ class EngineProcessor: @abstractmethod def search(self, query, params, result_container, start_time, timeout_limit): pass + + def get_tests(self): + tests = getattr(self.engine, 'tests', None) + if tests is None: + tests = getattr(self.engine, 'additional_tests', {}) + tests.update(self.get_default_tests()) + return tests + else: + return tests + + def get_default_tests(self): + return {} diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py index b62f8059e..0ceb0adf2 100644 --- a/searx/search/processors/online.py +++ b/searx/search/processors/online.py @@ -179,15 +179,15 @@ class OnlineProcessor(EngineProcessor): requests_exception = True elif (issubclass(e.__class__, SearxEngineCaptchaException)): result_container.add_unresponsive_engine(self.engine_name, 'CAPTCHA required') - logger.exception('engine {0} : CAPTCHA') + logger.exception('engine {0} : CAPTCHA'.format(self.engine_name)) suspended_time = e.suspended_time # pylint: disable=no-member elif (issubclass(e.__class__, SearxEngineTooManyRequestsException)): result_container.add_unresponsive_engine(self.engine_name, 'too many requests') - logger.exception('engine {0} : Too many requests') + logger.exception('engine {0} : Too many requests'.format(self.engine_name)) suspended_time = e.suspended_time # pylint: disable=no-member elif (issubclass(e.__class__, SearxEngineAccessDeniedException)): result_container.add_unresponsive_engine(self.engine_name, 'blocked') - logger.exception('engine {0} : Searx is blocked') + logger.exception('engine {0} : Searx is blocked'.format(self.engine_name)) suspended_time = e.suspended_time # pylint: disable=no-member else: result_container.add_unresponsive_engine(self.engine_name, 'unexpected crash') @@ -211,3 +211,49 @@ class OnlineProcessor(EngineProcessor): # reset the suspend variables self.engine.continuous_errors = 0 self.engine.suspend_end_time = 0 + + def get_default_tests(self): + tests = {} + + tests['simple'] = { + 'matrix': {'query': ('life', 'computer')}, + 'result_container': ['not_empty'], + } + + if getattr(self.engine, 'paging', False): + tests['paging'] = { + 'matrix': {'query': 'time', + 'pageno': (1, 2, 3)}, + 'result_container': ['not_empty'], + 'test': ['unique_results'] + } + if 'general' in self.engine.categories: + # avoid documentation about HTML tags (<time> and <input type="time">) + tests['paging']['matrix']['query'] = 'news' + + if getattr(self.engine, 'time_range', False): + tests['time_range'] = { + 'matrix': {'query': 'news', + 'time_range': (None, 'day')}, + 'result_container': ['not_empty'], + 'test': ['unique_results'] + } + + if getattr(self.engine, 'lang', False): + tests['lang_fr'] = { + 'matrix': {'query': 'paris', 'lang': 'fr'}, + 'result_container': ['not_empty', ('has_lang', 'fr')], + } + tests['lang_en'] = { + 'matrix': {'query': 'paris', 'lang': 'en'}, + 'result_container': ['not_empty', ('has_lang', 'en')], + } + + if getattr(self.engine, 'safesearch', False): + tests['safesearch'] = { + 'matrix': {'query': 'porn', + 'safesearch': (0, 2)}, + 'test': ['unique_results'] + } + + return tests diff --git a/searx/search/processors/online_currency.py b/searx/search/processors/online_currency.py index f0e919c03..132c10594 100644 --- a/searx/search/processors/online_currency.py +++ b/searx/search/processors/online_currency.py @@ -55,3 +55,13 @@ class OnlineCurrencyProcessor(OnlineProcessor): params['from_name'] = iso4217_to_name(from_currency, 'en') params['to_name'] = iso4217_to_name(to_currency, 'en') return params + + def get_default_tests(self): + tests = {} + + tests['currency'] = { + 'matrix': {'query': '1337 usd in rmb'}, + 'result_container': ['has_answer'], + } + + return tests diff --git a/searx/search/processors/online_dictionary.py b/searx/search/processors/online_dictionary.py index 8e9ef1620..987c710a1 100644 --- a/searx/search/processors/online_dictionary.py +++ b/searx/search/processors/online_dictionary.py @@ -35,3 +35,21 @@ class OnlineDictionaryProcessor(OnlineProcessor): params['query'] = query return params + + def get_default_tests(self): + tests = {} + + if getattr(self.engine, 'paging', False): + tests['translation_paging'] = { + 'matrix': {'query': 'en-es house', + 'pageno': (1, 2, 3)}, + 'result_container': ['not_empty', ('one_title_contains', 'house')], + 'test': ['unique_results'] + } + else: + tests['translation'] = { + 'matrix': {'query': 'en-es house'}, + 'result_container': ['not_empty', ('one_title_contains', 'house')], + } + + return tests diff --git a/searx/settings.yml b/searx/settings.yml index 30afbf957..55c9849c1 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -102,6 +102,34 @@ outgoing: # communication with search engines # - "HTTPS rewrite" # - ... +checker: + # disable checker when in debug mode + off_when_debug: True + # scheduling: interval or int + # use "scheduling: False" to disable scheduling + scheduling: + start_after: [300, 1800] # delay to start the first run of the checker + every: [86400, 90000] # how often the checker runs + # additional tests: only for the YAML anchors (see the engines section) + additional_tests: + rosebud: &test_rosebud + matrix: + query: rosebud + lang: en + result_container: + - not_empty + - ['one_title_contains', 'citizen kane'] + test: + - unique_results + # tests: only for the YAML anchors (see the engines section) + tests: + infobox: &tests_infobox + infobox: + matrix: + query: ["linux", "new york", "bbc"] + result_container: + - has_infobox + engines: - name: apk mirror engine: apkmirror @@ -218,6 +246,7 @@ engines: shortcut : ddd weight : 2 disabled : True + tests: *tests_infobox # cloudflare protected # - name : digbt @@ -262,6 +291,7 @@ engines: shortcut : wd timeout : 3.0 weight : 2 + tests: *tests_infobox - name : duckduckgo engine : duckduckgo @@ -278,6 +308,8 @@ engines: engine : etools shortcut : eto disabled : True + additional_tests: + rosebud: *test_rosebud - name : etymonline engine : xpath @@ -343,6 +375,8 @@ engines: shortcut : gb timeout : 3.0 disabled: True + additional_tests: + rosebud: *test_rosebud - name : gentoo engine : gentoo @@ -646,6 +680,8 @@ engines: shortcut : qw categories : general disabled : True + additional_tests: + rosebud: *test_rosebud - name : qwant images engine : qwant @@ -745,6 +781,8 @@ engines: shortcut : sp timeout : 6.0 disabled : True + additional_tests: + rosebud: *test_rosebud - name : tokyotoshokan engine : tokyotoshokan @@ -856,6 +894,8 @@ engines: number_of_results : 5 search_type : text disabled : True + additional_tests: + rosebud: *test_rosebud - name : wikisource engine : mediawiki diff --git a/searx/shared/__init__.py b/searx/shared/__init__.py new file mode 100644 index 000000000..cbe24d239 --- /dev/null +++ b/searx/shared/__init__.py @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import logging + +logger = logging.getLogger('searx.shared') + +try: + import uwsgi +except: + # no uwsgi + from .shared_simple import SimpleSharedDict as SharedDict, schedule + logger.info('Use shared_simple implementation') +else: + try: + uwsgi.cache_update('dummy', b'dummy') + if uwsgi.cache_get('dummy') != b'dummy': + raise Exception() + except: + # uwsgi.ini configuration problem: disable all scheduling + logger.error('uwsgi.ini configuration error, add this line to your uwsgi.ini\n' + 'cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1') + from .shared_simple import SimpleSharedDict as SharedDict + + def schedule(delay, func, *args): + return False + else: + # uwsgi + from .shared_uwsgi import UwsgiCacheSharedDict as SharedDict, schedule + logger.info('Use shared_uwsgi implementation') + +storage = SharedDict() diff --git a/searx/shared/shared_abstract.py b/searx/shared/shared_abstract.py new file mode 100644 index 000000000..b1c72aabe --- /dev/null +++ b/searx/shared/shared_abstract.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +from abc import ABC, abstractmethod + + +class SharedDict(ABC): + + @abstractmethod + def get_int(self, key): + pass + + @abstractmethod + def set_int(self, key, value): + pass + + @abstractmethod + def get_str(self, key): + pass + + @abstractmethod + def set_str(self, key, value): + pass diff --git a/searx/shared/shared_simple.py b/searx/shared/shared_simple.py new file mode 100644 index 000000000..48d8cb822 --- /dev/null +++ b/searx/shared/shared_simple.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import threading + +from . import shared_abstract + + +class SimpleSharedDict(shared_abstract.SharedDict): + + __slots__ = 'd', + + def __init__(self): + self.d = {} + + def get_int(self, key): + return self.d.get(key, None) + + def set_int(self, key, value): + self.d[key] = value + + def get_str(self, key): + return self.d.get(key, None) + + def set_str(self, key, value): + self.d[key] = value + + +def schedule(delay, func, *args): + def call_later(): + t = threading.Timer(delay, wrapper) + t.daemon = True + t.start() + + def wrapper(): + call_later() + func(*args) + + call_later() + return True diff --git a/searx/shared/shared_uwsgi.py b/searx/shared/shared_uwsgi.py new file mode 100644 index 000000000..a6dba9f59 --- /dev/null +++ b/searx/shared/shared_uwsgi.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import time +import uwsgi # pylint: disable=E0401 +from . import shared_abstract + + +_last_signal = 10 + + +class UwsgiCacheSharedDict(shared_abstract.SharedDict): + + def get_int(self, key): + value = uwsgi.cache_get(key) + if value is None: + return value + else: + return int.from_bytes(value, 'big') + + def set_int(self, key, value): + b = value.to_bytes(4, 'big') + uwsgi.cache_update(key, b) + + def get_str(self, key): + value = uwsgi.cache_get(key) + if value is None: + return value + else: + return value.decode('utf-8') + + def set_str(self, key, value): + b = value.encode('utf-8') + uwsgi.cache_update(key, b) + + +def schedule(delay, func, *args): + """ + Can be implemented using a spooler. + https://uwsgi-docs.readthedocs.io/en/latest/PythonDecorators.html + + To make the uwsgi configuration simple, use the alternative implementation. + """ + global _last_signal + + def sighandler(signum): + now = int(time.time()) + key = 'scheduler_call_time_signal_' + str(signum) + uwsgi.lock() + try: + updating = uwsgi.cache_get(key) + if updating is not None: + updating = int.from_bytes(updating, 'big') + if now - updating < delay: + return + uwsgi.cache_update(key, now.to_bytes(4, 'big')) + finally: + uwsgi.unlock() + func(*args) + + signal_num = _last_signal + _last_signal += 1 + uwsgi.register_signal(signal_num, 'worker', sighandler) + uwsgi.add_timer(signal_num, delay) + return True diff --git a/searx/webapp.py b/searx/webapp.py index 10f4ce78c..985eced18 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -71,7 +71,8 @@ from searx.webadapter import get_search_query_from_webapp, get_selected_categori from searx.utils import html_to_text, gen_useragent, dict_subset, match_language from searx.version import VERSION_STRING from searx.languages import language_codes as languages -from searx.search import SearchWithPlugins, initialize +from searx.search import SearchWithPlugins, initialize as search_initialize +from searx.search.checker import get_result as checker_get_result from searx.query import RawTextQuery from searx.autocomplete import searx_bang, backends as autocomplete_backends from searx.plugins import plugins @@ -81,7 +82,6 @@ from searx.answerers import answerers from searx.poolrequests import get_global_proxies from searx.metrology.error_recorder import errors_per_engines - # serve pages with HTTP/1.1 from werkzeug.serving import WSGIRequestHandler WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server'].get('http_protocol_version', '1.0')) @@ -136,7 +136,7 @@ werkzeug_reloader = flask_run_development or (searx_debug and __name__ == "__mai # initialize the engines except on the first run of the werkzeug server. if not werkzeug_reloader\ or (werkzeug_reloader and os.environ.get("WERKZEUG_RUN_MAIN") == "true"): - initialize() + search_initialize(enable_checker=True) babel = Babel(app) @@ -977,6 +977,12 @@ def stats_errors(): return jsonify(result) +@app.route('/stats/checker', methods=['GET']) +def stats_checker(): + result = checker_get_result() + return jsonify(result) + + @app.route('/robots.txt', methods=['GET']) def robots(): return Response("""User-agent: * @@ -49,7 +49,8 @@ setup( }, entry_points={ 'console_scripts': [ - 'searx-run = searx.webapp:run' + 'searx-run = searx.webapp:run', + 'searx-checker = searx.search.checker.__main__:main' ] }, package_data={ diff --git a/utils/searx.sh b/utils/searx.sh index b7d3b8e1c..134f0dbb1 100755 --- a/utils/searx.sh +++ b/utils/searx.sh @@ -46,6 +46,7 @@ SEARX_PACKAGES_debian="\ python3-dev python3-babel python3-venv uwsgi uwsgi-plugin-python3 git build-essential libxslt-dev zlib1g-dev libffi-dev libssl-dev +libprotobuf-dev protobuf-compiler shellcheck" BUILD_PACKAGES_debian="\ @@ -58,6 +59,7 @@ SEARX_PACKAGES_arch="\ python python-pip python-lxml python-babel uwsgi uwsgi-plugin-python git base-devel libxml2 +protobuf shellcheck" BUILD_PACKAGES_arch="\ @@ -69,7 +71,7 @@ SEARX_PACKAGES_fedora="\ python python-pip python-lxml python-babel uwsgi uwsgi-plugin-python3 git @development-tools libxml2 -ShellCheck" +ShellCheck protobuf-compiler protobuf-devel" BUILD_PACKAGES_fedora="\ firefox graphviz graphviz-gd ImageMagick librsvg2-tools @@ -82,7 +84,7 @@ SEARX_PACKAGES_centos="\ python36 python36-pip python36-lxml python-babel uwsgi uwsgi-plugin-python3 git @development-tools libxml2 -ShellCheck" +ShellCheck protobuf-compiler protobuf-devel" BUILD_PACKAGES_centos="\ firefox graphviz graphviz-gd ImageMagick librsvg2-tools diff --git a/utils/templates/etc/uwsgi/apps-archlinux/searx.ini b/utils/templates/etc/uwsgi/apps-archlinux/searx.ini index 9dd2e6f2f..71cece3c4 100644 --- a/utils/templates/etc/uwsgi/apps-archlinux/searx.ini +++ b/utils/templates/etc/uwsgi/apps-archlinux/searx.ini @@ -82,4 +82,7 @@ http = ${SEARX_INTERNAL_HTTP} # mkdir -p /run/uwsgi/app/searx # chown -R ${SERVICE_USER}:${SERVICE_GROUP} /run/uwsgi/app/searx # -# socket = /run/uwsgi/app/searx/socket
\ No newline at end of file +# socket = /run/uwsgi/app/searx/socket + +# Cache +cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1 diff --git a/utils/templates/etc/uwsgi/apps-available/searx.ini b/utils/templates/etc/uwsgi/apps-available/searx.ini index 4d69da0cf..45214ef13 100644 --- a/utils/templates/etc/uwsgi/apps-available/searx.ini +++ b/utils/templates/etc/uwsgi/apps-available/searx.ini @@ -82,3 +82,6 @@ http = ${SEARX_INTERNAL_HTTP} # chown -R ${SERVICE_USER}:${SERVICE_GROUP} /run/uwsgi/app/searx # # socket = /run/uwsgi/app/searx/socket + +# Cache +cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1 |