diff options
Diffstat (limited to 'searx/search/checker')
| -rw-r--r-- | searx/search/checker/__init__.py | 3 | ||||
| -rw-r--r-- | searx/search/checker/__main__.py | 30 | ||||
| -rw-r--r-- | searx/search/checker/background.py | 106 | ||||
| -rw-r--r-- | searx/search/checker/impl.py | 12 |
4 files changed, 143 insertions, 8 deletions
diff --git a/searx/search/checker/__init__.py b/searx/search/checker/__init__.py index 442d5a09d..85b9178df 100644 --- a/searx/search/checker/__init__.py +++ b/searx/search/checker/__init__.py @@ -1 +1,4 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + from .impl import Checker +from .background import initialize, get_result diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py index 2f808237a..37b7e6cda 100644 --- a/searx/search/checker/__main__.py +++ b/searx/search/checker/__main__.py @@ -1,9 +1,13 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + import sys import os +import argparse import searx.search -import searx.search.processors import searx.search.checker +from searx.search import processors +from searx.engines import engine_shortcuts if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']: @@ -18,20 +22,24 @@ else: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", "" -def iter_processor(): - if len(sys.argv) > 1: - for name, processor in searx.search.processors.items(): - if name in sys.argv: +def iter_processor(engine_name_list): + if len(engine_name_list) > 0: + for name in engine_name_list: + name = engine_shortcuts.get(name, name) + processor = processors.get(name) + if processor is not None: yield name, processor + else: + print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Not found ', RESET_SEQ) else: for name, processor in searx.search.processors.items(): yield name, processor -def main(): +def run(engine_name_list): searx.search.initialize() broken_urls = [] - for name, processor in iter_processor(): + for name, processor in iter_processor(engine_name_list): if sys.stdout.isatty(): print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, WHITE, ' Checking', RESET_SEQ) checker = searx.search.checker.Checker(processor) @@ -48,5 +56,13 @@ def main(): print('Error fetching', url) +def main(): + parser = argparse.ArgumentParser(description='Check searx engines.') + parser.add_argument('engine_name_list', metavar='engine name', type=str, nargs='*', + help='engines name or shortcut list. Empty for all engines.') + args = parser.parse_args() + run(args.engine_name_list) + + if __name__ == '__main__': main() diff --git a/searx/search/checker/background.py b/searx/search/checker/background.py new file mode 100644 index 000000000..45188ab38 --- /dev/null +++ b/searx/search/checker/background.py @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import json +import random +import time +import threading +import os +import signal + +from searx import logger, settings, searx_debug +from searx.exceptions import SearxSettingsException +from searx.search.processors import processors +from searx.search.checker import Checker +from searx.shared import schedule, storage + + +CHECKER_RESULT = 'CHECKER_RESULT' +running = threading.Lock() + + +def _get_interval(every, error_msg): + if isinstance(every, int): + every = (every, every) + if not isinstance(every, (tuple, list))\ + or len(every) != 2\ + or not isinstance(every[0], int)\ + or not isinstance(every[1], int): + raise SearxSettingsException(error_msg, None) + return every + + +def _get_every(): + every = settings.get('checker', {}).get('scheduling', {}).get('every', (300, 1800)) + return _get_interval(every, 'checker.scheduling.every is not a int or list') + + +def get_result(): + serialized_result = storage.get_str('CHECKER_RESULT') + if serialized_result is not None: + return json.loads(serialized_result) + + +def run(): + if not running.acquire(blocking=False): + return + try: + logger.info('Starting checker') + result = {} + for name, processor in processors.items(): + logger.debug('Checking %s engine', name) + checker = Checker(processor) + checker.run() + if checker.test_results.succesfull: + result[name] = {'status': True} + else: + result[name] = {'status': False, 'errors': checker.test_results.errors} + + storage.set_str('CHECKER_RESULT', json.dumps(result)) + logger.info('Check done') + finally: + running.release() + + +def _run_with_delay(): + every = _get_every() + delay = random.randint(0, every[1] - every[0]) + logger.debug('Start checker in %i seconds', delay) + time.sleep(delay) + run() + + +def _start_scheduling(): + every = _get_every() + schedule(every[0], _run_with_delay) + run() + + +def _signal_handler(signum, frame): + t = threading.Thread(target=run) + t.daemon = True + t.start() + + +def initialize(): + logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid()) + signal.signal(signal.SIGUSR1, _signal_handler) + + # special case when debug is activate + if searx_debug and settings.get('checker', {}).get('off_when_debug', True): + logger.info('debug mode: checker is disabled') + return + + # check value of checker.scheduling.every now + scheduling = settings.get('checker', {}).get('scheduling', None) + if scheduling is None or not scheduling: + logger.info('Checker scheduler is disabled') + return + + # + start_after = scheduling.get('start_after', (300, 1800)) + start_after = _get_interval(start_after, 'checker.scheduling.start_after is not a int or list') + delay = random.randint(start_after[0], start_after[1]) + logger.info('Start checker in %i seconds', delay) + t = threading.Timer(delay, _start_scheduling) + t.daemon = True + t.start() diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py index f55b6d0f5..abef5f8e9 100644 --- a/searx/search/checker/impl.py +++ b/searx/search/checker/impl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + import typing import types import functools @@ -11,7 +13,7 @@ import requests.exceptions from searx import poolrequests, logger from searx.results import ResultContainer -from searx.search import SearchQuery, EngineRef +from searx.search.models import SearchQuery, EngineRef from searx.search.processors import EngineProcessor @@ -240,18 +242,24 @@ class ResultContainerTests: self._check_infoboxes(self.result_container.infoboxes) def has_infobox(self): + """Check the ResultContainer has at least one infobox""" if len(self.result_container.infoboxes) == 0: self._record_error('No infobox') def has_answer(self): + """Check the ResultContainer has at least one answer""" if len(self.result_container.answers) == 0: self._record_error('No answer') def has_language(self, lang): + """Check at least one title or content of the results is written in the `lang`. + + Detected using pycld3, may be not accurate""" if lang not in self.languages: self._record_error(lang + ' not found') def not_empty(self): + """Check the ResultContainer has at least one answer or infobox or result""" result_types = set() results = self.result_container.get_ordered_results() if len(results) > 0: @@ -267,6 +275,7 @@ class ResultContainerTests: self._record_error('No result') def one_title_contains(self, title: str): + """Check one of the title contains `title` (case insensitive comparaison)""" title = title.lower() for result in self.result_container.get_ordered_results(): if title in result['title'].lower(): @@ -287,6 +296,7 @@ class CheckerTests: self.result_container_tests_list = result_container_tests_list def unique_results(self): + """Check the results of each ResultContain is unique""" urls_list = [rct.result_urls for rct in self.result_container_tests_list] if len(urls_list[0]) > 0: # results on the first page |