From 8cbc9f2d5823eb984e99e15c963e306610007fa1 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Thu, 24 Dec 2020 09:28:16 +0100 Subject: [enh] add checker --- searx/search/checker/__init__.py | 1 + searx/search/checker/__main__.py | 51 +++++ searx/search/checker/impl.py | 388 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 440 insertions(+) create mode 100644 searx/search/checker/__init__.py create mode 100644 searx/search/checker/__main__.py create mode 100644 searx/search/checker/impl.py (limited to 'searx/search/checker') diff --git a/searx/search/checker/__init__.py b/searx/search/checker/__init__.py new file mode 100644 index 000000000..442d5a09d --- /dev/null +++ b/searx/search/checker/__init__.py @@ -0,0 +1 @@ +from .impl import Checker diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py new file mode 100644 index 000000000..c071e6437 --- /dev/null +++ b/searx/search/checker/__main__.py @@ -0,0 +1,51 @@ +import sys + +import searx.search +import searx.search.processors +import searx.search.checker + + +if sys.stdout.isatty(): + RESET_SEQ = "\033[0m" + COLOR_SEQ = "\033[1;%dm" + BOLD_SEQ = "\033[1m" + BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = map(lambda i: COLOR_SEQ % (30 + i), range(8)) +else: + RESET_SEQ = "" + COLOR_SEQ = "" + BOLD_SEQ = "" + BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", "" + + +def iter_processor(): + if len(sys.argv) > 1: + for name, processor in searx.search.processors.items(): + if name in sys.argv: + yield name, processor + else: + for name, processor in searx.search.processors.items(): + yield name, processor + + +def main(): + searx.search.initialize() + broken_urls = [] + for name, processor in iter_processor(): + if sys.stdout.isatty(): + print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, WHITE, ' Checking', RESET_SEQ) + checker = searx.search.checker.Checker(processor) + checker.run() + if checker.test_results.succesfull: + 
# Tag names searched for by the predicate built in get_check_no_html().
# 'kbd' (keyboard input element) replaces the former misspelling 'kdb'.
HTML_TAGS = [
    'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script',
    'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite',
    'code', 'data', 'dfn', 'em', 'i', 'kbd', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small',
    'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr', 'style', 'blockquote', 'dd', 'div', 'dl', 'dt',
    'figcaption', 'figure', 'hr', 'li', 'ol', 'p', 'pre', 'ul', 'button', 'datalist', 'fieldset', 'form', 'input',
    'label', 'legend', 'meter', 'optgroup', 'option', 'output', 'progress', 'select', 'textarea', 'applet',
    'frame', 'frameset'
]


def get_check_no_html():
    """Build the predicate used to detect HTML markup in a text.

    Returns:
        A function ``f(text)`` that lower-cases ``text`` and returns True
        when it contains neither an opening tag (``<tag ...>``) nor a
        closing tag (``</tag>``) for any name listed in HTML_TAGS.
    """
    # Raw strings: '>' needs no backslash inside a character class, and
    # '\>' in a normal string literal is an invalid escape sequence.
    opening = [r'<' + tag + r'[^>]*>' for tag in HTML_TAGS]
    # Closing tags must be spelled out: an empty alternative matches at
    # every position, which would make the predicate report HTML in any text.
    closing = [r'</' + tag + r'>' for tag in HTML_TAGS]
    pattern = re.compile('|'.join(opening + closing))

    def f(text):
        return pattern.search(text.lower()) is None

    return f


# Compiled once at import time and shared by all checks.
_check_no_html = get_check_no_html()
@functools.lru_cache(maxsize=8192)
def _is_url_image(image_url):
    """Tell whether ``image_url`` designates an image.

    ``data:`` URLs are accepted when their media type starts with
    ``image/``.  http(s) URLs (and scheme-relative ``//host/...`` URLs,
    which are fetched over https) are requested and accepted when the
    response ``content-type`` header starts with ``image/``.

    Results are memoised per URL by ``lru_cache``.

    Returns:
        bool: True for an image, False otherwise (non-string argument,
        unsupported scheme, request error, missing or non-image content
        type, or two consecutive timeouts).
    """
    if not isinstance(image_url, str):
        return False

    if image_url.startswith('//'):
        image_url = 'https:' + image_url

    if image_url.startswith('data:'):
        return image_url.startswith('data:image/')

    if not _is_url(image_url):
        return False

    retry = 2

    while retry > 0:
        a = time()
        try:
            poolrequests.set_timeout_for_thread(10.0, time())
            r = poolrequests.get(image_url, timeout=10.0, allow_redirects=True, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US;q=0.5,en;q=0.3',
                'Accept-Encoding': 'gzip, deflate, br',
                'DNT': '1',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
                'Sec-GPC': '1',
                'Cache-Control': 'max-age=0'
            })
            # A response without a content-type header is "not an image";
            # indexing the header directly would raise an uncaught KeyError.
            return r.headers.get('content-type', '').startswith('image/')
        except requests.exceptions.Timeout:
            # only timeouts are retried
            logger.error('Timeout for %s: %i', image_url, int(time() - a))
            retry -= 1
        except requests.exceptions.RequestException:
            logger.exception('Exception for %s', image_url)
            return False

    # every attempt timed out: answer with an explicit bool instead of None
    return False
class TestResults:
    """Accumulate the error messages and broken URLs reported by the tests of one engine."""

    __slots__ = 'errors', 'broken_urls'

    def __init__(self):
        # test name -> distinct error messages, in order of first report
        self.errors: typing.Dict[str, typing.List[str]] = {}
        # distinct URLs reported as broken, in order of first report
        self.broken_urls = []

    def add_error(self, test, message):
        """Record ``message`` for ``test`` unless it was already recorded."""
        if test not in self.errors:
            self.errors[test] = [message]
            return
        known_messages = self.errors[test]
        if message in known_messages:
            return
        known_messages.append(message)

    def add_broken_url(self, url):
        """Record ``url`` unless it was already recorded."""
        if url in self.broken_urls:
            return
        self.broken_urls.append(url)

    @property
    def succesfull(self):
        """True while no error has been recorded."""
        return not self.errors

    def __iter__(self):
        """Yield ``(test_name, error)`` pairs, errors sorted within each test."""
        for test_name in self.errors:
            yield from ((test_name, error) for error in sorted(self.errors[test_name]))
'default.html') + if template == 'default.html': + return + if template == 'code.html': + return + if template == 'torrent.html': + return + if template == 'map.html': + return + if template == 'images.html': + thumbnail_src = result.get('thumbnail_src') + if thumbnail_src is not None: + if not _is_url_image(thumbnail_src): + self.test_results.add_broken_url(thumbnail_src) + self._record_error('thumbnail_src URL is invalid') + elif not _is_url_image(result.get('img_src')): + self.test_results.add_broken_url(result.get('img_src')) + self._record_error('img_src URL is invalid') + if template == 'videos.html' and not _is_url_image(result.get('thumbnail')): + self._record_error('thumbnail URL is invalid') + + def _check_results(self, results: list): + for result in results: + self._check_result(result) + + def _check_answers(self, answers): + for answer in answers: + if not _check_no_html(answer): + self._record_error('HTML in answer') + + def _check_infoboxes(self, infoboxes): + for infobox in infoboxes: + if not _check_no_html(infobox.get('content', '')): + self._record_error('HTML in infobox content') + self._add_language(infobox.get('content', '')) + for attribute in infobox.get('attributes', {}): + if not _check_no_html(attribute.get('value', '')): + self._record_error('HTML in infobox attribute value') + + def check_basic(self): + if len(self.result_container.unresponsive_engines) > 0: + for message in self.result_container.unresponsive_engines: + self._record_error(message[1] + ' ' + (message[2] or '')) + self.stop_test = True + return + + results = self.result_container.get_ordered_results() + if len(results) > 0: + self._check_results(results) + + if len(self.result_container.answers) > 0: + self._check_answers(self.result_container.answers) + + if len(self.result_container.infoboxes) > 0: + self._check_infoboxes(self.result_container.infoboxes) + + def has_infobox(self): + if len(self.result_container.infoboxes) == 0: + self._record_error('No infobox') + + 
def one_title_contains(self, title: str):
    """Check that at least one result title contains ``title``.

    The comparison is case insensitive.  When no title matches, an error
    quoting the lower-cased ``title`` is recorded for the current test.
    """
    title = title.lower()
    for result in self.result_container.get_ordered_results():
        # Read the title defensively, as _check_result does: a result with
        # no title (or a None title) is a non-match, not a crash.
        if title in (result.get('title') or '').lower():
            return
    self._record_error(('{!r} not found in the title'.format(title)))
def call_test(self, obj, test_description):
    """Run one test description against ``obj``.

    ``test_description`` is either a single item or a non-empty tuple/list
    whose first item is the test and whose remaining items are its
    positional arguments.  The test is:

    * a string naming a method of ``obj``: that method is called;
    * any other callable: it is called with the arguments only
      (``obj`` is not passed).

    Anything else is recorded as an error on ``self.test_results`` under
    ``obj.test_name``.
    """
    # An empty sequence has no method to extract; let it fall through to
    # the error branch below instead of raising IndexError.
    if isinstance(test_description, (tuple, list)) and test_description:
        method, args = test_description[0], test_description[1:]
    else:
        method = test_description
        args = ()
    if isinstance(method, str) and hasattr(obj, method):
        getattr(obj, method)(*args)
    elif callable(method):
        # accepts plain functions and lambdas as before, plus partials,
        # bound methods and other callables
        method(*args)
    else:
        self.test_results.add_error(obj.test_name,
                                    'method {!r} ({}) not found for {}'
                                    .format(method, method.__class__.__name__, obj.__class__.__name__))
def run_test(self, test_name):
    """Run the test registered as ``test_name`` in ``self.tests``.

    One search is executed per combination of the test's ``matrix``.
    The optional ``result_container`` descriptions are then run against
    each result container that did not ask to stop, and the optional
    ``test`` descriptions are run once against all the containers
    together, provided no container asked to stop.
    """
    test_parameters = self.tests[test_name]
    search_query_list = list(self.search_query_matrix_iterator(self.engineref_list, test_parameters['matrix']))
    rct_list = [self.get_result_container_tests(test_name, search_query) for search_query in search_query_list]
    # Evaluate the stop flag for every container up front, so that it is
    # honoured even when the test declares no 'result_container' section.
    stop_test = any(rct.stop_test for rct in rct_list)
    if 'result_container' in test_parameters:
        for rct in rct_list:
            if not rct.stop_test:
                self.call_tests(rct, test_parameters['result_container'])
    if not stop_test and 'test' in test_parameters:
        checker_tests = CheckerTests(self.test_results, test_name, rct_list)
        self.call_tests(checker_tests, test_parameters['test'])
11:24:39 +0100 Subject: [enh] checker: background check See settings.yml for the options SIGUSR1 signal starts the checker. The result is available at /stats/checker --- searx/search/checker/__init__.py | 3 ++ searx/search/checker/__main__.py | 30 ++++++++--- searx/search/checker/background.py | 106 +++++++++++++++++++++++++++++++++++++ searx/search/checker/impl.py | 12 ++++- 4 files changed, 143 insertions(+), 8 deletions(-) create mode 100644 searx/search/checker/background.py (limited to 'searx/search/checker') diff --git a/searx/search/checker/__init__.py b/searx/search/checker/__init__.py index 442d5a09d..85b9178df 100644 --- a/searx/search/checker/__init__.py +++ b/searx/search/checker/__init__.py @@ -1 +1,4 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + from .impl import Checker +from .background import initialize, get_result diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py index 2f808237a..37b7e6cda 100644 --- a/searx/search/checker/__main__.py +++ b/searx/search/checker/__main__.py @@ -1,9 +1,13 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + import sys import os +import argparse import searx.search -import searx.search.processors import searx.search.checker +from searx.search import processors +from searx.engines import engine_shortcuts if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']: @@ -18,20 +22,24 @@ else: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", "" -def iter_processor(): - if len(sys.argv) > 1: - for name, processor in searx.search.processors.items(): - if name in sys.argv: +def iter_processor(engine_name_list): + if len(engine_name_list) > 0: + for name in engine_name_list: + name = engine_shortcuts.get(name, name) + processor = processors.get(name) + if processor is not None: yield name, processor + else: + print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Not found ', RESET_SEQ) else: for name, processor in 
def _get_interval(every, error_msg):
    """Validate a scheduling interval setting and return it as a pair.

    Args:
        every: either a single int, expanded to ``(every, every)``, or a
            tuple/list of two ints ``(minimum, maximum)``.
        error_msg: message of the exception raised for an invalid value.

    Returns:
        The two-item interval; a tuple/list argument is returned as is.

    Raises:
        SearxSettingsException: when the value has another shape, holds
            non-integers (booleans included), or has minimum > maximum.
    """
    # bool is a subclass of int, but True/False is not a meaningful delay
    if isinstance(every, int) and not isinstance(every, bool):
        every = (every, every)
    if not isinstance(every, (tuple, list))\
       or len(every) != 2\
       or not all(isinstance(bound, int) and not isinstance(bound, bool) for bound in every)\
       or every[0] > every[1]:
        # A reversed interval is rejected here: callers hand the bounds to
        # random.randint(), which raises ValueError on an empty range.
        raise SearxSettingsException(error_msg, None)
    return every
if serialized_result is not None: + return json.loads(serialized_result) + + +def run(): + if not running.acquire(blocking=False): + return + try: + logger.info('Starting checker') + result = {} + for name, processor in processors.items(): + logger.debug('Checking %s engine', name) + checker = Checker(processor) + checker.run() + if checker.test_results.succesfull: + result[name] = {'status': True} + else: + result[name] = {'status': False, 'errors': checker.test_results.errors} + + storage.set_str('CHECKER_RESULT', json.dumps(result)) + logger.info('Check done') + finally: + running.release() + + +def _run_with_delay(): + every = _get_every() + delay = random.randint(0, every[1] - every[0]) + logger.debug('Start checker in %i seconds', delay) + time.sleep(delay) + run() + + +def _start_scheduling(): + every = _get_every() + schedule(every[0], _run_with_delay) + run() + + +def _signal_handler(signum, frame): + t = threading.Thread(target=run) + t.daemon = True + t.start() + + +def initialize(): + logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid()) + signal.signal(signal.SIGUSR1, _signal_handler) + + # special case when debug is activate + if searx_debug and settings.get('checker', {}).get('off_when_debug', True): + logger.info('debug mode: checker is disabled') + return + + # check value of checker.scheduling.every now + scheduling = settings.get('checker', {}).get('scheduling', None) + if scheduling is None or not scheduling: + logger.info('Checker scheduler is disabled') + return + + # + start_after = scheduling.get('start_after', (300, 1800)) + start_after = _get_interval(start_after, 'checker.scheduling.start_after is not a int or list') + delay = random.randint(start_after[0], start_after[1]) + logger.info('Start checker in %i seconds', delay) + t = threading.Timer(delay, _start_scheduling) + t.daemon = True + t.start() diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py index f55b6d0f5..abef5f8e9 100644 --- 
a/searx/search/checker/impl.py +++ b/searx/search/checker/impl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + import typing import types import functools @@ -11,7 +13,7 @@ import requests.exceptions from searx import poolrequests, logger from searx.results import ResultContainer -from searx.search import SearchQuery, EngineRef +from searx.search.models import SearchQuery, EngineRef from searx.search.processors import EngineProcessor @@ -240,18 +242,24 @@ class ResultContainerTests: self._check_infoboxes(self.result_container.infoboxes) def has_infobox(self): + """Check the ResultContainer has at least one infobox""" if len(self.result_container.infoboxes) == 0: self._record_error('No infobox') def has_answer(self): + """Check the ResultContainer has at least one answer""" if len(self.result_container.answers) == 0: self._record_error('No answer') def has_language(self, lang): + """Check at least one title or content of the results is written in the `lang`. + + Detected using pycld3, may be not accurate""" if lang not in self.languages: self._record_error(lang + ' not found') def not_empty(self): + """Check the ResultContainer has at least one answer or infobox or result""" result_types = set() results = self.result_container.get_ordered_results() if len(results) > 0: @@ -267,6 +275,7 @@ class ResultContainerTests: self._record_error('No result') def one_title_contains(self, title: str): + """Check one of the title contains `title` (case insensitive comparaison)""" title = title.lower() for result in self.result_container.get_ordered_results(): if title in result['title'].lower(): @@ -287,6 +296,7 @@ class CheckerTests: self.result_container_tests_list = result_container_tests_list def unique_results(self): + """Check the results of each ResultContain is unique""" urls_list = [rct.result_urls for rct in self.result_container_tests_list] if len(urls_list[0]) > 0: # results on the first page -- cgit v1.2.3 From 
45bfab77d0154c60f58be0453307cb03b48dca35 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Fri, 8 Jan 2021 19:04:04 +0100 Subject: |mod] checker: improve searx-checker command line * output is unbuffered * verbose mode describe more precisly the errrors --- searx/search/checker/__main__.py | 54 +++++++++++++++++++++++++++++----------- searx/search/checker/impl.py | 46 ++++++++++++++++++++-------------- 2 files changed, 67 insertions(+), 33 deletions(-) (limited to 'searx/search/checker') diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py index 37b7e6cda..75b37e6c5 100644 --- a/searx/search/checker/__main__.py +++ b/searx/search/checker/__main__.py @@ -1,8 +1,10 @@ # SPDX-License-Identifier: AGPL-3.0-or-later import sys +import io import os import argparse +import logging import searx.search import searx.search.checker @@ -10,6 +12,14 @@ from searx.search import processors from searx.engines import engine_shortcuts +# configure logging +root = logging.getLogger() +handler = logging.StreamHandler(sys.stdout) +for h in root.handlers: + root.removeHandler(h) +root.addHandler(handler) + +# color only for a valid terminal if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']: RESET_SEQ = "\033[0m" COLOR_SEQ = "\033[1;%dm" @@ -21,7 +31,12 @@ else: BOLD_SEQ = "" BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", "" +# equivalent of 'python -u' (unbuffered stdout, stderr) +stdout = io.TextIOWrapper(open(sys.stdout.fileno(), 'wb', 0), write_through=True) +stderr = io.TextIOWrapper(open(sys.stderr.fileno(), 'wb', 0), write_through=True) + +# iterator of processors def iter_processor(engine_name_list): if len(engine_name_list) > 0: for name in engine_name_list: @@ -30,38 +45,49 @@ def iter_processor(engine_name_list): if processor is not None: yield name, processor else: - print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Not found ', RESET_SEQ) + 
# actual check & display
def run(engine_name_list, verbose):
    """Check the selected engines and write one report per engine to stdout.

    Args:
        engine_name_list: engine names or shortcuts, forwarded to
            iter_processor() to select the engines.
        verbose: when true, also write the detected languages and, for a
            failing engine, every logged error instead of the one-line
            error summary.
    """
    searx.search.initialize()
    for name, processor in iter_processor(engine_name_list):
        stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n')
        if not sys.stdout.isatty():
            # stdout is not a terminal: repeat the progress line on stderr
            stderr.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n')
        checker = searx.search.checker.Checker(processor)
        checker.run()
        languages = " ".join(sorted(checker.test_results.languages))
        if checker.test_results.succesfull:
            stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{GREEN}OK{RESET_SEQ}\n')
            if verbose:
                stdout.write(f' {"found languages":15}: {languages}\n')
        else:
            # the "Error" label is written once here; the branches below only add the details
            stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RED}Error{RESET_SEQ}')
            if not verbose:
                errors = [test_name + ': ' + error for test_name, error in checker.test_results]
                stdout.write(f'{RED} {str(errors)}{RESET_SEQ}\n')
            else:
                stdout.write('\n')
                stdout.write(f' {"found languages":15}: {languages}\n')
                for test_name, logs in checker.test_results.logs.items():
                    for log in logs:
                        # log entries can hold non-string values (for example a
                        # missing URL recorded as None), so convert before joining
                        stdout.write(f' {test_name:15}: {RED}{" ".join(map(str, log))}{RESET_SEQ}\n')
metavar='engine name', type=str, nargs='*', help='engines name or shortcut list. Empty for all engines.') + parser.add_argument('--verbose', '-v', + action='store_true', dest='verbose', + help='Display details about the test results', + default=False) args = parser.parse_args() - run(args.engine_name_list) + run(args.engine_name_list, args.verbose) if __name__ == '__main__': diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py index abef5f8e9..71a941f73 100644 --- a/searx/search/checker/impl.py +++ b/searx/search/checker/impl.py @@ -17,6 +17,8 @@ from searx.search.models import SearchQuery, EngineRef from searx.search.processors import EngineProcessor +logger = logger.getChild('searx.search.checker') + HTML_TAGS = [ 'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script', 'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite', @@ -121,20 +123,25 @@ def _search_query_diff(sq1: SearchQuery, sq2: SearchQuery)\ class TestResults: - __slots__ = 'errors', 'broken_urls' + __slots__ = 'errors', 'logs', 'languages' def __init__(self): self.errors: typing.Dict[str, typing.List[str]] = {} - self.broken_urls = [] + self.logs: typing.Dict[str, typing.List[typing.Any]] = {} + self.languages: typing.Set[str] = set() - def add_error(self, test, message): + def add_error(self, test, message, *args): + # message to self.errors errors_for_test = self.errors.setdefault(test, []) if message not in errors_for_test: errors_for_test.append(message) + # (message, *args) to self.logs + logs_for_test = self.logs.setdefault(test, []) + if (message, *args) not in logs_for_test: + logs_for_test.append((message, *args)) - def add_broken_url(self, url): - if url not in self.broken_urls: - self.broken_urls.append(url) + def add_language(self, language): + self.languages.add(language) @property def succesfull(self): @@ -167,20 +174,23 @@ class ResultContainerTests: results = 
self.result_container.get_ordered_results() return [result['url'] for result in results] - def _record_error(self, message: str) -> None: - self.test_results.add_error(self.test_name, message) + def _record_error(self, message: str, *args) -> None: + sq = _search_query_to_dict(self.search_query) + sqstr = ' '.join(['{}={!r}'.format(k, v) for k, v in sq.items()]) + self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')') def _add_language(self, text: str) -> typing.Optional[str]: r = cld3.get_language(str(text)) # pylint: disable=E1101 - if r is not None and r.probability >= 0.9 and r.is_reliable: + if r is not None and r.probability >= 0.98 and r.is_reliable: self.languages.add(r.language) + self.test_results.add_language(r.language) return None def _check_result(self, result): if not _check_no_html(result.get('title', '')): - self._record_error('HTML in title') + self._record_error('HTML in title', repr(result.get('title', ''))) if not _check_no_html(result.get('content', '')): - self._record_error('HTML in content') + self._record_error('HTML in content', repr(result.get('content', ''))) self._add_language(result.get('title', '')) self._add_language(result.get('content', '')) @@ -198,13 +208,11 @@ class ResultContainerTests: thumbnail_src = result.get('thumbnail_src') if thumbnail_src is not None: if not _is_url_image(thumbnail_src): - self.test_results.add_broken_url(thumbnail_src) - self._record_error('thumbnail_src URL is invalid') + self._record_error('thumbnail_src URL is invalid', thumbnail_src) elif not _is_url_image(result.get('img_src')): - self.test_results.add_broken_url(result.get('img_src')) - self._record_error('img_src URL is invalid') + self._record_error('img_src URL is invalid', result.get('img_src')) if template == 'videos.html' and not _is_url_image(result.get('thumbnail')): - self._record_error('thumbnail URL is invalid') + self._record_error('thumbnail URL is invalid', result.get('img_src')) def _check_results(self, 
results: list): for result in results: @@ -213,16 +221,16 @@ class ResultContainerTests: def _check_answers(self, answers): for answer in answers: if not _check_no_html(answer): - self._record_error('HTML in answer') + self._record_error('HTML in answer', answer) def _check_infoboxes(self, infoboxes): for infobox in infoboxes: if not _check_no_html(infobox.get('content', '')): - self._record_error('HTML in infobox content') + self._record_error('HTML in infobox content', infobox.get('content', '')) self._add_language(infobox.get('content', '')) for attribute in infobox.get('attributes', {}): if not _check_no_html(attribute.get('value', '')): - self._record_error('HTML in infobox attribute value') + self._record_error('HTML in infobox attribute value', attribute.get('value', '')) def check_basic(self): if len(self.result_container.unresponsive_engines) > 0: -- cgit v1.2.3 From 87bafbc32b34ef7f3033bdea6a4bfa966a6068c1 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Mon, 11 Jan 2021 18:43:12 +0100 Subject: [mod] checker: add status and timestamp to the result for each engine: replace status by success --- searx/search/checker/background.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'searx/search/checker') diff --git a/searx/search/checker/background.py b/searx/search/checker/background.py index 45188ab38..0fc13ddb6 100644 --- a/searx/search/checker/background.py +++ b/searx/search/checker/background.py @@ -35,28 +35,39 @@ def _get_every(): def get_result(): - serialized_result = storage.get_str('CHECKER_RESULT') + serialized_result = storage.get_str(CHECKER_RESULT) if serialized_result is not None: return json.loads(serialized_result) +def _set_result(result): + result['timestamp'] = int(time.time() / 3600) * 3600 + storage.set_str(CHECKER_RESULT, json.dumps(result)) + + def run(): if not running.acquire(blocking=False): return try: logger.info('Starting checker') - result = {} + result = { + 'status': 'ok', + 
'engines': {} + } for name, processor in processors.items(): logger.debug('Checking %s engine', name) checker = Checker(processor) checker.run() if checker.test_results.succesfull: - result[name] = {'status': True} + result['engines'][name] = {'success': True} else: - result[name] = {'status': False, 'errors': checker.test_results.errors} + result['engines'][name] = {'success': False, 'errors': checker.test_results.errors} - storage.set_str('CHECKER_RESULT', json.dumps(result)) + _set_result(result) logger.info('Check done') + except Exception: + _set_result({'status': 'error'}) + logger.exception('Error while running the checker') finally: running.release() @@ -85,6 +96,9 @@ def initialize(): logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid()) signal.signal(signal.SIGUSR1, _signal_handler) + # disabled by default + _set_result({'status': 'disabled'}) + # special case when debug is activate if searx_debug and settings.get('checker', {}).get('off_when_debug', True): logger.info('debug mode: checker is disabled') @@ -97,6 +111,8 @@ def initialize(): return # + _set_result({'status': 'unknow'}) + start_after = scheduling.get('start_after', (300, 1800)) start_after = _get_interval(start_after, 'checker.scheduling.start_after is not a int or list') delay = random.randint(start_after[0], start_after[1]) -- cgit v1.2.3 From 7f0c508598cc2197e53b877dcf4c76e25a097c4f Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Tue, 12 Jan 2021 09:33:58 +0100 Subject: [fix] checker: fix typo unknown instead of unknow --- searx/search/checker/background.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/search/checker') diff --git a/searx/search/checker/background.py b/searx/search/checker/background.py index 0fc13ddb6..be30897bc 100644 --- a/searx/search/checker/background.py +++ b/searx/search/checker/background.py @@ -111,7 +111,7 @@ def initialize(): return # - _set_result({'status': 'unknow'}) + _set_result({'status': 
'unknown'}) start_after = scheduling.get('start_after', (300, 1800)) start_after = _get_interval(start_after, 'checker.scheduling.start_after is not a int or list') -- cgit v1.2.3 From 912c7e975c3943db798d748fa48d460467b66d30 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Wed, 13 Jan 2021 14:07:39 +0100 Subject: [fix] checker: don't run the checker when uwsgi is not properly configured Before this commit, even with the scheduler disabled, the checker was running at least once for each uwsgi worker. --- searx/search/checker/background.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'searx/search/checker') diff --git a/searx/search/checker/background.py b/searx/search/checker/background.py index be30897bc..e41bff5f5 100644 --- a/searx/search/checker/background.py +++ b/searx/search/checker/background.py @@ -40,8 +40,9 @@ def get_result(): return json.loads(serialized_result) -def _set_result(result): - result['timestamp'] = int(time.time() / 3600) * 3600 +def _set_result(result, include_timestamp=True): + if include_timestamp: + result['timestamp'] = int(time.time() / 3600) * 3600 storage.set_str(CHECKER_RESULT, json.dumps(result)) @@ -82,8 +83,8 @@ def _run_with_delay(): def _start_scheduling(): every = _get_every() - schedule(every[0], _run_with_delay) - run() + if schedule(every[0], _run_with_delay): + run() def _signal_handler(signum, frame): @@ -111,7 +112,7 @@ def initialize(): return # - _set_result({'status': 'unknown'}) + _set_result({'status': 'unknown'}, include_timestamp=False) start_after = scheduling.get('start_after', (300, 1800)) start_after = _get_interval(start_after, 'checker.scheduling.start_after is not a int or list') -- cgit v1.2.3