From 8cbc9f2d5823eb984e99e15c963e306610007fa1 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Thu, 24 Dec 2020 09:28:16 +0100 Subject: [enh] add checker --- searx/search/__init__.py | 7 + searx/search/checker/__init__.py | 1 + searx/search/checker/__main__.py | 51 ++++ searx/search/checker/impl.py | 388 +++++++++++++++++++++++++++ searx/search/processors/abstract.py | 12 + searx/search/processors/online.py | 44 +++ searx/search/processors/online_currency.py | 10 + searx/search/processors/online_dictionary.py | 18 ++ 8 files changed, 531 insertions(+) create mode 100644 searx/search/checker/__init__.py create mode 100644 searx/search/checker/__main__.py create mode 100644 searx/search/checker/impl.py (limited to 'searx/search') diff --git a/searx/search/__init__.py b/searx/search/__init__.py index 0d45f0b7c..7768d21e9 100644 --- a/searx/search/__init__.py +++ b/searx/search/__init__.py @@ -64,6 +64,9 @@ class EngineRef: def __eq__(self, other): return self.name == other.name and self.category == other.category + def __hash__(self): + return hash((self.name, self.category)) + class SearchQuery: """container for all the search parameters (query, language, etc...)""" @@ -108,6 +111,10 @@ class SearchQuery: and self.timeout_limit == other.timeout_limit\ and self.external_bang == other.external_bang + def __hash__(self): + return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range, + self.timeout_limit, self.external_bang)) + class Search: """Search information container""" diff --git a/searx/search/checker/__init__.py b/searx/search/checker/__init__.py new file mode 100644 index 000000000..442d5a09d --- /dev/null +++ b/searx/search/checker/__init__.py @@ -0,0 +1 @@ +from .impl import Checker diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py new file mode 100644 index 000000000..c071e6437 --- /dev/null +++ b/searx/search/checker/__main__.py @@ -0,0 +1,51 @@ +import sys + +import 
searx.search +import searx.search.processors +import searx.search.checker + + +if sys.stdout.isatty(): + RESET_SEQ = "\033[0m" + COLOR_SEQ = "\033[1;%dm" + BOLD_SEQ = "\033[1m" + BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = map(lambda i: COLOR_SEQ % (30 + i), range(8)) +else: + RESET_SEQ = "" + COLOR_SEQ = "" + BOLD_SEQ = "" + BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", "" + + +def iter_processor(): + if len(sys.argv) > 1: + for name, processor in searx.search.processors.items(): + if name in sys.argv: + yield name, processor + else: + for name, processor in searx.search.processors.items(): + yield name, processor + + +def main(): + searx.search.initialize() + broken_urls = [] + for name, processor in iter_processor(): + if sys.stdout.isatty(): + print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, WHITE, ' Checking', RESET_SEQ) + checker = searx.search.checker.Checker(processor) + checker.run() + if checker.test_results.succesfull: + print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, GREEN, ' OK', RESET_SEQ) + else: + errors = [test_name + ': ' + error for test_name, error in checker.test_results] + print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Error ', str(errors), RESET_SEQ) + + broken_urls += checker.test_results.broken_urls + + for url in broken_urls: + print('Error fetching', url) + + +if __name__ == '__main__': + main() diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py new file mode 100644 index 000000000..f55b6d0f5 --- /dev/null +++ b/searx/search/checker/impl.py @@ -0,0 +1,388 @@ +import typing +import types +import functools +import itertools +from time import time +from urllib.parse import urlparse + +import re +import cld3 +import requests.exceptions + +from searx import poolrequests, logger +from searx.results import ResultContainer +from searx.search import SearchQuery, EngineRef +from searx.search.processors import EngineProcessor + + +HTML_TAGS = [ + 
'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script', + 'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite', + 'code', 'data', 'dfn', 'em', 'i', 'kdb', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small', + 'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr', 'style', 'blockquote', 'dd', 'div', 'dl', 'dt', + 'figcaption', 'figure', 'hr', 'li', 'ol', 'p', 'pre', 'ul', 'button', 'datalist', 'fieldset', 'form', 'input', + 'label', 'legend', 'meter', 'optgroup', 'option', 'output', 'progress', 'select', 'textarea', 'applet', + 'frame', 'frameset' +] + + +def get_check_no_html(): + rep = ['<' + tag + '[^\>]*>' for tag in HTML_TAGS] + rep += ['</' + tag + '>' for tag in HTML_TAGS] + pattern = re.compile('|'.join(rep)) + + def f(text): + return pattern.search(text.lower()) is None + + return f + + +_check_no_html = get_check_no_html() + + +def _is_url(url): + try: + result = urlparse(url) + except ValueError: + return False + if result.scheme not in ('http', 'https'): + return False + return True + + +@functools.lru_cache(maxsize=8192) +def _is_url_image(image_url): + if not isinstance(image_url, str): + return False + + if image_url.startswith('//'): + image_url = 'https:' + image_url + + if image_url.startswith('data:'): + return image_url.startswith('data:image/') + + if not _is_url(image_url): + return False + + retry = 2 + + while retry > 0: + a = time() + try: + poolrequests.set_timeout_for_thread(10.0, time()) + r = poolrequests.get(image_url, timeout=10.0, allow_redirects=True, headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US;q=0.5,en;q=0.3', + 'Accept-Encoding': 'gzip, deflate, br', + 'DNT': '1', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-GPC': 
'1', + 'Cache-Control': 'max-age=0' + }) + if r.headers["content-type"].startswith('image/'): + return True + return False + except requests.exceptions.Timeout: + logger.error('Timeout for %s: %i', image_url, int(time() - a)) + retry -= 1 + except requests.exceptions.RequestException: + logger.exception('Exception for %s', image_url) + return False + + +def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]: + return { + 'query': search_query.query, + 'lang': search_query.lang, + 'pageno': search_query.pageno, + 'safesearch': search_query.safesearch, + 'time_range': search_query.time_range, + } + + +def _search_query_diff(sq1: SearchQuery, sq2: SearchQuery)\ + -> typing.Tuple[typing.Dict[str, typing.Any], typing.Dict[str, typing.Any]]: + param1 = _search_query_to_dict(sq1) + param2 = _search_query_to_dict(sq2) + common = {} + diff = {} + for k, value1 in param1.items(): + value2 = param2[k] + if value1 == value2: + common[k] = value1 + else: + diff[k] = (value1, value2) + return (common, diff) + + +class TestResults: + + __slots__ = 'errors', 'broken_urls' + + def __init__(self): + self.errors: typing.Dict[str, typing.List[str]] = {} + self.broken_urls = [] + + def add_error(self, test, message): + errors_for_test = self.errors.setdefault(test, []) + if message not in errors_for_test: + errors_for_test.append(message) + + def add_broken_url(self, url): + if url not in self.broken_urls: + self.broken_urls.append(url) + + @property + def succesfull(self): + return len(self.errors) == 0 + + def __iter__(self): + for test_name, errors in self.errors.items(): + for error in sorted(errors): + yield (test_name, error) + + +class ResultContainerTests: + + __slots__ = 'test_name', 'search_query', 'result_container', 'languages', 'stop_test', 'test_results' + + def __init__(self, + test_results: TestResults, + test_name: str, + search_query: SearchQuery, + result_container: ResultContainer): + self.test_name = test_name + self.search_query = 
search_query + self.result_container = result_container + self.languages: typing.Set[str] = set() + self.test_results = test_results + self.stop_test = False + + @property + def result_urls(self): + results = self.result_container.get_ordered_results() + return [result['url'] for result in results] + + def _record_error(self, message: str) -> None: + self.test_results.add_error(self.test_name, message) + + def _add_language(self, text: str) -> typing.Optional[str]: + r = cld3.get_language(str(text)) # pylint: disable=E1101 + if r is not None and r.probability >= 0.9 and r.is_reliable: + self.languages.add(r.language) + return None + + def _check_result(self, result): + if not _check_no_html(result.get('title', '')): + self._record_error('HTML in title') + if not _check_no_html(result.get('content', '')): + self._record_error('HTML in content') + + self._add_language(result.get('title', '')) + self._add_language(result.get('content', '')) + + template = result.get('template', 'default.html') + if template == 'default.html': + return + if template == 'code.html': + return + if template == 'torrent.html': + return + if template == 'map.html': + return + if template == 'images.html': + thumbnail_src = result.get('thumbnail_src') + if thumbnail_src is not None: + if not _is_url_image(thumbnail_src): + self.test_results.add_broken_url(thumbnail_src) + self._record_error('thumbnail_src URL is invalid') + elif not _is_url_image(result.get('img_src')): + self.test_results.add_broken_url(result.get('img_src')) + self._record_error('img_src URL is invalid') + if template == 'videos.html' and not _is_url_image(result.get('thumbnail')): + self._record_error('thumbnail URL is invalid') + + def _check_results(self, results: list): + for result in results: + self._check_result(result) + + def _check_answers(self, answers): + for answer in answers: + if not _check_no_html(answer): + self._record_error('HTML in answer') + + def _check_infoboxes(self, infoboxes): + for infobox in 
infoboxes: + if not _check_no_html(infobox.get('content', '')): + self._record_error('HTML in infobox content') + self._add_language(infobox.get('content', '')) + for attribute in infobox.get('attributes', {}): + if not _check_no_html(attribute.get('value', '')): + self._record_error('HTML in infobox attribute value') + + def check_basic(self): + if len(self.result_container.unresponsive_engines) > 0: + for message in self.result_container.unresponsive_engines: + self._record_error(message[1] + ' ' + (message[2] or '')) + self.stop_test = True + return + + results = self.result_container.get_ordered_results() + if len(results) > 0: + self._check_results(results) + + if len(self.result_container.answers) > 0: + self._check_answers(self.result_container.answers) + + if len(self.result_container.infoboxes) > 0: + self._check_infoboxes(self.result_container.infoboxes) + + def has_infobox(self): + if len(self.result_container.infoboxes) == 0: + self._record_error('No infobox') + + def has_answer(self): + if len(self.result_container.answers) == 0: + self._record_error('No answer') + + def has_language(self, lang): + if lang not in self.languages: + self._record_error(lang + ' not found') + + def not_empty(self): + result_types = set() + results = self.result_container.get_ordered_results() + if len(results) > 0: + result_types.add('results') + + if len(self.result_container.answers) > 0: + result_types.add('answers') + + if len(self.result_container.infoboxes) > 0: + result_types.add('infoboxes') + + if len(result_types) == 0: + self._record_error('No result') + + def one_title_contains(self, title: str): + title = title.lower() + for result in self.result_container.get_ordered_results(): + if title in result['title'].lower(): + return + self._record_error(('{!r} not found in the title'.format(title))) + + +class CheckerTests: + + __slots__ = 'test_results', 'test_name', 'result_container_tests_list' + + def __init__(self, + test_results: TestResults, + test_name: str, 
+ result_container_tests_list: typing.List[ResultContainerTests]): + self.test_results = test_results + self.test_name = test_name + self.result_container_tests_list = result_container_tests_list + + def unique_results(self): + urls_list = [rct.result_urls for rct in self.result_container_tests_list] + if len(urls_list[0]) > 0: + # results on the first page + for i, urls_i in enumerate(urls_list): + for j, urls_j in enumerate(urls_list): + if i < j and urls_i == urls_j: + common, diff = _search_query_diff(self.result_container_tests_list[i].search_query, + self.result_container_tests_list[j].search_query) + common_str = ' '.join(['{}={!r}'.format(k, v) for k, v in common.items()]) + diff1_str = ', ' .join(['{}={!r}'.format(k, v1) for (k, (v1, v2)) in diff.items()]) + diff2_str = ', ' .join(['{}={!r}'.format(k, v2) for (k, (v1, v2)) in diff.items()]) + self.test_results.add_error(self.test_name, + 'results are identitical for {} and {} ({})' + .format(diff1_str, diff2_str, common_str)) + + +class Checker: + + __slots__ = 'processor', 'tests', 'test_results' + + def __init__(self, processor: EngineProcessor): + self.processor = processor + self.tests = self.processor.get_tests() + self.test_results = TestResults() + + @property + def engineref_list(self): + engine_name = self.processor.engine_name + engine_category = self.processor.engine.categories[0] + return [EngineRef(engine_name, engine_category)] + + @staticmethod + def search_query_matrix_iterator(engineref_list, matrix): + p = [] + for name, values in matrix.items(): + if isinstance(values, (tuple, list)): + l = [(name, value) for value in values] + else: + l = [(name, values)] + p.append(l) + + for kwargs in itertools.product(*p): + kwargs = {k: v for k, v in kwargs} + query = kwargs['query'] + params = dict(kwargs) + del params['query'] + yield SearchQuery(query, engineref_list, **params) + + def call_test(self, obj, test_description): + if isinstance(test_description, (tuple, list)): + method, args = 
test_description[0], test_description[1:] + else: + method = test_description + args = () + if isinstance(method, str) and hasattr(obj, method): + getattr(obj, method)(*args) + elif isinstance(method, types.FunctionType): + method(*args) + else: + self.test_results.add_error(obj.test_name, + 'method {!r} ({}) not found for {}' + .format(method, method.__class__.__name__, obj.__class__.__name__)) + + def call_tests(self, obj, test_descriptions): + for test_description in test_descriptions: + self.call_test(obj, test_description) + + def search(self, search_query: SearchQuery) -> ResultContainer: + result_container = ResultContainer() + engineref_category = search_query.engineref_list[0].category + params = self.processor.get_params(search_query, engineref_category) + if params is not None: + self.processor.search(search_query.query, params, result_container, time(), 5) + return result_container + + def get_result_container_tests(self, test_name: str, search_query: SearchQuery) -> ResultContainerTests: + result_container = self.search(search_query) + result_container_check = ResultContainerTests(self.test_results, test_name, search_query, result_container) + result_container_check.check_basic() + return result_container_check + + def run_test(self, test_name): + test_parameters = self.tests[test_name] + search_query_list = list(Checker.search_query_matrix_iterator(self.engineref_list, test_parameters['matrix'])) + rct_list = [self.get_result_container_tests(test_name, search_query) for search_query in search_query_list] + stop_test = False + if 'result_container' in test_parameters: + for rct in rct_list: + stop_test = stop_test or rct.stop_test + if not rct.stop_test: + self.call_tests(rct, test_parameters['result_container']) + if not stop_test: + if 'test' in test_parameters: + checker_tests = CheckerTests(self.test_results, test_name, rct_list) + self.call_tests(checker_tests, test_parameters['test']) + + def run(self): + for test_name in self.tests: + 
self.run_test(test_name) diff --git a/searx/search/processors/abstract.py b/searx/search/processors/abstract.py index cf3fd7236..eb8d296ec 100644 --- a/searx/search/processors/abstract.py +++ b/searx/search/processors/abstract.py @@ -37,3 +37,15 @@ class EngineProcessor: @abstractmethod def search(self, query, params, result_container, start_time, timeout_limit): pass + + def get_tests(self): + tests = getattr(self.engine, 'tests', None) + if tests is None: + tests = getattr(self.engine, 'additional_tests', {}) + tests.update(self.get_default_tests()) + return tests + else: + return tests + + def get_default_tests(self): + return {} diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py index b62f8059e..54d63b4c9 100644 --- a/searx/search/processors/online.py +++ b/searx/search/processors/online.py @@ -211,3 +211,47 @@ class OnlineProcessor(EngineProcessor): # reset the suspend variables self.engine.continuous_errors = 0 self.engine.suspend_end_time = 0 + + def get_default_tests(self): + tests = {} + + tests['simple'] = { + 'matrix': {'query': ('time', 'time')}, + 'result_container': ['not_empty'], + } + + if getattr(self.engine, 'paging', False): + # [1, 2, 3] --> isinstance(l, (list, tuple)) ?? 
+ tests['paging'] = { + 'matrix': {'query': 'time', + 'pageno': (1, 2, 3)}, + 'result_container': ['not_empty'], + 'test': ['unique_results'] + } + + if getattr(self.engine, 'time_range', False): + tests['time_range'] = { + 'matrix': {'query': 'time', + 'time_range': (None, 'day')}, + 'result_container': ['not_empty'], + 'test': ['unique_results'] + } + + if getattr(self.engine, 'lang', False): + tests['lang_fr'] = { + 'matrix': {'query': 'paris', 'lang': 'fr'}, + 'result_container': ['not_empty', ('has_lang', 'fr')], + } + tests['lang_en'] = { + 'matrix': {'query': 'paris', 'lang': 'en'}, + 'result_container': ['not_empty', ('has_lang', 'en')], + } + + if getattr(self.engine, 'safesearch', False): + tests['safesearch'] = { + 'matrix': {'query': 'porn', + 'safesearch': (0, 2)}, + 'test': ['unique_results'] + } + + return tests diff --git a/searx/search/processors/online_currency.py b/searx/search/processors/online_currency.py index f0e919c03..132c10594 100644 --- a/searx/search/processors/online_currency.py +++ b/searx/search/processors/online_currency.py @@ -55,3 +55,13 @@ class OnlineCurrencyProcessor(OnlineProcessor): params['from_name'] = iso4217_to_name(from_currency, 'en') params['to_name'] = iso4217_to_name(to_currency, 'en') return params + + def get_default_tests(self): + tests = {} + + tests['currency'] = { + 'matrix': {'query': '1337 usd in rmb'}, + 'result_container': ['has_answer'], + } + + return tests diff --git a/searx/search/processors/online_dictionary.py b/searx/search/processors/online_dictionary.py index 8e9ef1620..987c710a1 100644 --- a/searx/search/processors/online_dictionary.py +++ b/searx/search/processors/online_dictionary.py @@ -35,3 +35,21 @@ class OnlineDictionaryProcessor(OnlineProcessor): params['query'] = query return params + + def get_default_tests(self): + tests = {} + + if getattr(self.engine, 'paging', False): + tests['translation_paging'] = { + 'matrix': {'query': 'en-es house', + 'pageno': (1, 2, 3)}, + 'result_container': 
['not_empty', ('one_title_contains', 'house')], + 'test': ['unique_results'] + } + else: + tests['translation'] = { + 'matrix': {'query': 'en-es house'}, + 'result_container': ['not_empty', ('one_title_contains', 'house')], + } + + return tests -- cgit v1.2.3 From 9c581466e136f7cb82d5ffe6c052fbd9e93ab39f Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Wed, 30 Dec 2020 15:24:29 +0100 Subject: [fix] do not colorize output on dumb terminals Signed-off-by: Markus Heiser --- searx/search/checker/__main__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'searx/search') diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py index c071e6437..2f808237a 100644 --- a/searx/search/checker/__main__.py +++ b/searx/search/checker/__main__.py @@ -1,11 +1,12 @@ import sys +import os import searx.search import searx.search.processors import searx.search.checker -if sys.stdout.isatty(): +if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']: RESET_SEQ = "\033[0m" COLOR_SEQ = "\033[1;%dm" BOLD_SEQ = "\033[1m" -- cgit v1.2.3 From 3a9f513521d006a7939538cce368d7b799e32c30 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Tue, 5 Jan 2021 11:24:39 +0100 Subject: [enh] checker: background check See settings.yml for the options SIGUSR1 signal starts the checker. 
The result is available at /stats/checker --- searx/search/__init__.py | 72 ++----------------------- searx/search/checker/__init__.py | 3 ++ searx/search/checker/__main__.py | 30 ++++++++--- searx/search/checker/background.py | 106 +++++++++++++++++++++++++++++++++++++ searx/search/checker/impl.py | 12 ++++- searx/search/models.py | 69 ++++++++++++++++++++++++ 6 files changed, 217 insertions(+), 75 deletions(-) create mode 100644 searx/search/checker/background.py create mode 100644 searx/search/models.py (limited to 'searx/search') diff --git a/searx/search/__init__.py b/searx/search/__init__.py index 7768d21e9..f777e8595 100644 --- a/searx/search/__init__.py +++ b/searx/search/__init__.py @@ -28,7 +28,9 @@ from searx.external_bang import get_bang_url from searx.results import ResultContainer from searx import logger from searx.plugins import plugins +from searx.search.models import EngineRef, SearchQuery from searx.search.processors import processors, initialize as initialize_processors +from searx.search.checker import initialize as initialize_checker logger = logger.getChild('search') @@ -45,75 +47,11 @@ else: sys.exit(1) -def initialize(settings_engines=None): +def initialize(settings_engines=None, enable_checker=False): settings_engines = settings_engines or settings['engines'] initialize_processors(settings_engines) - - -class EngineRef: - - __slots__ = 'name', 'category' - - def __init__(self, name: str, category: str): - self.name = name - self.category = category - - def __repr__(self): - return "EngineRef({!r}, {!r})".format(self.name, self.category) - - def __eq__(self, other): - return self.name == other.name and self.category == other.category - - def __hash__(self): - return hash((self.name, self.category)) - - -class SearchQuery: - """container for all the search parameters (query, language, etc...)""" - - __slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\ - 'timeout_limit', 'external_bang' - - def __init__(self, 
- query: str, - engineref_list: typing.List[EngineRef], - lang: str='all', - safesearch: int=0, - pageno: int=1, - time_range: typing.Optional[str]=None, - timeout_limit: typing.Optional[float]=None, - external_bang: typing.Optional[str]=None): - self.query = query - self.engineref_list = engineref_list - self.lang = lang - self.safesearch = safesearch - self.pageno = pageno - self.time_range = time_range - self.timeout_limit = timeout_limit - self.external_bang = external_bang - - @property - def categories(self): - return list(set(map(lambda engineref: engineref.category, self.engineref_list))) - - def __repr__(self): - return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\ - format(self.query, self.engineref_list, self.lang, self.safesearch, - self.pageno, self.time_range, self.timeout_limit, self.external_bang) - - def __eq__(self, other): - return self.query == other.query\ - and self.engineref_list == other.engineref_list\ - and self.lang == other.lang\ - and self.safesearch == other.safesearch\ - and self.pageno == other.pageno\ - and self.time_range == other.time_range\ - and self.timeout_limit == other.timeout_limit\ - and self.external_bang == other.external_bang - - def __hash__(self): - return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range, - self.timeout_limit, self.external_bang)) + if enable_checker: + initialize_checker() class Search: diff --git a/searx/search/checker/__init__.py b/searx/search/checker/__init__.py index 442d5a09d..85b9178df 100644 --- a/searx/search/checker/__init__.py +++ b/searx/search/checker/__init__.py @@ -1 +1,4 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + from .impl import Checker +from .background import initialize, get_result diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py index 2f808237a..37b7e6cda 100644 --- a/searx/search/checker/__main__.py +++ b/searx/search/checker/__main__.py @@ -1,9 +1,13 @@ +# 
SPDX-License-Identifier: AGPL-3.0-or-later + import sys import os +import argparse import searx.search -import searx.search.processors import searx.search.checker +from searx.search import processors +from searx.engines import engine_shortcuts if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']: @@ -18,20 +22,24 @@ else: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", "" -def iter_processor(): - if len(sys.argv) > 1: - for name, processor in searx.search.processors.items(): - if name in sys.argv: +def iter_processor(engine_name_list): + if len(engine_name_list) > 0: + for name in engine_name_list: + name = engine_shortcuts.get(name, name) + processor = processors.get(name) + if processor is not None: yield name, processor + else: + print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Not found ', RESET_SEQ) else: for name, processor in searx.search.processors.items(): yield name, processor -def main(): +def run(engine_name_list): searx.search.initialize() broken_urls = [] - for name, processor in iter_processor(): + for name, processor in iter_processor(engine_name_list): if sys.stdout.isatty(): print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, WHITE, ' Checking', RESET_SEQ) checker = searx.search.checker.Checker(processor) @@ -48,5 +56,13 @@ def main(): print('Error fetching', url) +def main(): + parser = argparse.ArgumentParser(description='Check searx engines.') + parser.add_argument('engine_name_list', metavar='engine name', type=str, nargs='*', + help='engines name or shortcut list. 
Empty for all engines.') + args = parser.parse_args() + run(args.engine_name_list) + + if __name__ == '__main__': main() diff --git a/searx/search/checker/background.py b/searx/search/checker/background.py new file mode 100644 index 000000000..45188ab38 --- /dev/null +++ b/searx/search/checker/background.py @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import json +import random +import time +import threading +import os +import signal + +from searx import logger, settings, searx_debug +from searx.exceptions import SearxSettingsException +from searx.search.processors import processors +from searx.search.checker import Checker +from searx.shared import schedule, storage + + +CHECKER_RESULT = 'CHECKER_RESULT' +running = threading.Lock() + + +def _get_interval(every, error_msg): + if isinstance(every, int): + every = (every, every) + if not isinstance(every, (tuple, list))\ + or len(every) != 2\ + or not isinstance(every[0], int)\ + or not isinstance(every[1], int): + raise SearxSettingsException(error_msg, None) + return every + + +def _get_every(): + every = settings.get('checker', {}).get('scheduling', {}).get('every', (300, 1800)) + return _get_interval(every, 'checker.scheduling.every is not a int or list') + + +def get_result(): + serialized_result = storage.get_str('CHECKER_RESULT') + if serialized_result is not None: + return json.loads(serialized_result) + + +def run(): + if not running.acquire(blocking=False): + return + try: + logger.info('Starting checker') + result = {} + for name, processor in processors.items(): + logger.debug('Checking %s engine', name) + checker = Checker(processor) + checker.run() + if checker.test_results.succesfull: + result[name] = {'status': True} + else: + result[name] = {'status': False, 'errors': checker.test_results.errors} + + storage.set_str('CHECKER_RESULT', json.dumps(result)) + logger.info('Check done') + finally: + running.release() + + +def _run_with_delay(): + every = _get_every() + delay = 
random.randint(0, every[1] - every[0]) + logger.debug('Start checker in %i seconds', delay) + time.sleep(delay) + run() + + +def _start_scheduling(): + every = _get_every() + schedule(every[0], _run_with_delay) + run() + + +def _signal_handler(signum, frame): + t = threading.Thread(target=run) + t.daemon = True + t.start() + + +def initialize(): + logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid()) + signal.signal(signal.SIGUSR1, _signal_handler) + + # special case when debug is activate + if searx_debug and settings.get('checker', {}).get('off_when_debug', True): + logger.info('debug mode: checker is disabled') + return + + # check value of checker.scheduling.every now + scheduling = settings.get('checker', {}).get('scheduling', None) + if scheduling is None or not scheduling: + logger.info('Checker scheduler is disabled') + return + + # + start_after = scheduling.get('start_after', (300, 1800)) + start_after = _get_interval(start_after, 'checker.scheduling.start_after is not a int or list') + delay = random.randint(start_after[0], start_after[1]) + logger.info('Start checker in %i seconds', delay) + t = threading.Timer(delay, _start_scheduling) + t.daemon = True + t.start() diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py index f55b6d0f5..abef5f8e9 100644 --- a/searx/search/checker/impl.py +++ b/searx/search/checker/impl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + import typing import types import functools @@ -11,7 +13,7 @@ import requests.exceptions from searx import poolrequests, logger from searx.results import ResultContainer -from searx.search import SearchQuery, EngineRef +from searx.search.models import SearchQuery, EngineRef from searx.search.processors import EngineProcessor @@ -240,18 +242,24 @@ class ResultContainerTests: self._check_infoboxes(self.result_container.infoboxes) def has_infobox(self): + """Check the ResultContainer has at least one infobox""" if 
len(self.result_container.infoboxes) == 0: self._record_error('No infobox') def has_answer(self): + """Check the ResultContainer has at least one answer""" if len(self.result_container.answers) == 0: self._record_error('No answer') def has_language(self, lang): + """Check at least one title or content of the results is written in the `lang`. + + Detected using pycld3, may be not accurate""" if lang not in self.languages: self._record_error(lang + ' not found') def not_empty(self): + """Check the ResultContainer has at least one answer or infobox or result""" result_types = set() results = self.result_container.get_ordered_results() if len(results) > 0: @@ -267,6 +275,7 @@ class ResultContainerTests: self._record_error('No result') def one_title_contains(self, title: str): + """Check one of the title contains `title` (case insensitive comparaison)""" title = title.lower() for result in self.result_container.get_ordered_results(): if title in result['title'].lower(): @@ -287,6 +296,7 @@ class CheckerTests: self.result_container_tests_list = result_container_tests_list def unique_results(self): + """Check the results of each ResultContain is unique""" urls_list = [rct.result_urls for rct in self.result_container_tests_list] if len(urls_list[0]) > 0: # results on the first page diff --git a/searx/search/models.py b/searx/search/models.py new file mode 100644 index 000000000..80ceaa223 --- /dev/null +++ b/searx/search/models.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import typing + + +class EngineRef: + + __slots__ = 'name', 'category' + + def __init__(self, name: str, category: str): + self.name = name + self.category = category + + def __repr__(self): + return "EngineRef({!r}, {!r})".format(self.name, self.category) + + def __eq__(self, other): + return self.name == other.name and self.category == other.category + + def __hash__(self): + return hash((self.name, self.category)) + + +class SearchQuery: + """container for all the search 
parameters (query, language, etc...)""" + + __slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\ + 'timeout_limit', 'external_bang' + + def __init__(self, + query: str, + engineref_list: typing.List[EngineRef], + lang: str='all', + safesearch: int=0, + pageno: int=1, + time_range: typing.Optional[str]=None, + timeout_limit: typing.Optional[float]=None, + external_bang: typing.Optional[str]=None): + self.query = query + self.engineref_list = engineref_list + self.lang = lang + self.safesearch = safesearch + self.pageno = pageno + self.time_range = time_range + self.timeout_limit = timeout_limit + self.external_bang = external_bang + + @property + def categories(self): + return list(set(map(lambda engineref: engineref.category, self.engineref_list))) + + def __repr__(self): + return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\ + format(self.query, self.engineref_list, self.lang, self.safesearch, + self.pageno, self.time_range, self.timeout_limit, self.external_bang) + + def __eq__(self, other): + return self.query == other.query\ + and self.engineref_list == other.engineref_list\ + and self.lang == other.lang\ + and self.safesearch == other.safesearch\ + and self.pageno == other.pageno\ + and self.time_range == other.time_range\ + and self.timeout_limit == other.timeout_limit\ + and self.external_bang == other.external_bang + + def __hash__(self): + return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range, + self.timeout_limit, self.external_bang)) -- cgit v1.2.3 From 45bfab77d0154c60f58be0453307cb03b48dca35 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Fri, 8 Jan 2021 19:04:04 +0100 Subject: [mod] checker: improve searx-checker command line * output is unbuffered * verbose mode describes the errors more precisely --- searx/search/checker/__main__.py | 54 +++++++++++++++++++++++++++++----------- searx/search/checker/impl.py | 46
++++++++++++++++++++-------------- 2 files changed, 67 insertions(+), 33 deletions(-) (limited to 'searx/search') diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py index 37b7e6cda..75b37e6c5 100644 --- a/searx/search/checker/__main__.py +++ b/searx/search/checker/__main__.py @@ -1,8 +1,10 @@ # SPDX-License-Identifier: AGPL-3.0-or-later import sys +import io import os import argparse +import logging import searx.search import searx.search.checker @@ -10,6 +12,14 @@ from searx.search import processors from searx.engines import engine_shortcuts +# configure logging +root = logging.getLogger() +handler = logging.StreamHandler(sys.stdout) +for h in root.handlers: + root.removeHandler(h) +root.addHandler(handler) + +# color only for a valid terminal if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']: RESET_SEQ = "\033[0m" COLOR_SEQ = "\033[1;%dm" @@ -21,7 +31,12 @@ else: BOLD_SEQ = "" BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", "" +# equivalent of 'python -u' (unbuffered stdout, stderr) +stdout = io.TextIOWrapper(open(sys.stdout.fileno(), 'wb', 0), write_through=True) +stderr = io.TextIOWrapper(open(sys.stderr.fileno(), 'wb', 0), write_through=True) + +# iterator of processors def iter_processor(engine_name_list): if len(engine_name_list) > 0: for name in engine_name_list: @@ -30,38 +45,49 @@ def iter_processor(engine_name_list): if processor is not None: yield name, processor else: - print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Not found ', RESET_SEQ) + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RED}Engine does not exist{RESET_SEQ}') else: for name, processor in searx.search.processors.items(): yield name, processor -def run(engine_name_list): +# actual check & display +def run(engine_name_list, verbose): searx.search.initialize() - broken_urls = [] for name, processor in iter_processor(engine_name_list): - if sys.stdout.isatty(): - print(BOLD_SEQ, 
'Engine ', '%-30s' % name, RESET_SEQ, WHITE, ' Checking', RESET_SEQ) + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n') + if not sys.stdout.isatty(): + stderr.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n') checker = searx.search.checker.Checker(processor) checker.run() if checker.test_results.succesfull: - print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, GREEN, ' OK', RESET_SEQ) + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{GREEN}OK{RESET_SEQ}\n') + if verbose: + stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n') else: - errors = [test_name + ': ' + error for test_name, error in checker.test_results] - print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Error ', str(errors), RESET_SEQ) - - broken_urls += checker.test_results.broken_urls - - for url in broken_urls: - print('Error fetching', url) + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RESET_SEQ}{RED}Error{RESET_SEQ}') + if not verbose: + errors = [test_name + ': ' + error for test_name, error in checker.test_results] + stdout.write(f'{RED}Error {str(errors)}{RESET_SEQ}\n') + else: + stdout.write('\n') + stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n') + for test_name, logs in checker.test_results.logs.items(): + for log in logs: + stdout.write(f' {test_name:15}: {RED}{" ".join(log)}{RESET_SEQ}\n') +# call by setup.py def main(): parser = argparse.ArgumentParser(description='Check searx engines.') parser.add_argument('engine_name_list', metavar='engine name', type=str, nargs='*', help='engines name or shortcut list. 
Empty for all engines.') + parser.add_argument('--verbose', '-v', + action='store_true', dest='verbose', + help='Display details about the test results', + default=False) args = parser.parse_args() - run(args.engine_name_list) + run(args.engine_name_list, args.verbose) if __name__ == '__main__': diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py index abef5f8e9..71a941f73 100644 --- a/searx/search/checker/impl.py +++ b/searx/search/checker/impl.py @@ -17,6 +17,8 @@ from searx.search.models import SearchQuery, EngineRef from searx.search.processors import EngineProcessor +logger = logger.getChild('searx.search.checker') + HTML_TAGS = [ 'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script', 'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite', @@ -121,20 +123,25 @@ def _search_query_diff(sq1: SearchQuery, sq2: SearchQuery)\ class TestResults: - __slots__ = 'errors', 'broken_urls' + __slots__ = 'errors', 'logs', 'languages' def __init__(self): self.errors: typing.Dict[str, typing.List[str]] = {} - self.broken_urls = [] + self.logs: typing.Dict[str, typing.List[typing.Any]] = {} + self.languages: typing.Set[str] = set() - def add_error(self, test, message): + def add_error(self, test, message, *args): + # message to self.errors errors_for_test = self.errors.setdefault(test, []) if message not in errors_for_test: errors_for_test.append(message) + # (message, *args) to self.logs + logs_for_test = self.logs.setdefault(test, []) + if (message, *args) not in logs_for_test: + logs_for_test.append((message, *args)) - def add_broken_url(self, url): - if url not in self.broken_urls: - self.broken_urls.append(url) + def add_language(self, language): + self.languages.add(language) @property def succesfull(self): @@ -167,20 +174,23 @@ class ResultContainerTests: results = self.result_container.get_ordered_results() return [result['url'] for result in 
results] - def _record_error(self, message: str) -> None: - self.test_results.add_error(self.test_name, message) + def _record_error(self, message: str, *args) -> None: + sq = _search_query_to_dict(self.search_query) + sqstr = ' '.join(['{}={!r}'.format(k, v) for k, v in sq.items()]) + self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')') def _add_language(self, text: str) -> typing.Optional[str]: r = cld3.get_language(str(text)) # pylint: disable=E1101 - if r is not None and r.probability >= 0.9 and r.is_reliable: + if r is not None and r.probability >= 0.98 and r.is_reliable: self.languages.add(r.language) + self.test_results.add_language(r.language) return None def _check_result(self, result): if not _check_no_html(result.get('title', '')): - self._record_error('HTML in title') + self._record_error('HTML in title', repr(result.get('title', ''))) if not _check_no_html(result.get('content', '')): - self._record_error('HTML in content') + self._record_error('HTML in content', repr(result.get('content', ''))) self._add_language(result.get('title', '')) self._add_language(result.get('content', '')) @@ -198,13 +208,11 @@ class ResultContainerTests: thumbnail_src = result.get('thumbnail_src') if thumbnail_src is not None: if not _is_url_image(thumbnail_src): - self.test_results.add_broken_url(thumbnail_src) - self._record_error('thumbnail_src URL is invalid') + self._record_error('thumbnail_src URL is invalid', thumbnail_src) elif not _is_url_image(result.get('img_src')): - self.test_results.add_broken_url(result.get('img_src')) - self._record_error('img_src URL is invalid') + self._record_error('img_src URL is invalid', result.get('img_src')) if template == 'videos.html' and not _is_url_image(result.get('thumbnail')): - self._record_error('thumbnail URL is invalid') + self._record_error('thumbnail URL is invalid', result.get('img_src')) def _check_results(self, results: list): for result in results: @@ -213,16 +221,16 @@ class 
ResultContainerTests: def _check_answers(self, answers): for answer in answers: if not _check_no_html(answer): - self._record_error('HTML in answer') + self._record_error('HTML in answer', answer) def _check_infoboxes(self, infoboxes): for infobox in infoboxes: if not _check_no_html(infobox.get('content', '')): - self._record_error('HTML in infobox content') + self._record_error('HTML in infobox content', infobox.get('content', '')) self._add_language(infobox.get('content', '')) for attribute in infobox.get('attributes', {}): if not _check_no_html(attribute.get('value', '')): - self._record_error('HTML in infobox attribute value') + self._record_error('HTML in infobox attribute value', attribute.get('value', '')) def check_basic(self): if len(self.result_container.unresponsive_engines) > 0: -- cgit v1.2.3 From f3e1bd308f8abb62b3ce0070973e0a494d15b122 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Fri, 8 Jan 2021 19:05:56 +0100 Subject: [mod] checker: minor adjustments on the default tests the query "time" is convenient because most of the search engines will return some results, but some engines in the general category will return documentation about the HTML tags