diff options
| author | Markus Heiser <markus.heiser@darmarit.de> | 2025-03-05 17:29:20 +0100 |
|---|---|---|
| committer | Markus Heiser <markus.heiser@darmarIT.de> | 2025-03-15 10:36:33 +0100 |
| commit | 8769b7c6d6d62b961d62b9454bd12f69fba298c8 (patch) | |
| tree | bdf58284b785169606060acc257db5c1e828b430 /searx/result_types | |
| parent | d6ce29f7f05a2ff49e1b8566d49b9ca21c30936a (diff) | |
[refactor] typification of SearXNG (MainResult) / result items (part 2)
The class ReslutContainer has been revised, it can now handle the typed Result
items of classes:
- MainResult
- LegacyResult (a dict wrapper for backward compatibility)
Due to the now complete typing of theses three clases, instead of the *getitem*
accesses, the fields can now be accessed directly via attributes (which is also
supported by the IDE).
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/result_types')
| -rw-r--r-- | searx/result_types/__init__.py | 11 | ||||
| -rw-r--r-- | searx/result_types/_base.py | 280 |
2 files changed, 234 insertions, 57 deletions
diff --git a/searx/result_types/__init__.py b/searx/result_types/__init__.py index 5f24b86c4..9af16b9b5 100644 --- a/searx/result_types/__init__.py +++ b/searx/result_types/__init__.py @@ -13,13 +13,13 @@ from __future__ import annotations -__all__ = ["Result", "EngineResults", "AnswerSet", "Answer", "Translations"] +__all__ = ["Result", "MainResult", "EngineResults", "AnswerSet", "Answer", "Translations"] import abc from searx import enginelib -from ._base import Result, LegacyResult +from ._base import Result, MainResult, LegacyResult from .answer import AnswerSet, Answer, Translations @@ -30,13 +30,18 @@ class ResultList(list, abc.ABC): """The collection of result types (which have already been implemented).""" Answer = Answer + MainResult = MainResult + Result = Result Translations = Translations + # for backward compatibility + LegacyResult = LegacyResult + def __init__(self): # pylint: disable=useless-parent-delegation super().__init__() - def add(self, result: Result): + def add(self, result: Result | LegacyResult): """Add a :py:`Result` item to the result list.""" self.append(result) diff --git a/searx/result_types/_base.py b/searx/result_types/_base.py index 1496cbd2c..220a4ff12 100644 --- a/searx/result_types/_base.py +++ b/searx/result_types/_base.py @@ -10,6 +10,8 @@ .. autoclass:: Result :members: +.. _LegacyResult: + .. autoclass:: LegacyResult :members: """ @@ -22,9 +24,88 @@ __all__ = ["Result"] import re import urllib.parse import warnings +import typing import msgspec +from searx import logger as log + +WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U) + + +def _normalize_url_fields(result: Result | LegacyResult): + + # As soon we need LegacyResult not any longer, we can move this function to + # method Result.normalize_result_fields + + if result.url and not result.parsed_url: + if not isinstance(result.url, str): + log.debug('result: invalid URL: %s', str(result)) + result.url = "" + result.parsed_url = None + else: + result.parsed_url = urllib.parse.urlparse(result.url) + + if result.parsed_url: + result.parsed_url = result.parsed_url._replace( + # if the result has no scheme, use http as default + scheme=result.parsed_url.scheme or "http", + # normalize ``www.example.com`` to ``example.com`` + netloc=result.parsed_url.netloc.replace("www.", ""), + # normalize ``example.com/path/`` to ``example.com/path`` + path=result.parsed_url.path.rstrip("/"), + ) + result.url = result.parsed_url.geturl() + + if isinstance(result, LegacyResult) and getattr(result, "infobox", None): + # As soon we have InfoboxResult, we can move this function to method + # InfoboxResult.normalize_result_fields + + infobox_urls: list[dict[str, str]] = getattr(result, "urls", []) + for item in infobox_urls: + _url = item.get("url") + if not _url: + continue + _url = urllib.parse.urlparse(_url) + item["url"] = _url._replace( + scheme=_url.scheme or "http", + netloc=_url.netloc.replace("www.", ""), + path=_url.path.rstrip("/"), + ).geturl() + + infobox_id = getattr(result, "id", None) + if infobox_id: + _url = urllib.parse.urlparse(infobox_id) + result.id = _url._replace( + scheme=_url.scheme or "http", + netloc=_url.netloc.replace("www.", ""), + path=_url.path.rstrip("/"), + ).geturl() + + +def _normalize_text_fields(result: MainResult | LegacyResult): + + # As soon we need LegacyResult not any longer, we can move this function to + # method MainResult.normalize_result_fields + + # Actually, a type check should not be necessary if the engine is + # implemented correctly. Historically, however, we have always had a type + # check here. + + if result.title and not isinstance(result.title, str): + log.debug("result: invalid type of field 'title': %s", str(result)) + result.title = str(result) + if result.content and not isinstance(result.content, str): + log.debug("result: invalid type of field 'content': %s", str(result)) + result.content = str(result) + + # normalize title and content + result.title = WHITESPACE_REGEX.sub(" ", result.title).strip() + result.content = WHITESPACE_REGEX.sub(" ", result.content).strip() + if result.content == result.title: + # avoid duplicate content between the content and title fields + result.content = "" + class Result(msgspec.Struct, kw_only=True): """Base class of all result types :ref:`result types`.""" @@ -54,21 +135,20 @@ class Result(msgspec.Struct, kw_only=True): """ def normalize_result_fields(self): - """Normalize a result .. - - - if field ``url`` is set and field ``parse_url`` is unset, init - ``parse_url`` from field ``url``. This method can be extended in the - inheritance. + """Normalize fields ``url`` and ``parse_sql``. - """ + - If field ``url`` is set and field ``parse_url`` is unset, init + ``parse_url`` from field ``url``. The ``url`` field is initialized + with the resulting value in ``parse_url``, if ``url`` and + ``parse_url`` are not equal. - if not self.parsed_url and self.url: - self.parsed_url = urllib.parse.urlparse(self.url) + - ``www.example.com`` and ``example.com`` are equivalent and are normalized + to ``example.com``. - # if the result has no scheme, use http as default - if not self.parsed_url.scheme: - self.parsed_url = self.parsed_url._replace(scheme="http") - self.url = self.parsed_url.geturl() + - ``example.com/path/`` and ``example.com/path`` are equivalent and are + normalized to ``example.com/path``. + """ + _normalize_url_fields(self) def __post_init__(self): pass @@ -84,7 +164,6 @@ class Result(msgspec.Struct, kw_only=True): The hash value is used in contexts, e.g. when checking for equality to identify identical results from different sources (engines). """ - return id(self) def __eq__(self, other): @@ -113,12 +192,19 @@ class Result(msgspec.Struct, kw_only=True): def as_dict(self): return {f: getattr(self, f) for f in self.__struct_fields__} + def defaults_from(self, other: Result): + """Fields not set in *self* will be updated from the field values of the + *other*. + """ + for field_name in self.__struct_fields__: + self_val = getattr(self, field_name, False) + other_val = getattr(other, field_name, False) + if self_val: + setattr(self, field_name, other_val) -class MainResult(Result): # pylint: disable=missing-class-docstring - # open_group and close_group should not manged in the Result class (we should rop it from here!) - open_group: bool = False - close_group: bool = False +class MainResult(Result): # pylint: disable=missing-class-docstring + """Base class of all result types displayed in :ref:`area main results`.""" title: str = "" """Link title of the result item.""" @@ -132,6 +218,43 @@ class MainResult(Result): # pylint: disable=missing-class-docstring thumbnail: str = "" """URL of a thumbnail that is displayed in the result item.""" + priority: typing.Literal["", "high", "low"] = "" + """The priority can be set via :ref:`hostnames plugin`, for example.""" + + engines: set[str] = set() + """In a merged results list, the names of the engines that found this result + are listed in this field.""" + + # open_group and close_group should not manged in the Result + # class (we should drop it from here!) + open_group: bool = False + close_group: bool = False + positions: list[int] = [] + score: float = 0 + category: str = "" + + def __hash__(self) -> int: + """Ordinary url-results are equal if their values for + :py:obj:`Result.template`, :py:obj:`Result.parsed_url` (without scheme) + and :py:obj:`MainResult.img_src` are equal. + """ + if not self.parsed_url: + raise ValueError(f"missing a value in field 'parsed_url': {self}") + + url = self.parsed_url + return hash( + f"{self.template}" + + f"|{url.netloc}|{url.path}|{url.params}|{url.query}|{url.fragment}" + + f"|{self.img_src}" + ) + + def normalize_result_fields(self): + super().normalize_result_fields() + + _normalize_text_fields(self) + if self.engine: + self.engines.add(self.engine) + class LegacyResult(dict): """A wrapper around a legacy result item. The SearXNG core uses this class @@ -150,7 +273,27 @@ class LegacyResult(dict): """ UNSET = object() - WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U) + + # emulate field types from type class Result + url: str | None + template: str + engine: str + parsed_url: urllib.parse.ParseResult | None + + # emulate field types from type class MainResult + title: str + content: str + img_src: str + thumbnail: str + priority: typing.Literal["", "high", "low"] + engines: set[str] + positions: list[int] + score: float + category: str + + # infobox result + urls: list[dict[str, str]] + attributes: list[dict[str, str]] def as_dict(self): return self @@ -159,14 +302,26 @@ class LegacyResult(dict): super().__init__(*args, **kwargs) - # Init fields with defaults / compare with defaults of the fields in class Result - self.engine = self.get("engine", "") - self.template = self.get("template", "default.html") - self.url = self.get("url", None) - self.parsed_url = self.get("parsed_url", None) - - self.content = self.get("content", "") - self.title = self.get("title", "") + # emulate field types from type class Result + self["url"] = self.get("url") + self["template"] = self.get("template", "default.html") + self["engine"] = self.get("engine", "") + self["parsed_url"] = self.get("parsed_url") + + # emulate field types from type class MainResult + self["title"] = self.get("title", "") + self["content"] = self.get("content", "") + self["img_src"] = self.get("img_src", "") + self["thumbnail"] = self.get("thumbnail", "") + self["priority"] = self.get("priority", "") + self["engines"] = self.get("engines", set()) + self["positions"] = self.get("positions", "") + self["score"] = self.get("score", 0) + self["category"] = self.get("category", "") + + if "infobox" in self: + self["urls"] = self.get("urls", []) + self["attributes"] = self.get("attributes", []) # Legacy types that have already been ported to a type .. @@ -178,13 +333,47 @@ class LegacyResult(dict): ) self.template = "answer/legacy.html" + if self.template == "keyvalue.html": + warnings.warn( + f"engine {self.engine} is using deprecated `dict` for key/value results" + f" / use a class from searx.result_types", + DeprecationWarning, + ) + + def __getattr__(self, name: str, default=UNSET) -> typing.Any: + if default == self.UNSET and name not in self: + raise AttributeError(f"LegacyResult object has no field named: {name}") + return self[name] + + def __setattr__(self, name: str, val): + self[name] = val + def __hash__(self) -> int: # type: ignore if "answer" in self: + # deprecated .. return hash(self["answer"]) + + if self.template == "images.html": + # image results are equal if their values for template, the url and + # the img_src are equal. + return hash(f"{self.template}|{self.url}|{self.img_src}") + if not any(cls in self for cls in ["suggestion", "correction", "infobox", "number_of_results", "engine_data"]): - # it is a commun url-result .. - return hash(self.url) + # Ordinary url-results are equal if their values for template, + # parsed_url (without schema) and img_src` are equal. + + # Code copied from with MainResult.__hash__: + if not self.parsed_url: + raise ValueError(f"missing a value in field 'parsed_url': {self}") + + url = self.parsed_url + return hash( + f"{self.template}" + + f"|{url.netloc}|{url.path}|{url.params}|{url.query}|{url.fragment}" + + f"|{self.img_src}" + ) + return id(self) def __eq__(self, other): @@ -195,30 +384,13 @@ class LegacyResult(dict): return f"LegacyResult: {super().__repr__()}" - def __getattr__(self, name: str, default=UNSET): - - if default == self.UNSET and name not in self: - raise AttributeError(f"LegacyResult object has no field named: {name}") - return self[name] - - def __setattr__(self, name: str, val): - - self[name] = val - def normalize_result_fields(self): - - self.title = self.WHITESPACE_REGEX.sub(" ", self.title) - - if not self.parsed_url and self.url: - self.parsed_url = urllib.parse.urlparse(self.url) - - # if the result has no scheme, use http as default - if not self.parsed_url.scheme: - self.parsed_url = self.parsed_url._replace(scheme="http") - self.url = self.parsed_url.geturl() - - if self.content: - self.content = self.WHITESPACE_REGEX.sub(" ", self.content) - if self.content == self.title: - # avoid duplicate content between the content and title fields - self.content = "" + _normalize_url_fields(self) + _normalize_text_fields(self) + if self.engine: + self.engines.add(self.engine) + + def defaults_from(self, other: LegacyResult): + for k, v in other.items(): + if not self.get(k): + self[k] = v |