# SPDX-License-Identifier: AGPL-3.0-or-later # pylint: disable=too-few-public-methods, missing-module-docstring """Basic types for the typification of results. - :py:obj:`Result` base class - :py:obj:`LegacyResult` for internal use only ---- .. autoclass:: Result :members: .. _LegacyResult: .. autoclass:: LegacyResult :members: """ __all__ = ["Result"] import typing as t import re import urllib.parse import warnings import datetime from collections.abc import Callable import msgspec from searx import logger as log WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U) UNKNOWN = object() def _normalize_url_fields(result: "Result | LegacyResult"): # As soon we need LegacyResult not any longer, we can move this function to # method Result.normalize_result_fields if result.url and not result.parsed_url: if not isinstance(result.url, str): log.debug('result: invalid URL: %s', str(result)) result.url = "" result.parsed_url = None else: result.parsed_url = urllib.parse.urlparse(result.url) if result.parsed_url: result.parsed_url = result.parsed_url._replace( # if the result has no scheme, use http as default scheme=result.parsed_url.scheme or "http", path=result.parsed_url.path, ) result.url = result.parsed_url.geturl() if isinstance(result, LegacyResult) and getattr(result, "infobox", None): # As soon we have InfoboxResult, we can move this function to method # InfoboxResult.normalize_result_fields infobox_urls: list[dict[str, str]] = getattr(result, "urls", []) for item in infobox_urls: _url = item.get("url") if not _url: continue _url = urllib.parse.urlparse(_url) item["url"] = _url._replace( scheme=_url.scheme or "http", # netloc=_url.netloc.replace("www.", ""), path=_url.path, ).geturl() infobox_id: str | None = getattr(result, "id", None) if infobox_id: _url = urllib.parse.urlparse(infobox_id) result.id = _url._replace( scheme=_url.scheme or "http", # netloc=_url.netloc.replace("www.", ""), path=_url.path, ).geturl() def _normalize_text_fields(result: "MainResult | LegacyResult"): # As soon we need LegacyResult not any longer, we can move this function to # method MainResult.normalize_result_fields # Actually, a type check should not be necessary if the engine is # implemented correctly. Historically, however, we have always had a type # check here. if result.title and not isinstance(result.title, str): log.debug("result: invalid type of field 'title': %s", str(result)) result.title = str(result) if result.content and not isinstance(result.content, str): log.debug("result: invalid type of field 'content': %s", str(result)) result.content = str(result) # normalize title and content if result.title: result.title = WHITESPACE_REGEX.sub(" ", result.title).strip() if result.content: result.content = WHITESPACE_REGEX.sub(" ", result.content).strip() if result.content == result.title: # avoid duplicate content between the content and title fields result.content = "" def _filter_urls( result: "Result | LegacyResult", filter_func: "Callable[[Result | LegacyResult, str, str], str | bool]" ): # pylint: disable=too-many-branches, too-many-statements # As soon we need LegacyResult not any longer, we can move this function to # method Result. url_fields = ["url", "iframe_src", "audio_src", "img_src", "thumbnail_src", "thumbnail"] url_src: str for field_name in url_fields: url_src = getattr(result, field_name, "") if not url_src: continue new_url = filter_func(result, field_name, url_src) # log.debug("filter_urls: filter_func(result, %s) '%s' -> '%s'", field_name, field_value, new_url) if isinstance(new_url, bool): if new_url: # log.debug("filter_urls: unchanged field %s URL %s", field_name, field_value) continue log.debug("filter_urls: drop field %s URL %s", field_name, url_src) new_url = None else: log.debug("filter_urls: modify field %s URL %s -> %s", field_name, url_src, new_url) setattr(result, field_name, new_url) if field_name == "url": # sync parsed_url with new_url if not new_url: result.parsed_url = None elif isinstance(new_url, str): result.parsed_url = urllib.parse.urlparse(new_url) # "urls": are from infobox # # As soon we have InfoboxResult, we can move this function to method # InfoboxResult.normalize_result_fields infobox_urls: list[dict[str, str]] = getattr(result, "urls", []) if infobox_urls: # log.debug("filter_urls: infobox_urls .. %s", infobox_urls) new_infobox_urls: list[dict[str, str]] = [] for item in infobox_urls: url_src = item.get("url", "") if not url_src: new_infobox_urls.append(item) continue new_url = filter_func(result, "infobox_urls", url_src) if isinstance(new_url, bool): if new_url: new_infobox_urls.append(item) # log.debug("filter_urls: leave URL in field 'urls' ('infobox_urls') unchanged -> %s", _url) continue log.debug("filter_urls: remove URL from field 'urls' ('infobox_urls') URL %s", url_src) new_url = None if new_url: log.debug("filter_urls: modify URL from field 'urls' ('infobox_urls') URL %s -> %s", url_src, new_url) item["url"] = new_url new_infobox_urls.append(item) setattr(result, "urls", new_infobox_urls) # "attributes": are from infobox # # The infobox has additional subsections for attributes, urls and relatedTopics: infobox_attributes: list[dict[str, t.Any]] = getattr(result, "attributes", []) if infobox_attributes: # log.debug("filter_urls: infobox_attributes .. %s", infobox_attributes) new_infobox_attributes: list[dict[str, str | list[dict[str, str]]]] = [] for item in infobox_attributes: image: dict[str, str] = item.get("image", {}) url_src = image.get("src", "") if not url_src: new_infobox_attributes.append(item) continue new_url = filter_func(result, "infobox_attributes", url_src) if isinstance(new_url, bool): if new_url: new_infobox_attributes.append(item) # log.debug("filter_urls: leave URL in field 'image.src' unchanged -> %s", url_src) continue log.debug("filter_urls: drop field 'image.src' ('infobox_attributes') URL %s", url_src) new_url = None if new_url: log.debug( "filter_urls: modify 'image.src' ('infobox_attributes') URL %s -> %s", url_src, new_url, ) item["image"]["src"] = new_url new_infobox_attributes.append(item) setattr(result, "attributes", new_infobox_attributes) result.normalize_result_fields() def _normalize_date_fields(result: "MainResult | LegacyResult"): if result.publishedDate: # do not try to get a date from an empty string or a None type try: # test if publishedDate >= 1900 (datetime module bug) result.pubdate = result.publishedDate.strftime('%Y-%m-%d %H:%M:%S%z') except ValueError: result.publishedDate = None class Result(msgspec.Struct, kw_only=True): """Base class of all result types :ref:`result types`.""" url: str | None = None """A link related to this *result*""" engine: str | None = "" """Name of the engine *this* result comes from. In case of *plugins* a prefix ``plugin:`` is set, in case of *answerer* prefix ``answerer:`` is set. The field is optional and is initialized from the context if necessary. """ parsed_url: urllib.parse.ParseResult | None = None """:py:obj:`urllib.parse.ParseResult` of :py:obj:`Result.url`. The field is optional and is initialized from the context if necessary. """ def normalize_result_fields(self): """Normalize fields ``url`` and ``parse_sql``. - If field ``url`` is set and field ``parse_url`` is unset, init ``parse_url`` from field ``url``. The ``url`` field is initialized with the resulting value in ``parse_url``, if ``url`` and ``parse_url`` are not equal. """ _normalize_url_fields(self) def __post_init__(self): pass def filter_urls(self, filter_func: "Callable[[Result | LegacyResult, str, str], str | bool]"): """A filter function is passed in the ``filter_func`` argument to filter and/or modify the URLs. The filter function receives the :py:obj:`result object ` as the first argument and the field name (``str``) in the second argument. In the third argument the URL string value is passed to the filter function. The filter function is applied to all fields that contain a URL, in addition to the familiar ``url`` field, these include fields such as:: ["url", "iframe_src", "audio_src", "img_src", "thumbnail_src", "thumbnail"] and the ``urls`` list of items of the infobox. For each field, the filter function is called and returns a bool or a string value: - ``True``: leave URL in field unchanged - ``False``: remove URL field from result (or remove entire result) - ``str``: modified URL to be used instead See :ref:`filter urls example`. """ _filter_urls(self, filter_func=filter_func) def __hash__(self) -> int: """Generates a hash value that uniquely identifies the content of *this* result. The method can be adapted in the inheritance to compare results from different sources. If two result objects are not identical but have the same content, their hash values should also be identical. The hash value is used in contexts, e.g. when checking for equality to identify identical results from different sources (engines). """ return id(self) def __eq__(self, other: object): """py:obj:`Result` objects are equal if the hash values of the two objects are equal. If needed, its recommended to overwrite "py:obj:`Result.__hash__`.""" return hash(self) == hash(other) # for legacy code where a result is treated as a Python dict def __setitem__(self, field_name: str, value: t.Any): return setattr(self, field_name, value) def __getitem__(self, field_name: str) -> t.Any: if field_name not in self.__struct_fields__: raise KeyError(f"{field_name}") return getattr(self, field_name) def __iter__(self): return iter(self.__struct_fields__) def as_dict(self): return {f: getattr(self, f) for f in self.__struct_fields__} def defaults_from(self, other: "Result"): """Fields not set in *self* will be updated from the field values of the *other*. """ for field_name in self.__struct_fields__: self_val = getattr(self, field_name, False) other_val = getattr(other, field_name, False) if self_val: setattr(self, field_name, other_val) class MainResult(Result): # pylint: disable=missing-class-docstring """Base class of all result types displayed in :ref:`area main results`.""" template: str = "default.html" """Name of the template used to render the result. By default :origin:`result_templates/default.html ` is used. """ title: str = "" """Link title of the result item.""" content: str = "" """Extract or description of the result item""" img_src: str = "" """URL of a image that is displayed in the result item.""" iframe_src: str = "" """URL of an embedded ``