summaryrefslogtreecommitdiff
path: root/searx/result_types
diff options
context:
space:
mode:
authorMarkus Heiser <markus.heiser@darmarit.de>2025-03-05 17:29:20 +0100
committerMarkus Heiser <markus.heiser@darmarIT.de>2025-03-15 10:36:33 +0100
commit8769b7c6d6d62b961d62b9454bd12f69fba298c8 (patch)
treebdf58284b785169606060acc257db5c1e828b430 /searx/result_types
parentd6ce29f7f05a2ff49e1b8566d49b9ca21c30936a (diff)
[refactor] typification of SearXNG (MainResult) / result items (part 2)
The class ResultContainer has been revised; it can now handle the typed Result items of classes: - MainResult - LegacyResult (a dict wrapper for backward compatibility) Due to the now complete typing of these three classes, instead of the *getitem* accesses, the fields can now be accessed directly via attributes (which is also supported by the IDE). Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/result_types')
-rw-r--r--searx/result_types/__init__.py11
-rw-r--r--searx/result_types/_base.py280
2 files changed, 234 insertions, 57 deletions
diff --git a/searx/result_types/__init__.py b/searx/result_types/__init__.py
index 5f24b86c4..9af16b9b5 100644
--- a/searx/result_types/__init__.py
+++ b/searx/result_types/__init__.py
@@ -13,13 +13,13 @@
from __future__ import annotations
-__all__ = ["Result", "EngineResults", "AnswerSet", "Answer", "Translations"]
+__all__ = ["Result", "MainResult", "EngineResults", "AnswerSet", "Answer", "Translations"]
import abc
from searx import enginelib
-from ._base import Result, LegacyResult
+from ._base import Result, MainResult, LegacyResult
from .answer import AnswerSet, Answer, Translations
@@ -30,13 +30,18 @@ class ResultList(list, abc.ABC):
"""The collection of result types (which have already been implemented)."""
Answer = Answer
+ MainResult = MainResult
+ Result = Result
Translations = Translations
+ # for backward compatibility
+ LegacyResult = LegacyResult
+
def __init__(self):
# pylint: disable=useless-parent-delegation
super().__init__()
- def add(self, result: Result):
+ def add(self, result: Result | LegacyResult):
"""Add a :py:`Result` item to the result list."""
self.append(result)
diff --git a/searx/result_types/_base.py b/searx/result_types/_base.py
index 1496cbd2c..220a4ff12 100644
--- a/searx/result_types/_base.py
+++ b/searx/result_types/_base.py
@@ -10,6 +10,8 @@
.. autoclass:: Result
:members:
+.. _LegacyResult:
+
.. autoclass:: LegacyResult
:members:
"""
@@ -22,9 +24,88 @@ __all__ = ["Result"]
import re
import urllib.parse
import warnings
+import typing
import msgspec
+from searx import logger as log
+
+WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)
+
+
+def _normalize_url_fields(result: Result | LegacyResult):
+
+ # As soon we need LegacyResult not any longer, we can move this function to
+ # method Result.normalize_result_fields
+
+ if result.url and not result.parsed_url:
+ if not isinstance(result.url, str):
+ log.debug('result: invalid URL: %s', str(result))
+ result.url = ""
+ result.parsed_url = None
+ else:
+ result.parsed_url = urllib.parse.urlparse(result.url)
+
+ if result.parsed_url:
+ result.parsed_url = result.parsed_url._replace(
+ # if the result has no scheme, use http as default
+ scheme=result.parsed_url.scheme or "http",
+ # normalize ``www.example.com`` to ``example.com``
+ netloc=result.parsed_url.netloc.replace("www.", ""),
+ # normalize ``example.com/path/`` to ``example.com/path``
+ path=result.parsed_url.path.rstrip("/"),
+ )
+ result.url = result.parsed_url.geturl()
+
+ if isinstance(result, LegacyResult) and getattr(result, "infobox", None):
+ # As soon we have InfoboxResult, we can move this function to method
+ # InfoboxResult.normalize_result_fields
+
+ infobox_urls: list[dict[str, str]] = getattr(result, "urls", [])
+ for item in infobox_urls:
+ _url = item.get("url")
+ if not _url:
+ continue
+ _url = urllib.parse.urlparse(_url)
+ item["url"] = _url._replace(
+ scheme=_url.scheme or "http",
+ netloc=_url.netloc.replace("www.", ""),
+ path=_url.path.rstrip("/"),
+ ).geturl()
+
+ infobox_id = getattr(result, "id", None)
+ if infobox_id:
+ _url = urllib.parse.urlparse(infobox_id)
+ result.id = _url._replace(
+ scheme=_url.scheme or "http",
+ netloc=_url.netloc.replace("www.", ""),
+ path=_url.path.rstrip("/"),
+ ).geturl()
+
+
+def _normalize_text_fields(result: MainResult | LegacyResult):
+
+ # As soon we need LegacyResult not any longer, we can move this function to
+ # method MainResult.normalize_result_fields
+
+ # Actually, a type check should not be necessary if the engine is
+ # implemented correctly. Historically, however, we have always had a type
+ # check here.
+
+ if result.title and not isinstance(result.title, str):
+ log.debug("result: invalid type of field 'title': %s", str(result))
+ result.title = str(result)
+ if result.content and not isinstance(result.content, str):
+ log.debug("result: invalid type of field 'content': %s", str(result))
+ result.content = str(result)
+
+ # normalize title and content
+ result.title = WHITESPACE_REGEX.sub(" ", result.title).strip()
+ result.content = WHITESPACE_REGEX.sub(" ", result.content).strip()
+ if result.content == result.title:
+ # avoid duplicate content between the content and title fields
+ result.content = ""
+
class Result(msgspec.Struct, kw_only=True):
"""Base class of all result types :ref:`result types`."""
@@ -54,21 +135,20 @@ class Result(msgspec.Struct, kw_only=True):
"""
def normalize_result_fields(self):
- """Normalize a result ..
-
- - if field ``url`` is set and field ``parse_url`` is unset, init
- ``parse_url`` from field ``url``. This method can be extended in the
- inheritance.
+ """Normalize fields ``url`` and ``parse_sql``.
- """
+ - If field ``url`` is set and field ``parse_url`` is unset, init
+ ``parse_url`` from field ``url``. The ``url`` field is initialized
+ with the resulting value in ``parse_url``, if ``url`` and
+ ``parse_url`` are not equal.
- if not self.parsed_url and self.url:
- self.parsed_url = urllib.parse.urlparse(self.url)
+ - ``www.example.com`` and ``example.com`` are equivalent and are normalized
+ to ``example.com``.
- # if the result has no scheme, use http as default
- if not self.parsed_url.scheme:
- self.parsed_url = self.parsed_url._replace(scheme="http")
- self.url = self.parsed_url.geturl()
+ - ``example.com/path/`` and ``example.com/path`` are equivalent and are
+ normalized to ``example.com/path``.
+ """
+ _normalize_url_fields(self)
def __post_init__(self):
pass
@@ -84,7 +164,6 @@ class Result(msgspec.Struct, kw_only=True):
The hash value is used in contexts, e.g. when checking for equality to
identify identical results from different sources (engines).
"""
-
return id(self)
def __eq__(self, other):
@@ -113,12 +192,19 @@ class Result(msgspec.Struct, kw_only=True):
def as_dict(self):
return {f: getattr(self, f) for f in self.__struct_fields__}
+ def defaults_from(self, other: Result):
+ """Fields not set in *self* will be updated from the field values of the
+ *other*.
+ """
+ for field_name in self.__struct_fields__:
+ self_val = getattr(self, field_name, False)
+ other_val = getattr(other, field_name, False)
+ if self_val:
+ setattr(self, field_name, other_val)
-class MainResult(Result): # pylint: disable=missing-class-docstring
- # open_group and close_group should not manged in the Result class (we should rop it from here!)
- open_group: bool = False
- close_group: bool = False
+class MainResult(Result): # pylint: disable=missing-class-docstring
+ """Base class of all result types displayed in :ref:`area main results`."""
title: str = ""
"""Link title of the result item."""
@@ -132,6 +218,43 @@ class MainResult(Result): # pylint: disable=missing-class-docstring
thumbnail: str = ""
"""URL of a thumbnail that is displayed in the result item."""
+ priority: typing.Literal["", "high", "low"] = ""
+ """The priority can be set via :ref:`hostnames plugin`, for example."""
+
+ engines: set[str] = set()
+ """In a merged results list, the names of the engines that found this result
+ are listed in this field."""
+
+ # open_group and close_group should not manged in the Result
+ # class (we should drop it from here!)
+ open_group: bool = False
+ close_group: bool = False
+ positions: list[int] = []
+ score: float = 0
+ category: str = ""
+
+ def __hash__(self) -> int:
+ """Ordinary url-results are equal if their values for
+ :py:obj:`Result.template`, :py:obj:`Result.parsed_url` (without scheme)
+ and :py:obj:`MainResult.img_src` are equal.
+ """
+ if not self.parsed_url:
+ raise ValueError(f"missing a value in field 'parsed_url': {self}")
+
+ url = self.parsed_url
+ return hash(
+ f"{self.template}"
+ + f"|{url.netloc}|{url.path}|{url.params}|{url.query}|{url.fragment}"
+ + f"|{self.img_src}"
+ )
+
+ def normalize_result_fields(self):
+ super().normalize_result_fields()
+
+ _normalize_text_fields(self)
+ if self.engine:
+ self.engines.add(self.engine)
+
class LegacyResult(dict):
"""A wrapper around a legacy result item. The SearXNG core uses this class
@@ -150,7 +273,27 @@ class LegacyResult(dict):
"""
UNSET = object()
- WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)
+
+ # emulate field types from type class Result
+ url: str | None
+ template: str
+ engine: str
+ parsed_url: urllib.parse.ParseResult | None
+
+ # emulate field types from type class MainResult
+ title: str
+ content: str
+ img_src: str
+ thumbnail: str
+ priority: typing.Literal["", "high", "low"]
+ engines: set[str]
+ positions: list[int]
+ score: float
+ category: str
+
+ # infobox result
+ urls: list[dict[str, str]]
+ attributes: list[dict[str, str]]
def as_dict(self):
return self
@@ -159,14 +302,26 @@ class LegacyResult(dict):
super().__init__(*args, **kwargs)
- # Init fields with defaults / compare with defaults of the fields in class Result
- self.engine = self.get("engine", "")
- self.template = self.get("template", "default.html")
- self.url = self.get("url", None)
- self.parsed_url = self.get("parsed_url", None)
-
- self.content = self.get("content", "")
- self.title = self.get("title", "")
+ # emulate field types from type class Result
+ self["url"] = self.get("url")
+ self["template"] = self.get("template", "default.html")
+ self["engine"] = self.get("engine", "")
+ self["parsed_url"] = self.get("parsed_url")
+
+ # emulate field types from type class MainResult
+ self["title"] = self.get("title", "")
+ self["content"] = self.get("content", "")
+ self["img_src"] = self.get("img_src", "")
+ self["thumbnail"] = self.get("thumbnail", "")
+ self["priority"] = self.get("priority", "")
+ self["engines"] = self.get("engines", set())
+ self["positions"] = self.get("positions", "")
+ self["score"] = self.get("score", 0)
+ self["category"] = self.get("category", "")
+
+ if "infobox" in self:
+ self["urls"] = self.get("urls", [])
+ self["attributes"] = self.get("attributes", [])
# Legacy types that have already been ported to a type ..
@@ -178,13 +333,47 @@ class LegacyResult(dict):
)
self.template = "answer/legacy.html"
+ if self.template == "keyvalue.html":
+ warnings.warn(
+ f"engine {self.engine} is using deprecated `dict` for key/value results"
+ f" / use a class from searx.result_types",
+ DeprecationWarning,
+ )
+
+ def __getattr__(self, name: str, default=UNSET) -> typing.Any:
+ if default == self.UNSET and name not in self:
+ raise AttributeError(f"LegacyResult object has no field named: {name}")
+ return self[name]
+
+ def __setattr__(self, name: str, val):
+ self[name] = val
+
def __hash__(self) -> int: # type: ignore
if "answer" in self:
+ # deprecated ..
return hash(self["answer"])
+
+ if self.template == "images.html":
+ # image results are equal if their values for template, the url and
+ # the img_src are equal.
+ return hash(f"{self.template}|{self.url}|{self.img_src}")
+
if not any(cls in self for cls in ["suggestion", "correction", "infobox", "number_of_results", "engine_data"]):
- # it is a commun url-result ..
- return hash(self.url)
+ # Ordinary url-results are equal if their values for template,
+ # parsed_url (without schema) and img_src` are equal.
+
+ # Code copied from with MainResult.__hash__:
+ if not self.parsed_url:
+ raise ValueError(f"missing a value in field 'parsed_url': {self}")
+
+ url = self.parsed_url
+ return hash(
+ f"{self.template}"
+ + f"|{url.netloc}|{url.path}|{url.params}|{url.query}|{url.fragment}"
+ + f"|{self.img_src}"
+ )
+
return id(self)
def __eq__(self, other):
@@ -195,30 +384,13 @@ class LegacyResult(dict):
return f"LegacyResult: {super().__repr__()}"
- def __getattr__(self, name: str, default=UNSET):
-
- if default == self.UNSET and name not in self:
- raise AttributeError(f"LegacyResult object has no field named: {name}")
- return self[name]
-
- def __setattr__(self, name: str, val):
-
- self[name] = val
-
def normalize_result_fields(self):
-
- self.title = self.WHITESPACE_REGEX.sub(" ", self.title)
-
- if not self.parsed_url and self.url:
- self.parsed_url = urllib.parse.urlparse(self.url)
-
- # if the result has no scheme, use http as default
- if not self.parsed_url.scheme:
- self.parsed_url = self.parsed_url._replace(scheme="http")
- self.url = self.parsed_url.geturl()
-
- if self.content:
- self.content = self.WHITESPACE_REGEX.sub(" ", self.content)
- if self.content == self.title:
- # avoid duplicate content between the content and title fields
- self.content = ""
+ _normalize_url_fields(self)
+ _normalize_text_fields(self)
+ if self.engine:
+ self.engines.add(self.engine)
+
+ def defaults_from(self, other: LegacyResult):
+ for k, v in other.items():
+ if not self.get(k):
+ self[k] = v