summaryrefslogtreecommitdiff
path: root/searx/result_types
diff options
context:
space:
mode:
authorMarkus Heiser <markus.heiser@darmarit.de>2025-03-20 07:47:38 +0100
committerMarkus Heiser <markus.heiser@darmarIT.de>2025-03-29 10:16:43 +0100
commit50f92779bd9f2762d92a24a63851f4069f88297d (patch)
tree54e9a3d7b2839bf3feb30339c3e3bd63196fd1da /searx/result_types
parentd36da0a6c34761258c43424a7d948f0c554ef5c9 (diff)
[refactor] migrate plugins from "module" to class SXNGPlugin
This patch brings two major changes: - ``Result.filter_urls(..)`` to pass a filter function for URL fields - The ``enabled_plugins:`` section in SearXNG's settings no longer exists. To understand plugin development, compile the documentation: $ make docs.clean docs.live and read http://0.0.0.0:8000/dev/plugins/development.html There is no longer a distinction between built-in and external plugins; all plugins are registered via the settings in the ``plugins:`` section. In SearXNG, plugins can be registered via a fully qualified class name. A configuration (`PluginCfg`) can be transferred to the plugin, e.g. to activate it by default / *opt-in* or *opt-out* from the user's point of view. built-in plugins ================ The built-in plugins are all located in the namespace `searx.plugins`. .. code:: yaml plugins: searx.plugins.calculator.SXNGPlugin: active: true searx.plugins.hash_plugin.SXNGPlugin: active: true searx.plugins.self_info.SXNGPlugin: active: true searx.plugins.tracker_url_remover.SXNGPlugin: active: true searx.plugins.unit_converter.SXNGPlugin: active: true searx.plugins.ahmia_filter.SXNGPlugin: active: true searx.plugins.hostnames.SXNGPlugin: active: true searx.plugins.oa_doi_rewrite.SXNGPlugin: active: false searx.plugins.tor_check.SXNGPlugin: active: false external plugins ================ SearXNG supports *external plugins* / there is no need to install one, SearXNG runs out of the box. - Only show green hosted results: https://github.com/return42/tgwf-searx-plugins/ To get a developer installation in a SearXNG developer environment: .. code:: sh $ git clone git@github.com:return42/tgwf-searx-plugins.git $ ./manage pyenv.cmd python -m \ pip install -e tgwf-searx-plugins To register the plugin in SearXNG, add ``only_show_green_results.SXNGPlugin`` to the ``plugins:``: .. code:: yaml plugins: # ... only_show_green_results.SXNGPlugin: active: false Result.filter_urls(..) 
====================== The ``Result.filter_urls(..)`` can be used to filter and/or modify URL fields. In the following example, the filter function ``my_url_filter``: .. code:: python def my_url_filter(result, field_name, url_src) -> bool | str: if "google" in url_src: return False # remove URL field from result if "facebook" in url_src: new_url = url_src.replace("facebook", "fb-dummy") return new_url # return modified URL return True # leave URL in field unchanged is applied to all URL fields in the :py:obj:`Plugin.on_result` hook: .. code:: python class MyUrlFilter(Plugin): ... def on_result(self, request, search, result) -> bool: result.filter_urls(my_url_filter) return True Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/result_types')
-rw-r--r--searx/result_types/_base.py143
1 files changed, 138 insertions, 5 deletions
diff --git a/searx/result_types/_base.py b/searx/result_types/_base.py
index 1cd4e4d2d..c4c0b18b2 100644
--- a/searx/result_types/_base.py
+++ b/searx/result_types/_base.py
@@ -26,11 +26,14 @@ import urllib.parse
import warnings
import typing
+from collections.abc import Callable
+
import msgspec
from searx import logger as log
WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)
+UNKNOWN = object()
def _normalize_url_fields(result: Result | LegacyResult):
@@ -50,8 +53,6 @@ def _normalize_url_fields(result: Result | LegacyResult):
result.parsed_url = result.parsed_url._replace(
# if the result has no scheme, use http as default
scheme=result.parsed_url.scheme or "http",
- # normalize ``www.example.com`` to ``example.com``
- # netloc=result.parsed_url.netloc.replace("www.", ""),
# normalize ``example.com/path/`` to ``example.com/path``
path=result.parsed_url.path.rstrip("/"),
)
@@ -107,6 +108,110 @@ def _normalize_text_fields(result: MainResult | LegacyResult):
result.content = ""
+def _filter_urls(result: Result | LegacyResult, filter_func: Callable[[Result | LegacyResult, str, str], str | bool]):
+ # pylint: disable=too-many-branches, too-many-statements
+
+ # Apply filter_func to each URL field of the result.  As soon as LegacyResult is
+ # no longer needed, this function can be moved into a method of class Result.
+
+ url_fields = ["url", "iframe_src", "audio_src", "img_src", "thumbnail_src", "thumbnail"]
+
+ for field_name in url_fields:
+ url_src = getattr(result, field_name, "")
+ if not url_src:
+ continue
+
+ new_url = filter_func(result, field_name, url_src)
+ # log.debug("filter_urls: filter_func(result, %s) '%s' -> '%s'", field_name, url_src, new_url)
+ if isinstance(new_url, bool):
+ if new_url:
+ # log.debug("filter_urls: unchanged field %s URL %s", field_name, url_src)
+ continue
+ log.debug("filter_urls: drop field %s URL %s", field_name, url_src)
+ new_url = None  # a dropped field is set to None below
+ else:
+ log.debug("filter_urls: modify field %s URL %s -> %s", field_name, url_src, new_url)
+
+ setattr(result, field_name, new_url)
+ if field_name == "url":
+ # keep parsed_url in sync with the dropped or rewritten url
+ if not new_url:
+ result.parsed_url = None
+ elif isinstance(new_url, str):
+ result.parsed_url = urllib.parse.urlparse(new_url)
+
+ # "urls": are from infobox
+ #
+ # As soon as we have InfoboxResult, we can move this function to method
+ # InfoboxResult.normalize_result_fields
+
+ infobox_urls: list[dict[str, str]] = getattr(result, "urls", [])
+
+ if infobox_urls:
+ # log.debug("filter_urls: infobox_urls .. %s", infobox_urls)
+ new_infobox_urls: list[dict[str, str]] = []
+
+ for item in infobox_urls:
+ url_src = item.get("url")
+ if not url_src:
+ new_infobox_urls.append(item)
+ continue
+
+ new_url = filter_func(result, "infobox_urls", url_src)
+ if isinstance(new_url, bool):
+ if new_url:
+ new_infobox_urls.append(item)
+ # log.debug("filter_urls: leave URL in field 'urls' ('infobox_urls') unchanged -> %s", url_src)
+ continue
+ log.debug("filter_urls: remove URL from field 'urls' ('infobox_urls') URL %s", url_src)
+ new_url = None
+ if new_url:  # a dropped (falsy) URL is not appended: the item is removed from the list
+ log.debug("filter_urls: modify URL from field 'urls' ('infobox_urls') URL %s -> %s", url_src, new_url)
+ item["url"] = new_url
+ new_infobox_urls.append(item)
+
+ setattr(result, "urls", new_infobox_urls)
+
+ # "attributes": are from infobox (the infobox has additional subsections for
+ # attributes, urls and relatedTopics).  Only the "image.src" URL of an
+ # attribute item is passed to the filter function.
+
+ infobox_attributes: list[dict[str, dict]] = getattr(result, "attributes", [])
+
+ if infobox_attributes:
+ # log.debug("filter_urls: infobox_attributes .. %s", infobox_attributes)
+ new_infobox_attributes: list[dict[str, dict]] = []
+
+ for item in infobox_attributes:
+ image = item.get("image", {})
+ url_src = image.get("src", "")
+ if not url_src:
+ new_infobox_attributes.append(item)
+ continue
+
+ new_url = filter_func(result, "infobox_attributes", url_src)
+ if isinstance(new_url, bool):
+ if new_url:
+ new_infobox_attributes.append(item)
+ # log.debug("filter_urls: leave URL in field 'image.src' unchanged -> %s", url_src)
+ continue
+ log.debug("filter_urls: drop field 'image.src' ('infobox_attributes') URL %s", url_src)
+ new_url = None
+
+ if new_url:  # a dropped (falsy) URL is not appended: the whole attribute item is removed
+ log.debug(
+ "filter_urls: modify 'image.src' ('infobox_attributes') URL %s -> %s",
+ url_src,
+ new_url,
+ )
+ item["image"]["src"] = new_url
+ new_infobox_attributes.append(item)
+
+ setattr(result, "attributes", new_infobox_attributes)
+
+ result.normalize_result_fields()
+
+
class Result(msgspec.Struct, kw_only=True):
"""Base class of all result types :ref:`result types`."""
@@ -142,9 +247,6 @@ class Result(msgspec.Struct, kw_only=True):
with the resulting value in ``parse_url``, if ``url`` and
``parse_url`` are not equal.
- - ``www.example.com`` and ``example.com`` are equivalent and are normalized
- to ``example.com``.
-
- ``example.com/path/`` and ``example.com/path`` are equivalent and are
normalized to ``example.com/path``.
"""
@@ -153,6 +255,33 @@ class Result(msgspec.Struct, kw_only=True):
def __post_init__(self):
pass
+ def filter_urls(self, filter_func: Callable[[Result | LegacyResult, str, str], str | bool]):
+ """Filter and/or modify the URLs of this result by the filter function
+ passed in the ``filter_func`` argument.
+
+ The filter function receives the :py:obj:`result object <Result>` as the first
+ argument, the field name (``str``; ``infobox_urls`` / ``infobox_attributes`` for
+ infobox items) as the second and the URL string value as the third argument.
+
+ The filter function is applied to all fields that contain a URL,
+ in addition to the familiar ``url`` field, these include fields such as::
+
+ ["url", "iframe_src", "audio_src", "img_src", "thumbnail_src", "thumbnail"]
+
+ and, for infoboxes, each ``url`` in ``urls`` and each ``image.src`` in ``attributes``.
+
+ For each field, the filter function is called and returns a bool or a
+ string value:
+
+ - ``True``: leave URL in field unchanged
+ - ``False``: remove URL field from result (or remove entire result)
+ - ``str``: modified URL to be used instead
+
+ See :ref:`filter urls example`.
+
+ """
+ _filter_urls(self, filter_func=filter_func)
+
def __hash__(self) -> int:
"""Generates a hash value that uniquely identifies the content of *this*
result. The method can be adapted in the inheritance to compare results
@@ -394,3 +523,7 @@ class LegacyResult(dict):
for k, v in other.items():
if not self.get(k):
self[k] = v
+
+ def filter_urls(self, filter_func: Callable[[Result | LegacyResult, str, str], str | bool]):
+ """Filter and/or modify the URL fields of this result, see :py:obj:`Result.filter_urls`."""
+ _filter_urls(self, filter_func=filter_func)