diff options
| author | Markus Heiser <markus.heiser@darmarit.de> | 2025-03-20 07:47:38 +0100 |
|---|---|---|
| committer | Markus Heiser <markus.heiser@darmarIT.de> | 2025-03-29 10:16:43 +0100 |
| commit | 50f92779bd9f2762d92a24a63851f4069f88297d (patch) | |
| tree | 54e9a3d7b2839bf3feb30339c3e3bd63196fd1da /searx/result_types | |
| parent | d36da0a6c34761258c43424a7d948f0c554ef5c9 (diff) | |
[refactor] migrate plugins from "module" to class SXNGPlugin
This patch brings two major changes:
- ``Result.filter_urls(..)`` to pass a filter function for URL fields
- The ``enabled_plugins:`` section in SearXNG's settings no longer exists.
To understand plugin development compile documentation:
$ make docs.clean docs.live
and read http://0.0.0.0:8000/dev/plugins/development.html
There is no longer a distinction between built-in and external plugins; all
plugins are registered via the settings in the ``plugins:`` section.
In SearXNG, plugins can be registered via a fully qualified class name. A
configuration (`PluginCfg`) can be transferred to the plugin, e.g. to activate
it by default / *opt-in* or *opt-out* from user's point of view.
built-in plugins
================
The built-in plugins are all located in the namespace `searx.plugins`.
.. code:: yaml
plugins:
searx.plugins.calculator.SXNGPlugin:
active: true
searx.plugins.hash_plugin.SXNGPlugin:
active: true
searx.plugins.self_info.SXNGPlugin:
active: true
searx.plugins.tracker_url_remover.SXNGPlugin:
active: true
searx.plugins.unit_converter.SXNGPlugin:
active: true
searx.plugins.ahmia_filter.SXNGPlugin:
active: true
searx.plugins.hostnames.SXNGPlugin:
active: true
searx.plugins.oa_doi_rewrite.SXNGPlugin:
active: false
searx.plugins.tor_check.SXNGPlugin:
active: false
external plugins
================
SearXNG supports *external plugins*, but there is no need to install one: SearXNG
runs out of the box.
- Only show green hosted results: https://github.com/return42/tgwf-searx-plugins/
To get a developer installation in a SearXNG developer environment:
.. code:: sh
$ git clone git@github.com:return42/tgwf-searx-plugins.git
$ ./manage pyenv.cmd python -m \
pip install -e tgwf-searx-plugins
To register the plugin in SearXNG, add ``only_show_green_results.SXNGPlugin`` to
the ``plugins:`` section:
.. code:: yaml
plugins:
# ...
only_show_green_results.SXNGPlugin:
active: false
Result.filter_urls(..)
======================
The ``Result.filter_urls(..)`` can be used to filter and/or modify URL fields.
In the following example, the filter function ``my_url_filter``:
.. code:: python
def my_url_filter(result, field_name, url_src) -> bool | str:
if "google" in url_src:
return False # remove URL field from result
if "facebook" in url_src:
new_url = url_src.replace("facebook", "fb-dummy")
return new_url # return modified URL
return True # leave URL in field unchanged
is applied to all URL fields in the :py:obj:`Plugin.on_result` hook:
.. code:: python
class MyUrlFilter(Plugin):
...
def on_result(self, request, search, result) -> bool:
result.filter_urls(my_url_filter)
return True
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/result_types')
| -rw-r--r-- | searx/result_types/_base.py | 143 |
1 files changed, 138 insertions, 5 deletions
diff --git a/searx/result_types/_base.py b/searx/result_types/_base.py index 1cd4e4d2d..c4c0b18b2 100644 --- a/searx/result_types/_base.py +++ b/searx/result_types/_base.py @@ -26,11 +26,14 @@ import urllib.parse import warnings import typing +from collections.abc import Callable + import msgspec from searx import logger as log WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U) +UNKNOWN = object() def _normalize_url_fields(result: Result | LegacyResult): @@ -50,8 +53,6 @@ def _normalize_url_fields(result: Result | LegacyResult): result.parsed_url = result.parsed_url._replace( # if the result has no scheme, use http as default scheme=result.parsed_url.scheme or "http", - # normalize ``www.example.com`` to ``example.com`` - # netloc=result.parsed_url.netloc.replace("www.", ""), # normalize ``example.com/path/`` to ``example.com/path`` path=result.parsed_url.path.rstrip("/"), ) @@ -107,6 +108,110 @@ def _normalize_text_fields(result: MainResult | LegacyResult): result.content = "" +def _filter_urls(result: Result | LegacyResult, filter_func: Callable[[Result | LegacyResult, str, str], str | bool]): + # pylint: disable=too-many-branches, too-many-statements + + # As soon we need LegacyResult not any longer, we can move this function to + # method Result. 
+ + url_fields = ["url", "iframe_src", "audio_src", "img_src", "thumbnail_src", "thumbnail"] + + for field_name in url_fields: + url_src = getattr(result, field_name, "") + if not url_src: + continue + + new_url = filter_func(result, field_name, url_src) + # log.debug("filter_urls: filter_func(result, %s) '%s' -> '%s'", field_name, field_value, new_url) + if isinstance(new_url, bool): + if new_url: + # log.debug("filter_urls: unchanged field %s URL %s", field_name, field_value) + continue + log.debug("filter_urls: drop field %s URL %s", field_name, url_src) + new_url = None + else: + log.debug("filter_urls: modify field %s URL %s -> %s", field_name, url_src, new_url) + + setattr(result, field_name, new_url) + if field_name == "url": + # sync parsed_url with new_url + if not new_url: + result.parsed_url = None + elif isinstance(new_url, str): + result.parsed_url = urllib.parse.urlparse(new_url) + + # "urls": are from infobox + # + # As soon we have InfoboxResult, we can move this function to method + # InfoboxResult.normalize_result_fields + + infobox_urls: list[dict[str, str]] = getattr(result, "urls", []) + + if infobox_urls: + # log.debug("filter_urls: infobox_urls .. 
%s", infobox_urls) + new_infobox_urls: list[dict[str, str]] = [] + + for item in infobox_urls: + url_src = item.get("url") + if not url_src: + new_infobox_urls.append(item) + continue + + new_url = filter_func(result, "infobox_urls", url_src) + if isinstance(new_url, bool): + if new_url: + new_infobox_urls.append(item) + # log.debug("filter_urls: leave URL in field 'urls' ('infobox_urls') unchanged -> %s", _url) + continue + log.debug("filter_urls: remove URL from field 'urls' ('infobox_urls') URL %s", url_src) + new_url = None + if new_url: + log.debug("filter_urls: modify URL from field 'urls' ('infobox_urls') URL %s -> %s", url_src, new_url) + item["url"] = new_url + new_infobox_urls.append(item) + + setattr(result, "urls", new_infobox_urls) + + # "attributes": are from infobox + # + # The infobox has additional subsections for attributes, urls and relatedTopics: + + infobox_attributes: list[dict[str, dict]] = getattr(result, "attributes", []) + + if infobox_attributes: + # log.debug("filter_urls: infobox_attributes .. 
%s", infobox_attributes) + new_infobox_attributes: list[dict[str, dict]] = [] + + for item in infobox_attributes: + image = item.get("image", {}) + url_src = image.get("src", "") + if not url_src: + new_infobox_attributes.append(item) + continue + + new_url = filter_func(result, "infobox_attributes", url_src) + if isinstance(new_url, bool): + if new_url: + new_infobox_attributes.append(item) + # log.debug("filter_urls: leave URL in field 'image.src' unchanged -> %s", url_src) + continue + log.debug("filter_urls: drop field 'image.src' ('infobox_attributes') URL %s", url_src) + new_url = None + + if new_url: + log.debug( + "filter_urls: modify 'image.src' ('infobox_attributes') URL %s -> %s", + url_src, + new_url, + ) + item["image"]["src"] = new_url + new_infobox_attributes.append(item) + + setattr(result, "attributes", new_infobox_attributes) + + result.normalize_result_fields() + + class Result(msgspec.Struct, kw_only=True): """Base class of all result types :ref:`result types`.""" @@ -142,9 +247,6 @@ class Result(msgspec.Struct, kw_only=True): with the resulting value in ``parse_url``, if ``url`` and ``parse_url`` are not equal. - - ``www.example.com`` and ``example.com`` are equivalent and are normalized - to ``example.com``. - - ``example.com/path/`` and ``example.com/path`` are equivalent and are normalized to ``example.com/path``. """ @@ -153,6 +255,33 @@ class Result(msgspec.Struct, kw_only=True): def __post_init__(self): pass + def filter_urls(self, filter_func: Callable[[Result | LegacyResult, str, str], str | bool]): + """A filter function is passed in the ``filter_func`` argument to + filter and/or modify the URLs. + + The filter function receives the :py:obj:`result object <Result>` as + the first argument and the field name (``str``) in the second argument. + In the third argument the URL string value is passed to the filter function. 
+ + The filter function is applied to all fields that contain a URL, + in addition to the familiar ``url`` field, these include fields such as:: + + ["url", "iframe_src", "audio_src", "img_src", "thumbnail_src", "thumbnail"] + + and the ``urls`` list of items of the infobox. + + For each field, the filter function is called and returns a bool or a + string value: + + - ``True``: leave URL in field unchanged + - ``False``: remove URL field from result (or remove entire result) + - ``str``: modified URL to be used instead + + See :ref:`filter urls example`. + + """ + _filter_urls(self, filter_func=filter_func) + def __hash__(self) -> int: """Generates a hash value that uniquely identifies the content of *this* result. The method can be adapted in the inheritance to compare results @@ -394,3 +523,7 @@ class LegacyResult(dict): for k, v in other.items(): if not self.get(k): self[k] = v + + def filter_urls(self, filter_func: Callable[[Result | LegacyResult, str, str], str | bool]): + """See :py:obj:`Result.filter_urls`""" + _filter_urls(self, filter_func=filter_func) |