diff options
Diffstat (limited to 'searx')
| -rw-r--r-- | searx/data/__init__.py | 3 | ||||
| -rw-r--r-- | searx/plugins/tracker_url_remover.py | 78 |
2 files changed, 57 insertions, 24 deletions
diff --git a/searx/data/__init__.py b/searx/data/__init__.py index 9be1cd67e..5a859f8cd 100644 --- a/searx/data/__init__.py +++ b/searx/data/__init__.py @@ -23,6 +23,7 @@ OSM_KEYS_TAGS: dict[str, typing.Any] ENGINE_DESCRIPTIONS: dict[str, typing.Any] ENGINE_TRAITS: dict[str, typing.Any] LOCALES: dict[str, typing.Any] +TRACKER_PATTERNS: list[dict[str, typing.Any]] lazy_globals = { "CURRENCIES": CurrenciesDB(), @@ -34,6 +35,7 @@ lazy_globals = { "ENGINE_DESCRIPTIONS": None, "ENGINE_TRAITS": None, "LOCALES": None, + "TRACKER_PATTERNS": None, } data_json_files = { @@ -45,6 +47,7 @@ data_json_files = { "ENGINE_DESCRIPTIONS": "engine_descriptions.json", "ENGINE_TRAITS": "engine_traits.json", "LOCALES": "locales.json", + "TRACKER_PATTERNS": "tracker_patterns.json", } diff --git a/searx/plugins/tracker_url_remover.py b/searx/plugins/tracker_url_remover.py index d9c767a36..efc593775 100644 --- a/searx/plugins/tracker_url_remover.py +++ b/searx/plugins/tracker_url_remover.py @@ -1,34 +1,31 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -# pylint: disable=missing-module-docstring +# pylint: disable=missing-module-docstring, unused-argument from __future__ import annotations import typing import re -from urllib.parse import urlunparse, parse_qsl, urlencode +from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode from flask_babel import gettext -from searx.plugins import Plugin, PluginInfo +from searx.data import TRACKER_PATTERNS + +from . import Plugin, PluginInfo +from ._core import log if typing.TYPE_CHECKING: from searx.search import SearchWithPlugins from searx.extended_types import SXNG_Request - from searx.result_types import Result + from searx.result_types import Result, LegacyResult from searx.plugins import PluginCfg -regexes = { - re.compile(r'utm_[^&]+'), - re.compile(r'(wkey|wemail)[^&]*'), - re.compile(r'(_hsenc|_hsmi|hsCtaTracking|__hssc|__hstc|__hsfp)[^&]*'), - re.compile(r'&$'), -} - class SXNGPlugin(Plugin): - """Remove trackers arguments from the returned URL""" + """Remove trackers arguments from the returned URL.""" id = "tracker_url_remover" + log = log.getChild(id) def __init__(self, plg_cfg: "PluginCfg") -> None: super().__init__(plg_cfg) @@ -39,20 +36,53 @@ class SXNGPlugin(Plugin): preference_section="privacy", ) - def on_result( - self, request: "SXNG_Request", search: "SearchWithPlugins", result: Result - ) -> bool: # pylint: disable=unused-argument - if not result.parsed_url: + def on_result(self, request: "SXNG_Request", search: "SearchWithPlugins", result: Result) -> bool: + + result.filter_urls(self.filter_url_field) + return True + + @classmethod + def filter_url_field(cls, result: "Result|LegacyResult", field_name: str, url_src: str) -> bool | str: + """Returns bool ``True`` to use URL unchanged (``False`` to ignore URL). + If URL should be modified, the returned string is the new URL to use.""" + + if not url_src: + cls.log.debug("missing a URL in field %s", field_name) return True - parsed_query: list[tuple[str, str]] = parse_qsl(result.parsed_url.query) - for name_value in list(parsed_query): - param_name = name_value[0] - for reg in regexes: - if reg.match(param_name): - parsed_query.remove(name_value) - result.parsed_url = result.parsed_url._replace(query=urlencode(parsed_query)) - result.url = urlunparse(result.parsed_url) + new_url = url_src + parsed_new_url = urlparse(url=new_url) + + for rule in TRACKER_PATTERNS: + + if not re.match(rule["urlPattern"], new_url): + # no match / ignore pattern + continue + + in_exceptions = False + for exception in rule["exceptions"]: + if re.match(exception, new_url): + in_exceptions = True break + if in_exceptions: + # pattern is in the list of exceptions / ignore pattern + # hint: we can't break the outer pattern loop since we have + # overlapping urlPattern like ".*" + continue + + # remove tracker arguments from the url-query part + query_args: list[tuple[str, str]] = list(parse_qsl(parsed_new_url.query)) + + for name, val in query_args.copy(): + for reg in rule["trackerParams"]: + if re.match(reg, name): + cls.log.debug("%s remove tracker arg: %s='%s'", parsed_new_url.netloc, name, val) + query_args.remove((name, val)) + + parsed_new_url = parsed_new_url._replace(query=urlencode(query_args)) + new_url = urlunparse(parsed_new_url) + + if new_url != url_src: + return new_url return True |