diff options
| author | Markus Heiser <markus.heiser@darmarIT.de> | 2025-10-20 11:20:33 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-10-20 11:20:33 +0200 |
| commit | 33e798b01b3d8b45aeae31c085a072260ad351df (patch) | |
| tree | a2483fe0d21d55932476827108d1a00615bb42e2 /searx | |
| parent | d84ae96cf97cd1b9d798ec5206dce846d8519c03 (diff) | |
[fix] TrackerPatternsDB.clean_url: don't delete query argument from new_url (#5339)
The query argument for URLs like:
- 'http://example.org?q=' --> query_str is 'q='
- 'http://example.org?/foo/bar' --> query_str is '/foo/bar'
is a *simple string* and not a key/value dict. This string may only be removed
from the URL if one of the patterns matches.
BTW get_pretty_url(): keep such a *simple string* in the path element.
Closes: https://github.com/searxng/searxng/issues/5299
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx')
| -rw-r--r-- | searx/data/tracker_patterns.py | 41 | ||||
| -rwxr-xr-x | searx/webapp.py | 6 |
2 files changed, 37 insertions, 10 deletions
diff --git a/searx/data/tracker_patterns.py b/searx/data/tracker_patterns.py index fd4746e5c..ed4415bce 100644 --- a/searx/data/tracker_patterns.py +++ b/searx/data/tracker_patterns.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: AGPL-3.0-or-later """Simple implementation to store TrackerPatterns data in a SQL database.""" +# pylint: disable=too-many-branches import typing as t @@ -119,6 +120,12 @@ class TrackerPatternsDB: for rule in self.rules(): + query_str: str = parsed_new_url.query + if not query_str: + # There are no more query arguments in the parsed_new_url on + # which rules can be applied, stop iterating over the rules. + break + if not re.match(rule[self.Fields.url_regexp], new_url): # no match / ignore pattern continue @@ -136,18 +143,32 @@ class TrackerPatternsDB: # overlapping urlPattern like ".*" continue - # remove tracker arguments from the url-query part query_args: list[tuple[str, str]] = list(parse_qsl(parsed_new_url.query)) - - for name, val in query_args.copy(): - # remove URL arguments + if query_args: + # remove tracker arguments from the url-query part + for name, val in query_args.copy(): + # remove URL arguments + for pattern in rule[self.Fields.del_args]: + if re.match(pattern, name): + log.debug( + "TRACKER_PATTERNS: %s remove tracker arg: %s='%s'", parsed_new_url.netloc, name, val + ) + query_args.remove((name, val)) + + parsed_new_url = parsed_new_url._replace(query=urlencode(query_args)) + new_url = urlunparse(parsed_new_url) + + else: + # The query argument for URLs like: + # - 'http://example.org?q=' --> query_str is 'q=' and query_args is [] + # - 'http://example.org?/foo/bar' --> query_str is 'foo/bar' and query_args is [] + # is a simple string and not a key/value dict. 
for pattern in rule[self.Fields.del_args]: - if re.match(pattern, name): - log.debug("TRACKER_PATTERNS: %s remove tracker arg: %s='%s'", parsed_new_url.netloc, name, val) - query_args.remove((name, val)) - - parsed_new_url = parsed_new_url._replace(query=urlencode(query_args)) - new_url = urlunparse(parsed_new_url) + if re.match(pattern, query_str): + log.debug("TRACKER_PATTERNS: %s remove tracker arg: '%s'", parsed_new_url.netloc, query_str) + parsed_new_url = parsed_new_url._replace(query="") + new_url = urlunparse(parsed_new_url) + break if new_url != url: return new_url diff --git a/searx/webapp.py b/searx/webapp.py index 218959a9c..d7bb37717 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -356,6 +356,12 @@ def get_pretty_url(parsed_url: urllib.parse.ParseResult): path = parsed_url.path path = path[:-1] if len(path) > 0 and path[-1] == '/' else path path = unquote(path.replace("/", " › ")) + + # Keep the query argument for URLs like: + # - 'http://example.org?/foo/bar' --> parsed_url.query is 'foo/bar' + query_args: list[tuple[str, str]] = list(urllib.parse.parse_qsl(parsed_url.query)) + if not query_args and parsed_url.query: + path += (" › .." if len(parsed_url.query) > 24 else " › ") + parsed_url.query[-24:] return [parsed_url.scheme + "://" + parsed_url.netloc, path] |