diff options
| author | Ivan Gabaldon <igabaldon@inetol.net> | 2025-08-09 23:03:30 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-08-09 23:03:30 +0200 |
| commit | ce8929cabe27c7cf0bfb21b47786c7442ffb3712 (patch) | |
| tree | 8b478698e812efee8e457d34cef8fbd43d2da024 /searx/botdetection | |
| parent | 341d718c7f8557e03e184102f63b2f4a4364b939 (diff) | |
[mod] limiter: trusted proxies (#4911)
Replaces the `x_for` functionality with `trusted_proxies`. This allows defining
which IPs / IP ranges are trusted when extracting the client IP address from the
X-Forwarded-For and X-Real-IP headers.
We don't know if the proxy chain will give us the proper client
address (REMOTE_ADDR in the WSGI environment), so we rely on reading the headers
set by the proxy in front of SearXNG (if there is one, it must be added to
trusted_proxies), hoping it has done the proper checks. If a proxy in the
chain does not check the client address correctly, integrity is compromised;
this should be fixed by whoever manages the proxy, not us.
Closes:
- https://github.com/searxng/searxng/issues/4940
- https://github.com/searxng/searxng/issues/4939
- https://github.com/searxng/searxng/issues/4907
- https://github.com/searxng/searxng/issues/3632
- https://github.com/searxng/searxng/issues/3191
- https://github.com/searxng/searxng/issues/1237
Related:
- https://github.com/searxng/searxng-docker/issues/386
- https://github.com/inetol-infrastructure/searxng-container/issues/81
Diffstat (limited to 'searx/botdetection')
| -rw-r--r-- | searx/botdetection/__init__.py | 23 | ||||
| -rw-r--r-- | searx/botdetection/_helpers.py | 99 | ||||
| -rw-r--r-- | searx/botdetection/config.py | 33 | ||||
| -rw-r--r-- | searx/botdetection/http_accept.py | 5 | ||||
| -rw-r--r-- | searx/botdetection/http_accept_encoding.py | 5 | ||||
| -rw-r--r-- | searx/botdetection/http_accept_language.py | 5 | ||||
| -rw-r--r-- | searx/botdetection/http_connection.py | 5 | ||||
| -rw-r--r-- | searx/botdetection/http_sec_fetch.py | 4 | ||||
| -rw-r--r-- | searx/botdetection/http_user_agent.py | 5 | ||||
| -rw-r--r-- | searx/botdetection/ip_limit.py | 7 | ||||
| -rw-r--r-- | searx/botdetection/ip_lists.py | 16 | ||||
| -rw-r--r-- | searx/botdetection/link_token.py | 42 | ||||
| -rw-r--r-- | searx/botdetection/trusted_proxies.py | 175 | ||||
| -rw-r--r-- | searx/botdetection/valkeydb.py | 22 |
14 files changed, 301 insertions, 145 deletions
diff --git a/searx/botdetection/__init__.py b/searx/botdetection/__init__.py index 4079d97a9..e686e3de9 100644 --- a/searx/botdetection/__init__.py +++ b/searx/botdetection/__init__.py @@ -4,19 +4,22 @@ Implementations used for bot detection. """ +from __future__ import annotations + +__all__ = ["init", "dump_request", "get_network", "too_many_requests", "ProxyFix"] + + +import valkey from ._helpers import dump_request -from ._helpers import get_real_ip from ._helpers import get_network from ._helpers import too_many_requests - -__all__ = ['dump_request', 'get_network', 'get_real_ip', 'too_many_requests'] - -valkey_client = None -cfg = None +from . import config +from . import valkeydb +from .trusted_proxies import ProxyFix -def init(_cfg, _valkey_client): - global valkey_client, cfg # pylint: disable=global-statement - valkey_client = _valkey_client - cfg = _cfg +def init(cfg: config.Config, valkey_client: valkey.Valkey | None): + config.set_global_cfg(cfg) + if valkey_client: + valkeydb.set_valkey_client(valkey_client) diff --git a/searx/botdetection/_helpers.py b/searx/botdetection/_helpers.py index 7b57ae694..72af693c1 100644 --- a/searx/botdetection/_helpers.py +++ b/searx/botdetection/_helpers.py @@ -1,6 +1,9 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # pylint: disable=missing-module-docstring, invalid-name from __future__ import annotations +import typing as t + +__all__ = ["log_error_only_once", "dump_request", "get_network", "logger", "too_many_requests"] from ipaddress import ( IPv4Network, @@ -8,20 +11,19 @@ from ipaddress import ( IPv4Address, IPv6Address, ip_network, - ip_address, ) import flask import werkzeug from searx import logger -from searx.extended_types import SXNG_Request -from . import config +if t.TYPE_CHECKING: + from . 
import config logger = logger.getChild('botdetection') -def dump_request(request: SXNG_Request): +def dump_request(request: flask.Request): return ( request.path + " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For') @@ -52,86 +54,33 @@ def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkz def get_network(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> IPv4Network | IPv6Network: - """Returns the (client) network of whether the real_ip is part of.""" + """Returns the (client) network of whether the ``real_ip`` is part of. + + The ``ipv4_prefix`` and ``ipv6_prefix`` define the number of leading bits in + an address that are compared to determine whether or not an address is part + of a (client) network. + + .. code:: toml + + [botdetection] + + ipv4_prefix = 32 + ipv6_prefix = 48 + + """ + prefix: int = cfg["botdetection.ipv4_prefix"] if real_ip.version == 6: - prefix = cfg['real_ip.ipv6_prefix'] - else: - prefix = cfg['real_ip.ipv4_prefix'] + prefix: int = cfg["botdetection.ipv6_prefix"] network = ip_network(f"{real_ip}/{prefix}", strict=False) # logger.debug("get_network(): %s", network.compressed) return network -_logged_errors = [] +_logged_errors: list[str] = [] -def _log_error_only_once(err_msg): +def log_error_only_once(err_msg: str): if err_msg not in _logged_errors: logger.error(err_msg) _logged_errors.append(err_msg) - - -def get_real_ip(request: SXNG_Request) -> str: - """Returns real IP of the request. Since not all proxies set all the HTTP - headers and incoming headers can be faked it may happen that the IP cannot - be determined correctly. - - .. sidebar:: :py:obj:`flask.Request.remote_addr` - - SearXNG uses Werkzeug's ProxyFix_ (with it default ``x_for=1``). - - This function tries to get the remote IP in the order listed below, - additional some tests are done and if inconsistencies or errors are - detected, they are logged. 
- - The remote IP of the request is taken from (first match): - - - X-Forwarded-For_ header - - `X-real-IP header <https://github.com/searxng/searxng/issues/1237#issuecomment-1147564516>`__ - - :py:obj:`flask.Request.remote_addr` - - .. _ProxyFix: - https://werkzeug.palletsprojects.com/middleware/proxy_fix/ - - .. _X-Forwarded-For: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For - - """ - - forwarded_for = request.headers.get("X-Forwarded-For") - real_ip = request.headers.get('X-Real-IP') - remote_addr = request.remote_addr - # logger.debug( - # "X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr - # ) - - if not forwarded_for: - _log_error_only_once("X-Forwarded-For header is not set!") - else: - from . import cfg # pylint: disable=import-outside-toplevel, cyclic-import - - forwarded_for = [x.strip() for x in forwarded_for.split(',')] - x_for: int = cfg['real_ip.x_for'] # type: ignore - forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)] - - if not real_ip: - _log_error_only_once("X-Real-IP header is not set!") - - if forwarded_for and real_ip and forwarded_for != real_ip: - logger.warning("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", real_ip, forwarded_for) - - if forwarded_for and remote_addr and forwarded_for != remote_addr: - logger.warning( - "IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", remote_addr, forwarded_for - ) - - if real_ip and remote_addr and real_ip != remote_addr: - logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip) - - request_ip = ip_address(forwarded_for or real_ip or remote_addr or '0.0.0.0') - if request_ip.version == 6 and request_ip.ipv4_mapped: - request_ip = request_ip.ipv4_mapped - - # logger.debug("get_real_ip() -> %s", request_ip) - return str(request_ip) diff --git a/searx/botdetection/config.py b/searx/botdetection/config.py index 
5b73afe1c..6b35df84f 100644 --- a/searx/botdetection/config.py +++ b/searx/botdetection/config.py @@ -7,19 +7,32 @@ structured dictionaries. The configuration schema is defined in a dictionary structure and the configuration data is given in a dictionary structure. """ from __future__ import annotations -from typing import Any +import typing import copy -import typing import logging import pathlib from ..compat import tomllib -__all__ = ['Config', 'UNSET', 'SchemaIssue'] +__all__ = ['Config', 'UNSET', 'SchemaIssue', 'set_global_cfg', 'get_global_cfg'] log = logging.getLogger(__name__) +CFG: Config | None = None +"""Global config of the botdetection.""" + + +def set_global_cfg(cfg: Config): + global CFG # pylint: disable=global-statement + CFG = cfg + + +def get_global_cfg() -> Config: + if CFG is None: + raise ValueError("Botdetection's config is not yet initialized.") + return CFG + class FALSE: """Class of ``False`` singleton""" @@ -57,7 +70,7 @@ class Config: UNSET = UNSET @classmethod - def from_toml(cls, schema_file: pathlib.Path, cfg_file: pathlib.Path, deprecated: dict) -> Config: + def from_toml(cls, schema_file: pathlib.Path, cfg_file: pathlib.Path, deprecated: dict[str, str]) -> Config: # init schema @@ -80,7 +93,7 @@ class Config: cfg.update(upd_cfg) return cfg - def __init__(self, cfg_schema: typing.Dict, deprecated: typing.Dict[str, str]): + def __init__(self, cfg_schema: dict[str, typing.Any], deprecated: dict[str, str]): """Constructor of class Config. :param cfg_schema: Schema of the configuration @@ -93,10 +106,10 @@ class Config: self.deprecated = deprecated self.cfg = copy.deepcopy(cfg_schema) - def __getitem__(self, key: str) -> Any: + def __getitem__(self, key: str) -> typing.Any: return self.get(key) - def validate(self, cfg: dict): + def validate(self, cfg: dict[str, typing.Any]): """Validation of dictionary ``cfg`` on :py:obj:`Config.SCHEMA`. 
Validation is done by :py:obj:`validate`.""" @@ -111,7 +124,7 @@ class Config: """Returns default value of field ``name`` in ``self.cfg_schema``.""" return value(name, self.cfg_schema) - def get(self, name: str, default: Any = UNSET, replace: bool = True) -> Any: + def get(self, name: str, default: typing.Any = UNSET, replace: bool = True) -> typing.Any: """Returns the value to which ``name`` points in the configuration. If there is no such ``name`` in the config and the ``default`` is @@ -214,8 +227,8 @@ def value(name: str, data_dict: dict): def validate( - schema_dict: typing.Dict, data_dict: typing.Dict, deprecated: typing.Dict[str, str] -) -> typing.Tuple[bool, list]: + schema_dict: dict[str, typing.Any], data_dict: dict[str, typing.Any], deprecated: dict[str, str] +) -> tuple[bool, list[str]]: """Deep validation of dictionary in ``data_dict`` against dictionary in ``schema_dict``. Argument deprecated is a dictionary that maps deprecated configuration names to a messages:: diff --git a/searx/botdetection/http_accept.py b/searx/botdetection/http_accept.py index f64991d50..4543e7217 100644 --- a/searx/botdetection/http_accept.py +++ b/searx/botdetection/http_accept.py @@ -20,8 +20,7 @@ from ipaddress import ( ) import werkzeug - -from searx.extended_types import SXNG_Request +import flask from . import config from ._helpers import too_many_requests @@ -29,7 +28,7 @@ from ._helpers import too_many_requests def filter_request( network: IPv4Network | IPv6Network, - request: SXNG_Request, + request: flask.Request, cfg: config.Config, # pylint: disable=unused-argument ) -> werkzeug.Response | None: diff --git a/searx/botdetection/http_accept_encoding.py b/searx/botdetection/http_accept_encoding.py index 0975cc85e..3cc65ee17 100644 --- a/searx/botdetection/http_accept_encoding.py +++ b/searx/botdetection/http_accept_encoding.py @@ -21,8 +21,7 @@ from ipaddress import ( ) import werkzeug - -from searx.extended_types import SXNG_Request +import flask from . 
import config from ._helpers import too_many_requests @@ -30,7 +29,7 @@ from ._helpers import too_many_requests def filter_request( network: IPv4Network | IPv6Network, - request: SXNG_Request, + request: flask.Request, cfg: config.Config, # pylint: disable=unused-argument ) -> werkzeug.Response | None: diff --git a/searx/botdetection/http_accept_language.py b/searx/botdetection/http_accept_language.py index 1287e5453..6e7480728 100644 --- a/searx/botdetection/http_accept_language.py +++ b/searx/botdetection/http_accept_language.py @@ -18,8 +18,7 @@ from ipaddress import ( ) import werkzeug - -from searx.extended_types import SXNG_Request +import flask from . import config from ._helpers import too_many_requests @@ -27,7 +26,7 @@ from ._helpers import too_many_requests def filter_request( network: IPv4Network | IPv6Network, - request: SXNG_Request, + request: flask.Request, cfg: config.Config, # pylint: disable=unused-argument ) -> werkzeug.Response | None: if request.headers.get('Accept-Language', '').strip() == '': diff --git a/searx/botdetection/http_connection.py b/searx/botdetection/http_connection.py index eed15f989..6adcd4b39 100644 --- a/searx/botdetection/http_connection.py +++ b/searx/botdetection/http_connection.py @@ -18,8 +18,7 @@ from ipaddress import ( ) import werkzeug - -from searx.extended_types import SXNG_Request +import flask from . 
import config from ._helpers import too_many_requests @@ -27,7 +26,7 @@ from ._helpers import too_many_requests def filter_request( network: IPv4Network | IPv6Network, - request: SXNG_Request, + request: flask.Request, cfg: config.Config, # pylint: disable=unused-argument ) -> werkzeug.Response | None: diff --git a/searx/botdetection/http_sec_fetch.py b/searx/botdetection/http_sec_fetch.py index f64ee4b2c..edead3bfa 100644 --- a/searx/botdetection/http_sec_fetch.py +++ b/searx/botdetection/http_sec_fetch.py @@ -32,8 +32,6 @@ import re import flask import werkzeug -from searx.extended_types import SXNG_Request - from . import config from ._helpers import logger @@ -78,7 +76,7 @@ def is_browser_supported(user_agent: str) -> bool: def filter_request( network: IPv4Network | IPv6Network, - request: SXNG_Request, + request: flask.Request, cfg: config.Config, ) -> werkzeug.Response | None: diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py index 57d5bfee7..9b28660fe 100644 --- a/searx/botdetection/http_user_agent.py +++ b/searx/botdetection/http_user_agent.py @@ -20,8 +20,7 @@ from ipaddress import ( ) import werkzeug - -from searx.extended_types import SXNG_Request +import flask from . import config from ._helpers import too_many_requests @@ -56,7 +55,7 @@ def regexp_user_agent(): def filter_request( network: IPv4Network | IPv6Network, - request: SXNG_Request, + request: flask.Request, cfg: config.Config, # pylint: disable=unused-argument ) -> werkzeug.Response | None: diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index 93af8b7c5..2b216baf7 100644 --- a/searx/botdetection/ip_limit.py +++ b/searx/botdetection/ip_limit.py @@ -45,12 +45,11 @@ from ipaddress import ( import flask import werkzeug -from searx.extended_types import SXNG_Request -from searx import valkeydb from searx.valkeylib import incr_sliding_window, drop_counter from . import link_token from . import config +from . 
import valkeydb from ._helpers import ( too_many_requests, logger, @@ -92,12 +91,12 @@ SUSPICIOUS_IP_MAX = 3 def filter_request( network: IPv4Network | IPv6Network, - request: SXNG_Request, + request: flask.Request, cfg: config.Config, ) -> werkzeug.Response | None: # pylint: disable=too-many-return-statements - valkey_client = valkeydb.client() + valkey_client = valkeydb.get_valkey_client() if network.is_link_local and not cfg['botdetection.ip_limit.filter_link_local']: logger.debug("network %s is link-local -> not monitored by ip_limit method", network.compressed) diff --git a/searx/botdetection/ip_lists.py b/searx/botdetection/ip_lists.py index 2ad1c62d0..77628b577 100644 --- a/searx/botdetection/ip_lists.py +++ b/searx/botdetection/ip_lists.py @@ -4,21 +4,22 @@ Method ``ip_lists`` ------------------- -The ``ip_lists`` method implements IP :py:obj:`block- <block_ip>` and -:py:obj:`pass-lists <pass_ip>`. +The ``ip_lists`` method implements :py:obj:`block-list <block_ip>` and +:py:obj:`pass-list <pass_ip>`. .. 
code:: toml [botdetection.ip_lists] pass_ip = [ - '167.235.158.251', # IPv4 of check.searx.space - '192.168.0.0/16', # IPv4 private network - 'fe80::/10' # IPv6 linklocal + '167.235.158.251', # IPv4 of check.searx.space + '192.168.0.0/16', # IPv4 private network + 'fe80::/10', # IPv6 linklocal ] + block_ip = [ - '93.184.216.34', # IPv4 of example.org - '257.1.1.1', # invalid IP --> will be ignored, logged in ERROR class + '93.184.216.34', # IPv4 of example.org + '257.1.1.1', # invalid IP --> will be ignored, logged in ERROR class ] """ @@ -72,7 +73,6 @@ def block_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bo def ip_is_subnet_of_member_in_list( real_ip: IPv4Address | IPv6Address, list_name: str, cfg: config.Config ) -> Tuple[bool, str]: - for net in cfg.get(list_name, default=[]): try: net = ip_network(net, strict=False) diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py index 600796380..9e815e194 100644 --- a/searx/botdetection/link_token.py +++ b/searx/botdetection/link_token.py @@ -43,17 +43,18 @@ from ipaddress import ( import string import random +import flask -from searx import logger -from searx import valkeydb from searx.valkeylib import secret_hash -from searx.extended_types import SXNG_Request from ._helpers import ( get_network, - get_real_ip, + logger, ) +from . import config +from . import valkeydb + TOKEN_LIVE_TIME = 600 """Lifetime (sec) of limiter's CSS token.""" @@ -69,17 +70,14 @@ TOKEN_KEY = 'SearXNG_limiter.token' logger = logger.getChild('botdetection.link_token') -def is_suspicious(network: IPv4Network | IPv6Network, request: SXNG_Request, renew: bool = False): +def is_suspicious(network: IPv4Network | IPv6Network, request: flask.Request, renew: bool = False): """Checks whether a valid ping is exists for this (client) network, if not this request is rated as *suspicious*. 
If a valid ping exists and argument ``renew`` is ``True`` the expire time of this ping is reset to :py:obj:`PING_LIVE_TIME`. """ - valkey_client = valkeydb.client() - if not valkey_client: - return False - + valkey_client = valkeydb.get_valkey_client() ping_key = get_ping_key(network, request) if not valkey_client.get(ping_key): logger.info("missing ping (IP: %s) / request: %s", network.compressed, ping_key) @@ -92,28 +90,29 @@ def is_suspicious(network: IPv4Network | IPv6Network, request: SXNG_Request, ren return False -def ping(request: SXNG_Request, token: str): +def ping(request: flask.Request, token: str): """This function is called by a request to URL ``/client<token>.css``. If ``token`` is valid a :py:obj:`PING_KEY` for the client is stored in the DB. The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`. """ - from . import valkey_client, cfg # pylint: disable=import-outside-toplevel, cyclic-import + valkey_client = valkeydb.get_valkey_client() + cfg = config.get_global_cfg() - if not valkey_client: - return if not token_is_valid(token): return - real_ip = ip_address(get_real_ip(request)) + real_ip = ip_address(request.remote_addr) # type: ignore network = get_network(real_ip, cfg) ping_key = get_ping_key(network, request) - logger.debug("store ping_key for (client) network %s (IP %s) -> %s", network.compressed, real_ip, ping_key) + logger.debug( + "store ping_key for (client) network %s (IP %s) -> %s", network.compressed, real_ip.compressed, ping_key + ) valkey_client.set(ping_key, 1, ex=PING_LIVE_TIME) -def get_ping_key(network: IPv4Network | IPv6Network, request: SXNG_Request) -> str: +def get_ping_key(network: IPv4Network | IPv6Network, request: flask.Request) -> str: """Generates a hashed key that fits (more or less) to a *WEB-browser session* in a network.""" return ( @@ -134,20 +133,23 @@ def token_is_valid(token) -> bool: def get_token() -> str: """Returns current token. 
If there is no currently active token a new token - is generated randomly and stored in the valkey DB. + is generated randomly and stored in the Valkey DB. Without without a + database connection, string "12345678" is returned. - :py:obj:`TOKEN_LIVE_TIME` - :py:obj:`TOKEN_KEY` """ - valkey_client = valkeydb.client() - if not valkey_client: + try: + valkey_client = valkeydb.get_valkey_client() + except ValueError: # This function is also called when limiter is inactive / no valkey DB # (see render function in webapp.py) return '12345678' + token = valkey_client.get(TOKEN_KEY) if token: - token = token.decode('UTF-8') + token = token.decode('UTF-8') # type: ignore else: token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16)) valkey_client.set(TOKEN_KEY, token, ex=TOKEN_LIVE_TIME) diff --git a/searx/botdetection/trusted_proxies.py b/searx/botdetection/trusted_proxies.py new file mode 100644 index 000000000..7191f0eb2 --- /dev/null +++ b/searx/botdetection/trusted_proxies.py @@ -0,0 +1,175 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Implementation of a middleware to determine the real IP of an HTTP request +(:py:obj:`flask.request.remote_addr`) behind a proxy chain.""" +# pylint: disable=too-many-branches + +from __future__ import annotations +import typing as t + +from collections import abc +from ipaddress import IPv4Address, IPv6Address, ip_address, ip_network, IPv4Network, IPv6Network +from werkzeug.http import parse_list_header + +from . import config +from ._helpers import log_error_only_once, logger + +if t.TYPE_CHECKING: + from _typeshed.wsgi import StartResponse + from _typeshed.wsgi import WSGIApplication + from _typeshed.wsgi import WSGIEnvironment + + +class ProxyFix: + """A middleware like the ProxyFix_ class, where the `x_for` argument is + replaced by a method that determines the number of trusted proxies via + the `botdetection.trusted_proxies` setting. + + .. 
sidebar:: :py:obj:`flask.Request.remote_addr` + + SearXNG uses Werkzeug's ProxyFix_ (with it default ``x_for=1``). + + The remote IP (py:obj:`flask.Request.remote_addr`) of the request is taken + from (first match): + + - X-Forwarded-For_: If the header is set, the first untrusted IP that comes + before the IPs that are still part of the ``botdetection.trusted_proxies`` + is used. + + - `X-Real-IP <https://github.com/searxng/searxng/issues/1237#issuecomment-1147564516>`__: + If X-Forwarded-For_ is not set, `X-Real-IP` is used + (``botdetection.trusted_proxies`` is ignored). + + If none of the header is set, the REMOTE_ADDR_ from the WSGI layer is used. + If (for whatever reasons) none IP can be determined, an error message is + displayed and ``100::`` is used instead (:rfc:`6666`). + + .. _ProxyFix: + https://werkzeug.palletsprojects.com/middleware/proxy_fix/ + + .. _X-Forwarded-For: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For + + .. _REMOTE_ADDR: + https://wsgi.readthedocs.io/en/latest/proposals-2.0.html#making-some-keys-required + + """ + + def __init__(self, wsgi_app: WSGIApplication) -> None: + self.wsgi_app = wsgi_app + + def trusted_proxies(self) -> list[IPv4Network | IPv6Network]: + cfg = config.get_global_cfg() + proxy_list: list[str] = cfg.get("botdetection.trusted_proxies", default=[]) + return [ip_network(net, strict=False) for net in proxy_list] + + def trusted_remote_addr( + self, + x_forwarded_for: list[IPv4Address | IPv6Address], + trusted_proxies: list[IPv4Network | IPv6Network], + ) -> str: + # always rtl + for addr in reversed(x_forwarded_for): + trust: bool = False + + for net in trusted_proxies: + if addr.version == net.version and addr in net: + logger.debug("trust proxy %s (member of %s)", addr, net) + trust = True + break + + # client address + if not trust: + return addr.compressed + + # fallback to first address + return x_forwarded_for[0].compressed + + def __call__(self, environ: WSGIEnvironment, 
start_response: StartResponse) -> abc.Iterable[bytes]: + # pylint: disable=too-many-statements + + trusted_proxies = self.trusted_proxies() + + # We do not rely on the REMOTE_ADDR from the WSGI environment / the + # variable is first removed from the WSGI environment and explicitly set + # in this function! + + orig_remote_addr: str | None = environ.pop("REMOTE_ADDR") + + # Validate the IPs involved in this game and delete all invalid ones + # from the WSGI environment. + + if orig_remote_addr: + try: + addr = ip_address(orig_remote_addr) + if addr.version == 6 and addr.ipv4_mapped: + addr = addr.ipv4_mapped + orig_remote_addr = addr.compressed + except ValueError as exc: + logger.error("REMOTE_ADDR: %s / discard REMOTE_ADDR from WSGI environment", exc) + orig_remote_addr = None + + x_real_ip: str | None = environ.get("HTTP_X_REAL_IP") + if x_real_ip: + try: + addr = ip_address(x_real_ip) + if addr.version == 6 and addr.ipv4_mapped: + addr = addr.ipv4_mapped + x_real_ip = addr.compressed + except ValueError as exc: + logger.error("X-Real-IP: %s / discard HTTP_X_REAL_IP from WSGI environment", exc) + environ.pop("HTTP_X_REAL_IP") + x_real_ip = None + + x_forwarded_for: list[IPv4Address | IPv6Address] = [] + if environ.get("HTTP_X_FORWARDED_FOR"): + for x_for_ip in parse_list_header(str(environ.get("HTTP_X_FORWARDED_FOR"))): + try: + addr = ip_address(x_for_ip) + except ValueError as exc: + logger.error("X-Forwarded-For: %s / discard HTTP_X_FORWARDED_FOR from WSGI environment", exc) + environ.pop("HTTP_X_FORWARDED_FOR") + x_forwarded_for = [] + break + + if addr.version == 6 and addr.ipv4_mapped: + addr = addr.ipv4_mapped + x_forwarded_for.append(addr) + + # log questionable WSGI environments + + if not x_forwarded_for and not x_real_ip: + log_error_only_once("X-Forwarded-For nor X-Real-IP header is set!") + + if x_forwarded_for and not trusted_proxies: + log_error_only_once("missing botdetection.trusted_proxies config") + # without trusted_proxies, this variable is 
useless for determining + # the real IP + x_forwarded_for = [] + + # securing the WSGI environment variables that are adjusted + + environ.update({"botdetection.trusted_proxies.orig": {"REMOTE_ADDR": orig_remote_addr}}) + + # determine *the real IP* + + if x_forwarded_for: + environ["REMOTE_ADDR"] = self.trusted_remote_addr(x_forwarded_for, trusted_proxies) + + elif x_real_ip: + environ["REMOTE_ADDR"] = x_real_ip + + elif orig_remote_addr: + environ["REMOTE_ADDR"] = orig_remote_addr + + else: + logger.error("No remote IP could be determined, use black-hole address: 100::") + environ["REMOTE_ADDR"] = "100::" + + try: + _ = ip_address(environ["REMOTE_ADDR"]) + except ValueError as exc: + logger.error("REMOTE_ADDR: %s, use black-hole address: 100::", exc) + environ["REMOTE_ADDR"] = "100::" + + logger.debug("final REMOTE_ADDR is: %s", environ["REMOTE_ADDR"]) + return self.wsgi_app(environ, start_response) diff --git a/searx/botdetection/valkeydb.py b/searx/botdetection/valkeydb.py new file mode 100644 index 000000000..3b8699786 --- /dev/null +++ b/searx/botdetection/valkeydb.py @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Providing a Valkey database for the botdetection methods.""" + +from __future__ import annotations + +import valkey + +__all__ = ["set_valkey_client", "get_valkey_client"] + +CLIENT: valkey.Valkey | None = None +"""Global Valkey DB connection (Valkey client object).""" + + +def set_valkey_client(valkey_client: valkey.Valkey): + global CLIENT # pylint: disable=global-statement + CLIENT = valkey_client + + +def get_valkey_client() -> valkey.Valkey: + if CLIENT is None: + raise ValueError("No connection to the Valkey database has been established.") + return CLIENT |