summaryrefslogtreecommitdiff
path: root/searx/botdetection/_helpers.py
diff options
context:
space:
mode:
authorMarkus Heiser <markus.heiser@darmarIT.de>2023-06-03 06:00:15 +0200
committerGitHub <noreply@github.com>2023-06-03 06:00:15 +0200
commit80aaef6c95b572df1fa3a8c30b7fdc1538d7a306 (patch)
treea023de5925099e69b6b5de850cab1e0ddc330f39 /searx/botdetection/_helpers.py
parent1a1ab34d9dba7368b146bc7402e8f54bafea98bb (diff)
parent80af38d37b21dc6e5edbf27bd22310db42a6f923 (diff)
Merge pull request #2357 / limiter -> botdetection
The monolithic implementation of the limiter was divided into methods and implemented in the Python package searx.botdetection. Detailed documentation on the methods has been added. The methods are divided into two groups: 1. Probe HTTP headers - Method http_accept - Method http_accept_encoding - Method http_accept_language - Method http_connection - Method http_user_agent 2. Rate limit: - Method ip_limit - Method link_token (new) The (reduced) implementation of the limiter is now in the module searx.botdetection.limiter. The first group was transferred unchanged to this module. The ip_limit method contains the sliding windows implemented by the limiter so far. This merge also fixes some long outstanding issues: - limiter does not evaluate the Accept-Language correctly [1] - limiter needs an IPv6 prefix to block networks instead of IPs [2] Without additional configuration the limiter works as before (apart from the bugfixes). For the commissioning of additional methods (link_token), a configuration must be made in an additional configuration file. Without this configuration, the limiter runs as before (zero configuration). The ip_limit method implements the sliding windows of the vanilla limiter; additionally, the link_token method can be used in this method. The link_token method can be used to investigate whether a request is suspicious. To activate the link_token method in the ip_limit method, add the following to your /etc/searxng/limiter.toml:: [botdetection.ip_limit] link_token = true [1] https://github.com/searxng/searxng/issues/2455 [2] https://github.com/searxng/searxng/issues/2477
Diffstat (limited to 'searx/botdetection/_helpers.py')
-rw-r--r--searx/botdetection/_helpers.py121
1 files changed, 121 insertions, 0 deletions
diff --git a/searx/botdetection/_helpers.py b/searx/botdetection/_helpers.py
new file mode 100644
index 000000000..8e0156d6e
--- /dev/null
+++ b/searx/botdetection/_helpers.py
@@ -0,0 +1,121 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+# pylint: disable=missing-module-docstring, invalid-name
+from __future__ import annotations
+
+from ipaddress import (
+ IPv4Network,
+ IPv6Network,
+ IPv6Address,
+ ip_address,
+ ip_network,
+)
+import flask
+import werkzeug
+
+from searx.tools import config
+from searx import logger
+
+logger = logger.getChild('botdetection')
+
+
def dump_request(request: flask.Request):
    """Return a one-line summary of ``request`` for use in log messages.

    The summary starts with the request path, followed by the
    ``X-Forwarded-For`` and ``X-Real-IP`` headers, the submitted form data and
    the remaining probed headers.  The fields are separated by ``" || "``; a
    header that is not set shows up as ``None``.

    """
    headers = request.headers

    def header_field(name):
        # "<header name>: <value or None>"
        return f"{name}: {headers.get(name)}"

    fields = [
        request.path,
        header_field('X-Forwarded-For'),
        header_field('X-Real-IP'),
        f"form: {request.form}",
    ]
    fields.extend(
        header_field(name)
        for name in (
            'Accept',
            'Accept-Language',
            'Accept-Encoding',
            'Content-Type',
            'Content-Length',
            'Connection',
            'User-Agent',
        )
    )
    return " || ".join(fields)
+
+
def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkzeug.Response | None:
    """Build the default ``Too Many Requests`` (HTTP 429) response.

    Before the response object is created, a ``BLOCK <network>: <log_msg>``
    record is written to the 'botdetection' logger at DEBUG level.  This
    function is used in part by the filter methods to return the default
    ``Too Many Requests`` response.

    """
    blocked_network = network.compressed
    logger.debug("BLOCK %s: %s", blocked_network, log_msg)

    response = flask.make_response(('Too Many Requests', 429))
    return response
+
+
def get_network(real_ip: str, cfg: config.Config) -> IPv4Network | IPv6Network:
    """Return the (client) network that ``real_ip`` is part of.

    The prefix length is read from ``cfg``: ``real_ip.ipv6_prefix`` is used
    for an IPv6 address and ``real_ip.ipv4_prefix`` otherwise.  The network is
    built with ``strict=False``, so the host bits of ``real_ip`` are masked
    out instead of being rejected.

    """
    is_ipv6 = isinstance(ip_address(real_ip), IPv6Address)
    prefix_key = 'real_ip.ipv6_prefix' if is_ipv6 else 'real_ip.ipv4_prefix'
    return ip_network(f"{real_ip}/{cfg[prefix_key]}", strict=False)
+
+
def get_real_ip(request: flask.Request) -> str:
    """Returns the real IP of the request.  Since not all proxies set all the
    HTTP headers and incoming headers can be faked, it may happen that the IP
    cannot be determined correctly.

    .. sidebar:: :py:obj:`flask.Request.remote_addr`

       SearXNG uses Werkzeug's ProxyFix_ (with its default ``x_for=1``).

    This function tries to get the remote IP in the order listed below.
    Additionally, some tests are done, and if inconsistencies or errors are
    detected, they are logged.

    The remote IP of the request is taken from (first match):

    - X-Forwarded-For_ header (the entry selected by counting
      ``real_ip.x_for`` positions from the right of the comma separated list)
    - `X-real-IP header <https://github.com/searxng/searxng/issues/1237#issuecomment-1147564516>`__
    - :py:obj:`flask.Request.remote_addr`

    If none of them yields a value, ``'0.0.0.0'`` is returned.

    .. _ProxyFix:
       https://werkzeug.palletsprojects.com/middleware/proxy_fix/

    .. _X-Forwarded-For:
       https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For

    """

    xff_header = request.headers.get("X-Forwarded-For")
    x_real_ip = request.headers.get('X-Real-IP')
    wsgi_addr = request.remote_addr

    forwarded_ip = xff_header
    if xff_header:
        # imported lazily: the limiter module itself depends on this module
        from .limiter import get_cfg  # pylint: disable=import-outside-toplevel, cyclic-import

        hops = [hop.strip() for hop in xff_header.split(',')]
        x_for: int = get_cfg()['real_ip.x_for']
        # count ``x_for`` entries from the right, but never beyond the list
        depth = min(len(hops), x_for)
        forwarded_ip = hops[-depth]
    else:
        logger.error("X-Forwarded-For header is not set!")

    if not x_real_ip:
        logger.error("X-Real-IP header is not set!")

    # Cross-check the three sources; a pair is only compared when both values
    # are available.
    consistency_checks = (
        ("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", x_real_ip, forwarded_ip),
        ("IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", wsgi_addr, forwarded_ip),
        ("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", wsgi_addr, x_real_ip),
    )
    for message, first_ip, second_ip in consistency_checks:
        if first_ip and second_ip and first_ip != second_ip:
            logger.warning(message, first_ip, second_ip)

    # first source that delivered a value wins
    for candidate in (forwarded_ip, x_real_ip, wsgi_addr):
        if candidate:
            return candidate
    return '0.0.0.0'