From 1ec325adccc427fe05cf08da9a2d9d63da7365f4 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Tue, 23 May 2023 18:16:37 +0200 Subject: [mod] limiter -> botdetection: modularization and documentation In order to be able to meet the outstanding requirements, the implementation is modularized and supplemented with documentation. This patch does not contain functional change, except it fixes issue #2455 ---- Aktivate limiter in the settings.yml and simulate a bot request by:: curl -H 'Accept-Language: de-DE,en-US;q=0.7,en;q=0.3' \ -H 'Accept: text/html' -H 'User-Agent: xyz' \ -H 'Accept-Encoding: gzip' \ 'http://127.0.0.1:8888/search?q=foo' In the LOG: DEBUG searx.botdetection.link_token : missing ping for this request: ..... Since ``BURST_MAX_SUSPICIOUS = 2`` you can repeat the query above two time before you get a "Too Many Requests" response. Closes: https://github.com/searxng/searxng/issues/2455 Signed-off-by: Markus Heiser --- searx/botdetection/__init__.py | 26 ++++++ searx/botdetection/http_accept.py | 24 ++++++ searx/botdetection/http_accept_encoding.py | 26 ++++++ searx/botdetection/http_accept_language.py | 23 ++++++ searx/botdetection/http_connection.py | 23 ++++++ searx/botdetection/http_user_agent.py | 54 +++++++++++++ searx/botdetection/ip_limit.py | 90 +++++++++++++++++++++ searx/botdetection/limiter.py | 79 ++++++++++++++++++ searx/botdetection/link_token.py | 126 +++++++++++++++++++++++++++++ 9 files changed, 471 insertions(+) create mode 100644 searx/botdetection/__init__.py create mode 100644 searx/botdetection/http_accept.py create mode 100644 searx/botdetection/http_accept_encoding.py create mode 100644 searx/botdetection/http_accept_language.py create mode 100644 searx/botdetection/http_connection.py create mode 100644 searx/botdetection/http_user_agent.py create mode 100644 searx/botdetection/ip_limit.py create mode 100644 searx/botdetection/limiter.py create mode 100644 searx/botdetection/link_token.py (limited to 'searx/botdetection') diff 
--git a/searx/botdetection/__init__.py b/searx/botdetection/__init__.py new file mode 100644 index 000000000..78a7d30f3 --- /dev/null +++ b/searx/botdetection/__init__.py @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""".. _botdetection src: + +Bot detection methods +--------------------- + +The methods implemented in this python package are use by the :ref:`limiter src`. + +""" + +import flask + + +def dump_request(request: flask.Request): + return ( + "%s: '%s'" % (request.headers.get('X-Forwarded-For'), request.path) + + " || form: %s" % request.form + + " || Accept: %s" % request.headers.get('Accept') + + " || Accept-Language: %s" % request.headers.get('Accept-Language') + + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding') + + " || Content-Type: %s" % request.headers.get('Content-Type') + + " || Content-Length: %s" % request.headers.get('Content-Length') + + " || Connection: %s" % request.headers.get('Connection') + + " || User-Agent: %s" % request.headers.get('User-Agent') + ) diff --git a/searx/botdetection/http_accept.py b/searx/botdetection/http_accept.py new file mode 100644 index 000000000..1ab7cb4c1 --- /dev/null +++ b/searx/botdetection/http_accept.py @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_accept`` +---------------------- + +The ``http_accept`` method evaluates a request as the request of a bot if the +Accept_ header .. + +- did not contain ``text/html`` + +.. 
_Accept: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept + +""" + +from typing import Optional, Tuple +import flask + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + if 'text/html' not in request.accept_mimetypes: + return 429, "bot detected, HTTP header Accept did not contain text/html" + return None diff --git a/searx/botdetection/http_accept_encoding.py b/searx/botdetection/http_accept_encoding.py new file mode 100644 index 000000000..ae630fd68 --- /dev/null +++ b/searx/botdetection/http_accept_encoding.py @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_accept_encoding`` +------------------------------- + +The ``http_accept_encoding`` method evaluates a request as the request of a +bot if the Accept-Encoding_ header .. + +- did not contain ``gzip`` AND ``deflate`` (if both values are missed) +- did not contain ``text/html`` + +.. _Accept-Encoding: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding + +""" + +from typing import Optional, Tuple +import flask + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] + if not ('gzip' in accept_list or 'deflate' in accept_list): + return 429, "bot detected, HTTP header Accept-Encoding did not contain gzip nor deflate" + return None diff --git a/searx/botdetection/http_accept_language.py b/searx/botdetection/http_accept_language.py new file mode 100644 index 000000000..06743802e --- /dev/null +++ b/searx/botdetection/http_accept_language.py @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_accept_language`` +------------------------------- + +The ``http_accept_language`` method evaluates a request as the request of a bot +if the Accept-Language_ header is unset. + +.. 
_Accept-Language: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent + +""" + + +from typing import Optional, Tuple +import flask + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + if request.headers.get('Accept-Language', '').strip() == '': + return 429, "bot detected, missing HTTP header Accept-Language" + return None diff --git a/searx/botdetection/http_connection.py b/searx/botdetection/http_connection.py new file mode 100644 index 000000000..f61f5e48c --- /dev/null +++ b/searx/botdetection/http_connection.py @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_connection`` +-------------------------- + +The ``http_connection`` method evaluates a request as the request of a bot if +the Connection_ header is set to ``close``. + +.. _Connection: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection + +""" + + +from typing import Optional, Tuple +import flask + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + if request.headers.get('Connection', '').strip() == 'close': + return 429, "bot detected, HTTP header 'Connection=close'" + return None diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py new file mode 100644 index 000000000..892ae0bd9 --- /dev/null +++ b/searx/botdetection/http_user_agent.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_user_agent`` +-------------------------- + +The ``http_user_agent`` method evaluates a request as the request of a bot if +the User-Agent_ header is unset or matches the regular expression +:py:obj:`USER_AGENT`. + +.. 
_User-Agent: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent + +""" + +from typing import Optional, Tuple +import re +import flask + +USER_AGENT = ( + r'(' + + r'unknown' + + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp' + + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy' + + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot' + + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot' + + r'|ZmEu|BLEXBot|bitlybot' + # unmaintained Farside instances + + r'|' + + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)') + # other bots and client to block + + '|.*PetalBot.*' + + r')' +) +"""Regular expression that matches to User-Agent_ from known *bots*""" + +_regexp = None + + +def regexp_user_agent(): + global _regexp # pylint: disable=global-statement + if not _regexp: + _regexp = re.compile(USER_AGENT) + return _regexp + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + user_agent = request.headers.get('User-Agent', 'unknown') + if regexp_user_agent().match(user_agent): + return ( + 429, + f"bot detected, HTTP header User-Agent: {user_agent}", + ) + return None diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py new file mode 100644 index 000000000..fce3f8b67 --- /dev/null +++ b/searx/botdetection/ip_limit.py @@ -0,0 +1,90 @@ +""" +Method ``ip_limit`` +------------------- + +The ``ip_limit`` method counts request from an IP in *sliding windows*. If +there are to many requests in a sliding window, the request is evaluated as a +bot request. This method requires a redis DB and needs a HTTP X-Forwarded-For_ +header. To take privacy only the hash value of an IP is stored in the redis DB +and at least for a maximum of 10 minutes. 
+ +The :py:obj:`link_token` method is used to investigate whether a request is +*suspicious*. If the :py:obj:`link_token` method is activated and a request is +*suspicious* the request rates are reduced: + +- :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS` +- :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS` + +.. _X-Forwarded-For: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For + +""" + +from typing import Optional, Tuple +import flask + +from searx import redisdb +from searx import logger +from searx.redislib import incr_sliding_window + +from . import link_token + +logger = logger.getChild('botdetection.ip_limit') + +BURST_WINDOW = 20 +"""Time (sec) before sliding window for *burst* requests expires.""" + +BURST_MAX = 15 +"""Maximum requests from one IP in the :py:obj:`BURST_WINDOW`""" + +BURST_MAX_SUSPICIOUS = 2 +"""Maximum of suspicious requests from one IP in the :py:obj:`BURST_WINDOW`""" + +LONG_WINDOW = 600 +"""Time (sec) before the longer sliding window expires.""" + +LONG_MAX = 150 +"""Maximum requests from one IP in the :py:obj:`LONG_WINDOW`""" + +LONG_MAX_SUSPICIOUS = 10 +"""Maximum suspicious requests from one IP in the :py:obj:`LONG_WINDOW`""" + +API_WONDOW = 3600 +"""Time (sec) before sliding window for API requests (format != html) expires.""" + +API_MAX = 4 +"""Maximum requests from one IP in the :py:obj:`API_WONDOW`""" + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + redis_client = redisdb.client() + + x_forwarded_for = request.headers.get('X-Forwarded-For', '') + if not x_forwarded_for: + logger.error("missing HTTP header X-Forwarded-For") + + if request.args.get('format', 'html') != 'html': + c = incr_sliding_window(redis_client, 'IP limit - API_WONDOW:' + x_forwarded_for, API_WONDOW) + if c > API_MAX: + return 429, "BLOCK %s: API limit exceeded" + + suspicious = link_token.is_suspicious(request) + + if suspicious: + c = incr_sliding_window(redis_client, 'IP limit - 
BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) + if c > BURST_MAX_SUSPICIOUS: + return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX_SUSPICIOUS" + + c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW) + if c > LONG_MAX_SUSPICIOUS: + return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX_SUSPICIOUS" + + else: + c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) + if c > BURST_MAX: + return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX" + + c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW) + if c > LONG_MAX: + return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX" + return None diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py new file mode 100644 index 000000000..71044c312 --- /dev/null +++ b/searx/botdetection/limiter.py @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""".. _limiter src: + +Limiter +======= + +.. sidebar:: info + + The limiter requires a :ref:`Redis ` database. + +Bot protection / IP rate limitation. The intention of rate limitation is to +limit suspicious requests from an IP. The motivation behind this is the fact +that SearXNG passes through requests from bots and is thus classified as a bot +itself. As a result, the SearXNG engine then receives a CAPTCHA or is blocked +by the search engine (the origin) in some other way. + +To avoid blocking, the requests from bots to SearXNG must also be blocked, this +is the task of the limiter. To perform this task, the limiter uses the methods +from the :py:obj:`searx.botdetection`. + +To enable the limiter activate: + +.. code:: yaml + + server: + ... + limiter: true # rate limit the number of request on the instance, block some bots + +and set the redis-url connection. 
Check the value, it depends on your redis DB +(see :ref:`settings redis`), by example: + +.. code:: yaml + + redis: + url: unix:///usr/local/searxng-redis/run/redis.sock?db=0 + +""" + +from typing import Optional, Tuple +import flask + +from searx.botdetection import ( + http_accept, + http_accept_encoding, + http_accept_language, + http_connection, + http_user_agent, + ip_limit, +) + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + + if request.path == '/healthz': + return None + + for func in [ + http_user_agent, + ]: + val = func.filter_request(request) + if val is not None: + return val + + if request.path == '/search': + + for func in [ + http_accept, + http_accept_encoding, + http_accept_language, + http_connection, + http_user_agent, + ip_limit, + ]: + val = func.filter_request(request) + if val is not None: + return val + + return None diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py new file mode 100644 index 000000000..8ef215f6c --- /dev/null +++ b/searx/botdetection/link_token.py @@ -0,0 +1,126 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``link_token`` +--------------------- + +The ``link_token`` method evaluates a request as :py:obj:`suspicious +` if the URL ``/client.css`` is not requested by the +client. By adding a random component (the token) in the URL a bot can not send +a ping by request a static URL. + +.. note:: + + This method requires a redis DB and needs a HTTP X-Forwarded-For_ header. + +To get in use of this method a flask URL route needs to be added: + +.. code:: python + + @app.route('/client.css', methods=['GET', 'POST']) + def client_token(token=None): + link_token.ping(request, token) + return Response('', mimetype='text/css') + +And in the HTML template from flask a stylesheet link is needed (the value of +``link_token`` comes from :py:obj:`get_token`): + +.. code:: html + + + +.. 
_X-Forwarded-For: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For + +""" + +import string +import random +import flask + +from searx import logger +from searx import redisdb +from searx.redislib import secret_hash + +TOKEN_LIVE_TIME = 600 +"""Livetime (sec) of limiter's CSS token.""" + +PING_KEY = 'SearXNG_limiter.ping' +TOKEN_KEY = 'SearXNG_limiter.token' + +logger = logger.getChild('botdetection.link_token') + + +def is_suspicious(request: flask.Request): + """Checks if there is a valid ping for this request, if not this request is + rated as *suspicious*""" + redis_client = redisdb.client() + if not redis_client: + return False + + ping_key = get_ping_key(request) + if not redis_client.get(ping_key): + logger.warning( + "missing ping (IP: %s) / request: %s", + request.headers.get('X-Forwarded-For', ''), + ping_key, + ) + return True + + logger.debug("found ping for this request: %s", ping_key) + return False + + +def ping(request: flask.Request, token: str): + """This function is called by a request to URL ``/client.css``""" + redis_client = redisdb.client() + if not redis_client: + return + if not token_is_valid(token): + return + ping_key = get_ping_key(request) + logger.debug("store ping for: %s", ping_key) + redis_client.set(ping_key, 1, ex=TOKEN_LIVE_TIME) + + +def get_ping_key(request: flask.Request): + """Generates a hashed key that fits (more or less) to a request. At least + X-Forwarded-For_ is needed to be able to assign the request to an IP. + + """ + return secret_hash( + PING_KEY + + request.headers.get('X-Forwarded-For', '') + + request.headers.get('Accept-Language', '') + + request.headers.get('User-Agent', '') + ) + + +def token_is_valid(token) -> bool: + valid = token == get_token() + logger.debug("token is valid --> %s", valid) + return valid + + +def get_token() -> str: + """Returns current token. If there is no currently active token a new token + is generated randomly and stored in the redis DB. 
+ + - :py:obj:`TOKEN_LIVE_TIME` + - :py:obj:`TOKEN_KEY` + + """ + redis_client = redisdb.client() + if not redis_client: + # This function is also called when limiter is inactive / no redis DB + # (see render function in webapp.py) + return '12345678' + token = redis_client.get(TOKEN_KEY) + if token: + token = token.decode('UTF-8') + else: + token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16)) + redis_client.set(TOKEN_KEY, token, ex=TOKEN_LIVE_TIME) + return token -- cgit v1.2.3 From 66fdec0eb92bf11c0bc477d6fb1df3dc783e4dcb Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Fri, 26 May 2023 17:24:43 +0200 Subject: [mod] limiter: add config file /etc/searxng/limiter.toml Signed-off-by: Markus Heiser --- searx/botdetection/http_accept.py | 5 +++- searx/botdetection/http_accept_encoding.py | 5 +++- searx/botdetection/http_accept_language.py | 6 +++-- searx/botdetection/http_connection.py | 6 +++-- searx/botdetection/http_user_agent.py | 6 ++++- searx/botdetection/ip_limit.py | 11 +++++--- searx/botdetection/limiter.py | 43 ++++++++++++++++++++++++++++-- searx/botdetection/limiter.toml | 3 +++ 8 files changed, 73 insertions(+), 12 deletions(-) create mode 100644 searx/botdetection/limiter.toml (limited to 'searx/botdetection') diff --git a/searx/botdetection/http_accept.py b/searx/botdetection/http_accept.py index 1ab7cb4c1..23670a283 100644 --- a/searx/botdetection/http_accept.py +++ b/searx/botdetection/http_accept.py @@ -13,12 +13,15 @@ Accept_ header .. 
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept """ +# pylint: disable=unused-argument from typing import Optional, Tuple import flask +from searx.tools import config -def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: if 'text/html' not in request.accept_mimetypes: return 429, "bot detected, HTTP header Accept did not contain text/html" return None diff --git a/searx/botdetection/http_accept_encoding.py b/searx/botdetection/http_accept_encoding.py index ae630fd68..191249711 100644 --- a/searx/botdetection/http_accept_encoding.py +++ b/searx/botdetection/http_accept_encoding.py @@ -14,12 +14,15 @@ bot if the Accept-Encoding_ header .. https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding """ +# pylint: disable=unused-argument from typing import Optional, Tuple import flask +from searx.tools import config -def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] if not ('gzip' in accept_list or 'deflate' in accept_list): return 429, "bot detected, HTTP header Accept-Encoding did not contain gzip nor deflate" diff --git a/searx/botdetection/http_accept_language.py b/searx/botdetection/http_accept_language.py index 06743802e..558a216cf 100644 --- a/searx/botdetection/http_accept_language.py +++ b/searx/botdetection/http_accept_language.py @@ -11,13 +11,15 @@ if the Accept-Language_ header is unset. 
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent """ - +# pylint: disable=unused-argument from typing import Optional, Tuple import flask +from searx.tools import config + -def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: if request.headers.get('Accept-Language', '').strip() == '': return 429, "bot detected, missing HTTP header Accept-Language" return None diff --git a/searx/botdetection/http_connection.py b/searx/botdetection/http_connection.py index f61f5e48c..0ef24a7b8 100644 --- a/searx/botdetection/http_connection.py +++ b/searx/botdetection/http_connection.py @@ -11,13 +11,15 @@ the Connection_ header is set to ``close``. https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection """ - +# pylint: disable=unused-argument from typing import Optional, Tuple import flask +from searx.tools import config + -def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: if request.headers.get('Connection', '').strip() == 'close': return 429, "bot detected, HTTP header 'Connection=close'" return None diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py index 892ae0bd9..3d1ec9173 100644 --- a/searx/botdetection/http_user_agent.py +++ b/searx/botdetection/http_user_agent.py @@ -12,11 +12,15 @@ the User-Agent_ header is unset or matches the regular expression https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent """ +# pylint: disable=unused-argument from typing import Optional, Tuple import re import flask +from searx.tools import config + + USER_AGENT = ( r'(' + r'unknown' @@ -44,7 +48,7 @@ def regexp_user_agent(): return _regexp -def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> 
Optional[Tuple[int, str]]: user_agent = request.headers.get('User-Agent', 'unknown') if regexp_user_agent().match(user_agent): return ( diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index fce3f8b67..2646920c2 100644 --- a/searx/botdetection/ip_limit.py +++ b/searx/botdetection/ip_limit.py @@ -1,4 +1,5 @@ -""" +""".. _botdetection.ip_limit: + Method ``ip_limit`` ------------------- @@ -22,6 +23,8 @@ The :py:obj:`link_token` method is used to investigate whether a request is from typing import Optional, Tuple import flask +from searx.tools import config + from searx import redisdb from searx import logger @@ -56,7 +59,7 @@ API_MAX = 4 """Maximum requests from one IP in the :py:obj:`API_WONDOW`""" -def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: redis_client = redisdb.client() x_forwarded_for = request.headers.get('X-Forwarded-For', '') @@ -68,7 +71,9 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: if c > API_MAX: return 429, "BLOCK %s: API limit exceeded" - suspicious = link_token.is_suspicious(request) + suspicious = False + if cfg['botdetection.ip_limit.link_token']: + suspicious = link_token.is_suspicious(request) if suspicious: c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py index 71044c312..cc1e00b3c 100644 --- a/searx/botdetection/limiter.py +++ b/searx/botdetection/limiter.py @@ -38,8 +38,11 @@ and set the redis-url connection. 
Check the value, it depends on your redis DB """ from typing import Optional, Tuple +from pathlib import Path import flask +import pytomlpp as toml +from searx.tools import config from searx.botdetection import ( http_accept, http_accept_encoding, @@ -49,6 +52,42 @@ from searx.botdetection import ( ip_limit, ) +LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml" +"""Base configuration (schema) of the botdetection.""" + +LIMITER_CFG = Path('/etc/searxng/limiter.toml') +"""Lokal Limiter configuration.""" + +CFG_DEPRECATED = { + # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config." +} + +CFG = config.Config({}, {}) + + +def init_cfg(log): + global CFG # pylint: disable=global-statement + CFG = config.Config(cfg_schema=toml.load(LIMITER_CFG_SCHEMA), deprecated=CFG_DEPRECATED) + + if not LIMITER_CFG.exists(): + log.warning("missing config file: %s", LIMITER_CFG) + return + + log.warning("load config file: %s", LIMITER_CFG) + try: + upd_cfg = toml.load(LIMITER_CFG) + except toml.DecodeError as exc: + msg = str(exc).replace('\t', '').replace('\n', ' ') + log.error("%s: %s", LIMITER_CFG, msg) + raise + + is_valid, issue_list = CFG.validate(upd_cfg) + for msg in issue_list: + log.error(str(msg)) + if not is_valid: + raise TypeError(f"schema of {LIMITER_CFG} is invalid, can't cutomize limiter configuration from!") + CFG.update(upd_cfg) + def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: @@ -58,7 +97,7 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: for func in [ http_user_agent, ]: - val = func.filter_request(request) + val = func.filter_request(request, CFG) if val is not None: return val @@ -72,7 +111,7 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: http_user_agent, ip_limit, ]: - val = func.filter_request(request) + val = func.filter_request(request, CFG) if val is not None: return val diff --git a/searx/botdetection/limiter.toml 
b/searx/botdetection/limiter.toml new file mode 100644 index 000000000..30cd1b53c --- /dev/null +++ b/searx/botdetection/limiter.toml @@ -0,0 +1,3 @@ +[botdetection.ip_limit] + +link_token = true \ No newline at end of file -- cgit v1.2.3 From 9d7456fd6c49fbd96f03f6a5dedd6ba05e924d0a Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sat, 27 May 2023 18:58:06 +0200 Subject: [fix] limiter.toml: botdetection.ip_limit turn off link_token by default To activate the ``link_token`` method in the ``ip_limit`` method add the following to your ``/etc/searxng/limiter.toml``:: [botdetection.ip_limit] link_token = true Related: https://github.com/searxng/searxng/pull/2357#issuecomment-1554116941 Signed-off-by: Markus Heiser --- searx/botdetection/ip_limit.py | 15 ++++++++++++--- searx/botdetection/limiter.toml | 2 +- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'searx/botdetection') diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index 2646920c2..e72015190 100644 --- a/searx/botdetection/ip_limit.py +++ b/searx/botdetection/ip_limit.py @@ -9,9 +9,18 @@ bot request. This method requires a redis DB and needs a HTTP X-Forwarded-For_ header. To take privacy only the hash value of an IP is stored in the redis DB and at least for a maximum of 10 minutes. -The :py:obj:`link_token` method is used to investigate whether a request is -*suspicious*. If the :py:obj:`link_token` method is activated and a request is -*suspicious* the request rates are reduced: +The :py:obj:`.link_token` method can be used to investigate whether a request is +*suspicious*. To activate the :py:obj:`.link_token` method in the +:py:obj:`.ip_limit` method add the following to your +``/etc/searxng/limiter.toml``: + +.. 
code:: toml + + [botdetection.ip_limit] + link_token = true + +If the :py:obj:`.link_token` method is activated and a request is *suspicious* +the request rates are reduced: - :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS` - :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS` diff --git a/searx/botdetection/limiter.toml b/searx/botdetection/limiter.toml index 30cd1b53c..28c4e7589 100644 --- a/searx/botdetection/limiter.toml +++ b/searx/botdetection/limiter.toml @@ -1,3 +1,3 @@ [botdetection.ip_limit] -link_token = true \ No newline at end of file +link_token = false \ No newline at end of file -- cgit v1.2.3 From 52f1452c09ab2ec74aa5898d9ea749f33a71a814 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sat, 27 May 2023 21:36:34 +0200 Subject: [mod] limiter: ip_limt - monitore suspicious IPs To intercept bots that get their IPs from a range of IPs, there is a ``SUSPICIOUS_IP_WINDOW``. In this window the suspicious IPs are stored for a longer time. IPs stored in this sliding window have a maximum of ``SUSPICIOUS_IP_MAX`` accesses before they are blocked. As soon as the IP makes a request that is not suspicious, the sliding window for this IP is droped. Signed-off-by: Markus Heiser --- searx/botdetection/ip_limit.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'searx/botdetection') diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index e72015190..9cffff7f0 100644 --- a/searx/botdetection/ip_limit.py +++ b/searx/botdetection/ip_limit.py @@ -25,6 +25,13 @@ the request rates are reduced: - :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS` - :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS` +To intercept bots that get their IPs from a range of IPs, there is a +:py:obj:`SUSPICIOUS_IP_WINDOW`. In this window the suspicious IPs are stored +for a longer time. IPs stored in this sliding window have a maximum of +:py:obj:`SUSPICIOUS_IP_MAX` accesses before they are blocked. 
As soon as the IP +makes a request that is not suspicious, the sliding window for this IP is +droped. + .. _X-Forwarded-For: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For @@ -37,7 +44,7 @@ from searx.tools import config from searx import redisdb from searx import logger -from searx.redislib import incr_sliding_window +from searx.redislib import incr_sliding_window, drop_counter from . import link_token @@ -67,6 +74,12 @@ API_WONDOW = 3600 API_MAX = 4 """Maximum requests from one IP in the :py:obj:`API_WONDOW`""" +SUSPICIOUS_IP_WINDOW = 3600 * 24 +"""Time (sec) before sliding window for one suspicious IP expires.""" + +SUSPICIOUS_IP_MAX = 3 +"""Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`.""" + def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: redis_client = redisdb.client() @@ -81,10 +94,18 @@ def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple return 429, "BLOCK %s: API limit exceeded" suspicious = False + suspicious_ip_counter = 'IP limit - SUSPICIOUS_IP_WINDOW:' + x_forwarded_for + if cfg['botdetection.ip_limit.link_token']: suspicious = link_token.is_suspicious(request) if suspicious: + + # this IP is suspicious: count requests from this IP + c = incr_sliding_window(redis_client, suspicious_ip_counter, SUSPICIOUS_IP_WINDOW) + if c > SUSPICIOUS_IP_MAX: + return 429, f"bot detected, too many request from {x_forwarded_for} in SUSPICIOUS_IP_WINDOW" + c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) if c > BURST_MAX_SUSPICIOUS: return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX_SUSPICIOUS" @@ -94,6 +115,11 @@ def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX_SUSPICIOUS" else: + + if cfg['botdetection.ip_limit.link_token']: + # this IP is no longer 
suspicious: release ip again / delete the counter of this IP + drop_counter(redis_client, suspicious_ip_counter) + c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) if c > BURST_MAX: return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX" -- cgit v1.2.3 From b8c7c2c9aa604fd1fb7be5559c9ad025ceb17aa4 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 28 May 2023 18:58:31 +0200 Subject: [mod] botdetection - improve ip_limit and link_token methods - counting requests in LONG_WINDOW and BURST_WINDOW is not needed when the request is validated by the link_token method [1] - renew a ping-key on validation [2], this is needed for infinite scrolling, where no new token (CSS) is loaded. / this does not fix the BURST_MAX issue in the vanilla limiter - normalize the counter names of the ip_limit method to 'ip_limit.*' - just integrate the ip_limit method straight forward in the limiter plugin / non intermediate code --> ip_limit now returns None or a werkzeug.Response object that can be passed by the plugin to the flask application / non intermediate code that returns a tuple [1] https://github.com/searxng/searxng/pull/2357#issuecomment-1566113277 [2] https://github.com/searxng/searxng/pull/2357#discussion_r1208542206 [3] https://github.com/searxng/searxng/pull/2357#issuecomment-1566125979 Signed-off-by: Markus Heiser --- searx/botdetection/__init__.py | 16 +---- searx/botdetection/_helpers.py | 93 ++++++++++++++++++++++++++++++ searx/botdetection/http_accept.py | 8 ++- searx/botdetection/http_accept_encoding.py | 8 ++- searx/botdetection/http_accept_language.py | 8 ++- searx/botdetection/http_connection.py | 8 ++- searx/botdetection/http_user_agent.py | 11 ++-- searx/botdetection/ip_limit.py | 61 +++++++++++--------- searx/botdetection/limiter.py | 11 +++- searx/botdetection/link_token.py | 43 ++++++++++---- 10 files changed, 193 insertions(+), 74 deletions(-) create mode 100644 
searx/botdetection/_helpers.py (limited to 'searx/botdetection') diff --git a/searx/botdetection/__init__.py b/searx/botdetection/__init__.py index 78a7d30f3..b4de0f9c8 100644 --- a/searx/botdetection/__init__.py +++ b/searx/botdetection/__init__.py @@ -9,18 +9,4 @@ The methods implemented in this python package are use by the :ref:`limiter src` """ -import flask - - -def dump_request(request: flask.Request): - return ( - "%s: '%s'" % (request.headers.get('X-Forwarded-For'), request.path) - + " || form: %s" % request.form - + " || Accept: %s" % request.headers.get('Accept') - + " || Accept-Language: %s" % request.headers.get('Accept-Language') - + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding') - + " || Content-Type: %s" % request.headers.get('Content-Type') - + " || Content-Length: %s" % request.headers.get('Content-Length') - + " || Connection: %s" % request.headers.get('Connection') - + " || User-Agent: %s" % request.headers.get('User-Agent') - ) +from ._helpers import dump_request diff --git a/searx/botdetection/_helpers.py b/searx/botdetection/_helpers.py new file mode 100644 index 000000000..b034b980b --- /dev/null +++ b/searx/botdetection/_helpers.py @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=missing-module-docstring, invalid-name + +from typing import Optional +import flask +import werkzeug + +from searx import logger + +logger = logger.getChild('botdetection') + + +def dump_request(request: flask.Request): + return ( + "%s: %s" % (get_real_ip(request), request.path) + + " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For') + + " || X-Real-IP: %s" % request.headers.get('X-Real-IP') + + " || form: %s" % request.form + + " || Accept: %s" % request.headers.get('Accept') + + " || Accept-Language: %s" % request.headers.get('Accept-Language') + + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding') + + " || Content-Type: %s" % request.headers.get('Content-Type') + 
+ " || Content-Length: %s" % request.headers.get('Content-Length') + + " || Connection: %s" % request.headers.get('Connection') + + " || User-Agent: %s" % request.headers.get('User-Agent') + ) + + +def too_many_requests(request: flask.Request, log_msg: str) -> Optional[werkzeug.Response]: + log_prefix = 'BLOCK %s: ' % get_real_ip(request) + logger.debug(log_prefix + log_msg) + return flask.make_response(('Too Many Requests', 429)) + + +def get_real_ip(request: flask.Request) -> str: + """Returns real IP of the request. Since not all proxies set all the HTTP + headers and incoming headers can be faked it may happen that the IP cannot + be determined correctly. + + .. sidebar:: :py:obj:`flask.Request.remote_addr` + + SearXNG uses Werkzeug's ProxyFix_ (with it default ``x_for=1``). + + This function tries to get the remote IP in the order listed below, + additional some tests are done and if inconsistencies or errors are + detected, they are logged. + + The remote IP of the request is taken from (first match): + + - X-Forwarded-For_ header + - `X-real-IP header `__ + - :py:obj:`flask.Request.remote_addr` + + .. _ProxyFix: + https://werkzeug.palletsprojects.com/middleware/proxy_fix/ + + .. 
_X-Forwarded-For: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For + + """ + + forwarded_for = request.headers.get("X-Forwarded-For") + real_ip = request.headers.get('X-Real-IP') + remote_addr = request.remote_addr + logger.debug("X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr) + + if not forwarded_for: + logger.error("X-Forwarded-For header is not set!") + else: + from .limiter import get_cfg # pylint: disable=import-outside-toplevel, cyclic-import + + forwarded_for = [x.strip() for x in forwarded_for.split(',')] + x_for: int = get_cfg()['real_ip.x_for'] + forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)] + + if not real_ip: + logger.error("X-Real-IP header is not set!") + + if forwarded_for and real_ip and forwarded_for != real_ip: + logger.warning("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", real_ip, forwarded_for) + + if forwarded_for and remote_addr and forwarded_for != remote_addr: + logger.warning( + "IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", remote_addr, forwarded_for + ) + + if real_ip and remote_addr and real_ip != remote_addr: + logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip) + + request_ip = forwarded_for or real_ip or remote_addr or '0.0.0.0' + logger.debug("get_real_ip() -> %s", request_ip) + return request_ip diff --git a/searx/botdetection/http_accept.py b/searx/botdetection/http_accept.py index 23670a283..60e2330ae 100644 --- a/searx/botdetection/http_accept.py +++ b/searx/botdetection/http_accept.py @@ -15,13 +15,15 @@ Accept_ header .. 
""" # pylint: disable=unused-argument -from typing import Optional, Tuple +from typing import Optional import flask +import werkzeug from searx.tools import config +from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: if 'text/html' not in request.accept_mimetypes: - return 429, "bot detected, HTTP header Accept did not contain text/html" + return too_many_requests(request, "HTTP header Accept did not contain text/html") return None diff --git a/searx/botdetection/http_accept_encoding.py b/searx/botdetection/http_accept_encoding.py index 191249711..5301c5d9d 100644 --- a/searx/botdetection/http_accept_encoding.py +++ b/searx/botdetection/http_accept_encoding.py @@ -16,14 +16,16 @@ bot if the Accept-Encoding_ header .. """ # pylint: disable=unused-argument -from typing import Optional, Tuple +from typing import Optional import flask +import werkzeug from searx.tools import config +from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] if not ('gzip' in accept_list or 'deflate' in accept_list): - return 429, "bot detected, HTTP header Accept-Encoding did not contain gzip nor deflate" + return too_many_requests(request, "HTTP header Accept-Encoding did not contain gzip nor deflate") return None diff --git a/searx/botdetection/http_accept_language.py b/searx/botdetection/http_accept_language.py index 558a216cf..060f67ec0 100644 --- a/searx/botdetection/http_accept_language.py +++ b/searx/botdetection/http_accept_language.py @@ -13,13 +13,15 @@ if the Accept-Language_ header is unset. 
""" # pylint: disable=unused-argument -from typing import Optional, Tuple +from typing import Optional import flask +import werkzeug from searx.tools import config +from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: if request.headers.get('Accept-Language', '').strip() == '': - return 429, "bot detected, missing HTTP header Accept-Language" + return too_many_requests(request, "missing HTTP header Accept-Language") return None diff --git a/searx/botdetection/http_connection.py b/searx/botdetection/http_connection.py index 0ef24a7b8..e718dfe3f 100644 --- a/searx/botdetection/http_connection.py +++ b/searx/botdetection/http_connection.py @@ -13,13 +13,15 @@ the Connection_ header is set to ``close``. """ # pylint: disable=unused-argument -from typing import Optional, Tuple +from typing import Optional import flask +import werkzeug from searx.tools import config +from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: if request.headers.get('Connection', '').strip() == 'close': - return 429, "bot detected, HTTP header 'Connection=close'" + return too_many_requests(request, "HTTP header 'Connection=close") return None diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py index 3d1ec9173..70309e975 100644 --- a/searx/botdetection/http_user_agent.py +++ b/searx/botdetection/http_user_agent.py @@ -14,11 +14,13 @@ the User-Agent_ header is unset or matches the regular expression """ # pylint: disable=unused-argument -from typing import Optional, Tuple +from typing import Optional import re import flask +import werkzeug from searx.tools import config +from ._helpers import too_many_requests 
USER_AGENT = ( @@ -48,11 +50,8 @@ def regexp_user_agent(): return _regexp -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: user_agent = request.headers.get('User-Agent', 'unknown') if regexp_user_agent().match(user_agent): - return ( - 429, - f"bot detected, HTTP header User-Agent: {user_agent}", - ) + return too_many_requests(request, f"bot detected, HTTP header User-Agent: {user_agent}") return None diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index 9cffff7f0..e7fa57187 100644 --- a/searx/botdetection/ip_limit.py +++ b/searx/botdetection/ip_limit.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint """.. _botdetection.ip_limit: Method ``ip_limit`` @@ -37,16 +39,18 @@ droped. """ -from typing import Optional, Tuple +from typing import Optional import flask +import werkzeug from searx.tools import config - from searx import redisdb from searx import logger from searx.redislib import incr_sliding_window, drop_counter from . 
import link_token +from ._helpers import too_many_requests + logger = logger.getChild('botdetection.ip_limit') @@ -81,50 +85,51 @@ SUSPICIOUS_IP_MAX = 3 """Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`.""" -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: + # pylint: disable=too-many-return-statements redis_client = redisdb.client() - x_forwarded_for = request.headers.get('X-Forwarded-For', '') - if not x_forwarded_for: + client_ip = request.headers.get('X-Forwarded-For', '') + if not client_ip: logger.error("missing HTTP header X-Forwarded-For") if request.args.get('format', 'html') != 'html': - c = incr_sliding_window(redis_client, 'IP limit - API_WONDOW:' + x_forwarded_for, API_WONDOW) + c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + client_ip, API_WONDOW) if c > API_MAX: - return 429, "BLOCK %s: API limit exceeded" - - suspicious = False - suspicious_ip_counter = 'IP limit - SUSPICIOUS_IP_WINDOW:' + x_forwarded_for + return too_many_requests(request, "too many request in API_WINDOW") if cfg['botdetection.ip_limit.link_token']: - suspicious = link_token.is_suspicious(request) - if suspicious: + suspicious = link_token.is_suspicious(request, True) + + if not suspicious: + # this IP is no longer suspicious: release ip again / delete the counter of this IP + drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip) + return None # this IP is suspicious: count requests from this IP - c = incr_sliding_window(redis_client, suspicious_ip_counter, SUSPICIOUS_IP_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip, SUSPICIOUS_IP_WINDOW) if c > SUSPICIOUS_IP_MAX: - return 429, f"bot detected, too many request from {x_forwarded_for} in SUSPICIOUS_IP_WINDOW" + logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW 
(redirect to /)", client_ip) + return flask.redirect(flask.url_for('index'), code=302) - c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW) if c > BURST_MAX_SUSPICIOUS: - return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX_SUSPICIOUS" + return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)") - c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, LONG_WINDOW) if c > LONG_MAX_SUSPICIOUS: - return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX_SUSPICIOUS" + return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)") - else: + return None - if cfg['botdetection.ip_limit.link_token']: - # this IP is no longer suspicious: release ip again / delete the counter of this IP - drop_counter(redis_client, suspicious_ip_counter) + # vanilla limiter without extensions counts BURST_MAX and LONG_MAX + c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW) + if c > BURST_MAX: + return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX)") - c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) - if c > BURST_MAX: - return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX" + c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, LONG_WINDOW) + if c > LONG_MAX: + return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX)") - c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW) - if c > LONG_MAX: - return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX" return None diff --git 
a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py index cc1e00b3c..93826684f 100644 --- a/searx/botdetection/limiter.py +++ b/searx/botdetection/limiter.py @@ -42,6 +42,7 @@ from pathlib import Path import flask import pytomlpp as toml +from searx import logger from searx.tools import config from searx.botdetection import ( http_accept, @@ -62,7 +63,13 @@ CFG_DEPRECATED = { # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config." } -CFG = config.Config({}, {}) +CFG = None + + +def get_cfg() -> config.Config: + if CFG is None: + init_cfg(logger) + return CFG def init_cfg(log): @@ -73,7 +80,7 @@ def init_cfg(log): log.warning("missing config file: %s", LIMITER_CFG) return - log.warning("load config file: %s", LIMITER_CFG) + log.info("load config file: %s", LIMITER_CFG) try: upd_cfg = toml.load(LIMITER_CFG) except toml.DecodeError as exc: diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py index 8ef215f6c..376d06d61 100644 --- a/searx/botdetection/link_token.py +++ b/searx/botdetection/link_token.py @@ -47,15 +47,24 @@ from searx.redislib import secret_hash TOKEN_LIVE_TIME = 600 """Livetime (sec) of limiter's CSS token.""" +PING_LIVE_TIME = 3600 +"""Livetime (sec) of the ping-key from a client (request)""" + PING_KEY = 'SearXNG_limiter.ping' +"""Prefix of all ping-keys generated by :py:obj:`get_ping_key`""" + TOKEN_KEY = 'SearXNG_limiter.token' +"""Key for which the current token is stored in the DB""" logger = logger.getChild('botdetection.link_token') -def is_suspicious(request: flask.Request): +def is_suspicious(request: flask.Request, renew: bool = False): """Checks if there is a valid ping for this request, if not this request is - rated as *suspicious*""" + rated as *suspicious*. If a valid ping exists and argument ``renew`` is + ``True`` the expire time of this ping is reset to :py:obj:`PING_LIVE_TIME`. 
+ + """ redis_client = redisdb.client() if not redis_client: return False @@ -69,12 +78,19 @@ def is_suspicious(request: flask.Request): ) return True - logger.debug("found ping for this request: %s", ping_key) + if renew: + redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) + + logger.debug("found ping for client request: %s", ping_key) return False def ping(request: flask.Request, token: str): - """This function is called by a request to URL ``/client.css``""" + """This function is called by a request to URL ``/client.css``. If + ``token`` is valid a :py:obj:`PING_KEY` for the client is stored in the DB. + The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`. + + """ redis_client = redisdb.client() if not redis_client: return @@ -82,19 +98,24 @@ def ping(request: flask.Request, token: str): return ping_key = get_ping_key(request) logger.debug("store ping for: %s", ping_key) - redis_client.set(ping_key, 1, ex=TOKEN_LIVE_TIME) + redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) def get_ping_key(request: flask.Request): - """Generates a hashed key that fits (more or less) to a request. At least - X-Forwarded-For_ is needed to be able to assign the request to an IP. + """Generates a hashed key that fits (more or less) to a client (request). + At least X-Forwarded-For_ is needed to be able to assign the request to an + IP. 
""" - return secret_hash( + return ( PING_KEY - + request.headers.get('X-Forwarded-For', '') - + request.headers.get('Accept-Language', '') - + request.headers.get('User-Agent', '') + + "[" + + secret_hash( + request.headers.get('X-Forwarded-For', '') + + request.headers.get('Accept-Language', '') + + request.headers.get('User-Agent', '') + ) + + "]" ) -- cgit v1.2.3 From 38431d2e142b7da6a9b48aad203f02a2eff7e6fd Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Mon, 29 May 2023 19:46:37 +0200 Subject: [fix] correct determination of the IP for the request For correct determination of the IP to the request the function botdetection.get_real_ip() is implemented. This fonction is used in the ip_limit and link_token method of the botdetection and it is used in the self_info plugin. A documentation about the X-Forwarded-For header has been added. [1] https://github.com/searxng/searxng/pull/2357#issuecomment-1566211059 Signed-off-by: Markus Heiser --- searx/botdetection/__init__.py | 20 +++++++++++++++++--- searx/botdetection/ip_limit.py | 6 ++---- searx/botdetection/limiter.toml | 7 ++++++- searx/botdetection/link_token.py | 7 +++---- 4 files changed, 28 insertions(+), 12 deletions(-) (limited to 'searx/botdetection') diff --git a/searx/botdetection/__init__.py b/searx/botdetection/__init__.py index b4de0f9c8..c903b0bb4 100644 --- a/searx/botdetection/__init__.py +++ b/searx/botdetection/__init__.py @@ -2,11 +2,25 @@ # lint: pylint """.. _botdetection src: -Bot detection methods ---------------------- +X-Forwarded-For +=============== -The methods implemented in this python package are use by the :ref:`limiter src`. +.. attention:: + + A correct setup of the HTTP request headers ``X-Forwarded-For`` and + ``X-Real-IP`` is essential to be able to assign a request to an IP correctly: + + - `NGINX RequestHeader`_ + - `Apache RequestHeader`_ + +.. _NGINX RequestHeader: + https://docs.searxng.org/admin/installation-nginx.html#nginx-s-searxng-site +.. 
_Apache RequestHeader: + https://docs.searxng.org/admin/installation-apache.html#apache-s-searxng-site + +.. autofunction:: searx.botdetection.get_real_ip """ from ._helpers import dump_request +from ._helpers import get_real_ip diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index e7fa57187..268285dd9 100644 --- a/searx/botdetection/ip_limit.py +++ b/searx/botdetection/ip_limit.py @@ -49,7 +49,7 @@ from searx import logger from searx.redislib import incr_sliding_window, drop_counter from . import link_token -from ._helpers import too_many_requests +from ._helpers import too_many_requests, get_real_ip logger = logger.getChild('botdetection.ip_limit') @@ -89,9 +89,7 @@ def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkz # pylint: disable=too-many-return-statements redis_client = redisdb.client() - client_ip = request.headers.get('X-Forwarded-For', '') - if not client_ip: - logger.error("missing HTTP header X-Forwarded-For") + client_ip = get_real_ip(request) if request.args.get('format', 'html') != 'html': c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + client_ip, API_WONDOW) diff --git a/searx/botdetection/limiter.toml b/searx/botdetection/limiter.toml index 28c4e7589..af797d32c 100644 --- a/searx/botdetection/limiter.toml +++ b/searx/botdetection/limiter.toml @@ -1,3 +1,8 @@ [botdetection.ip_limit] -link_token = false \ No newline at end of file +link_token = false + +[real_ip] + +# Number of values to trust for X-Forwarded-For. 
+x_for = 1 diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py index 376d06d61..a83214a33 100644 --- a/searx/botdetection/link_token.py +++ b/searx/botdetection/link_token.py @@ -43,6 +43,7 @@ import flask from searx import logger from searx import redisdb from searx.redislib import secret_hash +from ._helpers import get_real_ip TOKEN_LIVE_TIME = 600 """Livetime (sec) of limiter's CSS token.""" @@ -73,7 +74,7 @@ def is_suspicious(request: flask.Request, renew: bool = False): if not redis_client.get(ping_key): logger.warning( "missing ping (IP: %s) / request: %s", - request.headers.get('X-Forwarded-For', ''), + get_real_ip(request), ping_key, ) return True @@ -111,9 +112,7 @@ def get_ping_key(request: flask.Request): PING_KEY + "[" + secret_hash( - request.headers.get('X-Forwarded-For', '') - + request.headers.get('Accept-Language', '') - + request.headers.get('User-Agent', '') + get_real_ip(request) + request.headers.get('Accept-Language', '') + request.headers.get('User-Agent', '') ) + "]" ) -- cgit v1.2.3 From 281e36f4b7848374535d5e953050ae73423191ca Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Thu, 1 Jun 2023 15:41:48 +0200 Subject: [fix] limiter: replace real_ip by IPv4/v6 network Closes: https://github.com/searxng/searxng/issues/2477 Signed-off-by: Markus Heiser --- searx/botdetection/__init__.py | 1 + searx/botdetection/_helpers.py | 44 +++++++++++++++++---- searx/botdetection/http_accept.py | 16 ++++++-- searx/botdetection/http_accept_encoding.py | 16 ++++++-- searx/botdetection/http_accept_language.py | 14 +++++-- searx/botdetection/http_connection.py | 16 ++++++-- searx/botdetection/http_user_agent.py | 16 ++++++-- searx/botdetection/ip_limit.py | 49 +++++++++++++++--------- searx/botdetection/limiter.py | 61 +++++++++++++----------------- searx/botdetection/limiter.toml | 20 ++++++++-- searx/botdetection/link_token.py | 54 +++++++++++++++----------- 11 files changed, 207 insertions(+), 100 deletions(-) (limited to 
'searx/botdetection') diff --git a/searx/botdetection/__init__.py b/searx/botdetection/__init__.py index c903b0bb4..fcd8e5630 100644 --- a/searx/botdetection/__init__.py +++ b/searx/botdetection/__init__.py @@ -24,3 +24,4 @@ X-Forwarded-For from ._helpers import dump_request from ._helpers import get_real_ip +from ._helpers import too_many_requests diff --git a/searx/botdetection/_helpers.py b/searx/botdetection/_helpers.py index b034b980b..8e0156d6e 100644 --- a/searx/botdetection/_helpers.py +++ b/searx/botdetection/_helpers.py @@ -1,11 +1,19 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint # pylint: disable=missing-module-docstring, invalid-name - -from typing import Optional +from __future__ import annotations + +from ipaddress import ( + IPv4Network, + IPv6Network, + IPv6Address, + ip_address, + ip_network, +) import flask import werkzeug +from searx.tools import config from searx import logger logger = logger.getChild('botdetection') @@ -13,7 +21,7 @@ logger = logger.getChild('botdetection') def dump_request(request: flask.Request): return ( - "%s: %s" % (get_real_ip(request), request.path) + request.path + " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For') + " || X-Real-IP: %s" % request.headers.get('X-Real-IP') + " || form: %s" % request.form @@ -27,12 +35,30 @@ def dump_request(request: flask.Request): ) -def too_many_requests(request: flask.Request, log_msg: str) -> Optional[werkzeug.Response]: - log_prefix = 'BLOCK %s: ' % get_real_ip(request) - logger.debug(log_prefix + log_msg) +def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkzeug.Response | None: + """Returns a HTTP 429 response object and writes a ERROR message to the + 'botdetection' logger. This function is used in part by the filter methods + to return the default ``Too Many Requests`` response. 
+ + """ + + logger.debug("BLOCK %s: %s", network.compressed, log_msg) return flask.make_response(('Too Many Requests', 429)) +def get_network(real_ip: str, cfg: config.Config) -> IPv4Network | IPv6Network: + """Returns the (client) network of whether the real_ip is part of.""" + + ip = ip_address(real_ip) + if isinstance(ip, IPv6Address): + prefix = cfg['real_ip.ipv6_prefix'] + else: + prefix = cfg['real_ip.ipv4_prefix'] + network = ip_network(f"{real_ip}/{prefix}", strict=False) + # logger.debug("get_network(): %s", network.compressed) + return network + + def get_real_ip(request: flask.Request) -> str: """Returns real IP of the request. Since not all proxies set all the HTTP headers and incoming headers can be faked it may happen that the IP cannot @@ -63,7 +89,9 @@ def get_real_ip(request: flask.Request) -> str: forwarded_for = request.headers.get("X-Forwarded-For") real_ip = request.headers.get('X-Real-IP') remote_addr = request.remote_addr - logger.debug("X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr) + # logger.debug( + # "X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr + # ) if not forwarded_for: logger.error("X-Forwarded-For header is not set!") @@ -89,5 +117,5 @@ def get_real_ip(request: flask.Request) -> str: logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip) request_ip = forwarded_for or real_ip or remote_addr or '0.0.0.0' - logger.debug("get_real_ip() -> %s", request_ip) + # logger.debug("get_real_ip() -> %s", request_ip) return request_ip diff --git a/searx/botdetection/http_accept.py b/searx/botdetection/http_accept.py index 60e2330ae..b78a86278 100644 --- a/searx/botdetection/http_accept.py +++ b/searx/botdetection/http_accept.py @@ -15,7 +15,12 @@ Accept_ header .. 
""" # pylint: disable=unused-argument -from typing import Optional +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + import flask import werkzeug @@ -23,7 +28,12 @@ from searx.tools import config from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + if 'text/html' not in request.accept_mimetypes: - return too_many_requests(request, "HTTP header Accept did not contain text/html") + return too_many_requests(network, "HTTP header Accept did not contain text/html") return None diff --git a/searx/botdetection/http_accept_encoding.py b/searx/botdetection/http_accept_encoding.py index 5301c5d9d..60718a4ca 100644 --- a/searx/botdetection/http_accept_encoding.py +++ b/searx/botdetection/http_accept_encoding.py @@ -16,7 +16,12 @@ bot if the Accept-Encoding_ header .. 
""" # pylint: disable=unused-argument -from typing import Optional +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + import flask import werkzeug @@ -24,8 +29,13 @@ from searx.tools import config from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] if not ('gzip' in accept_list or 'deflate' in accept_list): - return too_many_requests(request, "HTTP header Accept-Encoding did not contain gzip nor deflate") + return too_many_requests(network, "HTTP header Accept-Encoding did not contain gzip nor deflate") return None diff --git a/searx/botdetection/http_accept_language.py b/searx/botdetection/http_accept_language.py index 060f67ec0..395d28bfd 100644 --- a/searx/botdetection/http_accept_language.py +++ b/searx/botdetection/http_accept_language.py @@ -12,8 +12,12 @@ if the Accept-Language_ header is unset. 
""" # pylint: disable=unused-argument +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) -from typing import Optional import flask import werkzeug @@ -21,7 +25,11 @@ from searx.tools import config from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: if request.headers.get('Accept-Language', '').strip() == '': - return too_many_requests(request, "missing HTTP header Accept-Language") + return too_many_requests(network, "missing HTTP header Accept-Language") return None diff --git a/searx/botdetection/http_connection.py b/searx/botdetection/http_connection.py index e718dfe3f..ee0d80a23 100644 --- a/searx/botdetection/http_connection.py +++ b/searx/botdetection/http_connection.py @@ -13,7 +13,12 @@ the Connection_ header is set to ``close``. 
""" # pylint: disable=unused-argument -from typing import Optional +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + import flask import werkzeug @@ -21,7 +26,12 @@ from searx.tools import config from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + if request.headers.get('Connection', '').strip() == 'close': - return too_many_requests(request, "HTTP header 'Connection=close") + return too_many_requests(network, "HTTP header 'Connection=close") return None diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py index 70309e975..17025f68b 100644 --- a/searx/botdetection/http_user_agent.py +++ b/searx/botdetection/http_user_agent.py @@ -14,8 +14,13 @@ the User-Agent_ header is unset or matches the regular expression """ # pylint: disable=unused-argument -from typing import Optional +from __future__ import annotations import re +from ipaddress import ( + IPv4Network, + IPv6Network, +) + import flask import werkzeug @@ -50,8 +55,13 @@ def regexp_user_agent(): return _regexp -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + user_agent = request.headers.get('User-Agent', 'unknown') if regexp_user_agent().match(user_agent): - return too_many_requests(request, f"bot detected, HTTP header User-Agent: {user_agent}") + return too_many_requests(network, f"bot detected, HTTP header User-Agent: {user_agent}") return None diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index 268285dd9..46e026371 100644 --- a/searx/botdetection/ip_limit.py +++ 
b/searx/botdetection/ip_limit.py @@ -38,8 +38,12 @@ droped. https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For """ +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) -from typing import Optional import flask import werkzeug from searx.tools import config @@ -49,7 +53,7 @@ from searx import logger from searx.redislib import incr_sliding_window, drop_counter from . import link_token -from ._helpers import too_many_requests, get_real_ip +from ._helpers import too_many_requests logger = logger.getChild('botdetection.ip_limit') @@ -85,49 +89,58 @@ SUSPICIOUS_IP_MAX = 3 """Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`.""" -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + # pylint: disable=too-many-return-statements redis_client = redisdb.client() - client_ip = get_real_ip(request) + if network.is_link_local and not cfg['botdetection.ip_limit.filter_link_local']: + logger.debug("network %s is link-local -> not monitored by ip_limit method", network.compressed) + return None if request.args.get('format', 'html') != 'html': - c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + client_ip, API_WONDOW) + c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + network.compressed, API_WONDOW) if c > API_MAX: - return too_many_requests(request, "too many request in API_WINDOW") + return too_many_requests(network, "too many request in API_WINDOW") if cfg['botdetection.ip_limit.link_token']: - suspicious = link_token.is_suspicious(request, True) + suspicious = link_token.is_suspicious(network, request, True) if not suspicious: # this IP is no longer suspicious: release ip again / delete the counter of this IP - drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip) + 
drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed) return None # this IP is suspicious: count requests from this IP - c = incr_sliding_window(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip, SUSPICIOUS_IP_WINDOW) + c = incr_sliding_window( + redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed, SUSPICIOUS_IP_WINDOW + ) if c > SUSPICIOUS_IP_MAX: - logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", client_ip) + logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", network) return flask.redirect(flask.url_for('index'), code=302) - c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW) if c > BURST_MAX_SUSPICIOUS: - return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)") + return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)") - c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, LONG_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW) if c > LONG_MAX_SUSPICIOUS: - return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)") + return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)") return None # vanilla limiter without extensions counts BURST_MAX and LONG_MAX - c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW) if c > BURST_MAX: - return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX)") + return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX)") - c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, 
LONG_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW) if c > LONG_MAX: - return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX)") + return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX)") return None diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py index 93826684f..18ffc8407 100644 --- a/searx/botdetection/limiter.py +++ b/searx/botdetection/limiter.py @@ -37,14 +37,16 @@ and set the redis-url connection. Check the value, it depends on your redis DB """ -from typing import Optional, Tuple +from __future__ import annotations + from pathlib import Path import flask -import pytomlpp as toml +import werkzeug -from searx import logger from searx.tools import config -from searx.botdetection import ( +from searx import logger + +from . import ( http_accept, http_accept_encoding, http_accept_language, @@ -53,6 +55,16 @@ from searx.botdetection import ( ip_limit, ) +from ._helpers import ( + get_network, + get_real_ip, + dump_request, +) + +logger = logger.getChild('botdetection.limiter') + +CFG: config.Config = None # type: ignore + LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml" """Base configuration (schema) of the botdetection.""" @@ -63,40 +75,21 @@ CFG_DEPRECATED = { # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config." 
} -CFG = None - def get_cfg() -> config.Config: + global CFG # pylint: disable=global-statement if CFG is None: - init_cfg(logger) + CFG = config.Config.from_toml(LIMITER_CFG_SCHEMA, LIMITER_CFG, CFG_DEPRECATED) return CFG -def init_cfg(log): - global CFG # pylint: disable=global-statement - CFG = config.Config(cfg_schema=toml.load(LIMITER_CFG_SCHEMA), deprecated=CFG_DEPRECATED) - - if not LIMITER_CFG.exists(): - log.warning("missing config file: %s", LIMITER_CFG) - return - - log.info("load config file: %s", LIMITER_CFG) - try: - upd_cfg = toml.load(LIMITER_CFG) - except toml.DecodeError as exc: - msg = str(exc).replace('\t', '').replace('\n', ' ') - log.error("%s: %s", LIMITER_CFG, msg) - raise +def filter_request(request: flask.Request) -> werkzeug.Response | None: - is_valid, issue_list = CFG.validate(upd_cfg) - for msg in issue_list: - log.error(str(msg)) - if not is_valid: - raise TypeError(f"schema of {LIMITER_CFG} is invalid, can't cutomize limiter configuration from!") - CFG.update(upd_cfg) - - -def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + cfg = get_cfg() + real_ip = get_real_ip(request) + network = get_network(real_ip, cfg) + if network.is_link_local: + return None if request.path == '/healthz': return None @@ -104,7 +97,7 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: for func in [ http_user_agent, ]: - val = func.filter_request(request, CFG) + val = func.filter_request(network, request, cfg) if val is not None: return val @@ -118,8 +111,8 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: http_user_agent, ip_limit, ]: - val = func.filter_request(request, CFG) + val = func.filter_request(network, request, cfg) if val is not None: return val - + logger.debug(f"OK {network}: %s", dump_request(flask.request)) return None diff --git a/searx/botdetection/limiter.toml b/searx/botdetection/limiter.toml index af797d32c..71a231e8f 100644 --- a/searx/botdetection/limiter.toml +++ 
b/searx/botdetection/limiter.toml @@ -1,8 +1,22 @@ +[real_ip] + +# Number of values to trust for X-Forwarded-For. + +x_for = 1 + +# The prefix defines the number of leading bits in an address that are compared +# to determine whether or not an address is part of a (client) network. + +ipv4_prefix = 32 +ipv6_prefix = 48 + [botdetection.ip_limit] +# To get unlimited access in a local network, by default link-local addresses +# (networks) are not monitored by the ip_limit +filter_link_local = false + +# activate link_token method in the ip_limit method link_token = false -[real_ip] -# Number of values to trust for X-Forwarded-For. -x_for = 1 diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py index a83214a33..11a6a56b5 100644 --- a/searx/botdetection/link_token.py +++ b/searx/botdetection/link_token.py @@ -6,7 +6,7 @@ Method ``link_token`` The ``link_token`` method evaluates a request as :py:obj:`suspicious ` if the URL ``/client.css`` is not requested by the -client. By adding a random component (the token) in the URL a bot can not send +client. By adding a random component (the token) in the URL, a bot can not send a ping by request a static URL. .. 
note:: @@ -35,6 +35,11 @@ And in the HTML template from flask a stylesheet link is needed (the value of https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For """ +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) import string import random @@ -43,7 +48,11 @@ import flask from searx import logger from searx import redisdb from searx.redislib import secret_hash -from ._helpers import get_real_ip + +from ._helpers import ( + get_network, + get_real_ip, +) TOKEN_LIVE_TIME = 600 """Livetime (sec) of limiter's CSS token.""" @@ -60,29 +69,26 @@ TOKEN_KEY = 'SearXNG_limiter.token' logger = logger.getChild('botdetection.link_token') -def is_suspicious(request: flask.Request, renew: bool = False): - """Checks if there is a valid ping for this request, if not this request is - rated as *suspicious*. If a valid ping exists and argument ``renew`` is - ``True`` the expire time of this ping is reset to :py:obj:`PING_LIVE_TIME`. +def is_suspicious(network: IPv4Network | IPv6Network, request: flask.Request, renew: bool = False): + """Checks whether a valid ping exists for this (client) network, if not + this request is rated as *suspicious*. If a valid ping exists and argument + ``renew`` is ``True`` the expire time of this ping is reset to + :py:obj:`PING_LIVE_TIME`. 
""" redis_client = redisdb.client() if not redis_client: return False - ping_key = get_ping_key(request) + ping_key = get_ping_key(network, request) if not redis_client.get(ping_key): - logger.warning( - "missing ping (IP: %s) / request: %s", - get_real_ip(request), - ping_key, - ) + logger.warning("missing ping (IP: %s) / request: %s", network.compressed, ping_key) return True if renew: redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) - logger.debug("found ping for client request: %s", ping_key) + logger.debug("found ping for (client) network %s -> %s", network.compressed, ping_key) return False @@ -92,27 +98,31 @@ def ping(request: flask.Request, token: str): The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`. """ + from . import limiter # pylint: disable=import-outside-toplevel, cyclic-import + redis_client = redisdb.client() if not redis_client: return if not token_is_valid(token): return - ping_key = get_ping_key(request) - logger.debug("store ping for: %s", ping_key) - redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) + cfg = limiter.get_cfg() + real_ip = get_real_ip(request) + network = get_network(real_ip, cfg) -def get_ping_key(request: flask.Request): - """Generates a hashed key that fits (more or less) to a client (request). - At least X-Forwarded-For_ is needed to be able to assign the request to an - IP. 
+ ping_key = get_ping_key(network, request) + logger.debug("store ping_key for (client) network %s (IP %s) -> %s", network.compressed, real_ip, ping_key) + redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) - """ + +def get_ping_key(network: IPv4Network | IPv6Network, request: flask.Request) -> str: + """Generates a hashed key that fits (more or less) to a *WEB-browser + session* in a network.""" return ( PING_KEY + "[" + secret_hash( - get_real_ip(request) + request.headers.get('Accept-Language', '') + request.headers.get('User-Agent', '') + network.compressed + request.headers.get('Accept-Language', '') + request.headers.get('User-Agent', '') ) + "]" ) -- cgit v1.2.3 From 80af38d37b21dc6e5edbf27bd22310db42a6f923 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Thu, 1 Jun 2023 16:00:49 +0200 Subject: [mod] increase SUSPICIOUS_IP_WINDOW from one day to 30 days In my tests I see bots rotating IPs (with endless IP lists). If such a bot has 100 IPs and has three attempts (SUSPICIOUS_IP_MAX = 3) then it can successfully send up to 300 requests in one day while rotating the IP. To block the bots for a longer period of time the SUSPICIOUS_IP_WINDOW, as the time period in which an IP is observed, must be increased. For normal WEB-browsers this is no problem, because the SUSPICIOUS_IP_WINDOW is deleted as soon as the CSS with the token is loaded. SUSPICIOUS_IP_WINDOW = 3600 * 24 * 30 Time (sec) before sliding window for one suspicious IP expires. 
SUSPICIOUS_IP_MAX = 3 Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`.""" Signed-off-by: Markus Heiser --- searx/botdetection/ip_limit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/botdetection') diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index 46e026371..bb4229f0e 100644 --- a/searx/botdetection/ip_limit.py +++ b/searx/botdetection/ip_limit.py @@ -82,7 +82,7 @@ API_WONDOW = 3600 API_MAX = 4 """Maximum requests from one IP in the :py:obj:`API_WONDOW`""" -SUSPICIOUS_IP_WINDOW = 3600 * 24 +SUSPICIOUS_IP_WINDOW = 3600 * 24 * 30 """Time (sec) before sliding window for one suspicious IP expires.""" SUSPICIOUS_IP_MAX = 3 -- cgit v1.2.3