From 1ec325adccc427fe05cf08da9a2d9d63da7365f4 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Tue, 23 May 2023 18:16:37 +0200 Subject: [mod] limiter -> botdetection: modularization and documentation In order to be able to meet the outstanding requirements, the implementation is modularized and supplemented with documentation. This patch does not contain functional changes, except it fixes issue #2455 ---- Activate limiter in the settings.yml and simulate a bot request by:: curl -H 'Accept-Language: de-DE,en-US;q=0.7,en;q=0.3' \ -H 'Accept: text/html' -H 'User-Agent: xyz' \ -H 'Accept-Encoding: gzip' \ 'http://127.0.0.1:8888/search?q=foo' In the LOG: DEBUG searx.botdetection.link_token : missing ping for this request: ..... Since ``BURST_MAX_SUSPICIOUS = 2`` you can repeat the query above two times before you get a "Too Many Requests" response. Closes: https://github.com/searxng/searxng/issues/2455 Signed-off-by: Markus Heiser --- searx/botdetection/limiter.py | 79 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 searx/botdetection/limiter.py (limited to 'searx/botdetection/limiter.py') diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py new file mode 100644 index 000000000..71044c312 --- /dev/null +++ b/searx/botdetection/limiter.py @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""".. _limiter src: + +Limiter +======= + +.. sidebar:: info + + The limiter requires a :ref:`Redis <settings redis>` database. + +Bot protection / IP rate limitation. The intention of rate limitation is to +limit suspicious requests from an IP. The motivation behind this is the fact +that SearXNG passes through requests from bots and is thus classified as a bot +itself. As a result, the SearXNG engine then receives a CAPTCHA or is blocked +by the search engine (the origin) in some other way. + +To avoid blocking, the requests from bots to SearXNG must also be blocked, this +is the task of the limiter. 
To perform this task, the limiter uses the methods +from the :py:obj:`searx.botdetection`. + +To enable the limiter activate: + +.. code:: yaml + + server: + ... + limiter: true # rate limit the number of request on the instance, block some bots + +and set the redis-url connection. Check the value, it depends on your redis DB +(see :ref:`settings redis`), by example: + +.. code:: yaml + + redis: + url: unix:///usr/local/searxng-redis/run/redis.sock?db=0 + +""" + +from typing import Optional, Tuple +import flask + +from searx.botdetection import ( + http_accept, + http_accept_encoding, + http_accept_language, + http_connection, + http_user_agent, + ip_limit, +) + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + + if request.path == '/healthz': + return None + + for func in [ + http_user_agent, + ]: + val = func.filter_request(request) + if val is not None: + return val + + if request.path == '/search': + + for func in [ + http_accept, + http_accept_encoding, + http_accept_language, + http_connection, + http_user_agent, + ip_limit, + ]: + val = func.filter_request(request) + if val is not None: + return val + + return None -- cgit v1.2.3 From 66fdec0eb92bf11c0bc477d6fb1df3dc783e4dcb Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Fri, 26 May 2023 17:24:43 +0200 Subject: [mod] limiter: add config file /etc/searxng/limiter.toml Signed-off-by: Markus Heiser --- searx/botdetection/limiter.py | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) (limited to 'searx/botdetection/limiter.py') diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py index 71044c312..cc1e00b3c 100644 --- a/searx/botdetection/limiter.py +++ b/searx/botdetection/limiter.py @@ -38,8 +38,11 @@ and set the redis-url connection. 
Check the value, it depends on your redis DB """ from typing import Optional, Tuple +from pathlib import Path import flask +import pytomlpp as toml +from searx.tools import config from searx.botdetection import ( http_accept, http_accept_encoding, @@ -49,6 +52,42 @@ from searx.botdetection import ( ip_limit, ) +LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml" +"""Base configuration (schema) of the botdetection.""" + +LIMITER_CFG = Path('/etc/searxng/limiter.toml') +"""Lokal Limiter configuration.""" + +CFG_DEPRECATED = { + # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config." +} + +CFG = config.Config({}, {}) + + +def init_cfg(log): + global CFG # pylint: disable=global-statement + CFG = config.Config(cfg_schema=toml.load(LIMITER_CFG_SCHEMA), deprecated=CFG_DEPRECATED) + + if not LIMITER_CFG.exists(): + log.warning("missing config file: %s", LIMITER_CFG) + return + + log.warning("load config file: %s", LIMITER_CFG) + try: + upd_cfg = toml.load(LIMITER_CFG) + except toml.DecodeError as exc: + msg = str(exc).replace('\t', '').replace('\n', ' ') + log.error("%s: %s", LIMITER_CFG, msg) + raise + + is_valid, issue_list = CFG.validate(upd_cfg) + for msg in issue_list: + log.error(str(msg)) + if not is_valid: + raise TypeError(f"schema of {LIMITER_CFG} is invalid, can't cutomize limiter configuration from!") + CFG.update(upd_cfg) + def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: @@ -58,7 +97,7 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: for func in [ http_user_agent, ]: - val = func.filter_request(request) + val = func.filter_request(request, CFG) if val is not None: return val @@ -72,7 +111,7 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: http_user_agent, ip_limit, ]: - val = func.filter_request(request) + val = func.filter_request(request, CFG) if val is not None: return val -- cgit v1.2.3 From 
b8c7c2c9aa604fd1fb7be5559c9ad025ceb17aa4 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 28 May 2023 18:58:31 +0200 Subject: [mod] botdetection - improve ip_limit and link_token methods - counting requests in LONG_WINDOW and BURST_WINDOW is not needed when the request is validated by the link_token method [1] - renew a ping-key on validation [2], this is needed for infinite scrolling, where no new token (CSS) is loaded. / this does not fix the BURST_MAX issue in the vanilla limiter - normalize the counter names of the ip_limit method to 'ip_limit.*' - just integrate the ip_limit method straight forward in the limiter plugin / non intermediate code --> ip_limit now returns None or a werkzeug.Response object that can be passed by the plugin to the flask application / non intermediate code that returns a tuple [1] https://github.com/searxng/searxng/pull/2357#issuecomment-1566113277 [2] https://github.com/searxng/searxng/pull/2357#discussion_r1208542206 [3] https://github.com/searxng/searxng/pull/2357#issuecomment-1566125979 Signed-off-by: Markus Heiser --- searx/botdetection/limiter.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'searx/botdetection/limiter.py') diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py index cc1e00b3c..93826684f 100644 --- a/searx/botdetection/limiter.py +++ b/searx/botdetection/limiter.py @@ -42,6 +42,7 @@ from pathlib import Path import flask import pytomlpp as toml +from searx import logger from searx.tools import config from searx.botdetection import ( http_accept, @@ -62,7 +63,13 @@ CFG_DEPRECATED = { # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config." 
} -CFG = config.Config({}, {}) +CFG = None + + +def get_cfg() -> config.Config: + if CFG is None: + init_cfg(logger) + return CFG def init_cfg(log): @@ -73,7 +80,7 @@ def init_cfg(log): log.warning("missing config file: %s", LIMITER_CFG) return - log.warning("load config file: %s", LIMITER_CFG) + log.info("load config file: %s", LIMITER_CFG) try: upd_cfg = toml.load(LIMITER_CFG) except toml.DecodeError as exc: -- cgit v1.2.3 From 281e36f4b7848374535d5e953050ae73423191ca Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Thu, 1 Jun 2023 15:41:48 +0200 Subject: [fix] limiter: replace real_ip by IPv4/v6 network Closes: https://github.com/searxng/searxng/issues/2477 Signed-off-by: Markus Heiser --- searx/botdetection/limiter.py | 61 +++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 34 deletions(-) (limited to 'searx/botdetection/limiter.py') diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py index 93826684f..18ffc8407 100644 --- a/searx/botdetection/limiter.py +++ b/searx/botdetection/limiter.py @@ -37,14 +37,16 @@ and set the redis-url connection. Check the value, it depends on your redis DB """ -from typing import Optional, Tuple +from __future__ import annotations + from pathlib import Path import flask -import pytomlpp as toml +import werkzeug -from searx import logger from searx.tools import config -from searx.botdetection import ( +from searx import logger + +from . import ( http_accept, http_accept_encoding, http_accept_language, @@ -53,6 +55,16 @@ from searx.botdetection import ( ip_limit, ) +from ._helpers import ( + get_network, + get_real_ip, + dump_request, +) + +logger = logger.getChild('botdetection.limiter') + +CFG: config.Config = None # type: ignore + LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml" """Base configuration (schema) of the botdetection.""" @@ -63,40 +75,21 @@ CFG_DEPRECATED = { # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. 
Don't use it in your real project config." } -CFG = None - def get_cfg() -> config.Config: + global CFG # pylint: disable=global-statement if CFG is None: - init_cfg(logger) + CFG = config.Config.from_toml(LIMITER_CFG_SCHEMA, LIMITER_CFG, CFG_DEPRECATED) return CFG -def init_cfg(log): - global CFG # pylint: disable=global-statement - CFG = config.Config(cfg_schema=toml.load(LIMITER_CFG_SCHEMA), deprecated=CFG_DEPRECATED) - - if not LIMITER_CFG.exists(): - log.warning("missing config file: %s", LIMITER_CFG) - return - - log.info("load config file: %s", LIMITER_CFG) - try: - upd_cfg = toml.load(LIMITER_CFG) - except toml.DecodeError as exc: - msg = str(exc).replace('\t', '').replace('\n', ' ') - log.error("%s: %s", LIMITER_CFG, msg) - raise +def filter_request(request: flask.Request) -> werkzeug.Response | None: - is_valid, issue_list = CFG.validate(upd_cfg) - for msg in issue_list: - log.error(str(msg)) - if not is_valid: - raise TypeError(f"schema of {LIMITER_CFG} is invalid, can't cutomize limiter configuration from!") - CFG.update(upd_cfg) - - -def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + cfg = get_cfg() + real_ip = get_real_ip(request) + network = get_network(real_ip, cfg) + if network.is_link_local: + return None if request.path == '/healthz': return None @@ -104,7 +97,7 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: for func in [ http_user_agent, ]: - val = func.filter_request(request, CFG) + val = func.filter_request(network, request, cfg) if val is not None: return val @@ -118,8 +111,8 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: http_user_agent, ip_limit, ]: - val = func.filter_request(request, CFG) + val = func.filter_request(network, request, cfg) if val is not None: return val - + logger.debug(f"OK {network}: %s", dump_request(flask.request)) return None -- cgit v1.2.3