From dba569462d0e9c4dbd77a54bb42ef5c3b1916142 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Wed, 19 Apr 2023 17:20:03 +0200 Subject: [mod] limiter: reduce request rates for requests without a ping Signed-off-by: Markus Heiser --- searx/plugins/limiter.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'searx/plugins') diff --git a/searx/plugins/limiter.py b/searx/plugins/limiter.py index 46c82f588..c7d74248b 100644 --- a/searx/plugins/limiter.py +++ b/searx/plugins/limiter.py @@ -18,7 +18,7 @@ from flask import request from searx import redisdb from searx.plugins import logger -from searx.redislib import incr_sliding_window +from searx.redislib import incr_sliding_window, secret_hash name = "Request limiter" description = "Limit the number of request" @@ -41,6 +41,18 @@ block_user_agent = re.compile( + r')' ) +PING_KEY = 'SearXNG_limiter.ping' +TOKEN_KEY = 'SearXNG_limiter.token' + + +def ping(): + redis_client = redisdb.client() + user_agent = request.headers.get('User-Agent', 'unknown') + x_forwarded_for = request.headers.get('X-Forwarded-For', '') + + ping_key = PING_KEY + user_agent + x_forwarded_for + redis_client.set(secret_hash(ping_key), 1, ex=600) + def is_accepted_request() -> bool: # pylint: disable=too-many-return-statements @@ -57,9 +69,20 @@ def is_accepted_request() -> bool: if request.path == '/search': + c_burst_max = 2 + c_10min_max = 10 + + ping_key = PING_KEY + user_agent + x_forwarded_for + if redis_client.get(secret_hash(ping_key)): + logger.debug('got a ping') + c_burst_max = 15 + c_10min_max = 150 + else: + logger.debug('missing a ping') + c_burst = incr_sliding_window(redis_client, 'IP limit, burst' + x_forwarded_for, 20) c_10min = incr_sliding_window(redis_client, 'IP limit, 10 minutes' + x_forwarded_for, 600) - if c_burst > 15 or c_10min > 150: + if c_burst > c_burst_max or c_10min > c_10min_max: logger.debug("BLOCK %s: to many request", x_forwarded_for) return False -- cgit v1.2.3 From 
5226044c13817688a5ca3461743844dca4ed3d2b Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Wed, 19 Apr 2023 18:59:23 +0200 Subject: [mod] limiter: add random token to the limiter URL By adding a random component in the limiter URL a bot can no longer send a ping by requesting a static URL. Related: https://github.com/searxng/searxng/pull/2357#issuecomment-1518525094 Signed-off-by: Markus Heiser --- searx/plugins/limiter.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'searx/plugins') diff --git a/searx/plugins/limiter.py b/searx/plugins/limiter.py index c7d74248b..69bd576d4 100644 --- a/searx/plugins/limiter.py +++ b/searx/plugins/limiter.py @@ -14,6 +14,8 @@ Enable the plugin in ``settings.yml``: """ import re +import string +import random from flask import request from searx import redisdb @@ -54,6 +56,27 @@ def ping(): redis_client.set(secret_hash(ping_key), 1, ex=600) +def get_token(): + redis_client = redisdb.client() + if not redis_client: + # This function is also called when limiter is inactive / no redis DB + # (see render function in webapp.py) + return '12345678' + token = redis_client.get(TOKEN_KEY) + if token: + token = token.decode('UTF-8') + else: + token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8)) + redis_client.set(TOKEN_KEY, token, ex=600) + return token + + +def token_is_valid(token): + valid = token == get_token() + logger.debug("token is valid --> %s", valid) + return valid + + def is_accepted_request() -> bool: # pylint: disable=too-many-return-statements redis_client = redisdb.client() @@ -83,7 +106,7 @@ def is_accepted_request() -> bool: c_burst = incr_sliding_window(redis_client, 'IP limit, burst' + x_forwarded_for, 20) c_10min = incr_sliding_window(redis_client, 'IP limit, 10 minutes' + x_forwarded_for, 600) if c_burst > c_burst_max or c_10min > c_10min_max: - logger.debug("BLOCK %s: to many request", x_forwarded_for) + logger.debug("BLOCK %s: too many 
request", x_forwarded_for) return False if len(request.headers.get('Accept-Language', '').strip()) == '': -- cgit v1.2.3 From 1ec325adccc427fe05cf08da9a2d9d63da7365f4 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Tue, 23 May 2023 18:16:37 +0200 Subject: [mod] limiter -> botdetection: modularization and documentation In order to be able to meet the outstanding requirements, the implementation is modularized and supplemented with documentation. This patch does not contain functional changes, except that it fixes issue #2455 ---- Activate limiter in the settings.yml and simulate a bot request by:: curl -H 'Accept-Language: de-DE,en-US;q=0.7,en;q=0.3' \ -H 'Accept: text/html' -H 'User-Agent: xyz' \ -H 'Accept-Encoding: gzip' \ 'http://127.0.0.1:8888/search?q=foo' In the LOG: DEBUG searx.botdetection.link_token : missing ping for this request: ..... Since ``BURST_MAX_SUSPICIOUS = 2`` you can repeat the query above two times before you get a "Too Many Requests" response. Closes: https://github.com/searxng/searxng/issues/2455 Signed-off-by: Markus Heiser --- searx/plugins/limiter.py | 155 +++++------------------------------------------ 1 file changed, 16 insertions(+), 139 deletions(-) (limited to 'searx/plugins') diff --git a/searx/plugins/limiter.py b/searx/plugins/limiter.py index 69bd576d4..d9566b92b 100644 --- a/searx/plugins/limiter.py +++ b/searx/plugins/limiter.py @@ -1,165 +1,42 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint # pyright: basic -"""Some bot protection / rate limitation +"""see :ref:`limiter src`""" -To monitor rate limits and protect privacy the IP addresses are getting stored -with a hash so the limiter plugin knows who to block. A redis database is -needed to store the hash values. 
- -Enable the plugin in ``settings.yml``: - -- ``server.limiter: true`` -- ``redis.url: ...`` check the value, see :ref:`settings redis` -""" - -import re -import string -import random -from flask import request +import flask from searx import redisdb from searx.plugins import logger -from searx.redislib import incr_sliding_window, secret_hash +from searx.botdetection import limiter +from searx.botdetection import dump_request name = "Request limiter" description = "Limit the number of request" default_on = False preference_section = 'service' -logger = logger.getChild('limiter') - -block_user_agent = re.compile( - r'(' - + r'unknown' - + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp' - + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy' - + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot' - + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot' - + r'|ZmEu|BLEXBot|bitlybot' - # unmaintained Farside instances - + r'|' - + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)') - + '|.*PetalBot.*' - + r')' -) - -PING_KEY = 'SearXNG_limiter.ping' -TOKEN_KEY = 'SearXNG_limiter.token' - - -def ping(): - redis_client = redisdb.client() - user_agent = request.headers.get('User-Agent', 'unknown') - x_forwarded_for = request.headers.get('X-Forwarded-For', '') - - ping_key = PING_KEY + user_agent + x_forwarded_for - redis_client.set(secret_hash(ping_key), 1, ex=600) - - -def get_token(): - redis_client = redisdb.client() - if not redis_client: - # This function is also called when limiter is inactive / no redis DB - # (see render function in webapp.py) - return '12345678' - token = redis_client.get(TOKEN_KEY) - if token: - token = token.decode('UTF-8') - else: - token = ''.join(random.choice(string.ascii_lowercase 
+ string.digits) for _ in range(8)) - redis_client.set(TOKEN_KEY, token, ex=600) - return token - - -def token_is_valid(token): - valid = token == get_token() - logger.debug("token is valid --> %s", valid) - return valid - - -def is_accepted_request() -> bool: - # pylint: disable=too-many-return-statements - redis_client = redisdb.client() - user_agent = request.headers.get('User-Agent', 'unknown') - x_forwarded_for = request.headers.get('X-Forwarded-For', '') - - if request.path == '/healthz': - return True - if block_user_agent.match(user_agent): - logger.debug("BLOCK %s: %s --> detected User-Agent: %s" % (x_forwarded_for, request.path, user_agent)) - return False - - if request.path == '/search': - - c_burst_max = 2 - c_10min_max = 10 - - ping_key = PING_KEY + user_agent + x_forwarded_for - if redis_client.get(secret_hash(ping_key)): - logger.debug('got a ping') - c_burst_max = 15 - c_10min_max = 150 - else: - logger.debug('missing a ping') - - c_burst = incr_sliding_window(redis_client, 'IP limit, burst' + x_forwarded_for, 20) - c_10min = incr_sliding_window(redis_client, 'IP limit, 10 minutes' + x_forwarded_for, 600) - if c_burst > c_burst_max or c_10min > c_10min_max: - logger.debug("BLOCK %s: too many request", x_forwarded_for) - return False - - if len(request.headers.get('Accept-Language', '').strip()) == '': - logger.debug("BLOCK %s: missing Accept-Language", x_forwarded_for) - return False - - if request.headers.get('Connection') == 'close': - logger.debug("BLOCK %s: got Connection=close", x_forwarded_for) - return False - - accept_encoding_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] - if 'gzip' not in accept_encoding_list and 'deflate' not in accept_encoding_list: - logger.debug("BLOCK %s: suspicious Accept-Encoding", x_forwarded_for) - return False - - if 'text/html' not in request.accept_mimetypes: - logger.debug("BLOCK %s: Accept-Encoding misses text/html", x_forwarded_for) - return False - - if 
request.args.get('format', 'html') != 'html': - c = incr_sliding_window(redis_client, 'API limit' + x_forwarded_for, 3600) - if c > 4: - logger.debug("BLOCK %s: API limit exceeded", x_forwarded_for) - return False +logger = logger.getChild('limiter') - logger.debug( - "OK %s: '%s'" % (x_forwarded_for, request.path) - + " || form: %s" % request.form - + " || Accept: %s" % request.headers.get('Accept', '') - + " || Accept-Language: %s" % request.headers.get('Accept-Language', '') - + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding', '') - + " || Content-Type: %s" % request.headers.get('Content-Type', '') - + " || Content-Length: %s" % request.headers.get('Content-Length', '') - + " || Connection: %s" % request.headers.get('Connection', '') - + " || User-Agent: %s" % user_agent - ) - return True +def pre_request(): + """See :ref:`flask.Flask.before_request`""" + val = limiter.filter_request(flask.request) + if val is not None: + http_status, msg = val + client_ip = flask.request.headers.get('X-Forwarded-For', '') + logger.error("BLOCK (IP %s): %s" % (client_ip, msg)) + return 'Too Many Requests', http_status -def pre_request(): - if not is_accepted_request(): - return 'Too Many Requests', 429 + logger.debug("OK: %s" % dump_request(flask.request)) return None -def init(app, settings): +def init(app: flask.Flask, settings) -> bool: if not settings['server']['limiter']: return False - if not redisdb.client(): - logger.error("The limiter requires Redis") # pylint: disable=undefined-variable + logger.error("The limiter requires Redis") return False - app.before_request(pre_request) return True -- cgit v1.2.3 From 66fdec0eb92bf11c0bc477d6fb1df3dc783e4dcb Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Fri, 26 May 2023 17:24:43 +0200 Subject: [mod] limiter: add config file /etc/searxng/limiter.toml Signed-off-by: Markus Heiser --- searx/plugins/limiter.py | 1 + 1 file changed, 1 insertion(+) (limited to 'searx/plugins') diff --git 
a/searx/plugins/limiter.py b/searx/plugins/limiter.py index d9566b92b..92b0aa2a0 100644 --- a/searx/plugins/limiter.py +++ b/searx/plugins/limiter.py @@ -38,5 +38,6 @@ def init(app: flask.Flask, settings) -> bool: if not redisdb.client(): logger.error("The limiter requires Redis") return False + limiter.init_cfg(logger) app.before_request(pre_request) return True -- cgit v1.2.3 From b8c7c2c9aa604fd1fb7be5559c9ad025ceb17aa4 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 28 May 2023 18:58:31 +0200 Subject: [mod] botdetection - improve ip_limit and link_token methods - counting requests in LONG_WINDOW and BURST_WINDOW is not needed when the request is validated by the link_token method [1] - renew a ping-key on validation [2], this is needed for infinite scrolling, where no new token (CSS) is loaded. / this does not fix the BURST_MAX issue in the vanilla limiter - normalize the counter names of the ip_limit method to 'ip_limit.*' - just integrate the ip_limit method straight forward in the limiter plugin / non intermediate code --> ip_limit now returns None or a werkzeug.Response object that can be passed by the plugin to the flask application / non intermediate code that returns a tuple [1] https://github.com/searxng/searxng/pull/2357#issuecomment-1566113277 [2] https://github.com/searxng/searxng/pull/2357#discussion_r1208542206 [3] https://github.com/searxng/searxng/pull/2357#issuecomment-1566125979 Signed-off-by: Markus Heiser --- searx/plugins/limiter.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'searx/plugins') diff --git a/searx/plugins/limiter.py b/searx/plugins/limiter.py index 92b0aa2a0..7edbb1ce0 100644 --- a/searx/plugins/limiter.py +++ b/searx/plugins/limiter.py @@ -20,16 +20,10 @@ logger = logger.getChild('limiter') def pre_request(): """See :ref:`flask.Flask.before_request`""" - - val = limiter.filter_request(flask.request) - if val is not None: - http_status, msg = val - client_ip = 
flask.request.headers.get('X-Forwarded-For', '') - logger.error("BLOCK (IP %s): %s" % (client_ip, msg)) - return 'Too Many Requests', http_status - - logger.debug("OK: %s" % dump_request(flask.request)) - return None + ret_val = limiter.filter_request(flask.request) + if ret_val is None: + logger.debug("OK: %s" % dump_request(flask.request)) + return ret_val def init(app: flask.Flask, settings) -> bool: -- cgit v1.2.3 From 38431d2e142b7da6a9b48aad203f02a2eff7e6fd Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Mon, 29 May 2023 19:46:37 +0200 Subject: [fix] correct determination of the IP for the request For correct determination of the IP of the request, the function botdetection.get_real_ip() is implemented. This function is used in the ip_limit and link_token methods of the botdetection and it is used in the self_info plugin. Documentation about the X-Forwarded-For header has been added. [1] https://github.com/searxng/searxng/pull/2357#issuecomment-1566211059 Signed-off-by: Markus Heiser --- searx/plugins/self_info.py | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) (limited to 'searx/plugins') diff --git a/searx/plugins/self_info.py b/searx/plugins/self_info.py index fbe4518b5..8079ee0d4 100644 --- a/searx/plugins/self_info.py +++ b/searx/plugins/self_info.py @@ -1,21 +1,11 @@ -''' -searx is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=missing-module-docstring,invalid-name -searx is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. 
- -You should have received a copy of the GNU Affero General Public License -along with searx. If not, see < http://www.gnu.org/licenses/ >. - -(C) 2015 by Adam Tauber, -''' -from flask_babel import gettext import re +from flask_babel import gettext + +from searx.botdetection._helpers import get_real_ip name = gettext('Self Information') description = gettext('Displays your IP if the query is "ip" and your user agent if the query contains "user agent".') @@ -28,18 +18,11 @@ query_examples = '' p = re.compile('.*user[ -]agent.*', re.IGNORECASE) -# attach callback to the post search hook -# request: flask request object -# ctx: the whole local context of the pre search hook def post_search(request, search): if search.search_query.pageno > 1: return True if search.search_query.query == 'ip': - x_forwarded_for = request.headers.getlist("X-Forwarded-For") - if x_forwarded_for: - ip = x_forwarded_for[0] - else: - ip = request.remote_addr + ip = get_real_ip(request) search.result_container.answers['ip'] = {'answer': ip} elif p.match(search.search_query.query): ua = request.user_agent -- cgit v1.2.3 From 281e36f4b7848374535d5e953050ae73423191ca Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Thu, 1 Jun 2023 15:41:48 +0200 Subject: [fix] limiter: replace real_ip by IPv4/v6 network Closes: https://github.com/searxng/searxng/issues/2477 Signed-off-by: Markus Heiser --- searx/plugins/limiter.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'searx/plugins') diff --git a/searx/plugins/limiter.py b/searx/plugins/limiter.py index 7edbb1ce0..a8beb5e88 100644 --- a/searx/plugins/limiter.py +++ b/searx/plugins/limiter.py @@ -8,7 +8,6 @@ import flask from searx import redisdb from searx.plugins import logger from searx.botdetection import limiter -from searx.botdetection import dump_request name = "Request limiter" description = "Limit the number of request" @@ -20,10 +19,7 @@ logger = logger.getChild('limiter') def pre_request(): """See 
:ref:`flask.Flask.before_request`""" - ret_val = limiter.filter_request(flask.request) - if ret_val is None: - logger.debug("OK: %s" % dump_request(flask.request)) - return ret_val + return limiter.filter_request(flask.request) def init(app: flask.Flask, settings) -> bool: @@ -32,6 +28,5 @@ def init(app: flask.Flask, settings) -> bool: if not redisdb.client(): logger.error("The limiter requires Redis") return False - limiter.init_cfg(logger) app.before_request(pre_request) return True -- cgit v1.2.3