summaryrefslogtreecommitdiff
path: root/searx/botdetection/http_user_agent.py
diff options
context:
space:
mode:
Diffstat (limited to 'searx/botdetection/http_user_agent.py')
-rw-r--r--searx/botdetection/http_user_agent.py54
1 files changed, 54 insertions, 0 deletions
diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py
new file mode 100644
index 000000000..892ae0bd9
--- /dev/null
+++ b/searx/botdetection/http_user_agent.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""
+Method ``http_user_agent``
+--------------------------
+
+The ``http_user_agent`` method evaluates a request as the request of a bot if
+the User-Agent_ header is unset or matches the regular expression
+:py:obj:`USER_AGENT`.
+
+.. _User-Agent:
+ https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
+
+"""
+
+from typing import Optional, Tuple
+import re
+import flask
+
+USER_AGENT = (
+ r'('
+ + r'unknown'
+ + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
+ + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
+ + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot'
+ + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
+ + r'|ZmEu|BLEXBot|bitlybot'
+ # unmaintained Farside instances
+ + r'|'
+ + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
+ # other bots and client to block
+ + '|.*PetalBot.*'
+ + r')'
+)
+"""Regular expression that matches to User-Agent_ from known *bots*"""
+
+_regexp = None
+
+
+def regexp_user_agent():
+ global _regexp # pylint: disable=global-statement
+ if not _regexp:
+ _regexp = re.compile(USER_AGENT)
+ return _regexp
+
+
+def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
+ user_agent = request.headers.get('User-Agent', 'unknown')
+ if regexp_user_agent().match(user_agent):
+ return (
+ 429,
+ f"bot detected, HTTP header User-Agent: {user_agent}",
+ )
+ return None