diff options
Diffstat (limited to 'searx/botdetection/http_user_agent.py')
| -rw-r--r-- | searx/botdetection/http_user_agent.py | 54 |
1 files changed, 54 insertions, 0 deletions
diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py new file mode 100644 index 000000000..892ae0bd9 --- /dev/null +++ b/searx/botdetection/http_user_agent.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +""" +Method ``http_user_agent`` +-------------------------- + +The ``http_user_agent`` method evaluates a request as the request of a bot if +the User-Agent_ header is unset or matches the regular expression +:py:obj:`USER_AGENT`. + +.. _User-Agent: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent + +""" + +from typing import Optional, Tuple +import re +import flask + +USER_AGENT = ( + r'(' + + r'unknown' + + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp' + + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy' + + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot' + + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot' + + r'|ZmEu|BLEXBot|bitlybot' + # unmaintained Farside instances + + r'|' + + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)') + # other bots and client to block + + '|.*PetalBot.*' + + r')' +) +"""Regular expression that matches to User-Agent_ from known *bots*""" + +_regexp = None + + +def regexp_user_agent(): + global _regexp # pylint: disable=global-statement + if not _regexp: + _regexp = re.compile(USER_AGENT) + return _regexp + + +def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + user_agent = request.headers.get('User-Agent', 'unknown') + if regexp_user_agent().match(user_agent): + return ( + 429, + f"bot detected, HTTP header User-Agent: {user_agent}", + ) + return None |