summary | refs | log | tree | commit | diff
path: root/searx/engines
diff options
context:
space:
mode:
author: Markus Heiser <markus.heiser@darmarIT.de> 2025-10-06 10:12:38 +0200
committer: GitHub <noreply@github.com> 2025-10-06 10:12:38 +0200
commit: d8d5de4d47eeac922a0376e7e23de166610f8a8d (patch)
tree: ddaeedbfc0c96e707bfe6d53f7bcc3af7b55fd53 /searx/engines
parent: 34eb32f4185a784fdf29ef990ddb1bcd79a90fe1 (diff)
[fix] google scholar - detect CAPTCHA (HTTP redirects) (#5268)
In the case of a CAPTCHA response, for example, an HTTP 302 is returned by Google Scholar, redirecting to a page that reads:

    Our systems have detected unusual traffic from your computer network. Please try again later.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/engines')
-rw-r--r-- searx/engines/google_scholar.py | 14
1 file changed, 12 insertions, 2 deletions
diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py
index 8a82b36ee..b60b257bd 100644
--- a/searx/engines/google_scholar.py
+++ b/searx/engines/google_scholar.py
@@ -27,6 +27,7 @@ import typing as t
from urllib.parse import urlencode
from datetime import datetime
from lxml import html
+import httpx
from searx.utils import (
eval_xpath,
@@ -36,7 +37,7 @@ from searx.utils import (
ElementType,
)
-from searx.exceptions import SearxEngineCaptchaException
+from searx.exceptions import SearxEngineCaptchaException, SearxEngineAccessDeniedException
from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
@@ -97,6 +98,15 @@ def request(query: str, params: "OnlineParams") -> None:
def response(resp: "SXNG_Response") -> EngineResults: # pylint: disable=too-many-locals
"""Parse response from Google Scholar"""
+ if resp.status_code in (301, 302, 303, 307, 308) and "Location" in resp.headers:
+ if "/sorry/index?continue" in resp.headers["Location"]:
+ # Our systems have detected unusual traffic from your computer
+ # network. Please try again later.
+ raise SearxEngineAccessDeniedException(
+ message="google_scholar: unusual traffic detected",
+ )
+ raise httpx.TooManyRedirects(f"location {resp.headers['Location'].split('?')[0]}")
+
res = EngineResults()
dom = html.fromstring(resp.text)
detect_google_captcha(dom)
@@ -192,7 +202,7 @@ def detect_google_captcha(dom: ElementType):
not redirected to ``sorry.google.com``.
"""
if eval_xpath(dom, "//form[@id='gs_captcha_f']"):
- raise SearxEngineCaptchaException()
+ raise SearxEngineCaptchaException(message="CAPTCHA (gs_captcha_f)")
def parse_gs_a(text: str | None) -> tuple[list[str], str, str, datetime | None]: