From 050451347b021d05d26c7a0797c790bbd83442e4 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sat, 19 Oct 2024 14:19:27 +0200 Subject: [fix] engine: duckduckgo - CAPTCHA detection The previous implementation could not distinguish a CAPTCHA response from an ordinary result list. It treated a CAPTCHA page as a result list containing no items. DDG does not block IPs. Instead, it places a CAPTCHA wall in front of any request it considers dubious. Signed-off-by: Markus Heiser --- searx/engines/duckduckgo.py | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'searx/engines') diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 27171778d..2a917ed7a 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -25,6 +25,7 @@ from searx.network import get # see https://github.com/searxng/searxng/issues/7 from searx import redisdb from searx.enginelib.traits import EngineTraits from searx.utils import extr +from searx.exceptions import SearxEngineCaptchaException if TYPE_CHECKING: import logging @@ -292,6 +293,15 @@ def request(query, params): return params +def detect_ddg_captcha(dom): + """In case of CAPTCHA ddg open its own *not a Robot* dialog and is + not redirected to CAPTCHA page. + """ + if eval_xpath(dom, "//form[@id='challenge-form']"): + # set suspend time to zero is OK --> ddg does not block the IP + raise SearxEngineCaptchaException(suspended_time=0) + + def response(resp): if resp.status_code == 303: @@ -299,6 +309,7 @@ def response(resp): results = [] doc = lxml.html.fromstring(resp.text) + detect_ddg_captcha(doc) result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') -- cgit v1.2.3