summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoruseralias <212567668+useralias@users.noreply.github.com>2025-05-23 07:01:10 -0400
committerGitHub <noreply@github.com>2025-05-23 13:01:10 +0200
commit4fa7de80334deee51845550c4cfa11f94edb5e88 (patch)
treeb5df2e9d3cb0435019d4e5e30708ce7fb9371b02
parent98badc9cd0cfd00405014f843de7dd883d75ecca (diff)
[refactor] duckduckgo engine: improve request logic and code structure (#4837)
Changes: - Add trailing slash to base URL to prevent potential redirects - Remove advanced search syntax filtering (no longer guarantees a CAPTCHA) - Correct pagination offset calculation: Page 2 now starts at offset 10, subsequent pages use 10 + (n-2)*15 formula instead of the previous broken 20 + (n-2)*50 calculation that caused CAPTCHAs - Restructure request parameter building to better match a real request - "kt" cookie is no longer an empty string if the language/region is "all" - Group related parameter assignments together - Add header logging to debugging output Related: - https://github.com/searxng/searxng/issues/4824
-rw-r--r--searx/engines/duckduckgo.py117
1 files changed, 51 insertions, 66 deletions
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index 2b6888af7..d877d596b 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -58,7 +58,7 @@ paging = True
time_range_support = True
safesearch = True # user can't select but the results are filtered
-url = "https://html.duckduckgo.com/html"
+url = "https://html.duckduckgo.com/html/"
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
@@ -248,7 +248,6 @@ def quote_ddg_bangs(query):
def request(query, params):
-
query = quote_ddg_bangs(query)
if len(query) >= 500:
@@ -256,93 +255,79 @@ def request(query, params):
params["url"] = None
return
- # Advanced search syntax ends in CAPTCHA
- # https://duckduckgo.com/duckduckgo-help-pages/results/syntax/
- query = " ".join(
- [
- x.removeprefix("site:").removeprefix("intitle:").removeprefix("inurl:").removeprefix("filetype:")
- for x in query.split()
- ]
- )
eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
- if eng_region == "wt-wt":
- # https://html.duckduckgo.com/html sets an empty value for "all".
- eng_region = ""
-
- params['data']['kl'] = eng_region
- params['cookies']['kl'] = eng_region
- # eng_lang = get_ddg_lang(traits, params['searxng_locale'])
+ # Note: The API is reverse-engineered from DuckDuckGo's HTML webpage
+ # (https://html.duckduckgo.com/html/) and may be subject to additional bot detection mechanisms
+ # and breaking changes in the future.
+ #
+ # The params['data'] dictionary can have the following key parameters, in this order:
+ # - q (str): Search query string
+ # - b (str): Beginning parameter - empty string for first page requests
+ # - s (int): Search offset for pagination
+ # - nextParams (str): Continuation parameters from previous page response, typically empty
+ # - v (str): Typically 'l' for subsequent pages
+ # - o (str): Output format, typically 'json'
+ # - dc (int): Display count - value equal to offset (s) + 1
+ # - api (str): API endpoint identifier, typically 'd.js'
+ # - vqd (str): Validation query digest
+ # - kl (str): Keyboard language/region code (e.g., 'en-us')
+ # - df (str): Time filter, maps to values like 'd' (day), 'w' (week), 'm' (month), 'y' (year)
- params['url'] = url
- params['method'] = 'POST'
params['data']['q'] = query
- # The API is not documented, so we do some reverse engineering and emulate
- # what https://html.duckduckgo.com/html does when you press "next Page" link
- # again and again ..
-
- params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
-
- params['headers']['Sec-Fetch-Dest'] = "document"
- params['headers']['Sec-Fetch-Mode'] = "navigate" # at least this one is used by ddg's bot detection
- params['headers']['Sec-Fetch-Site'] = "same-origin"
- params['headers']['Sec-Fetch-User'] = "?1"
-
- # Form of the initial search page does have empty values in the form
if params['pageno'] == 1:
-
params['data']['b'] = ""
-
- params['data']['df'] = ''
- if params['time_range'] in time_range_dict:
-
- params['data']['df'] = time_range_dict[params['time_range']]
- params['cookies']['df'] = time_range_dict[params['time_range']]
-
- if params['pageno'] == 2:
-
- # second page does have an offset of 20
- offset = (params['pageno'] - 1) * 20
+ elif params['pageno'] >= 2:
+        offset = 10 + (params['pageno'] - 2) * 15  # Page 2 = 10, Page 3+ = 10 + (n-2)*15
params['data']['s'] = offset
- params['data']['dc'] = offset + 1
-
- elif params['pageno'] > 2:
-
- # third and following pages do have an offset of 20 + n*50
- offset = 20 + (params['pageno'] - 2) * 50
- params['data']['s'] = offset
- params['data']['dc'] = offset + 1
-
- if params['pageno'] > 1:
-
- # initial page does not have these additional data in the input form
- params['data']['o'] = form_data.get('o', 'json')
- params['data']['api'] = form_data.get('api', 'd.js')
params['data']['nextParams'] = form_data.get('nextParams', '')
params['data']['v'] = form_data.get('v', 'l')
- params['headers']['Referer'] = url
+ params['data']['o'] = form_data.get('o', 'json')
+ params['data']['dc'] = offset + 1
+ params['data']['api'] = form_data.get('api', 'd.js')
+ # vqd is required to request other pages after the first one
vqd = get_vqd(query, eng_region, force_request=False)
-
- # Certain conditions must be met in order to call up one of the
- # following pages ...
-
if vqd:
- params['data']['vqd'] = vqd # follow up pages / requests needs a vqd argument
+ params['data']['vqd'] = vqd
else:
- # Don't try to call follow up pages without a vqd value. DDG
- # recognizes this as a request from a bot. This lowers the
+ # Don't try to call follow up pages without a vqd value.
+ # DDG recognizes this as a request from a bot. This lowers the
# reputation of the SearXNG IP and DDG starts to activate CAPTCHAs.
params["url"] = None
return
if params['searxng_locale'].startswith("zh"):
- # Some locales (at least China) do not have a "next page" button and ddg
+ # Some locales (at least China) do not have a "next page" button and DDG
# will return a HTTP/2 403 Forbidden for a request of such a page.
params["url"] = None
return
+ # Put empty kl in form data if language/region set to all
+ if eng_region == "wt-wt":
+ params['data']['kl'] = ""
+ else:
+ params['data']['kl'] = eng_region
+
+ params['data']['df'] = ''
+ if params['time_range'] in time_range_dict:
+ params['data']['df'] = time_range_dict[params['time_range']]
+ params['cookies']['df'] = time_range_dict[params['time_range']]
+
+ params['cookies']['kl'] = eng_region
+
+ params['url'] = url
+ params['method'] = 'POST'
+
+ params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
+ params['headers']['Referer'] = url
+ params['headers']['Sec-Fetch-Dest'] = "document"
+ params['headers']['Sec-Fetch-Mode'] = "navigate" # at least this one is used by ddg's bot detection
+ params['headers']['Sec-Fetch-Site'] = "same-origin"
+ params['headers']['Sec-Fetch-User'] = "?1"
+
+ logger.debug("param headers: %s", params['headers'])
logger.debug("param data: %s", params['data'])
logger.debug("param cookies: %s", params['cookies'])