diff options
Diffstat (limited to 'searx/engines/duckduckgo.py')
| -rw-r--r-- | searx/engines/duckduckgo.py | 98 |
1 files changed, 52 insertions, 46 deletions
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 407d731f0..0d2c0af2d 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -18,16 +18,27 @@ from json import loads from searx.engines.xpath import extract_text from searx.poolrequests import get from searx.url_utils import urlencode +from searx.utils import match_language, eval_xpath # engine dependent config categories = ['general'] paging = True language_support = True -supported_languages_url = 'https://duckduckgo.com/d2030.js' +supported_languages_url = 'https://duckduckgo.com/util/u172.js' time_range_support = True +language_aliases = { + 'ar-SA': 'ar-XA', + 'es-419': 'es-XL', + 'ja': 'jp-JP', + 'ko': 'kr-KR', + 'sl-SI': 'sl-SL', + 'zh-TW': 'tzh-TW', + 'zh-HK': 'tzh-HK' +} + # search-url -url = 'https://duckduckgo.com/html?{query}&s={offset}&api=/d.js&o=json&dc={dc_param}' +url = 'https://duckduckgo.com/html?{query}&s={offset}&dc={dc_param}' time_range_url = '&df={range}' time_range_dict = {'day': 'd', @@ -42,55 +53,48 @@ content_xpath = './/a[@class="result__snippet"]' # match query's language to a region code that duckduckgo will accept -def get_region_code(lang, lang_list=None): - # custom fixes for languages +def get_region_code(lang, lang_list=[]): if lang == 'all': - region_code = None - elif lang[:2] == 'ja': - region_code = 'jp-jp' - elif lang[:2] == 'sl': - region_code = 'sl-sl' - elif lang == 'zh-TW': - region_code = 'tw-tzh' - elif lang == 'zh-HK': - region_code = 'hk-tzh' - elif lang[-2:] == 'SA': - region_code = 'xa-' + lang.split('-')[0] - elif lang[-2:] == 'GB': - region_code = 'uk-' + lang.split('-')[0] - else: - region_code = lang.split('-') - if len(region_code) == 2: - # country code goes first - region_code = region_code[1].lower() + '-' + region_code[0].lower() - else: - # tries to get a country code from language - region_code = region_code[0].lower() - for lc in (lang_list or supported_languages): - lc = lc.split('-') - if region_code == lc[0]: - region_code = lc[1].lower() + '-' + lc[0].lower() - break - return region_code + return None + + lang_code = match_language(lang, lang_list, language_aliases, 'wt-WT') + lang_parts = lang_code.split('-') + + # country code goes first + return lang_parts[1].lower() + '-' + lang_parts[0].lower() -# do search-request def request(query, params): - if params['time_range'] and params['time_range'] not in time_range_dict: + if params['time_range'] not in (None, 'None', '') and params['time_range'] not in time_range_dict: return params offset = (params['pageno'] - 1) * 30 - region_code = get_region_code(params['language']) - if region_code: - params['url'] = url.format( - query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset) + region_code = get_region_code(params['language'], supported_languages) + params['url'] = 'https://duckduckgo.com/html/' + if params['pageno'] > 1: + params['method'] = 'POST' + params['data']['q'] = query + params['data']['s'] = offset + params['data']['dc'] = 30 + params['data']['nextParams'] = '' + params['data']['v'] = 'l' + params['data']['o'] = 'json' + params['data']['api'] = '/d.js' + if params['time_range'] in time_range_dict: + params['data']['df'] = time_range_dict[params['time_range']] + if region_code: + params['data']['kl'] = region_code else: - params['url'] = url.format( - query=urlencode({'q': query}), offset=offset, dc_param=offset) + if region_code: + params['url'] = url.format( + query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset) + else: + params['url'] = url.format( + query=urlencode({'q': query}), offset=offset, dc_param=offset) - if params['time_range'] in time_range_dict: - params['url'] += time_range_url.format(range=time_range_dict[params['time_range']]) + if params['time_range'] in time_range_dict: + params['url'] += time_range_url.format(range=time_range_dict[params['time_range']]) return params @@ -102,17 +106,19 @@ def response(resp): doc = fromstring(resp.text) # parse results - for r in doc.xpath(result_xpath): + for i, r in enumerate(eval_xpath(doc, result_xpath)): + if i >= 30: + break try: - res_url = r.xpath(url_xpath)[-1] + res_url = eval_xpath(r, url_xpath)[-1] except: continue if not res_url: continue - title = extract_text(r.xpath(title_xpath)) - content = extract_text(r.xpath(content_xpath)) + title = extract_text(eval_xpath(r, title_xpath)) + content = extract_text(eval_xpath(r, content_xpath)) # append result results.append({'title': title, @@ -134,4 +140,4 @@ def _fetch_supported_languages(resp): regions_json = loads(response_page) supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys()) - return supported_languages + return list(supported_languages) |