diff options
| -rw-r--r-- | searx/engines/duckduckgo.py | 61 | ||||
| -rw-r--r-- | searx/engines/vimeo.py | 2 | ||||
| -rw-r--r-- | searx/engines/yahoo.py | 7 | ||||
| -rw-r--r-- | searx/settings.yml | 15 |
4 files changed, 62 insertions, 23 deletions
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 228a3028f..ec2858e69 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -1,29 +1,64 @@ -from json import loads from urllib import urlencode +from lxml.html import fromstring from searx.utils import html_to_text -url = 'https://duckduckgo.com/' -search_url = url + 'd.js?{query}&p=1&s={offset}' +url = 'https://duckduckgo.com/html?{query}&s={offset}' locale = 'us-en' -paging = True - - def request(query, params): offset = (params['pageno'] - 1) * 30 q = urlencode({'q': query, 'l': locale}) - params['url'] = search_url.format(query=q, offset=offset) + params['url'] = url.format(query=q, offset=offset) return params def response(resp): + result_xpath = '//div[@class="results_links results_links_deep web-result"]' + url_xpath = './/a[@class="large"]/@href' + title_xpath = './/a[@class="large"]//text()' + content_xpath = './/div[@class="snippet"]//text()' results = [] - search_res = loads(resp.text[resp.text.find('[{'):-2])[:-1] - for r in search_res: - if not r.get('t'): + + doc = fromstring(resp.text) + + for r in doc.xpath(result_xpath): + try: + res_url = r.xpath(url_xpath)[-1] + except: continue - results.append({'title': r['t'], - 'content': html_to_text(r['a']), - 'url': r['u']}) + if not res_url: + continue + title = html_to_text(''.join(r.xpath(title_xpath))) + content = html_to_text(''.join(r.xpath(content_xpath))) + results.append({'title': title, + 'content': content, + 'url': res_url}) + return results + + +#from json import loads +#search_url = url + 'd.js?{query}&p=1&s={offset}' +# +#paging = True +# +# +#def request(query, params): +# offset = (params['pageno'] - 1) * 30 +# q = urlencode({'q': query, +# 'l': locale}) +# params['url'] = search_url.format(query=q, offset=offset) +# return params +# +# +#def response(resp): +# results = [] +# search_res = loads(resp.text[resp.text.find('[{'):-2])[:-1] +# for r in search_res: +# if not r.get('t'): +# continue +# results.append({'title': r['t'], +# 'content': html_to_text(r['a']), +# 'url': r['u']}) +# return results diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py index 8efa042a9..94a6dd545 100644 --- a/searx/engines/vimeo.py +++ b/searx/engines/vimeo.py @@ -1,7 +1,7 @@ from urllib import urlencode from HTMLParser import HTMLParser from lxml import html -from xpath import extract_text +from searx.engines.xpath import extract_text from dateutil import parser base_url = 'http://vimeo.com' diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index c6cabb58a..f89741839 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -47,8 +47,11 @@ def response(resp): dom = html.fromstring(resp.text) for result in dom.xpath(results_xpath): - url = parse_url(extract_url(result.xpath(url_xpath), search_url)) - title = extract_text(result.xpath(title_xpath)[0]) + try: + url = parse_url(extract_url(result.xpath(url_xpath), search_url)) + title = extract_text(result.xpath(title_xpath)[0]) + except: + continue content = extract_text(result.xpath(content_xpath)[0]) results.append({'url': url, 'title': title, 'content': content}) diff --git a/searx/settings.yml b/searx/settings.yml index e9ba2179e..69d182d1d 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -112,13 +112,14 @@ engines: # shortcut : unc # url : https://uncyclopedia.wikia.com/ - - name : urbandictionary - engine : xpath - search_url : http://www.urbandictionary.com/define.php?term={query} - url_xpath : //div[@class="word"]//a/@href - title_xpath : //div[@class="word"]//a - content_xpath : //div[@class="definition"] - shortcut : ud +# tmp suspended - too slow, too many errors +# - name : urbandictionary +# engine : xpath +# search_url : http://www.urbandictionary.com/define.php?term={query} +# url_xpath : //div[@class="word"]//a/@href +# title_xpath : //div[@class="word"]//a +# content_xpath : //div[@class="definition"] +# shortcut : ud - name : yahoo engine : yahoo |