diff options
| -rw-r--r-- | searx/engines/arxiv.py | 1 | ||||
| -rw-r--r-- | searx/engines/deviantart.py | 47 | ||||
| -rw-r--r-- | searx/engines/digg.py | 36 | ||||
| -rw-r--r-- | searx/engines/duckduckgo.py | 35 | ||||
| -rw-r--r-- | searx/engines/gigablast.py | 7 | ||||
| -rw-r--r-- | searx/engines/startpage.py | 26 | ||||
| -rw-r--r-- | searx/engines/www1x.py | 35 | ||||
| -rw-r--r-- | searx/exceptions.py | 1 | ||||
| -rw-r--r-- | searx/plugins/tracker_url_remover.py | 26 | ||||
| -rw-r--r-- | searx/query.py | 2 | ||||
| -rw-r--r-- | searx/settings.yml | 17 | ||||
| -rw-r--r-- | searx/utils.py | 5 | ||||
| -rw-r--r-- | tests/unit/engines/test_deviantart.py | 71 | ||||
| -rw-r--r-- | tests/unit/engines/test_digg.py | 85 | ||||
| -rw-r--r-- | tests/unit/engines/test_startpage.py | 123 | ||||
| -rw-r--r-- | tests/unit/engines/test_www1x.py | 43 |
16 files changed, 144 insertions, 416 deletions
diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py index 182861892..e3c871d17 100644 --- a/searx/engines/arxiv.py +++ b/searx/engines/arxiv.py @@ -17,6 +17,7 @@ from searx.url_utils import urlencode categories = ['science'] +paging = True base_url = 'http://export.arxiv.org/api/query?search_query=all:'\ + '{query}&start={offset}&max_results={number_of_results}' diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py index bb85c6dc5..a0e27e622 100644 --- a/searx/engines/deviantart.py +++ b/searx/engines/deviantart.py @@ -24,7 +24,7 @@ time_range_support = True # search-url base_url = 'https://www.deviantart.com/' -search_url = base_url + 'browse/all/?offset={offset}&{query}' +search_url = base_url + 'search?page={page}&{query}' time_range_url = '&order={range}' time_range_dict = {'day': 11, @@ -37,9 +37,7 @@ def request(query, params): if params['time_range'] and params['time_range'] not in time_range_dict: return params - offset = (params['pageno'] - 1) * 24 - - params['url'] = search_url.format(offset=offset, + params['url'] = search_url.format(page=params['pageno'], query=urlencode({'q': query})) if params['time_range'] in time_range_dict: params['url'] += time_range_url.format(range=time_range_dict[params['time_range']]) @@ -57,28 +55,27 @@ def response(resp): dom = html.fromstring(resp.text) - regex = re.compile(r'\/200H\/') - # parse results - for result in dom.xpath('.//span[@class="thumb wide"]'): - link = result.xpath('.//a[@class="torpedo-thumb-link"]')[0] - url = link.attrib.get('href') - title = extract_text(result.xpath('.//span[@class="title"]')) - thumbnail_src = link.xpath('.//img')[0].attrib.get('src') - img_src = regex.sub('/', thumbnail_src) - - # http to https, remove domain sharding - thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src) - thumbnail_src = re.sub(r"http://", "https://", thumbnail_src) - - url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url) - - # append result - results.append({'url': url, - 'title': title, - 'img_src': img_src, - 'thumbnail_src': thumbnail_src, - 'template': 'images.html'}) + for row in dom.xpath('//div[contains(@data-hook, "content_row")]'): + for result in row.xpath('./div'): + link = result.xpath('.//a[@data-hook="deviation_link"]')[0] + url = link.attrib.get('href') + title = link.attrib.get('title') + thumbnail_src = result.xpath('.//img')[0].attrib.get('src') + img_src = thumbnail_src + + # http to https, remove domain sharding + thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src) + thumbnail_src = re.sub(r"http://", "https://", thumbnail_src) + + url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url) + + # append result + results.append({'url': url, + 'title': title, + 'img_src': img_src, + 'thumbnail_src': thumbnail_src, + 'template': 'images.html'}) # return results return results diff --git a/searx/engines/digg.py b/searx/engines/digg.py index 4369ccb84..073410eb0 100644 --- a/searx/engines/digg.py +++ b/searx/engines/digg.py @@ -15,7 +15,8 @@ import string from dateutil import parser from json import loads from lxml import html -from searx.url_utils import quote_plus +from searx.url_utils import urlencode +from datetime import datetime # engine dependent config categories = ['news', 'social media'] @@ -23,7 +24,7 @@ paging = True # search-url base_url = 'https://digg.com/' -search_url = base_url + 'api/search/{query}.json?position={position}&format=html' +search_url = base_url + 'api/search/?{query}&from={position}&size=20&format=html' # specific xpath variables results_xpath = '//article' @@ -38,9 +39,9 @@ digg_cookie_chars = string.ascii_uppercase + string.ascii_lowercase +\ # do search-request def request(query, params): - offset = (params['pageno'] - 1) * 10 + offset = (params['pageno'] - 1) * 20 params['url'] = search_url.format(position=offset, - query=quote_plus(query)) + query=urlencode({'q': query})) params['cookies']['frontend.auid'] = ''.join(random.choice( digg_cookie_chars) for _ in range(22)) return params @@ -52,30 +53,17 @@ def response(resp): search_result = loads(resp.text) - if 'html' not in search_result or search_result['html'] == '': - return results - - dom = html.fromstring(search_result['html']) - # parse results - for result in dom.xpath(results_xpath): - url = result.attrib.get('data-contenturl') - thumbnail = result.xpath('.//img')[0].attrib.get('src') - title = ''.join(result.xpath(title_xpath)) - content = ''.join(result.xpath(content_xpath)) - pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime') - publishedDate = parser.parse(pubdate) - - # http to https - thumbnail = thumbnail.replace("http://static.digg.com", "https://static.digg.com") + for result in search_result['mapped']: + published = datetime.strptime(result['created']['ISO'], "%Y-%m-%d %H:%M:%S") # append result - results.append({'url': url, - 'title': title, - 'content': content, + results.append({'url': result['url'], + 'title': result['title'], + 'content': result['excerpt'], 'template': 'videos.html', - 'publishedDate': publishedDate, - 'thumbnail': thumbnail}) + 'publishedDate': published, + 'thumbnail': result['images']['thumbImage']}) # return results return results diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index fb8f523ac..e77ef0126 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -65,21 +65,36 @@ def get_region_code(lang, lang_list=[]): def request(query, params): - if params['time_range'] and params['time_range'] not in time_range_dict: + if params['time_range'] not in (None, 'None', '') and params['time_range'] not in time_range_dict: return params offset = (params['pageno'] - 1) * 30 region_code = get_region_code(params['language'], supported_languages) - if region_code: - params['url'] = url.format( - query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset) + params['url'] = 'https://duckduckgo.com/html/' + if params['pageno'] > 1: + params['method'] = 'POST' + params['data']['q'] = query + params['data']['s'] = offset + params['data']['dc'] = 30 + params['data']['nextParams'] = '' + params['data']['v'] = 'l' + params['data']['o'] = 'json' + params['data']['api'] = '/d.js' + if params['time_range'] in time_range_dict: + params['data']['df'] = time_range_dict[params['time_range']] + if region_code: + params['data']['kl'] = region_code else: - params['url'] = url.format( - query=urlencode({'q': query}), offset=offset, dc_param=offset) + if region_code: + params['url'] = url.format( + query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset) + else: + params['url'] = url.format( + query=urlencode({'q': query}), offset=offset, dc_param=offset) - if params['time_range'] in time_range_dict: - params['url'] += time_range_url.format(range=time_range_dict[params['time_range']]) + if params['time_range'] in time_range_dict: + params['url'] += time_range_url.format(range=time_range_dict[params['time_range']]) return params @@ -91,7 +106,9 @@ def response(resp): doc = fromstring(resp.text) # parse results - for r in doc.xpath(result_xpath): + for i, r in enumerate(doc.xpath(result_xpath)): + if i >= 30: + break try: res_url = r.xpath(url_xpath)[-1] except: diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index a6aa5d718..6b0402233 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -35,8 +35,8 @@ search_string = 'search?{query}'\ '&ff={safesearch}'\ '&rxiec={rxieu}'\ '&ulse={ulse}'\ - '&rand={rxikd}' # current unix timestamp - + '&rand={rxikd}'\ + '&dbez={dbez}' # specific xpath variables results_xpath = '//response//result' url_xpath = './/url' @@ -70,7 +70,8 @@ def request(query, params): rxieu=random.randint(1000000000, 9999999999), ulse=random.randint(100000000, 999999999), lang=language, - safesearch=safesearch) + safesearch=safesearch, + dbez=random.randint(100000000, 999999999)) params['url'] = base_url + search_path diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 6638f3d83..0f0ec6e18 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -15,6 +15,7 @@ from dateutil import parser from datetime import datetime, timedelta import re from searx.engines.xpath import extract_text +from searx.languages import language_codes # engine dependent config categories = ['general'] @@ -22,7 +23,7 @@ categories = ['general'] # (probably the parameter qid), require # storing of qid's between mulitble search-calls -# paging = False +paging = True language_support = True # search-url @@ -32,23 +33,32 @@ search_url = base_url + 'do/search' # specific xpath variables # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] # not ads: div[@class="result"] are the direct childs of div[@id="results"] -results_xpath = '//li[contains(@class, "search-result") and contains(@class, "search-item")]' -link_xpath = './/h3/a' -content_xpath = './p[@class="search-item__body"]' +results_xpath = '//div[@class="w-gl__result"]' +link_xpath = './/a[@class="w-gl__result-title"]' +content_xpath = './/p[@class="w-gl__description"]' # do search-request def request(query, params): - offset = (params['pageno'] - 1) * 10 params['url'] = search_url params['method'] = 'POST' - params['data'] = {'query': query, - 'startat': offset} + params['data'] = { + 'query': query, + 'page': params['pageno'], + 'cat': 'web', + 'cmd': 'process_search', + 'engine0': 'v1all', + } # set language if specified if params['language'] != 'all': - params['data']['with_language'] = ('lang_' + params['language'].split('-')[0]) + language = 'english' + for lc, _, _, lang in language_codes: + if lc == params['language']: + language = lang + params['data']['language'] = language + params['data']['lui'] = language return params diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py index 508803240..f1154b16d 100644 --- a/searx/engines/www1x.py +++ b/searx/engines/www1x.py @@ -11,8 +11,8 @@ """ from lxml import html -import re from searx.url_utils import urlencode, urljoin +from searx.engines.xpath import extract_text # engine dependent config categories = ['images'] @@ -34,41 +34,18 @@ def request(query, params): def response(resp): results = [] - # get links from result-text - regex = re.compile('(</a>|<a)') - results_parts = re.split(regex, resp.text) - - cur_element = '' - - # iterate over link parts - for result_part in results_parts: + dom = html.fromstring(resp.text) + for res in dom.xpath('//div[@class="List-item MainListing"]'): # processed start and end of link - if result_part == '<a': - cur_element = result_part - continue - elif result_part != '</a>': - cur_element += result_part - continue - - cur_element += result_part - - # fix xml-error - cur_element = cur_element.replace('"></a>', '"/></a>') - - dom = html.fromstring(cur_element) - link = dom.xpath('//a')[0] + link = res.xpath('//a')[0] url = urljoin(base_url, link.attrib.get('href')) - title = link.attrib.get('title', '') + title = extract_text(link) - thumbnail_src = urljoin(base_url, link.xpath('.//img')[0].attrib['src']) + thumbnail_src = urljoin(base_url, res.xpath('.//img')[0].attrib['src']) # TODO: get image with higher resolution img_src = thumbnail_src - # check if url is showing to a photo - if '/photo/' not in url: - continue - # append result results.append({'url': url, 'title': title, diff --git a/searx/exceptions.py b/searx/exceptions.py index c605ddcab..0175acfa3 100644 --- a/searx/exceptions.py +++ b/searx/exceptions.py @@ -28,5 +28,6 @@ class SearxParameterException(SearxException): else: message = 'Invalid value "' + value + '" for parameter ' + name super(SearxParameterException, self).__init__(message) + self.message = message self.parameter_name = name self.parameter_value = value diff --git a/searx/plugins/tracker_url_remover.py b/searx/plugins/tracker_url_remover.py index 630c8a638..8cc063bba 100644 --- a/searx/plugins/tracker_url_remover.py +++ b/searx/plugins/tracker_url_remover.py @@ -17,10 +17,10 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >. from flask_babel import gettext import re -from searx.url_utils import urlunparse +from searx.url_utils import urlunparse, parse_qsl, urlencode -regexes = {re.compile(r'utm_[^&]+&?'), - re.compile(r'(wkey|wemail)[^&]+&?'), +regexes = {re.compile(r'utm_[^&]+'), + re.compile(r'(wkey|wemail)[^&]*'), re.compile(r'&$')} name = gettext('Tracker URL remover') @@ -34,12 +34,18 @@ def on_result(request, search, result): if query == "": return True - - for reg in regexes: - query = reg.sub('', query) - - if query != result['parsed_url'].query: - result['parsed_url'] = result['parsed_url']._replace(query=query) - result['url'] = urlunparse(result['parsed_url']) + parsed_query = parse_qsl(query) + + changed = False + for i, (param_name, _) in enumerate(list(parsed_query)): + for reg in regexes: + if reg.match(param_name): + parsed_query.pop(i) + changed = True + break + + if changed: + result['parsed_url'] = result['parsed_url']._replace(query=urlencode(parsed_query)) + result['url'] = urlunparse(result['parsed_url']) return True diff --git a/searx/query.py b/searx/query.py index 382aed871..c4002bd31 100644 --- a/searx/query.py +++ b/searx/query.py @@ -184,7 +184,7 @@ class SearchQuery(object): self.lang = lang self.safesearch = safesearch self.pageno = pageno - self.time_range = time_range + self.time_range = None if time_range in ('', 'None', None) else time_range self.timeout_limit = timeout_limit def __str__(self): diff --git a/searx/settings.yml b/searx/settings.yml index cf2b13e08..835fbe5f6 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -161,11 +161,12 @@ engines: weight : 2 disabled : True - - name : digbt - engine : digbt - shortcut : dbt - timeout : 6.0 - disabled : True +# cloudflare protected +# - name : digbt +# engine : digbt +# shortcut : dbt +# timeout : 6.0 +# disabled : True - name : digg engine : digg @@ -703,9 +704,9 @@ engines: shortcut: vo categories: social media search_url : https://searchvoat.co/?t={query} - url_xpath : //div[@class="entry"]/p/a[contains(@class, "title")]/@href - title_xpath : //div[@class="entry"]/p/a[contains(@class, "title")] - content_xpath : //div[@class="entry"]/p/span[@class="domain"]/a/text() + url_xpath : //div[@class="entry"]//p[@class="title"]/a/@href + title_xpath : //div[@class="entry"]//p[@class="title"]/a/text() + content_xpath : //div[@class="entry"]//span[@class="domain"]/a/text() timeout : 10.0 disabled : True diff --git a/searx/utils.py b/searx/utils.py index d88bc9897..eb5da2fa7 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -308,14 +308,15 @@ def int_or_zero(num): def is_valid_lang(lang): is_abbr = (len(lang) == 2) + lang = lang.lower().decode('utf-8') if is_abbr: for l in language_codes: - if l[0][:2] == lang.lower(): + if l[0][:2] == lang: return (True, l[0][:2], l[3].lower()) return False else: for l in language_codes: - if l[1].lower() == lang.lower(): + if l[1].lower() == lang or l[3].lower() == lang: return (True, l[0][:2], l[3].lower()) return False diff --git a/tests/unit/engines/test_deviantart.py b/tests/unit/engines/test_deviantart.py index bd2cf182f..a31151037 100644 --- a/tests/unit/engines/test_deviantart.py +++ b/tests/unit/engines/test_deviantart.py @@ -22,74 +22,3 @@ class TestDeviantartEngine(SearxTestCase): dicto['time_range'] = 'year' params = deviantart.request(query, dicto) self.assertEqual({}, params['url']) - - def test_response(self): - self.assertRaises(AttributeError, deviantart.response, None) - self.assertRaises(AttributeError, deviantart.response, []) - self.assertRaises(AttributeError, deviantart.response, '') - self.assertRaises(AttributeError, deviantart.response, '[]') - - response = mock.Mock(text='<html></html>') - self.assertEqual(deviantart.response(response), []) - - response = mock.Mock(status_code=302) - self.assertEqual(deviantart.response(response), []) - - html = """ - <div id="page-1-results" class="page-results results-page-thumb torpedo-container"> - <span class="thumb wide" href="http://amai911.deviantart.com/art/Horse-195212845" - data-super-full-width="900" data-super-full-height="600"> - <a class="torpedo-thumb-link" href="https://url.of.image"> - <img data-sigil="torpedo-img" src="https://url.of.thumbnail" /> - </a> - <span class="info"><span class="title-wrap"><span class="title">Title of image</span></span> - </div> - """ - response = mock.Mock(text=html) - results = deviantart.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 1) - self.assertEqual(results[0]['title'], 'Title of image') - self.assertEqual(results[0]['url'], 'https://url.of.image') - self.assertNotIn('content', results[0]) - self.assertEqual(results[0]['thumbnail_src'], 'https://url.of.thumbnail') - - html = """ - <span class="tt-fh-tc" style="width: 202px;"> - <span class="tt-bb" style="width: 202px;"> - </span> - <span class="shadow"> - <a class="thumb" href="http://url.of.result/2nd.part.of.url" - title="Behoimi BE Animation Test by test-0, Jan 4, - 2010 in Digital Art > Animation"> <i></i> - <img width="200" height="200" alt="Test" - src="http://url.of.thumbnail" data-src="http://th08.deviantart.net/test.jpg"> - </a> - </span> - <!-- ^TTT --> - </span> - <span class="details"> - <a href="http://test-0.deviantart.com/art/Test" class="t" - title="Behoimi BE Animation Test by test-0, Jan 4, 2010"> - <span class="tt-fh-oe">Title of image</span> </a> - <small> - <span class="category"> - <span class="age"> - 5 years ago - </span> - in <a title="Behoimi BE Animation Test by test-0, Jan 4, 2010" - href="http://www.deviantart.com/browse/all/digitalart/animation/">Animation</a> - </span> - <div class="commentcount"> - <a href="http://test-0.deviantart.com/art/Test#comments"> - <span class="iconcommentsstats"></span>9 Comments</a> - </div> - <a class="mlt-link" href="http://www.deviantart.com/morelikethis/149167425"> - <span class="mlt-icon"></span> <span class="mlt-text">More Like This</span> </a> - </span> - </small> <!-- TTT$ --> - """ - response = mock.Mock(text=html) - results = deviantart.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 0) diff --git a/tests/unit/engines/test_digg.py b/tests/unit/engines/test_digg.py index 6e7c9cc99..8bc4c67c2 100644 --- a/tests/unit/engines/test_digg.py +++ b/tests/unit/engines/test_digg.py @@ -14,88 +14,3 @@ class TestDiggEngine(SearxTestCase): self.assertIn('url', params) self.assertIn(query, params['url']) self.assertIn('digg.com', params['url']) - - def test_response(self): - self.assertRaises(AttributeError, digg.response, None) - self.assertRaises(AttributeError, digg.response, []) - self.assertRaises(AttributeError, digg.response, '') - self.assertRaises(AttributeError, digg.response, '[]') - - response = mock.Mock(text='{}') - self.assertEqual(digg.response(response), []) - - response = mock.Mock(text='{"data": []}') - self.assertEqual(digg.response(response), []) - - json = """ - { - "status": "ok", - "num": 10, - "next_position": 20, - "html": "<article itemscope itemtype=\\"http://schema.org/Article\\" - class=\\"story-container digg-story-el hentry entry story-1sRANah col-1\\" - data-content-id=\\"1sRANah\\" data-contenturl=\\"http://url.of.link\\" - data-position=\\"0\\" data-diggs=\\"24\\" data-tweets=\\"69\\" - data-digg-score=\\"1190\\"> <div class=\\"story-image story-image-thumb\\"> - <a data-position=\\"0\\" data-content-id=\\"1sRANah\\" - class=\\"story-link\\" href=\\"http://www.thedailybeast.com/\\" - target=\\"_blank\\"><img class=\\"story-image-img\\" - src=\\"http://url.of.image.jpeg\\" width=\\"312\\" height=\\"170\\" - alt=\\"\\" /> </a> </div> <div class=\\"story-content\\"><header - class=\\"story-header\\"> <div itemprop=\\"alternativeHeadline\\" - class=\\"story-kicker\\" >Kicker</div> <h2 itemprop=\\"headline\\" - class=\\"story-title entry-title\\"><a class=\\"story-title-link story-link\\" - rel=\\"bookmark\\" itemprop=\\"url\\" href=\\"http://www.thedailybeast.com/\\" - target=\\"_blank\\">Title of article</h2> <div class=\\"story-meta\\"> - <div class=\\"story-score \\"> - <div class=\\"story-score-diggscore diggscore-1sRANah\\">1190</div> - <div class=\\"story-score-details\\"> <div class=\\"arrow\\"></div> - <ul class=\\"story-score-details-list\\"> <li - class=\\"story-score-detail story-score-diggs\\"><span - class=\\"label\\">Diggs:</span> <span class=\\"count diggs-1sRANah\\">24</span> - </li> <li class=\\"story-score-detail story-score-twitter\\"><span - class=\\"label\\">Tweets:</span> <span class=\\"count tweets-1sRANah\\">69</span> - </li> <li class=\\"story-score-detail story-score-facebook\\"><span - class=\\"label\\">Facebook Shares:</span> <span - class=\\"count fb_shares-1sRANah\\">1097</span></li> </ul> </div> </div> - <span class=\\"story-meta-item story-source\\"> <a - itemprop=\\"publisher copyrightHolder sourceOrganization provider\\" - class=\\"story-meta-item-link story-source-link\\" - href=\\"/source/thedailybeast.com\\">The Daily Beast </a> </span> - <span class=\\"story-meta-item story-tag first-tag\\"> <a - itemprop=\\"keywords\\" rel=\\"tag\\" - class=\\"story-meta-item-link story-tag-link\\" href=\\"/tag/news\\">News</a> - </span> <abbr class=\\"published story-meta-item story-timestamp\\" - title=\\"2014-10-18 14:53:45\\"> <time datetime=\\"2014-10-18 14:53:45\\">18 Oct 2014</time> - </abbr> </div> </header> </div> <ul class=\\"story-actions\\"> <li - class=\\"story-action story-action-digg btn-story-action-container\\"> - <a class=\\"target digg-1sRANah\\" href=\\"#\\">Digg</a></li> <li - class=\\"story-action story-action-save btn-story-action-container\\"> - <a class=\\"target save-1sRANah\\" href=\\"#\\">Save</a></li> <li - class=\\"story-action story-action-share\\"><a - class=\\"target share-facebook\\" href=\\"https://www.facebook.com/\\">Facebook</a></li> - <li class=\\"story-action story-action-share\\"><a class=\\"target share-twitter\\" - href=\\"https://twitter.com/\\">Twitter</a></li> </ul> </article>" - } - """ - json = json.replace('\r\n', '').replace('\n', '').replace('\r', '') - response = mock.Mock(text=json) - results = digg.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 1) - self.assertEqual(results[0]['title'], 'Title of article') - self.assertEqual(results[0]['url'], 'http://url.of.link') - self.assertEqual(results[0]['thumbnail'], 'http://url.of.image.jpeg') - self.assertEqual(results[0]['content'], '') - - json = """ - { - "status": "error", - "num": 10, - "next_position": 20 - } - """ - response = mock.Mock(text=json) - results = digg.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 0) diff --git a/tests/unit/engines/test_startpage.py b/tests/unit/engines/test_startpage.py index a4704ce22..ac4454738 100644 --- a/tests/unit/engines/test_startpage.py +++ b/tests/unit/engines/test_startpage.py @@ -18,12 +18,9 @@ class TestStartpageEngine(SearxTestCase): self.assertIn('data', params) self.assertIn('query', params['data']) self.assertIn(query, params['data']['query']) - self.assertIn('with_language', params['data']) - self.assertIn('lang_fr', params['data']['with_language']) dicto['language'] = 'all' params = startpage.request(query, dicto) - self.assertNotIn('with_language', params['data']) def test_response(self): self.assertRaises(AttributeError, startpage.response, None) @@ -35,33 +32,32 @@ class TestStartpageEngine(SearxTestCase): self.assertEqual(startpage.response(response), []) html = """ - <li class="search-result search-item"> - <h3> - <a href='http://this.should.be.the.link/' id='title_2' name='title_2' > - This should be the title +<div class="w-gl__result"> + <a + class="w-gl__result-title" + href="http://this.should.be.the.link/" + data-onw="1" + rel="noopener noreferrer" + target="_blank"> + + <h3>This should be the title</h3> </a> - <span id='title_stars_2' name='title_stars_2'> </span> - </h3> - <p class="search-item__body"> - This should be the content. - </p> - <p> - <span class='url'>www.speed<b>test</b>.net/fr/ - </span> - - - <A class="proxy" id="proxy_link" HREF="https://ixquick-proxy.com/do/spg/proxy?ep=&edata=&ek=&ekdata=" - class='proxy'> - Navigation avec Ixquick Proxy - </A> - - - <A HREF="https://ixquick-proxy.com/do/spg/highlight.pl?l=francais&c=hf&cat=web&q=test&rl=NONE&rid= - &hlq=https://startpage.com/do/search&mtabp=-1&mtcmd=process_search&mtlanguage=francais&mtengine0= - &mtcat=web&u=http:%2F%2Fwww.speedtest.net%2Ffr%2F" class='proxy'> - Mis en surbrillance - </A> - </p> - </li> - """ + <div class="w-gl__result-second-line-container"> + <div class="w-gl__result-url-container"> + <a + class="w-gl__result-url" + href="http://this.should.be.the.link/" + rel="noopener noreferrer" + target="_blank">https://www.cnbc.com/2019/10/12/dj-zedd-banned-in-china-for-liking-a-south-park-tweet.html</a> + </div> + <a + class="w-gl__anonymous-view-url" + href="https://eu-browse.startpage.com/do/proxy?ep=556b554d576b6f5054554546423167764b5445616455554d5342675441774659495246304848774f5267385453304941486b5949546c63704e33774f526b705544565647516d4a61554246304847674f4a556f6957415a4f436b455042426b6b4f7a64535a52784a56514a4f45307743446c567250445a4f4c52514e5677554e46776b4b545563704c7931554c5167465467644f42464d4f4255426f4d693152624634525741305845526c595746636b626d67494e42705743466c515252634f4267456e597a7346596b7856435134465345634f564249794b5752785643315863546769515773764a5163494c5877505246315865456f5141426b4f41774167596d6c5a4e30395758773442465251495677596c624770665a6b786344466b4151455663425249794d6a78525a55554157516f4342556766526b51314b57514e&ek=4q58686o5047786n6343527259445247576p6o38&ekdata=84abd523dc13cba5c65164d04d7d7263" + target="_blank">Anonymous View</a> + </div> + <p class="w-gl__description">This should be the content.</p> + </div> + """ # noqa response = mock.Mock(text=html.encode('utf-8')) results = startpage.response(response) self.assertEqual(type(results), list) @@ -69,72 +65,3 @@ class TestStartpageEngine(SearxTestCase): self.assertEqual(results[0]['title'], 'This should be the title') self.assertEqual(results[0]['url'], 'http://this.should.be.the.link/') self.assertEqual(results[0]['content'], 'This should be the content.') - - html = """ - <li class="search-result search-item"> - <h3> - <a href='http://www.google.com/aclk?sa=l&ai=C' id='title_2' name='title_2' > - This should be the title - </a> - <span id='title_stars_2' name='title_stars_2'> </span> - </h3> - <p class="search-item__body"> - This should be the content. - </p> - <p> - <span class='url'>www.speed<b>test</b>.net/fr/ - </span> - - - <A class="proxy" id="proxy_link" HREF="https://ixquick-proxy.com/do/spg/proxy?ep=&edata=&ek=&ekdata=" - class='proxy'> - Navigation avec Ixquick Proxy - </A> - - - <A HREF="https://ixquick-proxy.com/do/spg/highlight.pl?l=francais&c=hf&cat=web&q=test&rl=NONE&rid= - &hlq=https://startpage.com/do/search&mtabp=-1&mtcmd=process_search&mtlanguage=francais&mtengine0= - &mtcat=web&u=http:%2F%2Fwww.speedtest.net%2Ffr%2F" class='proxy'> - Mis en surbrillance - </A> - </p> - </li> - <li class="search-result search-item"> - <h3> - <span id='title_stars_2' name='title_stars_2'> </span> - </h3> - <p class="search-item__body"> - This should be the content. - </p> - <p> - <span class='url'>www.speed<b>test</b>.net/fr/ - </span> - </p> - </li> - <li class="search-result search-item"> - <h3> - <a href='http://this.should.be.the.link/' id='title_2' name='title_2' > - This should be the title - </a> - <span id='title_stars_2' name='title_stars_2'> </span> - </h3> - <p> - <span class='url'>www.speed<b>test</b>.net/fr/ - </span> - - - <A class="proxy" id="proxy_link" HREF="https://ixquick-proxy.com/do/spg/proxy?ep=&edata=&ek=&ekdata=" - class='proxy'> - Navigation avec Ixquick Proxy - </A> - - - <A HREF="https://ixquick-proxy.com/do/spg/highlight.pl?l=francais&c=hf&cat=web&q=test&rl=NONE&rid= - &hlq=https://startpage.com/do/search&mtabp=-1&mtcmd=process_search&mtlanguage=francais&mtengine0= - &mtcat=web&u=http:%2F%2Fwww.speedtest.net%2Ffr%2F" class='proxy'> - Mis en surbrillance - </A> - </p> - </li> - """ - response = mock.Mock(text=html.encode('utf-8')) - results = startpage.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 1) - self.assertEqual(results[0]['content'], '') diff --git a/tests/unit/engines/test_www1x.py b/tests/unit/engines/test_www1x.py index 9df8de6bf..40f5200fd 100644 --- a/tests/unit/engines/test_www1x.py +++ b/tests/unit/engines/test_www1x.py @@ -12,46 +12,3 @@ class TestWww1xEngine(SearxTestCase): self.assertTrue('url' in params) self.assertTrue(query in params['url']) self.assertTrue('1x.com' in params['url']) - - def test_response(self): - self.assertRaises(AttributeError, www1x.response, None) - self.assertRaises(AttributeError, www1x.response, []) - self.assertRaises(AttributeError, www1x.response, '') - self.assertRaises(AttributeError, www1x.response, '[]') - - response = mock.Mock(text='<html></html>') - self.assertEqual(www1x.response(response), []) - html = """ - <?xml version="1.0" encoding="UTF-8"?><!DOCTYPE characters - [ - <!ELEMENT characters (character*) > - <!ELEMENT character (#PCDATA ) > - - <!ENTITY iexcl "¡" > - <!ENTITY cent "¢" > - <!ENTITY pound "£" > - ] - ><root><searchresult><![CDATA[<table border="0" cellpadding="0" cellspacing="0" width="100%"> - <tr> - <td style="min-width: 220px;" valign="top"> - <div style="font-size: 30px; margin: 0px 0px 20px 0px;">Photos</div> - <div> - <a href="/photo/123456" class="dynamiclink"> -<img border="0" class="searchresult" src="/images/user/testimage-123456.jpg" style="width: 125px; height: 120px;"> - </a> - <a title="sjoerd lammers street photography" href="/member/sjoerdlammers" class="dynamiclink"> -<img border="0" class="searchresult" src="/images/profile/60c48b394c677d2fa4d9e7d263aabf44-square.jpg"> - </a> - </div> - </td> - </table> - ]]></searchresult></root> - """ - response = mock.Mock(text=html) - results = www1x.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 1) - self.assertEqual(results[0]['url'], 'https://1x.com/photo/123456') - self.assertEqual(results[0]['thumbnail_src'], 'https://1x.com/images/user/testimage-123456.jpg') - self.assertEqual(results[0]['content'], '') - self.assertEqual(results[0]['template'], 'images.html') |