| field | value | date |
|---|---|---|
| author | Mathieu Brunot <mathieu.brunot@monogramm.io> | 2019-10-16 19:30:02 +0200 |
| committer | GitHub <noreply@github.com> | 2019-10-16 19:30:02 +0200 |
| commit | a51b2b6c20c1346748c09aec051d78f6822c580c (patch) | |
| tree | 5f08fadc14de64e86c5582ebbdc928e4abe7475b /searx/engines | |
| parent | 4d17d453bf99641797dcbfa1f1a35dd5b21dcab4 (diff) | |
| parent | 12f42d1572311a56401637ac5c7dc66008eb979c (diff) | |
Merge branch 'master' into feature/accessibility
Diffstat (limited to 'searx/engines')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | searx/engines/__init__.py | 27 |
| -rw-r--r-- | searx/engines/arxiv.py | 1 |
| -rw-r--r-- | searx/engines/deviantart.py | 47 |
| -rw-r--r-- | searx/engines/digg.py | 36 |
| -rw-r--r-- | searx/engines/duckduckgo.py | 35 |
| -rw-r--r-- | searx/engines/gigablast.py | 7 |
| -rw-r--r-- | searx/engines/soundcloud.py | 2 |
| -rw-r--r-- | searx/engines/startpage.py | 26 |
| -rw-r--r-- | searx/engines/wolframalpha_noapi.py | 2 |
| -rw-r--r-- | searx/engines/www1x.py | 35 |
10 files changed, 106 insertions, 112 deletions
```diff
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index a10b1ccd9..2393f52b6 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -27,7 +27,7 @@ from json import loads
 from requests import get
 from searx import settings
 from searx import logger
-from searx.utils import load_module, match_language
+from searx.utils import load_module, match_language, get_engine_from_settings
 
 logger = logger.getChild('engines')
 
@@ -53,7 +53,8 @@ engine_default_args = {'paging': False,
                        'disabled': False,
                        'suspend_end_time': 0,
                        'continuous_errors': 0,
-                       'time_range_support': False}
+                       'time_range_support': False,
+                       'offline': False}
 
 
 def load_engine(engine_data):
@@ -128,14 +129,16 @@ def load_engine(engine_data):
     engine.stats = {
         'result_count': 0,
         'search_count': 0,
-        'page_load_time': 0,
-        'page_load_count': 0,
         'engine_time': 0,
         'engine_time_count': 0,
         'score_count': 0,
         'errors': 0
     }
 
+    if not engine.offline:
+        engine.stats['page_load_time'] = 0
+        engine.stats['page_load_count'] = 0
+
     for category_name in engine.categories:
         categories.setdefault(category_name, []).append(engine)
 
@@ -173,11 +176,6 @@ def get_engines_stats():
         results_num = \
             engine.stats['result_count'] / float(engine.stats['search_count'])
 
-        if engine.stats['page_load_count'] != 0:
-            load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count'])  # noqa
-        else:
-            load_times = 0
-
         if engine.stats['engine_time_count'] != 0:
             this_engine_time = engine.stats['engine_time'] / float(engine.stats['engine_time_count'])  # noqa
         else:
@@ -189,14 +187,19 @@ def get_engines_stats():
         else:
             score = score_per_result = 0.0
 
-        max_pageload = max(load_times, max_pageload)
+        if not engine.offline:
+            load_times = 0
+            if engine.stats['page_load_count'] != 0:
+                load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count'])  # noqa
+            max_pageload = max(load_times, max_pageload)
+            pageloads.append({'avg': load_times, 'name': engine.name})
+
         max_engine_times = max(this_engine_time, max_engine_times)
         max_results = max(results_num, max_results)
         max_score = max(score, max_score)
         max_score_per_result = max(score_per_result, max_score_per_result)
         max_errors = max(max_errors, engine.stats['errors'])
 
-        pageloads.append({'avg': load_times, 'name': engine.name})
         engine_times.append({'avg': this_engine_time, 'name': engine.name})
         results.append({'avg': results_num, 'name': engine.name})
         scores.append({'avg': score, 'name': engine.name})
@@ -255,7 +258,7 @@ def initialize_engines(engine_list):
     load_engines(engine_list)
 
     def engine_init(engine_name, init_fn):
-        init_fn(get_engine_from_settings(engine_name))
+        init_fn(get_engine_from_settings(engine_name))
         logger.debug('%s engine: Initialized', engine_name)
 
     for engine_name, engine in engines.items():
```
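The `__init__.py` hunks above do two related things: page-load statistics are now created and aggregated only for engines that are not marked `offline`, and every engine's `init` hook is now called with that engine's own block from the settings file. A minimal sketch of an engine-side `init` under the new convention; the `api_key` settings key is a hypothetical illustration, not something this patch adds:

```python
# Sketch only: an engine module consuming the new init signature.
# `engine_settings` is this engine's dict from settings.yml, as passed
# in via get_engine_from_settings(engine_name) in engines/__init__.py.
api_key = None


def init(engine_settings=None):
    global api_key
    # 'api_key' is a hypothetical settings key used for illustration
    if engine_settings and 'api_key' in engine_settings:
        api_key = engine_settings['api_key']
```

This also explains the `init(engine_settings=None)` signature changes to soundcloud.py and wolframalpha_noapi.py further down: existing engines must accept the new argument even if they ignore it.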
```diff
diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py
index 182861892..e3c871d17 100644
--- a/searx/engines/arxiv.py
+++ b/searx/engines/arxiv.py
@@ -17,6 +17,7 @@ from searx.url_utils import urlencode
 
 categories = ['science']
+paging = True
 
 base_url = 'http://export.arxiv.org/api/query?search_query=all:'\
            + '{query}&start={offset}&max_results={number_of_results}'
diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py
index bb85c6dc5..a0e27e622 100644
--- a/searx/engines/deviantart.py
+++ b/searx/engines/deviantart.py
@@ -24,7 +24,7 @@ time_range_support = True
 
 # search-url
 base_url = 'https://www.deviantart.com/'
-search_url = base_url + 'browse/all/?offset={offset}&{query}'
+search_url = base_url + 'search?page={page}&{query}'
 time_range_url = '&order={range}'
 
 time_range_dict = {'day': 11,
@@ -37,9 +37,7 @@ def request(query, params):
     if params['time_range'] and params['time_range'] not in time_range_dict:
         return params
 
-    offset = (params['pageno'] - 1) * 24
-
-    params['url'] = search_url.format(offset=offset,
+    params['url'] = search_url.format(page=params['pageno'],
                                       query=urlencode({'q': query}))
     if params['time_range'] in time_range_dict:
         params['url'] += time_range_url.format(range=time_range_dict[params['time_range']])
@@ -57,28 +55,27 @@ def response(resp):
 
     dom = html.fromstring(resp.text)
 
-    regex = re.compile(r'\/200H\/')
-
     # parse results
-    for result in dom.xpath('.//span[@class="thumb wide"]'):
-        link = result.xpath('.//a[@class="torpedo-thumb-link"]')[0]
-        url = link.attrib.get('href')
-        title = extract_text(result.xpath('.//span[@class="title"]'))
-        thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
-        img_src = regex.sub('/', thumbnail_src)
-
-        # http to https, remove domain sharding
-        thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src)
-        thumbnail_src = re.sub(r"http://", "https://", thumbnail_src)
-
-        url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url)
-
-        # append result
-        results.append({'url': url,
-                        'title': title,
-                        'img_src': img_src,
-                        'thumbnail_src': thumbnail_src,
-                        'template': 'images.html'})
+    for row in dom.xpath('//div[contains(@data-hook, "content_row")]'):
+        for result in row.xpath('./div'):
+            link = result.xpath('.//a[@data-hook="deviation_link"]')[0]
+            url = link.attrib.get('href')
+            title = link.attrib.get('title')
+            thumbnail_src = result.xpath('.//img')[0].attrib.get('src')
+            img_src = thumbnail_src
+
+            # http to https, remove domain sharding
+            thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src)
+            thumbnail_src = re.sub(r"http://", "https://", thumbnail_src)
+
+            url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url)
+
+            # append result
+            results.append({'url': url,
+                            'title': title,
+                            'img_src': img_src,
+                            'thumbnail_src': thumbnail_src,
+                            'template': 'images.html'})
 
     # return results
     return results
```
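The deviantart rewrite switches from the offset-based `browse/all` endpoint to the paged `search` endpoint and parses the new row/column markup, while keeping the https and domain-sharding cleanup. A quick worked example of what those two `re.sub` calls do; the URL is invented for illustration:

```python
# Worked example of the thumbnail normalisation kept by the patch:
# shard hosts such as th03. or fc07. collapse onto th01, over https.
import re

src = 'http://th03.deviantart.net/fs71/200H/i/example.jpg'
src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", src)
src = re.sub(r"http://", "https://", src)
print(src)  # https://th01.deviantart.net/fs71/200H/i/example.jpg
```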
```diff
diff --git a/searx/engines/digg.py b/searx/engines/digg.py
index 4369ccb84..073410eb0 100644
--- a/searx/engines/digg.py
+++ b/searx/engines/digg.py
@@ -15,7 +15,8 @@ import string
 from dateutil import parser
 from json import loads
 from lxml import html
-from searx.url_utils import quote_plus
+from searx.url_utils import urlencode
+from datetime import datetime
 
 # engine dependent config
 categories = ['news', 'social media']
@@ -23,7 +24,7 @@ paging = True
 
 # search-url
 base_url = 'https://digg.com/'
-search_url = base_url + 'api/search/{query}.json?position={position}&format=html'
+search_url = base_url + 'api/search/?{query}&from={position}&size=20&format=html'
 
 # specific xpath variables
 results_xpath = '//article'
@@ -38,9 +39,9 @@ digg_cookie_chars = string.ascii_uppercase + string.ascii_lowercase +\
 
 # do search-request
 def request(query, params):
-    offset = (params['pageno'] - 1) * 10
+    offset = (params['pageno'] - 1) * 20
     params['url'] = search_url.format(position=offset,
-                                      query=quote_plus(query))
+                                      query=urlencode({'q': query}))
     params['cookies']['frontend.auid'] = ''.join(random.choice(
         digg_cookie_chars) for _ in range(22))
     return params
@@ -52,30 +53,17 @@ def response(resp):
 
     search_result = loads(resp.text)
 
-    if 'html' not in search_result or search_result['html'] == '':
-        return results
-
-    dom = html.fromstring(search_result['html'])
-
     # parse results
-    for result in dom.xpath(results_xpath):
-        url = result.attrib.get('data-contenturl')
-        thumbnail = result.xpath('.//img')[0].attrib.get('src')
-        title = ''.join(result.xpath(title_xpath))
-        content = ''.join(result.xpath(content_xpath))
-        pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
-        publishedDate = parser.parse(pubdate)
-
-        # http to https
-        thumbnail = thumbnail.replace("http://static.digg.com", "https://static.digg.com")
+    for result in search_result['mapped']:
+        published = datetime.strptime(result['created']['ISO'], "%Y-%m-%d %H:%M:%S")
 
         # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content,
+        results.append({'url': result['url'],
+                        'title': result['title'],
+                        'content': result['excerpt'],
                         'template': 'videos.html',
-                        'publishedDate': publishedDate,
-                        'thumbnail': thumbnail})
+                        'publishedDate': published,
+                        'thumbnail': result['images']['thumbImage']})
 
     # return results
     return results
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index fb8f523ac..e77ef0126 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -65,21 +65,36 @@ def get_region_code(lang, lang_list=[]):
 
 
 def request(query, params):
-    if params['time_range'] and params['time_range'] not in time_range_dict:
+    if params['time_range'] not in (None, 'None', '') and params['time_range'] not in time_range_dict:
         return params
 
     offset = (params['pageno'] - 1) * 30
 
     region_code = get_region_code(params['language'], supported_languages)
-    if region_code:
-        params['url'] = url.format(
-            query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
+    params['url'] = 'https://duckduckgo.com/html/'
+    if params['pageno'] > 1:
+        params['method'] = 'POST'
+        params['data']['q'] = query
+        params['data']['s'] = offset
+        params['data']['dc'] = 30
+        params['data']['nextParams'] = ''
+        params['data']['v'] = 'l'
+        params['data']['o'] = 'json'
+        params['data']['api'] = '/d.js'
+        if params['time_range'] in time_range_dict:
+            params['data']['df'] = time_range_dict[params['time_range']]
+        if region_code:
+            params['data']['kl'] = region_code
     else:
-        params['url'] = url.format(
-            query=urlencode({'q': query}), offset=offset, dc_param=offset)
-
-    if params['time_range'] in time_range_dict:
-        params['url'] += time_range_url.format(range=time_range_dict[params['time_range']])
+        if region_code:
+            params['url'] = url.format(
+                query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
+        else:
+            params['url'] = url.format(
+                query=urlencode({'q': query}), offset=offset, dc_param=offset)
+
+        if params['time_range'] in time_range_dict:
+            params['url'] += time_range_url.format(range=time_range_dict[params['time_range']])
 
     return params
@@ -91,7 +106,9 @@ def response(resp):
 
     doc = fromstring(resp.text)
 
     # parse results
-    for r in doc.xpath(result_xpath):
+    for i, r in enumerate(doc.xpath(result_xpath)):
+        if i >= 30:
+            break
         try:
             res_url = r.xpath(url_xpath)[-1]
         except:
```
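digg now reads the JSON `mapped` array directly instead of parsing an HTML fragment, and duckduckgo fetches pages after the first with a POST to the plain HTML endpoint. Spelled out with the `requests` library as a sketch (searx itself only fills `params['method']` and `params['data']`; the form values below mirror the hunk for page 2):

```python
# Sketch of the POST the new duckduckgo request() builds for pageno=2.
import requests

form = {
    'q': 'searx',     # the query
    's': 30,          # offset = (pageno - 1) * 30
    'dc': 30,
    'nextParams': '',
    'v': 'l',
    'o': 'json',
    'api': '/d.js',
}
resp = requests.post('https://duckduckgo.com/html/', data=form)
```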
```diff
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index a6aa5d718..6b0402233 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -35,8 +35,8 @@ search_string = 'search?{query}'\
     '&ff={safesearch}'\
     '&rxiec={rxieu}'\
     '&ulse={ulse}'\
-    '&rand={rxikd}'  # current unix timestamp
-
+    '&rand={rxikd}'\
+    '&dbez={dbez}'
 # specific xpath variables
 results_xpath = '//response//result'
 url_xpath = './/url'
@@ -70,7 +70,8 @@ def request(query, params):
                                          rxieu=random.randint(1000000000, 9999999999),
                                          ulse=random.randint(100000000, 999999999),
                                          lang=language,
-                                         safesearch=safesearch)
+                                         safesearch=safesearch,
+                                         dbez=random.randint(100000000, 999999999))
 
     params['url'] = base_url + search_path
diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py
index 3ba9a7f39..870998545 100644
--- a/searx/engines/soundcloud.py
+++ b/searx/engines/soundcloud.py
@@ -66,7 +66,7 @@ def get_client_id():
     return ""
 
 
-def init():
+def init(engine_settings=None):
     global guest_client_id
     # api-key
     guest_client_id = get_client_id()
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 6638f3d83..0f0ec6e18 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -15,6 +15,7 @@ from dateutil import parser
 from datetime import datetime, timedelta
 import re
 from searx.engines.xpath import extract_text
+from searx.languages import language_codes
 
 # engine dependent config
 categories = ['general']
@@ -22,7 +23,7 @@ categories = ['general']
 # (probably the parameter qid), require
 # storing of qid's between mulitble search-calls
 
-# paging = False
+paging = True
 language_support = True
 
 # search-url
@@ -32,23 +33,32 @@ search_url = base_url + 'do/search'
 
 # specific xpath variables
 # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
 # not ads: div[@class="result"] are the direct childs of div[@id="results"]
-results_xpath = '//li[contains(@class, "search-result") and contains(@class, "search-item")]'
-link_xpath = './/h3/a'
-content_xpath = './p[@class="search-item__body"]'
+results_xpath = '//div[@class="w-gl__result"]'
+link_xpath = './/a[@class="w-gl__result-title"]'
+content_xpath = './/p[@class="w-gl__description"]'
 
 
 # do search-request
 def request(query, params):
-    offset = (params['pageno'] - 1) * 10
-
     params['url'] = search_url
     params['method'] = 'POST'
-    params['data'] = {'query': query,
-                      'startat': offset}
+    params['data'] = {
+        'query': query,
+        'page': params['pageno'],
+        'cat': 'web',
+        'cmd': 'process_search',
+        'engine0': 'v1all',
+    }
 
     # set language if specified
     if params['language'] != 'all':
-        params['data']['with_language'] = ('lang_' + params['language'].split('-')[0])
+        language = 'english'
+        for lc, _, _, lang in language_codes:
+            if lc == params['language']:
+                language = lang
+        params['data']['language'] = language
+        params['data']['lui'] = language
 
     return params
```
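The startpage change replaces the old `with_language=lang_xx` form field with the full English language name, looked up from `searx.languages.language_codes`. Isolated, the lookup works like this; the two tuples are an abbreviated, illustrative stand-in for the real list:

```python
# Entries in searx.languages.language_codes are 4-tuples whose last
# element is the English language name; these two are stand-ins.
language_codes = [
    ('de-DE', 'Deutsch', 'Deutschland', 'german'),
    ('fr-FR', 'Français', 'France', 'french'),
]


def startpage_language(code):
    language = 'english'  # fallback used by the patch
    for lc, _, _, lang in language_codes:
        if lc == code:
            language = lang
    return language


print(startpage_language('de-DE'))  # german
```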
```diff
diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py
index 2cbbc5adc..387c9fa17 100644
--- a/searx/engines/wolframalpha_noapi.py
+++ b/searx/engines/wolframalpha_noapi.py
@@ -55,7 +55,7 @@ def obtain_token():
     return token
 
 
-def init():
+def init(engine_settings=None):
     obtain_token()
 
 
diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py
index 508803240..f1154b16d 100644
--- a/searx/engines/www1x.py
+++ b/searx/engines/www1x.py
@@ -11,8 +11,8 @@
 """
 
 from lxml import html
-import re
 from searx.url_utils import urlencode, urljoin
+from searx.engines.xpath import extract_text
 
 # engine dependent config
 categories = ['images']
@@ -34,41 +34,18 @@ def request(query, params):
 
 def response(resp):
     results = []
 
-    # get links from result-text
-    regex = re.compile('(</a>|<a)')
-    results_parts = re.split(regex, resp.text)
-
-    cur_element = ''
-
-    # iterate over link parts
-    for result_part in results_parts:
+    dom = html.fromstring(resp.text)
+    for res in dom.xpath('//div[@class="List-item MainListing"]'):
         # processed start and end of link
-        if result_part == '<a':
-            cur_element = result_part
-            continue
-        elif result_part != '</a>':
-            cur_element += result_part
-            continue
-
-        cur_element += result_part
-
-        # fix xml-error
-        cur_element = cur_element.replace('"></a>', '"/></a>')
-
-        dom = html.fromstring(cur_element)
-        link = dom.xpath('//a')[0]
+        link = res.xpath('//a')[0]
         url = urljoin(base_url, link.attrib.get('href'))
-        title = link.attrib.get('title', '')
+        title = extract_text(link)
 
-        thumbnail_src = urljoin(base_url, link.xpath('.//img')[0].attrib['src'])
+        thumbnail_src = urljoin(base_url, res.xpath('.//img')[0].attrib['src'])
         # TODO: get image with higher resolution
         img_src = thumbnail_src
 
-        # check if url is showing to a photo
-        if '/photo/' not in url:
-            continue
-
         # append result
         results.append({'url': url,
                         'title': title,
```
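The www1x rewrite drops the regex-based splitting of the raw HTML in favour of parsing the page once with lxml and iterating result containers. One caveat worth noting: the patch calls `res.xpath('//a')`, and lxml evaluates an absolute path from the document root rather than from `res`. A scoped per-result lookup would look like the sketch below; the `1x.com` base URL is an assumption (the diff only references the engine's `base_url`), the class name is the one from the patch:

```python
# Sketch of the scoped per-result lookup ('.//a' instead of '//a').
from urllib.parse import urljoin

from lxml import html


def parse_results(page_text, base_url='https://1x.com/'):
    dom = html.fromstring(page_text)
    for res in dom.xpath('//div[@class="List-item MainListing"]'):
        link = res.xpath('.//a')[0]   # scoped to this result container
        yield urljoin(base_url, link.attrib.get('href'))
```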