diff options
Diffstat (limited to 'searx/engines')
| -rw-r--r-- | searx/engines/__init__.py | 32 | ||||
| -rw-r--r-- | searx/engines/acgsou.py | 4 | ||||
| -rw-r--r-- | searx/engines/arxiv.py | 2 | ||||
| -rw-r--r-- | searx/engines/currency_convert.py | 1 | ||||
| -rw-r--r-- | searx/engines/dictzone.py | 1 | ||||
| -rw-r--r-- | searx/engines/duckduckgo_definitions.py | 4 | ||||
| -rw-r--r-- | searx/engines/duden.py | 43 | ||||
| -rw-r--r-- | searx/engines/filecrop.py | 85 | ||||
| -rw-r--r-- | searx/engines/seedpeer.py | 78 | ||||
| -rw-r--r-- | searx/engines/soundcloud.py | 2 | ||||
| -rw-r--r-- | searx/engines/translated.py | 1 | ||||
| -rw-r--r-- | searx/engines/wikipedia.py | 2 | ||||
| -rw-r--r-- | searx/engines/www1x.py | 24 | ||||
| -rw-r--r-- | searx/engines/youtube_noapi.py | 2 |
14 files changed, 73 insertions, 208 deletions
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index ddd6a7feb..0b77f2a95 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -20,6 +20,7 @@ import sys import threading from os.path import realpath, dirname from babel.localedata import locale_identifiers +from urllib.parse import urlparse from flask_babel import gettext from operator import itemgetter from searx import settings @@ -289,3 +290,34 @@ def initialize_engines(engine_list): if init_fn: logger.debug('%s engine: Starting background initialization', engine_name) threading.Thread(target=engine_init, args=(engine_name, init_fn)).start() + + _set_https_support_for_engine(engine) + + +def _set_https_support_for_engine(engine): + # check HTTPS support if it is not disabled + if not engine.offline and not hasattr(engine, 'https_support'): + params = engine.request('http_test', { + 'method': 'GET', + 'headers': {}, + 'data': {}, + 'url': '', + 'cookies': {}, + 'verify': True, + 'auth': None, + 'pageno': 1, + 'time_range': None, + 'language': '', + 'safesearch': False, + 'is_test': True, + 'category': 'files', + 'raise_for_status': True, + }) + + if 'url' not in params: + return + + parsed_url = urlparse(params['url']) + https_support = parsed_url.scheme == 'https' + + setattr(engine, 'https_support', https_support) diff --git a/searx/engines/acgsou.py b/searx/engines/acgsou.py index b8b367c24..637443edc 100644 --- a/searx/engines/acgsou.py +++ b/searx/engines/acgsou.py @@ -18,7 +18,7 @@ categories = ['files', 'images', 'videos', 'music'] paging = True # search-url -base_url = 'http://www.acgsou.com/' +base_url = 'https://www.acgsou.com/' search_url = base_url + 'search.php?{query}&page={offset}' # xpath queries xpath_results = '//table[contains(@class, "list_style table_fixed")]//tr[not(th)]' @@ -40,7 +40,7 @@ def response(resp): for result in eval_xpath_list(dom, xpath_results): # defaults filesize = 0 - magnet_link = "magnet:?xt=urn:btih:{}&tr=http://tracker.acgsou.com:2710/announce" + magnet_link = "magnet:?xt=urn:btih:{}&tr=https://tracker.acgsou.com:2710/announce" category = extract_text(eval_xpath_getindex(result, xpath_category, 0, default=[])) page_a = eval_xpath_getindex(result, xpath_title, 0) diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py index c702c5987..1190de363 100644 --- a/searx/engines/arxiv.py +++ b/searx/engines/arxiv.py @@ -19,7 +19,7 @@ from searx.utils import eval_xpath_list, eval_xpath_getindex categories = ['science'] paging = True -base_url = 'http://export.arxiv.org/api/query?search_query=all:'\ +base_url = 'https://export.arxiv.org/api/query?search_query=all:'\ + '{query}&start={offset}&max_results={number_of_results}' # engine dependent config diff --git a/searx/engines/currency_convert.py b/searx/engines/currency_convert.py index f41c135b9..87e21d0af 100644 --- a/searx/engines/currency_convert.py +++ b/searx/engines/currency_convert.py @@ -9,6 +9,7 @@ url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}' weight = 100 parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I) +https_support = True def normalize_name(name): diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py index 5a1fea3cf..727eb6598 100644 --- a/searx/engines/dictzone.py +++ b/searx/engines/dictzone.py @@ -20,6 +20,7 @@ weight = 100 parser_re = re.compile('.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I) results_xpath = './/table[@id="r"]/tr' +https_support = True def request(query, params): diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index 5a7649173..1d1c84b4b 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -10,7 +10,7 @@ DuckDuckGo (definitions) """ import json -from urllib.parse import urlencode +from urllib.parse import urlencode, urlparse, urljoin from lxml import html from searx import logger @@ -102,6 +102,8 @@ def response(resp): # image image = search_res.get('Image') image = None if image == '' else image + if image is not None and urlparse(image).netloc == '': + image = urljoin('https://duckduckgo.com', image) # urls # Official website, Wikipedia page diff --git a/searx/engines/duden.py b/searx/engines/duden.py index 1484a21e5..1475fb846 100644 --- a/searx/engines/duden.py +++ b/searx/engines/duden.py @@ -8,11 +8,10 @@ @parse url, title, content """ -from lxml import html, etree import re from urllib.parse import quote, urljoin -from searx.utils import extract_text, eval_xpath -from searx import logger +from lxml import html +from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex categories = ['general'] paging = True @@ -40,6 +39,9 @@ def request(query, params): params['url'] = search_url_fmt.format(query=quote(query)) else: params['url'] = search_url.format(offset=offset, query=quote(query)) + # after the last page of results, spelling corrections are returned after a HTTP redirect + # whatever the page number is + params['soft_max_redirects'] = 1 return params @@ -51,28 +53,21 @@ def response(resp): dom = html.fromstring(resp.text) - try: - number_of_results_string =\ - re.sub('[^0-9]', '', - eval_xpath(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0]) - + number_of_results_element =\ + eval_xpath_getindex(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()', + 0, default=None) + if number_of_results_element is not None: + number_of_results_string = re.sub('[^0-9]', '', number_of_results_element) results.append({'number_of_results': int(number_of_results_string)}) - except: - logger.debug("Couldn't read number of results.") - - for result in eval_xpath(dom, '//section[not(contains(@class, "essay"))]'): - try: - url = eval_xpath(result, './/h2/a')[0].get('href') - url = urljoin(base_url, url) - title = eval_xpath(result, 'string(.//h2/a)').strip() - content = extract_text(eval_xpath(result, './/p')) - # append result - results.append({'url': url, - 'title': title, - 'content': content}) - except: - logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True)) - continue + for result in eval_xpath_list(dom, '//section[not(contains(@class, "essay"))]'): + url = eval_xpath_getindex(result, './/h2/a', 0).get('href') + url = urljoin(base_url, url) + title = eval_xpath(result, 'string(.//h2/a)').strip() + content = extract_text(eval_xpath(result, './/p')) + # append result + results.append({'url': url, + 'title': title, + 'content': content}) return results diff --git a/searx/engines/filecrop.py b/searx/engines/filecrop.py deleted file mode 100644 index 0331e7b19..000000000 --- a/searx/engines/filecrop.py +++ /dev/null @@ -1,85 +0,0 @@ -from html.parser import HTMLParser -from urllib.parse import urlencode - - -url = 'http://www.filecrop.com/' -search_url = url + '/search.php?{query}&size_i=0&size_f=100000000&engine_r=1&engine_d=1&engine_e=1&engine_4=1&engine_m=1&pos={index}' # noqa - -paging = True - - -class FilecropResultParser(HTMLParser): # pylint: disable=W0223 # (see https://bugs.python.org/issue31844) - - def __init__(self): - HTMLParser.__init__(self) - self.__start_processing = False - - self.results = [] - self.result = {} - - self.tr_counter = 0 - self.data_counter = 0 - - def handle_starttag(self, tag, attrs): - - if tag == 'tr': - if ('bgcolor', '#edeff5') in attrs or\ - ('bgcolor', '#ffffff') in attrs: - self.__start_processing = True - - if not self.__start_processing: - return - - if tag == 'label': - self.result['title'] = [attr[1] for attr in attrs - if attr[0] == 'title'][0] - elif tag == 'a' and ('rel', 'nofollow') in attrs\ - and ('class', 'sourcelink') in attrs: - if 'content' in self.result: - self.result['content'] += [attr[1] for attr in attrs - if attr[0] == 'title'][0] - else: - self.result['content'] = [attr[1] for attr in attrs - if attr[0] == 'title'][0] - self.result['content'] += ' ' - elif tag == 'a': - self.result['url'] = url + [attr[1] for attr in attrs - if attr[0] == 'href'][0] - - def handle_endtag(self, tag): - if self.__start_processing is False: - return - - if tag == 'tr': - self.tr_counter += 1 - - if self.tr_counter == 2: - self.__start_processing = False - self.tr_counter = 0 - self.data_counter = 0 - self.results.append(self.result) - self.result = {} - - def handle_data(self, data): - if not self.__start_processing: - return - - if 'content' in self.result: - self.result['content'] += data + ' ' - else: - self.result['content'] = data + ' ' - - self.data_counter += 1 - - -def request(query, params): - index = 1 + (params['pageno'] - 1) * 30 - params['url'] = search_url.format(query=urlencode({'w': query}), index=index) - return params - - -def response(resp): - parser = FilecropResultParser() - parser.feed(resp.text) - - return parser.results diff --git a/searx/engines/seedpeer.py b/searx/engines/seedpeer.py deleted file mode 100644 index 39916da6e..000000000 --- a/searx/engines/seedpeer.py +++ /dev/null @@ -1,78 +0,0 @@ -# Seedpeer (Videos, Music, Files) -# -# @website https://seedpeer.me -# @provide-api no (nothing found) -# -# @using-api no -# @results HTML (using search portal) -# @stable yes (HTML can change) -# @parse url, title, content, seed, leech, magnetlink - -from lxml import html -from json import loads -from operator import itemgetter -from urllib.parse import quote, urljoin -from searx.utils import extract_text - - -url = 'https://seedpeer.me/' -search_url = url + 'search/{search_term}?page={page_no}' -torrent_file_url = url + 'torrent/{torrent_hash}' - -# specific xpath variables -script_xpath = '//script[@type="text/javascript"][not(@src)]' -torrent_xpath = '(//table)[2]/tbody/tr' -link_xpath = '(./td)[1]/a/@href' -age_xpath = '(./td)[2]' -size_xpath = '(./td)[3]' - - -# do search-request -def request(query, params): - params['url'] = search_url.format(search_term=quote(query), - page_no=params['pageno']) - return params - - -# get response from search-request -def response(resp): - results = [] - dom = html.fromstring(resp.text) - result_rows = dom.xpath(torrent_xpath) - - try: - script_element = dom.xpath(script_xpath)[0] - json_string = script_element.text[script_element.text.find('{'):] - torrents_json = loads(json_string) - except: - return [] - - # parse results - for torrent_row, torrent_json in zip(result_rows, torrents_json['data']['list']): - title = torrent_json['name'] - seed = int(torrent_json['seeds']) - leech = int(torrent_json['peers']) - size = int(torrent_json['size']) - torrent_hash = torrent_json['hash'] - - torrentfile = torrent_file_url.format(torrent_hash=torrent_hash) - magnetlink = 'magnet:?xt=urn:btih:{}'.format(torrent_hash) - - age = extract_text(torrent_row.xpath(age_xpath)) - link = torrent_row.xpath(link_xpath)[0] - - href = urljoin(url, link) - - # append result - results.append({'url': href, - 'title': title, - 'content': age, - 'seed': seed, - 'leech': leech, - 'filesize': size, - 'torrentfile': torrentfile, - 'magnetlink': magnetlink, - 'template': 'torrent.html'}) - - # return results sorted by seeder - return sorted(results, key=itemgetter('seed'), reverse=True) diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py index b1e01759f..84ff21a88 100644 --- a/searx/engines/soundcloud.py +++ b/searx/engines/soundcloud.py @@ -91,7 +91,7 @@ def response(resp): for result in search_res.get('collection', []): if result['kind'] in ('track', 'playlist'): title = result['title'] - content = result['description'] + content = result['description'] or '' publishedDate = parser.parse(result['last_modified']) uri = quote_plus(result['uri']) embedded = embedded_url.format(uri=uri) diff --git a/searx/engines/translated.py b/searx/engines/translated.py index a50e7c830..75b8b5f42 100644 --- a/searx/engines/translated.py +++ b/searx/engines/translated.py @@ -15,6 +15,7 @@ categories = ['general'] url = 'https://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}{key}' web_url = 'https://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}' weight = 100 +https_support = True parser_re = re.compile('.*?([a-z]+)-([a-z]+) (.{2,})$', re.I) api_key = '' diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 9fce170eb..000e1af76 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -52,7 +52,7 @@ def response(resp): api_result = loads(resp.text) # skip disambiguation pages - if api_result['type'] != 'standard': + if api_result.get('type') != 'standard': return [] title = api_result['title'] diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py index 8d691c852..b8f111a50 100644 --- a/searx/engines/www1x.py +++ b/searx/engines/www1x.py @@ -7,12 +7,12 @@ @using-api no @results HTML @stable no (HTML can change) - @parse url, title, thumbnail, img_src, content + @parse url, title, thumbnail """ -from lxml import html +from lxml import html, etree from urllib.parse import urlencode, urljoin -from searx.utils import extract_text +from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex # engine dependent config categories = ['images'] @@ -21,6 +21,7 @@ paging = False # search-url base_url = 'https://1x.com' search_url = base_url + '/backend/search.php?{query}' +gallery_url = 'https://gallery.1x.com/' # do search-request @@ -33,23 +34,18 @@ def request(query, params): # get response from search-request def response(resp): results = [] - - dom = html.fromstring(resp.text) - for res in dom.xpath('//div[@class="List-item MainListing"]'): - # processed start and end of link - link = res.xpath('//a')[0] - + xmldom = etree.fromstring(resp.content) + xmlsearchresult = eval_xpath_getindex(xmldom, '//searchresult', 0) + dom = html.fragment_fromstring(xmlsearchresult.text, create_parent='div') + for link in eval_xpath_list(dom, '/div/table/tr/td/div[2]//a'): url = urljoin(base_url, link.attrib.get('href')) title = extract_text(link) - - thumbnail_src = urljoin(base_url, res.xpath('.//img')[0].attrib['src']) - # TODO: get image with higher resolution - img_src = thumbnail_src + thumbnail_src = urljoin(gallery_url, eval_xpath_getindex(link, './/img', 0).attrib['src']) # append result results.append({'url': url, 'title': title, - 'img_src': img_src, + 'img_src': thumbnail_src, 'content': '', 'thumbnail_src': thumbnail_src, 'template': 'images.html'}) diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py index 5f7d2ceab..36fc72e36 100644 --- a/searx/engines/youtube_noapi.py +++ b/searx/engines/youtube_noapi.py @@ -49,7 +49,7 @@ def response(resp): results = [] results_data = resp.text[resp.text.find('ytInitialData'):] - results_data = results_data[results_data.find('{'):results_data.find(';\n')] + results_data = results_data[results_data.find('{'):results_data.find(';</script>')] results_json = loads(results_data) if results_data else {} sections = results_json.get('contents', {})\ |