Diffstat (limited to 'searx/engines')
-rw-r--r--  searx/engines/__init__.py                 32
-rw-r--r--  searx/engines/acgsou.py                     4
-rw-r--r--  searx/engines/arxiv.py                      2
-rw-r--r--  searx/engines/currency_convert.py           1
-rw-r--r--  searx/engines/dictzone.py                   1
-rw-r--r--  searx/engines/duckduckgo_definitions.py     4
-rw-r--r--  searx/engines/duden.py                     43
-rw-r--r--  searx/engines/filecrop.py                  85
-rw-r--r--  searx/engines/seedpeer.py                  78
-rw-r--r--  searx/engines/soundcloud.py                 2
-rw-r--r--  searx/engines/translated.py                 1
-rw-r--r--  searx/engines/wikipedia.py                  2
-rw-r--r--  searx/engines/www1x.py                     24
-rw-r--r--  searx/engines/youtube_noapi.py              2
14 files changed, 73 insertions(+), 208 deletions(-)
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index ddd6a7feb..0b77f2a95 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -20,6 +20,7 @@ import sys
import threading
from os.path import realpath, dirname
from babel.localedata import locale_identifiers
+from urllib.parse import urlparse
from flask_babel import gettext
from operator import itemgetter
from searx import settings
@@ -289,3 +290,34 @@ def initialize_engines(engine_list):
if init_fn:
logger.debug('%s engine: Starting background initialization', engine_name)
threading.Thread(target=engine_init, args=(engine_name, init_fn)).start()
+
+ _set_https_support_for_engine(engine)
+
+
+def _set_https_support_for_engine(engine):
+    # detect HTTPS support unless the engine is offline or already declares https_support
+ if not engine.offline and not hasattr(engine, 'https_support'):
+ params = engine.request('http_test', {
+ 'method': 'GET',
+ 'headers': {},
+ 'data': {},
+ 'url': '',
+ 'cookies': {},
+ 'verify': True,
+ 'auth': None,
+ 'pageno': 1,
+ 'time_range': None,
+ 'language': '',
+ 'safesearch': False,
+ 'is_test': True,
+ 'category': 'files',
+ 'raise_for_status': True,
+ })
+
+ if 'url' not in params:
+ return
+
+ parsed_url = urlparse(params['url'])
+ https_support = parsed_url.scheme == 'https'
+
+ setattr(engine, 'https_support', https_support)
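
The probe above feeds a dummy 'http_test' query through the engine's own
request() and inspects the scheme of whatever URL it builds. A minimal sketch of
the same idea in isolation (the params dict below is trimmed to plausible keys;
treat it as an assumption about what a given engine's request() reads, not as
the searx API itself):

    from urllib.parse import urlparse

    def probe_https_support(engine):
        # Smallest params dict a searx-style request() might need.
        params = {'method': 'GET', 'headers': {}, 'data': {}, 'url': '',
                  'cookies': {}, 'pageno': 1, 'language': '', 'time_range': None}
        params = engine.request('http_test', params)
        if not params.get('url'):
            return None  # the engine built no URL for this dummy query
        return urlparse(params['url']).scheme == 'https'

Engines whose request() cannot digest a dummy query (currency_convert, dictzone,
translated) instead declare https_support = True by hand, as the hunks below do.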
diff --git a/searx/engines/acgsou.py b/searx/engines/acgsou.py
index b8b367c24..637443edc 100644
--- a/searx/engines/acgsou.py
+++ b/searx/engines/acgsou.py
@@ -18,7 +18,7 @@ categories = ['files', 'images', 'videos', 'music']
paging = True
# search-url
-base_url = 'http://www.acgsou.com/'
+base_url = 'https://www.acgsou.com/'
search_url = base_url + 'search.php?{query}&page={offset}'
# xpath queries
xpath_results = '//table[contains(@class, "list_style table_fixed")]//tr[not(th)]'
@@ -40,7 +40,7 @@ def response(resp):
for result in eval_xpath_list(dom, xpath_results):
# defaults
filesize = 0
- magnet_link = "magnet:?xt=urn:btih:{}&tr=http://tracker.acgsou.com:2710/announce"
+ magnet_link = "magnet:?xt=urn:btih:{}&tr=https://tracker.acgsou.com:2710/announce"
category = extract_text(eval_xpath_getindex(result, xpath_category, 0, default=[]))
page_a = eval_xpath_getindex(result, xpath_title, 0)
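
The tracker embedded in the magnet template moves to HTTPS along with the base
URL. For reference, a magnet URI carries the infohash as xt=urn:btih:<hash> and
each tracker as a tr= parameter; a small helper along these lines (hypothetical,
not part of the engine, which inlines the tracker unencoded):

    from urllib.parse import quote

    def build_magnet(infohash, tracker='https://tracker.acgsou.com:2710/announce'):
        # xt = exact topic (the BitTorrent infohash), tr = tracker announce URL;
        # percent-encoding the tracker keeps its '://' out of the outer query string.
        return 'magnet:?xt=urn:btih:{}&tr={}'.format(infohash, quote(tracker, safe=''))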
diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py
index c702c5987..1190de363 100644
--- a/searx/engines/arxiv.py
+++ b/searx/engines/arxiv.py
@@ -19,7 +19,7 @@ from searx.utils import eval_xpath_list, eval_xpath_getindex
categories = ['science']
paging = True
-base_url = 'http://export.arxiv.org/api/query?search_query=all:'\
+base_url = 'https://export.arxiv.org/api/query?search_query=all:'\
+ '{query}&start={offset}&max_results={number_of_results}'
# engine dependent config
diff --git a/searx/engines/currency_convert.py b/searx/engines/currency_convert.py
index f41c135b9..87e21d0af 100644
--- a/searx/engines/currency_convert.py
+++ b/searx/engines/currency_convert.py
@@ -9,6 +9,7 @@ url = 'https://duckduckgo.com/js/spice/currency/1/{0}/{1}'
weight = 100
parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)
+https_support = True
def normalize_name(name):
diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py
index 5a1fea3cf..727eb6598 100644
--- a/searx/engines/dictzone.py
+++ b/searx/engines/dictzone.py
@@ -20,6 +20,7 @@ weight = 100
parser_re = re.compile('.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I)
results_xpath = './/table[@id="r"]/tr'
+https_support = True
def request(query, params):
diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py
index 5a7649173..1d1c84b4b 100644
--- a/searx/engines/duckduckgo_definitions.py
+++ b/searx/engines/duckduckgo_definitions.py
@@ -10,7 +10,7 @@ DuckDuckGo (definitions)
"""
import json
-from urllib.parse import urlencode
+from urllib.parse import urlencode, urlparse, urljoin
from lxml import html
from searx import logger
@@ -102,6 +102,8 @@ def response(resp):
# image
image = search_res.get('Image')
image = None if image == '' else image
+ if image is not None and urlparse(image).netloc == '':
+ image = urljoin('https://duckduckgo.com', image)
# urls
# Official website, Wikipedia page
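
The added check treats an Image value whose parsed netloc is empty as a relative
path and resolves it against duckduckgo.com. The same pattern in isolation (a
minimal sketch; the sample path is illustrative):

    from urllib.parse import urlparse, urljoin

    def absolutize(image_url, base='https://duckduckgo.com'):
        # Relative URLs parse with an empty netloc; resolve them against the base.
        if image_url and urlparse(image_url).netloc == '':
            return urljoin(base, image_url)
        return image_url

    # absolutize('/i/example.png') -> 'https://duckduckgo.com/i/example.png'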
diff --git a/searx/engines/duden.py b/searx/engines/duden.py
index 1484a21e5..1475fb846 100644
--- a/searx/engines/duden.py
+++ b/searx/engines/duden.py
@@ -8,11 +8,10 @@
@parse url, title, content
"""
-from lxml import html, etree
import re
from urllib.parse import quote, urljoin
-from searx.utils import extract_text, eval_xpath
-from searx import logger
+from lxml import html
+from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
categories = ['general']
paging = True
@@ -40,6 +39,9 @@ def request(query, params):
params['url'] = search_url_fmt.format(query=quote(query))
else:
params['url'] = search_url.format(offset=offset, query=quote(query))
+    # after the last page of results, spelling corrections are returned after an HTTP redirect,
+    # regardless of the page number
+ params['soft_max_redirects'] = 1
return params
@@ -51,28 +53,21 @@ def response(resp):
dom = html.fromstring(resp.text)
- try:
- number_of_results_string =\
- re.sub('[^0-9]', '',
- eval_xpath(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0])
-
+ number_of_results_element =\
+ eval_xpath_getindex(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()',
+ 0, default=None)
+ if number_of_results_element is not None:
+ number_of_results_string = re.sub('[^0-9]', '', number_of_results_element)
results.append({'number_of_results': int(number_of_results_string)})
- except:
- logger.debug("Couldn't read number of results.")
-
- for result in eval_xpath(dom, '//section[not(contains(@class, "essay"))]'):
- try:
- url = eval_xpath(result, './/h2/a')[0].get('href')
- url = urljoin(base_url, url)
- title = eval_xpath(result, 'string(.//h2/a)').strip()
- content = extract_text(eval_xpath(result, './/p'))
- # append result
- results.append({'url': url,
- 'title': title,
- 'content': content})
- except:
- logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
- continue
+ for result in eval_xpath_list(dom, '//section[not(contains(@class, "essay"))]'):
+ url = eval_xpath_getindex(result, './/h2/a', 0).get('href')
+ url = urljoin(base_url, url)
+ title = eval_xpath(result, 'string(.//h2/a)').strip()
+ content = extract_text(eval_xpath(result, './/p'))
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content})
return results
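
The duden.py rewrite trades bare except: blocks for eval_xpath_getindex(...,
default=...), so a missing node yields a sentinel while unrelated errors still
propagate. A standalone sketch of that pattern with plain lxml (the helper is a
simplified stand-in for searx.utils.eval_xpath_getindex):

    from lxml import html

    def xpath_getindex(elem, path, index, default=None):
        # Return the index-th XPath match, or `default` when there are too few
        # matches; unlike a bare `except:`, real errors are not swallowed.
        matches = elem.xpath(path)
        return matches[index] if len(matches) > index else default

    dom = html.fromstring('<section><h2><a href="/w/x">x</a></h2></section>')
    link = xpath_getindex(dom, './/h2/a', 0)     # an <a> Element
    missing = xpath_getindex(dom, './/h3/a', 0)  # None, no exception raised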
diff --git a/searx/engines/filecrop.py b/searx/engines/filecrop.py
deleted file mode 100644
index 0331e7b19..000000000
--- a/searx/engines/filecrop.py
+++ /dev/null
@@ -1,85 +0,0 @@
-from html.parser import HTMLParser
-from urllib.parse import urlencode
-
-
-url = 'http://www.filecrop.com/'
-search_url = url + '/search.php?{query}&size_i=0&size_f=100000000&engine_r=1&engine_d=1&engine_e=1&engine_4=1&engine_m=1&pos={index}' # noqa
-
-paging = True
-
-
-class FilecropResultParser(HTMLParser): # pylint: disable=W0223 # (see https://bugs.python.org/issue31844)
-
- def __init__(self):
- HTMLParser.__init__(self)
- self.__start_processing = False
-
- self.results = []
- self.result = {}
-
- self.tr_counter = 0
- self.data_counter = 0
-
- def handle_starttag(self, tag, attrs):
-
- if tag == 'tr':
- if ('bgcolor', '#edeff5') in attrs or\
- ('bgcolor', '#ffffff') in attrs:
- self.__start_processing = True
-
- if not self.__start_processing:
- return
-
- if tag == 'label':
- self.result['title'] = [attr[1] for attr in attrs
- if attr[0] == 'title'][0]
- elif tag == 'a' and ('rel', 'nofollow') in attrs\
- and ('class', 'sourcelink') in attrs:
- if 'content' in self.result:
- self.result['content'] += [attr[1] for attr in attrs
- if attr[0] == 'title'][0]
- else:
- self.result['content'] = [attr[1] for attr in attrs
- if attr[0] == 'title'][0]
- self.result['content'] += ' '
- elif tag == 'a':
- self.result['url'] = url + [attr[1] for attr in attrs
- if attr[0] == 'href'][0]
-
- def handle_endtag(self, tag):
- if self.__start_processing is False:
- return
-
- if tag == 'tr':
- self.tr_counter += 1
-
- if self.tr_counter == 2:
- self.__start_processing = False
- self.tr_counter = 0
- self.data_counter = 0
- self.results.append(self.result)
- self.result = {}
-
- def handle_data(self, data):
- if not self.__start_processing:
- return
-
- if 'content' in self.result:
- self.result['content'] += data + ' '
- else:
- self.result['content'] = data + ' '
-
- self.data_counter += 1
-
-
-def request(query, params):
- index = 1 + (params['pageno'] - 1) * 30
- params['url'] = search_url.format(query=urlencode({'w': query}), index=index)
- return params
-
-
-def response(resp):
- parser = FilecropResultParser()
- parser.feed(resp.text)
-
- return parser.results
diff --git a/searx/engines/seedpeer.py b/searx/engines/seedpeer.py
deleted file mode 100644
index 39916da6e..000000000
--- a/searx/engines/seedpeer.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Seedpeer (Videos, Music, Files)
-#
-# @website https://seedpeer.me
-# @provide-api no (nothing found)
-#
-# @using-api no
-# @results HTML (using search portal)
-# @stable yes (HTML can change)
-# @parse url, title, content, seed, leech, magnetlink
-
-from lxml import html
-from json import loads
-from operator import itemgetter
-from urllib.parse import quote, urljoin
-from searx.utils import extract_text
-
-
-url = 'https://seedpeer.me/'
-search_url = url + 'search/{search_term}?page={page_no}'
-torrent_file_url = url + 'torrent/{torrent_hash}'
-
-# specific xpath variables
-script_xpath = '//script[@type="text/javascript"][not(@src)]'
-torrent_xpath = '(//table)[2]/tbody/tr'
-link_xpath = '(./td)[1]/a/@href'
-age_xpath = '(./td)[2]'
-size_xpath = '(./td)[3]'
-
-
-# do search-request
-def request(query, params):
- params['url'] = search_url.format(search_term=quote(query),
- page_no=params['pageno'])
- return params
-
-
-# get response from search-request
-def response(resp):
- results = []
- dom = html.fromstring(resp.text)
- result_rows = dom.xpath(torrent_xpath)
-
- try:
- script_element = dom.xpath(script_xpath)[0]
- json_string = script_element.text[script_element.text.find('{'):]
- torrents_json = loads(json_string)
- except:
- return []
-
- # parse results
- for torrent_row, torrent_json in zip(result_rows, torrents_json['data']['list']):
- title = torrent_json['name']
- seed = int(torrent_json['seeds'])
- leech = int(torrent_json['peers'])
- size = int(torrent_json['size'])
- torrent_hash = torrent_json['hash']
-
- torrentfile = torrent_file_url.format(torrent_hash=torrent_hash)
- magnetlink = 'magnet:?xt=urn:btih:{}'.format(torrent_hash)
-
- age = extract_text(torrent_row.xpath(age_xpath))
- link = torrent_row.xpath(link_xpath)[0]
-
- href = urljoin(url, link)
-
- # append result
- results.append({'url': href,
- 'title': title,
- 'content': age,
- 'seed': seed,
- 'leech': leech,
- 'filesize': size,
- 'torrentfile': torrentfile,
- 'magnetlink': magnetlink,
- 'template': 'torrent.html'})
-
- # return results sorted by seeder
- return sorted(results, key=itemgetter('seed'), reverse=True)
diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py
index b1e01759f..84ff21a88 100644
--- a/searx/engines/soundcloud.py
+++ b/searx/engines/soundcloud.py
@@ -91,7 +91,7 @@ def response(resp):
for result in search_res.get('collection', []):
if result['kind'] in ('track', 'playlist'):
title = result['title']
- content = result['description']
+ content = result['description'] or ''
publishedDate = parser.parse(result['last_modified'])
uri = quote_plus(result['uri'])
embedded = embedded_url.format(uri=uri)
diff --git a/searx/engines/translated.py b/searx/engines/translated.py
index a50e7c830..75b8b5f42 100644
--- a/searx/engines/translated.py
+++ b/searx/engines/translated.py
@@ -15,6 +15,7 @@ categories = ['general']
url = 'https://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}{key}'
web_url = 'https://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}'
weight = 100
+https_support = True
parser_re = re.compile('.*?([a-z]+)-([a-z]+) (.{2,})$', re.I)
api_key = ''
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index 9fce170eb..000e1af76 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -52,7 +52,7 @@ def response(resp):
api_result = loads(resp.text)
# skip disambiguation pages
- if api_result['type'] != 'standard':
+ if api_result.get('type') != 'standard':
return []
title = api_result['title']
diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py
index 8d691c852..b8f111a50 100644
--- a/searx/engines/www1x.py
+++ b/searx/engines/www1x.py
@@ -7,12 +7,12 @@
@using-api no
@results HTML
@stable no (HTML can change)
- @parse url, title, thumbnail, img_src, content
+ @parse url, title, thumbnail
"""
-from lxml import html
+from lxml import html, etree
from urllib.parse import urlencode, urljoin
-from searx.utils import extract_text
+from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex
# engine dependent config
categories = ['images']
@@ -21,6 +21,7 @@ paging = False
# search-url
base_url = 'https://1x.com'
search_url = base_url + '/backend/search.php?{query}'
+gallery_url = 'https://gallery.1x.com/'
# do search-request
@@ -33,23 +34,18 @@ def request(query, params):
# get response from search-request
def response(resp):
results = []
-
- dom = html.fromstring(resp.text)
- for res in dom.xpath('//div[@class="List-item MainListing"]'):
- # processed start and end of link
- link = res.xpath('//a')[0]
-
+ xmldom = etree.fromstring(resp.content)
+ xmlsearchresult = eval_xpath_getindex(xmldom, '//searchresult', 0)
+ dom = html.fragment_fromstring(xmlsearchresult.text, create_parent='div')
+ for link in eval_xpath_list(dom, '/div/table/tr/td/div[2]//a'):
url = urljoin(base_url, link.attrib.get('href'))
title = extract_text(link)
-
- thumbnail_src = urljoin(base_url, res.xpath('.//img')[0].attrib['src'])
- # TODO: get image with higher resolution
- img_src = thumbnail_src
+ thumbnail_src = urljoin(gallery_url, eval_xpath_getindex(link, './/img', 0).attrib['src'])
# append result
results.append({'url': url,
'title': title,
- 'img_src': img_src,
+ 'img_src': thumbnail_src,
'content': '',
'thumbnail_src': thumbnail_src,
'template': 'images.html'})
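
search.php on 1x.com answers with XML whose <searchresult> element carries an
HTML fragment as text, hence the two-step parse above: XML first, then the
fragment. The same steps on a made-up payload (a sketch assuming this response
shape):

    from lxml import html, etree

    payload = (b'<root><searchresult>'
               b'&lt;table&gt;&lt;tr&gt;&lt;td&gt;hit&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;'
               b'</searchresult></root>')
    xmldom = etree.fromstring(payload)
    fragment = xmldom.find('searchresult').text                    # decoded HTML string
    dom = html.fragment_fromstring(fragment, create_parent='div')  # now XPath-able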
diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py
index 5f7d2ceab..36fc72e36 100644
--- a/searx/engines/youtube_noapi.py
+++ b/searx/engines/youtube_noapi.py
@@ -49,7 +49,7 @@ def response(resp):
results = []
results_data = resp.text[resp.text.find('ytInitialData'):]
- results_data = results_data[results_data.find('{'):results_data.find(';\n')]
+ results_data = results_data[results_data.find('{'):results_data.find(';</script>')]
results_json = loads(results_data) if results_data else {}
sections = results_json.get('contents', {})\
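
YouTube inlines its results as a JSON blob assigned to ytInitialData inside a
<script> tag; slicing up to ';</script>' instead of ';\n' also works when the
assignment and the closing tag sit on one minified line. The extraction on a
made-up page (a sketch of the slicing only):

    from json import loads

    page = '<script>var ytInitialData = {"contents": {}};</script><body></body>'
    data = page[page.find('ytInitialData'):]
    data = data[data.find('{'):data.find(';</script>')]
    results_json = loads(data) if data else {}   # {'contents': {}}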