summaryrefslogtreecommitdiff
path: root/searx/engines
diff options
context:
space:
mode:
Diffstat (limited to 'searx/engines')
-rw-r--r--searx/engines/__init__.py3
-rw-r--r--searx/engines/bing_images.py2
-rw-r--r--searx/engines/bing_news.py111
-rw-r--r--searx/engines/currency_convert.py6
-rw-r--r--searx/engines/flickr_noapi.py17
-rw-r--r--searx/engines/google.py271
-rw-r--r--searx/engines/qwant.py98
-rw-r--r--searx/engines/swisscows.py108
-rw-r--r--searx/engines/vimeo.py12
-rw-r--r--searx/engines/www1x.py4
-rw-r--r--searx/engines/youtube_api.py83
-rw-r--r--searx/engines/youtube_noapi.py81
12 files changed, 699 insertions, 97 deletions
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index 18a45d851..42e1f08bc 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -71,6 +71,9 @@ def load_engine(engine_data):
if not hasattr(engine, 'language_support'):
engine.language_support = True
+ if not hasattr(engine, 'safesearch'):
+ engine.safesearch = False
+
if not hasattr(engine, 'timeout'):
engine.timeout = settings['server']['request_timeout']
diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
index b06a57edc..839b8e5be 100644
--- a/searx/engines/bing_images.py
+++ b/searx/engines/bing_images.py
@@ -28,7 +28,7 @@ safesearch = True
# search-url
base_url = 'https://www.bing.com/'
search_string = 'images/search?{query}&count=10&first={offset}'
-thumb_url = "http://ts1.mm.bing.net/th?id={ihk}" # no https, bad certificate
+thumb_url = "https://www.bing.com/th?id={ihk}"
# safesearch definitions
safesearch_types = {2: 'STRICT',
diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py
index 1e5d361c1..a2397c48e 100644
--- a/searx/engines/bing_news.py
+++ b/searx/engines/bing_news.py
@@ -6,18 +6,17 @@
max. 5000 query/month
@using-api no (because of query limit)
- @results HTML (using search portal)
- @stable no (HTML can change)
- @parse url, title, content, publishedDate
+ @results RSS (using search portal)
+ @stable yes (except perhaps for the images)
+ @parse url, title, content, publishedDate, thumbnail
"""
from urllib import urlencode
-from cgi import escape
-from lxml import html
-from datetime import datetime, timedelta
+from urlparse import urlparse, parse_qsl
+from datetime import datetime
from dateutil import parser
-import re
-from searx.engines.xpath import extract_text
+from lxml import etree
+from searx.utils import list_get
# engine dependent config
categories = ['news']
@@ -26,7 +25,25 @@ language_support = True
# search-url
base_url = 'https://www.bing.com/'
-search_string = 'news/search?{query}&first={offset}'
+search_string = 'news/search?{query}&first={offset}&format=RSS'
+
+
+# remove click
+def url_cleanup(url_string):
+ parsed_url = urlparse(url_string)
+ if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx':
+ query = dict(parse_qsl(parsed_url.query))
+ return query.get('url', None)
+ return url_string
+
+
+# replace the http://*bing4.com/th?id=... by https://www.bing.com/th?id=...
+def image_url_cleanup(url_string):
+ parsed_url = urlparse(url_string)
+ if parsed_url.netloc.endswith('bing4.com') and parsed_url.path == '/th':
+ query = dict(parse_qsl(parsed_url.query))
+ return "https://www.bing.com/th?id=" + query.get('id')
+ return url_string
# do search-request
@@ -42,8 +59,6 @@ def request(query, params):
query=urlencode({'q': query, 'setmkt': language}),
offset=offset)
- params['cookies']['_FP'] = "ui=en-US"
-
params['url'] = base_url + search_path
return params
@@ -53,50 +68,44 @@ def request(query, params):
def response(resp):
results = []
- dom = html.fromstring(resp.content)
+ rss = etree.fromstring(resp.content)
+
+ ns = rss.nsmap
# parse results
- for result in dom.xpath('//div[@class="sn_r"]'):
- link = result.xpath('.//div[@class="newstitle"]/a')[0]
- url = link.attrib.get('href')
- title = extract_text(link)
- contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]')
- content = escape(extract_text(contentXPath))
-
- # parse publishedDate
- publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
- '//span[contains(@class,"sn_ST")]'
- '//span[contains(@class,"sn_tm")]')
-
- publishedDate = escape(extract_text(publishedDateXPath))
-
- if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
- timeNumbers = re.findall(r'\d+', publishedDate)
- publishedDate = datetime.now() - timedelta(minutes=int(timeNumbers[0]))
- elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
- timeNumbers = re.findall(r'\d+', publishedDate)
- publishedDate = datetime.now() - timedelta(hours=int(timeNumbers[0]))
- elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
- timeNumbers = re.findall(r'\d+', publishedDate)
- publishedDate = datetime.now()\
- - timedelta(hours=int(timeNumbers[0]))\
- - timedelta(minutes=int(timeNumbers[1]))
- elif re.match("^[0-9]+ day(s|) ago$", publishedDate):
- timeNumbers = re.findall(r'\d+', publishedDate)
- publishedDate = datetime.now() - timedelta(days=int(timeNumbers[0]))
- else:
- try:
- publishedDate = parser.parse(publishedDate, dayfirst=False)
- except TypeError:
- publishedDate = datetime.now()
- except ValueError:
- publishedDate = datetime.now()
+ for item in rss.xpath('./channel/item'):
+ # url / title / content
+ url = url_cleanup(item.xpath('./link/text()')[0])
+ title = list_get(item.xpath('./title/text()'), 0, url)
+ content = list_get(item.xpath('./description/text()'), 0, '')
+
+ # publishedDate
+ publishedDate = list_get(item.xpath('./pubDate/text()'), 0)
+ try:
+ publishedDate = parser.parse(publishedDate, dayfirst=False)
+ except TypeError:
+ publishedDate = datetime.now()
+ except ValueError:
+ publishedDate = datetime.now()
+
+ # thumbnail
+ thumbnail = list_get(item.xpath('./News:Image/text()', namespaces=ns), 0)
+ if thumbnail is not None:
+ thumbnail = image_url_cleanup(thumbnail)
# append result
- results.append({'url': url,
- 'title': title,
- 'publishedDate': publishedDate,
- 'content': content})
+ if thumbnail is not None:
+ results.append({'template': 'videos.html',
+ 'url': url,
+ 'title': title,
+ 'publishedDate': publishedDate,
+ 'content': content,
+ 'thumbnail': thumbnail})
+ else:
+ results.append({'url': url,
+ 'title': title,
+ 'publishedDate': publishedDate,
+ 'content': content})
# return results
return results
diff --git a/searx/engines/currency_convert.py b/searx/engines/currency_convert.py
index 1ba4575c5..26830a167 100644
--- a/searx/engines/currency_convert.py
+++ b/searx/engines/currency_convert.py
@@ -9,7 +9,7 @@ categories = []
url = 'https://download.finance.yahoo.com/d/quotes.csv?e=.csv&f=sl1d1t1&s={query}=X'
weight = 100
-parser_re = re.compile(r'^\W*(\d+(?:\.\d+)?)\W*([^.0-9].+)\W*in?\W*([^\.]+)\W*$', re.I) # noqa
+parser_re = re.compile(u'^\W*(\d+(?:\.\d+)?)\W*([^.0-9].+)\W+in?\W+([^\.]+)\W*$', re.I) # noqa
db = 1
@@ -17,7 +17,7 @@ db = 1
def normalize_name(name):
name = name.lower().replace('-', ' ')
name = re.sub(' +', ' ', name)
- return unicodedata.normalize('NFKD', u"" + name).lower()
+ return unicodedata.normalize('NFKD', name).lower()
def name_to_iso4217(name):
@@ -35,7 +35,7 @@ def iso4217_to_name(iso4217, language):
def request(query, params):
- m = parser_re.match(query)
+ m = parser_re.match(unicode(query, 'utf8'))
if not m:
# wrong query
return params
diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py
index 2071b8e36..87b912eb3 100644
--- a/searx/engines/flickr_noapi.py
+++ b/searx/engines/flickr_noapi.py
@@ -25,7 +25,7 @@ categories = ['images']
url = 'https://www.flickr.com/'
search_url = url + 'search?{query}&page={page}'
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
-regex = re.compile(r"\"search-photos-models\",\"photos\":(.*}),\"totalItems\":", re.DOTALL)
+regex = re.compile(r"\"search-photos-lite-models\",\"photos\":(.*}),\"totalItems\":", re.DOTALL)
image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's')
paging = True
@@ -38,6 +38,7 @@ def build_flickr_url(user_id, photo_id):
def request(query, params):
params['url'] = search_url.format(query=urlencode({'text': query}),
page=params['pageno'])
+
return params
@@ -75,10 +76,10 @@ def response(resp):
logger.debug('cannot find valid image size: {0}'.format(repr(photo)))
continue
- if 'id' not in photo['owner']:
+ if 'ownerNsid' not in photo:
continue
-# For a bigger thumbnail, keep only the url_z, not the url_n
+ # For a bigger thumbnail, keep only the url_z, not the url_n
if 'n' in photo['sizes']:
thumbnail_src = photo['sizes']['n']['url']
elif 'z' in photo['sizes']:
@@ -86,20 +87,14 @@ def response(resp):
else:
thumbnail_src = img_src
- url = build_flickr_url(photo['owner']['id'], photo['id'])
+ url = build_flickr_url(photo['ownerNsid'], photo['id'])
title = photo.get('title', '')
content = '<span class="photo-author">' +\
- photo['owner']['username'] +\
+ photo['username'] +\
'</span><br />'
- if 'description' in photo:
- content = content +\
- '<span class="description">' +\
- photo['description'] +\
- '</span>'
-
# append result
results.append({'url': url,
'title': title,
diff --git a/searx/engines/google.py b/searx/engines/google.py
index 807c58ed5..0e78a9e2c 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -8,39 +8,126 @@
# @stable no (HTML can change)
# @parse url, title, content, suggestion
+import re
from urllib import urlencode
from urlparse import urlparse, parse_qsl
from lxml import html
from searx.poolrequests import get
from searx.engines.xpath import extract_text, extract_url
+
# engine dependent config
categories = ['general']
paging = True
language_support = True
+use_locale_domain = True
+
+# based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
+default_hostname = 'www.google.com'
+
+country_to_hostname = {
+ 'BG': 'www.google.bg', # Bulgaria
+ 'CZ': 'www.google.cz', # Czech Republic
+ 'DE': 'www.google.de', # Germany
+ 'DK': 'www.google.dk', # Denmark
+ 'AT': 'www.google.at', # Austria
+ 'CH': 'www.google.ch', # Switzerland
+ 'GR': 'www.google.gr', # Greece
+ 'AU': 'www.google.com.au', # Australia
+ 'CA': 'www.google.ca', # Canada
+ 'GB': 'www.google.co.uk', # United Kingdom
+ 'ID': 'www.google.co.id', # Indonesia
+ 'IE': 'www.google.ie', # Ireland
+ 'IN': 'www.google.co.in', # India
+ 'MY': 'www.google.com.my', # Malaysia
+ 'NZ': 'www.google.co.nz', # New Zealand
+ 'PH': 'www.google.com.ph', # Philippines
+ 'SG': 'www.google.com.sg', # Singapore
+    # 'US': 'www.google.us',  # United States, redirect to .com
+ 'ZA': 'www.google.co.za', # South Africa
+ 'AR': 'www.google.com.ar', # Argentina
+ 'CL': 'www.google.cl', # Chile
+    'ES': 'www.google.es',  # Spain
+ 'MX': 'www.google.com.mx', # Mexico
+ 'EE': 'www.google.ee', # Estonia
+ 'FI': 'www.google.fi', # Finland
+ 'BE': 'www.google.be', # Belgium
+ 'FR': 'www.google.fr', # France
+ 'IL': 'www.google.co.il', # Israel
+ 'HR': 'www.google.hr', # Croatia
+ 'HU': 'www.google.hu', # Hungary
+ 'IT': 'www.google.it', # Italy
+ 'JP': 'www.google.co.jp', # Japan
+    'KR': 'www.google.co.kr',  # South Korea
+ 'LT': 'www.google.lt', # Lithuania
+ 'LV': 'www.google.lv', # Latvia
+ 'NO': 'www.google.no', # Norway
+ 'NL': 'www.google.nl', # Netherlands
+ 'PL': 'www.google.pl', # Poland
+ 'BR': 'www.google.com.br', # Brazil
+ 'PT': 'www.google.pt', # Portugal
+ 'RO': 'www.google.ro', # Romania
+ 'RU': 'www.google.ru', # Russia
+ 'SK': 'www.google.sk', # Slovakia
+ 'SL': 'www.google.si', # Slovenia (SL -> si)
+ 'SE': 'www.google.se', # Sweden
+ 'TH': 'www.google.co.th', # Thailand
+ 'TR': 'www.google.com.tr', # Turkey
+    'UA': 'www.google.com.ua',  # Ukraine
+    # 'CN': 'www.google.cn',  # China, only from China?
+    'HK': 'www.google.com.hk',  # Hong Kong
+ 'TW': 'www.google.com.tw' # Taiwan
+}
+
+# osm
+url_map = 'https://www.openstreetmap.org/'\
+ + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
# search-url
-google_hostname = 'www.google.com'
search_path = '/search'
-redirect_path = '/url'
-images_path = '/images'
-search_url = ('https://' +
- google_hostname +
+search_url = ('https://{hostname}' +
search_path +
'?{query}&start={offset}&gbv=1')
+# other URLs
+map_hostname_start = 'maps.google.'
+maps_path = '/maps'
+redirect_path = '/url'
+images_path = '/images'
+
# specific xpath variables
results_xpath = '//li[@class="g"]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3'
content_xpath = './/span[@class="st"]'
+content_misc_xpath = './/div[@class="f slp"]'
suggestion_xpath = '//p[@class="_Bmc"]'
+# map : detail location
+map_address_xpath = './/div[@class="s"]//table//td[2]/span/text()'
+map_phone_xpath = './/div[@class="s"]//table//td[2]/span/span'
+map_website_url_xpath = 'h3[2]/a/@href'
+map_website_title_xpath = 'h3[2]'
+
+# map : near the location
+map_near = 'table[@class="ts"]//tr'
+map_near_title = './/h4'
+map_near_url = './/h4/a/@href'
+map_near_phone = './/span[@class="nobr"]'
+
+# images
images_xpath = './/div/a'
image_url_xpath = './@href'
image_img_src_xpath = './img/@src'
+# property names
+# FIXME : no translation
+property_address = "Address"
+property_phone = "Phone number"
+
+# cookies
pref_cookie = ''
+nid_cookie = {}
# see https://support.google.com/websearch/answer/873?hl=en
@@ -52,8 +139,21 @@ def get_google_pref_cookie():
return pref_cookie
+def get_google_nid_cookie(google_hostname):
+ global nid_cookie
+ if google_hostname not in nid_cookie:
+ resp = get('https://' + google_hostname)
+ nid_cookie[google_hostname] = resp.cookies.get("NID", None)
+ return nid_cookie[google_hostname]
+
+
# remove google-specific tracking-url
-def parse_url(url_string):
+def parse_url(url_string, google_hostname):
+ # sanity check
+ if url_string is None:
+ return url_string
+
+ # normal case
parsed_url = urlparse(url_string)
if (parsed_url.netloc in [google_hostname, '']
and parsed_url.path == redirect_path):
@@ -63,21 +163,45 @@ def parse_url(url_string):
return url_string
+# returns extract_text on the first result selected by the xpath or None
+def extract_text_from_dom(result, xpath):
+ r = result.xpath(xpath)
+ if len(r) > 0:
+ return extract_text(r[0])
+ return None
+
+
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10
if params['language'] == 'all':
language = 'en'
+ country = 'US'
else:
- language = params['language'].replace('_', '-').lower()
+ language_array = params['language'].lower().split('_')
+ if len(language_array) == 2:
+ country = language_array[1]
+ else:
+ country = 'US'
+ language = language_array[0] + ',' + language_array[0] + '-' + country
+
+ if use_locale_domain:
+ google_hostname = country_to_hostname.get(country.upper(), default_hostname)
+ else:
+ google_hostname = default_hostname
params['url'] = search_url.format(offset=offset,
- query=urlencode({'q': query}))
+ query=urlencode({'q': query}),
+ hostname=google_hostname)
params['headers']['Accept-Language'] = language
- if language.startswith('en'):
+ params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+ if google_hostname == default_hostname:
params['cookies']['PREF'] = get_google_pref_cookie()
+ params['cookies']['NID'] = get_google_nid_cookie(google_hostname)
+
+ params['google_hostname'] = google_hostname
return params
@@ -86,33 +210,63 @@ def request(query, params):
def response(resp):
results = []
+ # detect google sorry
+ resp_url = urlparse(resp.url)
+ if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
+ raise RuntimeWarning('sorry.google.com')
+
+ # which hostname ?
+ google_hostname = resp.search_params.get('google_hostname')
+ google_url = "https://" + google_hostname
+
+ # convert the text to dom
dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath(results_xpath):
title = extract_text(result.xpath(title_xpath)[0])
try:
- url = parse_url(extract_url(result.xpath(url_xpath), search_url))
- parsed_url = urlparse(url)
- if (parsed_url.netloc == google_hostname
- and parsed_url.path == search_path):
- # remove the link to google news
- continue
+ url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
+ parsed_url = urlparse(url, google_hostname)
+
+ # map result
+ if ((parsed_url.netloc == google_hostname and parsed_url.path.startswith(maps_path))
+ or (parsed_url.netloc.startswith(map_hostname_start))):
+ x = result.xpath(map_near)
+ if len(x) > 0:
+ # map : near the location
+ results = results + parse_map_near(parsed_url, x, google_hostname)
+ else:
+ # map : detail about a location
+ results = results + parse_map_detail(parsed_url, result, google_hostname)
+
+ # google news
+ elif (parsed_url.netloc == google_hostname
+ and parsed_url.path == search_path):
+ # skipping news results
+ pass
# images result
- if (parsed_url.netloc == google_hostname
- and parsed_url.path == images_path):
+ elif (parsed_url.netloc == google_hostname
+ and parsed_url.path == images_path):
# only thumbnail image provided,
# so skipping image results
- # results = results + parse_images(result)
+ # results = results + parse_images(result, google_hostname)
pass
+
else:
# normal result
- content = extract_text(result.xpath(content_xpath)[0])
+ content = extract_text_from_dom(result, content_xpath)
+ if content is None:
+ continue
+ content_misc = extract_text_from_dom(result, content_misc_xpath)
+ if content_misc is not None:
+ content = content_misc + "<br />" + content
# append result
results.append({'url': url,
'title': title,
- 'content': content})
+ 'content': content
+ })
except:
continue
@@ -125,10 +279,10 @@ def response(resp):
return results
-def parse_images(result):
+def parse_images(result, google_hostname):
results = []
for image in result.xpath(images_xpath):
- url = parse_url(extract_text(image.xpath(image_url_xpath)[0]))
+ url = parse_url(extract_text(image.xpath(image_url_xpath)[0]), google_hostname)
img_src = extract_text(image.xpath(image_img_src_xpath)[0])
# append result
@@ -136,6 +290,77 @@ def parse_images(result):
'title': '',
'content': '',
'img_src': img_src,
- 'template': 'images.html'})
+ 'template': 'images.html'
+ })
+
+ return results
+
+
+def parse_map_near(parsed_url, x, google_hostname):
+ results = []
+
+ for result in x:
+ title = extract_text_from_dom(result, map_near_title)
+ url = parse_url(extract_text_from_dom(result, map_near_url), google_hostname)
+ attributes = []
+ phone = extract_text_from_dom(result, map_near_phone)
+ add_attributes(attributes, property_phone, phone, 'tel:' + phone)
+ results.append({'title': title,
+ 'url': url,
+ 'content': attributes_to_html(attributes)
+ })
return results
+
+
+def parse_map_detail(parsed_url, result, google_hostname):
+ results = []
+
+ # try to parse the geoloc
+ m = re.search('@([0-9\.]+),([0-9\.]+),([0-9]+)', parsed_url.path)
+ if m is None:
+ m = re.search('ll\=([0-9\.]+),([0-9\.]+)\&z\=([0-9]+)', parsed_url.query)
+
+ if m is not None:
+ # geoloc found (ignored)
+ lon = float(m.group(2)) # noqa
+ lat = float(m.group(1)) # noqa
+ zoom = int(m.group(3)) # noqa
+
+ # attributes
+ attributes = []
+ address = extract_text_from_dom(result, map_address_xpath)
+ phone = extract_text_from_dom(result, map_phone_xpath)
+ add_attributes(attributes, property_address, address, 'geo:' + str(lat) + ',' + str(lon))
+ add_attributes(attributes, property_phone, phone, 'tel:' + phone)
+
+ # title / content / url
+ website_title = extract_text_from_dom(result, map_website_title_xpath)
+ content = extract_text_from_dom(result, content_xpath)
+ website_url = parse_url(extract_text_from_dom(result, map_website_url_xpath), google_hostname)
+
+ # add a result if there is a website
+ if website_url is not None:
+ results.append({'title': website_title,
+ 'content': (content + '<br />' if content is not None else '')
+ + attributes_to_html(attributes),
+ 'url': website_url
+ })
+
+ return results
+
+
+def add_attributes(attributes, name, value, url):
+ if value is not None and len(value) > 0:
+ attributes.append({'label': name, 'value': value, 'url': url})
+
+
+def attributes_to_html(attributes):
+ retval = '<table class="table table-striped">'
+ for a in attributes:
+ value = a.get('value')
+ if 'url' in a:
+ value = '<a href="' + a.get('url') + '">' + value + '</a>'
+ retval = retval + '<tr><th>' + a.get('label') + '</th><td>' + value + '</td></tr>'
+ retval = retval + '</table>'
+ return retval
diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py
new file mode 100644
index 000000000..872bd4e95
--- /dev/null
+++ b/searx/engines/qwant.py
@@ -0,0 +1,98 @@
+"""
+ Qwant (Web, Images, News, Social)
+
+ @website https://qwant.com/
+ @provide-api not officially (https://api.qwant.com/api/search/)
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, title, content
+"""
+
+from urllib import urlencode
+from json import loads
+from datetime import datetime
+
+# engine dependent config
+categories = None
+paging = True
+language_support = True
+
+category_to_keyword = {'general': 'web',
+ 'images': 'images',
+ 'news': 'news',
+ 'social media': 'social'}
+
+# search-url
+url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}'
+
+
+# do search-request
+def request(query, params):
+ offset = (params['pageno'] - 1) * 10
+
+ if categories[0] and categories[0] in category_to_keyword:
+
+ params['url'] = url.format(keyword=category_to_keyword[categories[0]],
+ query=urlencode({'q': query}),
+ offset=offset)
+ else:
+ params['url'] = url.format(keyword='web',
+ query=urlencode({'q': query}),
+ offset=offset)
+
+ # add language tag if specified
+ if params['language'] != 'all':
+ params['url'] += '&locale=' + params['language'].lower()
+
+ return params
+
+
+# get response from search-request
+def response(resp):
+ results = []
+
+ search_results = loads(resp.text)
+
+ # return empty array if there are no results
+ if 'data' not in search_results:
+ return []
+
+ data = search_results.get('data', {})
+
+ res = data.get('result', {})
+
+ # parse results
+ for result in res.get('items', {}):
+
+ title = result['title']
+ res_url = result['url']
+ content = result['desc']
+
+ if category_to_keyword.get(categories[0], '') == 'web':
+ results.append({'title': title,
+ 'content': content,
+ 'url': res_url})
+
+ elif category_to_keyword.get(categories[0], '') == 'images':
+ thumbnail_src = result['thumbnail']
+ img_src = result['media']
+ results.append({'template': 'images.html',
+ 'url': res_url,
+ 'title': title,
+ 'content': '',
+ 'thumbnail_src': thumbnail_src,
+ 'img_src': img_src})
+
+ elif (category_to_keyword.get(categories[0], '') == 'news' or
+ category_to_keyword.get(categories[0], '') == 'social'):
+ published_date = datetime.fromtimestamp(result['date'], None)
+
+ results.append({'url': res_url,
+ 'title': title,
+ 'publishedDate': published_date,
+ 'content': content})
+
+ # return results
+ return results
diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py
new file mode 100644
index 000000000..2d31264ca
--- /dev/null
+++ b/searx/engines/swisscows.py
@@ -0,0 +1,108 @@
+"""
+ Swisscows (Web, Images)
+
+ @website https://swisscows.ch
+ @provide-api no
+
+ @using-api no
+ @results HTML (using search portal)
+ @stable no (HTML can change)
+ @parse url, title, content
+"""
+
+from json import loads
+from urllib import urlencode, unquote
+import re
+
+# engine dependent config
+categories = ['general', 'images']
+paging = True
+language_support = True
+
+# search-url
+base_url = 'https://swisscows.ch/'
+search_string = '?{query}&page={page}'
+
+# regex
+regex_json = re.compile('initialData: {"Request":(.|\n)*},\s*environment')
+regex_json_remove_start = re.compile('^initialData:\s*')
+regex_json_remove_end = re.compile(',\s*environment$')
+regex_img_url_remove_start = re.compile('^https?://i\.swisscows\.ch/\?link=')
+
+
+# do search-request
+def request(query, params):
+ if params['language'] == 'all':
+ ui_language = 'browser'
+ region = 'browser'
+ else:
+ region = params['language'].replace('_', '-')
+ ui_language = params['language'].split('_')[0]
+
+ search_path = search_string.format(
+ query=urlencode({'query': query,
+ 'uiLanguage': ui_language,
+ 'region': region}),
+ page=params['pageno'])
+
+ # image search query is something like 'image?{query}&page={page}'
+ if params['category'] == 'images':
+ search_path = 'image' + search_path
+
+ params['url'] = base_url + search_path
+
+ return params
+
+
+# get response from search-request
+def response(resp):
+ results = []
+
+ json_regex = regex_json.search(resp.content)
+
+ # check if results are returned
+ if not json_regex:
+ return []
+
+ json_raw = regex_json_remove_end.sub('', regex_json_remove_start.sub('', json_regex.group()))
+ json = loads(json_raw)
+
+ # parse results
+ for result in json['Results'].get('items', []):
+ result_title = result['Title'].replace(u'\uE000', '').replace(u'\uE001', '')
+
+ # parse image results
+ if result.get('ContentType', '').startswith('image'):
+ img_url = unquote(regex_img_url_remove_start.sub('', result['Url']))
+
+ # append result
+ results.append({'url': result['SourceUrl'],
+ 'title': result['Title'],
+ 'content': '',
+ 'img_src': img_url,
+ 'template': 'images.html'})
+
+ # parse general results
+ else:
+ result_url = result['Url'].replace(u'\uE000', '').replace(u'\uE001', '')
+ result_content = result['Description'].replace(u'\uE000', '').replace(u'\uE001', '')
+
+ # append result
+ results.append({'url': result_url,
+ 'title': result_title,
+ 'content': result_content})
+
+ # parse images
+ for result in json.get('Images', []):
+ # decode image url
+ img_url = unquote(regex_img_url_remove_start.sub('', result['Url']))
+
+ # append result
+ results.append({'url': result['SourceUrl'],
+ 'title': result['Title'],
+ 'content': '',
+ 'img_src': img_url,
+ 'template': 'images.html'})
+
+ # return results
+ return results
diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py
index 0dcc65b7c..517ac1c44 100644
--- a/searx/engines/vimeo.py
+++ b/searx/engines/vimeo.py
@@ -27,11 +27,11 @@ base_url = 'https://vimeo.com'
search_url = base_url + '/search/page:{pageno}?{query}'
# specific xpath variables
-results_xpath = '//div[@id="browse_content"]/ol/li'
-url_xpath = './a/@href'
-title_xpath = './a/div[@class="data"]/p[@class="title"]'
-content_xpath = './a/img/@src'
-publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'
+results_xpath = '//div[contains(@class,"results_grid")]/ul/li'
+url_xpath = './/a/@href'
+title_xpath = './/span[@class="title"]'
+thumbnail_xpath = './/img[@class="js-clip_thumbnail_image"]/@src'
+publishedDate_xpath = './/time/attribute::datetime'
embedded_url = '<iframe data-src="//player.vimeo.com/video{videoid}" ' +\
'width="540" height="304" frameborder="0" ' +\
@@ -58,7 +58,7 @@ def response(resp):
videoid = result.xpath(url_xpath)[0]
url = base_url + videoid
title = p.unescape(extract_text(result.xpath(title_xpath)))
- thumbnail = extract_text(result.xpath(content_xpath)[0])
+ thumbnail = extract_text(result.xpath(thumbnail_xpath)[0])
publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))
embedded = embedded_url.format(videoid=videoid)
diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py
index 12868ad22..ddb79bfea 100644
--- a/searx/engines/www1x.py
+++ b/searx/engines/www1x.py
@@ -20,8 +20,8 @@ import re
categories = ['images']
paging = False
-# search-url, no HTTPS (there is a valid certificate for https://api2.1x.com/ )
-base_url = 'http://1x.com'
+# search-url
+base_url = 'https://1x.com'
search_url = base_url+'/backend/search.php?{query}'
diff --git a/searx/engines/youtube_api.py b/searx/engines/youtube_api.py
new file mode 100644
index 000000000..8fd939a25
--- /dev/null
+++ b/searx/engines/youtube_api.py
@@ -0,0 +1,83 @@
+# Youtube (Videos)
+#
+# @website https://www.youtube.com/
+# @provide-api yes (https://developers.google.com/apis-explorer/#p/youtube/v3/youtube.search.list)
+#
+# @using-api yes
+# @results JSON
+# @stable yes
+# @parse url, title, content, publishedDate, thumbnail, embedded
+
+from json import loads
+from urllib import urlencode
+from dateutil import parser
+
+# engine dependent config
+categories = ['videos', 'music']
+paging = False
+language_support = True
+api_key = None
+
+# search-url
+base_url = 'https://www.googleapis.com/youtube/v3/search'
+search_url = base_url + '?part=snippet&{query}&maxResults=20&key={api_key}'
+
+embedded_url = '<iframe width="540" height="304" ' +\
+ 'data-src="//www.youtube-nocookie.com/embed/{videoid}" ' +\
+ 'frameborder="0" allowfullscreen></iframe>'
+
+base_youtube_url = 'https://www.youtube.com/watch?v='
+
+
+# do search-request
+def request(query, params):
+ params['url'] = search_url.format(query=urlencode({'q': query}),
+ api_key=api_key)
+
+ # add language tag if specified
+ if params['language'] != 'all':
+ params['url'] += '&relevanceLanguage=' + params['language'].split('_')[0]
+
+ return params
+
+
+# get response from search-request
+def response(resp):
+ results = []
+
+ search_results = loads(resp.text)
+
+ # return empty array if there are no results
+ if 'items' not in search_results:
+ return []
+
+ # parse results
+ for result in search_results['items']:
+ videoid = result['id']['videoId']
+
+ title = result['snippet']['title']
+ content = ''
+ thumbnail = ''
+
+ pubdate = result['snippet']['publishedAt']
+ publishedDate = parser.parse(pubdate)
+
+ thumbnail = result['snippet']['thumbnails']['high']['url']
+
+ content = result['snippet']['description']
+
+ url = base_youtube_url + videoid
+
+ embedded = embedded_url.format(videoid=videoid)
+
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content,
+ 'template': 'videos.html',
+ 'publishedDate': publishedDate,
+ 'embedded': embedded,
+ 'thumbnail': thumbnail})
+
+ # return results
+ return results
diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py
new file mode 100644
index 000000000..401fca4c9
--- /dev/null
+++ b/searx/engines/youtube_noapi.py
@@ -0,0 +1,81 @@
+# Youtube (Videos)
+#
+# @website https://www.youtube.com/
+# @provide-api yes (https://developers.google.com/apis-explorer/#p/youtube/v3/youtube.search.list)
+#
+# @using-api no
+# @results HTML
+# @stable no
+# @parse url, title, content, publishedDate, thumbnail, embedded
+
+from urllib import quote_plus
+from lxml import html
+from searx.engines.xpath import extract_text
+from searx.utils import list_get
+
+# engine dependent config
+categories = ['videos', 'music']
+paging = True
+language_support = False
+
+# search-url
+base_url = 'https://www.youtube.com/results'
+search_url = base_url + '?search_query={query}&page={page}'
+
+embedded_url = '<iframe width="540" height="304" ' +\
+ 'data-src="//www.youtube-nocookie.com/embed/{videoid}" ' +\
+ 'frameborder="0" allowfullscreen></iframe>'
+
+base_youtube_url = 'https://www.youtube.com/watch?v='
+
+# specific xpath variables
+results_xpath = "//ol/li/div[contains(@class, 'yt-lockup yt-lockup-tile yt-lockup-video vve-check')]"
+url_xpath = './/h3/a/@href'
+title_xpath = './/div[@class="yt-lockup-content"]/h3/a'
+content_xpath = './/div[@class="yt-lockup-content"]/div[@class="yt-lockup-description yt-ui-ellipsis yt-ui-ellipsis-2"]'
+
+
+# returns extract_text on the first result selected by the xpath or None
+def extract_text_from_dom(result, xpath):
+ r = result.xpath(xpath)
+ if len(r) > 0:
+ return extract_text(r[0])
+ return None
+
+
+# do search-request
+def request(query, params):
+ params['url'] = search_url.format(query=quote_plus(query),
+ page=params['pageno'])
+
+ return params
+
+
+# get response from search-request
+def response(resp):
+ results = []
+
+ dom = html.fromstring(resp.text)
+
+ # parse results
+ for result in dom.xpath(results_xpath):
+ videoid = list_get(result.xpath('@data-context-item-id'), 0)
+ if videoid is not None:
+ url = base_youtube_url + videoid
+ thumbnail = 'https://i.ytimg.com/vi/' + videoid + '/hqdefault.jpg'
+
+ title = extract_text_from_dom(result, title_xpath) or videoid
+ content = extract_text_from_dom(result, content_xpath)
+
+ embedded = embedded_url.format(videoid=videoid)
+
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content,
+ 'template': 'videos.html',
+ 'embedded': embedded,
+ 'thumbnail': thumbnail})
+
+ # return results
+ return results