summaryrefslogtreecommitdiff
path: root/searx/engines
diff options
context:
space:
mode:
authorKang-min Liu <gugod@gugod.org>2015-11-14 00:05:44 +0100
committerKang-min Liu <gugod@gugod.org>2015-11-14 00:05:44 +0100
commitac8759cd3ff99024864fd04d7c4bef5c3a00b971 (patch)
tree30c3f8b61504532df926bbffedcc8df80a8e926e /searx/engines
parentc7c6c35ccd7373d2107b70b92badb9b70d31905f (diff)
parente98aef6fc4954681e58d774203d522f0ae478004 (diff)
Merge remote-tracking branch 'origin/master'
Diffstat (limited to 'searx/engines')
-rw-r--r--searx/engines/__init__.py2
-rw-r--r--searx/engines/bing.py2
-rw-r--r--searx/engines/bing_images.py2
-rw-r--r--searx/engines/gigablast.py8
-rw-r--r--searx/engines/google.py13
-rw-r--r--searx/engines/piratebay.py2
-rw-r--r--searx/engines/searchcode_code.py5
-rw-r--r--searx/engines/searchcode_doc.py5
-rw-r--r--searx/engines/startpage.py53
-rw-r--r--searx/engines/twitter.py8
-rw-r--r--searx/engines/wikidata.py34
-rw-r--r--searx/engines/yandex.py62
-rw-r--r--searx/engines/youtube.py93
13 files changed, 169 insertions, 120 deletions
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index 42e1f08bc..447138d3b 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -75,7 +75,7 @@ def load_engine(engine_data):
engine.safesearch = False
if not hasattr(engine, 'timeout'):
- engine.timeout = settings['server']['request_timeout']
+ engine.timeout = settings['outgoing']['request_timeout']
if not hasattr(engine, 'shortcut'):
engine.shortcut = ''
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index c72e6aeff..171606cf6 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -52,7 +52,7 @@ def request(query, params):
def response(resp):
results = []
- dom = html.fromstring(resp.content)
+ dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath('//div[@class="sa_cc"]'):
diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
index 839b8e5be..06850dfe1 100644
--- a/searx/engines/bing_images.py
+++ b/searx/engines/bing_images.py
@@ -63,7 +63,7 @@ def request(query, params):
def response(resp):
results = []
- dom = html.fromstring(resp.content)
+ dom = html.fromstring(resp.text)
# init regex for yaml-parsing
p = re.compile('({|,)([a-z]+):(")')
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index b852de9ba..cfc8e7159 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -13,6 +13,8 @@
from urllib import urlencode
from cgi import escape
from lxml import etree
+from random import randint
+from time import time
# engine dependent config
categories = ['general']
@@ -21,7 +23,7 @@ number_of_results = 5
# search-url, invalid HTTPS certificate
base_url = 'http://gigablast.com/'
-search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0'
+search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0&uxid={uxid}&rand={rand}'
# specific xpath variables
results_xpath = '//response//result'
@@ -37,7 +39,9 @@ def request(query, params):
search_path = search_string.format(
query=urlencode({'q': query}),
offset=offset,
- number_of_results=number_of_results)
+ number_of_results=number_of_results,
+ uxid=randint(10000, 10000000),
+ rand=int(time()))
params['url'] = base_url + search_path
diff --git a/searx/engines/google.py b/searx/engines/google.py
index 0e78a9e2c..67e6ebb87 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -9,11 +9,15 @@
# @parse url, title, content, suggestion
import re
+from cgi import escape
from urllib import urlencode
from urlparse import urlparse, parse_qsl
-from lxml import html
+from lxml import html, etree
from searx.poolrequests import get
from searx.engines.xpath import extract_text, extract_url
+from searx.search import logger
+
+logger = logger.getChild('google engine')
# engine dependent config
@@ -167,7 +171,7 @@ def parse_url(url_string, google_hostname):
def extract_text_from_dom(result, xpath):
r = result.xpath(xpath)
if len(r) > 0:
- return extract_text(r[0])
+ return escape(extract_text(r[0]))
return None
@@ -224,8 +228,8 @@ def response(resp):
# parse results
for result in dom.xpath(results_xpath):
- title = extract_text(result.xpath(title_xpath)[0])
try:
+ title = extract_text(result.xpath(title_xpath)[0])
url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
parsed_url = urlparse(url, google_hostname)
@@ -268,12 +272,13 @@ def response(resp):
'content': content
})
except:
+ logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
continue
# parse suggestion
for suggestion in dom.xpath(suggestion_xpath):
# append suggestion
- results.append({'suggestion': extract_text(suggestion)})
+ results.append({'suggestion': escape(extract_text(suggestion))})
# return results
return results
diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py
index ab0dfd44c..55446b410 100644
--- a/searx/engines/piratebay.py
+++ b/searx/engines/piratebay.py
@@ -20,7 +20,7 @@ categories = ['videos', 'music', 'files']
paging = True
# search-url
-url = 'https://thepiratebay.am/'
+url = 'https://thepiratebay.se/'
search_url = url + 'search/{search_term}/{pageno}/99/{search_type}'
# piratebay specific type-definitions
diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py
index 21d9c4ac2..bd5eb71d2 100644
--- a/searx/engines/searchcode_code.py
+++ b/searx/engines/searchcode_code.py
@@ -34,6 +34,11 @@ def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}),
pageno=params['pageno']-1)
+ # Disable SSL verification
+ # error: (60) SSL certificate problem: unable to get local issuer
+ # certificate
+ params['verify'] = False
+
return params
diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py
index 582b98d79..9453f31a4 100644
--- a/searx/engines/searchcode_doc.py
+++ b/searx/engines/searchcode_doc.py
@@ -27,6 +27,11 @@ def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}),
pageno=params['pageno']-1)
+ # Disable SSL verification
+ # error: (60) SSL certificate problem: unable to get local issuer
+ # certificate
+ params['verify'] = False
+
return params
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 9d5b4befe..a91cafa00 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -12,6 +12,8 @@
from lxml import html
from cgi import escape
+from dateutil import parser
+from datetime import datetime, timedelta
import re
from searx.engines.xpath import extract_text
@@ -66,20 +68,57 @@ def response(resp):
url = link.attrib.get('href')
# block google-ad url's
- if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url):
+ if re.match("^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
+ continue
+
+ # block startpage search url's
+ if re.match("^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
+ continue
+
+ # block ixquick search url's
+ if re.match("^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url):
continue
title = escape(extract_text(link))
- if result.xpath('./p[@class="desc"]'):
- content = escape(extract_text(result.xpath('./p[@class="desc"]')))
+ if result.xpath('./p[@class="desc clk"]'):
+ content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
else:
content = ''
- # append result
- results.append({'url': url,
- 'title': title,
- 'content': content})
+ published_date = None
+
+ # check if search result starts with something like: "2 Sep 2014 ... "
+ if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
+ date_pos = content.find('...')+4
+ date_string = content[0:date_pos-5]
+ published_date = parser.parse(date_string, dayfirst=True)
+
+ # fix content string
+ content = content[date_pos:]
+
+ # check if search result starts with something like: "5 days ago ... "
+ elif re.match("^[0-9]+ days? ago \.\.\. ", content):
+ date_pos = content.find('...')+4
+ date_string = content[0:date_pos-5]
+
+ # calculate datetime
+ published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
+
+ # fix content string
+ content = content[date_pos:]
+
+ if published_date:
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content,
+ 'publishedDate': published_date})
+ else:
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content})
# return results
return results
diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py
index a0ee18a47..36efac186 100644
--- a/searx/engines/twitter.py
+++ b/searx/engines/twitter.py
@@ -55,10 +55,14 @@ def response(resp):
# parse results
for tweet in dom.xpath(results_xpath):
- link = tweet.xpath(link_xpath)[0]
+ try:
+ link = tweet.xpath(link_xpath)[0]
+ content = extract_text(tweet.xpath(content_xpath)[0])
+ except Exception:
+ continue
+
url = urljoin(base_url, link.attrib.get('href'))
title = extract_text(tweet.xpath(title_xpath))
- content = extract_text(tweet.xpath(content_xpath)[0])
pubdate = tweet.xpath(timestamp_xpath)
if len(pubdate) > 0:
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index 43f72761e..fc840d47c 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -1,8 +1,15 @@
import json
-from urllib import urlencode
+
+from searx import logger
from searx.poolrequests import get
from searx.utils import format_date_by_locale
+from datetime import datetime
+from dateutil.parser import parse as dateutil_parse
+from urllib import urlencode
+
+
+logger = logger.getChild('wikidata')
result_count = 1
wikidata_host = 'https://www.wikidata.org'
wikidata_api = wikidata_host + '/w/api.php'
@@ -164,14 +171,12 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
if postal_code is not None:
attributes.append({'label': 'Postal code(s)', 'value': postal_code})
- date_of_birth = get_time(claims, 'P569', None)
+ date_of_birth = get_time(claims, 'P569', locale, None)
if date_of_birth is not None:
- date_of_birth = format_date_by_locale(date_of_birth[8:], locale)
attributes.append({'label': 'Date of birth', 'value': date_of_birth})
- date_of_death = get_time(claims, 'P570', None)
+ date_of_death = get_time(claims, 'P570', locale, None)
if date_of_death is not None:
- date_of_death = format_date_by_locale(date_of_death[8:], locale)
attributes.append({'label': 'Date of death', 'value': date_of_death})
if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
@@ -229,7 +234,7 @@ def get_string(claims, propertyName, defaultValue=None):
return result[0]
-def get_time(claims, propertyName, defaultValue=None):
+def get_time(claims, propertyName, locale, defaultValue=None):
propValue = claims.get(propertyName, {})
if len(propValue) == 0:
return defaultValue
@@ -244,9 +249,22 @@ def get_time(claims, propertyName, defaultValue=None):
result.append(value.get('time', ''))
if len(result) == 0:
- return defaultValue
+ date_string = defaultValue
else:
- return ', '.join(result)
+ date_string = ', '.join(result)
+
+ try:
+ parsed_date = datetime.strptime(date_string, "+%Y-%m-%dT%H:%M:%SZ")
+ except:
+ if date_string.startswith('-'):
+ return date_string.split('T')[0]
+ try:
+ parsed_date = dateutil_parse(date_string, fuzzy=False, default=False)
+ except:
+ logger.debug('could not parse date %s', date_string)
+ return date_string.split('T')[0]
+
+ return format_date_by_locale(parsed_date, locale)
def get_geolink(claims, propertyName, defaultValue=''):
diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py
new file mode 100644
index 000000000..edc6ad5f2
--- /dev/null
+++ b/searx/engines/yandex.py
@@ -0,0 +1,62 @@
+"""
+ Yandex (Web)
+
+ @website https://yandex.ru/
+ @provide-api ?
+ @using-api no
+ @results HTML (using search portal)
+ @stable no (HTML can change)
+ @parse url, title, content
+"""
+
+from urllib import urlencode
+from lxml import html
+from searx.search import logger
+
+logger = logger.getChild('yandex engine')
+
+# engine dependent config
+categories = ['general']
+paging = True
+language_support = True # TODO
+
+default_tld = 'com'
+language_map = {'ru': 'ru',
+ 'ua': 'uk',
+ 'tr': 'com.tr'}
+
+# search-url
+base_url = 'https://yandex.{tld}/'
+search_url = 'search/?{query}&p={page}'
+
+results_xpath = '//div[@class="serp-item serp-item_plain_yes clearfix i-bem"]'
+url_xpath = './/h2/a/@href'
+title_xpath = './/h2/a//text()'
+content_xpath = './/div[@class="serp-item__text"]//text()'
+
+
+def request(query, params):
+ lang = params['language'].split('_')[0]
+ host = base_url.format(tld=language_map.get(lang) or default_tld)
+ params['url'] = host + search_url.format(page=params['pageno']-1,
+ query=urlencode({'text': query}))
+ return params
+
+
+# get response from search-request
+def response(resp):
+ dom = html.fromstring(resp.text)
+ results = []
+
+ for result in dom.xpath(results_xpath):
+ try:
+ res = {'url': result.xpath(url_xpath)[0],
+ 'title': ''.join(result.xpath(title_xpath)),
+ 'content': ''.join(result.xpath(content_xpath))}
+ except:
+ logger.exception('yandex parse crash')
+ continue
+
+ results.append(res)
+
+ return results
diff --git a/searx/engines/youtube.py b/searx/engines/youtube.py
deleted file mode 100644
index c77cd2d0e..000000000
--- a/searx/engines/youtube.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Youtube (Videos)
-#
-# @website https://www.youtube.com/
-# @provide-api yes (http://gdata-samples-youtube-search-py.appspot.com/)
-#
-# @using-api yes
-# @results JSON
-# @stable yes
-# @parse url, title, content, publishedDate, thumbnail, embedded
-
-from json import loads
-from urllib import urlencode
-from dateutil import parser
-
-# engine dependent config
-categories = ['videos', 'music']
-paging = True
-language_support = True
-
-# search-url
-base_url = 'https://gdata.youtube.com/feeds/api/videos'
-search_url = base_url + '?alt=json&{query}&start-index={index}&max-results=5'
-
-embedded_url = '<iframe width="540" height="304" ' +\
- 'data-src="//www.youtube-nocookie.com/embed/{videoid}" ' +\
- 'frameborder="0" allowfullscreen></iframe>'
-
-
-# do search-request
-def request(query, params):
- index = (params['pageno'] - 1) * 5 + 1
-
- params['url'] = search_url.format(query=urlencode({'q': query}),
- index=index)
-
- # add language tag if specified
- if params['language'] != 'all':
- params['url'] += '&lr=' + params['language'].split('_')[0]
-
- return params
-
-
-# get response from search-request
-def response(resp):
- results = []
-
- search_results = loads(resp.text)
-
- # return empty array if there are no results
- if 'feed' not in search_results:
- return []
-
- feed = search_results['feed']
-
- # parse results
- for result in feed['entry']:
- url = [x['href'] for x in result['link'] if x['type'] == 'text/html']
-
- if not url:
- continue
-
- # remove tracking
- url = url[0].replace('feature=youtube_gdata', '')
- if url.endswith('&'):
- url = url[:-1]
-
- videoid = url[32:]
-
- title = result['title']['$t']
- content = ''
- thumbnail = ''
-
- pubdate = result['published']['$t']
- publishedDate = parser.parse(pubdate)
-
- if 'media$thumbnail' in result['media$group']:
- thumbnail = result['media$group']['media$thumbnail'][0]['url']
-
- content = result['content']['$t']
-
- embedded = embedded_url.format(videoid=videoid)
-
- # append result
- results.append({'url': url,
- 'title': title,
- 'content': content,
- 'template': 'videos.html',
- 'publishedDate': publishedDate,
- 'embedded': embedded,
- 'thumbnail': thumbnail})
-
- # return results
- return results