summaryrefslogtreecommitdiff
path: root/searx/engines
diff options
context:
space:
mode:
Diffstat (limited to 'searx/engines')
-rw-r--r--  searx/engines/bing_news.py  |  2
-rw-r--r--  searx/engines/gigablast.py  |  8
-rw-r--r--  searx/engines/google.py     | 13
-rw-r--r--  searx/engines/startpage.py  | 43
-rw-r--r--  searx/engines/yandex.py     | 62
5 files changed, 115 insertions(+), 13 deletions(-)
diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py
index 943bf882e..a2397c48e 100644
--- a/searx/engines/bing_news.py
+++ b/searx/engines/bing_news.py
@@ -68,7 +68,7 @@ def request(query, params):
def response(resp):
results = []
- rss = etree.fromstring(resp.text)
+ rss = etree.fromstring(resp.content)
ns = rss.nsmap
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index b852de9ba..cfc8e7159 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -13,6 +13,8 @@
from urllib import urlencode
from cgi import escape
from lxml import etree
+from random import randint
+from time import time
# engine dependent config
categories = ['general']
@@ -21,7 +23,7 @@ number_of_results = 5
# search-url, invalid HTTPS certificate
base_url = 'http://gigablast.com/'
-search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0'
+search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0&uxid={uxid}&rand={rand}'
# specific xpath variables
results_xpath = '//response//result'
@@ -37,7 +39,9 @@ def request(query, params):
search_path = search_string.format(
query=urlencode({'q': query}),
offset=offset,
- number_of_results=number_of_results)
+ number_of_results=number_of_results,
+ uxid=randint(10000, 10000000),
+ rand=int(time()))
params['url'] = base_url + search_path
diff --git a/searx/engines/google.py b/searx/engines/google.py
index 0e78a9e2c..67e6ebb87 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -9,11 +9,15 @@
# @parse url, title, content, suggestion
import re
+from cgi import escape
from urllib import urlencode
from urlparse import urlparse, parse_qsl
-from lxml import html
+from lxml import html, etree
from searx.poolrequests import get
from searx.engines.xpath import extract_text, extract_url
+from searx.search import logger
+
+logger = logger.getChild('google engine')
# engine dependent config
@@ -167,7 +171,7 @@ def parse_url(url_string, google_hostname):
def extract_text_from_dom(result, xpath):
r = result.xpath(xpath)
if len(r) > 0:
- return extract_text(r[0])
+ return escape(extract_text(r[0]))
return None
@@ -224,8 +228,8 @@ def response(resp):
# parse results
for result in dom.xpath(results_xpath):
- title = extract_text(result.xpath(title_xpath)[0])
try:
+ title = extract_text(result.xpath(title_xpath)[0])
url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
parsed_url = urlparse(url, google_hostname)
@@ -268,12 +272,13 @@ def response(resp):
'content': content
})
except:
+ logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
continue
# parse suggestion
for suggestion in dom.xpath(suggestion_xpath):
# append suggestion
- results.append({'suggestion': extract_text(suggestion)})
+ results.append({'suggestion': escape(extract_text(suggestion))})
# return results
return results
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 7d58f7f01..a91cafa00 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -12,6 +12,8 @@
from lxml import html
from cgi import escape
+from dateutil import parser
+from datetime import datetime, timedelta
import re
from searx.engines.xpath import extract_text
@@ -79,15 +81,44 @@ def response(resp):
title = escape(extract_text(link))
- if result.xpath('./p[@class="desc"]'):
- content = escape(extract_text(result.xpath('./p[@class="desc"]')))
+ if result.xpath('./p[@class="desc clk"]'):
+ content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
else:
content = ''
- # append result
- results.append({'url': url,
- 'title': title,
- 'content': content})
+ published_date = None
+
+ # check if search result starts with something like: "2 Sep 2014 ... "
+ if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
+ date_pos = content.find('...')+4
+ date_string = content[0:date_pos-5]
+ published_date = parser.parse(date_string, dayfirst=True)
+
+ # fix content string
+ content = content[date_pos:]
+
+ # check if search result starts with something like: "5 days ago ... "
+ elif re.match("^[0-9]+ days? ago \.\.\. ", content):
+ date_pos = content.find('...')+4
+ date_string = content[0:date_pos-5]
+
+ # calculate datetime
+ published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
+
+ # fix content string
+ content = content[date_pos:]
+
+ if published_date:
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content,
+ 'publishedDate': published_date})
+ else:
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content})
# return results
return results
diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py
new file mode 100644
index 000000000..edc6ad5f2
--- /dev/null
+++ b/searx/engines/yandex.py
@@ -0,0 +1,62 @@
+"""
+ Yandex (Web)
+
+ @website https://yandex.ru/
+ @provide-api ?
+ @using-api no
+ @results HTML (using search portal)
+ @stable no (HTML can change)
+ @parse url, title, content
+"""
+
+from urllib import urlencode
+from lxml import html
+from searx.search import logger
+
+logger = logger.getChild('yandex engine')
+
+# engine dependent config
+categories = ['general']
+paging = True
+language_support = True # TODO
+
+default_tld = 'com'
+language_map = {'ru': 'ru',
+ 'ua': 'uk',
+ 'tr': 'com.tr'}
+
+# search-url
+base_url = 'https://yandex.{tld}/'
+search_url = 'search/?{query}&p={page}'
+
+results_xpath = '//div[@class="serp-item serp-item_plain_yes clearfix i-bem"]'
+url_xpath = './/h2/a/@href'
+title_xpath = './/h2/a//text()'
+content_xpath = './/div[@class="serp-item__text"]//text()'
+
+
+def request(query, params):
+ lang = params['language'].split('_')[0]
+ host = base_url.format(tld=language_map.get(lang) or default_tld)
+ params['url'] = host + search_url.format(page=params['pageno']-1,
+ query=urlencode({'text': query}))
+ return params
+
+
+# get response from search-request
+def response(resp):
+ dom = html.fromstring(resp.text)
+ results = []
+
+ for result in dom.xpath(results_xpath):
+ try:
+ res = {'url': result.xpath(url_xpath)[0],
+ 'title': ''.join(result.xpath(title_xpath)),
+ 'content': ''.join(result.xpath(content_xpath))}
+ except:
+ logger.exception('yandex parse crash')
+ continue
+
+ results.append(res)
+
+ return results