5 files changed, 115 insertions, 13 deletions
diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py
index 943bf882e..a2397c48e 100644
--- a/searx/engines/bing_news.py
+++ b/searx/engines/bing_news.py
@@ -68,7 +68,7 @@ def request(query, params):
 def response(resp):
     results = []
 
-    rss = etree.fromstring(resp.text)
+    rss = etree.fromstring(resp.content)
 
     ns = rss.nsmap
 
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index b852de9ba..cfc8e7159 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -13,6 +13,8 @@
 from urllib import urlencode
 from cgi import escape
 from lxml import etree
+from random import randint
+from time import time
 
 # engine dependent config
 categories = ['general']
@@ -21,7 +23,7 @@ number_of_results = 5
 
 # search-url, invalid HTTPS certificate
 base_url = 'http://gigablast.com/'
-search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0'
+search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0&uxid={uxid}&rand={rand}'
 
 # specific xpath variables
 results_xpath = '//response//result'
@@ -37,7 +39,9 @@ def request(query, params):
     search_path = search_string.format(
         query=urlencode({'q': query}),
         offset=offset,
-        number_of_results=number_of_results)
+        number_of_results=number_of_results,
+        uxid=randint(10000, 10000000),
+        rand=int(time()))
 
     params['url'] = base_url + search_path
 
diff --git a/searx/engines/google.py b/searx/engines/google.py
index 0e78a9e2c..67e6ebb87 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -9,11 +9,15 @@
 # @parse       url, title, content, suggestion
 
 import re
+from cgi import escape
 from urllib import urlencode
 from urlparse import urlparse, parse_qsl
-from lxml import html
+from lxml import html, etree
 from searx.poolrequests import get
 from searx.engines.xpath import extract_text, extract_url
+from searx.search import logger
+
+logger = logger.getChild('google engine')
 
 
 # engine dependent config
@@ -167,7 +171,7 @@ def parse_url(url_string, google_hostname):
 def extract_text_from_dom(result, xpath):
     r = result.xpath(xpath)
     if len(r) > 0:
-        return extract_text(r[0])
+        return escape(extract_text(r[0]))
     return None
 
 
@@ -224,8 +228,8 @@ def response(resp):
 
     # parse results
     for result in dom.xpath(results_xpath):
-        title = extract_text(result.xpath(title_xpath)[0])
         try:
+            title = extract_text(result.xpath(title_xpath)[0])
             url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
             parsed_url = urlparse(url, google_hostname)
 
@@ -268,12 +272,13 @@ def response(resp):
                                 'content': content
                                 })
         except:
+            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
             continue
 
     # parse suggestion
     for suggestion in dom.xpath(suggestion_xpath):
         # append suggestion
-        results.append({'suggestion': extract_text(suggestion)})
+        results.append({'suggestion': escape(extract_text(suggestion))})
 
     # return results
     return results
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 7d58f7f01..a91cafa00 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -12,6 +12,8 @@
 
 from lxml import html
 from cgi import escape
+from dateutil import parser
+from datetime import datetime, timedelta
 import re
 from searx.engines.xpath import extract_text
 
@@ -79,15 +81,44 @@ def response(resp):
 
         title = escape(extract_text(link))
 
-        if result.xpath('./p[@class="desc"]'):
-            content = escape(extract_text(result.xpath('./p[@class="desc"]')))
+        if result.xpath('./p[@class="desc clk"]'):
+            content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
         else:
             content = ''
 
-        # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content})
+        published_date = None
+
+        # check if search result starts with something like: "2 Sep 2014 ... "
+        if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
+            date_pos = content.find('...')+4
+            date_string = content[0:date_pos-5]
+            published_date = parser.parse(date_string, dayfirst=True)
+
+            # fix content string
+            content = content[date_pos:]
+
+        # check if search result starts with something like: "5 days ago ... "
+        elif re.match("^[0-9]+ days? ago \.\.\. ", content):
+            date_pos = content.find('...')+4
+            date_string = content[0:date_pos-5]
+
+            # calculate datetime
+            published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
+
+            # fix content string
+            content = content[date_pos:]
+
+        if published_date:
+            # append result
+            results.append({'url': url,
+                            'title': title,
+                            'content': content,
+                            'publishedDate': published_date})
+        else:
+            # append result
+            results.append({'url': url,
+                            'title': title,
+                            'content': content})
 
     # return results
     return results
diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py
new file mode 100644
index 000000000..edc6ad5f2
--- /dev/null
+++ b/searx/engines/yandex.py
@@ -0,0 +1,62 @@
+"""
+ Yandex (Web)
+
+ @website     https://yandex.ru/
+ @provide-api ?
+ @using-api   no
+ @results     HTML (using search portal)
+ @stable      no (HTML can change)
+ @parse       url, title, content
+"""
+
+from urllib import urlencode
+from lxml import html
+from searx.search import logger
+
+logger = logger.getChild('yandex engine')
+
+# engine dependent config
+categories = ['general']
+paging = True
+language_support = True  # TODO
+
+default_tld = 'com'  # TLD used when the language has no entry in language_map
+language_map = {'ru': 'ru',  # language prefix of params['language'] -> yandex TLD
+                'uk': 'ua', 'ua': 'ua',  # Ukrainian is ISO 639-1 'uk'; legacy 'ua' key kept
+                'tr': 'com.tr'}
+
+# search-url
+base_url = 'https://yandex.{tld}/'
+search_url = 'search/?{query}&p={page}'
+
+results_xpath = '//div[@class="serp-item serp-item_plain_yes clearfix i-bem"]'
+url_xpath = './/h2/a/@href'
+title_xpath = './/h2/a//text()'
+content_xpath = './/div[@class="serp-item__text"]//text()'
+
+
+def request(query, params):
+    """Store the Yandex search URL for `query` in params['url'] and return params."""
+    tld = language_map.get(params['language'].split('_')[0]) or default_tld
+    path = search_url.format(page=params['pageno'] - 1, query=urlencode({'text': query}))
+    params['url'] = base_url.format(tld=tld) + path
+    return params
+
+
+def response(resp):
+    """Parse resp.text as HTML and return a list of url/title/content result dicts."""
+    dom = html.fromstring(resp.text)
+    results = []
+
+    for result in dom.xpath(results_xpath):
+        try:
+            res = {'url': result.xpath(url_xpath)[0],  # IndexError when no link matched
+                   'title': ''.join(result.xpath(title_xpath)),
+                   'content': ''.join(result.xpath(content_xpath))}
+        except Exception:  # not bare: let KeyboardInterrupt/SystemExit propagate
+            logger.exception('yandex parse crash')
+            continue
+
+        results.append(res)
+
+    return results