| author    | Kang-min Liu <gugod@gugod.org>           | 2015-11-14 00:05:44 +0100 |
|-----------|------------------------------------------|---------------------------|
| committer | Kang-min Liu <gugod@gugod.org>           | 2015-11-14 00:05:44 +0100 |
| commit    | ac8759cd3ff99024864fd04d7c4bef5c3a00b971 |                           |
| tree      | 30c3f8b61504532df926bbffedcc8df80a8e926e | /searx/engines            |
| parent    | c7c6c35ccd7373d2107b70b92badb9b70d31905f |                           |
| parent    | e98aef6fc4954681e58d774203d522f0ae478004 |                           |

Merge remote-tracking branch 'origin/master'
Diffstat (limited to 'searx/engines')
| mode       | file                             | lines changed |
|------------|----------------------------------|--------------:|
| -rw-r--r-- | searx/engines/__init__.py        |             2 |
| -rw-r--r-- | searx/engines/bing.py            |             2 |
| -rw-r--r-- | searx/engines/bing_images.py     |             2 |
| -rw-r--r-- | searx/engines/gigablast.py       |             8 |
| -rw-r--r-- | searx/engines/google.py          |            13 |
| -rw-r--r-- | searx/engines/piratebay.py       |             2 |
| -rw-r--r-- | searx/engines/searchcode_code.py |             5 |
| -rw-r--r-- | searx/engines/searchcode_doc.py  |             5 |
| -rw-r--r-- | searx/engines/startpage.py       |            53 |
| -rw-r--r-- | searx/engines/twitter.py         |             8 |
| -rw-r--r-- | searx/engines/wikidata.py        |            34 |
| -rw-r--r-- | searx/engines/yandex.py          |            62 |
| -rw-r--r-- | searx/engines/youtube.py         |            93 |

13 files changed, 169 insertions, 120 deletions
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index 42e1f08bc..447138d3b 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -75,7 +75,7 @@ def load_engine(engine_data):
         engine.safesearch = False
 
     if not hasattr(engine, 'timeout'):
-        engine.timeout = settings['server']['request_timeout']
+        engine.timeout = settings['outgoing']['request_timeout']
 
     if not hasattr(engine, 'shortcut'):
         engine.shortcut = ''
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index c72e6aeff..171606cf6 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -52,7 +52,7 @@ def request(query, params):
 def response(resp):
     results = []
 
-    dom = html.fromstring(resp.content)
+    dom = html.fromstring(resp.text)
 
     # parse results
     for result in dom.xpath('//div[@class="sa_cc"]'):
diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
index 839b8e5be..06850dfe1 100644
--- a/searx/engines/bing_images.py
+++ b/searx/engines/bing_images.py
@@ -63,7 +63,7 @@ def request(query, params):
 def response(resp):
     results = []
 
-    dom = html.fromstring(resp.content)
+    dom = html.fromstring(resp.text)
 
     # init regex for yaml-parsing
     p = re.compile('({|,)([a-z]+):(")')
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index b852de9ba..cfc8e7159 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -13,6 +13,8 @@
 from urllib import urlencode
 from cgi import escape
 from lxml import etree
+from random import randint
+from time import time
 
 # engine dependent config
 categories = ['general']
@@ -21,7 +23,7 @@ number_of_results = 5
 
 # search-url, invalid HTTPS certificate
 base_url = 'http://gigablast.com/'
-search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0'
+search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0&uxid={uxid}&rand={rand}'
 
 # specific xpath variables
 results_xpath = '//response//result'
@@ -37,7 +39,9 @@ def request(query, params):
     search_path = search_string.format(
         query=urlencode({'q': query}),
         offset=offset,
-        number_of_results=number_of_results)
+        number_of_results=number_of_results,
+        uxid=randint(10000, 10000000),
+        rand=int(time()))
 
     params['url'] = base_url + search_path
 
diff --git a/searx/engines/google.py b/searx/engines/google.py
index 0e78a9e2c..67e6ebb87 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -9,11 +9,15 @@
 # @parse url, title, content, suggestion
 
 import re
+from cgi import escape
 from urllib import urlencode
 from urlparse import urlparse, parse_qsl
-from lxml import html
+from lxml import html, etree
 from searx.poolrequests import get
 from searx.engines.xpath import extract_text, extract_url
+from searx.search import logger
+
+logger = logger.getChild('google engine')
 
 
 # engine dependent config
@@ -167,7 +171,7 @@ def parse_url(url_string, google_hostname):
 
 def extract_text_from_dom(result, xpath):
     r = result.xpath(xpath)
     if len(r) > 0:
-        return extract_text(r[0])
+        return escape(extract_text(r[0]))
     return None
 
@@ -224,8 +228,8 @@ def response(resp):
 
     # parse results
     for result in dom.xpath(results_xpath):
-        title = extract_text(result.xpath(title_xpath)[0])
         try:
+            title = extract_text(result.xpath(title_xpath)[0])
             url = parse_url(extract_url(result.xpath(url_xpath), google_url),
                             google_hostname)
             parsed_url = urlparse(url, google_hostname)
@@ -268,12 +272,13 @@ def response(resp):
                             'content': content
                             })
         except:
+            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
             continue
 
     # parse suggestion
     for suggestion in dom.xpath(suggestion_xpath):
         # append suggestion
-        results.append({'suggestion': extract_text(suggestion)})
+        results.append({'suggestion': escape(extract_text(suggestion))})
 
     # return results
     return results
diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py
index ab0dfd44c..55446b410 100644
--- a/searx/engines/piratebay.py
+++ b/searx/engines/piratebay.py
@@ -20,7 +20,7 @@ categories = ['videos', 'music', 'files']
 paging = True
 
 # search-url
-url = 'https://thepiratebay.am/'
+url = 'https://thepiratebay.se/'
 search_url = url + 'search/{search_term}/{pageno}/99/{search_type}'
 
 # piratebay specific type-definitions
diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py
index 21d9c4ac2..bd5eb71d2 100644
--- a/searx/engines/searchcode_code.py
+++ b/searx/engines/searchcode_code.py
@@ -34,6 +34,11 @@ def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       pageno=params['pageno']-1)
 
+    # Disable SSL verification
+    # error: (60) SSL certificate problem: unable to get local issuer
+    # certificate
+    params['verify'] = False
+
     return params
 
 
diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py
index 582b98d79..9453f31a4 100644
--- a/searx/engines/searchcode_doc.py
+++ b/searx/engines/searchcode_doc.py
@@ -27,6 +27,11 @@ def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       pageno=params['pageno']-1)
 
+    # Disable SSL verification
+    # error: (60) SSL certificate problem: unable to get local issuer
+    # certificate
+    params['verify'] = False
+
     return params
 
 
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 9d5b4befe..a91cafa00 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -12,6 +12,8 @@
 from lxml import html
 from cgi import escape
+from dateutil import parser
+from datetime import datetime, timedelta
 import re
 from searx.engines.xpath import extract_text
 
 
@@ -66,20 +68,57 @@ def response(resp):
         url = link.attrib.get('href')
 
         # block google-ad url's
-        if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url):
+        if re.match("^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
+            continue
+
+        # block startpage search url's
+        if re.match("^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
+            continue
+
+        # block ixquick search url's
+        if re.match("^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url):
             continue
 
         title = escape(extract_text(link))
 
-        if result.xpath('./p[@class="desc"]'):
-            content = escape(extract_text(result.xpath('./p[@class="desc"]')))
+        if result.xpath('./p[@class="desc clk"]'):
+            content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
         else:
             content = ''
 
-        # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content})
+        published_date = None
+
+        # check if search result starts with something like: "2 Sep 2014 ... "
+        if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
+            date_pos = content.find('...')+4
+            date_string = content[0:date_pos-5]
+            published_date = parser.parse(date_string, dayfirst=True)
+
+            # fix content string
+            content = content[date_pos:]
+
+        # check if search result starts with something like: "5 days ago ... "
+        elif re.match("^[0-9]+ days? ago \.\.\. ", content):
+            date_pos = content.find('...')+4
+            date_string = content[0:date_pos-5]
+
+            # calculate datetime
+            published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
+
+            # fix content string
+            content = content[date_pos:]
+
+        if published_date:
+            # append result
+            results.append({'url': url,
+                            'title': title,
+                            'content': content,
+                            'publishedDate': published_date})
+        else:
+            # append result
+            results.append({'url': url,
+                            'title': title,
+                            'content': content})
 
     # return results
     return results
diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py
index a0ee18a47..36efac186 100644
--- a/searx/engines/twitter.py
+++ b/searx/engines/twitter.py
@@ -55,10 +55,14 @@ def response(resp):
 
     # parse results
     for tweet in dom.xpath(results_xpath):
-        link = tweet.xpath(link_xpath)[0]
+        try:
+            link = tweet.xpath(link_xpath)[0]
+            content = extract_text(tweet.xpath(content_xpath)[0])
+        except Exception:
+            continue
+
         url = urljoin(base_url, link.attrib.get('href'))
         title = extract_text(tweet.xpath(title_xpath))
 
-        content = extract_text(tweet.xpath(content_xpath)[0])
         pubdate = tweet.xpath(timestamp_xpath)
         if len(pubdate) > 0:
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index 43f72761e..fc840d47c 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -1,8 +1,15 @@
 import json
-from urllib import urlencode
+
+from searx import logger
 from searx.poolrequests import get
 from searx.utils import format_date_by_locale
+from datetime import datetime
+from dateutil.parser import parse as dateutil_parse
+from urllib import urlencode
+
+
+logger = logger.getChild('wikidata')
 
 result_count = 1
 wikidata_host = 'https://www.wikidata.org'
 wikidata_api = wikidata_host + '/w/api.php'
@@ -164,14 +171,12 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
     if postal_code is not None:
         attributes.append({'label': 'Postal code(s)', 'value': postal_code})
 
-    date_of_birth = get_time(claims, 'P569', None)
+    date_of_birth = get_time(claims, 'P569', locale, None)
     if date_of_birth is not None:
-        date_of_birth = format_date_by_locale(date_of_birth[8:], locale)
         attributes.append({'label': 'Date of birth', 'value': date_of_birth})
 
-    date_of_death = get_time(claims, 'P570', None)
+    date_of_death = get_time(claims, 'P570', locale, None)
     if date_of_death is not None:
-        date_of_death = format_date_by_locale(date_of_death[8:], locale)
         attributes.append({'label': 'Date of death', 'value': date_of_death})
 
     if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
@@ -229,7 +234,7 @@ def get_string(claims, propertyName, defaultValue=None):
         return result[0]
 
 
-def get_time(claims, propertyName, defaultValue=None):
+def get_time(claims, propertyName, locale, defaultValue=None):
     propValue = claims.get(propertyName, {})
     if len(propValue) == 0:
         return defaultValue
@@ -244,9 +249,22 @@
             result.append(value.get('time', ''))
 
     if len(result) == 0:
-        return defaultValue
+        date_string = defaultValue
     else:
-        return ', '.join(result)
+        date_string = ', '.join(result)
+
+    try:
+        parsed_date = datetime.strptime(date_string, "+%Y-%m-%dT%H:%M:%SZ")
+    except:
+        if date_string.startswith('-'):
+            return date_string.split('T')[0]
+        try:
+            parsed_date = dateutil_parse(date_string, fuzzy=False, default=False)
+        except:
+            logger.debug('could not parse date %s', date_string)
+            return date_string.split('T')[0]
+
+    return format_date_by_locale(parsed_date, locale)
 
 
 def get_geolink(claims, propertyName, defaultValue=''):
diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py
new file mode 100644
index 000000000..edc6ad5f2
--- /dev/null
+++ b/searx/engines/yandex.py
@@ -0,0 +1,62 @@
+"""
+ Yahoo (Web)
+
+ @website     https://yandex.ru/
+ @provide-api ?
+ @using-api   no
+ @results     HTML (using search portal)
+ @stable      no (HTML can change)
+ @parse       url, title, content
+"""
+
+from urllib import urlencode
+from lxml import html
+from searx.search import logger
+
+logger = logger.getChild('yandex engine')
+
+# engine dependent config
+categories = ['general']
+paging = True
+language_support = True  # TODO
+
+default_tld = 'com'
+language_map = {'ru': 'ru',
+                'ua': 'uk',
+                'tr': 'com.tr'}
+
+# search-url
+base_url = 'https://yandex.{tld}/'
+search_url = 'search/?{query}&p={page}'
+
+results_xpath = '//div[@class="serp-item serp-item_plain_yes clearfix i-bem"]'
+url_xpath = './/h2/a/@href'
+title_xpath = './/h2/a//text()'
+content_xpath = './/div[@class="serp-item__text"]//text()'
+
+
+def request(query, params):
+    lang = params['language'].split('_')[0]
+    host = base_url.format(tld=language_map.get(lang) or default_tld)
+    params['url'] = host + search_url.format(page=params['pageno']-1,
+                                             query=urlencode({'text': query}))
+    return params
+
+
+# get response from search-request
+def response(resp):
+    dom = html.fromstring(resp.text)
+    results = []
+
+    for result in dom.xpath(results_xpath):
+        try:
+            res = {'url': result.xpath(url_xpath)[0],
+                   'title': ''.join(result.xpath(title_xpath)),
+                   'content': ''.join(result.xpath(content_xpath))}
+        except:
+            logger.exception('yandex parse crash')
+            continue
+
+        results.append(res)
+
+    return results
diff --git a/searx/engines/youtube.py b/searx/engines/youtube.py
deleted file mode 100644
index c77cd2d0e..000000000
--- a/searx/engines/youtube.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Youtube (Videos)
-#
-# @website     https://www.youtube.com/
-# @provide-api yes (http://gdata-samples-youtube-search-py.appspot.com/)
-#
-# @using-api   yes
-# @results     JSON
-# @stable      yes
-# @parse       url, title, content, publishedDate, thumbnail, embedded
-
-from json import loads
-from urllib import urlencode
-from dateutil import parser
-
-# engine dependent config
-categories = ['videos', 'music']
-paging = True
-language_support = True
-
-# search-url
-base_url = 'https://gdata.youtube.com/feeds/api/videos'
-search_url = base_url + '?alt=json&{query}&start-index={index}&max-results=5'
-
-embedded_url = '<iframe width="540" height="304" ' +\
-    'data-src="//www.youtube-nocookie.com/embed/{videoid}" ' +\
-    'frameborder="0" allowfullscreen></iframe>'
-
-
-# do search-request
-def request(query, params):
-    index = (params['pageno'] - 1) * 5 + 1
-
-    params['url'] = search_url.format(query=urlencode({'q': query}),
-                                      index=index)
-
-    # add language tag if specified
-    if params['language'] != 'all':
-        params['url'] += '&lr=' + params['language'].split('_')[0]
-
-    return params
-
-
-# get response from search-request
-def response(resp):
-    results = []
-
-    search_results = loads(resp.text)
-
-    # return empty array if there are no results
-    if 'feed' not in search_results:
-        return []
-
-    feed = search_results['feed']
-
-    # parse results
-    for result in feed['entry']:
-        url = [x['href'] for x in result['link'] if x['type'] == 'text/html']
-
-        if not url:
-            continue
-
-        # remove tracking
-        url = url[0].replace('feature=youtube_gdata', '')
-        if url.endswith('&'):
-            url = url[:-1]
-
-        videoid = url[32:]
-
-        title = result['title']['$t']
-        content = ''
-        thumbnail = ''
-
-        pubdate = result['published']['$t']
-        publishedDate = parser.parse(pubdate)
-
-        if 'media$thumbnail' in result['media$group']:
-            thumbnail = result['media$group']['media$thumbnail'][0]['url']
-
-        content = result['content']['$t']
-
-        embedded = embedded_url.format(videoid=videoid)
-
-        # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content,
-                        'template': 'videos.html',
-                        'publishedDate': publishedDate,
-                        'embedded': embedded,
-                        'thumbnail': thumbnail})
-
-    # return results
-    return results
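
Every engine touched by this merge follows the same module contract that the new `yandex.py` illustrates: `request(query, params)` fills in `params['url']` (plus optional flags such as `params['verify']`), and `response(resp)` parses `resp.text` into a list of result dicts. The sketch below restates that contract in minimal form; the endpoint, markup, and XPath expressions are invented placeholders, not a real searx engine.

```python
# Minimal searx-style engine sketch (illustrative only).
# The URL and the XPath expressions below are placeholders, not a real service.
from urllib import urlencode   # Python 2, as in the engines above
from lxml import html

categories = ['general']
paging = True

base_url = 'https://example.com/'                 # hypothetical endpoint
search_url = base_url + 'search?{query}&p={page}'

results_xpath = '//div[@class="result"]'          # placeholder markup
url_xpath = './/a/@href'
title_xpath = './/a//text()'
content_xpath = './/p//text()'


def request(query, params):
    # Build the outgoing request; searx issues it with the engine's timeout,
    # which defaults to settings['outgoing']['request_timeout'] per the
    # __init__.py change above.
    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      page=params['pageno'] - 1)
    # params['verify'] = False  # only for hosts with a broken certificate
    #                           # chain, as done for searchcode in this merge
    return params


def response(resp):
    # Parse resp.text (not resp.content) so requests handles the decoding.
    results = []
    dom = html.fromstring(resp.text)
    for result in dom.xpath(results_xpath):
        try:
            results.append({'url': result.xpath(url_xpath)[0],
                            'title': ''.join(result.xpath(title_xpath)),
                            'content': ''.join(result.xpath(content_xpath))})
        except IndexError:
            # skip results that do not match the expected markup
            continue
    return results
```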