Merge pull request #57 from pointhi/results

Improve publishedDate extraction and output
author: Adam Tauber <asciimoo@gmail.com> 2014-03-18 18:20:10 +0100
committer: Adam Tauber <asciimoo@gmail.com> 2014-03-18 18:20:10 +0100
commit: 018a14431bd3612db4e8840ce24f3e60026ece0f (patch)
tree: 18c7f0ed489c0c0f206ac1f0a191b1ce0ab045b2 /searx/engines
parent: faed14b2c691746ba6cf98d164a5e6b1ca3ee4c9 (diff)
parent: 993271bed30e24c7ae1e0f63b64e030829206f27 (diff)
4 files changed, 19 insertions, 13 deletions
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index 43ccaa3e3..b8a7be3ee 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -2,6 +2,7 @@
 
 from urllib import urlencode
 from json import loads
+from dateutil import parser
 from datetime import datetime
 
 categories = ['news']
@@ -32,16 +33,9 @@ def response(resp):
         return []
 
     for result in search_res['responseData']['results']:
-# S.149 (159), library.pdf
-# datetime.strptime("Mon, 10 Mar 2014 16:26:15 -0700",
-#                   "%a, %d %b %Y %H:%M:%S %z")
-#        publishedDate = parse(result['publishedDate'])
-        publishedDate = datetime.strptime(
-            str.join(' ', result['publishedDate'].split(None)[0:5]),
-            "%a, %d %b %Y %H:%M:%S")
-        #utc_offset = timedelta(result['publishedDate'].split(None)[5])
-        # local = utc + offset
-        #publishedDate = publishedDate + utc_offset
+
+# Mon, 10 Mar 2014 16:26:15 -0700
+        publishedDate = parser.parse(result['publishedDate'])
 
         results.append({'url': result['unescapedUrl'],
                         'title': result['titleNoFormatting'],
diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py
index a95c75b49..d2d2a4dd0 100644
--- a/searx/engines/vimeo.py
+++ b/searx/engines/vimeo.py
@@ -2,6 +2,8 @@ from urllib import urlencode
 from HTMLParser import HTMLParser
 from lxml import html
 from xpath import extract_text
+from datetime import datetime
+from dateutil import parser
 
 base_url = 'http://vimeo.com'
 search_url = base_url + '/search?{query}'
@@ -10,6 +12,7 @@ content_xpath = None
 title_xpath = None
 results_xpath = ''
 content_tpl = '<a href="{0}">  <img src="{2}"/> </a>'
+publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'
 
 # the cookie set by vimeo contains all the following values,
 # but only __utma seems to be requiered
@@ -40,9 +43,12 @@ def response(resp):
         url = base_url + result.xpath(url_xpath)[0]
         title = p.unescape(extract_text(result.xpath(title_xpath)))
         thumbnail = extract_text(result.xpath(content_xpath)[0])
+        publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))
+
         results.append({'url': url,
                         'title': title,
                         'content': content_tpl.format(url, title, thumbnail),
                         'template': 'videos.html',
+                        'publishedDate': publishedDate,
                         'thumbnail': thumbnail})
     return results
diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py
index a1e9df59c..43da93ede 100644
--- a/searx/engines/yahoo_news.py
+++ b/searx/engines/yahoo_news.py
@@ -6,6 +6,7 @@ from searx.engines.xpath import extract_text, extract_url
 from searx.engines.yahoo import parse_url
 from datetime import datetime, timedelta
 import re
+from dateutil import parser
 
 categories = ['news']
 search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}'
@@ -52,9 +53,7 @@ def response(resp):
                     - timedelta(hours=int(timeNumbers[0]))\
                     - timedelta(minutes=int(timeNumbers[1]))
             else:
-                # TODO year in string possible?
-                publishedDate = datetime.strptime(publishedDate,
-                                                  "%b %d %H:%M%p")
+                publishedDate = parser.parse(publishedDate)
 
         if publishedDate.year == 1900:
             publishedDate = publishedDate.replace(year=datetime.now().year)
diff --git a/searx/engines/youtube.py b/searx/engines/youtube.py
index 5b04f3513..f6b08b330 100644
--- a/searx/engines/youtube.py
+++ b/searx/engines/youtube.py
@@ -1,5 +1,7 @@
 from json import loads
 from urllib import urlencode
+from dateutil import parser
+from datetime import datetime
 
 categories = ['videos']
 
@@ -35,6 +37,10 @@ def response(resp):
         content = ''
         thumbnail = ''
 
+#"2013-12-31T15:22:51.000Z"
+        pubdate = result['published']['$t']
+        publishedDate = parser.parse(pubdate)
+
         if result['media$group']['media$thumbnail']:
             thumbnail = result['media$group']['media$thumbnail'][0]['url']
             content += '<a href="{0}" title="{0}" ><img src="{1}" /></a>'.format(url, thumbnail)  # noqa
@@ -48,6 +54,7 @@ def response(resp):
                         'title': title,
                         'content': content,
                         'template': 'videos.html',
+                        'publishedDate': publishedDate,
                         'thumbnail': thumbnail})
 
     return results
author	Adam Tauber <asciimoo@gmail.com>	2014-03-18 18:20:10 +0100
committer	Adam Tauber <asciimoo@gmail.com>	2014-03-18 18:20:10 +0100
commit	018a14431bd3612db4e8840ce24f3e60026ece0f (patch)
tree	18c7f0ed489c0c0f206ac1f0a191b1ce0ab045b2 /searx/engines
parent	faed14b2c691746ba6cf98d164a5e6b1ca3ee4c9 (diff)
parent	993271bed30e24c7ae1e0f63b64e030829206f27 (diff)