summary refs log tree commit diff
path: root/searx/engines/startpage.py
diff options
context:
space:
mode:
author Kang-min Liu <gugod@gugod.org> 2015-11-14 00:05:44 +0100
committer Kang-min Liu <gugod@gugod.org> 2015-11-14 00:05:44 +0100
commit ac8759cd3ff99024864fd04d7c4bef5c3a00b971 (patch)
tree 30c3f8b61504532df926bbffedcc8df80a8e926e /searx/engines/startpage.py
parent c7c6c35ccd7373d2107b70b92badb9b70d31905f (diff)
parent e98aef6fc4954681e58d774203d522f0ae478004 (diff)
Merge remote-tracking branch 'origin/master'
Diffstat (limited to 'searx/engines/startpage.py')
-rw-r--r-- searx/engines/startpage.py 53
1 files changed, 46 insertions, 7 deletions
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 9d5b4befe..a91cafa00 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -12,6 +12,8 @@
from lxml import html
from cgi import escape
+from dateutil import parser
+from datetime import datetime, timedelta
import re
from searx.engines.xpath import extract_text
@@ -66,20 +68,57 @@ def response(resp):
url = link.attrib.get('href')
# block google-ad url's
- if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url):
+ if re.match("^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
+ continue
+
+ # block startpage search url's
+ if re.match("^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
+ continue
+
+ # block ixquick search url's
+ if re.match("^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url):
continue
title = escape(extract_text(link))
- if result.xpath('./p[@class="desc"]'):
- content = escape(extract_text(result.xpath('./p[@class="desc"]')))
+ if result.xpath('./p[@class="desc clk"]'):
+ content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
else:
content = ''
- # append result
- results.append({'url': url,
- 'title': title,
- 'content': content})
+ published_date = None
+
+ # check if search result starts with something like: "2 Sep 2014 ... "
+ if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
+ date_pos = content.find('...')+4
+ date_string = content[0:date_pos-5]
+ published_date = parser.parse(date_string, dayfirst=True)
+
+ # fix content string
+ content = content[date_pos:]
+
+ # check if search result starts with something like: "5 days ago ... "
+ elif re.match("^[0-9]+ days? ago \.\.\. ", content):
+ date_pos = content.find('...')+4
+ date_string = content[0:date_pos-5]
+
+ # calculate datetime
+ published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
+
+ # fix content string
+ content = content[date_pos:]
+
+ if published_date:
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content,
+ 'publishedDate': published_date})
+ else:
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content})
# return results
return results