From 23b9095cbf2d31a1495ee3d63a55bd81548cd367 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Mon, 24 Aug 2015 11:28:55 +0200 Subject: [fix] improve result handling of startpage engine --- searx/engines/startpage.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'searx/engines/startpage.py') diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 9d5b4befe..08e4f7a5b 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -66,7 +66,11 @@ def response(resp): url = link.attrib.get('href') # block google-ad url's - if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url): + if re.match("^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url): + continue + + # block startpage search url's + if re.match("^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url): continue title = escape(extract_text(link)) -- cgit v1.2.3 From 996c96fffff328497c2ba305c61e064256c84188 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Mon, 24 Aug 2015 11:31:30 +0200 Subject: [fix] block ixquick search url's --- searx/engines/startpage.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'searx/engines/startpage.py') diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 08e4f7a5b..7d58f7f01 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -73,6 +73,10 @@ def response(resp): if re.match("^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url): continue + # block ixquick search url's + if re.match("^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url): + continue + title = escape(extract_text(link)) if result.xpath('./p[@class="desc"]'): -- cgit v1.2.3 From 4508c966677708a2926afb1d05f134f252d8f93a Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Sat, 24 Oct 2015 16:15:30 +0200 Subject: [enh] fix content fetching, parse published date from description --- searx/engines/startpage.py | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'searx/engines/startpage.py') diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 7d58f7f01..a91cafa00 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -12,6 +12,8 @@ from lxml import html from cgi import escape +from dateutil import parser +from datetime import datetime, timedelta import re from searx.engines.xpath import extract_text @@ -79,15 +81,44 @@ def response(resp): title = escape(extract_text(link)) - if result.xpath('./p[@class="desc"]'): - content = escape(extract_text(result.xpath('./p[@class="desc"]'))) + if result.xpath('./p[@class="desc clk"]'): + content = escape(extract_text(result.xpath('./p[@class="desc clk"]'))) else: content = '' - # append result - results.append({'url': url, - 'title': title, - 'content': content}) + published_date = None + + # check if search result starts with something like: "2 Sep 2014 ... " + if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content): + date_pos = content.find('...')+4 + date_string = content[0:date_pos-5] + published_date = parser.parse(date_string, dayfirst=True) + + # fix content string + content = content[date_pos:] + + # check if search result starts with something like: "5 days ago ... " + elif re.match("^[0-9]+ days? ago \.\.\. ", content): + date_pos = content.find('...')+4 + date_string = content[0:date_pos-5] + + # calculate datetime + published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group())) + + # fix content string + content = content[date_pos:] + + if published_date: + # append result + results.append({'url': url, + 'title': title, + 'content': content, + 'publishedDate': published_date}) + else: + # append result + results.append({'url': url, + 'title': title, + 'content': content}) # return results return results -- cgit v1.2.3