From b2492c94f422e18cb8954ec983134f4fa5c7cdc0 Mon Sep 17 00:00:00 2001 From: asciimoo Date: Mon, 20 Jan 2014 02:31:20 +0100 Subject: [fix] pep/flake8 compatibility --- searx/engines/xpath.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'searx/engines/xpath.py') diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py index 5e2c3c38b..a7d24e2a2 100644 --- a/searx/engines/xpath.py +++ b/searx/engines/xpath.py @@ -1,21 +1,24 @@ from lxml import html from urllib import urlencode, unquote from urlparse import urlparse, urljoin -from cgi import escape from lxml.etree import _ElementStringResult -search_url = None -url_xpath = None +search_url = None +url_xpath = None content_xpath = None -title_xpath = None +title_xpath = None suggestion_xpath = '' results_xpath = '' + ''' if xpath_results is list, extract the text from each result and concat the list -if xpath_results is a xml element, extract all the text node from it ( text_content() method from lxml ) +if xpath_results is a xml element, extract all the text node from it + ( text_content() method from lxml ) if xpath_results is a string element, then it's already done ''' + + def extract_text(xpath_results): if type(xpath_results) == list: # it's list of result : concat everything using recursive call @@ -60,7 +63,8 @@ def normalize_url(url): url += '/' # FIXME : hack for yahoo - if parsed_url.hostname == 'search.yahoo.com' and parsed_url.path.startswith('/r'): + if parsed_url.hostname == 'search.yahoo.com'\ + and parsed_url.path.startswith('/r'): p = parsed_url.path mark = p.find('/**') if mark != -1: @@ -82,15 +86,15 @@ def response(resp): if results_xpath: for result in dom.xpath(results_xpath): url = extract_url(result.xpath(url_xpath)) - title = extract_text(result.xpath(title_xpath)[0 ]) + title = extract_text(result.xpath(title_xpath)[0]) content = extract_text(result.xpath(content_xpath)[0]) results.append({'url': url, 'title': title, 'content': content}) else: for url, title, content in zip( - map(extract_url, dom.xpath(url_xpath)), \ - map(extract_text, dom.xpath(title_xpath)), \ - map(extract_text, dom.xpath(content_xpath)), \ - ): + map(extract_url, dom.xpath(url_xpath)), + map(extract_text, dom.xpath(title_xpath)), + map(extract_text, dom.xpath(content_xpath)) + ): results.append({'url': url, 'title': title, 'content': content}) if not suggestion_xpath: -- cgit v1.2.3 From 59eeeaab87951fd6fa3302ec240db98902a20b2c Mon Sep 17 00:00:00 2001 From: asciimoo Date: Thu, 23 Jan 2014 11:08:08 +0100 Subject: [fix] html tag removal --- searx/engines/xpath.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'searx/engines/xpath.py') diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py index a7d24e2a2..8960b5f21 100644 --- a/searx/engines/xpath.py +++ b/searx/engines/xpath.py @@ -2,6 +2,7 @@ from lxml import html from urllib import urlencode, unquote from urlparse import urlparse, urljoin from lxml.etree import _ElementStringResult +from searx.utils import html_to_text search_url = None url_xpath = None @@ -33,7 +34,7 @@ def extract_text(xpath_results): return ''.join(xpath_results) else: # it's a element - return xpath_results.text_content() + return html_to_text(xpath_results.text_content()) def extract_url(xpath_results): -- cgit v1.2.3