summary | refs | log | tree | commit | diff
path: root/searx/engines/xpath.py
diff options
context:
space:
mode:
Diffstat (limited to 'searx/engines/xpath.py')
-rw-r--r--    searx/engines/xpath.py    35
1 file changed, 23 insertions, 12 deletions
diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py
index c8c56da44..b75896cc7 100644
--- a/searx/engines/xpath.py
+++ b/searx/engines/xpath.py
@@ -1,12 +1,13 @@
from lxml import html
from lxml.etree import _ElementStringResult, _ElementUnicodeResult
-from searx.utils import html_to_text
+from searx.utils import html_to_text, eval_xpath
from searx.url_utils import unquote, urlencode, urljoin, urlparse
search_url = None
url_xpath = None
content_xpath = None
title_xpath = None
+thumbnail_xpath = False
paging = False
suggestion_xpath = ''
results_xpath = ''
@@ -40,7 +41,9 @@ def extract_text(xpath_results):
return ''.join(xpath_results)
else:
# it's a element
- text = html.tostring(xpath_results, encoding='unicode', method='text', with_tail=False)
+ text = html.tostring(
+ xpath_results, encoding='unicode', method='text', with_tail=False
+ )
text = text.strip().replace('\n', ' ')
return ' '.join(text.split())
@@ -53,7 +56,7 @@ def extract_url(xpath_results, search_url):
if url.startswith('//'):
# add http or https to this kind of url //example.com/
parsed_search_url = urlparse(search_url)
- url = u'{0}:{1}'.format(parsed_search_url.scheme, url)
+ url = u'{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
elif url.startswith('/'):
# fix relative url to the search engine
url = urljoin(search_url, url)
@@ -101,22 +104,30 @@ def response(resp):
results = []
dom = html.fromstring(resp.text)
if results_xpath:
- for result in dom.xpath(results_xpath):
- url = extract_url(result.xpath(url_xpath), search_url)
- title = extract_text(result.xpath(title_xpath))
- content = extract_text(result.xpath(content_xpath))
- results.append({'url': url, 'title': title, 'content': content})
+ for result in eval_xpath(dom, results_xpath):
+ url = extract_url(eval_xpath(result, url_xpath), search_url)
+ title = extract_text(eval_xpath(result, title_xpath))
+ content = extract_text(eval_xpath(result, content_xpath))
+ tmp_result = {'url': url, 'title': title, 'content': content}
+
+ # add thumbnail if available
+ if thumbnail_xpath:
+ thumbnail_xpath_result = eval_xpath(result, thumbnail_xpath)
+ if len(thumbnail_xpath_result) > 0:
+ tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)
+
+ results.append(tmp_result)
else:
for url, title, content in zip(
(extract_url(x, search_url) for
- x in dom.xpath(url_xpath)),
- map(extract_text, dom.xpath(title_xpath)),
- map(extract_text, dom.xpath(content_xpath))
+ x in eval_xpath(dom, url_xpath)),
+ map(extract_text, eval_xpath(dom, title_xpath)),
+ map(extract_text, eval_xpath(dom, content_xpath))
):
results.append({'url': url, 'title': title, 'content': content})
if not suggestion_xpath:
return results
- for suggestion in dom.xpath(suggestion_xpath):
+ for suggestion in eval_xpath(dom, suggestion_xpath):
results.append({'suggestion': extract_text(suggestion)})
return results