diff options
Diffstat (limited to 'searx/engines')
| -rw-r--r-- | searx/engines/bing_news.py | 2 | ||||
| -rw-r--r-- | searx/engines/gigablast.py | 8 | ||||
| -rw-r--r-- | searx/engines/google.py | 13 | ||||
| -rw-r--r-- | searx/engines/startpage.py | 43 | ||||
| -rw-r--r-- | searx/engines/yandex.py | 62 |
5 files changed, 115 insertions, 13 deletions
diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 943bf882e..a2397c48e 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -68,7 +68,7 @@ def request(query, params): def response(resp): results = [] - rss = etree.fromstring(resp.text) + rss = etree.fromstring(resp.content) ns = rss.nsmap diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index b852de9ba..cfc8e7159 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -13,6 +13,8 @@ from urllib import urlencode from cgi import escape from lxml import etree +from random import randint +from time import time # engine dependent config categories = ['general'] @@ -21,7 +23,7 @@ number_of_results = 5 # search-url, invalid HTTPS certificate base_url = 'http://gigablast.com/' -search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0' +search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0&uxid={uxid}&rand={rand}' # specific xpath variables results_xpath = '//response//result' @@ -37,7 +39,9 @@ def request(query, params): search_path = search_string.format( query=urlencode({'q': query}), offset=offset, - number_of_results=number_of_results) + number_of_results=number_of_results, + uxid=randint(10000, 10000000), + rand=int(time())) params['url'] = base_url + search_path diff --git a/searx/engines/google.py b/searx/engines/google.py index 0e78a9e2c..67e6ebb87 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -9,11 +9,15 @@ # @parse url, title, content, suggestion import re +from cgi import escape from urllib import urlencode from urlparse import urlparse, parse_qsl -from lxml import html +from lxml import html, etree from searx.poolrequests import get from searx.engines.xpath import extract_text, extract_url +from searx.search import logger + +logger = logger.getChild('google engine') # engine dependent config @@ -167,7 +171,7 @@ def parse_url(url_string, google_hostname): def 
extract_text_from_dom(result, xpath): r = result.xpath(xpath) if len(r) > 0: - return extract_text(r[0]) + return escape(extract_text(r[0])) return None @@ -224,8 +228,8 @@ def response(resp): # parse results for result in dom.xpath(results_xpath): - title = extract_text(result.xpath(title_xpath)[0]) try: + title = extract_text(result.xpath(title_xpath)[0]) url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname) parsed_url = urlparse(url, google_hostname) @@ -268,12 +272,13 @@ def response(resp): 'content': content }) except: + logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True)) continue # parse suggestion for suggestion in dom.xpath(suggestion_xpath): # append suggestion - results.append({'suggestion': extract_text(suggestion)}) + results.append({'suggestion': escape(extract_text(suggestion))}) # return results return results diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 7d58f7f01..a91cafa00 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -12,6 +12,8 @@ from lxml import html from cgi import escape +from dateutil import parser +from datetime import datetime, timedelta import re from searx.engines.xpath import extract_text @@ -79,15 +81,44 @@ def response(resp): title = escape(extract_text(link)) - if result.xpath('./p[@class="desc"]'): - content = escape(extract_text(result.xpath('./p[@class="desc"]'))) + if result.xpath('./p[@class="desc clk"]'): + content = escape(extract_text(result.xpath('./p[@class="desc clk"]'))) else: content = '' - # append result - results.append({'url': url, - 'title': title, - 'content': content}) + published_date = None + + # check if search result starts with something like: "2 Sep 2014 ... " + if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. 
", content): + date_pos = content.find('...')+4 + date_string = content[0:date_pos-5] + published_date = parser.parse(date_string, dayfirst=True) + + # fix content string + content = content[date_pos:] + + # check if search result starts with something like: "5 days ago ... " + elif re.match("^[0-9]+ days? ago \.\.\. ", content): + date_pos = content.find('...')+4 + date_string = content[0:date_pos-5] + + # calculate datetime + published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group())) + + # fix content string + content = content[date_pos:] + + if published_date: + # append result + results.append({'url': url, + 'title': title, + 'content': content, + 'publishedDate': published_date}) + else: + # append result + results.append({'url': url, + 'title': title, + 'content': content}) # return results return results diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py new file mode 100644 index 000000000..edc6ad5f2 --- /dev/null +++ b/searx/engines/yandex.py @@ -0,0 +1,62 @@ +""" + Yandex (Web) + + @website https://yandex.ru/ + @provide-api ? 
+ @using-api no + @results HTML (using search portal) + @stable no (HTML can change) + @parse url, title, content +""" + +from urllib import urlencode +from lxml import html +from searx.search import logger + +logger = logger.getChild('yandex engine') + +# engine dependent config +categories = ['general'] +paging = True +language_support = True # TODO + +default_tld = 'com' +language_map = {'ru': 'ru', + 'ua': 'uk', + 'tr': 'com.tr'} + +# search-url +base_url = 'https://yandex.{tld}/' +search_url = 'search/?{query}&p={page}' + +results_xpath = '//div[@class="serp-item serp-item_plain_yes clearfix i-bem"]' +url_xpath = './/h2/a/@href' +title_xpath = './/h2/a//text()' +content_xpath = './/div[@class="serp-item__text"]//text()' + + +def request(query, params): + lang = params['language'].split('_')[0] + host = base_url.format(tld=language_map.get(lang) or default_tld) + params['url'] = host + search_url.format(page=params['pageno']-1, + query=urlencode({'text': query})) + return params + + +# get response from search-request +def response(resp): + dom = html.fromstring(resp.text) + results = [] + + for result in dom.xpath(results_xpath): + try: + res = {'url': result.xpath(url_xpath)[0], + 'title': ''.join(result.xpath(title_xpath)), + 'content': ''.join(result.xpath(content_xpath))} + except: + logger.exception('yandex parse crash') + continue + + results.append(res) + + return results |