| author    | Kang-min Liu <gugod@gugod.org>           | 2015-11-14 00:05:44 +0100 |
|-----------|------------------------------------------|---------------------------|
| committer | Kang-min Liu <gugod@gugod.org>           | 2015-11-14 00:05:44 +0100 |
| commit    | ac8759cd3ff99024864fd04d7c4bef5c3a00b971 |                           |
| tree      | 30c3f8b61504532df926bbffedcc8df80a8e926e | /searx/engines            |
| parent    | c7c6c35ccd7373d2107b70b92badb9b70d31905f |                           |
| parent    | e98aef6fc4954681e58d774203d522f0ae478004 |                           |

Merge remote-tracking branch 'origin/master'
Diffstat (limited to 'searx/engines')
| mode       | file                             | lines changed |
|------------|----------------------------------|--------------:|
| -rw-r--r-- | searx/engines/__init__.py        |             2 |
| -rw-r--r-- | searx/engines/bing.py            |             2 |
| -rw-r--r-- | searx/engines/bing_images.py     |             2 |
| -rw-r--r-- | searx/engines/gigablast.py       |             8 |
| -rw-r--r-- | searx/engines/google.py          |            13 |
| -rw-r--r-- | searx/engines/piratebay.py       |             2 |
| -rw-r--r-- | searx/engines/searchcode_code.py |             5 |
| -rw-r--r-- | searx/engines/searchcode_doc.py  |             5 |
| -rw-r--r-- | searx/engines/startpage.py       |            53 |
| -rw-r--r-- | searx/engines/twitter.py         |             8 |
| -rw-r--r-- | searx/engines/wikidata.py        |            34 |
| -rw-r--r-- | searx/engines/yandex.py          |            62 |
| -rw-r--r-- | searx/engines/youtube.py         |            93 |

13 files changed, 169 insertions, 120 deletions
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index 42e1f08bc..447138d3b 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -75,7 +75,7 @@ def load_engine(engine_data):
         engine.safesearch = False
 
     if not hasattr(engine, 'timeout'):
-        engine.timeout = settings['server']['request_timeout']
+        engine.timeout = settings['outgoing']['request_timeout']
 
     if not hasattr(engine, 'shortcut'):
         engine.shortcut = ''
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index c72e6aeff..171606cf6 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -52,7 +52,7 @@ def request(query, params):
 def response(resp):
     results = []
 
-    dom = html.fromstring(resp.content)
+    dom = html.fromstring(resp.text)
 
     # parse results
     for result in dom.xpath('//div[@class="sa_cc"]'):
diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
index 839b8e5be..06850dfe1 100644
--- a/searx/engines/bing_images.py
+++ b/searx/engines/bing_images.py
@@ -63,7 +63,7 @@ def request(query, params):
 def response(resp):
     results = []
 
-    dom = html.fromstring(resp.content)
+    dom = html.fromstring(resp.text)
 
     # init regex for yaml-parsing
     p = re.compile('({|,)([a-z]+):(")')
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index b852de9ba..cfc8e7159 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -13,6 +13,8 @@
 from urllib import urlencode
 from cgi import escape
 from lxml import etree
+from random import randint
+from time import time
 
 # engine dependent config
 categories = ['general']
@@ -21,7 +23,7 @@ number_of_results = 5
 
 # search-url, invalid HTTPS certificate
 base_url = 'http://gigablast.com/'
-search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0'
+search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0&uxid={uxid}&rand={rand}'
 
 # specific xpath variables
 results_xpath = '//response//result'
@@ -37,7 +39,9 @@ def request(query, params):
     search_path = search_string.format(
         query=urlencode({'q': query}),
         offset=offset,
-        number_of_results=number_of_results)
+        number_of_results=number_of_results,
+        uxid=randint(10000, 10000000),
+        rand=int(time()))
 
     params['url'] = base_url + search_path
 
diff --git a/searx/engines/google.py b/searx/engines/google.py
index 0e78a9e2c..67e6ebb87 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -9,11 +9,15 @@
 # @parse url, title, content, suggestion
 
 import re
+from cgi import escape
 from urllib import urlencode
 from urlparse import urlparse, parse_qsl
-from lxml import html
+from lxml import html, etree
 from searx.poolrequests import get
 from searx.engines.xpath import extract_text, extract_url
+from searx.search import logger
+
+logger = logger.getChild('google engine')
 
 
 # engine dependent config
@@ -167,7 +171,7 @@ def parse_url(url_string, google_hostname):
 
 def extract_text_from_dom(result, xpath):
     r = result.xpath(xpath)
     if len(r) > 0:
-        return extract_text(r[0])
+        return escape(extract_text(r[0]))
     return None
 
@@ -224,8 +228,8 @@ def response(resp):
 
     # parse results
     for result in dom.xpath(results_xpath):
-        title = extract_text(result.xpath(title_xpath)[0])
         try:
+            title = extract_text(result.xpath(title_xpath)[0])
             url = parse_url(extract_url(result.xpath(url_xpath), google_url),
                             google_hostname)
             parsed_url = urlparse(url, google_hostname)
@@ -268,12 +272,13 @@ def response(resp):
                             'content': content
                             })
         except:
+            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
             continue
 
     # parse suggestion
     for suggestion in dom.xpath(suggestion_xpath):
         # append suggestion
-        results.append({'suggestion': extract_text(suggestion)})
+        results.append({'suggestion': escape(extract_text(suggestion))})
 
     # return results
     return results
diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py
index ab0dfd44c..55446b410 100644
--- a/searx/engines/piratebay.py
+++ b/searx/engines/piratebay.py
@@ -20,7 +20,7 @@ categories = ['videos', 'music', 'files']
 paging = True
 
 # search-url
-url = 'https://thepiratebay.am/'
+url = 'https://thepiratebay.se/'
 search_url = url + 'search/{search_term}/{pageno}/99/{search_type}'
 
 # piratebay specific type-definitions
diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py
index 21d9c4ac2..bd5eb71d2 100644
--- a/searx/engines/searchcode_code.py
+++ b/searx/engines/searchcode_code.py
@@ -34,6 +34,11 @@ def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       pageno=params['pageno']-1)
 
+    # Disable SSL verification
+    # error: (60) SSL certificate problem: unable to get local issuer
+    # certificate
+    params['verify'] = False
+
     return params
 
 
diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py
index 582b98d79..9453f31a4 100644
--- a/searx/engines/searchcode_doc.py
+++ b/searx/engines/searchcode_doc.py
@@ -27,6 +27,11 @@ def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       pageno=params['pageno']-1)
 
+    # Disable SSL verification
+    # error: (60) SSL certificate problem: unable to get local issuer
+    # certificate
+    params['verify'] = False
+
     return params
 
 
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 9d5b4befe..a91cafa00 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -12,6 +12,8 @@
 from lxml import html
 from cgi import escape
+from dateutil import parser
+from datetime import datetime, timedelta
 import re
 from searx.engines.xpath import extract_text
 
 
@@ -66,20 +68,57 @@ def response(resp):
         url = link.attrib.get('href')
 
         # block google-ad url's
-        if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url):
+        if re.match("^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
+            continue
+
+        # block startpage search url's
+        if re.match("^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
+            continue
+
+        # block ixquick search url's
+        if re.match("^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url):
             continue
 
         title = escape(extract_text(link))
 
-        if result.xpath('./p[@class="desc"]'):
-            content = escape(extract_text(result.xpath('./p[@class="desc"]')))
+        if result.xpath('./p[@class="desc clk"]'):
+            content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
         else:
             content = ''
 
-        # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content})
+        published_date = None
+
+        # check if search result starts with something like: "2 Sep 2014 ... "
+        if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
+            date_pos = content.find('...')+4
+            date_string = content[0:date_pos-5]
+            published_date = parser.parse(date_string, dayfirst=True)
+
+            # fix content string
+            content = content[date_pos:]
+
+        # check if search result starts with something like: "5 days ago ... "
+        elif re.match("^[0-9]+ days? ago \.\.\. ", content):
+            date_pos = content.find('...')+4
+            date_string = content[0:date_pos-5]
+
+            # calculate datetime
+            published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
+
+            # fix content string
+            content = content[date_pos:]
+
+        if published_date:
+            # append result
+            results.append({'url': url,
+                            'title': title,
+                            'content': content,
+                            'publishedDate': published_date})
+        else:
+            # append result
+            results.append({'url': url,
+                            'title': title,
+                            'content': content})
 
     # return results
     return results
diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py
index a0ee18a47..36efac186 100644
--- a/searx/engines/twitter.py
+++ b/searx/engines/twitter.py
@@ -55,10 +55,14 @@ def response(resp):
 
     # parse results
     for tweet in dom.xpath(results_xpath):
-        link = tweet.xpath(link_xpath)[0]
+        try:
+            link = tweet.xpath(link_xpath)[0]
+            content = extract_text(tweet.xpath(content_xpath)[0])
+        except Exception:
+            continue
+
         url = urljoin(base_url, link.attrib.get('href'))
         title = extract_text(tweet.xpath(title_xpath))
 
-        content = extract_text(tweet.xpath(content_xpath)[0])
         pubdate = tweet.xpath(timestamp_xpath)
         if len(pubdate) > 0:
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index 43f72761e..fc840d47c 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -1,8 +1,15 @@
 import json
-from urllib import urlencode
+
+from searx import logger
 from searx.poolrequests import get
 from searx.utils import format_date_by_locale
+from datetime import datetime
+from dateutil.parser import parse as dateutil_parse
+from urllib import urlencode
+
+
+logger = logger.getChild('wikidata')
 
 result_count = 1
 wikidata_host = 'https://www.wikidata.org'
 wikidata_api = wikidata_host + '/w/api.php'
@@ -164,14 +171,12 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
     if postal_code is not None:
         attributes.append({'label': 'Postal code(s)', 'value': postal_code})
 
-    date_of_birth = get_time(claims, 'P569', None)
+    date_of_birth = get_time(claims, 'P569', locale, None)
     if date_of_birth is not None:
-        date_of_birth = format_date_by_locale(date_of_birth[8:], locale)
         attributes.append({'label': 'Date of birth', 'value': date_of_birth})
 
-    date_of_death = get_time(claims, 'P570', None)
+    date_of_death = get_time(claims, 'P570', locale, None)
     if date_of_death is not None:
-        date_of_death = format_date_by_locale(date_of_death[8:], locale)
         attributes.append({'label': 'Date of death', 'value': date_of_death})
 
     if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
@@ -229,7 +234,7 @@ def get_string(claims, propertyName, defaultValue=None):
         return result[0]
 
 
-def get_time(claims, propertyName, defaultValue=None):
+def get_time(claims, propertyName, locale, defaultValue=None):
     propValue = claims.get(propertyName, {})
     if len(propValue) == 0:
         return defaultValue
@@ -244,9 +249,22 @@
             result.append(value.get('time', ''))
 
     if len(result) == 0:
-        return defaultValue
+        date_string = defaultValue
     else:
-        return ', '.join(result)
+        date_string = ', '.join(result)
+
+    try:
+        parsed_date = datetime.strptime(date_string, "+%Y-%m-%dT%H:%M:%SZ")
+    except:
+        if date_string.startswith('-'):
+            return date_string.split('T')[0]
+        try:
+            parsed_date = dateutil_parse(date_string, fuzzy=False, default=False)
+        except:
+            logger.debug('could not parse date %s', date_string)
+            return date_string.split('T')[0]
+
+    return format_date_by_locale(parsed_date, locale)
 
 
 def get_geolink(claims, propertyName, defaultValue=''):
diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py
new file mode 100644
index 000000000..edc6ad5f2
--- /dev/null
+++ b/searx/engines/yandex.py
@@ -0,0 +1,62 @@
+"""
+ Yahoo (Web)
+
+ @website     https://yandex.ru/
+ @provide-api ?
+ @using-api   no
+ @results     HTML (using search portal)
+ @stable      no (HTML can change)
+ @parse       url, title, content
+"""
+
+from urllib import urlencode
+from lxml import html
+from searx.search import logger
+
+logger = logger.getChild('yandex engine')
+
+# engine dependent config
+categories = ['general']
+paging = True
+language_support = True  # TODO
+
+default_tld = 'com'
+language_map = {'ru': 'ru',
+                'ua': 'uk',
+                'tr': 'com.tr'}
+
+# search-url
+base_url = 'https://yandex.{tld}/'
+search_url = 'search/?{query}&p={page}'
+
+results_xpath = '//div[@class="serp-item serp-item_plain_yes clearfix i-bem"]'
+url_xpath = './/h2/a/@href'
+title_xpath = './/h2/a//text()'
+content_xpath = './/div[@class="serp-item__text"]//text()'
+
+
+def request(query, params):
+    lang = params['language'].split('_')[0]
+    host = base_url.format(tld=language_map.get(lang) or default_tld)
+    params['url'] = host + search_url.format(page=params['pageno']-1,
+                                             query=urlencode({'text': query}))
+    return params
+
+
+# get response from search-request
+def response(resp):
+    dom = html.fromstring(resp.text)
+    results = []
+
+    for result in dom.xpath(results_xpath):
+        try:
+            res = {'url': result.xpath(url_xpath)[0],
+                   'title': ''.join(result.xpath(title_xpath)),
+                   'content': ''.join(result.xpath(content_xpath))}
+        except:
+            logger.exception('yandex parse crash')
+            continue
+
+        results.append(res)
+
+    return results
diff --git a/searx/engines/youtube.py b/searx/engines/youtube.py
deleted file mode 100644
index c77cd2d0e..000000000
--- a/searx/engines/youtube.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Youtube (Videos)
-#
-# @website     https://www.youtube.com/
-# @provide-api yes (http://gdata-samples-youtube-search-py.appspot.com/)
-#
-# @using-api   yes
-# @results     JSON
-# @stable      yes
-# @parse       url, title, content, publishedDate, thumbnail, embedded
-
-from json import loads
-from urllib import urlencode
-from dateutil import parser
-
-# engine dependent config
-categories = ['videos', 'music']
-paging = True
-language_support = True
-
-# search-url
-base_url = 'https://gdata.youtube.com/feeds/api/videos'
-search_url = base_url + '?alt=json&{query}&start-index={index}&max-results=5'
-
-embedded_url = '<iframe width="540" height="304" ' +\
-    'data-src="//www.youtube-nocookie.com/embed/{videoid}" ' +\
-    'frameborder="0" allowfullscreen></iframe>'
-
-
-# do search-request
-def request(query, params):
-    index = (params['pageno'] - 1) * 5 + 1
-
-    params['url'] = search_url.format(query=urlencode({'q': query}),
-                                      index=index)
-
-    # add language tag if specified
-    if params['language'] != 'all':
-        params['url'] += '&lr=' + params['language'].split('_')[0]
-
-    return params
-
-
-# get response from search-request
-def response(resp):
-    results = []
-
-    search_results = loads(resp.text)
-
-    # return empty array if there are no results
-    if 'feed' not in search_results:
-        return []
-
-    feed = search_results['feed']
-
-    # parse results
-    for result in feed['entry']:
-        url = [x['href'] for x in result['link'] if x['type'] == 'text/html']
-
-        if not url:
-            continue
-
-        # remove tracking
-        url = url[0].replace('feature=youtube_gdata', '')
-        if url.endswith('&'):
-            url = url[:-1]
-
-        videoid = url[32:]
-
-        title = result['title']['$t']
-        content = ''
-        thumbnail = ''
-
-        pubdate = result['published']['$t']
-        publishedDate = parser.parse(pubdate)
-
-        if 'media$thumbnail' in result['media$group']:
-            thumbnail = result['media$group']['media$thumbnail'][0]['url']
-
-        content = result['content']['$t']
-
-        embedded = embedded_url.format(videoid=videoid)
-
-        # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content,
-                        'template': 'videos.html',
-                        'publishedDate': publishedDate,
-                        'embedded': embedded,
-                        'thumbnail': thumbnail})
-
-    # return results
-    return results
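
Every engine touched by this merge follows the same module contract that the new `yandex.py` illustrates: `request(query, params)` fills in `params['url']` (plus optional flags such as `params['verify']`), and `response(resp)` parses `resp.text` into a list of result dicts. The sketch below restates that contract in minimal form; the endpoint, markup, and XPath expressions are invented placeholders, not a real searx engine.

```python
# Minimal searx-style engine sketch (illustrative only).
# The URL and the XPath expressions below are placeholders, not a real service.
from urllib import urlencode   # Python 2, as in the engines above
from lxml import html

categories = ['general']
paging = True

base_url = 'https://example.com/'                 # hypothetical endpoint
search_url = base_url + 'search?{query}&p={page}'

results_xpath = '//div[@class="result"]'          # placeholder markup
url_xpath = './/a/@href'
title_xpath = './/a//text()'
content_xpath = './/p//text()'


def request(query, params):
    # Build the outgoing request; searx issues it with the engine's timeout,
    # which defaults to settings['outgoing']['request_timeout'] per the
    # __init__.py change above.
    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      page=params['pageno'] - 1)
    # params['verify'] = False  # only for hosts with a broken certificate
    #                           # chain, as done for searchcode in this merge
    return params


def response(resp):
    # Parse resp.text (not resp.content) so requests handles the decoding.
    results = []
    dom = html.fromstring(resp.text)
    for result in dom.xpath(results_xpath):
        try:
            results.append({'url': result.xpath(url_xpath)[0],
                            'title': ''.join(result.xpath(title_xpath)),
                            'content': ''.join(result.xpath(content_xpath))})
        except IndexError:
            # skip results that do not match the expected markup
            continue
    return results
```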