From 1fcf066a8188b28eb644ea304a131d40b1b341eb Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Sun, 2 Aug 2015 19:38:27 +0200 Subject: [mod] change settings file structure according to #314 --- searx/engines/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 42e1f08bc..447138d3b 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -75,7 +75,7 @@ def load_engine(engine_data): engine.safesearch = False if not hasattr(engine, 'timeout'): - engine.timeout = settings['server']['request_timeout'] + engine.timeout = settings['outgoing']['request_timeout'] if not hasattr(engine, 'shortcut'): engine.shortcut = '' -- cgit v1.2.3 From 23b9095cbf2d31a1495ee3d63a55bd81548cd367 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Mon, 24 Aug 2015 11:28:55 +0200 Subject: [fix] improve result handling of startpage engine --- searx/engines/startpage.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 9d5b4befe..08e4f7a5b 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -66,7 +66,11 @@ def response(resp): url = link.attrib.get('href') # block google-ad url's - if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url): + if re.match("^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url): + continue + + # block startpage search url's + if re.match("^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url): continue title = escape(extract_text(link)) -- cgit v1.2.3 From 996c96fffff328497c2ba305c61e064256c84188 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Mon, 24 Aug 2015 11:31:30 +0200 Subject: [fix] block ixquick search url's --- searx/engines/startpage.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'searx/engines') diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 08e4f7a5b..7d58f7f01 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -73,6 +73,10 @@ def response(resp): if re.match("^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url): continue + # block ixquick search url's + if re.match("^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url): + continue + title = escape(extract_text(link)) if result.xpath('./p[@class="desc"]'): -- cgit v1.2.3 From 28493d41a327128762c6286a625d219e4b0b4e2e Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Tue, 25 Aug 2015 13:12:51 +0200 Subject: [fix] handle missing url in twitter results --- searx/engines/twitter.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py index a0ee18a47..36efac186 100644 --- a/searx/engines/twitter.py +++ b/searx/engines/twitter.py @@ -55,10 +55,14 @@ def response(resp): # parse results for tweet in dom.xpath(results_xpath): - link = tweet.xpath(link_xpath)[0] + try: + link = tweet.xpath(link_xpath)[0] + content = extract_text(tweet.xpath(content_xpath)[0]) + except Exception: + continue + url = urljoin(base_url, link.attrib.get('href')) title = extract_text(tweet.xpath(title_xpath)) - content = extract_text(tweet.xpath(content_xpath)[0]) pubdate = tweet.xpath(timestamp_xpath) if len(pubdate) > 0: -- cgit v1.2.3 From 3f31e1ce6bcaea595a6e773c4ff729cf7a9e31e1 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Tue, 25 Aug 2015 22:56:40 +0200 Subject: [fix] piratebay tld according to wikipedia --- searx/engines/piratebay.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py index ab0dfd44c..55446b410 100644 --- a/searx/engines/piratebay.py +++ b/searx/engines/piratebay.py @@ -20,7 +20,7 @@ categories = ['videos', 'music', 'files'] paging = True # search-url -url = 'https://thepiratebay.am/' +url = 'https://thepiratebay.se/' search_url = url + 'search/{search_term}/{pageno}/99/{search_type}' # piratebay specific type-definitions -- cgit v1.2.3 From b9c8039d743376ab134adb3da146519c5353c36c Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Tue, 25 Aug 2015 22:56:45 +0200 Subject: [mod] disable searchcode SSL verification (unable to get local issuer) --- searx/engines/searchcode_code.py | 5 +++++ searx/engines/searchcode_doc.py | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'searx/engines') diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py index 21d9c4ac2..bd5eb71d2 100644 --- a/searx/engines/searchcode_code.py +++ b/searx/engines/searchcode_code.py @@ -34,6 +34,11 @@ def request(query, params): params['url'] = search_url.format(query=urlencode({'q': query}), pageno=params['pageno']-1) + # Disable SSL verification + # error: (60) SSL certificate problem: unable to get local issuer + # certificate + params['verify'] = False + return params diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py index 582b98d79..9453f31a4 100644 --- a/searx/engines/searchcode_doc.py +++ b/searx/engines/searchcode_doc.py @@ -27,6 +27,11 @@ def request(query, params): params['url'] = search_url.format(query=urlencode({'q': query}), pageno=params['pageno']-1) + # Disable SSL verification + # error: (60) SSL certificate problem: unable to get local issuer + # certificate + params['verify'] = False + return params -- cgit v1.2.3 From 604f32f67276a34a3ead265ff89d3bb807902b26 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Fri, 28 Aug 2015 14:51:32 +0200 Subject: [fix] bing unicode encode error - fixes #408 --- searx/engines/bing.py | 2 +- searx/engines/bing_images.py | 2 +- searx/engines/bing_news.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/bing.py b/searx/engines/bing.py index c72e6aeff..171606cf6 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -52,7 +52,7 @@ def request(query, params): def response(resp): results = [] - dom = html.fromstring(resp.content) + dom = html.fromstring(resp.text) # parse results for result in dom.xpath('//div[@class="sa_cc"]'): diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py index 839b8e5be..06850dfe1 100644 --- a/searx/engines/bing_images.py +++ b/searx/engines/bing_images.py @@ -63,7 +63,7 @@ def request(query, params): def response(resp): results = [] - dom = html.fromstring(resp.content) + dom = html.fromstring(resp.text) # init regex for yaml-parsing p = re.compile('({|,)([a-z]+):(")') diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index a2397c48e..943bf882e 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -68,7 +68,7 @@ def request(query, params): def response(resp): results = [] - rss = etree.fromstring(resp.content) + rss = etree.fromstring(resp.text) ns = rss.nsmap -- cgit v1.2.3 From 78a69e4c982d08a0fb49f1347d7f9db3b15d464f Mon Sep 17 00:00:00 2001 From: Emmanuel Benazera Date: Tue, 1 Sep 2015 16:47:56 +0200 Subject: ddg encoding of URLs appears to be broken, revealed when trying to pickled the results to disk --- searx/engines/duckduckgo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 4ac2099ae..f18f3b446 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -72,7 +72,7 @@ def response(resp): # append result results.append({'title': title, 'content': content, - 'url': res_url}) + 'url': res_url.encode('utf8')}) # return results return results -- cgit v1.2.3 From 362c849797e2e6f0e232642c23744c47a75cdfd4 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Mon, 7 Sep 2015 22:39:33 +0200 Subject: [fix][mod] wikidata date handling refactor - fixes #387 --- searx/engines/wikidata.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index 43f72761e..fc840d47c 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -1,8 +1,15 @@ import json -from urllib import urlencode + +from searx import logger from searx.poolrequests import get from searx.utils import format_date_by_locale +from datetime import datetime +from dateutil.parser import parse as dateutil_parse +from urllib import urlencode + + +logger = logger.getChild('wikidata') result_count = 1 wikidata_host = 'https://www.wikidata.org' wikidata_api = wikidata_host + '/w/api.php' @@ -164,14 +171,12 @@ def getDetail(jsonresponse, wikidata_id, language, locale): if postal_code is not None: attributes.append({'label': 'Postal code(s)', 'value': postal_code}) - date_of_birth = get_time(claims, 'P569', None) + date_of_birth = get_time(claims, 'P569', locale, None) if date_of_birth is not None: - date_of_birth = format_date_by_locale(date_of_birth[8:], locale) attributes.append({'label': 'Date of birth', 'value': date_of_birth}) - date_of_death = get_time(claims, 'P570', None) + date_of_death = get_time(claims, 'P570', locale, None) if date_of_death is not None: - date_of_death = format_date_by_locale(date_of_death[8:], locale) attributes.append({'label': 'Date of death', 'value': date_of_death}) if len(attributes) == 0 and len(urls) == 2 and len(description) == 0: @@ -229,7 +234,7 @@ def get_string(claims, propertyName, defaultValue=None): return result[0] -def get_time(claims, propertyName, defaultValue=None): +def get_time(claims, propertyName, locale, defaultValue=None): propValue = claims.get(propertyName, {}) if len(propValue) == 0: return defaultValue @@ -244,9 +249,22 @@ def get_time(claims, propertyName, defaultValue=None): result.append(value.get('time', '')) if len(result) == 0: - return defaultValue + date_string = defaultValue else: - return ', '.join(result) + date_string = ', '.join(result) + + try: + parsed_date = datetime.strptime(date_string, "+%Y-%m-%dT%H:%M:%SZ") + except: + if date_string.startswith('-'): + return date_string.split('T')[0] + try: + parsed_date = dateutil_parse(date_string, fuzzy=False, default=False) + except: + logger.debug('could not parse date %s', date_string) + return date_string.split('T')[0] + + return format_date_by_locale(parsed_date, locale) def get_geolink(claims, propertyName, defaultValue=''): -- cgit v1.2.3 From 4184cece4a0b2d04b45105e755492bfee5fa1a12 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Mon, 7 Sep 2015 23:13:04 +0200 Subject: [fix] duckduckgo unicode url - #419 --- searx/engines/duckduckgo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index f18f3b446..4ac2099ae 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -72,7 +72,7 @@ def response(resp): # append result results.append({'title': title, 'content': content, - 'url': res_url.encode('utf8')}) + 'url': res_url}) # return results return results -- cgit v1.2.3 From 6bcbd633a591f06d4b60969cec699519f299a4de Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Tue, 8 Sep 2015 22:10:41 +0200 Subject: [fix] remove obsolete youtube engine --- searx/engines/youtube.py | 93 ------------------------------------------------ 1 file changed, 93 deletions(-) delete mode 100644 searx/engines/youtube.py (limited to 'searx/engines') diff --git a/searx/engines/youtube.py b/searx/engines/youtube.py deleted file mode 100644 index c77cd2d0e..000000000 --- a/searx/engines/youtube.py +++ /dev/null @@ -1,93 +0,0 @@ -# Youtube (Videos) -# -# @website https://www.youtube.com/ -# @provide-api yes (http://gdata-samples-youtube-search-py.appspot.com/) -# -# @using-api yes -# @results JSON -# @stable yes -# @parse url, title, content, publishedDate, thumbnail, embedded - -from json import loads -from urllib import urlencode -from dateutil import parser - -# engine dependent config -categories = ['videos', 'music'] -paging = True -language_support = True - -# search-url -base_url = 'https://gdata.youtube.com/feeds/api/videos' -search_url = base_url + '?alt=json&{query}&start-index={index}&max-results=5' - -embedded_url = '' - - -# do search-request -def request(query, params): - index = (params['pageno'] - 1) * 5 + 1 - - params['url'] = search_url.format(query=urlencode({'q': query}), - index=index) - - # add language tag if specified - if params['language'] != 'all': - params['url'] += '&lr=' + params['language'].split('_')[0] - - return params - - -# get response from search-request -def response(resp): - results = [] - - search_results = loads(resp.text) - - # return empty array if there are no results - if 'feed' not in search_results: - return [] - - feed = search_results['feed'] - - # parse results - for result in feed['entry']: - url = [x['href'] for x in result['link'] if x['type'] == 'text/html'] - - if not url: - continue - - # remove tracking - url = url[0].replace('feature=youtube_gdata', '') - if url.endswith('&'): - url = url[:-1] - - videoid = url[32:] - - title = result['title']['$t'] - content = '' - thumbnail = '' - - pubdate = result['published']['$t'] - publishedDate = parser.parse(pubdate) - - if 'media$thumbnail' in result['media$group']: - thumbnail = result['media$group']['media$thumbnail'][0]['url'] - - content = result['content']['$t'] - - embedded = embedded_url.format(videoid=videoid) - - # append result - results.append({'url': url, - 'title': title, - 'content': content, - 'template': 'videos.html', - 'publishedDate': publishedDate, - 'embedded': embedded, - 'thumbnail': thumbnail}) - - # return results - return results -- cgit v1.2.3 From e3df22b1401742ae0ade324ce4403f2b2b45dfe1 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Fri, 11 Sep 2015 17:57:09 +0200 Subject: [fix] handle missing url scheme - fixes #428 --- searx/engines/gigablast.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'searx/engines') diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index b852de9ba..04513cc98 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -53,6 +53,8 @@ def response(resp): # parse results for result in dom.xpath(results_xpath): url = result.xpath(url_xpath)[0].text + if not url.startswith('http://') and not url.startswith('https://'): + url = 'http://' + url title = result.xpath(title_xpath)[0].text content = escape(result.xpath(content_xpath)[0].text) -- cgit v1.2.3 From 37c3ace3096d9568f8dbdc3728659f4c77377b33 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Fri, 11 Sep 2015 18:33:06 +0200 Subject: [fix] add missing scheme to duplicated results too ++ revert gigablasts handling --- searx/engines/gigablast.py | 2 -- 1 file changed, 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index 04513cc98..b852de9ba 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -53,8 +53,6 @@ def response(resp): # parse results for result in dom.xpath(results_xpath): url = result.xpath(url_xpath)[0].text - if not url.startswith('http://') and not url.startswith('https://'): - url = 'http://' + url title = result.xpath(title_xpath)[0].text content = escape(result.xpath(content_xpath)[0].text) -- cgit v1.2.3 From 0ad272c5cb81a9c69008aa86a1f29cd642ddf4ff Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Wed, 30 Sep 2015 16:42:03 +0200 Subject: [fix] content escaping - closes #441 TODO check other engines too --- searx/engines/google.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/google.py b/searx/engines/google.py index 0e78a9e2c..c8299d04b 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -9,6 +9,7 @@ # @parse url, title, content, suggestion import re +from cgi import escape from urllib import urlencode from urlparse import urlparse, parse_qsl from lxml import html @@ -167,7 +168,7 @@ def parse_url(url_string, google_hostname): def extract_text_from_dom(result, xpath): r = result.xpath(xpath) if len(r) > 0: - return extract_text(r[0]) + return escape(extract_text(r[0])) return None @@ -273,7 +274,7 @@ def response(resp): # parse suggestion for suggestion in dom.xpath(suggestion_xpath): # append suggestion - results.append({'suggestion': extract_text(suggestion)}) + results.append({'suggestion': escape(extract_text(suggestion))}) # return results return results -- cgit v1.2.3 From 7aa9f4885a3ecb6d65af6375188c4a7ea8f7714d Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Fri, 16 Oct 2015 11:53:52 +0200 Subject: [fix] unicode decoding --- searx/engines/bing_news.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 943bf882e..a2397c48e 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -68,7 +68,7 @@ def request(query, params): def response(resp): results = [] - rss = etree.fromstring(resp.text) + rss = etree.fromstring(resp.content) ns = rss.nsmap -- cgit v1.2.3 From f1ac794a0737c76516cf6c324027f2dd718a67a1 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Fri, 16 Oct 2015 12:05:50 +0200 Subject: [fix] gigablast url params --- searx/engines/gigablast.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index b852de9ba..cfc8e7159 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -13,6 +13,8 @@ from urllib import urlencode from cgi import escape from lxml import etree +from random import randint +from time import time # engine dependent config categories = ['general'] @@ -21,7 +23,7 @@ number_of_results = 5 # search-url, invalid HTTPS certificate base_url = 'http://gigablast.com/' -search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0' +search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0&uxid={uxid}&rand={rand}' # specific xpath variables results_xpath = '//response//result' @@ -37,7 +39,9 @@ def request(query, params): search_path = search_string.format( query=urlencode({'q': query}), offset=offset, - number_of_results=number_of_results) + number_of_results=number_of_results, + uxid=randint(10000, 10000000), + rand=int(time())) params['url'] = base_url + search_path -- cgit v1.2.3 From 4508c966677708a2926afb1d05f134f252d8f93a Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Sat, 24 Oct 2015 16:15:30 +0200 Subject: [enh] fix content fetching, parse published date from description --- searx/engines/startpage.py | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 7d58f7f01..a91cafa00 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -12,6 +12,8 @@ from lxml import html from cgi import escape +from dateutil import parser +from datetime import datetime, timedelta import re from searx.engines.xpath import extract_text @@ -79,15 +81,44 @@ def response(resp): title = escape(extract_text(link)) - if result.xpath('./p[@class="desc"]'): - content = escape(extract_text(result.xpath('./p[@class="desc"]'))) + if result.xpath('./p[@class="desc clk"]'): + content = escape(extract_text(result.xpath('./p[@class="desc clk"]'))) else: content = '' - # append result - results.append({'url': url, - 'title': title, - 'content': content}) + published_date = None + + # check if search result starts with something like: "2 Sep 2014 ... " + if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content): + date_pos = content.find('...')+4 + date_string = content[0:date_pos-5] + published_date = parser.parse(date_string, dayfirst=True) + + # fix content string + content = content[date_pos:] + + # check if search result starts with something like: "5 days ago ... " + elif re.match("^[0-9]+ days? ago \.\.\. ", content): + date_pos = content.find('...')+4 + date_string = content[0:date_pos-5] + + # calculate datetime + published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group())) + + # fix content string + content = content[date_pos:] + + if published_date: + # append result + results.append({'url': url, + 'title': title, + 'content': content, + 'publishedDate': published_date}) + else: + # append result + results.append({'url': url, + 'title': title, + 'content': content}) # return results return results -- cgit v1.2.3 From 5d49c15f791c3b9297bb890b28643e6c50406f35 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Thu, 29 Oct 2015 12:47:12 +0100 Subject: [fix] google engine - ignore new useless result type --- searx/engines/google.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/google.py b/searx/engines/google.py index c8299d04b..67e6ebb87 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -12,9 +12,12 @@ import re from cgi import escape from urllib import urlencode from urlparse import urlparse, parse_qsl -from lxml import html +from lxml import html, etree from searx.poolrequests import get from searx.engines.xpath import extract_text, extract_url +from searx.search import logger + +logger = logger.getChild('google engine') # engine dependent config @@ -225,8 +228,8 @@ def response(resp): # parse results for result in dom.xpath(results_xpath): - title = extract_text(result.xpath(title_xpath)[0]) try: + title = extract_text(result.xpath(title_xpath)[0]) url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname) parsed_url = urlparse(url, google_hostname) @@ -269,6 +272,7 @@ def response(resp): 'content': content }) except: + logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True)) continue # parse suggestion -- cgit v1.2.3 From fafc564874c63e2c8fb5082c944779c1572e2f15 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Sat, 31 Oct 2015 15:27:23 +0100 Subject: [enh] yandex engine added --- searx/engines/yandex.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 searx/engines/yandex.py (limited to 'searx/engines') diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py new file mode 100644 index 000000000..a1d2ab792 --- /dev/null +++ b/searx/engines/yandex.py @@ -0,0 +1,55 @@ +""" + Yahoo (Web) + + @website https://yandex.ru/ + @provide-api ? + @using-api no + @results HTML (using search portal) + @stable no (HTML can change) + @parse url, title, content +""" + +from urllib import urlencode +from lxml import html +from searx.search import logger + +logger = logger.getChild('yandex engine') + +# engine dependent config +categories = ['general'] +paging = True +language_support = True # TODO + +# search-url +base_url = 'https://yandex.ru/' +search_url = 'search/?{query}&p={page}' + +results_xpath = '//div[@class="serp-item serp-item_plain_yes clearfix i-bem"]' +url_xpath = './/h2/a/@href' +title_xpath = './/h2/a//text()' +content_xpath = './/div[@class="serp-item__text"]//text()' + + +def request(query, params): + params['url'] = base_url + search_url.format(page=params['pageno']-1, + query=urlencode({'text': query})) + return params + + +# get response from search-request +def response(resp): + dom = html.fromstring(resp.text) + results = [] + + for result in dom.xpath(results_xpath): + try: + res = {'url': result.xpath(url_xpath)[0], + 'title': ''.join(result.xpath(title_xpath)), + 'content': ''.join(result.xpath(content_xpath))} + except: + logger.exception('yandex parse crash') + continue + + results.append(res) + + return results -- cgit v1.2.3 From e98aef6fc4954681e58d774203d522f0ae478004 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Sat, 31 Oct 2015 23:05:07 +0100 Subject: [fix] yandex engine language support according to #430 --- searx/engines/yandex.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py index a1d2ab792..edc6ad5f2 100644 --- a/searx/engines/yandex.py +++ b/searx/engines/yandex.py @@ -20,8 +20,13 @@ categories = ['general'] paging = True language_support = True # TODO +default_tld = 'com' +language_map = {'ru': 'ru', + 'ua': 'uk', + 'tr': 'com.tr'} + # search-url -base_url = 'https://yandex.ru/' +base_url = 'https://yandex.{tld}/' search_url = 'search/?{query}&p={page}' results_xpath = '//div[@class="serp-item serp-item_plain_yes clearfix i-bem"]' @@ -31,8 +36,10 @@ content_xpath = './/div[@class="serp-item__text"]//text()' def request(query, params): - params['url'] = base_url + search_url.format(page=params['pageno']-1, - query=urlencode({'text': query})) + lang = params['language'].split('_')[0] + host = base_url.format(tld=language_map.get(lang) or default_tld) + params['url'] = host + search_url.format(page=params['pageno']-1, + query=urlencode({'text': query})) return params -- cgit v1.2.3