From 44c9216c497862293318a48ad5c39f373cee95e6 Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Sun, 25 Jan 2015 20:04:44 +0100 Subject: Sanitize extract_text --- searx/engines/xpath.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py index 72120304e..1a599dc0a 100644 --- a/searx/engines/xpath.py +++ b/searx/engines/xpath.py @@ -28,13 +28,13 @@ def extract_text(xpath_results): result = '' for e in xpath_results: result = result + extract_text(e) - return result + return result.strip() elif type(xpath_results) in [_ElementStringResult, _ElementUnicodeResult]: # it's a string return ''.join(xpath_results) else: # it's a element - return html_to_text(xpath_results.text_content()) + return html_to_text(xpath_results.text_content()).strip() def extract_url(xpath_results, search_url): -- cgit v1.2.3 From 525af2a031b787e22c3e310e61bfcd5fd1737bca Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Sun, 25 Jan 2015 20:14:37 +0100 Subject: Add bing in the test units --- searx/engines/bing.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 5de461cfe..f9c323d05 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -14,6 +14,7 @@ from urllib import urlencode from cgi import escape from lxml import html +from searx.engines.xpath import extract_text # engine dependent config categories = ['general'] @@ -55,8 +56,8 @@ def response(resp): for result in dom.xpath('//div[@class="sa_cc"]'): link = result.xpath('.//h3/a')[0] url = link.attrib.get('href') - title = ' '.join(link.xpath('.//text()')) - content = escape(' '.join(result.xpath('.//p//text()'))) + title = extract_text(link) + content = escape(extract_text(result.xpath('.//p'))) # append result results.append({'url': url, @@ -71,8 +72,8 @@ def response(resp): for result in dom.xpath('//li[@class="b_algo"]'): link = result.xpath('.//h2/a')[0] url = link.attrib.get('href') - title = ' '.join(link.xpath('.//text()')) - content = escape(' '.join(result.xpath('.//p//text()'))) + title = extract_text(link) + content = escape(extract_text(result.xpath('.//p'))) # append result results.append({'url': url, -- cgit v1.2.3 From 4dba3739fb3b98572cbd51adab226376b5844105 Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Mon, 26 Jan 2015 18:24:08 +0100 Subject: Youtube's unit test --- searx/engines/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/youtube.py b/searx/engines/youtube.py index 59f07c574..1375538a8 100644 --- a/searx/engines/youtube.py +++ b/searx/engines/youtube.py @@ -57,7 +57,7 @@ def response(resp): url = [x['href'] for x in result['link'] if x['type'] == 'text/html'] if not url: - return + continue # remove tracking url = url[0].replace('feature=youtube_gdata', '') @@ -73,7 +73,7 @@ def response(resp): pubdate = result['published']['$t'] publishedDate = parser.parse(pubdate) - if result['media$group']['media$thumbnail']: + if 'media$thumbnail' in result['media$group']: thumbnail = result['media$group']['media$thumbnail'][0]['url'] content = result['content']['$t'] -- cgit v1.2.3 From 3282e62ff92f1c2158cb169d2a21a5988766450c Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Tue, 27 Jan 2015 22:39:25 +0100 Subject: Searchcode engines corrections --- searx/engines/searchcode_code.py | 2 +- searx/engines/searchcode_doc.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py index 655818da2..f276697b1 100644 --- a/searx/engines/searchcode_code.py +++ b/searx/engines/searchcode_code.py @@ -42,7 +42,7 @@ def response(resp): search_results = loads(resp.text) # parse results - for result in search_results['results']: + for result in search_results.get('results', []): href = result['url'] title = "" + result['name'] + " - " + result['filename'] repo = result['repo'] diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py index b5b7159be..76da8d752 100644 --- a/searx/engines/searchcode_doc.py +++ b/searx/engines/searchcode_doc.py @@ -35,7 +35,7 @@ def response(resp): search_results = loads(resp.text) # parse results - for result in search_results['results']: + for result in search_results.get('results', []): href = result['url'] title = "[" + result['type'] + "] " +\ result['namespace'] +\ -- cgit v1.2.3 From 1d255061c7422045ef912a471500513832e0319f Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Thu, 29 Jan 2015 00:26:12 +0100 Subject: Digg's unit test --- searx/engines/digg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/digg.py b/searx/engines/digg.py index 8c457d6b9..1b5f2c8e4 100644 --- a/searx/engines/digg.py +++ b/searx/engines/digg.py @@ -44,7 +44,7 @@ def response(resp): search_result = loads(resp.text) - if search_result['html'] == '': + if 'html' not in search_result or search_result['html'] == '': return results dom = html.fromstring(search_result['html']) -- cgit v1.2.3 From d4957045513d6fb32dcffbc7ea87483479a8cb6e Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Thu, 29 Jan 2015 01:13:33 +0100 Subject: Deviant Art's unit test --- searx/engines/deviantart.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py index 6284cf598..4198e8c76 100644 --- a/searx/engines/deviantart.py +++ b/searx/engines/deviantart.py @@ -14,6 +14,7 @@ from urllib import urlencode from urlparse import urljoin from lxml import html import re +from searx.engines.xpath import extract_text # engine dependent config categories = ['images'] @@ -50,9 +51,9 @@ def response(resp): for result in dom.xpath('//div[contains(@class, "tt-a tt-fh")]'): link = result.xpath('.//a[contains(@class, "thumb")]')[0] url = urljoin(base_url, link.attrib.get('href')) - title_links = result.xpath('.//span[@class="details"]//a[contains(@class, "t")]') # noqa - title = ''.join(title_links[0].xpath('.//text()')) - thumbnail_src = link.xpath('.//img')[0].attrib['src'] + title_links = result.xpath('.//span[@class="details"]//a[contains(@class, "t")]') + title = extract_text(title_links[0]) + thumbnail_src = link.xpath('.//img')[0].attrib.get('src') img_src = regex.sub('/', thumbnail_src) # append result -- cgit v1.2.3 From dad0434f34f04ada2b4b0961bbb714e25c752677 Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Thu, 29 Jan 2015 20:15:52 +0100 Subject: Bing images' unit test --- searx/engines/bing_images.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py index 9ae498427..9d1c22f5a 100644 --- a/searx/engines/bing_images.py +++ b/searx/engines/bing_images.py @@ -33,7 +33,10 @@ def request(query, params): offset = (params['pageno'] - 1) * 10 + 1 # required for cookie - language = 'en-US' + if params['language'] == 'all': + language = 'en-US' + else: + language = params['language'].replace('_', '-') search_path = search_string.format( query=urlencode({'q': query}), -- cgit v1.2.3 From efde2c21c8656ad21b24980b516ddbbf2e209523 Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Thu, 29 Jan 2015 20:56:57 +0100 Subject: Bing news' unit test I have no idea why coverage tell 97% and 2 misses in branches. If anyone has an idea... --- searx/engines/bing_news.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 789a23b89..182bd36b5 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -15,6 +15,7 @@ from lxml import html from datetime import datetime, timedelta from dateutil import parser import re +from searx.engines.xpath import extract_text # engine dependent config categories = ['news'] @@ -42,6 +43,7 @@ def request(query, params): params['cookies']['_FP'] = "ui=en-US" params['url'] = base_url + search_path + return params @@ -55,44 +57,37 @@ def response(resp): for result in dom.xpath('//div[@class="sn_r"]'): link = result.xpath('.//div[@class="newstitle"]/a')[0] url = link.attrib.get('href') - title = ' '.join(link.xpath('.//text()')) - contentXPath = result.xpath('.//div[@class="sn_txt"]/div' - '//span[@class="sn_snip"]//text()') + title = extract_text(link) + contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]') if contentXPath is not None: - content = escape(' '.join(contentXPath)) + content = escape(extract_text(contentXPath)) # parse publishedDate publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div' '//span[contains(@class,"sn_ST")]' - '//span[contains(@class,"sn_tm")]' - '//text()') + '//span[contains(@class,"sn_tm")]') + if publishedDateXPath is not None: - publishedDate = escape(' '.join(publishedDateXPath)) + publishedDate = escape(extract_text(publishedDateXPath)) if re.match("^[0-9]+ minute(s|) ago$", publishedDate): timeNumbers = re.findall(r'\d+', publishedDate) - publishedDate = datetime.now()\ - - timedelta(minutes=int(timeNumbers[0])) + publishedDate = datetime.now() - timedelta(minutes=int(timeNumbers[0])) elif re.match("^[0-9]+ hour(s|) ago$", publishedDate): timeNumbers = re.findall(r'\d+', publishedDate) - publishedDate = datetime.now()\ - - timedelta(hours=int(timeNumbers[0])) - elif re.match("^[0-9]+ hour(s|)," - " [0-9]+ minute(s|) ago$", publishedDate): + publishedDate = datetime.now() - timedelta(hours=int(timeNumbers[0])) + elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate): timeNumbers = re.findall(r'\d+', publishedDate) publishedDate = datetime.now()\ - timedelta(hours=int(timeNumbers[0]))\ - timedelta(minutes=int(timeNumbers[1])) elif re.match("^[0-9]+ day(s|) ago$", publishedDate): timeNumbers = re.findall(r'\d+', publishedDate) - publishedDate = datetime.now()\ - - timedelta(days=int(timeNumbers[0])) + publishedDate = datetime.now() - timedelta(days=int(timeNumbers[0])) else: try: - # FIXME use params['language'] to parse either mm/dd or dd/mm publishedDate = parser.parse(publishedDate, dayfirst=False) except TypeError: - # FIXME publishedDate = datetime.now() # append result -- cgit v1.2.3 From 5761d6f0ab071bdae05ecef1966dd3e4cbec6eee Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Thu, 29 Jan 2015 21:19:59 +0100 Subject: Bing news engine corrections XPath *never* return None. (I found the HTML report of coverage) --- searx/engines/bing_news.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 182bd36b5..e6adb2644 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -59,16 +59,14 @@ def response(resp): url = link.attrib.get('href') title = extract_text(link) contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]') - if contentXPath is not None: - content = escape(extract_text(contentXPath)) + content = escape(extract_text(contentXPath)) # parse publishedDate publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div' '//span[contains(@class,"sn_ST")]' '//span[contains(@class,"sn_tm")]') - if publishedDateXPath is not None: - publishedDate = escape(extract_text(publishedDateXPath)) + publishedDate = escape(extract_text(publishedDateXPath)) if re.match("^[0-9]+ minute(s|) ago$", publishedDate): timeNumbers = re.findall(r'\d+', publishedDate) -- cgit v1.2.3 From a3d444ab85dbb85dc3200c686ec3323dbb7008cb Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Fri, 30 Jan 2015 19:52:44 +0100 Subject: BTDigg's unit test --- searx/engines/btdigg.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/btdigg.py b/searx/engines/btdigg.py index 973ede9ac..944250628 100644 --- a/searx/engines/btdigg.py +++ b/searx/engines/btdigg.py @@ -23,11 +23,6 @@ paging = True url = 'https://btdigg.org' search_url = url + '/search?q={search_term}&p={pageno}' -# specific xpath variables -magnet_xpath = './/a[@title="Torrent magnet link"]' -torrent_xpath = './/a[@title="Download torrent file"]' -content_xpath = './/span[@class="font11px lightgrey block"]' - # do search-request def request(query, params): @@ -52,8 +47,8 @@ def response(resp): # parse results for result in search_res: link = result.xpath('.//td[@class="torrent_name"]//a')[0] - href = urljoin(url, link.attrib['href']) - title = escape(extract_text(link.xpath('.//text()'))) + href = urljoin(url, link.attrib.get('href')) + title = escape(extract_text(link)) content = escape(extract_text(result.xpath('.//pre[@class="snippet"]')[0])) content = "
".join(content.split("\n")) @@ -81,7 +76,7 @@ def response(resp): filesize = int(filesize * 1024 * 1024 * 1024) elif filesize_multiplier == 'MB': filesize = int(filesize * 1024 * 1024) - elif filesize_multiplier == 'kb': + elif filesize_multiplier == 'KB': filesize = int(filesize * 1024) except: filesize = None -- cgit v1.2.3 From 8ea749d6ec0b711c516f3dbdb34a1bd17ae7d945 Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Fri, 30 Jan 2015 21:02:17 +0100 Subject: Kickass' unit test --- searx/engines/kickass.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/kickass.py b/searx/engines/kickass.py index ac349283d..8b89e1f47 100644 --- a/searx/engines/kickass.py +++ b/searx/engines/kickass.py @@ -13,6 +13,7 @@ from cgi import escape from urllib import quote from lxml import html from operator import itemgetter +from searx.engines.xpath import extract_text # engine dependent config categories = ['videos', 'music', 'files'] @@ -56,9 +57,8 @@ def response(resp): for result in search_res[1:]: link = result.xpath('.//a[@class="cellMainLink"]')[0] href = urljoin(url, link.attrib['href']) - title = ' '.join(link.xpath('.//text()')) - content = escape(html.tostring(result.xpath(content_xpath)[0], - method="text")) + title = extract_text(link) + content = escape(extract_text(result.xpath(content_xpath))) seed = result.xpath('.//td[contains(@class, "green")]/text()')[0] leech = result.xpath('.//td[contains(@class, "red")]/text()')[0] filesize = result.xpath('.//td[contains(@class, "nobr")]/text()')[0] @@ -88,7 +88,7 @@ def response(resp): filesize = int(filesize * 1024 * 1024 * 1024) elif filesize_multiplier == 'MB': filesize = int(filesize * 1024 * 1024) - elif filesize_multiplier == 'kb': + elif filesize_multiplier == 'KB': filesize = int(filesize * 1024) except: filesize = None -- cgit v1.2.3 From d5b8005ee10054b5260f57c1800ddebfa03c39cf Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Sat, 31 Jan 2015 16:16:30 +0100 Subject: Google images' unit test --- searx/engines/google_images.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py index cc62a4fd2..092ae6639 100644 --- a/searx/engines/google_images.py +++ b/searx/engines/google_images.py @@ -18,7 +18,7 @@ paging = True # search-url url = 'https://ajax.googleapis.com/' -search_url = url + 'ajax/services/search/images?v=1.0&start={offset}&rsz=large&safe=off&filter=off&{query}' # noqa +search_url = url + 'ajax/services/search/images?v=1.0&start={offset}&rsz=large&safe=off&filter=off&{query}' # do search-request @@ -45,14 +45,14 @@ def response(resp): for result in search_res['responseData']['results']: href = result['originalContextUrl'] title = result['title'] - if not result['url']: + if 'url' not in result: continue thumbnail_src = result['tbUrl'] # append result results.append({'url': href, 'title': title, - 'content': '', + 'content': result['content'], 'thumbnail_src': thumbnail_src, 'img_src': unquote(result['url']), 'template': 'images.html'}) -- cgit v1.2.3 From b7dc1fb9d572d53d04c0120d96c76a20a418cc94 Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Sat, 31 Jan 2015 16:38:03 +0100 Subject: Google news' unit test --- searx/engines/google_news.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index eb114f9c9..3e4371b99 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -20,7 +20,7 @@ language_support = True # engine dependent config url = 'https://ajax.googleapis.com/' -search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}' # noqa +search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={lang}' # do search-request @@ -33,7 +33,7 @@ def request(query, params): params['url'] = search_url.format(offset=offset, query=urlencode({'q': query}), - language=language) + lang=language) return params @@ -52,6 +52,8 @@ def response(resp): for result in search_res['responseData']['results']: # parse publishedDate publishedDate = parser.parse(result['publishedDate']) + if 'url' not in result: + continue # append result results.append({'url': result['unescapedUrl'], -- cgit v1.2.3 From d20ddf9da147647710127385a3ee95ff273d4fea Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Sat, 31 Jan 2015 17:29:22 +0100 Subject: Stackoverflow's unit test --- searx/engines/stackoverflow.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py index dcbb1890c..78dba9f68 100644 --- a/searx/engines/stackoverflow.py +++ b/searx/engines/stackoverflow.py @@ -12,6 +12,7 @@ from urlparse import urljoin from cgi import escape from urllib import urlencode from lxml import html +from searx.engines.xpath import extract_text # engine dependent config categories = ['it'] @@ -24,8 +25,7 @@ search_url = url+'search?{query}&page={pageno}' # specific xpath variables results_xpath = '//div[contains(@class,"question-summary")]' link_xpath = './/div[@class="result-link"]//a|.//div[@class="summary"]//h3//a' -title_xpath = './/text()' -content_xpath = './/div[@class="excerpt"]//text()' +content_xpath = './/div[@class="excerpt"]' # do search-request @@ -46,8 +46,8 @@ def response(resp): for result in dom.xpath(results_xpath): link = result.xpath(link_xpath)[0] href = urljoin(url, link.attrib.get('href')) - title = escape(' '.join(link.xpath(title_xpath))) - content = escape(' '.join(result.xpath(content_xpath))) + title = escape(extract_text(link)) + content = escape(extract_text(result.xpath(content_xpath))) # append result results.append({'url': href, -- cgit v1.2.3 From 04fa31b7f4d45182fa4ced6d6e23fd9ec4960d2e Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Sat, 31 Jan 2015 19:49:54 +0100 Subject: Vimeo's unit test --- searx/engines/vimeo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py index 39033c591..7577d12e1 100644 --- a/searx/engines/vimeo.py +++ b/searx/engines/vimeo.py @@ -59,8 +59,7 @@ def response(resp): url = base_url + videoid title = p.unescape(extract_text(result.xpath(title_xpath))) thumbnail = extract_text(result.xpath(content_xpath)[0]) - publishedDate = parser.parse(extract_text( - result.xpath(publishedDate_xpath)[0])) + publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0])) embedded = embedded_url.format(videoid=videoid) # append result -- cgit v1.2.3 From f18807955beceb86a99963feedee8355f31c481c Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Sat, 31 Jan 2015 22:05:13 +0100 Subject: [mod] python importable engine names --- searx/engines/500px.py | 63 ------------------------- searx/engines/flickr-noapi.py | 104 ------------------------------------------ searx/engines/flickr_noapi.py | 104 ++++++++++++++++++++++++++++++++++++++++++ searx/engines/www500px.py | 63 +++++++++++++++++++++++++ 4 files changed, 167 insertions(+), 167 deletions(-) delete mode 100644 searx/engines/500px.py delete mode 100644 searx/engines/flickr-noapi.py create mode 100644 searx/engines/flickr_noapi.py create mode 100644 searx/engines/www500px.py (limited to 'searx/engines') diff --git a/searx/engines/500px.py b/searx/engines/500px.py deleted file mode 100644 index f25678c24..000000000 --- a/searx/engines/500px.py +++ /dev/null @@ -1,63 +0,0 @@ -## 500px (Images) -# -# @website https://500px.com -# @provide-api yes (https://developers.500px.com/) -# -# @using-api no -# @results HTML -# @stable no (HTML can change) -# @parse url, title, thumbnail, img_src, content -# -# @todo rewrite to api - - -from urllib import urlencode -from urlparse import urljoin -from lxml import html -import re - -# engine dependent config -categories = ['images'] -paging = True - -# search-url -base_url = 'https://500px.com' -search_url = base_url+'/search?search?page={pageno}&type=photos&{query}' - - -# do search-request -def request(query, params): - params['url'] = search_url.format(pageno=params['pageno'], - query=urlencode({'q': query})) - - return params - - -# get response from search-request -def response(resp): - results = [] - - dom = html.fromstring(resp.text) - regex = re.compile('3\.jpg.*$') - - # parse results - for result in dom.xpath('//div[@class="photo"]'): - link = result.xpath('.//a')[0] - url = urljoin(base_url, link.attrib.get('href')) - title = result.xpath('.//div[@class="title"]//text()')[0] - thumbnail_src = link.xpath('.//img')[0].attrib['src'] - # To have a bigger thumbnail, uncomment the next line - #thumbnail_src = regex.sub('4.jpg', thumbnail_src) - content = result.xpath('.//div[@class="info"]//text()')[0] - img_src = regex.sub('2048.jpg', thumbnail_src) - - # append result - results.append({'url': url, - 'title': title, - 'img_src': img_src, - 'content': content, - 'thumbnail_src': thumbnail_src, - 'template': 'images.html'}) - - # return results - return results diff --git a/searx/engines/flickr-noapi.py b/searx/engines/flickr-noapi.py deleted file mode 100644 index 66c6f4027..000000000 --- a/searx/engines/flickr-noapi.py +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env python - -# Flickr (Images) -# -# @website https://www.flickr.com -# @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html) -# -# @using-api no -# @results HTML -# @stable no -# @parse url, title, thumbnail, img_src - -from urllib import urlencode -from json import loads -import re - -categories = ['images'] - -url = 'https://secure.flickr.com/' -search_url = url+'search/?{query}&page={page}' -photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}' -regex = re.compile(r"\"search-photos-models\",\"photos\":(.*}),\"totalItems\":", re.DOTALL) -image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's') - -paging = True - - -def build_flickr_url(user_id, photo_id): - return photo_url.format(userid=user_id, photoid=photo_id) - - -def request(query, params): - params['url'] = search_url.format(query=urlencode({'text': query}), - page=params['pageno']) - return params - - -def response(resp): - results = [] - - matches = regex.search(resp.text) - - if matches is None: - return results - - match = matches.group(1) - search_results = loads(match) - - if '_data' not in search_results: - return [] - - photos = search_results['_data'] - - for photo in photos: - - # In paged configuration, the first pages' photos - # are represented by a None object - if photo is None: - continue - - img_src = None - # From the biggest to the lowest format - for image_size in image_sizes: - if image_size in photo['sizes']: - img_src = photo['sizes'][image_size]['displayUrl'] - break - - if not img_src: - continue - - if 'id' not in photo['owner']: - continue - -# For a bigger thumbnail, keep only the url_z, not the url_n - if 'n' in photo['sizes']: - thumbnail_src = photo['sizes']['n']['displayUrl'] - elif 'z' in photo['sizes']: - thumbnail_src = photo['sizes']['z']['displayUrl'] - else: - thumbnail_src = img_src - - url = build_flickr_url(photo['owner']['id'], photo['id']) - - title = photo.get('title', '') - - content = '' +\ - photo['owner']['username'] +\ - '
' - - if 'description' in photo: - content = content +\ - '' +\ - photo['description'] +\ - '' - - # append result - results.append({'url': url, - 'title': title, - 'img_src': img_src, - 'thumbnail_src': thumbnail_src, - 'content': content, - 'template': 'images.html'}) - - return results diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py new file mode 100644 index 000000000..66c6f4027 --- /dev/null +++ b/searx/engines/flickr_noapi.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python + +# Flickr (Images) +# +# @website https://www.flickr.com +# @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html) +# +# @using-api no +# @results HTML +# @stable no +# @parse url, title, thumbnail, img_src + +from urllib import urlencode +from json import loads +import re + +categories = ['images'] + +url = 'https://secure.flickr.com/' +search_url = url+'search/?{query}&page={page}' +photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}' +regex = re.compile(r"\"search-photos-models\",\"photos\":(.*}),\"totalItems\":", re.DOTALL) +image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's') + +paging = True + + +def build_flickr_url(user_id, photo_id): + return photo_url.format(userid=user_id, photoid=photo_id) + + +def request(query, params): + params['url'] = search_url.format(query=urlencode({'text': query}), + page=params['pageno']) + return params + + +def response(resp): + results = [] + + matches = regex.search(resp.text) + + if matches is None: + return results + + match = matches.group(1) + search_results = loads(match) + + if '_data' not in search_results: + return [] + + photos = search_results['_data'] + + for photo in photos: + + # In paged configuration, the first pages' photos + # are represented by a None object + if photo is None: + continue + + img_src = None + # From the biggest to the lowest format + for image_size in image_sizes: + if image_size in photo['sizes']: + img_src = photo['sizes'][image_size]['displayUrl'] + break + + if not img_src: + continue + + if 'id' not in photo['owner']: + continue + +# For a bigger thumbnail, keep only the url_z, not the url_n + if 'n' in photo['sizes']: + thumbnail_src = photo['sizes']['n']['displayUrl'] + elif 'z' in photo['sizes']: + thumbnail_src = photo['sizes']['z']['displayUrl'] + else: + thumbnail_src = img_src + + url = build_flickr_url(photo['owner']['id'], photo['id']) + + title = photo.get('title', '') + + content = '' +\ + photo['owner']['username'] +\ + '
' + + if 'description' in photo: + content = content +\ + '' +\ + photo['description'] +\ + '' + + # append result + results.append({'url': url, + 'title': title, + 'img_src': img_src, + 'thumbnail_src': thumbnail_src, + 'content': content, + 'template': 'images.html'}) + + return results diff --git a/searx/engines/www500px.py b/searx/engines/www500px.py new file mode 100644 index 000000000..f25678c24 --- /dev/null +++ b/searx/engines/www500px.py @@ -0,0 +1,63 @@ +## 500px (Images) +# +# @website https://500px.com +# @provide-api yes (https://developers.500px.com/) +# +# @using-api no +# @results HTML +# @stable no (HTML can change) +# @parse url, title, thumbnail, img_src, content +# +# @todo rewrite to api + + +from urllib import urlencode +from urlparse import urljoin +from lxml import html +import re + +# engine dependent config +categories = ['images'] +paging = True + +# search-url +base_url = 'https://500px.com' +search_url = base_url+'/search?search?page={pageno}&type=photos&{query}' + + +# do search-request +def request(query, params): + params['url'] = search_url.format(pageno=params['pageno'], + query=urlencode({'q': query})) + + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + regex = re.compile('3\.jpg.*$') + + # parse results + for result in dom.xpath('//div[@class="photo"]'): + link = result.xpath('.//a')[0] + url = urljoin(base_url, link.attrib.get('href')) + title = result.xpath('.//div[@class="title"]//text()')[0] + thumbnail_src = link.xpath('.//img')[0].attrib['src'] + # To have a bigger thumbnail, uncomment the next line + #thumbnail_src = regex.sub('4.jpg', thumbnail_src) + content = result.xpath('.//div[@class="info"]//text()')[0] + img_src = regex.sub('2048.jpg', thumbnail_src) + + # append result + results.append({'url': url, + 'title': title, + 'img_src': img_src, + 'content': content, + 'thumbnail_src': thumbnail_src, + 'template': 'images.html'}) + + # return results + return results -- cgit v1.2.3 From 8cf2ee57216b4dffc419e1762ff1fe4dfd30e227 Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Sun, 1 Feb 2015 13:43:10 +0100 Subject: 500px unit test --- searx/engines/www500px.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/www500px.py b/searx/engines/www500px.py index f25678c24..99dba4abf 100644 --- a/searx/engines/www500px.py +++ b/searx/engines/www500px.py @@ -15,6 +15,7 @@ from urllib import urlencode from urlparse import urljoin from lxml import html import re +from searx.engines.xpath import extract_text # engine dependent config categories = ['images'] @@ -22,7 +23,7 @@ paging = True # search-url base_url = 'https://500px.com' -search_url = base_url+'/search?search?page={pageno}&type=photos&{query}' +search_url = base_url + '/search?search?page={pageno}&type=photos&{query}' # do search-request @@ -44,11 +45,11 @@ def response(resp): for result in dom.xpath('//div[@class="photo"]'): link = result.xpath('.//a')[0] url = urljoin(base_url, link.attrib.get('href')) - title = result.xpath('.//div[@class="title"]//text()')[0] - thumbnail_src = link.xpath('.//img')[0].attrib['src'] + title = extract_text(result.xpath('.//div[@class="title"]')) + thumbnail_src = link.xpath('.//img')[0].attrib.get('src') # To have a bigger thumbnail, uncomment the next line - #thumbnail_src = regex.sub('4.jpg', thumbnail_src) - content = result.xpath('.//div[@class="info"]//text()')[0] + # thumbnail_src = regex.sub('4.jpg', thumbnail_src) + content = extract_text(result.xpath('.//div[@class="info"]')) img_src = regex.sub('2048.jpg', thumbnail_src) # append result -- cgit v1.2.3 From c6535dd65ebf110d00d633db1170f35cf60b8df0 Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Sun, 1 Feb 2015 14:31:04 +0100 Subject: Flickr Noapi unit test --- searx/engines/flickr_noapi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searx/engines') diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py index 66c6f4027..73dff44c4 100644 --- a/searx/engines/flickr_noapi.py +++ b/searx/engines/flickr_noapi.py @@ -17,7 +17,7 @@ import re categories = ['images'] url = 'https://secure.flickr.com/' -search_url = url+'search/?{query}&page={page}' +search_url = url + 'search/?{query}&page={page}' photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}' regex = re.compile(r"\"search-photos-models\",\"photos\":(.*}),\"totalItems\":", re.DOTALL) image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's') -- cgit v1.2.3 From 5a16077455ef9e821a2b5f5f7e975be8a37ce83d Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Sun, 1 Feb 2015 15:23:26 +0100 Subject: PirateBay unit test + reactivation in Settings --- searx/engines/piratebay.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'searx/engines') diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py index f6144faa2..207df276c 100644 --- a/searx/engines/piratebay.py +++ b/searx/engines/piratebay.py @@ -13,6 +13,7 @@ from cgi import escape from urllib import quote from lxml import html from operator import itemgetter +from searx.engines.xpath import extract_text # engine dependent config categories = ['videos', 'music', 'files'] @@ -29,7 +30,8 @@ search_types = {'files': '0', # specific xpath variables magnet_xpath = './/a[@title="Download this torrent using magnet"]' -content_xpath = './/font[@class="detDesc"]//text()' +torrent_xpath = './/a[@title="Download this torrent"]' +content_xpath = './/font[@class="detDesc"]' # do search-request @@ -59,8 +61,8 @@ def response(resp): for result in search_res[1:]: link = result.xpath('.//div[@class="detName"]//a')[0] href = urljoin(url, link.attrib.get('href')) - title = ' '.join(link.xpath('.//text()')) - content = escape(' '.join(result.xpath(content_xpath))) + title = extract_text(link) + content = escape(extract_text(result.xpath(content_xpath))) seed, leech = result.xpath('.//td[@align="right"]/text()')[:2] # convert seed to int if possible @@ -76,6 +78,7 @@ def response(resp): leech = 0 magnetlink = result.xpath(magnet_xpath)[0] + torrentfile = result.xpath(torrent_xpath)[0] # append result results.append({'url': href, @@ -83,7 +86,8 @@ def response(resp): 'content': content, 'seed': seed, 'leech': leech, - 'magnetlink': magnetlink.attrib['href'], + 'magnetlink': magnetlink.attrib.get('href'), + 'torrentfile': torrentfile.attrib.get('href'), 'template': 'torrent.html'}) # return results sorted by seeder -- cgit v1.2.3