diff options
| author | Adam Tauber <asciimoo@gmail.com> | 2015-01-31 22:05:13 +0100 |
|---|---|---|
| committer | Cqoicebordel <Cqoicebordel@users.noreply.github.com> | 2015-02-01 11:55:47 +0100 |
| commit | f18807955beceb86a99963feedee8355f31c481c (patch) | |
| tree | b659aa4c61379b439eca8dc21e4fa68ef5ba796f /searx/engines/www500px.py | |
| parent | 04fa31b7f4d45182fa4ced6d6e23fd9ec4960d2e (diff) | |
[mod] python importable engine names
Diffstat (limited to 'searx/engines/www500px.py')
| -rw-r--r-- | searx/engines/www500px.py | 63 |
1 files changed, 63 insertions, 0 deletions
## 500px (Images)
#
# @website     https://500px.com
# @provide-api yes (https://developers.500px.com/)
#
# @using-api   no
# @results     HTML
# @stable      no (HTML can change)
# @parse       url, title, thumbnail, img_src, content
#
# @todo        rewrite to api


from urllib import urlencode
from urlparse import urljoin
from lxml import html
import re

# engine dependent config
categories = ['images']
paging = True

# search-url
base_url = 'https://500px.com'
# NOTE: the query string previously read '/search?search?page=...', which
# made the first parameter name 'search?page' instead of 'page'.
search_url = base_url+'/search?page={pageno}&type=photos&{query}'

# Matches everything from "3.jpg" up to the end of a thumbnail URL, so the
# tail can be swapped for another rendition (presumably the "3"/"4"/"2048"
# part selects the image size on 500px -- confirm against the site).
_size_suffix_re = re.compile(r'3\.jpg.*$')


def _first_text(node, expression):
    """Return the first item matched by the XPath `expression`, or ''."""
    matches = node.xpath(expression)
    return matches[0] if matches else ''


# do search-request
def request(query, params):
    """Set params['url'] to the search URL for `query` and return params.

    The page number is read from params['pageno']; `query` is url-encoded
    as the `q` parameter.
    """
    params['url'] = search_url.format(pageno=params['pageno'],
                                      query=urlencode({'q': query}))

    return params


# get response from search-request
def response(resp):
    """Parse the HTML in resp.text and return a list of image results.

    Each result is a dict with the keys url, title, img_src, content,
    thumbnail_src and template. Result tiles without a link or without a
    thumbnail image are skipped; a missing title or info text yields ''.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath('//div[@class="photo"]'):
        links = result.xpath('.//a')
        if not links:
            # the markup is not stable: skip tiles we cannot link to
            continue
        link = links[0]

        images = link.xpath('.//img')
        if not images:
            continue
        thumbnail_src = images[0].attrib.get('src')
        if not thumbnail_src:
            continue

        url = urljoin(base_url, link.attrib.get('href'))
        title = _first_text(result, './/div[@class="title"]//text()')
        # To have a bigger thumbnail, uncomment the next line
        # thumbnail_src = _size_suffix_re.sub('4.jpg', thumbnail_src)
        content = _first_text(result, './/div[@class="info"]//text()')
        img_src = _size_suffix_re.sub('2048.jpg', thumbnail_src)

        # append result
        results.append({'url': url,
                        'title': title,
                        'img_src': img_src,
                        'content': content,
                        'thumbnail_src': thumbnail_src,
                        'template': 'images.html'})

    # return results
    return results