diff options
| author | Markus Heiser <markus.heiser@darmarit.de> | 2020-11-03 08:44:41 +0100 |
|---|---|---|
| committer | Markus Heiser <markus.heiser@darmarit.de> | 2020-11-14 17:09:56 +0100 |
| commit | c71d214b0c3caeb6a5e29c5ab8087c0cacc83932 (patch) | |
| tree | 430f8d134a9abc89821ab3f6aac16c22e6346b76 /searx/engines/deviantart.py | |
| parent | 4f18faebe17805ab89ec74e76d722fa64fc2418c (diff) | |
[refactor] deviantart - improve results and clean up source code
DeviantArt's request and response forms have been changed.
- fixed title
- fixed time_range_dict to 'popular-*-***'
- use image from <noscript> if exists
- drop obsolete "http to https, remove domain sharding"
- use query URL https://www.deviantart.com/search/deviations?page=5&q=foo
- add searx/engines/deviantart.py to pylint check (test.pylint)
Error pattern::
There DEBUG:searx:result: invalid title: {'url': 'https://www.deviantart.com/ ...
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/engines/deviantart.py')
| -rw-r--r-- | searx/engines/deviantart.py | 85 |
1 files changed, 40 insertions, 45 deletions
diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py index c06a79b79..0378929b2 100644 --- a/searx/engines/deviantart.py +++ b/searx/engines/deviantart.py @@ -7,75 +7,70 @@ @using-api no (TODO, rewrite to api) @results HTML @stable no (HTML can change) - @parse url, title, thumbnail_src, img_src + @parse url, title, img_src @todo rewrite to api """ +# pylint: disable=missing-function-docstring -from lxml import html -import re from urllib.parse import urlencode - +from lxml import html # engine dependent config categories = ['images'] paging = True time_range_support = True -# search-url -base_url = 'https://www.deviantart.com/' -search_url = base_url + 'search?page={page}&{query}' -time_range_url = '&order={range}' - -time_range_dict = {'day': 11, - 'week': 14, - 'month': 15} +time_range_dict = { + 'day': 'popular-24-hours', + 'week': 'popular-1-week', + 'month': 'popular-1-month', + 'year': 'most-recent', +} +# search-url +base_url = 'https://www.deviantart.com' -# do search-request def request(query, params): - if params['time_range'] and params['time_range'] not in time_range_dict: - return params - params['url'] = search_url.format(page=params['pageno'], - query=urlencode({'q': query})) + # https://www.deviantart.com/search/deviations?page=5&q=foo + + query = { + 'page' : params['pageno'], + 'q' : query, + } if params['time_range'] in time_range_dict: - params['url'] += time_range_url.format(range=time_range_dict[params['time_range']]) + query['order'] = time_range_dict[params['time_range']] - return params + params['url'] = base_url + '/search/deviations?' 
+ urlencode(query) + return params -# get response from search-request def response(resp): - results = [] - # return empty array if a redirection code is returned - if resp.status_code == 302: - return [] + results = [] dom = html.fromstring(resp.text) - # parse results for row in dom.xpath('//div[contains(@data-hook, "content_row")]'): for result in row.xpath('./div'): - link = result.xpath('.//a[@data-hook="deviation_link"]')[0] - url = link.attrib.get('href') - title = link.attrib.get('title') - thumbnail_src = result.xpath('.//img')[0].attrib.get('src') - img_src = thumbnail_src - - # http to https, remove domain sharding - thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src) - thumbnail_src = re.sub(r"http://", "https://", thumbnail_src) - - url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url) - - # append result - results.append({'url': url, - 'title': title, - 'img_src': img_src, - 'thumbnail_src': thumbnail_src, - 'template': 'images.html'}) - - # return results + + a_tag = result.xpath('.//a[@data-hook="deviation_link"]')[0] + noscript_tag = a_tag.xpath('.//noscript') + + if noscript_tag: + img_tag = noscript_tag[0].xpath('.//img') + else: + img_tag = a_tag.xpath('.//img') + if not img_tag: + continue + img_tag = img_tag[0] + + results.append({ + 'template': 'images.html', + 'url': a_tag.attrib.get('href'), + 'img_src': img_tag.attrib.get('src'), + 'title': img_tag.attrib.get('alt'), + }) + return results |