1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
|
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Public domain image archive"""
from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl
from json import dumps
from searx.network import get
from searx.utils import extr
from searx.exceptions import SearxEngineAccessDeniedException, SearxEngineException
# imgix query string appended to cleaned image urls to request a 360px-bounded thumbnail
THUMBNAIL_SUFFIX = "?fit=max&h=360&w=360"
"""
Example thumbnail urls (from requests & html):
- https://the-public-domain-review.imgix.net
/shop/nov-2023-prints-00043.jpg
?fit=max&h=360&w=360
- https://the-public-domain-review.imgix.net
/collections/the-history-of-four-footed-beasts-and-serpents-1658/
8616383182_5740fa7851_o.jpg
?fit=max&h=360&w=360
Example full image urls (from html)
- https://the-public-domain-review.imgix.net/shop/
nov-2023-prints-00043.jpg
?fit=clip&w=970&h=800&auto=format,compress
- https://the-public-domain-review.imgix.net/collections/
the-history-of-four-footed-beasts-and-serpents-1658/8616383182_5740fa7851_o.jpg
?fit=clip&w=310&h=800&auto=format,compress
The thumbnail url from the request will be cleaned for the full image link
The cleaned thumbnail url will have THUMBNAIL_SUFFIX added to them, based on the original thumbnail parameters
"""
# about
about = {
"website": 'https://pdimagearchive.org',
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
# base site url; also used to discover the JS config bundle that embeds the API url
pdia_base_url = 'https://pdimagearchive.org'
# markers delimiting the InfiniteSearch bundle's hashed filename in the search page HTML
pdia_config_start = "/_astro/InfiniteSearch."
pdia_config_end = ".js"
categories = ['images']
# number of hits requested per page from the Algolia API
page_size = 20
paging = True
# lazily-resolved API endpoint; populated by _get_algolia_api_url(), reset on HTTP 403
__CACHED_API_URL = None
def _clean_url(url):
parsed = urlparse(url)
query = [(k, v) for (k, v) in parse_qsl(parsed.query) if k not in ['ixid', 's']]
return urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params, urlencode(query), parsed.fragment))
def _get_algolia_api_url():
    """Resolve and cache the Algolia API endpoint used by PDImageArchive.

    The endpoint is not published anywhere stable: it is embedded in a hashed
    ``InfiniteSearch.*.js`` bundle whose name must first be scraped from the
    search page, hence the two requests.

    Returns:
        str: the API url (also stored in the module-level cache).

    Raises:
        LookupError: when the search page, the config bundle, or the API url
            inside the bundle cannot be located.
    """
    global __CACHED_API_URL  # pylint:disable=global-statement
    if __CACHED_API_URL:
        return __CACHED_API_URL

    # fake request to extract api url
    resp = get(f"{pdia_base_url}/search/?q=", timeout=3)
    if resp.status_code != 200:
        raise LookupError("Failed to fetch config location (and as such the API url) for PDImageArchive")
    pdia_config_filepart = extr(resp.text, pdia_config_start, pdia_config_end)
    if not pdia_config_filepart:
        # extr() falls back to an empty string when the markers are missing —
        # fail early instead of requesting a nonsensical bundle url below
        raise LookupError("Failed to locate the InfiniteSearch config bundle for PDImageArchive")
    pdia_config_url = pdia_base_url + pdia_config_start + pdia_config_filepart + pdia_config_end

    # use the same timeout as the first request (it was missing here before)
    resp = get(pdia_config_url, timeout=3)
    if resp.status_code != 200:
        raise LookupError("Failed to obtain AWS api url for PDImageArchive")
    api_url = extr(resp.text, 'const r="', '"', default=None)
    if api_url is None:
        raise LookupError("Couldn't obtain AWS api url for PDImageArchive")

    __CACHED_API_URL = api_url
    return api_url
def _clear_cached_api_url():
    """Drop the cached API url so the next request re-resolves it (used on HTTP 403)."""
    global __CACHED_API_URL  # pylint:disable=global-statement
    __CACHED_API_URL = None
def request(query, params):
    """Build the POST request against the Algolia search API.

    ``pageno`` is 1-based in searx but Algolia pages are 0-based, hence the
    ``- 1`` below.
    """
    params['url'] = _get_algolia_api_url()
    params['method'] = 'POST'
    params['headers'] = {'Content-Type': 'application/json'}
    params['data'] = dumps(
        {
            'page': params['pageno'] - 1,
            'query': query,
            'hitsPerPage': page_size,
            'indexName': 'prod_all-images',
        }
    )
    # HTTP errors are handled manually in response() so that a stale cached
    # API url can be reset on access-denied answers
    params['raise_for_httperror'] = False
    return params
def response(resp):
    """Parse the Algolia JSON answer into searx image results.

    Raises:
        SearxEngineAccessDeniedException: on HTTP 403; the cached API url is
            cleared first so it is re-resolved on the next request.
        SearxEngineException: on any other non-200 status.
    """
    # Check the status *before* decoding the body: error answers are not
    # guaranteed to be JSON, and resp.json() raising here would mask the
    # intended exceptions (and skip clearing the stale cached API url).
    if resp.status_code == 403:
        _clear_cached_api_url()
        raise SearxEngineAccessDeniedException()
    if resp.status_code != 200:
        raise SearxEngineException()

    json_data = resp.json()
    if 'results' not in json_data:
        return []

    results = []
    for result in json_data['results'][0]['hits']:
        content = []
        if result.get("themes"):
            content.append("Themes: " + result['themes'])
        if result.get("encompassingWork"):
            content.append("Encompassing work: " + result['encompassingWork'])
        # strip the thumbnail's sizing query; a normalized one is re-added below
        base_image_url = result['thumbnail'].split("?")[0]
        results.append(
            {
                'template': 'images.html',
                'url': _clean_url(f"{about['website']}/images/{result['objectID']}"),
                'img_src': _clean_url(base_image_url),
                'thumbnail_src': _clean_url(base_image_url + THUMBNAIL_SUFFIX),
                'title': f"{result['title'].strip()} by {result['artist']} {result.get('displayYear', '')}",
                'content': "\n".join(content),
            }
        )
    return results
|