1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
|
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Public domain image archive"""
from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl
from json import dumps
from searx.network import get
from searx.utils import extr
from searx.exceptions import SearxEngineAccessDeniedException, SearxEngineException
# imgix query string appended to cleaned image urls to request a 360px-bounded thumbnail
THUMBNAIL_SUFFIX = "?fit=max&h=360&w=360"
"""
Example thumbnail urls (from requests & html):
- https://the-public-domain-review.imgix.net
/shop/nov-2023-prints-00043.jpg
?fit=max&h=360&w=360
- https://the-public-domain-review.imgix.net
/collections/the-history-of-four-footed-beasts-and-serpents-1658/
8616383182_5740fa7851_o.jpg
?fit=max&h=360&w=360
Example full image urls (from html)
- https://the-public-domain-review.imgix.net/shop/
nov-2023-prints-00043.jpg
?fit=clip&w=970&h=800&auto=format,compress
- https://the-public-domain-review.imgix.net/collections/
the-history-of-four-footed-beasts-and-serpents-1658/8616383182_5740fa7851_o.jpg
?fit=clip&w=310&h=800&auto=format,compress
The thumbnail url from the request will be cleaned for the full image link
The cleaned thumbnail url will have THUMBNAIL_SUFFIX added to them, based on the original thumbnail parameters
"""
# about
about = {
"website": 'https://pdimagearchive.org',
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
# base site url; also used to discover the JS config bundle that embeds the API url
pdia_base_url = 'https://pdimagearchive.org'
# markers delimiting the InfiniteSearch bundle's hashed filename in the search page HTML
pdia_config_start = "/_astro/InfiniteSearch."
pdia_config_end = ".js"
categories = ['images']
# number of hits requested per page from the Algolia API
page_size = 20
paging = True
# lazily-resolved API endpoint; populated by _get_algolia_api_url(), reset on HTTP 403
__CACHED_API_URL = None
def _clean_url(url):
parsed = urlparse(url)
query = [(k, v) for (k, v) in parse_qsl(parsed.query) if k not in ['ixid', 's']]
return urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params, urlencode(query), parsed.fragment))
def _get_algolia_api_url():
    """Resolve and cache the Algolia API endpoint used by PDImageArchive.

    The endpoint is not published anywhere stable: it is embedded in a hashed
    ``InfiniteSearch.*.js`` bundle whose name must first be scraped from the
    search page, hence the two requests.

    Returns:
        str: the API url (also stored in the module-level cache).

    Raises:
        LookupError: when the search page, the config bundle, or the API url
            inside the bundle cannot be located.
    """
    global __CACHED_API_URL  # pylint:disable=global-statement
    if __CACHED_API_URL:
        return __CACHED_API_URL

    # fake request to extract api url
    resp = get(f"{pdia_base_url}/search/?q=", timeout=3)
    if resp.status_code != 200:
        raise LookupError("Failed to fetch config location (and as such the API url) for PDImageArchive")
    pdia_config_filepart = extr(resp.text, pdia_config_start, pdia_config_end)
    if not pdia_config_filepart:
        # extr() falls back to an empty string when the markers are missing —
        # fail early instead of requesting a nonsensical bundle url below
        raise LookupError("Failed to locate the InfiniteSearch config bundle for PDImageArchive")
    pdia_config_url = pdia_base_url + pdia_config_start + pdia_config_filepart + pdia_config_end

    # use the same timeout as the first request (it was missing here before)
    resp = get(pdia_config_url, timeout=3)
    if resp.status_code != 200:
        raise LookupError("Failed to obtain AWS api url for PDImageArchive")
    api_url = extr(resp.text, 'const r="', '"', default=None)
    if api_url is None:
        raise LookupError("Couldn't obtain AWS api url for PDImageArchive")

    __CACHED_API_URL = api_url
    return api_url
def _clear_cached_api_url():
    """Drop the cached API url so the next request re-resolves it (used on HTTP 403)."""
    global __CACHED_API_URL  # pylint:disable=global-statement
    __CACHED_API_URL = None
def request(query, params):
    """Build the POST request against the Algolia search API.

    ``pageno`` is 1-based in searx but Algolia pages are 0-based, hence the
    ``- 1`` below.
    """
    params['url'] = _get_algolia_api_url()
    params['method'] = 'POST'
    params['headers'] = {'Content-Type': 'application/json'}
    params['data'] = dumps(
        {
            'page': params['pageno'] - 1,
            'query': query,
            'hitsPerPage': page_size,
            'indexName': 'prod_all-images',
        }
    )
    # HTTP errors are handled manually in response() so that a stale cached
    # API url can be reset on access-denied answers
    params['raise_for_httperror'] = False
    return params
def response(resp):
    """Parse the Algolia JSON answer into searx image results.

    Raises:
        SearxEngineAccessDeniedException: on HTTP 403; the cached API url is
            cleared first so it is re-resolved on the next request.
        SearxEngineException: on any other non-200 status.
    """
    # Check the status *before* decoding the body: error answers are not
    # guaranteed to be JSON, and resp.json() raising here would mask the
    # intended exceptions (and skip clearing the stale cached API url).
    if resp.status_code == 403:
        _clear_cached_api_url()
        raise SearxEngineAccessDeniedException()
    if resp.status_code != 200:
        raise SearxEngineException()

    json_data = resp.json()
    if 'results' not in json_data:
        return []

    results = []
    for result in json_data['results'][0]['hits']:
        content = []
        if result.get("themes"):
            content.append("Themes: " + result['themes'])
        if result.get("encompassingWork"):
            content.append("Encompassing work: " + result['encompassingWork'])
        # strip the thumbnail's sizing query; a normalized one is re-added below
        base_image_url = result['thumbnail'].split("?")[0]
        results.append(
            {
                'template': 'images.html',
                'url': _clean_url(f"{about['website']}/images/{result['objectID']}"),
                'img_src': _clean_url(base_image_url),
                'thumbnail_src': _clean_url(base_image_url + THUMBNAIL_SUFFIX),
                'title': f"{result['title'].strip()} by {result['artist']} {result.get('displayYear', '')}",
                'content': "\n".join(content),
            }
        )
    return results
|