summaryrefslogtreecommitdiff
path: root/searx/engines/public_domain_image_archive.py
blob: c8ca9010ce6e6a95e4dc65d63e1a722a6e99e97c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Public domain image archive"""

from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl
from json import dumps

from searx.network import get
from searx.utils import extr
from searx.exceptions import SearxEngineAccessDeniedException, SearxEngineException

THUMBNAIL_SUFFIX = "?fit=max&h=360&w=360"
"""
Example thumbnail urls (from requests & html):
- https://the-public-domain-review.imgix.net
  /shop/nov-2023-prints-00043.jpg
  ?fit=max&h=360&w=360
- https://the-public-domain-review.imgix.net
  /collections/the-history-of-four-footed-beasts-and-serpents-1658/
  8616383182_5740fa7851_o.jpg
  ?fit=max&h=360&w=360

Example full image urls (from html)
- https://the-public-domain-review.imgix.net/shop/
  nov-2023-prints-00043.jpg
  ?fit=clip&w=970&h=800&auto=format,compress
- https://the-public-domain-review.imgix.net/collections/
  the-history-of-four-footed-beasts-and-serpents-1658/8616383182_5740fa7851_o.jpg
  ?fit=clip&w=310&h=800&auto=format,compress

The thumbnail url from the request will be cleaned for the full image link
The cleaned thumbnail url will have THUMBNAIL_SUFFIX added to them, based on the original thumbnail parameters
"""

# about
about = {
    "website": 'https://pdimagearchive.org',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'JSON',
}

# Algolia search backend queried by request() (the application id is part of the hostname)
base_url = 'https://oqi2j6v4iz-dsn.algolia.net'
# JS bundle from which _get_algolia_api_key() extracts the current Algolia API key
pdia_config_url = 'https://pdimagearchive.org/_astro/config.BiNvrvzG.js'
categories = ['images']
page_size = 20
paging = True


# lazily fetched Algolia API key, memoized across requests
# (set by _get_algolia_api_key(), reset by _clear_cached_api_key() on HTTP 403)
__CACHED_API_KEY = None


def _clean_url(url):
    parsed = urlparse(url)
    query = [(k, v) for (k, v) in parse_qsl(parsed.query) if k not in ['ixid', 's']]

    return urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params, urlencode(query), parsed.fragment))


def _get_algolia_api_key():
    """Fetch and memoize the Algolia API key embedded in PDIA's JS config bundle.

    The key is extracted from the ``r="..."`` assignment in the fetched
    JavaScript and cached in the module-level ``__CACHED_API_KEY``.

    Raises:
        LookupError: if the config cannot be fetched (non-200 status) or the
            key cannot be located in the response body.
    """
    global __CACHED_API_KEY  # pylint:disable=global-statement

    if not __CACHED_API_KEY:
        resp = get(pdia_config_url)
        if resp.status_code != 200:
            raise LookupError("Failed to obtain Algolia API key for PDImageArchive")

        key = extr(resp.text, 'r="', '"', default=None)
        if key is None:
            raise LookupError("Couldn't obtain Algolia API key for PDImageArchive")

        __CACHED_API_KEY = key

    return __CACHED_API_KEY


def _clear_cached_api_key():
    """Drop the memoized Algolia API key so the next request re-fetches it.

    Called from response() on HTTP 403, which indicates the cached key has
    been rotated / invalidated upstream.
    """
    global __CACHED_API_KEY  # pylint:disable=global-statement

    __CACHED_API_KEY = None


def request(query, params):
    """Build the POST request against Algolia's multi-query endpoint.

    The API key and application id travel as query-string parameters; the
    actual search (query text, zero-based page) is JSON-encoded in the body.
    """
    auth_args = urlencode(
        {
            'x-algolia-api-key': _get_algolia_api_key(),
            'x-algolia-application-id': 'OQI2J6V4IZ',
        }
    )
    params['url'] = f"{base_url}/1/indexes/*/queries?{auth_args}"
    params["method"] = "POST"

    # Algolia pages are zero-based, searx pages start at 1
    search_args = urlencode(
        {
            "page": params["pageno"] - 1,
            "query": query,
            "highlightPostTag": "__ais-highlight__",
            "highlightPreTag": "__ais-highlight__",
        }
    )
    params["data"] = dumps({"requests": [{"indexName": "prod_all-images", "params": search_args}]})

    # http errors are handled manually to be able to reset the api key
    params['raise_for_httperror'] = False
    return params


def response(resp):
    """Parse the Algolia JSON response into searx image results.

    Returns a list of ``images.html`` result dicts (empty when the payload
    carries no ``results`` key).

    Raises:
        SearxEngineAccessDeniedException: on HTTP 403; the cached API key is
            cleared first so the next request fetches a fresh one.
        SearxEngineException: on any other non-200 status.
    """
    # Check the HTTP status *before* parsing the body: error responses are
    # not guaranteed to carry valid JSON, and calling resp.json() first
    # would raise a decode error before the API key could be reset.
    if resp.status_code == 403:
        _clear_cached_api_key()
        raise SearxEngineAccessDeniedException()

    if resp.status_code != 200:
        raise SearxEngineException()

    json_data = resp.json()

    if 'results' not in json_data:
        return []

    results = []

    for result in json_data['results'][0]['hits']:
        content = []

        if "themes" in result:
            content.append("Themes: " + result['themes'])

        if "encompassingWork" in result:
            content.append("Encompassing work: " + result['encompassingWork'])
        content = "\n".join(content)

        # strip the sizing query string from the thumbnail to get the
        # full-size image url (see THUMBNAIL_SUFFIX docs above)
        base_image_url = result['thumbnail'].split("?")[0]

        results.append(
            {
                'template': 'images.html',
                'url': _clean_url(f"{about['website']}/images/{result['objectID']}"),
                'img_src': _clean_url(base_image_url),
                'thumbnail_src': _clean_url(base_image_url + THUMBNAIL_SUFFIX),
                'title': f"{result['title'].strip()} by {result['artist']} {result.get('displayYear', '')}",
                'content': content,
            }
        )

    return results