summaryrefslogtreecommitdiff
path: root/searx/enginelib/__init__.py
blob: 9d864e622e5f0b238468a91bc598a37e27481a30 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Implementations of the framework for the SearXNG engines.

- :py:obj:`searx.enginelib.EngineCache`
- :py:obj:`searx.enginelib.Engine`
- :py:obj:`searx.enginelib.traits`

There is a command line for developer purposes and for deeper analysis.  Here is
an example in which the command line is called in the development environment::

  $ ./manage pyenv.cmd bash --norc --noprofile
  (py3) python -m searx.enginelib --help

.. hint::

   The long term goal is to modularize all implementations of the engine
   framework here in this Python package.  ToDo:

   - move implementations of the :ref:`searx.engines loader` to a new module in
     the :py:obj:`searx.enginelib` namespace.

-----

"""

__all__ = ["EngineCache", "Engine", "ENGINES_CACHE"]

import typing as t
import abc
from collections.abc import Callable
import logging
import string
import typer

from ..cache import ExpireCacheSQLite, ExpireCacheCfg

if t.TYPE_CHECKING:
    from searx.enginelib import traits
    from searx.enginelib.traits import EngineTraits
    from searx.extended_types import SXNG_Response
    from searx.result_types import EngineResults
    from searx.search.processors import OfflineParamTypes, OnlineParamTypes

ENGINES_CACHE: ExpireCacheSQLite = ExpireCacheSQLite.build_cache(
    ExpireCacheCfg(
        name="ENGINES_CACHE",
        MAXHOLD_TIME=60 * 60 * 24 * 7,  # 7 days
        # NOTE(review): 60 * 60 seconds is one hour; comment and docstring used
        # to claim two hours.  If two hours is intended, change the value.
        MAINTENANCE_PERIOD=60 * 60,  # 1h
    )
)
"""Global :py:obj:`searx.cache.ExpireCacheSQLite` instance where the cached
values from all engines are stored.  The `MAXHOLD_TIME` is 7 days and the
`MAINTENANCE_PERIOD` is set to one hour."""

# Typer application collecting the cache related commands of this module; the
# functions decorated with ``@app.command()`` below (``state`` and
# ``maintenance``) are registered on it.
app = typer.Typer()


@app.command()
def state():
    """Show state for the caches of the engines."""

    # NOTE: the docstring above doubles as the help text of the CLI command.

    def _headline(text: str) -> None:
        # Print a section title underlined with "=" of the same length.
        print(text)
        print("=" * len(text))

    # Section 1: report of the cache state (tables and their key/value pairs).
    _headline("cache tables and key/values")
    print(ENGINES_CACHE.state().report())
    print()

    # Section 2: the properties stored alongside the cache.
    _headline(f"properties of {ENGINES_CACHE.cfg.name}")
    print(str(ENGINES_CACHE.properties))


@app.command()
def maintenance(force: bool = True):
    """Carry out maintenance on cache of the engines."""
    # Thin CLI wrapper: the ``force`` flag is handed over unchanged to the
    # maintenance method of the shared ENGINES_CACHE instance.
    # NOTE(review): presumably ``force=True`` runs the maintenance even if the
    # configured MAINTENANCE_PERIOD has not elapsed yet -- confirm in
    # searx.cache.
    ENGINES_CACHE.maintenance(force=force)


class EngineCache:
    """Persistent (SQLite) key/value cache that deletes its values again after
    ``expire`` seconds (default/max: :py:obj:`MAXHOLD_TIME
    <searx.cache.ExpireCacheCfg.MAXHOLD_TIME>`).  This class is a wrapper around
    :py:obj:`ENGINES_CACHE` (:py:obj:`ExpireCacheSQLite
    <searx.cache.ExpireCacheSQLite>`).

    In the :origin:`searx/engines/demo_offline.py` engine you can find an
    exemplary implementation of such a cache other examples are implemented
    in:

    - :origin:`searx/engines/radio_browser.py`
    - :origin:`searx/engines/soundcloud.py`
    - :origin:`searx/engines/startpage.py`

    .. code: python

       from searx.enginelib import EngineCache
       CACHE: EngineCache

       def init(engine_settings):
           global CACHE
           CACHE = EngineCache(engine_settings["name"])

       def request(query, params):
           token = CACHE.get(key="token")
           if token is None:
               token = get_token()
               # cache token of this engine for 1h
               CACHE.set(key="token", value=token, expire=3600)
           ...

    For introspection of the DB, jump into developer environment and run command to
    show cache state::

        $ ./manage pyenv.cmd bash --norc --noprofile
        (py3) python -m searx.enginelib cache state

        cache tables and key/values
        ===========================
        [demo_offline        ] 2025-04-22 11:32:50 count        --> (int) 4
        [startpage           ] 2025-04-22 12:32:30 SC_CODE      --> (str) fSOBnhEMlDfE20
        [duckduckgo          ] 2025-04-22 12:32:31 4dff493e.... --> (str) 4-128634958369380006627592672385352473325
        [duckduckgo          ] 2025-04-22 12:40:06 3e2583e2.... --> (str) 4-263126175288871260472289814259666848451
        [radio_browser       ] 2025-04-23 11:33:08 servers      --> (list) ['https://de2.api.radio-browser.info',  ...]
        [soundcloud          ] 2025-04-29 11:40:06 guest_client_id --> (str) EjkRJG0BLNEZquRiPZYdNtJdyGtTuHdp
        [wolframalpha        ] 2025-04-22 12:40:06 code         --> (str) 5aa79f86205ad26188e0e26e28fb7ae7
        number of tables: 6
        number of key/value pairs: 7

    In the "cache tables and key/values" section, the table name (engine name) is at
    first position on the second there is the calculated expire date and on the
    third and fourth position the key/value is shown.

    About duckduckgo: The *vqd coode* of ddg depends on the query term and therefore
    the key is a hash value of the query term (to not to store the raw query term).

    In the "properties of ENGINES_CACHE" section all properties of the SQLiteAppl /
    ExpireCache and their last modification date are shown::

        properties of ENGINES_CACHE
        ===========================
        [last modified: 2025-04-22 11:32:27] DB_SCHEMA           : 1
        [last modified: 2025-04-22 11:32:27] LAST_MAINTENANCE    :
        [last modified: 2025-04-22 11:32:27] crypt_hash          : ca612e3566fdfd7cf7efe...
        [last modified: 2025-04-22 11:32:30] CACHE-TABLE--demo_offline: demo_offline
        [last modified: 2025-04-22 11:32:30] CACHE-TABLE--startpage: startpage
        [last modified: 2025-04-22 11:32:31] CACHE-TABLE--duckduckgo: duckduckgo
        [last modified: 2025-04-22 11:33:08] CACHE-TABLE--radio_browser: radio_browser
        [last modified: 2025-04-22 11:40:06] CACHE-TABLE--soundcloud: soundcloud
        [last modified: 2025-04-22 11:40:06] CACHE-TABLE--wolframalpha: wolframalpha

    These properties provide information about the state of the ExpireCache and
    control the behavior.  For example, the maintenance intervals are controlled by
    the last modification date of the LAST_MAINTENANCE property and the hash value
    of the password can be used to detect whether the password has been changed (in
    this case the DB entries can no longer be decrypted and the entire cache must be
    discarded).
    """

    def __init__(self, engine_name: str, expire: int | None = None):
        self.expire: int = expire or ENGINES_CACHE.cfg.MAXHOLD_TIME
        _valid = "-_." + string.ascii_letters + string.digits
        self.table_name: str = "".join([c if c in _valid else "_" for c in engine_name])

    def set(self, key: str, value: t.Any, expire: int | None = None) -> bool:
        return ENGINES_CACHE.set(
            key=key,
            value=value,
            expire=expire or self.expire,
            ctx=self.table_name,
        )

    def get(self, key: str, default: t.Any = None) -> t.Any:
        return ENGINES_CACHE.get(key, default=default, ctx=self.table_name)

    def secret_hash(self, name: str | bytes) -> str:
        return ENGINES_CACHE.secret_hash(name=name)


class Engine(abc.ABC):  # pylint: disable=too-few-public-methods
    """Class of engine instances build from YAML settings.

    Further documentation see :ref:`general engine configuration`.

    .. hint::

       This class is currently never initialized and only used for type hinting.
    """

    logger: logging.Logger

    # Common options in the engine module

    engine_type: str
    """Type of the engine (:ref:`searx.search.processors`)"""

    paging: bool
    """Engine supports multiple pages."""

    max_page: int = 0
    """If the engine supports paging, then this is the value for the last page
    that is still supported. ``0`` means unlimited numbers of pages."""

    time_range_support: bool
    """Engine supports search time range."""

    safesearch: bool
    """Engine supports SafeSearch"""

    language_support: bool
    """Engine supports languages (locales) search."""

    language: str
    """For an engine, when there is ``language: ...`` in the YAML settings the engine
    does support only this one language:

    .. code:: yaml

      - name: google french
        engine: google
        language: fr
    """

    region: str
    """For an engine, when there is ``region: ...`` in the YAML settings the engine
    does support only this one region::

    .. code:: yaml

      - name: google belgium
        engine: google
        region: fr-BE
    """

    fetch_traits: "Callable[[EngineTraits, bool], None]"
    """Function to to fetch engine's traits from origin."""

    traits: "traits.EngineTraits"
    """Traits of the engine."""

    # settings.yml

    categories: list[str]
    """Specifies to which :ref:`engine categories` the engine should be added."""

    name: str
    """Name that will be used across SearXNG to define this engine.  In settings, on
    the result page .."""

    engine: str
    """Name of the python file used to handle requests and responses to and from
    this search engine (file name from :origin:`searx/engines` without
    ``.py``)."""

    enable_http: bool
    """Enable HTTP (by default only HTTPS is enabled)."""

    shortcut: str
    """Code used to execute bang requests (``!foo``)"""

    timeout: float
    """Specific timeout for search-engine."""

    display_error_messages: bool
    """Display error messages on the web UI."""

    proxies: dict[str, dict[str, str]]
    """Set proxies for a specific engine (YAML):

    .. code:: yaml

       proxies :
         http:  socks5://proxy:port
         https: socks5://proxy:port
    """

    disabled: bool
    """To disable by default the engine, but not deleting it.  It will allow the
    user to manually activate it in the settings."""

    inactive: bool
    """Remove the engine from the settings (*disabled & removed*)."""

    about: dict[str, dict[str, str]]
    """Additional fields describing the engine.

    .. code:: yaml

       about:
          website: https://example.com
          wikidata_id: Q306656
          official_api_documentation: https://example.com/api-doc
          use_official_api: true
          require_api_key: true
          results: HTML
    """

    using_tor_proxy: bool
    """Using tor proxy (``true``) or not (``false``) for this engine."""

    send_accept_language_header: bool
    """When this option is activated, the language (locale) that is selected by
    the user is used to build and send a ``Accept-Language`` header in the
    request to the origin search engine."""

    tokens: list[str]
    """A list of secret tokens to make this engine *private*, more details see
    :ref:`private engines`."""

    weight: int
    """Weighting of the results of this engine (:ref:`weight <settings engines>`)."""

    def setup(self, engine_settings: dict[str, t.Any]) -> bool:  # pylint: disable=unused-argument
        """Dynamic setup of the engine settings.

        With this method, the engine's setup is carried out.  For example, to
        check or dynamically adapt the values handed over in the parameter
        ``engine_settings``.  The return value (True/False) indicates whether
        the setup was successful and the engine can be built or rejected.

        The method is optional and is called synchronously as part of the
        initialization of the service and is therefore only suitable for simple
        (local) exams/changes at the engine setting.  The :py:obj:`Engine.init`
        method must be used for longer tasks in which values of a remote must be
        determined, for example.
        """
        return True

    def init(self, engine_settings: dict[str, t.Any]) -> bool | None:  # pylint: disable=unused-argument
        """Initialization of the engine.

        The method is optional and asynchronous (in a thread).  It is suitable,
        for example, for setting up a cache (for the engine) or for querying
        values (required by the engine) from a remote.

        Whether the initialization was successful can be indicated by the return
        value ``True`` or even ``False``.

        - If no return value is given from this init method (``None``), this is
          equivalent to ``True``.

        - If an exception is thrown as part of the initialization, this is
          equivalent to ``False``.
        """
        return True

    @abc.abstractmethod
    def search(self, query: str, params: "OfflineParamTypes") -> "EngineResults":
        """Search method of the ``offline`` engines"""

    @abc.abstractmethod
    def request(self, query: str, params: "OnlineParamTypes") -> None:
        """Method to build the parameters for the request of an ``online``
        engine."""

    @abc.abstractmethod
    def response(self, resp: "SXNG_Response") -> "EngineResults":
        """Method to parse the response of an ``online`` engine."""