summaryrefslogtreecommitdiff
path: root/searx/search/processors/online.py
diff options
context:
space:
mode:
authorMarkus Heiser <markus.heiser@darmarit.de>2025-09-11 19:10:27 +0200
committerMarkus Heiser <markus.heiser@darmarIT.de>2025-09-18 19:40:03 +0200
commit8f8343dc0d78bb57215afc3e99fd9000fce6e0cf (patch)
tree7c0aa8587ed4bc47e403b4148a308191e2d21c55 /searx/search/processors/online.py
parent23257bddce864cfc44d64324dee36b32b1cf5248 (diff)
[mod] addition of various type hints / engine processors
Continuation of #5147 .. typification of the engine processors. BTW: - removed obsolete engine property https_support - fixed & improved currency_convert - engine instances can now implement a engine.setup method [#5147] https://github.com/searxng/searxng/pull/5147 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/search/processors/online.py')
-rw-r--r--searx/search/processors/online.py275
1 files changed, 161 insertions, 114 deletions
diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py
index 778b4ac4d..23bb7fda0 100644
--- a/searx/search/processors/online.py
+++ b/searx/search/processors/online.py
@@ -1,8 +1,9 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
-"""Processors for engine-type: ``online``
+"""Processor used for ``online`` engines."""
-"""
-# pylint: disable=use-dict-literal
+__all__ = ["OnlineProcessor", "OnlineParams"]
+
+import typing as t
from timeit import default_timer
import asyncio
@@ -17,50 +18,132 @@ from searx.exceptions import (
SearxEngineTooManyRequestsException,
)
from searx.metrics.error_recorder import count_error
-from .abstract import EngineProcessor
+from .abstract import EngineProcessor, RequestParams
+
+if t.TYPE_CHECKING:
+ from searx.search.models import SearchQuery
+ from searx.results import ResultContainer
+ from searx.result_types import EngineResults
+
+
+class HTTPParams(t.TypedDict):
+ """HTTP request parameters"""
+
+ method: t.Literal["GET", "POST"]
+ """HTTP request method."""
+
+ headers: dict[str, str]
+ """HTTP header information."""
+
+ data: dict[str, str]
+ """Sending `form encoded data`_.
+
+ .. _form encoded data:
+ https://www.python-httpx.org/quickstart/#sending-form-encoded-data
+ """
+
+ json: dict[str, t.Any]
+    """Sending `JSON encoded data`_.
+
+ .. _JSON encoded data:
+ https://www.python-httpx.org/quickstart/#sending-json-encoded-data
+ """
+
+ content: bytes
+    """Sending `binary request data`_.
+
+ .. _binary request data:
+    https://www.python-httpx.org/quickstart/#sending-binary-request-data
+ """
+
+ url: str
+ """Requested url."""
+
+ cookies: dict[str, str]
+ """HTTP cookies."""
+
+ allow_redirects: bool
+ """Follow redirects"""
+
+ max_redirects: int
+ """Maximum redirects, hard limit."""
+
+ soft_max_redirects: int
+ """Maximum redirects, soft limit. Record an error but don't stop the engine."""
+
+ verify: None | t.Literal[False] | str # not sure str really works
+ """If not ``None``, it overrides the verify value defined in the network. Use
+ ``False`` to accept any server certificate and use a path to file to specify a
+ server certificate"""
+
+ auth: str | None
+ """An authentication to use when sending requests."""
+
+ raise_for_httperror: bool
+ """Raise an exception if the `HTTP response status code`_ is ``>= 300``.
+
+ .. _HTTP response status code:
+ https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status
+ """
+
+class OnlineParams(HTTPParams, RequestParams):
+ """Request parameters of a ``online`` engine."""
-def default_request_params():
+
+def default_request_params() -> HTTPParams:
"""Default request parameters for ``online`` engines."""
return {
- # fmt: off
- 'method': 'GET',
- 'headers': {},
- 'data': {},
- 'url': '',
- 'cookies': {},
- 'auth': None
- # fmt: on
+ "method": "GET",
+ "headers": {},
+ "data": {},
+ "json": {},
+ "content": b"",
+ "url": "",
+ "cookies": {},
+ "allow_redirects": False,
+ "max_redirects": 0,
+ "soft_max_redirects": 0,
+ "auth": None,
+ "verify": None,
+ "raise_for_httperror": True,
}
class OnlineProcessor(EngineProcessor):
"""Processor class for ``online`` engines."""
- engine_type = 'online'
+ engine_type: str = "online"
+
+ def init_engine(self) -> bool:
+ """This method is called in a thread, and before the base method is
+ called, the network must be set up for the ``online`` engines."""
+ self.init_network_in_thread(start_time=default_timer(), timeout_limit=self.engine.timeout)
+ return super().init_engine()
- def initialize(self):
+ def init_network_in_thread(self, start_time: float, timeout_limit: float):
# set timeout for all HTTP requests
- searx.network.set_timeout_for_thread(self.engine.timeout, start_time=default_timer())
+ searx.network.set_timeout_for_thread(timeout_limit, start_time=start_time)
# reset the HTTP total time
searx.network.reset_time_for_thread()
# set the network
- searx.network.set_context_network_name(self.engine_name)
- super().initialize()
-
- def get_params(self, search_query, engine_category):
- """Returns a set of :ref:`request params <engine request online>` or ``None``
- if request is not supported.
- """
- params = super().get_params(search_query, engine_category)
- if params is None:
- return None
+ searx.network.set_context_network_name(self.engine.name)
+
+ def get_params(self, search_query: "SearchQuery", engine_category: str) -> OnlineParams | None:
+ """Returns a dictionary with the :ref:`request params <engine request
+    online>` (:py:obj:`OnlineParams`); if the search condition is not
+ supported by the engine, ``None`` is returned."""
+
+ base_params: RequestParams | None = super().get_params(search_query, engine_category)
+ if base_params is None:
+ return base_params
- # add default params
- params.update(default_request_params())
+ params: OnlineParams = {**default_request_params(), **base_params}
+
+ headers = params["headers"]
# add an user agent
- params['headers']['User-Agent'] = gen_useragent()
+ headers["User-Agent"] = gen_useragent()
# add Accept-Language header
if self.engine.send_accept_language_header and search_query.locale:
@@ -71,73 +154,77 @@ class OnlineProcessor(EngineProcessor):
search_query.locale.territory,
search_query.locale.language,
)
- params['headers']['Accept-Language'] = ac_lang
+ headers["Accept-Language"] = ac_lang
- self.logger.debug('HTTP Accept-Language: %s', params['headers'].get('Accept-Language', ''))
+ self.logger.debug("HTTP Accept-Language: %s", headers.get("Accept-Language", ""))
return params
- def _send_http_request(self, params):
- # create dictionary which contain all
- # information about the request
- request_args = dict(headers=params['headers'], cookies=params['cookies'], auth=params['auth'])
+ def _send_http_request(self, params: OnlineParams):
+
+ # create dictionary which contain all information about the request
+ request_args: dict[str, t.Any] = {
+ "headers": params["headers"],
+ "cookies": params["cookies"],
+ "auth": params["auth"],
+ }
- # verify
- # if not None, it overrides the verify value defined in the network.
- # use False to accept any server certificate
- # use a path to file to specify a server certificate
- verify = params.get('verify')
+ verify = params.get("verify")
if verify is not None:
- request_args['verify'] = params['verify']
+ request_args["verify"] = verify
# max_redirects
- max_redirects = params.get('max_redirects')
+ max_redirects = params.get("max_redirects")
if max_redirects:
- request_args['max_redirects'] = max_redirects
+ request_args["max_redirects"] = max_redirects
# allow_redirects
- if 'allow_redirects' in params:
- request_args['allow_redirects'] = params['allow_redirects']
+ if "allow_redirects" in params:
+ request_args["allow_redirects"] = params["allow_redirects"]
# soft_max_redirects
- soft_max_redirects = params.get('soft_max_redirects', max_redirects or 0)
+ soft_max_redirects: int = params.get("soft_max_redirects", max_redirects or 0)
# raise_for_status
- request_args['raise_for_httperror'] = params.get('raise_for_httperror', True)
+ request_args["raise_for_httperror"] = params.get("raise_for_httperror", True)
# specific type of request (GET or POST)
- if params['method'] == 'GET':
+ if params["method"] == "GET":
req = searx.network.get
else:
req = searx.network.post
-
- request_args['data'] = params['data']
+ if params["data"]:
+ request_args["data"] = params["data"]
+ if params["json"]:
+ request_args["json"] = params["json"]
+ if params["content"]:
+ request_args["content"] = params["content"]
# send the request
- response = req(params['url'], **request_args)
+ response = req(params["url"], **request_args)
# check soft limit of the redirect count
if len(response.history) > soft_max_redirects:
# unexpected redirect : record an error
# but the engine might still return valid results.
- status_code = str(response.status_code or '')
- reason = response.reason_phrase or ''
+ status_code = str(response.status_code or "")
+ reason = response.reason_phrase or ""
hostname = response.url.host
count_error(
- self.engine_name,
- '{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects),
+ self.engine.name,
+ "{} redirects, maximum: {}".format(len(response.history), soft_max_redirects),
(status_code, reason, hostname),
secondary=True,
)
return response
- def _search_basic(self, query, params):
+ def _search_basic(self, query: str, params: OnlineParams) -> "EngineResults|None":
# update request parameters dependent on
# search-engine (contained in engines folder)
self.engine.request(query, params)
# ignoring empty urls
- if not params['url']:
+ if not params["url"]:
return None
# send request
@@ -147,13 +234,15 @@ class OnlineProcessor(EngineProcessor):
response.search_params = params
return self.engine.response(response)
- def search(self, query, params, result_container, start_time, timeout_limit):
- # set timeout for all HTTP requests
- searx.network.set_timeout_for_thread(timeout_limit, start_time=start_time)
- # reset the HTTP total time
- searx.network.reset_time_for_thread()
- # set the network
- searx.network.set_context_network_name(self.engine_name)
+ def search( # pyright: ignore[reportIncompatibleMethodOverride]
+ self,
+ query: str,
+ params: OnlineParams,
+ result_container: "ResultContainer",
+ start_time: float,
+ timeout_limit: float,
+ ):
+ self.init_network_in_thread(start_time, timeout_limit)
try:
# send requests and parse the results
@@ -162,7 +251,7 @@ class OnlineProcessor(EngineProcessor):
except ssl.SSLError as e:
# requests timeout (connect or read)
self.handle_exception(result_container, e, suspend=True)
- self.logger.error("SSLError {}, verify={}".format(e, searx.network.get_network(self.engine_name).verify))
+ self.logger.error("SSLError {}, verify={}".format(e, searx.network.get_network(self.engine.name).verify))
except (httpx.TimeoutException, asyncio.TimeoutError) as e:
# requests timeout (connect or read)
self.handle_exception(result_container, e, suspend=True)
@@ -179,55 +268,13 @@ class OnlineProcessor(EngineProcessor):
default_timer() - start_time, timeout_limit, e
)
)
- except SearxEngineCaptchaException as e:
- self.handle_exception(result_container, e, suspend=True)
- self.logger.exception('CAPTCHA')
- except SearxEngineTooManyRequestsException as e:
+ except (
+ SearxEngineCaptchaException,
+ SearxEngineTooManyRequestsException,
+ SearxEngineAccessDeniedException,
+ ) as e:
self.handle_exception(result_container, e, suspend=True)
- self.logger.exception('Too many requests')
- except SearxEngineAccessDeniedException as e:
- self.handle_exception(result_container, e, suspend=True)
- self.logger.exception('SearXNG is blocked')
+ self.logger.exception(e.message)
except Exception as e: # pylint: disable=broad-except
self.handle_exception(result_container, e)
- self.logger.exception('exception : {0}'.format(e))
-
- def get_default_tests(self):
- tests = {}
-
- tests['simple'] = {
- 'matrix': {'query': ('life', 'computer')},
- 'result_container': ['not_empty'],
- }
-
- if getattr(self.engine, 'paging', False):
- tests['paging'] = {
- 'matrix': {'query': 'time', 'pageno': (1, 2, 3)},
- 'result_container': ['not_empty'],
- 'test': ['unique_results'],
- }
- if 'general' in self.engine.categories:
- # avoid documentation about HTML tags (<time> and <input type="time">)
- tests['paging']['matrix']['query'] = 'news'
-
- if getattr(self.engine, 'time_range', False):
- tests['time_range'] = {
- 'matrix': {'query': 'news', 'time_range': (None, 'day')},
- 'result_container': ['not_empty'],
- 'test': ['unique_results'],
- }
-
- if getattr(self.engine, 'traits', False):
- tests['lang_fr'] = {
- 'matrix': {'query': 'paris', 'lang': 'fr'},
- 'result_container': ['not_empty', ('has_language', 'fr')],
- }
- tests['lang_en'] = {
- 'matrix': {'query': 'paris', 'lang': 'en'},
- 'result_container': ['not_empty', ('has_language', 'en')],
- }
-
- if getattr(self.engine, 'safesearch', False):
- tests['safesearch'] = {'matrix': {'query': 'porn', 'safesearch': (0, 2)}, 'test': ['unique_results']}
-
- return tests
+ self.logger.exception("exception : {0}".format(e))