summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--searx/engines/demo_offline.py18
-rw-r--r--searx/engines/duckduckgo.py91
-rw-r--r--searx/engines/radio_browser.py47
-rw-r--r--searx/engines/soundcloud.py32
-rw-r--r--searx/engines/startpage.py34
-rw-r--r--searx/engines/wolframalpha_api.py4
-rw-r--r--searx/engines/wolframalpha_noapi.py42
7 files changed, 167 insertions, 101 deletions
diff --git a/searx/engines/demo_offline.py b/searx/engines/demo_offline.py
index 2cef4f0d0..6a3b8ddf7 100644
--- a/searx/engines/demo_offline.py
+++ b/searx/engines/demo_offline.py
@@ -15,6 +15,7 @@ close to the implementation, its just a simple example. To get in use of this
import json
from searx.result_types import EngineResults
+from searx.enginelib import EngineCache
engine_type = 'offline'
categories = ['general']
@@ -32,14 +33,18 @@ about = {
# if there is a need for globals, use a leading underline
_my_offline_engine: str = ""
+CACHE: EngineCache
+"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
+seconds."""
-def init(engine_settings=None):
+
+def init(engine_settings):
"""Initialization of the (offline) engine. The origin of this demo engine is a
simple json string which is loaded in this example while the engine is
- initialized.
+ initialized."""
+ global _my_offline_engine, CACHE # pylint: disable=global-statement
- """
- global _my_offline_engine # pylint: disable=global-statement
+ CACHE = EngineCache(engine_settings["name"]) # type:ignore
_my_offline_engine = (
'[ {"value": "%s"}'
@@ -57,8 +62,8 @@ def search(query, request_params) -> EngineResults:
results.
"""
res = EngineResults()
+ count = CACHE.get("count", 0)
- count = 0
for row in json.loads(_my_offline_engine):
count += 1
kvmap = {
@@ -75,4 +80,7 @@ def search(query, request_params) -> EngineResults:
)
)
res.add(res.types.LegacyResult(number_of_results=count))
+
+ # cache counter value for 20sec
+ CACHE.set("count", count, expire=20)
return res
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index 450cd9cf8..62e1603a6 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -6,16 +6,17 @@ DuckDuckGo WEB
from __future__ import annotations
-from typing import TYPE_CHECKING
+import json
import re
+import typing
+
from urllib.parse import quote_plus
-import json
+
import babel
import lxml.html
from searx import (
locales,
- redislib,
external_bang,
)
from searx.utils import (
@@ -25,12 +26,12 @@ from searx.utils import (
extract_text,
)
from searx.network import get # see https://github.com/searxng/searxng/issues/762
-from searx import redisdb
from searx.enginelib.traits import EngineTraits
+from searx.enginelib import EngineCache
from searx.exceptions import SearxEngineCaptchaException
from searx.result_types import EngineResults
-if TYPE_CHECKING:
+if typing.TYPE_CHECKING:
import logging
logger: logging.Logger
@@ -61,28 +62,18 @@ url = "https://html.duckduckgo.com/html"
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
-__CACHE = []
+CACHE: EngineCache
+"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
+seconds."""
-def _cache_key(query: str, region: str):
- return 'SearXNG_ddg_web_vqd' + redislib.secret_hash(f"{query}//{region}")
-
-def cache_vqd(query: str, region: str, value: str):
- """Caches a ``vqd`` value from a query."""
- c = redisdb.client()
- if c:
- logger.debug("VALKEY cache vqd value: %s (%s)", value, region)
- c.set(_cache_key(query, region), value, ex=600)
-
- else:
- logger.debug("MEM cache vqd value: %s (%s)", value, region)
- if len(__CACHE) > 100: # cache vqd from last 100 queries
- __CACHE.pop(0)
- __CACHE.append((_cache_key(query, region), value))
+def init(_): # pylint: disable=unused-argument
+ global CACHE # pylint: disable=global-statement
+ CACHE = EngineCache("duckduckgo") # type:ignore
-def get_vqd(query: str, region: str, force_request: bool = False):
+def get_vqd(query: str, region: str, force_request: bool = False) -> str:
"""Returns the ``vqd`` that fits to the *query*.
:param query: The query term
@@ -114,31 +105,34 @@ def get_vqd(query: str, region: str, force_request: bool = False):
seems the block list is a sliding window: to get my IP rid from the bot list
I had to cool down my IP for 1h (send no requests from that IP to DDG).
"""
- key = _cache_key(query, region)
-
- c = redisdb.client()
- if c:
- value = c.get(key)
- if value or value == b'':
- value = value.decode('utf-8') # type: ignore
- logger.debug("re-use CACHED vqd value: %s", value)
- return value
+ key = CACHE.secret_hash(f"{query}//{region}")
+ value = CACHE.get(key=key)
+ if value is not None and not force_request:
+ logger.debug("vqd: re-use cached value: %s", value)
+ return value
+
+ logger.debug("vqd: request value from from duckduckgo.com")
+ resp = get(f'https://duckduckgo.com/?q={quote_plus(query)}')
+ if resp.status_code == 200: # type: ignore
+ value = extr(resp.text, 'vqd="', '"') # type: ignore
+ if value:
+ logger.debug("vqd value from duckduckgo.com request: '%s'", value)
+ else:
+ logger.error("vqd: can't parse value from ddg response (return empty string)")
+ return ""
+ else:
+ logger.error("vqd: got HTTP %s from duckduckgo.com", resp.status_code)
- for k, value in __CACHE:
- if k == key:
- logger.debug("MEM re-use CACHED vqd value: %s", value)
- return value
+ if value:
+ CACHE.set(key=key, value=value)
+ else:
+ logger.error("vqd value from duckduckgo.com ", resp.status_code)
+ return value
- if force_request:
- resp = get(f'https://duckduckgo.com/?q={quote_plus(query)}')
- if resp.status_code == 200: # type: ignore
- value = extr(resp.text, 'vqd="', '"') # type: ignore
- if value:
- logger.debug("vqd value from DDG request: %s", value)
- cache_vqd(query, region, value)
- return value
- return None
+def set_vqd(query: str, region: str, value: str):
+ key = CACHE.secret_hash(f"{query}//{region}")
+ CACHE.set(key=key, value=value, expire=3600)
def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
@@ -373,8 +367,11 @@ def response(resp) -> EngineResults:
# some locales (at least China) does not have a "next page" button
form = form[0]
form_vqd = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
-
- cache_vqd(resp.search_params['data']['q'], resp.search_params['data']['kl'], form_vqd)
+ set_vqd(
+ query=resp.search_params['data']['q'],
+ region=resp.search_params['data']['kl'],
+ value=str(form_vqd),
+ )
# just select "web-result" and ignore results of class "result--ad result--ad--small"
for div_result in eval_xpath(doc, '//div[@id="links"]/div[contains(@class, "web-result")]'):
@@ -401,7 +398,7 @@ def response(resp) -> EngineResults:
results.add(
results.types.Answer(
answer=zero_click,
- url=eval_xpath_getindex(doc, '//div[@id="zero_click_abstract"]/a/@href', 0),
+ url=eval_xpath_getindex(doc, '//div[@id="zero_click_abstract"]/a/@href', 0), # type: ignore
)
)
diff --git a/searx/engines/radio_browser.py b/searx/engines/radio_browser.py
index 64208304e..70aecd476 100644
--- a/searx/engines/radio_browser.py
+++ b/searx/engines/radio_browser.py
@@ -5,7 +5,9 @@
https://de1.api.radio-browser.info/#Advanced_station_search
"""
+from __future__ import annotations
+import typing
import random
import socket
from urllib.parse import urlencode
@@ -13,9 +15,15 @@ import babel
from flask_babel import gettext
from searx.network import get
+from searx.enginelib import EngineCache
from searx.enginelib.traits import EngineTraits
from searx.locales import language_tag
+if typing.TYPE_CHECKING:
+ import logging
+
+    logger: logging.Logger
+
traits: EngineTraits
about = {
@@ -52,11 +60,24 @@ none filters are applied. Valid filters are:
"""
-servers = []
+CACHE: EngineCache
+"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
+seconds."""
def init(_):
- # see https://api.radio-browser.info
+ global CACHE # pylint: disable=global-statement
+ CACHE = EngineCache("radio_browser")
+ server_list()
+
+
+def server_list() -> list[str]:
+
+ servers = CACHE.get("servers", [])
+ if servers:
+ return servers
+
+ # hint: can take up to 40sec!
ips = socket.getaddrinfo("all.api.radio-browser.info", 80, 0, 0, socket.IPPROTO_TCP)
for ip_tuple in ips:
_ip: str = ip_tuple[4][0] # type: ignore
@@ -65,8 +86,22 @@ def init(_):
if srv not in servers:
servers.append(srv)
+ # update server list once in 24h
+ CACHE.set(key="servers", value=servers, expire=60 * 60 * 24)
+
+ return servers
+
def request(query, params):
+
+ servers = server_list()
+ if not servers:
+ logger.error("Fetched server list is empty!")
+ params["url"] = None
+ return
+
+ server = random.choice(servers)
+
args = {
'name': query,
'order': 'votes',
@@ -87,8 +122,7 @@ def request(query, params):
if countrycode in traits.custom['countrycodes']: # type: ignore
args['countrycode'] = countrycode
- params['url'] = f"{random.choice(servers)}/json/stations/search?{urlencode(args)}"
- return params
+ params['url'] = f"{server}/json/stations/search?{urlencode(args)}"
def response(resp):
@@ -154,8 +188,9 @@ def fetch_traits(engine_traits: EngineTraits):
babel_reg_list = get_global("territory_languages").keys()
- language_list = get(f'{servers[0]}/json/languages').json() # type: ignore
- country_list = get(f'{servers[0]}/json/countries').json() # type: ignore
+ server = server_list()[0]
+ language_list = get(f'{server}/json/languages').json() # type: ignore
+ country_list = get(f'{server}/json/countries').json() # type: ignore
for lang in language_list:
diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py
index 23032223e..08df9aa04 100644
--- a/searx/engines/soundcloud.py
+++ b/searx/engines/soundcloud.py
@@ -1,14 +1,23 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""SoundCloud is a German audio streaming service."""
+from __future__ import annotations
import re
-from urllib.parse import quote_plus, urlencode
+import typing
import datetime
+from urllib.parse import quote_plus, urlencode
+
from dateutil import parser
from lxml import html
from searx.network import get as http_get
+from searx.enginelib import EngineCache
+
+if typing.TYPE_CHECKING:
+ import logging
+
+ logger: logging.Logger
about = {
"website": "https://soundcloud.com",
@@ -28,7 +37,6 @@ HTML frontend of the common WEB site.
"""
cid_re = re.compile(r'client_id:"([^"]*)"', re.I | re.U)
-guest_client_id = ""
results_per_page = 10
soundcloud_facet = "model"
@@ -48,6 +56,10 @@ app_locale_map = {
"sv": "sv",
}
+CACHE: EngineCache
+"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
+seconds."""
+
def request(query, params):
@@ -55,6 +67,12 @@ def request(query, params):
# - user_id=451561-497874-703312-310156
# - app_version=1740727428
+ guest_client_id = CACHE.get("guest_client_id")
+ if guest_client_id is None:
+ guest_client_id = get_client_id()
+ if guest_client_id:
+ CACHE.set(key="guest_client_id", value=guest_client_id)
+
args = {
"q": query,
"offset": (params['pageno'] - 1) * results_per_page,
@@ -104,12 +122,12 @@ def response(resp):
return results
-def init(engine_settings=None): # pylint: disable=unused-argument
- global guest_client_id # pylint: disable=global-statement
- guest_client_id = get_client_id()
+def init(engine_settings):
+ global CACHE # pylint: disable=global-statement
+ CACHE = EngineCache(engine_settings["name"]) # type:ignore
-def get_client_id() -> str:
+def get_client_id() -> str | None:
client_id = ""
url = "https://soundcloud.com"
@@ -143,4 +161,4 @@ def get_client_id() -> str:
logger.info("using client_id '%s' for soundclud queries", client_id)
else:
logger.warning("missing valid client_id for soundclud queries")
- return client_id
+ return client_id or None
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 54e05604b..6c77e37c8 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -84,7 +84,6 @@ from typing import TYPE_CHECKING, Any
from collections import OrderedDict
import re
from unicodedata import normalize, combining
-from time import time
from datetime import datetime, timedelta
from json import loads
@@ -97,6 +96,7 @@ from searx.network import get # see https://github.com/searxng/searxng/issues/7
from searx.exceptions import SearxEngineCaptchaException
from searx.locales import region_tag
from searx.enginelib.traits import EngineTraits
+from searx.enginelib import EngineCache
if TYPE_CHECKING:
import logging
@@ -159,10 +159,21 @@ search_form_xpath = '//form[@id="search"]'
</form>
"""
-# timestamp of the last fetch of 'sc' code
-sc_code_ts = 0
-sc_code = ''
-sc_code_cache_sec = 30
+
+CACHE: EngineCache
+"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
+seconds."""
+
+
+def init(_):
+ global CACHE # pylint: disable=global-statement
+
+ # hint: all three startpage engines (WEB, Images & News) can/should use the
+ # same sc_code ..
+ CACHE = EngineCache("startpage") # type:ignore
+
+
+sc_code_cache_sec = 3600
"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`."""
@@ -176,14 +187,10 @@ def get_sc_code(searxng_locale, params):
Startpage's search form generates a new sc-code on each request. This
function scrap a new sc-code from Startpage's home page every
- :py:obj:`sc_code_cache_sec` seconds.
-
- """
-
- global sc_code_ts, sc_code # pylint: disable=global-statement
+ :py:obj:`sc_code_cache_sec` seconds."""
- if sc_code and (time() < (sc_code_ts + sc_code_cache_sec)):
- logger.debug("get_sc_code: reuse '%s'", sc_code)
+ sc_code = CACHE.get("SC_CODE", "")
+ if sc_code:
return sc_code
headers = {**params['headers']}
@@ -233,8 +240,9 @@ def get_sc_code(searxng_locale, params):
message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url, # type: ignore
) from exc
- sc_code_ts = time()
+ sc_code = str(sc_code)
logger.debug("get_sc_code: new value is: %s", sc_code)
+ CACHE.set(key="SC_CODE", value=sc_code, expire=sc_code_cache_sec)
return sc_code
diff --git a/searx/engines/wolframalpha_api.py b/searx/engines/wolframalpha_api.py
index 5670e356f..60892d4d8 100644
--- a/searx/engines/wolframalpha_api.py
+++ b/searx/engines/wolframalpha_api.py
@@ -5,7 +5,7 @@
from urllib.parse import urlencode
-from lxml import etree
+import lxml.etree
# about
about = {
@@ -72,7 +72,7 @@ def replace_pua_chars(text):
def response(resp):
results = []
- search_results = etree.XML(resp.content)
+ search_results = lxml.etree.XML(resp.content)
# return empty array if there are no results
if search_results.xpath(failure_xpath):
diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py
index a9d177c32..5ac261d12 100644
--- a/searx/engines/wolframalpha_noapi.py
+++ b/searx/engines/wolframalpha_noapi.py
@@ -3,11 +3,13 @@
Wolfram|Alpha (Science)
"""
+from __future__ import annotations
+
from json import loads
-from time import time
from urllib.parse import urlencode
from searx.network import get as http_get
+from searx.enginelib import EngineCache
# about
about = {
@@ -40,41 +42,39 @@ search_url = (
referer_url = url + 'input/?{query}'
-token = {'value': '', 'last_updated': None}
-
# pods to display as image in infobox
# this pods do return a plaintext, but they look better and are more useful as images
image_pods = {'VisualRepresentation', 'Illustration', 'Symbol'}
-# seems, wolframalpha resets its token in every hour
-def obtain_token():
- update_time = time() - (time() % 3600)
- try:
- token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0)
- token['value'] = loads(token_response.text)['code']
- token['last_updated'] = update_time
- except: # pylint: disable=bare-except
- pass
- return token
+CACHE: EngineCache
+"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
+seconds."""
-def init(engine_settings=None): # pylint: disable=unused-argument
- obtain_token()
+def init(engine_settings):
+ global CACHE # pylint: disable=global-statement
+ CACHE = EngineCache(engine_settings["name"]) # type:ignore
+
+
+def obtain_token() -> str:
+ token = CACHE.get(key="token")
+ if token is None:
+ resp = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0)
+ token = resp.json()["code"]
+ # seems, wolframalpha resets its token in every hour
+ CACHE.set(key="code", value=token, expire=3600)
+ return token
-# do search-request
def request(query, params):
- # obtain token if last update was more than an hour
- if time() - (token['last_updated'] or 0) > 3600:
- obtain_token()
- params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value'])
+ token = obtain_token()
+ params['url'] = search_url.format(query=urlencode({'input': query}), token=token)
params['headers']['Referer'] = referer_url.format(query=urlencode({'i': query}))
return params
-# get response from search-request
def response(resp):
results = []