author     Markus Heiser <markus.heiser@darmarIT.de>   2025-11-25 13:28:47 +0100
committer  GitHub <noreply@github.com>                 2025-11-25 13:28:47 +0100
commit     ebb9ea45715d655072400b2b5925f03ec96cf5eb
tree       758e04a2f04c93229d51e445b0d2a92fbfec1c9c /searx/engines
parent     54a97e10431c1cdae910d3b37074a63eda7100fc
[fix] brave engines - web, images & videos (#5478)
brave web: the xpath selectors needed to be adjusted.

brave images & videos: the JS code with the embedded JS object was read
incorrectly; not always, but quite often, this led to exceptions when the
Python data structure was created from it.

BTW: a complete review was conducted and corrections or additions were made to
the type definitions.

To test all brave engines at once::

    !br !brimg !brvid !brnews weather

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
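Background on the images & videos fix: the result page embeds its payload as a
JS object literal inside a kit.start(...) call, which is not valid JSON.  The
old code pulled the slice out with extr() and handed it to
js_obj_str_to_python(), which is what intermittently raised.  The new
extract_json_data() helper (see the diff below) instead slices the
``data: [{ ... }}]`` literal out of the <script> block, normalizes it with
js_obj_str_to_json_str() from searx.utils and only then calls json.loads().
A minimal sketch of that flow; the HTML snippet is invented and only mimics
the shape of Brave's page, and the printed output assumes
js_obj_str_to_json_str() quotes the bare JS keys, which is what it is used
for here::

    import json
    from searx.utils import js_obj_str_to_json_str

    # invented, simplified stand-in for a Brave images/videos result page
    html_text = """
    <html><script>
    kit.start(app, element, {
        node_ids: [0, 19],
        data: [{type:"legacy"},{type:"data",data:{body:{response:{results:[]}}}}]
    });
    </script></html>
    """

    # same slicing strategy as the new extract_json_data() helper
    script = html_text[html_text.index("<script") : html_text.index("</script")]
    start = script.index("data: [{")
    end = script.rindex("}}]")
    js_obj_str = "{" + script[start:end] + "}}]}"

    json_str = js_obj_str_to_json_str(js_obj_str)  # JS object literal -> JSON string
    data = json.loads(json_str)
    print(data["data"][1]["data"]["body"]["response"])  # {'results': []}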
Diffstat (limited to 'searx/engines')
-rw-r--r--  searx/engines/brave.py  |  311
1 file changed, 147 insertions, 164 deletions
diff --git a/searx/engines/brave.py b/searx/engines/brave.py
index 75775d1ec..2c5854705 100644
--- a/searx/engines/brave.py
+++ b/searx/engines/brave.py
@@ -124,17 +124,17 @@ from urllib.parse import (
urlparse,
)
+import json
from dateutil import parser
from lxml import html
from searx import locales
from searx.utils import (
- extr,
extract_text,
- eval_xpath,
eval_xpath_list,
eval_xpath_getindex,
js_obj_str_to_python,
+ js_obj_str_to_json_str,
get_embeded_stream_url,
)
from searx.enginelib.traits import EngineTraits
@@ -142,17 +142,17 @@ from searx.result_types import EngineResults
from searx.extended_types import SXNG_Response
about = {
- "website": 'https://search.brave.com/',
- "wikidata_id": 'Q22906900',
+ "website": "https://search.brave.com/",
+ "wikidata_id": "Q22906900",
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
- "results": 'HTML',
+ "results": "HTML",
}
base_url = "https://search.brave.com/"
categories = []
-brave_category: t.Literal["search", "videos", "images", "news", "goggles"] = 'search'
+brave_category: t.Literal["search", "videos", "images", "news", "goggles"] = "search"
"""Brave supports common web-search, videos, images, news, and goggles search.
- ``search``: Common WEB search
@@ -182,71 +182,86 @@ to do more won't return any result and you will most likely be flagged as a bot.
"""
safesearch = True
-safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'} # cookie: safesearch=off
+safesearch_map = {2: "strict", 1: "moderate", 0: "off"} # cookie: safesearch=off
time_range_support = False
"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category."""
time_range_map: dict[str, str] = {
- 'day': 'pd',
- 'week': 'pw',
- 'month': 'pm',
- 'year': 'py',
+ "day": "pd",
+ "week": "pw",
+ "month": "pm",
+ "year": "py",
}
def request(query: str, params: dict[str, t.Any]) -> None:
args: dict[str, t.Any] = {
- 'q': query,
- 'source': 'web',
+ "q": query,
+ "source": "web",
}
if brave_spellcheck:
- args['spellcheck'] = '1'
+ args["spellcheck"] = "1"
- if brave_category in ('search', 'goggles'):
- if params.get('pageno', 1) - 1:
- args['offset'] = params.get('pageno', 1) - 1
- if time_range_map.get(params['time_range']):
- args['tf'] = time_range_map.get(params['time_range'])
+ if brave_category in ("search", "goggles"):
+ if params.get("pageno", 1) - 1:
+ args["offset"] = params.get("pageno", 1) - 1
+ if time_range_map.get(params["time_range"]):
+ args["tf"] = time_range_map.get(params["time_range"])
- if brave_category == 'goggles':
- args['goggles_id'] = Goggles
+ if brave_category == "goggles":
+ args["goggles_id"] = Goggles
params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"
+ logger.debug("url %s", params["url"])
# set properties in the cookies
- params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
- # the useLocation is IP based, we use cookie 'country' for the region
- params['cookies']['useLocation'] = '0'
- params['cookies']['summarizer'] = '0'
-
- engine_region = traits.get_region(params['searxng_locale'], 'all')
- params['cookies']['country'] = engine_region.split('-')[-1].lower() # type: ignore
+ params["cookies"]["safesearch"] = safesearch_map.get(params["safesearch"], "off")
+ # the useLocation is IP based, we use cookie "country" for the region
+ params["cookies"]["useLocation"] = "0"
+ params["cookies"]["summarizer"] = "0"
- ui_lang = locales.get_engine_locale(params['searxng_locale'], traits.custom["ui_lang"], 'en-us')
- params['cookies']['ui_lang'] = ui_lang
+ engine_region = traits.get_region(params["searxng_locale"], "all")
+ params["cookies"]["country"] = engine_region.split("-")[-1].lower() # type: ignore
- logger.debug("cookies %s", params['cookies'])
+ ui_lang = locales.get_engine_locale(params["searxng_locale"], traits.custom["ui_lang"], "en-us")
+ params["cookies"]["ui_lang"] = ui_lang
+ logger.debug("cookies %s", params["cookies"])
- params['headers']['Sec-Fetch-Dest'] = "document"
- params['headers']['Sec-Fetch-Mode'] = "navigate"
- params['headers']['Sec-Fetch-Site'] = "same-origin"
- params['headers']['Sec-Fetch-User'] = "?1"
-
-def _extract_published_date(published_date_raw):
+def _extract_published_date(published_date_raw: str | None):
if published_date_raw is None:
return None
-
try:
return parser.parse(published_date_raw)
except parser.ParserError:
return None
+def extract_json_data(text: str) -> dict[str, t.Any]:
+ # Example script source containing the data:
+ #
+ # kit.start(app, element, {
+ # node_ids: [0, 19],
+ # data: [{type:"data",data: .... ["q","goggles_id"],route:1,url:1}}]
+ # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ text = text[text.index("<script") : text.index("</script")]
+ if not text:
+ raise ValueError("can't find JS/JSON data in the given text")
+ start = text.index("data: [{")
+ end = text.rindex("}}]")
+ js_obj_str = text[start:end]
+ js_obj_str = "{" + js_obj_str + "}}]}"
+ # js_obj_str = js_obj_str.replace("\xa0", "") # remove ASCII for &nbsp;
+ # js_obj_str = js_obj_str.replace(r"\u003C", "<").replace(r"\u003c", "<") # fix broken HTML tags in strings
+ json_str = js_obj_str_to_json_str(js_obj_str)
+ data: dict[str, t.Any] = json.loads(json_str)
+ return data
+
+
def response(resp: SXNG_Response) -> EngineResults:
if brave_category in ('search', 'goggles'):
@@ -261,11 +276,8 @@ def response(resp: SXNG_Response) -> EngineResults:
# node_ids: [0, 19],
# data: [{type:"data",data: .... ["q","goggles_id"],route:1,url:1}}]
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- js_object = "[{" + extr(resp.text, "data: [{", "}}],") + "}}]"
- json_data = js_obj_str_to_python(js_object)
-
- # json_data is a list and at the second position (0,1) in this list we find the "response" data we need ..
- json_resp = json_data[1]['data']['body']['response']
+ json_data: dict[str, t.Any] = extract_json_data(resp.text)
+ json_resp: dict[str, t.Any] = json_data['data'][1]["data"]['body']['response']
if brave_category == 'images':
return _parse_images(json_resp)
@@ -275,150 +287,121 @@ def response(resp: SXNG_Response) -> EngineResults:
raise ValueError(f"Unsupported brave category: {brave_category}")
-def _parse_search(resp) -> EngineResults:
- result_list = EngineResults()
-
+def _parse_search(resp: SXNG_Response) -> EngineResults:
+ res = EngineResults()
dom = html.fromstring(resp.text)
- # I doubt that Brave is still providing the "answer" class / I haven't seen
- # answers in brave for a long time.
- answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
- if answer_tag:
- url = eval_xpath_getindex(dom, '//div[@id="featured_snippet"]/a[@class="result-header"]/@href', 0, default=None)
- answer = extract_text(answer_tag)
- if answer is not None:
- result_list.add(result_list.types.Answer(answer=answer, url=url))
-
- # xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
- xpath_results = '//div[contains(@class, "snippet ")]'
-
- for result in eval_xpath_list(dom, xpath_results):
+ for result in eval_xpath_list(dom, "//div[contains(@class, 'snippet ')]"):
- url = eval_xpath_getindex(result, './/a[contains(@class, "h")]/@href', 0, default=None)
- title_tag = eval_xpath_getindex(
- result, './/a[contains(@class, "h")]//div[contains(@class, "title")]', 0, default=None
- )
+ url: str | None = eval_xpath_getindex(result, ".//a/@href", 0, default=None)
+ title_tag = eval_xpath_getindex(result, ".//div[contains(@class, 'title')]", 0, default=None)
if url is None or title_tag is None or not urlparse(url).netloc: # partial url likely means it's an ad
continue
- content: str = extract_text(
- eval_xpath_getindex(result, './/div[contains(@class, "snippet-description")]', 0, default='')
- ) # type: ignore
- pub_date_raw = eval_xpath(result, 'substring-before(.//div[contains(@class, "snippet-description")], "-")')
- pub_date = _extract_published_date(pub_date_raw)
- if pub_date and content.startswith(pub_date_raw):
- content = content.lstrip(pub_date_raw).strip("- \n\t")
-
- thumbnail = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='')
-
- item = {
- 'url': url,
- 'title': extract_text(title_tag),
- 'content': content,
- 'publishedDate': pub_date,
- 'thumbnail': thumbnail,
- }
+ content: str = ""
+ pub_date = None
+
+ _content = eval_xpath_getindex(result, ".//div[contains(@class, 'content')]", 0, default="")
+ if len(_content):
+ content = extract_text(_content) # type: ignore
+ _pub_date = extract_text(
+ eval_xpath_getindex(_content, ".//span[contains(@class, 't-secondary')]", 0, default="")
+ )
+ if _pub_date:
+ pub_date = _extract_published_date(_pub_date)
+ content = content.lstrip(_pub_date).strip("- \n\t")
+
+ thumbnail: str = eval_xpath_getindex(result, ".//a[contains(@class, 'thumbnail')]//img/@src", 0, default="")
+
+ item = res.types.LegacyResult(
+ template="default.html",
+ url=url,
+ title=extract_text(title_tag),
+ content=content,
+ publishedDate=pub_date,
+ thumbnail=thumbnail,
+ )
+ res.add(item)
video_tag = eval_xpath_getindex(
- result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
+ result, ".//div[contains(@class, 'video-snippet') and @data-macro='video']", 0, default=[]
)
- if video_tag is not None:
-
+ if len(video_tag):
# In my tests a video tag in the WEB search was most often not a
# video, except the ones from youtube ..
-
iframe_src = get_embeded_stream_url(url)
if iframe_src:
- item['iframe_src'] = iframe_src
- item['template'] = 'videos.html'
- item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
- pub_date_raw = extract_text(
- eval_xpath(video_tag, './/div[contains(@class, "snippet-attributes")]/div/text()')
- )
- item['publishedDate'] = _extract_published_date(pub_date_raw)
- else:
- item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
-
- result_list.append(item)
+ item["iframe_src"] = iframe_src
+ item["template"] = "videos.html"
- return result_list
+ return res
-def _parse_news(resp) -> EngineResults:
-
- result_list = EngineResults()
+def _parse_news(resp: SXNG_Response) -> EngineResults:
+ res = EngineResults()
dom = html.fromstring(resp.text)
- for result in eval_xpath_list(dom, '//div[contains(@class, "results")]//div[@data-type="news"]'):
-
- # import pdb
- # pdb.set_trace()
+ for result in eval_xpath_list(dom, "//div[contains(@class, 'results')]//div[@data-type='news']"):
- url = eval_xpath_getindex(result, './/a[contains(@class, "result-header")]/@href', 0, default=None)
+ url = eval_xpath_getindex(result, ".//a[contains(@class, 'result-header')]/@href", 0, default=None)
if url is None:
continue
- title = extract_text(eval_xpath_list(result, './/span[contains(@class, "snippet-title")]'))
- content = extract_text(eval_xpath_list(result, './/p[contains(@class, "desc")]'))
- thumbnail = eval_xpath_getindex(result, './/div[contains(@class, "image-wrapper")]//img/@src', 0, default='')
+ title = eval_xpath_list(result, ".//span[contains(@class, 'snippet-title')]")
+ content = eval_xpath_list(result, ".//p[contains(@class, 'desc')]")
+ thumbnail = eval_xpath_getindex(result, ".//div[contains(@class, 'image-wrapper')]//img/@src", 0, default="")
- item = {
- "url": url,
- "title": title,
- "content": content,
- "thumbnail": thumbnail,
- }
-
- result_list.append(item)
+ item = res.types.LegacyResult(
+ template="default.html",
+ url=url,
+ title=extract_text(title),
+ thumbnail=thumbnail,
+ content=extract_text(content),
+ )
+ res.add(item)
- return result_list
+ return res
-def _parse_images(json_resp) -> EngineResults:
- result_list = EngineResults()
+def _parse_images(json_resp: dict[str, t.Any]) -> EngineResults:
+ res = EngineResults()
for result in json_resp["results"]:
- item = {
- 'url': result['url'],
- 'title': result['title'],
- 'content': result['description'],
- 'template': 'images.html',
- 'resolution': result['properties']['format'],
- 'source': result['source'],
- 'img_src': result['properties']['url'],
- 'thumbnail_src': result['thumbnail']['src'],
- }
- result_list.append(item)
+ item = res.types.LegacyResult(
+ template="images.html",
+ url=result["url"],
+ title=result["title"],
+ source=result["source"],
+ img_src=result["properties"]["url"],
+ thumbnail_src=result["thumbnail"]["src"],
+ )
+ res.add(item)
- return result_list
+ return res
-def _parse_videos(json_resp) -> EngineResults:
- result_list = EngineResults()
+def _parse_videos(json_resp: dict[str, t.Any]) -> EngineResults:
+ res = EngineResults()
for result in json_resp["results"]:
-
- url = result['url']
- item = {
- 'url': url,
- 'title': result['title'],
- 'content': result['description'],
- 'template': 'videos.html',
- 'length': result['video']['duration'],
- 'duration': result['video']['duration'],
- 'publishedDate': _extract_published_date(result['age']),
- }
-
- if result['thumbnail'] is not None:
- item['thumbnail'] = result['thumbnail']['src']
-
- iframe_src = get_embeded_stream_url(url)
+ item = res.types.LegacyResult(
+ template="videos.html",
+ url=result["url"],
+ title=result["title"],
+ content=result["description"],
+ length=result["video"]["duration"],
+ duration=result["video"]["duration"],
+ publishedDate=_extract_published_date(result["age"]),
+ )
+ if result["thumbnail"] is not None:
+ item["thumbnail"] = result["thumbnail"]["src"]
+ iframe_src = get_embeded_stream_url(result["url"])
if iframe_src:
- item['iframe_src'] = iframe_src
+ item["iframe_src"] = iframe_src
- result_list.append(item)
+ res.add(item)
- return result_list
+ return res
def fetch_traits(engine_traits: EngineTraits):
@@ -443,21 +426,21 @@ def fetch_traits(engine_traits: EngineTraits):
print("ERROR: response from Brave is not OK.")
dom = html.fromstring(resp.text)
- for option in dom.xpath('//section//option[@value="en-us"]/../option'):
+ for option in dom.xpath("//section//option[@value='en-us']/../option"):
- ui_lang = option.get('value')
+ ui_lang = option.get("value")
try:
- l = babel.Locale.parse(ui_lang, sep='-')
+ l = babel.Locale.parse(ui_lang, sep="-")
if l.territory:
- sxng_tag = region_tag(babel.Locale.parse(ui_lang, sep='-'))
+ sxng_tag = region_tag(babel.Locale.parse(ui_lang, sep="-"))
else:
- sxng_tag = language_tag(babel.Locale.parse(ui_lang, sep='-'))
+ sxng_tag = language_tag(babel.Locale.parse(ui_lang, sep="-"))
except babel.UnknownLocaleError:
print("ERROR: can't determine babel locale of Brave's (UI) language %s" % ui_lang)
continue
- conflict = engine_traits.custom["ui_lang"].get(sxng_tag)
+ conflict = engine_traits.custom["ui_lang"].get(sxng_tag) # type: ignore
if conflict:
if conflict != ui_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, ui_lang))
@@ -466,26 +449,26 @@ def fetch_traits(engine_traits: EngineTraits):
# search regions of brave
- resp = get('https://cdn.search.brave.com/serp/v2/_app/immutable/chunks/parameters.734c106a.js')
+ resp = get("https://cdn.search.brave.com/serp/v2/_app/immutable/chunks/parameters.734c106a.js")
if not resp.ok:
print("ERROR: response from Brave is not OK.")
- country_js = resp.text[resp.text.index("options:{all") + len('options:') :]
+ country_js = resp.text[resp.text.index("options:{all") + len("options:") :]
country_js = country_js[: country_js.index("},k={default")]
country_tags = js_obj_str_to_python(country_js)
for k, v in country_tags.items():
- if k == 'all':
- engine_traits.all_locale = 'all'
+ if k == "all":
+ engine_traits.all_locale = "all"
continue
- country_tag = v['value']
+ country_tag = v["value"]
# add official languages of the country ..
for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True):
lang_tag = lang_map.get(lang_tag, lang_tag)
- sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, country_tag.upper())))
- # print("%-20s: %s <-- %s" % (v['label'], country_tag, sxng_tag))
+ sxng_tag = region_tag(babel.Locale.parse("%s_%s" % (lang_tag, country_tag.upper())))
+ # print("%-20s: %s <-- %s" % (v["label"], country_tag, sxng_tag))
conflict = engine_traits.regions.get(sxng_tag)
if conflict: