From 7eedd44f5f9965cf2fbff14d276f96944b5c6a98 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Wed, 10 Sep 2025 16:10:42 +0200 Subject: [mod] typification of SearXNG: add new result type Paper This patch adds a new result type: Paper - Python class: searx/result_types/paper.py - Jinja template: searx/templates/simple/result_templates/paper.html - CSS (less) client/simple/src/less/result_types/paper.less Signed-off-by: Markus Heiser --- searx/utils.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'searx/utils.py') diff --git a/searx/utils.py b/searx/utils.py index a65474c9b..079a99ae2 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -21,7 +21,8 @@ from datetime import timedelta from markdown_it import MarkdownIt from lxml import html -from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError +from lxml.etree import XPath, XPathError, XPathSyntaxError +from lxml.etree import ElementBase, _Element # pyright: ignore[reportPrivateUsage] from searx import settings from searx.data import USER_AGENTS, data_dir @@ -40,6 +41,9 @@ XPathSpecType: t.TypeAlias = str | XPath """Type alias used by :py:obj:`searx.utils.get_xpath`, :py:obj:`searx.utils.eval_xpath` and other XPath selectors.""" +ElementType: t.TypeAlias = ElementBase | _Element + + _BLOCKED_TAGS = ('script', 'style') _ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE) @@ -204,15 +208,23 @@ def markdown_to_text(markdown_str: str) -> str: def extract_text( - xpath_results: list[ElementBase] | ElementBase | str | Number | bool | None, + xpath_results: list[ElementType] | ElementType | str | Number | bool | None, allow_none: bool = False, ) -> str | None: """Extract text from a lxml result - * if xpath_results is list, extract the text from each result and concat the list - * if xpath_results is a xml element, extract all the text node from it - ( text_content() method from lxml ) - * if xpath_results is a string element, then it's already done + - If ``xpath_results`` is a list of :py:obj:`ElementType` objects, extract + the text from each result and concatenate the list in a string. + + - If ``xpath_results`` is a :py:obj:`ElementType` object, extract all the + text node from it ( :py:obj:`lxml.html.tostring`, ``method="text"`` ) + + - If ``xpath_results`` is of type :py:obj:`str` or :py:obj:`Number`, + :py:obj:`bool` the string value is returned. + + - If ``xpath_results`` is of type ``None`` a :py:obj:`ValueError` is raised, + except ``allow_none`` is ``True`` where ``None`` is returned. + """ if isinstance(xpath_results, list): # it's list of result : concat everything using recursive call @@ -220,7 +232,7 @@ def extract_text( for e in xpath_results: result = result + (extract_text(e) or '') return result.strip() - if isinstance(xpath_results, ElementBase): + if isinstance(xpath_results, ElementType): # it's a element text: str = html.tostring( # type: ignore xpath_results, # pyright: ignore[reportArgumentType] @@ -289,7 +301,7 @@ def normalize_url(url: str, base_url: str) -> str: return url -def extract_url(xpath_results: list[ElementBase] | ElementBase | str | Number | bool | None, base_url: str) -> str: +def extract_url(xpath_results: list[ElementType] | ElementType | str | Number | bool | None, base_url: str) -> str: """Extract and normalize URL from lxml Element Example: @@ -520,7 +532,7 @@ def get_xpath(xpath_spec: XPathSpecType) -> XPath: raise TypeError('xpath_spec must be either a str or a lxml.etree.XPath') # pyright: ignore[reportUnreachable] -def eval_xpath(element: ElementBase, xpath_spec: XPathSpecType) -> t.Any: +def eval_xpath(element: ElementType, xpath_spec: XPathSpecType) -> t.Any: """Equivalent of ``element.xpath(xpath_str)`` but compile ``xpath_str`` into a :py:obj:`lxml.etree.XPath` object once for all. The return value of ``xpath(..)`` is complex, read `XPath return values`_ for more details. @@ -548,12 +560,12 @@ def eval_xpath(element: ElementBase, xpath_spec: XPathSpecType) -> t.Any: raise SearxEngineXPathException(xpath_spec, arg) from e -def eval_xpath_list(element: ElementBase, xpath_spec: XPathSpecType, min_len: int | None = None) -> list[t.Any]: +def eval_xpath_list(element: ElementType, xpath_spec: XPathSpecType, min_len: int | None = None) -> list[t.Any]: """Same as :py:obj:`searx.utils.eval_xpath`, but additionally ensures the return value is a :py:obj:`list`. The minimum length of the list is also checked (if ``min_len`` is set).""" - result = eval_xpath(element, xpath_spec) + result: list[t.Any] = eval_xpath(element, xpath_spec) if not isinstance(result, list): raise SearxEngineXPathException(xpath_spec, 'the result is not a list') if min_len is not None and min_len > len(result): @@ -562,7 +574,7 @@ def eval_xpath_list(element: ElementBase, xpath_spec: XPathSpecType, min_len: in def eval_xpath_getindex( - element: ElementBase, + element: ElementType, xpath_spec: XPathSpecType, index: int, default: t.Any = _NOTSET, -- cgit v1.2.3