From 4fb6105d699e19321f6799d7fff05313fd4cd4b9 Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Mon, 18 Aug 2025 16:30:51 +0200
Subject: [fix] revision of utils.HTMLTextExtractor (#5125)

Related:

- https://github.com/searxng/searxng/pull/5073#issuecomment-3196282632
---
 searx/utils.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'searx/utils.py')

diff --git a/searx/utils.py b/searx/utils.py
index 54b32484e..dff3eb4f4 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -74,11 +74,7 @@ def gen_useragent(os_string: Optional[str] = None) -> str:
     return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
 
 
-class _HTMLTextExtractorException(Exception):
-    """Internal exception raised when the HTML is invalid"""
-
-
-class _HTMLTextExtractor(HTMLParser):
+class HTMLTextExtractor(HTMLParser):
     """Internal class to extract text from HTML"""
 
     def __init__(self):
@@ -96,7 +92,8 @@ class _HTMLTextExtractor(HTMLParser):
             return
 
         if tag != self.tags[-1]:
-            raise _HTMLTextExtractorException()
+            self.result.append(f"</{tag}>")
+            return
 
         self.tags.pop()
 
@@ -149,23 +146,28 @@ def html_to_text(html_str: str) -> str:
         >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
         'Example'
 
-        >>> html_to_text(r'regexp: (?<![a-zA-Z]')
+        >>> html_to_text(r'regexp: (?&lt;![a-zA-Z]')
         'regexp: (?<![a-zA-Z]'
+
+        >>> html_to_text(r'<p><b>Lorem ipsum </i>dolor sit amet</p>')
+        'Lorem ipsum </i>dolor sit amet</p>'
+
+        >>> html_to_text(r'&#x3e &#x3c &#97')
+        '> < a'
+
     """
     if not html_str:
         return ""
     html_str = html_str.replace('\n', ' ').replace('\r', ' ')
     html_str = ' '.join(html_str.split())
-    s = _HTMLTextExtractor()
+    s = HTMLTextExtractor()
     try:
         s.feed(html_str)
         s.close()
     except AssertionError:
-        s = _HTMLTextExtractor()
+        s = HTMLTextExtractor()
         s.feed(escape(html_str, quote=True))
         s.close()
-    except _HTMLTextExtractorException:
-        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
     return s.get_text()
 
 
-- 
cgit v1.2.3