diff options
Diffstat (limited to 'searx/utils.py')
| -rw-r--r-- | searx/utils.py | 24 |
1 file changed, 13 insertions, 11 deletions
diff --git a/searx/utils.py b/searx/utils.py
index 54b32484e..dff3eb4f4 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -74,11 +74,7 @@ def gen_useragent(os_string: Optional[str] = None) -> str:
     return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
 
 
-class _HTMLTextExtractorException(Exception):
-    """Internal exception raised when the HTML is invalid"""
-
-
-class _HTMLTextExtractor(HTMLParser):
+class HTMLTextExtractor(HTMLParser):
     """Internal class to extract text from HTML"""
 
     def __init__(self):
@@ -96,7 +92,8 @@ class _HTMLTextExtractor(HTMLParser):
             return
 
         if tag != self.tags[-1]:
-            raise _HTMLTextExtractorException()
+            self.result.append(f"</{tag}>")
+            return
 
         self.tags.pop()
 
@@ -149,23 +146,28 @@ def html_to_text(html_str: str) -> str:
         >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
         'Example'
 
-        >>> html_to_text(r'regexp: (?<![a-zA-Z]')
+        >>> html_to_text(r'regexp: (?<![a-zA-Z]')
         'regexp: (?<![a-zA-Z]'
+
+        >>> html_to_text(r'<p><b>Lorem ipsum </i>dolor sit amet</p>')
+        'Lorem ipsum </i>dolor sit amet</p>'
+
+        >>> html_to_text(r'> < a')
+        '> < a'
+
     """
     if not html_str:
         return ""
     html_str = html_str.replace('\n', ' ').replace('\r', ' ')
     html_str = ' '.join(html_str.split())
-    s = _HTMLTextExtractor()
+    s = HTMLTextExtractor()
     try:
         s.feed(html_str)
         s.close()
     except AssertionError:
-        s = _HTMLTextExtractor()
+        s = HTMLTextExtractor()
         s.feed(escape(html_str, quote=True))
         s.close()
-    except _HTMLTextExtractorException:
-        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
     return s.get_text()
 
 