diff options
| -rw-r--r-- | searx/utils.py | 24 | ||||
| -rw-r--r-- | tests/unit/test_utils.py | 84 |
2 files changed, 42 insertions, 66 deletions
diff --git a/searx/utils.py b/searx/utils.py index 54b32484e..dff3eb4f4 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -74,11 +74,7 @@ def gen_useragent(os_string: Optional[str] = None) -> str: return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions'])) -class _HTMLTextExtractorException(Exception): - """Internal exception raised when the HTML is invalid""" - - -class _HTMLTextExtractor(HTMLParser): +class HTMLTextExtractor(HTMLParser): """Internal class to extract text from HTML""" def __init__(self): @@ -96,7 +92,8 @@ class _HTMLTextExtractor(HTMLParser): return if tag != self.tags[-1]: - raise _HTMLTextExtractorException() + self.result.append(f"</{tag}>") + return self.tags.pop() @@ -149,23 +146,28 @@ def html_to_text(html_str: str) -> str: >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>') 'Example' - >>> html_to_text(r'regexp: (?<![a-zA-Z]') + >>> html_to_text(r'regexp: (?<![a-zA-Z]') 'regexp: (?<![a-zA-Z]' + + >>> html_to_text(r'<p><b>Lorem ipsum </i>dolor sit amet</p>') + 'Lorem ipsum </i>dolor sit amet</p>' + + >>> html_to_text(r'> < a') + '> < a' + """ if not html_str: return "" html_str = html_str.replace('\n', ' ').replace('\r', ' ') html_str = ' '.join(html_str.split()) - s = _HTMLTextExtractor() + s = HTMLTextExtractor() try: s.feed(html_str) s.close() except AssertionError: - s = _HTMLTextExtractor() + s = HTMLTextExtractor() s.feed(escape(html_str, quote=True)) s.close() - except _HTMLTextExtractorException: - logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str) return s.get_text() diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index ad6ca37a5..01056df74 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -28,30 +28,6 @@ class TestUtils(SearxTestCase): self.assertIsNotNone(utils.searxng_useragent()) self.assertTrue(utils.searxng_useragent().startswith('SearXNG')) - def test_html_to_text(self): - html_str = """ - <a href="/testlink" class="link_access_account"> - <style> - .toto { - color: red; - } - </style> - <span class="toto"> - <span> - <img src="test.jpg" /> - </span> - </span> - <span class="titi"> - Test text - </span> - <script>value='dummy';</script> - </a> - """ - self.assertIsInstance(utils.html_to_text(html_str), str) - self.assertIsNotNone(utils.html_to_text(html_str)) - self.assertEqual(utils.html_to_text(html_str), "Test text") - self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]") - def test_extract_text(self): html_str = """ <a href="/testlink" class="link_access_account"> @@ -99,46 +75,44 @@ class TestUtils(SearxTestCase): with self.assertRaises(Exception): utils.extract_url([], 'https://example.com') - def test_html_to_text_invalid(self): - _html = '<p><b>Lorem ipsum</i>dolor sit amet</p>' - self.assertEqual(utils.html_to_text(_html), "Lorem ipsum") - def test_ecma_unscape(self): self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space') self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó') self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界') - -class TestHTMLTextExtractor(SearxTestCase): # pylint: disable=missing-class-docstring - - def setUp(self): - super().setUp() - - self.html_text_extractor = utils._HTMLTextExtractor() # pylint: disable=protected-access - - def test__init__(self): - self.assertEqual(self.html_text_extractor.result, []) - @parameterized.expand( [ - ('xF', '\x0f'), - ('XF', '\x0f'), - ('97', 'a'), + ('Example <span id="42">#2</span>', 'Example #2'), + ('<style>.span { color: red; }</style><span>Example</span>', 'Example'), + (r'regexp: (?<![a-zA-Z]', r'regexp: (?<![a-zA-Z]'), + (r'<p><b>Lorem ipsum </i>dolor sit amet</p>', 'Lorem ipsum </i>dolor sit amet</p>'), + (r'> < a', '> < a'), ] ) - def test_handle_charref(self, charref: str, expected: str): - self.html_text_extractor.handle_charref(charref) - self.assertIn(expected, self.html_text_extractor.result) - - def test_handle_entityref(self): - entity = 'test' - self.html_text_extractor.handle_entityref(entity) - self.assertIn(entity, self.html_text_extractor.result) - - def test_invalid_html(self): - text = '<p><b>Lorem ipsum</i>dolor sit amet</p>' - with self.assertRaises(utils._HTMLTextExtractorException): # pylint: disable=protected-access - self.html_text_extractor.feed(text) + def test_html_to_text(self, html_str: str, text_str: str): + self.assertEqual(utils.html_to_text(html_str), text_str) + + def test_html_to_text_with_a_style_span(self): + html_str = """ + <a href="/testlink" class="link_access_account"> + <style> + .toto { + color: red; + } + </style> + <span class="toto"> + <span> + <img src="test.jpg" /> + </span> + </span> + <span class="titi"> + Test text + </span> + <script>value='dummy';</script> + </a> + """ + self.assertIsInstance(utils.html_to_text(html_str), str) + self.assertEqual(utils.html_to_text(html_str), "Test text") class TestXPathUtils(SearxTestCase): # pylint: disable=missing-class-docstring |