From 4fb6105d699e19321f6799d7fff05313fd4cd4b9 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Mon, 18 Aug 2025 16:30:51 +0200 Subject: [fix] revision of utils.HTMLTextExtractor (#5125) Related: - https://github.com/searxng/searxng/pull/5073#issuecomment-3196282632 --- tests/unit/test_utils.py | 84 +++++++++++++++++------------------------------- 1 file changed, 29 insertions(+), 55 deletions(-) (limited to 'tests') diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index ad6ca37a5..01056df74 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -28,30 +28,6 @@ class TestUtils(SearxTestCase): self.assertIsNotNone(utils.searxng_useragent()) self.assertTrue(utils.searxng_useragent().startswith('SearXNG')) - def test_html_to_text(self): - html_str = """ - - - - - - - - - Test text - - - - """ - self.assertIsInstance(utils.html_to_text(html_str), str) - self.assertIsNotNone(utils.html_to_text(html_str)) - self.assertEqual(utils.html_to_text(html_str), "Test text") - self.assertEqual(utils.html_to_text(r"regexp: (? @@ -99,46 +75,44 @@ class TestUtils(SearxTestCase): with self.assertRaises(Exception): utils.extract_url([], 'https://example.com') - def test_html_to_text_invalid(self): - _html = '

Lorem ipsumdolor sit amet

' - self.assertEqual(utils.html_to_text(_html), "Lorem ipsum") - def test_ecma_unscape(self): self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space') self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó') self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界') - -class TestHTMLTextExtractor(SearxTestCase): # pylint: disable=missing-class-docstring - - def setUp(self): - super().setUp() - - self.html_text_extractor = utils._HTMLTextExtractor() # pylint: disable=protected-access - - def test__init__(self): - self.assertEqual(self.html_text_extractor.result, []) - @parameterized.expand( [ - ('xF', '\x0f'), - ('XF', '\x0f'), - ('97', 'a'), + ('Example #2', 'Example #2'), + ('Example', 'Example'), + (r'regexp: (?<![a-zA-Z]', r'regexp: (?Lorem ipsum dolor sit amet

', 'Lorem ipsum dolor sit amet

'), + (r'> < a', '> < a'), ] ) - def test_handle_charref(self, charref: str, expected: str): - self.html_text_extractor.handle_charref(charref) - self.assertIn(expected, self.html_text_extractor.result) - - def test_handle_entityref(self): - entity = 'test' - self.html_text_extractor.handle_entityref(entity) - self.assertIn(entity, self.html_text_extractor.result) - - def test_invalid_html(self): - text = '

Lorem ipsumdolor sit amet

' - with self.assertRaises(utils._HTMLTextExtractorException): # pylint: disable=protected-access - self.html_text_extractor.feed(text) + def test_html_to_text(self, html_str: str, text_str: str): + self.assertEqual(utils.html_to_text(html_str), text_str) + + def test_html_to_text_with_a_style_span(self): + html_str = """ + + """ + self.assertIsInstance(utils.html_to_text(html_str), str) + self.assertEqual(utils.html_to_text(html_str), "Test text") class TestXPathUtils(SearxTestCase): # pylint: disable=missing-class-docstring -- cgit v1.2.3