From 4fb6105d699e19321f6799d7fff05313fd4cd4b9 Mon Sep 17 00:00:00 2001
From: Markus Heiser <markus.heiser@darmarIT.de>
Date: Mon, 18 Aug 2025 16:30:51 +0200
Subject: [fix] revision of utils.HTMLTextExtractor (#5125)

Related:

- https://github.com/searxng/searxng/pull/5073#issuecomment-3196282632
---
 tests/unit/test_utils.py | 84 +++++++++++++++++-------------------------------
 1 file changed, 29 insertions(+), 55 deletions(-)

(limited to 'tests')
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index ad6ca37a5..01056df74 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -28,30 +28,6 @@ class TestUtils(SearxTestCase):
         self.assertIsNotNone(utils.searxng_useragent())
         self.assertTrue(utils.searxng_useragent().startswith('SearXNG'))
 
-    def test_html_to_text(self):
-        html_str = """
-        <a href="/testlink" class="link_access_account">
-            <style>
-                .toto {
-                    color: red;
-                }
-            </style>
-            <span class="toto">
-                <span>
-                    <img src="test.jpg" />
-                </span>
-            </span>
-            <span class="titi">
-                            Test text
-            </span>
-            <script>value='dummy';</script>
-        </a>
-        """
-        self.assertIsInstance(utils.html_to_text(html_str), str)
-        self.assertIsNotNone(utils.html_to_text(html_str))
-        self.assertEqual(utils.html_to_text(html_str), "Test text")
-        self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]")
-
     def test_extract_text(self):
         html_str = """
         <a href="/testlink" class="link_access_account">
@@ -99,46 +75,44 @@ class TestUtils(SearxTestCase):
         with self.assertRaises(Exception):
             utils.extract_url([], 'https://example.com')
 
-    def test_html_to_text_invalid(self):
-        _html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
-        self.assertEqual(utils.html_to_text(_html), "Lorem ipsum")
-
     def test_ecma_unscape(self):
         self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
         self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó')
         self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界')
 
-
-class TestHTMLTextExtractor(SearxTestCase):  # pylint: disable=missing-class-docstring
-
-    def setUp(self):
-        super().setUp()
-
-        self.html_text_extractor = utils._HTMLTextExtractor()  # pylint: disable=protected-access
-
-    def test__init__(self):
-        self.assertEqual(self.html_text_extractor.result, [])
-
     @parameterized.expand(
         [
-            ('xF', '\x0f'),
-            ('XF', '\x0f'),
-            ('97', 'a'),
+            ('Example <span id="42">#2</span>', 'Example #2'),
+            ('<style>.span { color: red; }</style><span>Example</span>', 'Example'),
+            (r'regexp: (?&lt;![a-zA-Z]', r'regexp: (?<![a-zA-Z]'),
+            (r'<p><b>Lorem ipsum </i>dolor sit amet</p>', 'Lorem ipsum </i>dolor sit amet</p>'),
+            (r'&#x3e &#x3c &#97', '> < a'),
         ]
     )
-    def test_handle_charref(self, charref: str, expected: str):
-        self.html_text_extractor.handle_charref(charref)
-        self.assertIn(expected, self.html_text_extractor.result)
-
-    def test_handle_entityref(self):
-        entity = 'test'
-        self.html_text_extractor.handle_entityref(entity)
-        self.assertIn(entity, self.html_text_extractor.result)
-
-    def test_invalid_html(self):
-        text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
-        with self.assertRaises(utils._HTMLTextExtractorException):  # pylint: disable=protected-access
-            self.html_text_extractor.feed(text)
+    def test_html_to_text(self, html_str: str, text_str: str):
+        self.assertEqual(utils.html_to_text(html_str), text_str)
+
+    def test_html_to_text_with_a_style_span(self):
+        html_str = """
+        <a href="/testlink" class="link_access_account">
+            <style>
+                .toto {
+                    color: red;
+                }
+            </style>
+            <span class="toto">
+                <span>
+                    <img src="test.jpg" />
+                </span>
+            </span>
+            <span class="titi">
+                            Test text
+            </span>
+            <script>value='dummy';</script>
+        </a>
+        """
+        self.assertIsInstance(utils.html_to_text(html_str), str)
+        self.assertEqual(utils.html_to_text(html_str), "Test text")
 
 
 class TestXPathUtils(SearxTestCase):  # pylint: disable=missing-class-docstring
-- 
cgit v1.2.3