Merge pull request #2241 from dalf/move-extract-text-and-url

Move the extract_text and extract_url functions to searx.utils
author: Alexandre Flament <alex@al-f.net> 2020-10-04 09:06:20 +0200
committer: GitHub <noreply@github.com> 2020-10-04 09:06:20 +0200
commit: b728cb610b92161609b1c40babff25749720fc25 (patch)
tree: 733b665dd897cc8e4cd7e37f7d64a052c260d6f0 /tests/unit/test_utils.py
parent: e2cd9b65bb2b2e1f1085cf48442632da0d52077e (diff)
parent: 8f914a28facec314a2b98b11d3cc1207eb8ee8ab (diff)
1 file changed, 45 insertions, 4 deletions
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index 69f5ef92a..f3a98ad71 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -1,4 +1,7 @@
 # -*- coding: utf-8 -*-
+import lxml.etree
+from lxml import html
+
 from searx.testing import SearxTestCase
 from searx import utils
 
@@ -16,7 +19,30 @@ class TestUtils(SearxTestCase):
         self.assertTrue(utils.searx_useragent().startswith('searx'))
 
     def test_html_to_text(self):
-        html = """
+        html_str = """
+        <a href="/testlink" class="link_access_account">
+            <style>
+                .toto {
+                    color: red;
+                }
+            </style>
+            <span class="toto">
+                <span>
+                    <img src="test.jpg" />
+                </span>
+            </span>
+            <span class="titi">
+                            Test text
+            </span>
+            <script>value='dummy';</script>
+        </a>
+        """
+        self.assertIsInstance(utils.html_to_text(html_str), str)
+        self.assertIsNotNone(utils.html_to_text(html_str))
+        self.assertEqual(utils.html_to_text(html_str), "Test text")
+
+    def test_extract_text(self):
+        html_str = """
         <a href="/testlink" class="link_access_account">
             <span class="toto">
                 <span>
@@ -28,9 +54,24 @@ class TestUtils(SearxTestCase):
             </span>
         </a>
         """
-        self.assertIsInstance(utils.html_to_text(html), str)
-        self.assertIsNotNone(utils.html_to_text(html))
-        self.assertEqual(utils.html_to_text(html), "Test text")
+        dom = html.fromstring(html_str)
+        self.assertEqual(utils.extract_text(dom), 'Test text')
+        self.assertEqual(utils.extract_text(dom.xpath('//span')), 'Test text')
+        self.assertEqual(utils.extract_text(dom.xpath('//img/@src')), 'test.jpg')
+        self.assertEqual(utils.extract_text(dom.xpath('//unexistingtag')), '')
+
+    def test_extract_url(self):
+        def f(html_str, search_url):
+            return utils.extract_url(html.fromstring(html_str), search_url)
+        self.assertEqual(f('<span id="42">https://example.com</span>', 'http://example.com/'), 'https://example.com/')
+        self.assertEqual(f('https://example.com', 'http://example.com/'), 'https://example.com/')
+        self.assertEqual(f('//example.com', 'http://example.com/'), 'http://example.com/')
+        self.assertEqual(f('//example.com', 'https://example.com/'), 'https://example.com/')
+        self.assertEqual(f('/path?a=1', 'https://example.com'), 'https://example.com/path?a=1')
+        with self.assertRaises(lxml.etree.ParserError):
+            f('', 'https://example.com')
+        with self.assertRaises(Exception):
+            utils.extract_url([], 'https://example.com')
 
     def test_html_to_text_invalid(self):
         html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
author	Alexandre Flament <alex@al-f.net>	2020-10-04 09:06:20 +0200
committer	GitHub <noreply@github.com>	2020-10-04 09:06:20 +0200
commit	b728cb610b92161609b1c40babff25749720fc25 (patch)
tree	733b665dd897cc8e4cd7e37f7d64a052c260d6f0 /tests/unit/test_utils.py
parent	e2cd9b65bb2b2e1f1085cf48442632da0d52077e (diff)
parent	8f914a28facec314a2b98b11d3cc1207eb8ee8ab (diff)