summary | refs | log | tree | commit | diff
path: root/tests/unit/test_utils.py
diff options
context:
space:
mode:
author: Alexandre Flament <alex@al-f.net> 2020-10-04 09:06:20 +0200
committer: GitHub <noreply@github.com> 2020-10-04 09:06:20 +0200
commit: b728cb610b92161609b1c40babff25749720fc25 (patch)
tree: 733b665dd897cc8e4cd7e37f7d64a052c260d6f0 /tests/unit/test_utils.py
parent: e2cd9b65bb2b2e1f1085cf48442632da0d52077e (diff)
parent: 8f914a28facec314a2b98b11d3cc1207eb8ee8ab (diff)
Merge pull request #2241 from dalf/move-extract-text-and-url
Move the extract_text and extract_url functions to searx.utils
Diffstat (limited to 'tests/unit/test_utils.py')
-rw-r--r-- tests/unit/test_utils.py 49
1 files changed, 45 insertions, 4 deletions
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index 69f5ef92a..f3a98ad71 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -1,4 +1,7 @@
# -*- coding: utf-8 -*-
+import lxml.etree
+from lxml import html
+
from searx.testing import SearxTestCase
from searx import utils
@@ -16,7 +19,30 @@ class TestUtils(SearxTestCase):
self.assertTrue(utils.searx_useragent().startswith('searx'))
def test_html_to_text(self):
- html = """
+ html_str = """
+ <a href="/testlink" class="link_access_account">
+ <style>
+ .toto {
+ color: red;
+ }
+ </style>
+ <span class="toto">
+ <span>
+ <img src="test.jpg" />
+ </span>
+ </span>
+ <span class="titi">
+ Test text
+ </span>
+ <script>value='dummy';</script>
+ </a>
+ """
+ self.assertIsInstance(utils.html_to_text(html_str), str)
+ self.assertIsNotNone(utils.html_to_text(html_str))
+ self.assertEqual(utils.html_to_text(html_str), "Test text")
+
+ def test_extract_text(self):
+ html_str = """
<a href="/testlink" class="link_access_account">
<span class="toto">
<span>
@@ -28,9 +54,24 @@ class TestUtils(SearxTestCase):
</span>
</a>
"""
- self.assertIsInstance(utils.html_to_text(html), str)
- self.assertIsNotNone(utils.html_to_text(html))
- self.assertEqual(utils.html_to_text(html), "Test text")
+ dom = html.fromstring(html_str)
+ self.assertEqual(utils.extract_text(dom), 'Test text')
+ self.assertEqual(utils.extract_text(dom.xpath('//span')), 'Test text')
+ self.assertEqual(utils.extract_text(dom.xpath('//img/@src')), 'test.jpg')
+ self.assertEqual(utils.extract_text(dom.xpath('//unexistingtag')), '')
+
+ def test_extract_url(self):
+ def f(html_str, search_url):
+ return utils.extract_url(html.fromstring(html_str), search_url)
+ self.assertEqual(f('<span id="42">https://example.com</span>', 'http://example.com/'), 'https://example.com/')
+ self.assertEqual(f('https://example.com', 'http://example.com/'), 'https://example.com/')
+ self.assertEqual(f('//example.com', 'http://example.com/'), 'http://example.com/')
+ self.assertEqual(f('//example.com', 'https://example.com/'), 'https://example.com/')
+ self.assertEqual(f('/path?a=1', 'https://example.com'), 'https://example.com/path?a=1')
+ with self.assertRaises(lxml.etree.ParserError):
+ f('', 'https://example.com')
+ with self.assertRaises(Exception):
+ utils.extract_url([], 'https://example.com')
def test_html_to_text_invalid(self):
html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'