From 2006eb468087c045c46c7d9e1d771e8ab2dfed7b Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Fri, 2 Oct 2020 18:13:56 +0200 Subject: [mod] move extract_text, extract_url to searx.utils --- tests/unit/test_utils.py | 49 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 4 deletions(-) (limited to 'tests') diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 69f5ef92a..f3a98ad71 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,4 +1,7 @@ # -*- coding: utf-8 -*- +import lxml.etree +from lxml import html + from searx.testing import SearxTestCase from searx import utils @@ -16,7 +19,30 @@ class TestUtils(SearxTestCase): self.assertTrue(utils.searx_useragent().startswith('searx')) def test_html_to_text(self): - html = """ + html_str = """ + + + + + + + + + Test text + + + + """ + self.assertIsInstance(utils.html_to_text(html_str), str) + self.assertIsNotNone(utils.html_to_text(html_str)) + self.assertEqual(utils.html_to_text(html_str), "Test text") + + def test_extract_text(self): + html_str = """ @@ -28,9 +54,24 @@ class TestUtils(SearxTestCase): """ - self.assertIsInstance(utils.html_to_text(html), str) - self.assertIsNotNone(utils.html_to_text(html)) - self.assertEqual(utils.html_to_text(html), "Test text") + dom = html.fromstring(html_str) + self.assertEqual(utils.extract_text(dom), 'Test text') + self.assertEqual(utils.extract_text(dom.xpath('//span')), 'Test text') + self.assertEqual(utils.extract_text(dom.xpath('//img/@src')), 'test.jpg') + self.assertEqual(utils.extract_text(dom.xpath('//unexistingtag')), '') + + def test_extract_url(self): + def f(html_str, search_url): + return utils.extract_url(html.fromstring(html_str), search_url) + self.assertEqual(f('https://example.com', 'http://example.com/'), 'https://example.com/') + self.assertEqual(f('https://example.com', 'http://example.com/'), 'https://example.com/') + self.assertEqual(f('//example.com', 'http://example.com/'), 'http://example.com/') + self.assertEqual(f('//example.com', 'https://example.com/'), 'https://example.com/') + self.assertEqual(f('/path?a=1', 'https://example.com'), 'https://example.com/path?a=1') + with self.assertRaises(lxml.etree.ParserError): + f('', 'https://example.com') + with self.assertRaises(Exception): + utils.extract_url([], 'https://example.com') def test_html_to_text_invalid(self): html = '

Lorem ipsumdolor sit amet

' -- cgit v1.2.3