From 4fb6105d699e19321f6799d7fff05313fd4cd4b9 Mon Sep 17 00:00:00 2001
From: Markus Heiser Lorem ipsum dolor sit amet
-
-
-
- Test text
-
-
-
- """
- self.assertIsInstance(utils.html_to_text(html_str), str)
- self.assertIsNotNone(utils.html_to_text(html_str))
- self.assertEqual(utils.html_to_text(html_str), "Test text")
- self.assertEqual(utils.html_to_text(r"regexp: (?
@@ -99,46 +75,44 @@ class TestUtils(SearxTestCase):
with self.assertRaises(Exception):
utils.extract_url([], 'https://example.com')
- def test_html_to_text_invalid(self):
- _html = 'Lorem ipsumdolor sit amet
' - self.assertEqual(utils.html_to_text(_html), "Lorem ipsum") - def test_ecma_unscape(self): self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space') self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó') self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界') - -class TestHTMLTextExtractor(SearxTestCase): # pylint: disable=missing-class-docstring - - def setUp(self): - super().setUp() - - self.html_text_extractor = utils._HTMLTextExtractor() # pylint: disable=protected-access - - def test__init__(self): - self.assertEqual(self.html_text_extractor.result, []) - @parameterized.expand( [ - ('xF', '\x0f'), - ('XF', '\x0f'), - ('97', 'a'), + ('Example #2', 'Example #2'), + ('Example', 'Example'), + (r'regexp: (?<![a-zA-Z]', r'regexp: (?Lorem ipsum dolor sit amet', 'Lorem ipsum dolor sit amet'), + (r'> < a', '> < a'), ] ) - def test_handle_charref(self, charref: str, expected: str): - self.html_text_extractor.handle_charref(charref) - self.assertIn(expected, self.html_text_extractor.result) + def test_html_to_text(self, html_str: str, text_str: str): + self.assertEqual(utils.html_to_text(html_str), text_str) - def test_handle_entityref(self): - entity = 'test' - self.html_text_extractor.handle_entityref(entity) - self.assertIn(entity, self.html_text_extractor.result) - - def test_invalid_html(self): - text = 'Lorem ipsumdolor sit amet
' - with self.assertRaises(utils._HTMLTextExtractorException): # pylint: disable=protected-access - self.html_text_extractor.feed(text) + def test_html_to_text_with_a_style_span(self): + html_str = """ + + + + +
+
+
+
+ Test text
+
+
+
+ """
+ self.assertIsInstance(utils.html_to_text(html_str), str)
+ self.assertEqual(utils.html_to_text(html_str), "Test text")
class TestXPathUtils(SearxTestCase): # pylint: disable=missing-class-docstring