From 0476de443ec5a688386f84a48fcf2e8876abe286 Mon Sep 17 00:00:00 2001 From: Allen <64094914+allendema@users.noreply.github.com> Date: Thu, 17 Oct 2024 04:57:21 +0000 Subject: [PATCH 1/2] [enh] use longest title and test get_ordered_results() --- searx/results.py | 11 ++++--- tests/unit/test_results.py | 61 ++++++++++++++++++++++++++++++-------- 2 files changed, 55 insertions(+), 17 deletions(-) diff --git a/searx/results.py b/searx/results.py index 7c973ca8f..2b677b105 100644 --- a/searx/results.py +++ b/searx/results.py @@ -12,7 +12,6 @@ from searx import logger from searx.engines import engines from searx.metrics import histogram_observe, counter_add, count_error - CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U) WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U) @@ -133,7 +132,7 @@ def result_score(result, priority): weight = 1.0 for result_engine in result['engines']: - if hasattr(engines[result_engine], 'weight'): + if hasattr(engines.get(result_engine), 'weight'): weight *= float(engines[result_engine].weight) weight *= len(result['positions']) @@ -332,10 +331,14 @@ class ResultContainer: return None def __merge_duplicated_http_result(self, duplicated, result, position): - # using content with more text + # use content with more text if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')): duplicated['content'] = result['content'] + # use title with more text + if result_content_len(result.get('title', '')) > len(duplicated.get('title', '')): + duplicated['title'] = result['title'] + # merge all result's parameters not found in duplicate for key in result.keys(): if not duplicated.get(key): @@ -347,7 +350,7 @@ class ResultContainer: # add engine to list of result-engines duplicated['engines'].add(result['engine']) - # using https if possible + # use https if possible if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https': duplicated['url'] = 
result['parsed_url'].geturl() duplicated['parsed_url'] = result['parsed_url'] diff --git a/tests/unit/test_results.py b/tests/unit/test_results.py index 72486bbc7..608d3c8c3 100644 --- a/tests/unit/test_results.py +++ b/tests/unit/test_results.py @@ -2,9 +2,26 @@ # pylint: disable=missing-module-docstring from searx.results import ResultContainer +from searx.engines import load_engines from tests import SearxTestCase +def make_test_engine_dict(**kwargs) -> dict: + test_engine = { + # fmt: off + 'name': None, + 'engine': None, + 'categories': 'general', + 'shortcut': 'dummy', + 'timeout': 3.0, + 'tokens': [], + # fmt: on + } + + test_engine.update(**kwargs) + return test_engine + + def fake_result(url='https://aa.bb/cc?dd=ee#ff', title='aaa', content='bbb', engine='wikipedia', **kwargs): result = { # fmt: off @@ -19,23 +36,41 @@ def fake_result(url='https://aa.bb/cc?dd=ee#ff', title='aaa', content='bbb', eng class ResultContainerTestCase(SearxTestCase): # pylint: disable=missing-class-docstring + def setUp(self) -> None: + stract_engine = make_test_engine_dict(name="stract", engine="stract", shortcut="stra") + duckduckgo_engine = make_test_engine_dict(name="duckduckgo", engine="duckduckgo", shortcut="ddg") + mojeek_engine = make_test_engine_dict(name="mojeek", engine="mojeek", shortcut="mjk") + + load_engines([stract_engine, duckduckgo_engine, mojeek_engine]) + + self.container = ResultContainer() + + def tearDown(self): + load_engines([]) + def test_empty(self): - c = ResultContainer() - self.assertEqual(c.get_ordered_results(), []) + self.assertEqual(self.container.get_ordered_results(), []) def test_one_result(self): - c = ResultContainer() - c.extend('wikipedia', [fake_result()]) - self.assertEqual(c.results_length(), 1) + self.container.extend('wikipedia', [fake_result()]) + + self.assertEqual(self.container.results_length(), 1) def test_one_suggestion(self): - c = ResultContainer() - c.extend('wikipedia', [fake_result(suggestion=True)]) - 
self.assertEqual(len(c.suggestions), 1) - self.assertEqual(c.results_length(), 0) + self.container.extend('wikipedia', [fake_result(suggestion=True)]) + + self.assertEqual(len(self.container.suggestions), 1) + self.assertEqual(self.container.results_length(), 0) def test_result_merge(self): - c = ResultContainer() - c.extend('wikipedia', [fake_result()]) - c.extend('wikidata', [fake_result(), fake_result(url='https://example.com/')]) - self.assertEqual(c.results_length(), 2) + self.container.extend('wikipedia', [fake_result()]) + self.container.extend('wikidata', [fake_result(), fake_result(url='https://example.com/')]) + + self.assertEqual(self.container.results_length(), 2) + + def test_result_merge_by_title(self): + self.container.extend('stract', [fake_result(engine='stract', title='short title')]) + self.container.extend('duckduckgo', [fake_result(engine='duckduckgo', title='normal title')]) + self.container.extend('mojeek', [fake_result(engine='mojeek', title='this long long title')]) + + self.assertEqual(self.container.get_ordered_results()[0].get('title', ''), 'this long long title') From 21dd524a12722c7f2d72cf2c6497f77f17bdba6f Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Wed, 30 Oct 2024 13:16:31 +0100 Subject: [PATCH 2/2] [fix] unit tests: call searx.search.initialize in test's setUp Depending on the order the unit tests are executed, the searx.search module is initialized or not, issue reported in [1]:: Traceback (most recent call last): File "searxng/tests/unit/test_results.py", line 72, in test_result_merge_by_title self.container.extend('stract', [fake_result(engine='stract', title='short title')]) File "searxng/searx/results.py", line 243, in extend histogram_observe(standard_result_count, 'engine', engine_name, 'result', 'count') File "searxng/searx/metrics/__init__.py", line 49, in histogram_observe histogram_storage.get(*args).observe(duration) ^^^^^^^^^^^^^^^^^^^^^ AttributeError: 'NoneType' object has no attribute 'get' To ensure that the 
searx.search module is initialized, the - searx.engines.load_engines is replaced by - searx.search.initialize [1] https://github.com/searxng/searxng/pull/3932#discussion_r1822406569 Signed-off-by: Markus Heiser --- tests/unit/processors/test_online.py | 6 ++-- tests/unit/test_engine_mariadb_server.py | 17 +--------- tests/unit/test_engine_tineye.py | 42 ++++++++++++++---------- tests/unit/test_query.py | 6 ++-- tests/unit/test_results.py | 9 +++-- 5 files changed, 35 insertions(+), 45 deletions(-) diff --git a/tests/unit/processors/test_online.py b/tests/unit/processors/test_online.py index 10e0deb97..fcb01587d 100644 --- a/tests/unit/processors/test_online.py +++ b/tests/unit/processors/test_online.py @@ -3,7 +3,7 @@ from searx.search import SearchQuery, EngineRef from searx.search.processors import online -from searx.engines import load_engines +import searx.search from searx import engines from tests import SearxTestCase @@ -22,10 +22,10 @@ TEST_ENGINE = { class TestOnlineProcessor(SearxTestCase): # pylint: disable=missing-class-docstring def setUp(self): - load_engines([TEST_ENGINE]) + searx.search.initialize([TEST_ENGINE]) def tearDown(self): - load_engines([]) + searx.search.load_engines([]) def _get_params(self, online_processor, search_query, engine_category): params = online_processor.get_params(search_query, engine_category) diff --git a/tests/unit/test_engine_mariadb_server.py b/tests/unit/test_engine_mariadb_server.py index 423132e34..c4144a601 100644 --- a/tests/unit/test_engine_mariadb_server.py +++ b/tests/unit/test_engine_mariadb_server.py @@ -2,26 +2,11 @@ # pylint: disable=missing-module-docstring from unittest.mock import MagicMock, Mock -from searx.engines import load_engines, mariadb_server +from searx.engines import mariadb_server from tests import SearxTestCase class MariadbServerTests(SearxTestCase): # pylint: disable=missing-class-docstring - def setUp(self): - load_engines( - [ - { - 'name': 'mariadb server', - 'engine': 'mariadb_server', - 
'shortcut': 'mdb', - 'timeout': 9.0, - 'disabled': True, - } - ] - ) - - def tearDown(self): - load_engines([]) def test_init_no_query_str_raises(self): self.assertRaises(ValueError, lambda: mariadb_server.init({})) diff --git a/tests/unit/test_engine_tineye.py b/tests/unit/test_engine_tineye.py index 5855a7313..7dc8233d4 100644 --- a/tests/unit/test_engine_tineye.py +++ b/tests/unit/test_engine_tineye.py @@ -1,28 +1,34 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # pylint: disable=missing-module-docstring - +import logging from datetime import datetime from unittest.mock import Mock from requests import HTTPError from parameterized import parameterized -from searx.engines import load_engines, tineye +import searx.search +import searx.engines from tests import SearxTestCase class TinEyeTests(SearxTestCase): # pylint: disable=missing-class-docstring def setUp(self): - load_engines([{'name': 'tineye', 'engine': 'tineye', 'shortcut': 'tin', 'timeout': 9.0, 'disabled': True}]) + searx.search.initialize( + [{'name': 'tineye', 'engine': 'tineye', 'shortcut': 'tin', 'timeout': 9.0, 'disabled': True}] + ) + + self.tineye = searx.engines.engines['tineye'] + self.tineye.logger.setLevel(logging.CRITICAL) def tearDown(self): - load_engines([]) + searx.search.load_engines([]) def test_status_code_raises(self): response = Mock() response.status_code = 401 response.raise_for_status.side_effect = HTTPError() - self.assertRaises(HTTPError, lambda: tineye.response(response)) + self.assertRaises(HTTPError, lambda: self.tineye.response(response)) @parameterized.expand([(400), (422)]) def test_returns_empty_list(self, status_code): @@ -30,7 +36,7 @@ class TinEyeTests(SearxTestCase): # pylint: disable=missing-class-docstring response.json.return_value = {} response.status_code = status_code response.raise_for_status.side_effect = HTTPError() - results = tineye.response(response) + results = self.tineye.response(response) self.assertEqual(0, len(results)) def 
test_logs_format_for_422(self): @@ -39,9 +45,9 @@ class TinEyeTests(SearxTestCase): # pylint: disable=missing-class-docstring response.status_code = 422 response.raise_for_status.side_effect = HTTPError() - with self.assertLogs(tineye.logger) as assert_logs_context: - tineye.response(response) - self.assertIn(tineye.FORMAT_NOT_SUPPORTED, ','.join(assert_logs_context.output)) + with self.assertLogs(self.tineye.logger) as assert_logs_context: + self.tineye.response(response) + self.assertIn(self.tineye.FORMAT_NOT_SUPPORTED, ','.join(assert_logs_context.output)) def test_logs_signature_for_422(self): response = Mock() @@ -49,9 +55,9 @@ class TinEyeTests(SearxTestCase): # pylint: disable=missing-class-docstring response.status_code = 422 response.raise_for_status.side_effect = HTTPError() - with self.assertLogs(tineye.logger) as assert_logs_context: - tineye.response(response) - self.assertIn(tineye.NO_SIGNATURE_ERROR, ','.join(assert_logs_context.output)) + with self.assertLogs(self.tineye.logger) as assert_logs_context: + self.tineye.response(response) + self.assertIn(self.tineye.NO_SIGNATURE_ERROR, ','.join(assert_logs_context.output)) def test_logs_download_for_422(self): response = Mock() @@ -59,9 +65,9 @@ class TinEyeTests(SearxTestCase): # pylint: disable=missing-class-docstring response.status_code = 422 response.raise_for_status.side_effect = HTTPError() - with self.assertLogs(tineye.logger) as assert_logs_context: - tineye.response(response) - self.assertIn(tineye.DOWNLOAD_ERROR, ','.join(assert_logs_context.output)) + with self.assertLogs(self.tineye.logger) as assert_logs_context: + self.tineye.response(response) + self.assertIn(self.tineye.DOWNLOAD_ERROR, ','.join(assert_logs_context.output)) def test_logs_description_for_400(self): description = 'There was a problem with that request. 
Error ID: ad5fc955-a934-43c1-8187-f9a61d301645' @@ -70,8 +76,8 @@ class TinEyeTests(SearxTestCase): # pylint: disable=missing-class-docstring response.status_code = 400 response.raise_for_status.side_effect = HTTPError() - with self.assertLogs(tineye.logger) as assert_logs_context: - tineye.response(response) + with self.assertLogs(self.tineye.logger) as assert_logs_context: + self.tineye.response(response) self.assertIn(description, ','.join(assert_logs_context.output)) def test_crawl_date_parses(self): @@ -90,5 +96,5 @@ class TinEyeTests(SearxTestCase): # pylint: disable=missing-class-docstring ] } response.status_code = 200 - results = tineye.response(response) + results = self.tineye.response(response) self.assertEqual(date, results[0]['publishedDate']) diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index 601a6e60d..00c53edc7 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -2,7 +2,7 @@ # pylint: disable=missing-module-docstring from parameterized.parameterized import parameterized -from searx.engines import load_engines +import searx.search from searx.query import RawTextQuery from tests import SearxTestCase @@ -218,10 +218,10 @@ class TestBang(SearxTestCase): # pylint:disable=missing-class-docstring THE_QUERY = 'the query' def setUp(self): - load_engines(TEST_ENGINES) + searx.search.initialize(TEST_ENGINES) def tearDown(self): - load_engines([]) + searx.search.load_engines([]) @parameterized.expand(SPECIFIC_BANGS) def test_bang(self, bang: str): diff --git a/tests/unit/test_results.py b/tests/unit/test_results.py index 608d3c8c3..740d36a03 100644 --- a/tests/unit/test_results.py +++ b/tests/unit/test_results.py @@ -2,7 +2,7 @@ # pylint: disable=missing-module-docstring from searx.results import ResultContainer -from searx.engines import load_engines +import searx.search from tests import SearxTestCase @@ -36,17 +36,16 @@ def fake_result(url='https://aa.bb/cc?dd=ee#ff', title='aaa', content='bbb', eng class 
ResultContainerTestCase(SearxTestCase): # pylint: disable=missing-class-docstring + def setUp(self) -> None: stract_engine = make_test_engine_dict(name="stract", engine="stract", shortcut="stra") duckduckgo_engine = make_test_engine_dict(name="duckduckgo", engine="duckduckgo", shortcut="ddg") mojeek_engine = make_test_engine_dict(name="mojeek", engine="mojeek", shortcut="mjk") - - load_engines([stract_engine, duckduckgo_engine, mojeek_engine]) - + searx.search.initialize([stract_engine, duckduckgo_engine, mojeek_engine]) self.container = ResultContainer() def tearDown(self): - load_engines([]) + searx.search.load_engines([]) def test_empty(self): self.assertEqual(self.container.get_ordered_results(), [])