[refactor] typification of SearXNG (MainResult) / result items (part 2)

The class ReslutContainer has been revised, it can now handle the typed Result items of classes: - MainResult - LegacyResult (a dict wrapper for backward compatibility) Due to the now complete typing of theses three clases, instead of the *getitem* accesses, the fields can now be accessed directly via attributes (which is also supported by the IDE). Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2026-01-02 17:10:01 +00:00 · 2025-03-05 17:29:20 +01:00
parent d6ce29f7f0
commit 8769b7c6d6
15 changed files with 608 additions and 541 deletions
--- a/searx/result_types/init.py
+++ b/searx/result_types/init.py
@@ -13,13 +13,13 @@

 from __future__ import annotations

-__all__ = ["Result", "EngineResults", "AnswerSet", "Answer", "Translations"]
+__all__ = ["Result", "MainResult", "EngineResults", "AnswerSet", "Answer", "Translations"]

 import abc

 from searx import enginelib

-from ._base import Result, LegacyResult
+from ._base import Result, MainResult, LegacyResult
 from .answer import AnswerSet, Answer, Translations


@@ -30,13 +30,18 @@ class ResultList(list, abc.ABC):
        """The collection of result types (which have already been implemented)."""

        Answer = Answer
+        MainResult = MainResult
+        Result = Result
        Translations = Translations

+        # for backward compatibility
+        LegacyResult = LegacyResult
+
    def __init__(self):
        # pylint: disable=useless-parent-delegation
        super().__init__()

-    def add(self, result: Result):
+    def add(self, result: Result | LegacyResult):
        """Add a :py:`Result` item to the result list."""
        self.append(result)

--- a/searx/result_types/_base.py
+++ b/searx/result_types/_base.py
@@ -10,6 +10,8 @@
 .. autoclass:: Result
   :members:

+.. _LegacyResult:
+
 .. autoclass:: LegacyResult
   :members:
 """
@@ -22,9 +24,88 @@ __all__ = ["Result"]
 import re
 import urllib.parse
 import warnings
+import typing

 import msgspec

+from searx import logger as log
+
+WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)
+
+
+def _normalize_url_fields(result: Result | LegacyResult):
+
+    # As soon we need LegacyResult not any longer, we can move this function to
+    # method Result.normalize_result_fields
+
+    if result.url and not result.parsed_url:
+        if not isinstance(result.url, str):
+            log.debug('result: invalid URL: %s', str(result))
+            result.url = ""
+            result.parsed_url = None
+        else:
+            result.parsed_url = urllib.parse.urlparse(result.url)
+
+    if result.parsed_url:
+        result.parsed_url = result.parsed_url._replace(
+            # if the result has no scheme, use http as default
+            scheme=result.parsed_url.scheme or "http",
+            # normalize ``www.example.com`` to ``example.com``
+            netloc=result.parsed_url.netloc.replace("www.", ""),
+            # normalize ``example.com/path/`` to ``example.com/path``
+            path=result.parsed_url.path.rstrip("/"),
+        )
+        result.url = result.parsed_url.geturl()
+
+    if isinstance(result, LegacyResult) and getattr(result, "infobox", None):
+        # As soon we have InfoboxResult, we can move this function to method
+        # InfoboxResult.normalize_result_fields
+
+        infobox_urls: list[dict[str, str]] = getattr(result, "urls", [])
+        for item in infobox_urls:
+            _url = item.get("url")
+            if not _url:
+                continue
+            _url = urllib.parse.urlparse(_url)
+            item["url"] = _url._replace(
+                scheme=_url.scheme or "http",
+                netloc=_url.netloc.replace("www.", ""),
+                path=_url.path.rstrip("/"),
+            ).geturl()
+
+        infobox_id = getattr(result, "id", None)
+        if infobox_id:
+            _url = urllib.parse.urlparse(infobox_id)
+            result.id = _url._replace(
+                scheme=_url.scheme or "http",
+                netloc=_url.netloc.replace("www.", ""),
+                path=_url.path.rstrip("/"),
+            ).geturl()
+
+
+def _normalize_text_fields(result: MainResult | LegacyResult):
+
+    # As soon we need LegacyResult not any longer, we can move this function to
+    # method MainResult.normalize_result_fields
+
+    # Actually, a type check should not be necessary if the engine is
+    # implemented correctly. Historically, however, we have always had a type
+    # check here.
+
+    if result.title and not isinstance(result.title, str):
+        log.debug("result: invalid type of field 'title': %s", str(result))
+        result.title = str(result)
+    if result.content and not isinstance(result.content, str):
+        log.debug("result: invalid type of field 'content': %s", str(result))
+        result.content = str(result)
+
+    # normalize title and content
+    result.title = WHITESPACE_REGEX.sub(" ", result.title).strip()
+    result.content = WHITESPACE_REGEX.sub(" ", result.content).strip()
+    if result.content == result.title:
+        # avoid duplicate content between the content and title fields
+        result.content = ""
+

 class Result(msgspec.Struct, kw_only=True):
    """Base class of all result types :ref:`result types`."""
@@ -54,21 +135,20 @@ class Result(msgspec.Struct, kw_only=True):
    """

    def normalize_result_fields(self):
-        """Normalize a result ..
+        """Normalize fields ``url`` and ``parse_sql``.

-        - if field ``url`` is set and field ``parse_url`` is unset, init
-          ``parse_url`` from field ``url``.  This method can be extended in the
-          inheritance.
+        - If field ``url`` is set and field ``parse_url`` is unset, init
+          ``parse_url`` from field ``url``.  The ``url`` field is initialized
+          with the resulting value in ``parse_url``, if ``url`` and
+          ``parse_url`` are not equal.

+        - ``www.example.com`` and ``example.com`` are equivalent and are normalized
+          to ``example.com``.
+
+        - ``example.com/path/`` and ``example.com/path`` are equivalent and are
+          normalized to ``example.com/path``.
        """
-
-        if not self.parsed_url and self.url:
-            self.parsed_url = urllib.parse.urlparse(self.url)
-
-            # if the result has no scheme, use http as default
-            if not self.parsed_url.scheme:
-                self.parsed_url = self.parsed_url._replace(scheme="http")
-                self.url = self.parsed_url.geturl()
+        _normalize_url_fields(self)

    def __post_init__(self):
        pass
@@ -84,7 +164,6 @@ class Result(msgspec.Struct, kw_only=True):
        The hash value is used in contexts, e.g. when checking for equality to
        identify identical results from different sources (engines).
        """
-
        return id(self)

    def __eq__(self, other):
@@ -113,12 +192,19 @@ class Result(msgspec.Struct, kw_only=True):
    def as_dict(self):
        return {f: getattr(self, f) for f in self.__struct_fields__}

+    def defaults_from(self, other: Result):
+        """Fields not set in *self* will be updated from the field values of the
+        *other*.
+        """
+        for field_name in self.__struct_fields__:
+            self_val = getattr(self, field_name, False)
+            other_val = getattr(other, field_name, False)
+            if self_val:
+                setattr(self, field_name, other_val)
+

 class MainResult(Result):  # pylint: disable=missing-class-docstring
-
-    # open_group and close_group should not manged in the Result class (we should rop it from here!)
-    open_group: bool = False
-    close_group: bool = False
+    """Base class of all result types displayed in :ref:`area main results`."""

    title: str = ""
    """Link title of the result item."""
@@ -132,6 +218,43 @@ class MainResult(Result):  # pylint: disable=missing-class-docstring
    thumbnail: str = ""
    """URL of a thumbnail that is displayed in the result item."""

+    priority: typing.Literal["", "high", "low"] = ""
+    """The priority can be set via :ref:`hostnames plugin`, for example."""
+
+    engines: set[str] = set()
+    """In a merged results list, the names of the engines that found this result
+    are listed in this field."""
+
+    # open_group and close_group should not manged in the Result
+    # class (we should drop it from here!)
+    open_group: bool = False
+    close_group: bool = False
+    positions: list[int] = []
+    score: float = 0
+    category: str = ""
+
+    def __hash__(self) -> int:
+        """Ordinary url-results are equal if their values for
+        :py:obj:`Result.template`, :py:obj:`Result.parsed_url` (without scheme)
+        and :py:obj:`MainResult.img_src` are equal.
+        """
+        if not self.parsed_url:
+            raise ValueError(f"missing a value in field 'parsed_url': {self}")
+
+        url = self.parsed_url
+        return hash(
+            f"{self.template}"
+            + f"|{url.netloc}|{url.path}|{url.params}|{url.query}|{url.fragment}"
+            + f"|{self.img_src}"
+        )
+
+    def normalize_result_fields(self):
+        super().normalize_result_fields()
+
+        _normalize_text_fields(self)
+        if self.engine:
+            self.engines.add(self.engine)
+

 class LegacyResult(dict):
    """A wrapper around a legacy result item.  The SearXNG core uses this class
@@ -150,7 +273,27 @@ class LegacyResult(dict):
    """

    UNSET = object()
-    WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)
+
+    # emulate field types from type class Result
+    url: str | None
+    template: str
+    engine: str
+    parsed_url: urllib.parse.ParseResult | None
+
+    # emulate field types from type class MainResult
+    title: str
+    content: str
+    img_src: str
+    thumbnail: str
+    priority: typing.Literal["", "high", "low"]
+    engines: set[str]
+    positions: list[int]
+    score: float
+    category: str
+
+    # infobox result
+    urls: list[dict[str, str]]
+    attributes: list[dict[str, str]]

    def as_dict(self):
        return self
@@ -159,14 +302,26 @@ class LegacyResult(dict):

        super().__init__(*args, **kwargs)

-        # Init fields with defaults / compare with defaults of the fields in class Result
-        self.engine = self.get("engine", "")
-        self.template = self.get("template", "default.html")
-        self.url = self.get("url", None)
-        self.parsed_url = self.get("parsed_url", None)
+        # emulate field types from type class Result
+        self["url"] = self.get("url")
+        self["template"] = self.get("template", "default.html")
+        self["engine"] = self.get("engine", "")
+        self["parsed_url"] = self.get("parsed_url")

-        self.content = self.get("content", "")
-        self.title = self.get("title", "")
+        # emulate field types from type class MainResult
+        self["title"] = self.get("title", "")
+        self["content"] = self.get("content", "")
+        self["img_src"] = self.get("img_src", "")
+        self["thumbnail"] = self.get("thumbnail", "")
+        self["priority"] = self.get("priority", "")
+        self["engines"] = self.get("engines", set())
+        self["positions"] = self.get("positions", "")
+        self["score"] = self.get("score", 0)
+        self["category"] = self.get("category", "")
+
+        if "infobox" in self:
+            self["urls"] = self.get("urls", [])
+            self["attributes"] = self.get("attributes", [])

        # Legacy types that have already been ported to a type ..

@@ -178,13 +333,47 @@ class LegacyResult(dict):
            )
            self.template = "answer/legacy.html"

+        if self.template == "keyvalue.html":
+            warnings.warn(
+                f"engine {self.engine} is using deprecated `dict` for key/value results"
+                f" / use a class from searx.result_types",
+                DeprecationWarning,
+            )
+
+    def __getattr__(self, name: str, default=UNSET) -> typing.Any:
+        if default == self.UNSET and name not in self:
+            raise AttributeError(f"LegacyResult object has no field named: {name}")
+        return self[name]
+
+    def __setattr__(self, name: str, val):
+        self[name] = val
+
    def __hash__(self) -> int:  # type: ignore

        if "answer" in self:
+            # deprecated ..
            return hash(self["answer"])
+
+        if self.template == "images.html":
+            # image results are equal if their values for template, the url and
+            # the img_src are equal.
+            return hash(f"{self.template}|{self.url}|{self.img_src}")
+
        if not any(cls in self for cls in ["suggestion", "correction", "infobox", "number_of_results", "engine_data"]):
-            # it is a commun url-result ..
-            return hash(self.url)
+            # Ordinary url-results are equal if their values for template,
+            # parsed_url (without schema) and img_src` are equal.
+
+            # Code copied from with MainResult.__hash__:
+            if not self.parsed_url:
+                raise ValueError(f"missing a value in field 'parsed_url': {self}")
+
+            url = self.parsed_url
+            return hash(
+                f"{self.template}"
+                + f"|{url.netloc}|{url.path}|{url.params}|{url.query}|{url.fragment}"
+                + f"|{self.img_src}"
+            )
+
        return id(self)

    def __eq__(self, other):
@@ -195,30 +384,13 @@ class LegacyResult(dict):

        return f"LegacyResult: {super().__repr__()}"

-    def __getattr__(self, name: str, default=UNSET):
-
-        if default == self.UNSET and name not in self:
-            raise AttributeError(f"LegacyResult object has no field named: {name}")
-        return self[name]
-
-    def __setattr__(self, name: str, val):
-
-        self[name] = val
-
    def normalize_result_fields(self):
+        _normalize_url_fields(self)
+        _normalize_text_fields(self)
+        if self.engine:
+            self.engines.add(self.engine)

-        self.title = self.WHITESPACE_REGEX.sub(" ", self.title)
-
-        if not self.parsed_url and self.url:
-            self.parsed_url = urllib.parse.urlparse(self.url)
-
-            # if the result has no scheme, use http as default
-            if not self.parsed_url.scheme:
-                self.parsed_url = self.parsed_url._replace(scheme="http")
-                self.url = self.parsed_url.geturl()
-
-        if self.content:
-            self.content = self.WHITESPACE_REGEX.sub(" ", self.content)
-            if self.content == self.title:
-                # avoid duplicate content between the content and title fields
-                self.content = ""
+    def defaults_from(self, other: LegacyResult):
+        for k, v in other.items():
+            if not self.get(k):
+                self[k] = v