This commit is contained in:
GenericMale 2025-01-29 17:18:07 +00:00 committed by GitHub
commit 0bc1c432c7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 129 additions and 1 deletions

View File

@ -0,0 +1,9 @@
.. _rerank plugin:

================
Rerank
================

.. automodule:: searx.plugins.rerank
   :members:

View File

@ -68,6 +68,9 @@ class PluginInfo:
keywords: list[str] = field(default_factory=list)
"""See :py:obj:`Plugin.keywords`"""
is_allowed: bool = True
"""Switch to disable plugin completely, without the user preference."""
class Plugin(abc.ABC):
"""Abstract base class of all Plugins."""

115
searx/plugins/rerank.py Normal file
View File

@ -0,0 +1,115 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring, missing-class-docstring
from __future__ import annotations
import typing
from flask_babel import gettext
from searx import settings
from searx.plugins import Plugin, PluginInfo
from searx.result_types import EngineResults
if typing.TYPE_CHECKING:
from searx.search import SearchWithPlugins
from searx.extended_types import SXNG_Request
try:
    import bm25s
except ImportError:
    # bm25s is an optional dependency which the admin has to install manually
    # to use the plugin.  The import error is ignored on purpose: the name is
    # bound to None so the rest of the module can test for its absence.
    bm25s = None
class SXNGPlugin(Plugin):
    """Plugin which reranks the search results using the Okapi BM25 algorithm.

    This plugin utilizes the `bm25s` library to reorder search results based on
    their relevance to the search query, potentially improving the quality of
    results.  Before enabling this plugin, ensure you have installed the
    ``bm25s`` pip package, e.g. by installing it directly via pip or by adding
    it to the project's `requirements.txt` file.

    Configuration:
    --------------

    To enable the Rerank plugin, add it to the `enabled_plugins` list in your
    `settings.yml` file:

    .. code:: yaml

      enabled_plugins:
        ..
        - 'Rerank plugin'

    By default, the plugin retains the information about which engines found a
    particular result.  Results that appear in multiple engine results will
    receive a score boost.  This approach might be relevant if you wish results
    found by different engines to be prioritized.  You can modify this
    behaviour by configuring the ``remove_extra_engines`` setting.  If
    ``remove_extra_engines`` is set to ``true``, the original engine list is
    reduced to only the first engine.  This is useful when you prefer the
    reranking to not be affected by any potential overlap of results from
    different engines.

    .. code:: yaml

      rerank:
        remove_extra_engines: true

    """

    id = "rerank"
    default_on = False

    def __init__(self):
        """Read the ``rerank`` settings and build the plugin's ``PluginInfo``."""
        super().__init__()

        # Languages for which a stopword list is requested from bm25s; every
        # other locale falls back to English in post_search().
        self.stopword_langs = ['en', 'de', 'nl', 'fr', 'es', 'pt', 'it', 'ru', 'sv', 'no', 'zh']

        # ``rerank:`` may be missing or present without a value (None) in the
        # settings; both cases are treated as "no options set".
        rerank_settings = settings.get('rerank') or {}
        self.remove_extra_engines = rerank_settings.get('remove_extra_engines')

        self.info = PluginInfo(
            id=self.id,
            name=gettext("Rerank plugin"),
            description=gettext("Rerank search results, ignoring original engine ranking"),
            preference_section="general",
            # Without bm25s the plugin cannot work, so it is flagged as not allowed.
            is_allowed=bm25s is not None,
        )

    def post_search(self, request: "SXNG_Request", search: "SearchWithPlugins") -> EngineResults:
        """Overwrite the positions of the merged results with a BM25 ranking.

        Each merged result is indexed as ``title | content | url`` and scored
        against the search query.  The ``positions`` entry of every result that
        has one is then replaced by its 1-based rank.  The merged results are
        modified in place.

        :param request: the current request (not used by this plugin).
        :param search: the finished search whose merged results are reranked.
        :return: an empty ``EngineResults`` when bm25s is not installed,
            otherwise the (mutated) merged results of the result container.
        """
        if not bm25s:
            return EngineResults()

        # pylint: disable=protected-access
        # NOTE(review): the container's private merged-result list is returned
        # as is, it is not an ``EngineResults`` instance -- confirm that this is
        # what the caller of post_search() expects.
        results = search.result_container._merged_results

        # Nothing to rerank: do not hand bm25s an empty corpus and ``k=0``.
        if not results:
            return results

        query = search.search_query.query
        locale = search.search_query.locale

        # Determine the stopwords based on the selected locale
        stopwords = locale.language if locale and locale.language in self.stopword_langs else 'en'

        # ``or ''`` keeps a None field from being indexed as the word "None".
        corpus = [
            f"{result.get('title') or ''} | {result.get('content') or ''} | {result.get('url') or ''}"
            for result in results
        ]

        retriever = bm25s.BM25()
        retriever.index(bm25s.tokenize(corpus, stopwords=stopwords))
        query_tokens = bm25s.tokenize(query, stopwords=stopwords)

        # Retrieve ranked indices of results based on the query tokens.  No
        # corpus is passed to the retriever, so indices into ``results`` are
        # used below; row 0 belongs to the single query.
        indices = retriever.retrieve(query_tokens, k=len(results), return_as='documents', show_progress=False)

        for position, index in enumerate(indices[0], start=1):
            result = results[index]
            if 'positions' not in result:
                continue
            if self.remove_extra_engines:
                # Only keep the main engine and set our ranking
                result['positions'] = [position]
                result['engines'] = {result['engine']}
            else:
                # Overwrite all engine positions with the new ranking
                # Results returned from multiple engines will still get a score boost
                result['positions'] = [position] * len(result['positions'])

        return results

View File

@ -249,6 +249,7 @@ outgoing:
# - 'Hostnames plugin' # see 'hostnames' configuration below
# - 'Open Access DOI rewrite'
# - 'Tor check plugin'
# - 'Rerank plugin' # requires the bm25s python dependency to be installed
# Configuration of the "Hostnames plugin":
#

View File

@ -38,7 +38,7 @@
{%- macro plugin_preferences(section) -%}
{%- for plugin in plugins_storage -%}
{%- if plugin.preference_section == section -%}
{%- if plugin.preference_section == section and plugin.is_allowed -%}
<fieldset>{{- '' -}}
<legend>{{ _(plugin.name) }}</legend>{{- '' -}}
<div class="value">