[fix] rerank plugin: adapt to #4183

This commit is contained in:
GenericMale 2025-01-29 18:17:46 +01:00
parent 7fa0fbd93d
commit ab83de9332
4 changed files with 96 additions and 72 deletions

View File

@ -1,7 +1,7 @@
.. _rerank plugin:
================
Rerank plugin
Rerank
================
.. automodule:: searx.plugins.rerank

View File

@ -68,6 +68,9 @@ class PluginInfo:
keywords: list[str] = field(default_factory=list)
"""See :py:obj:`Plugin.keywords`"""
is_allowed: bool = True
"""Switch to disable the plugin completely, regardless of the user preference."""
class Plugin(abc.ABC):
"""Abstract base class of all Plugins."""

View File

@ -1,37 +1,17 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Plugin which reranks the search results using the Okapi BM25 algorithm.
# pylint: disable=missing-module-docstring, missing-class-docstring
from __future__ import annotations
import typing
This plugin utilizes the `bm25s` library to reorder search results based on their relevance to the search query,
potentially improving the quality of results. Before enabling this plugin,
ensure you have installed the ``bm25s`` pip package. e.g. by installing it directly via pip or
by adding it to the project's `requirements.txt` file.
Configuration:
--------------
To enable the Rerank plugin, add it to the `enabled_plugins` list in your `settings.yml` file:
.. code:: yaml
enabled_plugins:
..
- 'Rerank plugin'
By default, the plugin retains the information about which engines found a particular result.
Results that appear in multiple engine results will receive a score boost.
This approach might be relevant if you wish results found by different engines to be prioritized.
You can modify this behaviour by configuring the ``remove_extra_engines`` setting.
If ``remove_extra_engines`` is set to ``true``, the original engine list is reduced to only the first engine.
This is useful when you prefer the reranking to not be affected by any potential overlap
of results from different engines.
.. code:: yaml
rerank:
remove_extra_engines: true
"""
from flask_babel import gettext
from searx import settings
from searx.plugins import Plugin, PluginInfo
from searx.result_types import EngineResults
if typing.TYPE_CHECKING:
from searx.search import SearchWithPlugins
from searx.extended_types import SXNG_Request
try:
import bm25s
@ -39,56 +19,97 @@ except ImportError:
# Import error is ignored because the admin has to install bm25s manually to use the plugin
bm25s = None
name = 'Rerank plugin'
description = 'Rerank search results, ignoring original engine ranking'
default_on = False
preference_section = 'general'
# Supported stopwords for bm25s. Default is 'en'
stopword_langs = ['en', 'de', 'nl', 'fr', 'es', 'pt', 'it', 'ru', 'sv', 'no', 'zh']
class SXNGPlugin(Plugin):
"""Plugin which reranks the search results using the Okapi BM25 algorithm.
remove_extra_engines = settings.get('rerank', {}).get('remove_extra_engines')
This plugin utilizes the ``bm25s`` library to reorder search results based on their relevance to the search query,
potentially improving the quality of results. Before enabling this plugin,
ensure you have installed the ``bm25s`` pip package, e.g. by installing it directly via pip or
by adding it to the project's ``requirements.txt`` file.
Configuration:
--------------
To enable the Rerank plugin, add it to the ``enabled_plugins`` list in your ``settings.yml`` file:
def post_search(_request, search):
if not bm25s:
return True
.. code:: yaml
# pylint: disable=protected-access
results = search.result_container._merged_results
query = search.search_query.query
locale = search.search_query.locale
enabled_plugins:
..
- 'Rerank plugin'
# Determine the stopwords based on the selected locale
stopwords = locale.language if locale and locale.language in stopword_langs else 'en'
By default, the plugin retains the information about which engines found a particular result.
Results that appear in multiple engine results will receive a score boost.
This approach might be relevant if you wish results found by different engines to be prioritized.
You can modify this behaviour by configuring the ``remove_extra_engines`` setting.
If ``remove_extra_engines`` is set to ``true``, the original engine list is reduced to only the first engine.
This is useful when you prefer the reranking to not be affected by any potential overlap
of results from different engines.
retriever = bm25s.BM25()
result_tokens = bm25s.tokenize(
[f"{result.get('title', '')} | {result.get('content', '')} | {result.get('url', '')}" for result in results],
stopwords=stopwords,
)
retriever.index(result_tokens)
.. code:: yaml
query_tokens = bm25s.tokenize(query, stopwords=stopwords)
rerank:
remove_extra_engines: true
# Retrieve ranked indices of results based on the query tokens
indices = retriever.retrieve(query_tokens, k=len(results), return_as='documents', show_progress=False)
"""
if remove_extra_engines:
# Only keep the main engine and set our ranking
for position, index in enumerate(indices[0]):
if 'positions' in results[index]:
results[index]['positions'] = [position + 1]
results[index]['engines'] = set([results[index]['engine']])
else:
# Overwrite all engine positions with the new ranking
# Results returned from multiple engines will still get a score boost
for position, index in enumerate(indices[0]):
if 'positions' in results[index]:
results[index]['positions'] = [position + 1] * len(results[index]['positions'])
id = "rerank"
default_on = False
return True
def __init__(self):
super().__init__()
self.stopword_langs = ['en', 'de', 'nl', 'fr', 'es', 'pt', 'it', 'ru', 'sv', 'no', 'zh']
self.remove_extra_engines = settings.get('rerank', {}).get('remove_extra_engines')
def is_allowed():
return bm25s is not None
self.info = PluginInfo(
id=self.id,
name=gettext("Rerank plugin"),
description=gettext("""Rerank search results, ignoring original engine ranking"""),
preference_section="general",
is_allowed=bm25s is not None,
)
def post_search(self, request: "SXNG_Request", search: "SearchWithPlugins") -> EngineResults:
    """Rerank the merged search results in place using BM25.

    Every merged result is tokenized from its ``title``, ``content`` and
    ``url`` fields, indexed with ``bm25s`` and ranked against the search
    query.  The ``positions`` entry of each result is then overwritten with
    the new rank.  When ``self.remove_extra_engines`` is set, ``engines`` is
    additionally reduced to the single ``engine`` of the result.

    :param request: the current request (not used by this plugin).
    :param search: the search whose ``result_container`` is reranked and
        whose ``search_query`` supplies the query text and locale.
    :return: an empty :py:obj:`EngineResults`; the plugin only reorders
        existing results and contributes no results of its own.
    """
    # Kept separate from the merged result list below: the original code
    # reused one name for both and ended up returning the container's
    # private list instead of an ``EngineResults``.
    # NOTE(review): presumably the caller adds the returned items to the
    # result container -- confirm against the plugin storage.
    plugin_results = EngineResults()

    # ``bm25s`` is None when the optional dependency could not be imported.
    if not bm25s:
        return plugin_results

    # pylint: disable=protected-access
    merged = search.result_container._merged_results
    if not merged:
        # Nothing to rank; also avoids building an index over an empty corpus.
        return plugin_results

    query = search.search_query.query
    locale = search.search_query.locale

    # Determine the stopwords based on the selected locale, default is 'en'
    stopwords = locale.language if locale and locale.language in self.stopword_langs else 'en'

    retriever = bm25s.BM25()
    result_tokens = bm25s.tokenize(
        [
            f"{result.get('title', '')} | {result.get('content', '')} | {result.get('url', '')}"
            for result in merged
        ],
        stopwords=stopwords,
    )
    retriever.index(result_tokens)

    query_tokens = bm25s.tokenize(query, stopwords=stopwords)

    # Retrieve ranked indices of results based on the query tokens
    indices = retriever.retrieve(query_tokens, k=len(merged), return_as='documents', show_progress=False)

    # ``indices[0]`` is the ranking for the (single) query.
    for position, index in enumerate(indices[0]):
        ranked = merged[index]
        if 'positions' not in ranked:
            continue

        if self.remove_extra_engines:
            # Only keep the main engine and set our ranking
            ranked['positions'] = [position + 1]
            ranked['engines'] = {ranked['engine']}
        else:
            # Overwrite all engine positions with the new ranking.
            # Results returned from multiple engines keep one entry per
            # engine and will therefore still get a score boost.
            ranked['positions'] = [position + 1] * len(ranked['positions'])

    return plugin_results

View File

@ -38,7 +38,7 @@
{%- macro plugin_preferences(section) -%}
{%- for plugin in plugins_storage -%}
{%- if plugin.preference_section == section -%}
{%- if plugin.preference_section == section and plugin.is_allowed -%}
<fieldset>{{- '' -}}
<legend>{{ _(plugin.name) }}</legend>{{- '' -}}
<div class="value">