mirror of
https://github.com/searxng/searxng.git
synced 2025-02-20 12:20:04 +00:00
[feat] plugins: new rerank results plugin
This commit is contained in:
parent
738906358b
commit
9f9ae5427d
77
searx/plugins/rerank.py
Normal file
77
searx/plugins/rerank.py
Normal file
@ -0,0 +1,77 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Plugin which reranks the search results using the Okapi BM25 algorithm.
|
||||
Before enabling the Rerank plugin, you must install the pip package ``bm25s``.
|
||||
|
||||
Enable in ``settings.yml``:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
enabled_plugins:
|
||||
..
|
||||
- 'Rerank plugin'
|
||||
|
||||
By default, the engine list is retained, so results found by multiple engines receive a score boost.
|
||||
The following setting can be used to ensure that the engine list only contains the first engine.
|
||||
This will prevent overlapping search engine results from affecting the ranking:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
rerank:
|
||||
remove_extra_engines: true
|
||||
|
||||
"""
|
||||
|
||||
from searx import settings
|
||||
|
||||
try:
|
||||
import bm25s
|
||||
except ImportError:
|
||||
# Import error is ignored because the admin has to install bm25s manually to use the plugin
|
||||
pass
|
||||
|
||||
# Plugin metadata. ``name`` is the string listed under ``enabled_plugins`` in
# ``settings.yml`` to activate this plugin.
name = 'Rerank plugin'
description = 'Rerank search results, ignoring original engine ranking'
default_on = False
preference_section = 'general'

# Supported stopwords for bm25s. Default is 'en'
stopword_langs = ['en', 'de', 'nl', 'fr', 'es', 'pt', 'it', 'ru', 'sv', 'no', 'zh']

# Optional ``rerank.remove_extra_engines`` setting.  A ``rerank:`` key that is
# present but has no children is parsed from YAML as ``None``; ``or {}`` keeps
# that case from raising AttributeError at import time.
remove_extra_engines = (settings.get('rerank') or {}).get('remove_extra_engines')
|
||||
|
||||
|
||||
def post_search(_request, search):
    """Rerank the merged search results with BM25 scores for the user query.

    Each result is turned into one document (``title | content | url``), the
    documents are indexed with ``bm25s`` and ranked against the query.  The
    ``positions`` list of every result that has one is then overwritten with
    the new rank:

    - with ``remove_extra_engines`` set, ``positions`` becomes a single entry
      and ``engines`` is reduced to the result's own ``engine``;
    - otherwise the new rank is repeated once per original position, so the
      length of ``positions`` is unchanged.

    :param _request: unused.
    :param search: search object; ``search.result_container._merged_results``
        is modified in place, ``search.search_query`` supplies the query text
        and the locale.
    :returns: always ``True``.

    NOTE: ``bm25s`` is imported at module level with the ImportError ignored;
    if the package is not installed, a non-empty result list still fails here
    with ``NameError``.
    """
    # pylint: disable=protected-access
    results = search.result_container._merged_results
    if not results:
        # Nothing to rank; do not build an index over an empty corpus.
        return True

    query = search.search_query.query
    locale = search.search_query.locale

    # Determine the stopwords based on the selected locale
    stopwords = locale.language if locale and locale.language in stopword_langs else True

    corpus = [
        f"{result.get('title', '')} | {result.get('content', '')} | {result.get('url', '')}"
        for result in results
    ]

    # Progress bars are disabled everywhere: this runs once per search request.
    retriever = bm25s.BM25()
    retriever.index(
        bm25s.tokenize(corpus, stopwords=stopwords, show_progress=False),
        show_progress=False,
    )

    query_tokens = bm25s.tokenize(query, stopwords=stopwords, show_progress=False)

    # Retrieve ranked indices of results based on the query tokens
    ranked = retriever.retrieve(query_tokens, k=len(results), return_as='documents', show_progress=False)

    # ranked[0] holds the result indices for the (single) query, best match first
    for position, index in enumerate(ranked[0], start=1):
        result = results[index]
        if 'positions' not in result:
            continue
        if remove_extra_engines:
            # Only keep the main engine and set our ranking
            result['positions'] = [position]
            result['engines'] = {result['engine']}
        else:
            # Overwrite all engine positions with the new ranking
            # Results returned from multiple engines will still get a score boost
            result['positions'] = [position] * len(result['positions'])

    return True
|
Loading…
Reference in New Issue
Block a user