Compare commits

...

6 Commits

Author SHA1 Message Date
Bnyro 978f723ea3
Merge 443fcc7233 into 0f9694c90b 2024-11-24 02:01:36 +00:00
Markus Heiser 0f9694c90b [clean] Internet Archive Scholar search API no longer exists
Engine was added in #2733 but the API no longer exists. Related:

- https://github.com/searxng/searxng/issues/4038

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-23 17:59:38 +01:00
Markus Heiser ccc4f30b20 [doc] update quantities on the intro page
The quantities on the intro page were partly out of date; for example, we already
have 210 engines and not just 70. To avoid having to change the quantities
manually in the future, they are now calculated from the jinja context.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-23 16:33:08 +01:00
Markus Heiser c4b874e9b0 [fix] engine Library of Congress: fix API URL loc.gov -> www.loc.gov
Avoid HTTP 404 and redirects. Requests to the JSON/YAML API use the base url [1]

    https://www.loc.gov/{endpoint}/?fo=json

[1] https://www.loc.gov/apis/json-and-yaml/requests/

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-23 13:02:24 +01:00
Markus Heiser 7c4e4ebd40 [log] warning with URL in case of 'raise_for_httperror'
In order to be able to implement error handling, it is necessary to know which
URL triggered the exception; so far, the URL has not been logged.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-23 11:33:19 +01:00
Bnyro 443fcc7233 [feat] metrics: support for open metrics 2024-10-15 11:34:17 +02:00
10 changed files with 150 additions and 113 deletions

View File

@ -4,22 +4,27 @@ Welcome to SearXNG
*Search without being tracked.* *Search without being tracked.*
SearXNG is a free internet metasearch engine which aggregates results from more .. jinja:: searx
than 70 search services. Users are neither tracked nor profiled. Additionally,
SearXNG can be used over Tor for online anonymity. SearXNG is a free internet metasearch engine which aggregates results from up
to {{engines | length}} :ref:`search services <configured engines>`. Users
are neither tracked nor profiled. Additionally, SearXNG can be used over Tor
for online anonymity.
Get started with SearXNG by using one of the instances listed at searx.space_. Get started with SearXNG by using one of the instances listed at searx.space_.
If you don't trust anyone, you can set up your own, see :ref:`installation`. If you don't trust anyone, you can set up your own, see :ref:`installation`.
.. sidebar:: features .. jinja:: searx
.. sidebar:: features
- :ref:`self hosted <installation>` - :ref:`self hosted <installation>`
- :ref:`no user tracking / no profiling <SearXNG protect privacy>` - :ref:`no user tracking / no profiling <SearXNG protect privacy>`
- script & cookies are optional - script & cookies are optional
- secure, encrypted connections - secure, encrypted connections
- :ref:`about 200 search engines <configured engines>` - :ref:`{{engines | length}} search engines <configured engines>`
- `about 60 translations <https://translate.codeberg.org/projects/searxng/searxng/>`_ - `58 translations <https://translate.codeberg.org/projects/searxng/searxng/>`_
- about 100 `well maintained <https://uptime.searxng.org/>`__ instances on searx.space_ - about 70 `well maintained <https://uptime.searxng.org/>`__ instances on searx.space_
- :ref:`easy integration of search engines <demo online engine>` - :ref:`easy integration of search engines <demo online engine>`
- professional development: `CI <https://github.com/searxng/searxng/actions>`_, - professional development: `CI <https://github.com/searxng/searxng/actions>`_,
`quality assurance <https://dev.searxng.org/>`_ & `quality assurance <https://dev.searxng.org/>`_ &

View File

@ -1,71 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Internet Archive scholar(science)
"""
from datetime import datetime
from urllib.parse import urlencode
from searx.utils import html_to_text
# Metadata describing the upstream service this engine talks to.
about = dict(
    website="https://scholar.archive.org/",
    wikidata_id="Q115667709",
    official_api_documentation="https://scholar.archive.org/api/redoc",
    use_official_api=True,
    require_api_key=False,
    results="JSON",
)

# Categories under which the engine is listed.
categories = ["science", "scientific publications"]
# NOTE(review): presumably signals that the engine supports page-wise
# queries -- confirm against the engine framework documentation.
paging = True

# Root of the search API and the number of hits requested per page.
base_url = "https://scholar.archive.org"
results_per_page = 15
def request(query, params):
    """Fill ``params`` with the URL and headers for one page of results.

    The query, the page size (``results_per_page``) and the offset derived
    from ``params["pageno"]`` are URL-encoded into a ``/search`` request
    against ``base_url``.  A JSON answer is requested through the ``Accept``
    header.  The same ``params`` dict is mutated and returned.
    """
    page_index = params["pageno"] - 1
    query_string = urlencode(
        {
            "q": query,
            "limit": results_per_page,
            "offset": page_index * results_per_page,
        }
    )
    params["url"] = f"{base_url}/search?{query_string}"
    params["headers"]["Accept"] = "application/json"
    return params
def response(resp):
    """Convert the JSON answer of the search API into a list of result dicts.

    For every entry of the ``results`` array one dict using the
    ``paper.html`` template is produced.  The text content is taken from the
    body of the first abstract, or from the first highlight when there is no
    abstract; it is passed through ``html_to_text``.  The DOI comes from the
    first release, if any.  The release date is parsed as ``YYYY-MM-DD``.
    """
    hits = []

    for item in resp.json()["results"]:
        biblio = item['biblio']

        release_date = biblio.get('release_date')
        published = datetime.strptime(release_date, "%Y-%m-%d") if release_date else None

        # prefer the abstract; fall back to the first highlight snippet
        abstracts = item['abstracts']
        if len(abstracts) > 0:
            text = abstracts[0].get('body')
        else:
            highlights = item['_highlights']
            text = highlights[0] if len(highlights) > 0 else ''

        releases = item['releases']
        doi = releases[0].get('doi') if len(releases) > 0 else None

        fulltext = item['fulltext']
        hit = {
            'template': 'paper.html',
            'url': fulltext['access_url'],
            'title': biblio.get('title') or biblio.get('container_name'),
            'content': html_to_text(text),
            'publisher': biblio.get('publisher'),
            'doi': doi,
            'journal': biblio.get('container_name'),
            'authors': biblio.get('contrib_names'),
            'tags': item['tags'],
            'publishedDate': published,
            'issns': biblio.get('issns'),
            'pdf_url': fulltext.get('access_url'),
        }
        hits.append(hit)

    return hits

View File

@ -27,7 +27,7 @@ categories = ['images']
paging = True paging = True
endpoint = 'photos' endpoint = 'photos'
base_url = 'https://loc.gov' base_url = 'https://www.loc.gov'
search_string = "/{endpoint}/?sp={page}&{query}&fo=json" search_string = "/{endpoint}/?sp={page}&{query}&fo=json"

View File

@ -8,6 +8,7 @@ from timeit import default_timer
from operator import itemgetter from operator import itemgetter
from searx.engines import engines from searx.engines import engines
from searx.openmetrics import OpenMetricsFamily
from .models import HistogramStorage, CounterStorage, VoidHistogram, VoidCounterStorage from .models import HistogramStorage, CounterStorage, VoidHistogram, VoidCounterStorage
from .error_recorder import count_error, count_exception, errors_per_engines from .error_recorder import count_error, count_exception, errors_per_engines
@ -149,7 +150,9 @@ def get_reliabilities(engline_name_list, checker_results):
checker_result = checker_results.get(engine_name, {}) checker_result = checker_results.get(engine_name, {})
checker_success = checker_result.get('success', True) checker_success = checker_result.get('success', True)
errors = engine_errors.get(engine_name) or [] errors = engine_errors.get(engine_name) or []
if counter('engine', engine_name, 'search', 'count', 'sent') == 0: sent_count = counter('engine', engine_name, 'search', 'count', 'sent')
if sent_count == 0:
# no request # no request
reliability = None reliability = None
elif checker_success and not errors: elif checker_success and not errors:
@ -164,8 +167,9 @@ def get_reliabilities(engline_name_list, checker_results):
reliabilities[engine_name] = { reliabilities[engine_name] = {
'reliability': reliability, 'reliability': reliability,
'sent_count': sent_count,
'errors': errors, 'errors': errors,
'checker': checker_results.get(engine_name, {}).get('errors', {}), 'checker': checker_result.get('errors', {}),
} }
return reliabilities return reliabilities
@ -245,3 +249,53 @@ def get_engines_stats(engine_name_list):
'max_time': math.ceil(max_time_total or 0), 'max_time': math.ceil(max_time_total or 0),
'max_result_count': math.ceil(max_result_count or 0), 'max_result_count': math.ceil(max_result_count or 0),
} }
def openmetrics(engine_stats, engine_reliabilities):
    """Render per-engine statistics as one metrics text document.

    Every entry of ``engine_stats['time']`` supplies the engine name, used as
    the ``engine_name`` label, and the ``total``, ``processing``, ``http`` and
    ``result_count`` values.  Request counts and reliabilities are looked up
    by engine name in ``engine_reliabilities`` and fall back to ``0`` when the
    engine or the key is missing.  The string forms of all metric families
    are concatenated and returned.
    """
    time_stats = engine_stats['time']

    def from_stats(field):
        # values read straight from the per-engine time statistics
        return [entry[field] for entry in time_stats]

    def from_reliabilities(field):
        # values looked up by engine name, defaulting to 0 when unknown
        return [engine_reliabilities.get(entry['name'], {}).get(field, 0) for entry in time_stats]

    def family(key, type_hint, help_hint, values):
        # all families share the same label layout: one engine_name per engine
        return OpenMetricsFamily(
            key=key,
            type_hint=type_hint,
            help_hint=help_hint,
            data_info=[{'engine_name': entry['name']} for entry in time_stats],
            data=values,
        )

    families = (
        family(
            "searxng_engines_response_time_total_seconds",
            "gauge",
            "The average total response time of the engine",
            from_stats('total'),
        ),
        family(
            "searxng_engines_response_time_processing_seconds",
            "gauge",
            "The average processing response time of the engine",
            from_stats('processing'),
        ),
        family(
            "searxng_engines_response_time_http_seconds",
            "gauge",
            "The average HTTP response time of the engine",
            from_stats('http'),
        ),
        family(
            "searxng_engines_result_count_total",
            "counter",
            "The total amount of results returned by the engine",
            from_stats('result_count'),
        ),
        family(
            "searxng_engines_request_count_total",
            "counter",
            "The total amount of user requests made to this engine",
            from_reliabilities('sent_count'),
        ),
        family(
            "searxng_engines_reliability_total",
            "counter",
            "The overall reliability of the engine",
            from_reliabilities('reliability'),
        ),
    )
    return "".join(str(metric_family) for metric_family in families)

View File

@ -233,8 +233,7 @@ class Network:
del kwargs['raise_for_httperror'] del kwargs['raise_for_httperror']
return do_raise_for_httperror return do_raise_for_httperror
@staticmethod def patch_response(self, response, do_raise_for_httperror):
def patch_response(response, do_raise_for_httperror):
if isinstance(response, httpx.Response): if isinstance(response, httpx.Response):
# requests compatibility (response is not streamed) # requests compatibility (response is not streamed)
# see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
@ -242,8 +241,11 @@ class Network:
# raise an exception # raise an exception
if do_raise_for_httperror: if do_raise_for_httperror:
try:
raise_for_httperror(response) raise_for_httperror(response)
except:
self._logger.warning(f"HTTP Request failed: {response.request.method} {response.request.url}")
raise
return response return response
def is_valid_response(self, response): def is_valid_response(self, response):
@ -269,7 +271,7 @@ class Network:
else: else:
response = await client.request(method, url, **kwargs) response = await client.request(method, url, **kwargs)
if self.is_valid_response(response) or retries <= 0: if self.is_valid_response(response) or retries <= 0:
return Network.patch_response(response, do_raise_for_httperror) return self.patch_response(response, do_raise_for_httperror)
except httpx.RemoteProtocolError as e: except httpx.RemoteProtocolError as e:
if not was_disconnected: if not was_disconnected:
# the server has closed the connection: # the server has closed the connection:

35
searx/openmetrics.py Normal file
View File

@ -0,0 +1,35 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Module providing support for displaying data in OpenMetrics format"""
class OpenMetricsFamily:  # pylint: disable=too-few-public-methods
    """A family of metrics rendered in the OpenMetrics text format.

    :param key: metric name that should be used (snake case).
    :param type_hint: one of ``'counter'``, ``'gauge'``, ``'histogram'``,
        ``'summary'``.
    :param help_hint: short string explaining the metric.
    :param data_info: list with one dictionary of descriptive labels per data
        point (e.g. request method/path).
    :param data: flat list of the actual values (primitive types), parallel
        to ``data_info``.

    See https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md for more information.
    """

    def __init__(self, key: str, type_hint: str, help_hint: str, data_info: list, data: list):
        self.key = key
        self.type_hint = type_hint
        self.help_hint = help_hint
        self.data_info = data_info
        self.data = data

    @staticmethod
    def _escape_label_value(value) -> str:
        """Return ``value`` as text that is safe inside a quoted label value.

        Backslash, double quote and line feed are escaped; the backslash is
        handled first so the escapes added afterwards are not doubled.
        """
        return str(value).replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n')

    def __str__(self):
        """Return the ``# HELP`` / ``# TYPE`` header followed by one line per sample.

        Labels and values are paired positionally; entries without a
        counterpart in the other list are ignored.
        """
        lines = [
            f"# HELP {self.key} {self.help_hint}",
            f"# TYPE {self.key} {self.type_hint}",
        ]
        for labels, value in zip(self.data_info, self.data):
            # skip missing samples (None, empty string, ...) but keep a genuine zero
            if not value and value != 0:
                continue
            label_text = ','.join(f'{name}="{self._escape_label_value(label)}"' for name, label in labels.items())
            lines.append(f"{self.key}{{{label_text}}} {value}")
        return "\n".join(lines) + "\n"

View File

@ -137,9 +137,6 @@ class OnlineProcessor(EngineProcessor):
self.engine.request(query, params) self.engine.request(query, params)
# ignoring empty urls # ignoring empty urls
if params['url'] is None:
return None
if not params['url']: if not params['url']:
return None return None

View File

@ -12,6 +12,10 @@ general:
contact_url: false contact_url: false
# record stats # record stats
enable_metrics: true enable_metrics: true
# expose stats in open metrics format at /metrics
# leave empty to disable (no password set)
# open_metrics: <password>
open_metrics: ''
brand: brand:
new_issue_url: https://github.com/searxng/searxng/issues/new new_issue_url: https://github.com/searxng/searxng/issues/new
@ -1622,11 +1626,6 @@ engines:
api_site: 'askubuntu' api_site: 'askubuntu'
categories: [it, q&a] categories: [it, q&a]
- name: internetarchivescholar
engine: internet_archive_scholar
shortcut: ias
timeout: 15.0
- name: superuser - name: superuser
engine: stackexchange engine: stackexchange
shortcut: su shortcut: su

View File

@ -143,6 +143,7 @@ SCHEMA = {
'contact_url': SettingsValue((None, False, str), None), 'contact_url': SettingsValue((None, False, str), None),
'donation_url': SettingsValue((bool, str), "https://docs.searxng.org/donate.html"), 'donation_url': SettingsValue((bool, str), "https://docs.searxng.org/donate.html"),
'enable_metrics': SettingsValue(bool, True), 'enable_metrics': SettingsValue(bool, True),
'open_metrics': SettingsValue(str, ''),
}, },
'brand': { 'brand': {
'issue_url': SettingsValue(str, 'https://github.com/searxng/searxng/issues'), 'issue_url': SettingsValue(str, 'https://github.com/searxng/searxng/issues'),

View File

@ -87,10 +87,7 @@ from searx.webadapter import (
get_selected_categories, get_selected_categories,
parse_lang, parse_lang,
) )
from searx.utils import ( from searx.utils import gen_useragent, dict_subset
gen_useragent,
dict_subset,
)
from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH
from searx.query import RawTextQuery from searx.query import RawTextQuery
from searx.plugins import Plugin, plugins, initialize as plugin_initialize from searx.plugins import Plugin, plugins, initialize as plugin_initialize
@ -104,13 +101,7 @@ from searx.answerers import (
answerers, answerers,
ask, ask,
) )
from searx.metrics import ( from searx.metrics import get_engines_stats, get_engine_errors, get_reliabilities, histogram, counter, openmetrics
get_engines_stats,
get_engine_errors,
get_reliabilities,
histogram,
counter,
)
from searx.flaskfix import patch_application from searx.flaskfix import patch_application
from searx.locales import ( from searx.locales import (
@ -1218,6 +1209,30 @@ def stats_checker():
return jsonify(result) return jsonify(result)
@app.route('/metrics')
def stats_open_metrics():
    """Serve the engine statistics as metrics text, protected by a password.

    Responds with 404 when metrics are disabled or no ``open_metrics``
    password is configured, and with 401 when the request carries no
    authorization or a wrong password.  Otherwise the statistics and
    reliabilities of the engines the requester may access are rendered by
    ``openmetrics`` and returned as ``text/plain``.
    """
    # imported locally so that this function does not depend on a change to
    # the module-level import block
    import hmac  # pylint: disable=import-outside-toplevel

    password = settings['general'].get("open_metrics")
    if not (settings['general'].get("enable_metrics") and password):
        return Response('open metrics is disabled', status=404, mimetype='text/plain')

    supplied = request.authorization.password if request.authorization else None
    # compare in constant time so response timing does not leak how much of
    # the password matched; bytes are used because compare_digest rejects
    # non-ASCII str arguments
    if supplied is None or not hmac.compare_digest(supplied.encode('utf-8'), password.encode('utf-8')):
        return Response(
            'access forbidden',
            status=401,
            mimetype='text/plain',
            # a 401 response has to tell the client which scheme to use
            headers={'WWW-Authenticate': 'Basic realm="SearXNG metrics"'},
        )

    filtered_engines = {
        name: engine for name, engine in engines.items() if request.preferences.validate_token(engine)
    }

    checker_results = checker_get_result()
    if checker_results['status'] == 'ok' and 'engines' in checker_results:
        checker_results = checker_results['engines']
    else:
        checker_results = {}

    engine_stats = get_engines_stats(filtered_engines)
    engine_reliabilities = get_reliabilities(filtered_engines, checker_results)
    metrics_text = openmetrics(engine_stats, engine_reliabilities)

    return Response(metrics_text, mimetype='text/plain')
@app.route('/robots.txt', methods=['GET']) @app.route('/robots.txt', methods=['GET'])
def robots(): def robots():
return Response( return Response(