Compare commits

...

8 Commits

Author SHA1 Message Date
Grant Lanham Jr ab27d6b45b
Merge 27aa9c4cb1 into 0f9694c90b 2024-11-24 02:01:36 +00:00
Markus Heiser 0f9694c90b [clean] Internet Archive Scholar search API no longer exists
Engine was added in #2733 but the API does no longer exists. Related:

- https://github.com/searxng/searxng/issues/4038

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-23 17:59:38 +01:00
Markus Heiser ccc4f30b20 [doc] update quantities on the intro page
The quantities on the intro page were partly out of date / example; we already
have 210 engines and not just 70. To avoid having to change the quantities
manually in the future, they are now calculated from the jinja context

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-23 16:33:08 +01:00
Markus Heiser c4b874e9b0 [fix] engine Library of Congress: fix API URL loc.gov -> www.loc.gov
Avoid HTTP 404 and redirects. Requests to the JSON/YAML API use the base url [1]

    https://www.loc.gov/{endpoint}/?fo=json

[1] https://www.loc.gov/apis/json-and-yaml/requests/

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-23 13:02:24 +01:00
Markus Heiser 7c4e4ebd40 [log] warning with URL in case of 'raise_for_httperror'
In order to be able to implement error handling, it is necessary to know which
URL triggered the exception / the URL has not yet been logged.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-23 11:33:19 +01:00
Grant Lanham 27aa9c4cb1 Updates to naming, pylint fixing 2024-08-25 16:34:10 -04:00
Grant Lanham 118a748fba Inject log_level into error_recorder
format
2024-08-25 01:03:38 -04:00
Grant Lanham 9f7244d6f1 Implement "engine_exc_info" to allow removing exception logging in Checker 2024-08-25 00:32:37 -04:00
9 changed files with 82 additions and 119 deletions

View File

@ -4,22 +4,27 @@ Welcome to SearXNG
*Search without being tracked.*
SearXNG is a free internet metasearch engine which aggregates results from more
than 70 search services. Users are neither tracked nor profiled. Additionally,
SearXNG can be used over Tor for online anonymity.
.. jinja:: searx
SearXNG is a free internet metasearch engine which aggregates results from up
to {{engines | length}} :ref:`search services <configured engines>`. Users
are neither tracked nor profiled. Additionally, SearXNG can be used over Tor
for online anonymity.
Get started with SearXNG by using one of the instances listed at searx.space_.
If you don't trust anyone, you can set up your own, see :ref:`installation`.
.. jinja:: searx
.. sidebar:: features
- :ref:`self hosted <installation>`
- :ref:`no user tracking / no profiling <SearXNG protect privacy>`
- script & cookies are optional
- secure, encrypted connections
- :ref:`about 200 search engines <configured engines>`
- `about 60 translations <https://translate.codeberg.org/projects/searxng/searxng/>`_
- about 100 `well maintained <https://uptime.searxng.org/>`__ instances on searx.space_
- :ref:`{{engines | length}} search engines <configured engines>`
- `58 translations <https://translate.codeberg.org/projects/searxng/searxng/>`_
- about 70 `well maintained <https://uptime.searxng.org/>`__ instances on searx.space_
- :ref:`easy integration of search engines <demo online engine>`
- professional development: `CI <https://github.com/searxng/searxng/actions>`_,
`quality assurance <https://dev.searxng.org/>`_ &

View File

@ -1,71 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Internet Archive scholar(science)
"""
from datetime import datetime
from urllib.parse import urlencode
from searx.utils import html_to_text
about = {
"website": "https://scholar.archive.org/",
"wikidata_id": "Q115667709",
"official_api_documentation": "https://scholar.archive.org/api/redoc",
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
categories = ['science', 'scientific publications']
paging = True
base_url = "https://scholar.archive.org"
results_per_page = 15
def request(query, params):
args = {
"q": query,
"limit": results_per_page,
"offset": (params["pageno"] - 1) * results_per_page,
}
params["url"] = f"{base_url}/search?{urlencode(args)}"
params["headers"]["Accept"] = "application/json"
return params
def response(resp):
results = []
json = resp.json()
for result in json["results"]:
publishedDate, content, doi = None, '', None
if result['biblio'].get('release_date'):
publishedDate = datetime.strptime(result['biblio']['release_date'], "%Y-%m-%d")
if len(result['abstracts']) > 0:
content = result['abstracts'][0].get('body')
elif len(result['_highlights']) > 0:
content = result['_highlights'][0]
if len(result['releases']) > 0:
doi = result['releases'][0].get('doi')
results.append(
{
'template': 'paper.html',
'url': result['fulltext']['access_url'],
'title': result['biblio'].get('title') or result['biblio'].get('container_name'),
'content': html_to_text(content),
'publisher': result['biblio'].get('publisher'),
'doi': doi,
'journal': result['biblio'].get('container_name'),
'authors': result['biblio'].get('contrib_names'),
'tags': result['tags'],
'publishedDate': publishedDate,
'issns': result['biblio'].get('issns'),
'pdf_url': result['fulltext'].get('access_url'),
}
)
return results

View File

@ -27,7 +27,7 @@ categories = ['images']
paging = True
endpoint = 'photos'
base_url = 'https://loc.gov'
base_url = 'https://www.loc.gov'
search_string = "/{endpoint}/?sp={page}&{query}&fo=json"

View File

@ -3,6 +3,7 @@
import typing
import inspect
import logging
from json import JSONDecodeError
from urllib.parse import urlparse
from httpx import HTTPError, HTTPStatusError
@ -30,10 +31,20 @@ class ErrorContext: # pylint: disable=missing-class-docstring
'log_message',
'log_parameters',
'secondary',
'log_level',
)
def __init__( # pylint: disable=too-many-arguments
self, filename, function, line_no, code, exception_classname, log_message, log_parameters, secondary
self,
filename,
function,
line_no,
code,
exception_classname,
log_message,
log_parameters,
secondary,
log_level=logging.WARN,
):
self.filename = filename
self.function = function
@ -43,6 +54,7 @@ class ErrorContext: # pylint: disable=missing-class-docstring
self.log_message = log_message
self.log_parameters = log_parameters
self.secondary = secondary
self.log_level: int = log_level
def __eq__(self, o) -> bool: # pylint: disable=invalid-name
if not isinstance(o, ErrorContext):
@ -56,6 +68,7 @@ class ErrorContext: # pylint: disable=missing-class-docstring
and self.log_message == o.log_message
and self.log_parameters == o.log_parameters
and self.secondary == o.secondary
and self.log_level == o.log_level
)
def __hash__(self):
@ -69,11 +82,12 @@ class ErrorContext: # pylint: disable=missing-class-docstring
self.log_message,
self.log_parameters,
self.secondary,
self.log_level,
)
)
def __repr__(self):
return "ErrorContext({!r}, {!r}, {!r}, {!r}, {!r}, {!r}) {!r}".format(
return "ErrorContext({!r}, {!r}, {!r}, {!r}, {!r}, {!r}), {!r}, {!r}".format(
self.filename,
self.line_no,
self.code,
@ -81,13 +95,14 @@ class ErrorContext: # pylint: disable=missing-class-docstring
self.log_message,
self.log_parameters,
self.secondary,
self.log_level,
)
def add_error_context(engine_name: str, error_context: ErrorContext) -> None:
errors_for_engine = errors_per_engines.setdefault(engine_name, {})
errors_for_engine[error_context] = errors_for_engine.get(error_context, 0) + 1
engines[engine_name].logger.warning('%s', str(error_context))
engines[engine_name].logger.log(error_context.log_level, '%s', str(error_context))
def get_trace(traces):
@ -157,7 +172,9 @@ def get_exception_classname(exc: Exception) -> str:
return exc_module + '.' + exc_name
def get_error_context(framerecords, exception_classname, log_message, log_parameters, secondary) -> ErrorContext:
def get_error_context(
framerecords, exception_classname, log_message, log_parameters, secondary, log_level: int
) -> ErrorContext:
searx_frame = get_trace(framerecords)
filename = searx_frame.filename
if filename.startswith(searx_parent_dir):
@ -166,30 +183,36 @@ def get_error_context(framerecords, exception_classname, log_message, log_parame
line_no = searx_frame.lineno
code = searx_frame.code_context[0].strip()
del framerecords
return ErrorContext(filename, function, line_no, code, exception_classname, log_message, log_parameters, secondary)
return ErrorContext(
filename, function, line_no, code, exception_classname, log_message, log_parameters, secondary, log_level
)
def count_exception(engine_name: str, exc: Exception, secondary: bool = False) -> None:
def count_exception(engine_name: str, exc: Exception, secondary: bool = False, log_level=logging.WARN) -> None:
if not settings['general']['enable_metrics']:
return
framerecords = inspect.trace()
try:
exception_classname = get_exception_classname(exc)
log_parameters = get_messages(exc, framerecords[-1][1])
error_context = get_error_context(framerecords, exception_classname, None, log_parameters, secondary)
error_context = get_error_context(framerecords, exception_classname, None, log_parameters, secondary, log_level)
add_error_context(engine_name, error_context)
finally:
del framerecords
def count_error(
engine_name: str, log_message: str, log_parameters: typing.Optional[typing.Tuple] = None, secondary: bool = False
engine_name: str,
log_message: str,
log_parameters: typing.Optional[typing.Tuple] = None,
secondary: bool = False,
log_level: int = logging.WARN,
) -> None:
if not settings['general']['enable_metrics']:
return
framerecords = list(reversed(inspect.stack()[1:]))
try:
error_context = get_error_context(framerecords, None, log_message, log_parameters or (), secondary)
error_context = get_error_context(framerecords, None, log_message, log_parameters or (), secondary, log_level)
add_error_context(engine_name, error_context)
finally:
del framerecords

View File

@ -233,8 +233,7 @@ class Network:
del kwargs['raise_for_httperror']
return do_raise_for_httperror
@staticmethod
def patch_response(response, do_raise_for_httperror):
def patch_response(self, response, do_raise_for_httperror):
if isinstance(response, httpx.Response):
# requests compatibility (response is not streamed)
# see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
@ -242,8 +241,11 @@ class Network:
# raise an exception
if do_raise_for_httperror:
try:
raise_for_httperror(response)
except:
self._logger.warning(f"HTTP Request failed: {response.request.method} {response.request.url}")
raise
return response
def is_valid_response(self, response):
@ -269,7 +271,7 @@ class Network:
else:
response = await client.request(method, url, **kwargs)
if self.is_valid_response(response) or retries <= 0:
return Network.patch_response(response, do_raise_for_httperror)
return self.patch_response(response, do_raise_for_httperror)
except httpx.RemoteProtocolError as e:
if not was_disconnected:
# the server has closed the connection:

View File

@ -357,6 +357,7 @@ class Checker: # pylint: disable=missing-class-docstring
def __init__(self, processor: EngineProcessor):
self.processor = processor
self.processor.log_engine_exc_info = False # Remove exception information from errors to reduce verbosity
self.tests = self.processor.get_tests()
self.test_results = TestResults()
@ -418,8 +419,10 @@ class Checker: # pylint: disable=missing-class-docstring
result_container_check.check_basic()
return result_container_check
def run_test(self, test_name):
def run_test(self, test_name: str):
test_parameters = self.tests[test_name]
# Not really a warning, but an info log will not appear
logger.warning('---%s---', test_name)
search_query_list = list(Checker.search_query_matrix_iterator(self.engineref_list, test_parameters['matrix']))
rct_list = [self.get_result_container_tests(test_name, search_query) for search_query in search_query_list]
stop_test = False

View File

@ -3,6 +3,8 @@
"""
import logging
from logging import Logger
import threading
from abc import abstractmethod, ABC
from timeit import default_timer
@ -58,12 +60,13 @@ class SuspendedStatus:
class EngineProcessor(ABC):
"""Base classes used for all types of request processors."""
__slots__ = 'engine', 'engine_name', 'lock', 'suspended_status', 'logger'
__slots__ = 'engine', 'engine_name', 'lock', 'suspended_status', 'logger', 'log_engine_exc_info'
def __init__(self, engine, engine_name: str):
self.engine = engine
self.engine_name = engine_name
self.logger = engines[engine_name].logger
self.logger: Logger = engines[engine_name].logger
self.log_engine_exc_info = True
key = get_network(self.engine_name)
key = id(key) if key else self.engine_name
self.suspended_status = SUSPENDED_STATUS.setdefault(key, SuspendedStatus())
@ -82,6 +85,10 @@ class EngineProcessor(ABC):
def has_initialize_function(self):
return hasattr(self.engine, 'init')
@property
def metrics_log_level(self) -> int:
return logging.WARN if self.log_engine_exc_info else logging.NOTSET
def handle_exception(self, result_container, exception_or_message, suspend=False):
# update result_container
if isinstance(exception_or_message, BaseException):
@ -95,9 +102,9 @@ class EngineProcessor(ABC):
# metrics
counter_inc('engine', self.engine_name, 'search', 'count', 'error')
if isinstance(exception_or_message, BaseException):
count_exception(self.engine_name, exception_or_message)
count_exception(self.engine_name, exception_or_message, log_level=self.metrics_log_level)
else:
count_error(self.engine_name, exception_or_message)
count_error(self.engine_name, exception_or_message, log_level=self.metrics_log_level)
# suspend the engine ?
if suspend:
suspended_time = None

View File

@ -127,6 +127,7 @@ class OnlineProcessor(EngineProcessor):
'{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects),
(status_code, reason, hostname),
secondary=True,
log_level=self.metrics_log_level,
)
return response
@ -137,9 +138,6 @@ class OnlineProcessor(EngineProcessor):
self.engine.request(query, params)
# ignoring empty urls
if params['url'] is None:
return None
if not params['url']:
return None
@ -180,20 +178,21 @@ class OnlineProcessor(EngineProcessor):
self.logger.exception(
"requests exception (search duration : {0} s, timeout: {1} s) : {2}".format(
default_timer() - start_time, timeout_limit, e
)
),
exc_info=self.log_engine_exc_info,
)
except SearxEngineCaptchaException as e:
self.handle_exception(result_container, e, suspend=True)
self.logger.exception('CAPTCHA')
self.logger.exception('CAPTCHA', exc_info=self.log_engine_exc_info)
except SearxEngineTooManyRequestsException as e:
self.handle_exception(result_container, e, suspend=True)
self.logger.exception('Too many requests')
self.logger.exception('Too many requests', exc_info=self.log_engine_exc_info)
except SearxEngineAccessDeniedException as e:
self.handle_exception(result_container, e, suspend=True)
self.logger.exception('SearXNG is blocked')
self.logger.exception('SearXNG is blocked', exc_info=self.log_engine_exc_info)
except Exception as e: # pylint: disable=broad-except
self.handle_exception(result_container, e)
self.logger.exception('exception : {0}'.format(e))
self.logger.exception('exception : {0}'.format(e), exc_info=self.log_engine_exc_info)
def get_default_tests(self):
tests = {}

View File

@ -1622,11 +1622,6 @@ engines:
api_site: 'askubuntu'
categories: [it, q&a]
- name: internetarchivescholar
engine: internet_archive_scholar
shortcut: ias
timeout: 15.0
- name: superuser
engine: stackexchange
shortcut: su