Compare commits


8 Commits

Author SHA1 Message Date
Grant Lanham Jr ab27d6b45b
Merge 27aa9c4cb1 into 0f9694c90b 2024-11-24 02:01:36 +00:00
Markus Heiser 0f9694c90b [clean] Internet Archive Scholar search API no longer exists
The engine was added in #2733, but the API no longer exists. Related:

- https://github.com/searxng/searxng/issues/4038

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-23 17:59:38 +01:00
Markus Heiser ccc4f30b20 [doc] update quantities on the intro page
The quantities on the intro page were partly out of date; for example, we already
have 210 engines and not just 70. To avoid having to change the quantities
manually in the future, they are now calculated from the jinja context

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-23 16:33:08 +01:00
Markus Heiser c4b874e9b0 [fix] engine Library of Congress: fix API URL loc.gov -> www.loc.gov
Avoid HTTP 404 and redirects. Requests to the JSON/YAML API use the base url [1]

    https://www.loc.gov/{endpoint}/?fo=json

[1] https://www.loc.gov/apis/json-and-yaml/requests/

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-23 13:02:24 +01:00
Markus Heiser 7c4e4ebd40 [log] warning with URL in case of 'raise_for_httperror'
In order to implement error handling, it is necessary to know which URL
triggered the exception; until now, that URL was not logged.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-23 11:33:19 +01:00
Grant Lanham 27aa9c4cb1 Updates to naming, pylint fixing 2024-08-25 16:34:10 -04:00
Grant Lanham 118a748fba Inject log_level into error_recorder
format
2024-08-25 01:03:38 -04:00
Grant Lanham 9f7244d6f1 Implement "engine_exc_info" to allow removing exception logging in Checker 2024-08-25 00:32:37 -04:00
9 changed files with 82 additions and 119 deletions

View File

@@ -4,26 +4,31 @@ Welcome to SearXNG
 *Search without being tracked.*

-SearXNG is a free internet metasearch engine which aggregates results from more
-than 70 search services. Users are neither tracked nor profiled. Additionally,
-SearXNG can be used over Tor for online anonymity.
+.. jinja:: searx
+
+   SearXNG is a free internet metasearch engine which aggregates results from up
+   to {{engines | length}} :ref:`search services <configured engines>`. Users
+   are neither tracked nor profiled. Additionally, SearXNG can be used over Tor
+   for online anonymity.

 Get started with SearXNG by using one of the instances listed at searx.space_.
 If you don't trust anyone, you can set up your own, see :ref:`installation`.

-.. sidebar:: features
+.. jinja:: searx

-   - :ref:`self hosted <installation>`
-   - :ref:`no user tracking / no profiling <SearXNG protect privacy>`
-   - script & cookies are optional
-   - secure, encrypted connections
-   - :ref:`about 200 search engines <configured engines>`
-   - `about 60 translations <https://translate.codeberg.org/projects/searxng/searxng/>`_
-   - about 100 `well maintained <https://uptime.searxng.org/>`__ instances on searx.space_
-   - :ref:`easy integration of search engines <demo online engine>`
-   - professional development: `CI <https://github.com/searxng/searxng/actions>`_,
-     `quality assurance <https://dev.searxng.org/>`_ &
-     `automated tested UI <https://dev.searxng.org/screenshots.html>`_
+   .. sidebar:: features
+
+      - :ref:`self hosted <installation>`
+      - :ref:`no user tracking / no profiling <SearXNG protect privacy>`
+      - script & cookies are optional
+      - secure, encrypted connections
+      - :ref:`{{engines | length}} search engines <configured engines>`
+      - `58 translations <https://translate.codeberg.org/projects/searxng/searxng/>`_
+      - about 70 `well maintained <https://uptime.searxng.org/>`__ instances on searx.space_
+      - :ref:`easy integration of search engines <demo online engine>`
+      - professional development: `CI <https://github.com/searxng/searxng/actions>`_,
+        `quality assurance <https://dev.searxng.org/>`_ &
+        `automated tested UI <https://dev.searxng.org/screenshots.html>`_

 .. sidebar:: be a part
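
The `{{engines | length}}` placeholders above are filled in when the documentation is built, from the jinja context named in the `.. jinja:: searx` directive. As a rough, self-contained sketch of that idea using plain jinja2 (the three-entry `engines` list is a made-up stand-in, not the real engine registry, and this is not the actual SearXNG doc build code):

# Stand-alone sketch of rendering a quantity from a jinja context; the real
# docs rely on the ``.. jinja:: searx`` directive shown in the diff above.
from jinja2 import Template

engines = ["engine_a", "engine_b", "engine_c"]  # hypothetical stand-in list

template = Template(
    "SearXNG aggregates results from up to {{ engines | length }} search services."
)
print(template.render(engines=engines))
# -> SearXNG aggregates results from up to 3 search services.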

View File

@@ -1,71 +0,0 @@
-# SPDX-License-Identifier: AGPL-3.0-or-later
-"""Internet Archive scholar(science)
-"""
-
-from datetime import datetime
-from urllib.parse import urlencode
-
-from searx.utils import html_to_text
-
-about = {
-    "website": "https://scholar.archive.org/",
-    "wikidata_id": "Q115667709",
-    "official_api_documentation": "https://scholar.archive.org/api/redoc",
-    "use_official_api": True,
-    "require_api_key": False,
-    "results": "JSON",
-}
-categories = ['science', 'scientific publications']
-paging = True
-
-base_url = "https://scholar.archive.org"
-results_per_page = 15
-
-
-def request(query, params):
-    args = {
-        "q": query,
-        "limit": results_per_page,
-        "offset": (params["pageno"] - 1) * results_per_page,
-    }
-    params["url"] = f"{base_url}/search?{urlencode(args)}"
-    params["headers"]["Accept"] = "application/json"
-    return params
-
-
-def response(resp):
-    results = []
-
-    json = resp.json()
-
-    for result in json["results"]:
-        publishedDate, content, doi = None, '', None
-
-        if result['biblio'].get('release_date'):
-            publishedDate = datetime.strptime(result['biblio']['release_date'], "%Y-%m-%d")
-
-        if len(result['abstracts']) > 0:
-            content = result['abstracts'][0].get('body')
-        elif len(result['_highlights']) > 0:
-            content = result['_highlights'][0]
-
-        if len(result['releases']) > 0:
-            doi = result['releases'][0].get('doi')
-
-        results.append(
-            {
-                'template': 'paper.html',
-                'url': result['fulltext']['access_url'],
-                'title': result['biblio'].get('title') or result['biblio'].get('container_name'),
-                'content': html_to_text(content),
-                'publisher': result['biblio'].get('publisher'),
-                'doi': doi,
-                'journal': result['biblio'].get('container_name'),
-                'authors': result['biblio'].get('contrib_names'),
-                'tags': result['tags'],
-                'publishedDate': publishedDate,
-                'issns': result['biblio'].get('issns'),
-                'pdf_url': result['fulltext'].get('access_url'),
-            }
-        )
-    return results

View File

@@ -27,7 +27,7 @@ categories = ['images']
 paging = True
 endpoint = 'photos'

-base_url = 'https://loc.gov'
+base_url = 'https://www.loc.gov'
 search_string = "/{endpoint}/?sp={page}&{query}&fo=json"
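
For illustration, a minimal sketch of how the values in this hunk are assembled into a request URL; this is not the engine's actual request() code, and the example query is made up:

# Hypothetical composition of a Library of Congress API request from the
# engine's module-level settings; with the old base_url ('https://loc.gov')
# the same path answered with a redirect, which the change above avoids.
from urllib.parse import urlencode

base_url = 'https://www.loc.gov'
endpoint = 'photos'
search_string = "/{endpoint}/?sp={page}&{query}&fo=json"

url = base_url + search_string.format(
    endpoint=endpoint,
    page=1,
    query=urlencode({'q': 'new york'}),
)
print(url)
# -> https://www.loc.gov/photos/?sp=1&q=new+york&fo=json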

View File

@@ -3,6 +3,7 @@
 import typing
 import inspect
+import logging
 from json import JSONDecodeError
 from urllib.parse import urlparse

 from httpx import HTTPError, HTTPStatusError
@@ -30,10 +31,20 @@ class ErrorContext:  # pylint: disable=missing-class-docstring
         'log_message',
         'log_parameters',
         'secondary',
+        'log_level',
     )

     def __init__(  # pylint: disable=too-many-arguments
-        self, filename, function, line_no, code, exception_classname, log_message, log_parameters, secondary
+        self,
+        filename,
+        function,
+        line_no,
+        code,
+        exception_classname,
+        log_message,
+        log_parameters,
+        secondary,
+        log_level=logging.WARN,
     ):
         self.filename = filename
         self.function = function
@@ -43,6 +54,7 @@ class ErrorContext:  # pylint: disable=missing-class-docstring
         self.log_message = log_message
         self.log_parameters = log_parameters
         self.secondary = secondary
+        self.log_level: int = log_level

     def __eq__(self, o) -> bool:  # pylint: disable=invalid-name
         if not isinstance(o, ErrorContext):
@@ -56,6 +68,7 @@ class ErrorContext:  # pylint: disable=missing-class-docstring
             and self.log_message == o.log_message
             and self.log_parameters == o.log_parameters
             and self.secondary == o.secondary
+            and self.log_level == o.log_level
         )

     def __hash__(self):
@@ -69,11 +82,12 @@ class ErrorContext:  # pylint: disable=missing-class-docstring
                 self.log_message,
                 self.log_parameters,
                 self.secondary,
+                self.log_level,
             )
         )

     def __repr__(self):
-        return "ErrorContext({!r}, {!r}, {!r}, {!r}, {!r}, {!r}) {!r}".format(
+        return "ErrorContext({!r}, {!r}, {!r}, {!r}, {!r}, {!r}), {!r}, {!r}".format(
             self.filename,
             self.line_no,
             self.code,
@@ -81,13 +95,14 @@ class ErrorContext:  # pylint: disable=missing-class-docstring
             self.log_message,
             self.log_parameters,
             self.secondary,
+            self.log_level,
         )


 def add_error_context(engine_name: str, error_context: ErrorContext) -> None:
     errors_for_engine = errors_per_engines.setdefault(engine_name, {})
     errors_for_engine[error_context] = errors_for_engine.get(error_context, 0) + 1
-    engines[engine_name].logger.warning('%s', str(error_context))
+    engines[engine_name].logger.log(error_context.log_level, '%s', str(error_context))


 def get_trace(traces):
@@ -157,7 +172,9 @@ def get_exception_classname(exc: Exception) -> str:
     return exc_module + '.' + exc_name


-def get_error_context(framerecords, exception_classname, log_message, log_parameters, secondary) -> ErrorContext:
+def get_error_context(
+    framerecords, exception_classname, log_message, log_parameters, secondary, log_level: int
+) -> ErrorContext:
     searx_frame = get_trace(framerecords)
     filename = searx_frame.filename
     if filename.startswith(searx_parent_dir):
@@ -166,30 +183,36 @@ def get_error_context(framerecords, exception_classname, log_message, log_parameters, secondary) -> ErrorContext:
     line_no = searx_frame.lineno
     code = searx_frame.code_context[0].strip()
     del framerecords
-    return ErrorContext(filename, function, line_no, code, exception_classname, log_message, log_parameters, secondary)
+    return ErrorContext(
+        filename, function, line_no, code, exception_classname, log_message, log_parameters, secondary, log_level
+    )


-def count_exception(engine_name: str, exc: Exception, secondary: bool = False) -> None:
+def count_exception(engine_name: str, exc: Exception, secondary: bool = False, log_level=logging.WARN) -> None:
     if not settings['general']['enable_metrics']:
         return
     framerecords = inspect.trace()
     try:
         exception_classname = get_exception_classname(exc)
         log_parameters = get_messages(exc, framerecords[-1][1])
-        error_context = get_error_context(framerecords, exception_classname, None, log_parameters, secondary)
+        error_context = get_error_context(framerecords, exception_classname, None, log_parameters, secondary, log_level)
         add_error_context(engine_name, error_context)
     finally:
         del framerecords


 def count_error(
-    engine_name: str, log_message: str, log_parameters: typing.Optional[typing.Tuple] = None, secondary: bool = False
+    engine_name: str,
+    log_message: str,
+    log_parameters: typing.Optional[typing.Tuple] = None,
+    secondary: bool = False,
+    log_level: int = logging.WARN,
 ) -> None:
     if not settings['general']['enable_metrics']:
         return
     framerecords = list(reversed(inspect.stack()[1:]))
     try:
-        error_context = get_error_context(framerecords, None, log_message, log_parameters or (), secondary)
+        error_context = get_error_context(framerecords, None, log_message, log_parameters or (), secondary, log_level)
         add_error_context(engine_name, error_context)
     finally:
         del framerecords
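
The practical effect of the new `log_level` field is easiest to see in isolation. Below is a small, self-contained sketch of the mechanism; the logger name is invented, and in the real code path it is `add_error_context()` above that calls `logger.log(error_context.log_level, ...)`:

# logger.log() with logging.WARN keeps the previous behaviour, while
# logging.NOTSET (0) falls below any usual threshold, so the error is still
# counted by the recorder but no longer printed.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("searx.engines.example")  # made-up logger name

for log_level in (logging.WARN, logging.NOTSET):
    logger.log(log_level, "ErrorContext(... log_level=%s ...)", log_level)
# Only the WARN record is emitted; the NOTSET record is suppressed.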

View File

@@ -233,8 +233,7 @@ class Network:
             del kwargs['raise_for_httperror']
         return do_raise_for_httperror

-    @staticmethod
-    def patch_response(response, do_raise_for_httperror):
+    def patch_response(self, response, do_raise_for_httperror):
         if isinstance(response, httpx.Response):
             # requests compatibility (response is not streamed)
             # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
@@ -242,8 +241,11 @@ class Network:
             # raise an exception
             if do_raise_for_httperror:
-                raise_for_httperror(response)
+                try:
+                    raise_for_httperror(response)
+                except:
+                    self._logger.warning(f"HTTP Request failed: {response.request.method} {response.request.url}")
+                    raise

         return response

     def is_valid_response(self, response):
@@ -269,7 +271,7 @@ class Network:
                 else:
                     response = await client.request(method, url, **kwargs)
                 if self.is_valid_response(response) or retries <= 0:
-                    return Network.patch_response(response, do_raise_for_httperror)
+                    return self.patch_response(response, do_raise_for_httperror)
             except httpx.RemoteProtocolError as e:
                 if not was_disconnected:
                     # the server has closed the connection:

View File

@@ -357,6 +357,7 @@ class Checker:  # pylint: disable=missing-class-docstring
     def __init__(self, processor: EngineProcessor):
         self.processor = processor
+        self.processor.log_engine_exc_info = False  # Remove exception information from errors to reduce verbosity
         self.tests = self.processor.get_tests()
         self.test_results = TestResults()

@@ -418,8 +419,10 @@ class Checker:  # pylint: disable=missing-class-docstring
         result_container_check.check_basic()
         return result_container_check

-    def run_test(self, test_name):
+    def run_test(self, test_name: str):
         test_parameters = self.tests[test_name]
+        # Not really a warning, but an info log will not appear
+        logger.warning('---%s---', test_name)
         search_query_list = list(Checker.search_query_matrix_iterator(self.engineref_list, test_parameters['matrix']))
         rct_list = [self.get_result_container_tests(test_name, search_query) for search_query in search_query_list]
         stop_test = False

View File

@@ -3,6 +3,8 @@
 """

+import logging
+from logging import Logger
 import threading
 from abc import abstractmethod, ABC
 from timeit import default_timer
@@ -58,12 +60,13 @@ class SuspendedStatus:
 class EngineProcessor(ABC):
     """Base classes used for all types of request processors."""

-    __slots__ = 'engine', 'engine_name', 'lock', 'suspended_status', 'logger'
+    __slots__ = 'engine', 'engine_name', 'lock', 'suspended_status', 'logger', 'log_engine_exc_info'

     def __init__(self, engine, engine_name: str):
         self.engine = engine
         self.engine_name = engine_name
-        self.logger = engines[engine_name].logger
+        self.logger: Logger = engines[engine_name].logger
+        self.log_engine_exc_info = True
         key = get_network(self.engine_name)
         key = id(key) if key else self.engine_name
         self.suspended_status = SUSPENDED_STATUS.setdefault(key, SuspendedStatus())
@@ -82,6 +85,10 @@ class EngineProcessor(ABC):
     def has_initialize_function(self):
         return hasattr(self.engine, 'init')

+    @property
+    def metrics_log_level(self) -> int:
+        return logging.WARN if self.log_engine_exc_info else logging.NOTSET
+
     def handle_exception(self, result_container, exception_or_message, suspend=False):
         # update result_container
         if isinstance(exception_or_message, BaseException):
@@ -95,9 +102,9 @@ class EngineProcessor(ABC):
         # metrics
         counter_inc('engine', self.engine_name, 'search', 'count', 'error')
         if isinstance(exception_or_message, BaseException):
-            count_exception(self.engine_name, exception_or_message)
+            count_exception(self.engine_name, exception_or_message, log_level=self.metrics_log_level)
         else:
-            count_error(self.engine_name, exception_or_message)
+            count_error(self.engine_name, exception_or_message, log_level=self.metrics_log_level)
         # suspend the engine ?
         if suspend:
             suspended_time = None
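
Taken together with the Checker change above (`self.processor.log_engine_exc_info = False`), the new property acts as a quiet/verbose switch for error recording. A minimal stand-in sketch of just that behaviour; ProcessorSketch is not the real EngineProcessor:

import logging


class ProcessorSketch:
    """Stand-in reproducing only the metrics_log_level logic from the diff."""

    def __init__(self) -> None:
        self.log_engine_exc_info = True  # default set in EngineProcessor.__init__

    @property
    def metrics_log_level(self) -> int:
        return logging.WARN if self.log_engine_exc_info else logging.NOTSET


processor = ProcessorSketch()
assert processor.metrics_log_level == logging.WARN    # normal search path

processor.log_engine_exc_info = False                 # what the Checker does
assert processor.metrics_log_level == logging.NOTSET  # errors counted, logs quiet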

View File

@@ -127,6 +127,7 @@ class OnlineProcessor(EngineProcessor):
                 '{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects),
                 (status_code, reason, hostname),
                 secondary=True,
+                log_level=self.metrics_log_level,
             )

         return response
@@ -137,9 +138,6 @@ class OnlineProcessor(EngineProcessor):
         self.engine.request(query, params)

         # ignoring empty urls
-        if params['url'] is None:
-            return None
-
         if not params['url']:
             return None

@@ -180,20 +178,21 @@ class OnlineProcessor(EngineProcessor):
             self.logger.exception(
                 "requests exception (search duration : {0} s, timeout: {1} s) : {2}".format(
                     default_timer() - start_time, timeout_limit, e
-                )
+                ),
+                exc_info=self.log_engine_exc_info,
             )
         except SearxEngineCaptchaException as e:
             self.handle_exception(result_container, e, suspend=True)
-            self.logger.exception('CAPTCHA')
+            self.logger.exception('CAPTCHA', exc_info=self.log_engine_exc_info)
         except SearxEngineTooManyRequestsException as e:
             self.handle_exception(result_container, e, suspend=True)
-            self.logger.exception('Too many requests')
+            self.logger.exception('Too many requests', exc_info=self.log_engine_exc_info)
         except SearxEngineAccessDeniedException as e:
             self.handle_exception(result_container, e, suspend=True)
-            self.logger.exception('SearXNG is blocked')
+            self.logger.exception('SearXNG is blocked', exc_info=self.log_engine_exc_info)
         except Exception as e:  # pylint: disable=broad-except
             self.handle_exception(result_container, e)
-            self.logger.exception('exception : {0}'.format(e))
+            self.logger.exception('exception : {0}'.format(e), exc_info=self.log_engine_exc_info)

     def get_default_tests(self):
         tests = {}

View File

@@ -1622,11 +1622,6 @@ engines:
     api_site: 'askubuntu'
     categories: [it, q&a]

-  - name: internetarchivescholar
-    engine: internet_archive_scholar
-    shortcut: ias
-    timeout: 15.0
-
   - name: superuser
     engine: stackexchange
     shortcut: su