Compare commits

6 Commits

Author SHA1 Message Date
Alexandre Flament 266fc91825
Merge 4398ce059f into 0f9694c90b 2024-11-24 02:01:35 +00:00
Markus Heiser 0f9694c90b [clean] Internet Archive Scholar search API no longer exists
The engine was added in #2733, but the API no longer exists. Related:

- https://github.com/searxng/searxng/issues/4038

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-23 17:59:38 +01:00
Markus Heiser ccc4f30b20 [doc] update quantities on the intro page
The quantities on the intro page were partly out of date; for example, we already
have 210 engines, not just 70. To avoid having to change the quantities manually
in the future, they are now calculated from the jinja context.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-23 16:33:08 +01:00
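
For reference, the sphinx-jinja extension takes its context from conf.py; a minimal
sketch of how the engine count could be handed to the docs this way (the import and
variable names are assumptions for illustration, not necessarily SearXNG's actual conf.py):

    # conf.py -- illustrative sphinx-jinja wiring (assumed names)
    from searx import settings            # assumption: the loaded settings expose the engine list

    extensions = ['sphinx_jinja']

    jinja_contexts = {
        'searx': {
            'engines': settings.get('engines', []),   # rendered as {{ engines | length }} in the docs
        },
    }
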
Markus Heiser c4b874e9b0 [fix] engine Library of Congress: fix API URL loc.gov -> www.loc.gov
Avoids HTTP 404 responses and redirects. Requests to the JSON/YAML API use the base URL [1]

    https://www.loc.gov/{endpoint}/?fo=json

[1] https://www.loc.gov/apis/json-and-yaml/requests/

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-23 13:02:24 +01:00
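
As a rough illustration of the corrected request form (a sketch for this note only,
not the engine's actual request() code; the 'q' parameter is an assumption):

    from urllib.parse import urlencode

    base_url = 'https://www.loc.gov'   # formerly 'https://loc.gov', which answered with redirects / HTTP 404
    endpoint = 'photos'

    def loc_search_url(query, page=1):
        # JSON/YAML API requests follow the form https://www.loc.gov/{endpoint}/?fo=json
        return f"{base_url}/{endpoint}/?sp={page}&{urlencode({'q': query})}&fo=json"

    print(loc_search_url('library'))   # https://www.loc.gov/photos/?sp=1&q=library&fo=json
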
Markus Heiser 7c4e4ebd40 [log] warning with URL in case of 'raise_for_httperror'
In order to implement error handling, it is necessary to know which URL triggered
the exception; until now, that URL was not logged.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2024-11-23 11:33:19 +01:00
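
The pattern, roughly (a simplified sketch of the idea; the actual change lives in
Network.patch_response below):

    from searx.network import raise_for_httperror

    def check_response(response, logger):
        # log the offending URL before re-raising, so error handling can tell
        # which request actually failed
        try:
            raise_for_httperror(response)
        except Exception:
            logger.warning("HTTP Request failed: %s %s", response.request.method, response.request.url)
            raise
        return response
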
Alexandre Flament 4398ce059f Add baidu engine (experimental) 2022-07-26 10:52:35 +02:00
7 changed files with 203 additions and 100 deletions

@@ -4,22 +4,27 @@ Welcome to SearXNG
 *Search without being tracked.*

-SearXNG is a free internet metasearch engine which aggregates results from more
-than 70 search services. Users are neither tracked nor profiled. Additionally,
-SearXNG can be used over Tor for online anonymity.
+.. jinja:: searx
+
+   SearXNG is a free internet metasearch engine which aggregates results from up
+   to {{engines | length}} :ref:`search services <configured engines>`. Users
+   are neither tracked nor profiled. Additionally, SearXNG can be used over Tor
+   for online anonymity.

 Get started with SearXNG by using one of the instances listed at searx.space_.
 If you don't trust anyone, you can set up your own, see :ref:`installation`.

+.. jinja:: searx
+
 .. sidebar:: features

    - :ref:`self hosted <installation>`
    - :ref:`no user tracking / no profiling <SearXNG protect privacy>`
    - script & cookies are optional
    - secure, encrypted connections
-   - :ref:`about 200 search engines <configured engines>`
-   - `about 60 translations <https://translate.codeberg.org/projects/searxng/searxng/>`_
-   - about 100 `well maintained <https://uptime.searxng.org/>`__ instances on searx.space_
+   - :ref:`{{engines | length}} search engines <configured engines>`
+   - `58 translations <https://translate.codeberg.org/projects/searxng/searxng/>`_
+   - about 70 `well maintained <https://uptime.searxng.org/>`__ instances on searx.space_
    - :ref:`easy integration of search engines <demo online engine>`
    - professional development: `CI <https://github.com/searxng/searxng/actions>`_,
      `quality assurance <https://dev.searxng.org/>`_ &

searx/engines/baidu.py (new file, 169 lines added)

@@ -0,0 +1,169 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Bing (Web)
- https://github.com/searx/searx/issues/2019#issuecomment-648227442
"""
import re
from urllib.parse import urlencode
from lxml import html
from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
from searx.network import raise_for_httperror, multi_requests, get, Request
from searx.exceptions import SearxEngineCaptchaException
about = {
    "website": 'https://www.baidu.com',
    "wikidata_id": 'Q14772',
    "official_api_documentation": 'https://apis.baidu.com/',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
    "language": 'zh',
}
# engine dependent config
categories = ['general', 'web']
paging = False
time_range_support = False
safesearch = False
base_url = 'https://www.baidu.com/'
search_string = 's?{query}'
skip_tpls = ('img_normal', 'short_video', 'yl_music_song', 'dict3', 'recommend_list')
desc_xpath_per_tpl = {
    'se_com_default': './/span[contains(@class, "content-right_8Zs40")]',
    'kaifa_pc_open_source_software': './/p[contains(@class, "c-color-text")]',
    'bk_polysemy': './/div/@aria-label',
    'se_st_single_video_zhanzhang': './/span[contains(@class, "c-span-last")]//p[2]',
}
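
# fetch Baidu's start page once to collect the hidden form parameters and the
# session cookies a subsequent search request needs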
def get_initial_parameters(params):
    resp_index = get(base_url, headers=params['headers'], raise_for_httperror=True)
    dom = html.fromstring(resp_index.text)
    query_params = {}
    for ielement in eval_xpath_list(dom, '//form[@id="form"]//input[@name]'):
        name = ielement.attrib.get('name')
        value = ielement.attrib.get('value')
        query_params[name] = value
    return query_params, resp_index.cookies
def request(query, params):
    params['headers'].update(
        {
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Sec-GPC': '1',
            'Upgrade-Insecure-Requests': '1',
            'TE': 'trailers',
        }
    )
    query_params, cookies = get_initial_parameters(params)
    query_params['wd'] = query
    params['url'] = base_url + search_string.format(query=urlencode(query_params))
    params['cookies'] = cookies
    params['raise_for_httperror'] = False
    return params
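
# parse the HTML result page; requests answered with Baidu's CAPTCHA page
# raise a SearxEngineCaptchaException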
def response(resp):
    results = []

    if resp.url.host == 'wappass.baidu.com' or resp.url.path.startswith('/static/captcha'):
        raise SearxEngineCaptchaException()
    raise_for_httperror(resp)

    dom = html.fromstring(resp.text)

    # follow redirect but don't use the result page to reduce the CAPTCHA issue
    redirect_element = eval_xpath_getindex(dom, '//noscript/meta[@http-equiv="refresh"]/@content', 0, default=None)
    if redirect_element and redirect_element.startswith('0; url='):
        get(
            base_url + redirect_element[8:],
            headers=resp.search_params['headers'],
            cookies=resp.search_params['cookies'],
        )

    for result in eval_xpath_list(dom, '//div[contains(@id,"content_left")]/div[contains(@class, "c-container")]'):
        tpl = result.attrib.get('tpl')
        if tpl in skip_tpls:
            continue

        if tpl == 'kaifa_pc_blog_weak':
            # skip the aggregate link to kaifa.baidu.com (Baidu's search engine
            # for IT topics), but include the individual results it contains
            for r2 in eval_xpath_list(result, './/div[contains(@class, "c-gap-bottom-small")]'):
                title = extract_text(eval_xpath(r2, './/div[@class="c-row"]//a'))
                url = extract_text(eval_xpath(r2, './/div[@class="c-row"]//a/@href'))
                content = extract_text(eval_xpath(r2, '//span[@class="c-line-clamp2"]'))
                results.append(
                    {
                        'url': url,
                        'title': title,
                        'content': content,
                    }
                )
            continue

        # normal results
        title = extract_text(eval_xpath(result, './/h3/a'))
        url = extract_text(eval_xpath(result, './/h3/a/@href'))
        if not title or not url:
            continue

        content = None
        if tpl in desc_xpath_per_tpl:
            # try the XPath for the Baidu template
            content = extract_text(eval_xpath(result, desc_xpath_per_tpl[tpl]))
        if not content:
            # no content was found: try all the XPath from the Baidu templates
            for xp in desc_xpath_per_tpl.values():
                content = extract_text(eval_xpath(result, xp))
                if content:
                    break

        results.append(
            {
                'url': url,
                'title': title,
                'content': content,
            }
        )

    # resolve the Baidu redirections
    # note: Baidu does not support HTTP/2
    request_list = [
        Request.get(
            u['url'].replace('http://www.baidu.com/link?url=', 'https://www.baidu.com/link?url='),
            allow_redirects=False,
            headers=resp.search_params['headers'],
        )
        for u in results
    ]
    response_list = multi_requests(request_list)
    for i, redirect_response in enumerate(response_list):
        if not isinstance(redirect_response, Exception):
            results[i]['url'] = redirect_response.headers['location']
    return results
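
# debug helper, not called by the engine code above: writes a stripped-down copy
# of the Baidu result page to ./baidu.html for local inspection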
def debug_write_content_to_file(text):
    RE_STYLE_ELEMENT = re.compile(r'<style[^>]*>[^<]+</style>')
    RE_SCRIPT_ELEMENT = re.compile(r'<script[^>]*>[^<]+</script>')
    RE_COMMENT_ELEMENT = re.compile(r'\<\!\-\-[^-]+\-\-\>')
    with open('baidu.html', 'wt', encoding='utf-8') as f:
        text = RE_STYLE_ELEMENT.sub("", text)
        text = RE_SCRIPT_ELEMENT.sub("", text)
        text = RE_COMMENT_ELEMENT.sub("", text)
        text = "\n".join([ll.rstrip() for ll in text.splitlines() if ll.strip()])
        f.write(text)

@@ -1,71 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Internet Archive scholar(science)
"""
from datetime import datetime
from urllib.parse import urlencode
from searx.utils import html_to_text
about = {
    "website": "https://scholar.archive.org/",
    "wikidata_id": "Q115667709",
    "official_api_documentation": "https://scholar.archive.org/api/redoc",
    "use_official_api": True,
    "require_api_key": False,
    "results": "JSON",
}
categories = ['science', 'scientific publications']
paging = True
base_url = "https://scholar.archive.org"
results_per_page = 15
def request(query, params):
    args = {
        "q": query,
        "limit": results_per_page,
        "offset": (params["pageno"] - 1) * results_per_page,
    }
    params["url"] = f"{base_url}/search?{urlencode(args)}"
    params["headers"]["Accept"] = "application/json"
    return params
def response(resp):
    results = []

    json = resp.json()

    for result in json["results"]:
        publishedDate, content, doi = None, '', None

        if result['biblio'].get('release_date'):
            publishedDate = datetime.strptime(result['biblio']['release_date'], "%Y-%m-%d")

        if len(result['abstracts']) > 0:
            content = result['abstracts'][0].get('body')
        elif len(result['_highlights']) > 0:
            content = result['_highlights'][0]

        if len(result['releases']) > 0:
            doi = result['releases'][0].get('doi')

        results.append(
            {
                'template': 'paper.html',
                'url': result['fulltext']['access_url'],
                'title': result['biblio'].get('title') or result['biblio'].get('container_name'),
                'content': html_to_text(content),
                'publisher': result['biblio'].get('publisher'),
                'doi': doi,
                'journal': result['biblio'].get('container_name'),
                'authors': result['biblio'].get('contrib_names'),
                'tags': result['tags'],
                'publishedDate': publishedDate,
                'issns': result['biblio'].get('issns'),
                'pdf_url': result['fulltext'].get('access_url'),
            }
        )

    return results

@@ -27,7 +27,7 @@ categories = ['images']
 paging = True
 endpoint = 'photos'
-base_url = 'https://loc.gov'
+base_url = 'https://www.loc.gov'
 search_string = "/{endpoint}/?sp={page}&{query}&fo=json"

@@ -233,8 +233,7 @@ class Network:
             del kwargs['raise_for_httperror']
         return do_raise_for_httperror

-    @staticmethod
-    def patch_response(response, do_raise_for_httperror):
+    def patch_response(self, response, do_raise_for_httperror):
         if isinstance(response, httpx.Response):
             # requests compatibility (response is not streamed)
             # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
@@ -242,8 +241,11 @@ class Network:
             # raise an exception
             if do_raise_for_httperror:
-                raise_for_httperror(response)
+                try:
+                    raise_for_httperror(response)
+                except:
+                    self._logger.warning(f"HTTP Request failed: {response.request.method} {response.request.url}")
+                    raise
         return response

     def is_valid_response(self, response):
@@ -269,7 +271,7 @@ class Network:
                 else:
                     response = await client.request(method, url, **kwargs)
                 if self.is_valid_response(response) or retries <= 0:
-                    return Network.patch_response(response, do_raise_for_httperror)
+                    return self.patch_response(response, do_raise_for_httperror)
             except httpx.RemoteProtocolError as e:
                 if not was_disconnected:
                     # the server has closed the connection:

@@ -137,9 +137,6 @@ class OnlineProcessor(EngineProcessor):
         self.engine.request(query, params)

         # ignoring empty urls
-        if params['url'] is None:
-            return None
         if not params['url']:
             return None

@@ -422,6 +422,12 @@ engines:
     shortcut: bi
     disabled: true

+  - name: baidu
+    engine: baidu
+    shortcut: ba
+    timeout: 15
+    disabled: true
+
   - name: bing images
     engine: bing_images
     shortcut: bii
@@ -1622,11 +1628,6 @@ engines:
     api_site: 'askubuntu'
     categories: [it, q&a]

-  - name: internetarchivescholar
-    engine: internet_archive_scholar
-    shortcut: ias
-    timeout: 15.0
-
   - name: superuser
     engine: stackexchange
     shortcut: su