[mod] arXiv engine: revision of the engine (Paper result)

Revision of the engine / use of the result type Paper as well as other
typifications.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser
2025-09-10 16:25:46 +02:00
committed by Markus Heiser
parent f9b4869426
commit 6c3fb9e42b
3 changed files with 87 additions and 61 deletions

View File

@@ -0,0 +1,8 @@
.. _arxiv engine:
=====
arXiv
=====
.. automodule:: searx.engines.arxiv
:members:

View File

@@ -1,110 +1,129 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""ArXiV (Scientific preprints)
"""arXiv is a free distribution service and an open-access archive for nearly
2.4 million scholarly articles in the fields of physics, mathematics, computer
science, quantitative biology, quantitative finance, statistics, electrical
engineering and systems science, and economics.
The engine uses the `arXiv API`_.
.. _arXiv API: https://info.arxiv.org/help/api/user-manual.html
"""
import typing as t
from datetime import datetime
from urllib.parse import urlencode
from lxml import etree
from lxml.etree import XPath
from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.result_types import EngineResults
if t.TYPE_CHECKING:
from searx.extended_types import SXNG_Response
from searx.search.processors import OnlineParams
# about
about = {
"website": 'https://arxiv.org',
"wikidata_id": 'Q118398',
"official_api_documentation": 'https://arxiv.org/help/api',
"website": "https://arxiv.org",
"wikidata_id": "Q118398",
"official_api_documentation": "https://info.arxiv.org/help/api/user-manual.html",
"use_official_api": True,
"require_api_key": False,
"results": 'XML-RSS',
"results": "XML-RSS",
}
categories = ['science', 'scientific publications']
categories = ["science", "scientific publications"]
paging = True
arxiv_max_results = 10
arxiv_search_prefix = "all"
"""Search fields, for more details see, `Details of Query Construction`_.
base_url = (
'https://export.arxiv.org/api/query?search_query=all:' + '{query}&start={offset}&max_results={number_of_results}'
)
.. _Details of Query Construction:
https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction
"""
# engine dependent config
number_of_results = 10
base_url = "https://export.arxiv.org/api/query"
"""`arXiv API`_ URL, for more details see Query-Interface_
.. _Query-Interface: https://info.arxiv.org/help/api/user-manual.html#_query_interface
"""
# xpaths
arxiv_namespaces = {
"atom": "http://www.w3.org/2005/Atom",
"arxiv": "http://arxiv.org/schemas/atom",
}
xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces)
xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces)
xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces)
xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces)
xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces)
xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces)
xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces)
xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces)
xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces)
xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces)
xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces)
xpath_entry = XPath("//atom:entry", namespaces=arxiv_namespaces)
xpath_title = XPath(".//atom:title", namespaces=arxiv_namespaces)
xpath_id = XPath(".//atom:id", namespaces=arxiv_namespaces)
xpath_summary = XPath(".//atom:summary", namespaces=arxiv_namespaces)
xpath_author_name = XPath(".//atom:author/atom:name", namespaces=arxiv_namespaces)
xpath_doi = XPath(".//arxiv:doi", namespaces=arxiv_namespaces)
xpath_pdf = XPath(".//atom:link[@title='pdf']", namespaces=arxiv_namespaces)
xpath_published = XPath(".//atom:published", namespaces=arxiv_namespaces)
xpath_journal = XPath(".//arxiv:journal_ref", namespaces=arxiv_namespaces)
xpath_category = XPath(".//atom:category/@term", namespaces=arxiv_namespaces)
xpath_comment = XPath("./arxiv:comment", namespaces=arxiv_namespaces)
def request(query, params):
# basic search
offset = (params['pageno'] - 1) * number_of_results
def request(query: str, params: "OnlineParams") -> None:
string_args = {'query': query, 'offset': offset, 'number_of_results': number_of_results}
params['url'] = base_url.format(**string_args)
return params
args = {
"search_query": f"{arxiv_search_prefix}:{query}",
"start": (params["pageno"] - 1) * arxiv_max_results,
"max_results": arxiv_max_results,
}
params["url"] = f"{base_url}?{urlencode(args)}"
def response(resp):
results = []
def response(resp: "SXNG_Response") -> EngineResults:
res = EngineResults()
dom = etree.fromstring(resp.content)
for entry in eval_xpath_list(dom, xpath_entry):
title = eval_xpath_getindex(entry, xpath_title, 0).text
url = eval_xpath_getindex(entry, xpath_id, 0).text
abstract = eval_xpath_getindex(entry, xpath_summary, 0).text
title: str = eval_xpath_getindex(entry, xpath_title, 0).text
authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)]
url: str = eval_xpath_getindex(entry, xpath_id, 0).text
abstract: str = eval_xpath_getindex(entry, xpath_summary, 0).text
authors: list[str] = [author.text for author in eval_xpath_list(entry, xpath_author_name)]
# doi
doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None)
doi = None if doi_element is None else doi_element.text
doi: str = "" if doi_element is None else doi_element.text
# pdf
pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None)
pdf_url = None if pdf_element is None else pdf_element.attrib.get('href')
pdf_url: str = "" if pdf_element is None else pdf_element.attrib.get("href")
# journal
journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None)
journal = None if journal_element is None else journal_element.text
journal: str = "" if journal_element is None else journal_element.text
# tags
tag_elements = eval_xpath(entry, xpath_category)
tags = [str(tag) for tag in tag_elements]
tags: list[str] = [str(tag) for tag in tag_elements]
# comments
comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None)
comments = None if comments_elements is None else comments_elements.text
comments: str = "" if comments_elements is None else comments_elements.text
publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ')
publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, "%Y-%m-%dT%H:%M:%SZ")
res_dict = {
'template': 'paper.html',
'url': url,
'title': title,
'publishedDate': publishedDate,
'content': abstract,
'doi': doi,
'authors': authors,
'journal': journal,
'tags': tags,
'comments': comments,
'pdf_url': pdf_url,
}
res.add(
res.types.Paper(
url=url,
title=title,
publishedDate=publishedDate,
content=abstract,
doi=doi,
authors=authors,
journal=journal,
tags=tags,
comments=comments,
pdf_url=pdf_url,
)
)
results.append(res_dict)
return results
return res

View File

@@ -490,7 +490,6 @@ engines:
- name: arxiv
engine: arxiv
shortcut: arx
timeout: 4.0
- name: ask
engine: ask