[mod] arXiv engine: revision of the engine (Paper result)

Revision of the engine / use of the result type Paper as well as other typifications. Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2025-12-23 04:00:02 +00:00 · 2025-09-10 16:25:46 +02:00
parent f9b4869426
commit 6c3fb9e42b
3 changed files with 87 additions and 61 deletions
--- a/docs/dev/engines/online/arxiv.rst
+++ b/docs/dev/engines/online/arxiv.rst
@@ -0,0 +1,8 @@
+.. _arxiv engine:
+
+=====
+arXiv
+=====
+
+.. automodule:: searx.engines.arxiv
+   :members:
--- a/searx/engines/arxiv.py
+++ b/searx/engines/arxiv.py
@@ -1,110 +1,129 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""ArXiV (Scientific preprints)
+"""arXiv is a free distribution service and an open-access archive for nearly
+2.4 million scholarly articles in the fields of physics, mathematics, computer
+science, quantitative biology, quantitative finance, statistics, electrical
+engineering and systems science, and economics.
+The engine uses the `arXiv API`_.
+.. _arXiv API: https://info.arxiv.org/help/api/user-manual.html
 """
 import typing as t
 from datetime import datetime
 from urllib.parse import urlencode
 from lxml import etree
 from lxml.etree import XPath
 from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex
 from searx.result_types import EngineResults
 if t.TYPE_CHECKING:
    from searx.extended_types import SXNG_Response
    from searx.search.processors import OnlineParams
 # about
 about = {
-    "website": 'https://arxiv.org',
+    "website": "https://arxiv.org",
-    "wikidata_id": 'Q118398',
+    "wikidata_id": "Q118398",
-    "official_api_documentation": 'https://arxiv.org/help/api',
+    "official_api_documentation": "https://info.arxiv.org/help/api/user-manual.html",
    "use_official_api": True,
    "require_api_key": False,
-    "results": 'XML-RSS',
+    "results": "XML-RSS",
 }
-categories = ['science', 'scientific publications']
+categories = ["science", "scientific publications"]
 paging = True
+arxiv_max_results = 10
+arxiv_search_prefix = "all"
+"""Search fields, for more details see, `Details of Query Construction`_.
-base_url = (
+.. _Details of Query Construction:
-    'https://export.arxiv.org/api/query?search_query=all:' + '{query}&start={offset}&max_results={number_of_results}'
+   https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction
-)
+"""
-# engine dependent config
+base_url = "https://export.arxiv.org/api/query"
-number_of_results = 10
+"""`arXiv API`_ URL, for more details see Query-Interface_
+.. _Query-Interface: https://info.arxiv.org/help/api/user-manual.html#_query_interface
+"""
 # xpaths
 arxiv_namespaces = {
    "atom": "http://www.w3.org/2005/Atom",
    "arxiv": "http://arxiv.org/schemas/atom",
 }
-xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces)
+xpath_entry = XPath("//atom:entry", namespaces=arxiv_namespaces)
-xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces)
+xpath_title = XPath(".//atom:title", namespaces=arxiv_namespaces)
-xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces)
+xpath_id = XPath(".//atom:id", namespaces=arxiv_namespaces)
-xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces)
+xpath_summary = XPath(".//atom:summary", namespaces=arxiv_namespaces)
-xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces)
+xpath_author_name = XPath(".//atom:author/atom:name", namespaces=arxiv_namespaces)
-xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces)
+xpath_doi = XPath(".//arxiv:doi", namespaces=arxiv_namespaces)
-xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces)
+xpath_pdf = XPath(".//atom:link[@title='pdf']", namespaces=arxiv_namespaces)
-xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces)
+xpath_published = XPath(".//atom:published", namespaces=arxiv_namespaces)
-xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces)
+xpath_journal = XPath(".//arxiv:journal_ref", namespaces=arxiv_namespaces)
-xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces)
+xpath_category = XPath(".//atom:category/@term", namespaces=arxiv_namespaces)
-xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces)
+xpath_comment = XPath("./arxiv:comment", namespaces=arxiv_namespaces)
-def request(query, params):
+def request(query: str, params: "OnlineParams") -> None:
     # basic search
-    offset = (params['pageno'] - 1) * number_of_results
-    string_args = {'query': query, 'offset': offset, 'number_of_results': number_of_results}
+    args = {
-
+        "search_query": f"{arxiv_search_prefix}:{query}",
-    params['url'] = base_url.format(**string_args)
+        "start": (params["pageno"] - 1) * arxiv_max_results,
-
+        "max_results": arxiv_max_results,
-    return params
+    }
+    params["url"] = f"{base_url}?{urlencode(args)}"
-def response(resp):
+def response(resp: "SXNG_Response") -> EngineResults:
-    results = []
+
+    res = EngineResults()
     dom = etree.fromstring(resp.content)
     for entry in eval_xpath_list(dom, xpath_entry):
-        title = eval_xpath_getindex(entry, xpath_title, 0).text
-        url = eval_xpath_getindex(entry, xpath_id, 0).text
+        title: str = eval_xpath_getindex(entry, xpath_title, 0).text
-        abstract = eval_xpath_getindex(entry, xpath_summary, 0).text
-        authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)]
+        url: str = eval_xpath_getindex(entry, xpath_id, 0).text
+        abstract: str = eval_xpath_getindex(entry, xpath_summary, 0).text
+        authors: list[str] = [author.text for author in eval_xpath_list(entry, xpath_author_name)]
         #  doi
         doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None)
-        doi = None if doi_element is None else doi_element.text
+        doi: str = "" if doi_element is None else doi_element.text
         # pdf
         pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None)
-        pdf_url = None if pdf_element is None else pdf_element.attrib.get('href')
+        pdf_url: str = "" if pdf_element is None else pdf_element.attrib.get("href")
         # journal
         journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None)
-        journal = None if journal_element is None else journal_element.text
+        journal: str = "" if journal_element is None else journal_element.text
         # tags
         tag_elements = eval_xpath(entry, xpath_category)
-        tags = [str(tag) for tag in tag_elements]
+        tags: list[str] = [str(tag) for tag in tag_elements]
         # comments
         comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None)
-        comments = None if comments_elements is None else comments_elements.text
+        comments: str = "" if comments_elements is None else comments_elements.text
-        publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ')
+        publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, "%Y-%m-%dT%H:%M:%SZ")
-        res_dict = {
+        res.add(
-            'template': 'paper.html',
+            res.types.Paper(
-            'url': url,
+                url=url,
-            'title': title,
+                title=title,
-            'publishedDate': publishedDate,
+                publishedDate=publishedDate,
-            'content': abstract,
+                content=abstract,
-            'doi': doi,
+                doi=doi,
-            'authors': authors,
+                authors=authors,
-            'journal': journal,
+                journal=journal,
-            'tags': tags,
+                tags=tags,
-            'comments': comments,
+                comments=comments,
-            'pdf_url': pdf_url,
+                pdf_url=pdf_url,
-        }
+            )
+        )
-        results.append(res_dict)
+    return res
-    return results
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -490,7 +490,6 @@ engines:
  - name: arxiv
    engine: arxiv
    shortcut: arx
    timeout: 4.0
  - name: ask
    engine: ask