[mod] arXiv engine: revision of the engine (Paper result)

Revise the engine: use the Paper result type and add type annotations throughout. Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2025-12-22 19:50:00 +00:00 · 2025-09-10 16:25:46 +02:00
parent f9b4869426
commit 6c3fb9e42b
3 changed files with 87 additions and 61 deletions
--- a/docs/dev/engines/online/arxiv.rst
+++ b/docs/dev/engines/online/arxiv.rst
@@ -0,0 +1,8 @@
+.. _arxiv engine:
+
+=====
+arXiv
+=====
+
+.. automodule:: searx.engines.arxiv
+   :members:
--- a/searx/engines/arxiv.py
+++ b/searx/engines/arxiv.py
@@ -1,110 +1,129 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""ArXiV (Scientific preprints)
+"""arXiv is a free distribution service and an open-access archive for nearly
+2.4 million scholarly articles in the fields of physics, mathematics, computer
+science, quantitative biology, quantitative finance, statistics, electrical
+engineering and systems science, and economics.

+The engine uses the `arXiv API`_.
+
+.. _arXiv API: https://info.arxiv.org/help/api/user-manual.html
 """

+import typing as t
+
 from datetime import datetime
+from urllib.parse import urlencode

 from lxml import etree
 from lxml.etree import XPath
 from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex
+from searx.result_types import EngineResults
+
+if t.TYPE_CHECKING:
+    from searx.extended_types import SXNG_Response
+    from searx.search.processors import OnlineParams

-# about
 about = {
-    "website": 'https://arxiv.org',
-    "wikidata_id": 'Q118398',
-    "official_api_documentation": 'https://arxiv.org/help/api',
+    "website": "https://arxiv.org",
+    "wikidata_id": "Q118398",
+    "official_api_documentation": "https://info.arxiv.org/help/api/user-manual.html",
    "use_official_api": True,
    "require_api_key": False,
-    "results": 'XML-RSS',
+    "results": "XML-RSS",
 }

-categories = ['science', 'scientific publications']
+categories = ["science", "scientific publications"]
 paging = True
+arxiv_max_results = 10
+arxiv_search_prefix = "all"
+"""Search field prefix prepended to the query; see `Details of Query Construction`_.

-base_url = (
-    'https://export.arxiv.org/api/query?search_query=all:' + '{query}&start={offset}&max_results={number_of_results}'
-)
+.. _Details of Query Construction:
+   https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction
+"""

-# engine dependent config
-number_of_results = 10
+base_url = "https://export.arxiv.org/api/query"
+"""URL of the `arXiv API`_; for more details see Query-Interface_.
+
+.. _Query-Interface: https://info.arxiv.org/help/api/user-manual.html#_query_interface
+"""

-# xpaths
 arxiv_namespaces = {
    "atom": "http://www.w3.org/2005/Atom",
    "arxiv": "http://arxiv.org/schemas/atom",
 }
-xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces)
-xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces)
-xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces)
-xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces)
-xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces)
-xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces)
-xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces)
-xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces)
-xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces)
-xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces)
-xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces)
+xpath_entry = XPath("//atom:entry", namespaces=arxiv_namespaces)
+xpath_title = XPath(".//atom:title", namespaces=arxiv_namespaces)
+xpath_id = XPath(".//atom:id", namespaces=arxiv_namespaces)
+xpath_summary = XPath(".//atom:summary", namespaces=arxiv_namespaces)
+xpath_author_name = XPath(".//atom:author/atom:name", namespaces=arxiv_namespaces)
+xpath_doi = XPath(".//arxiv:doi", namespaces=arxiv_namespaces)
+xpath_pdf = XPath(".//atom:link[@title='pdf']", namespaces=arxiv_namespaces)
+xpath_published = XPath(".//atom:published", namespaces=arxiv_namespaces)
+xpath_journal = XPath(".//arxiv:journal_ref", namespaces=arxiv_namespaces)
+xpath_category = XPath(".//atom:category/@term", namespaces=arxiv_namespaces)
+xpath_comment = XPath("./arxiv:comment", namespaces=arxiv_namespaces)


-def request(query, params):
-    # basic search
-    offset = (params['pageno'] - 1) * number_of_results
+def request(query: str, params: "OnlineParams") -> None:

-    string_args = {'query': query, 'offset': offset, 'number_of_results': number_of_results}
-
-    params['url'] = base_url.format(**string_args)
-
-    return params
+    args = {
+        "search_query": f"{arxiv_search_prefix}:{query}",
+        "start": (params["pageno"] - 1) * arxiv_max_results,
+        "max_results": arxiv_max_results,
+    }
+    params["url"] = f"{base_url}?{urlencode(args)}"


-def response(resp):
-    results = []
+def response(resp: "SXNG_Response") -> EngineResults:
+
+    res = EngineResults()
+
    dom = etree.fromstring(resp.content)
    for entry in eval_xpath_list(dom, xpath_entry):
-        title = eval_xpath_getindex(entry, xpath_title, 0).text

-        url = eval_xpath_getindex(entry, xpath_id, 0).text
-        abstract = eval_xpath_getindex(entry, xpath_summary, 0).text
+        title: str = eval_xpath_getindex(entry, xpath_title, 0).text

-        authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)]
+        url: str = eval_xpath_getindex(entry, xpath_id, 0).text
+        abstract: str = eval_xpath_getindex(entry, xpath_summary, 0).text
+
+        authors: list[str] = [author.text for author in eval_xpath_list(entry, xpath_author_name)]

        #  doi
        doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None)
-        doi = None if doi_element is None else doi_element.text
+        doi: str = "" if doi_element is None else doi_element.text

        # pdf
        pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None)
-        pdf_url = None if pdf_element is None else pdf_element.attrib.get('href')
+        pdf_url: str = "" if pdf_element is None else pdf_element.attrib.get("href")

        # journal
        journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None)
-        journal = None if journal_element is None else journal_element.text
+        journal: str = "" if journal_element is None else journal_element.text

        # tags
        tag_elements = eval_xpath(entry, xpath_category)
-        tags = [str(tag) for tag in tag_elements]
+        tags: list[str] = [str(tag) for tag in tag_elements]

        # comments
        comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None)
-        comments = None if comments_elements is None else comments_elements.text
+        comments: str = "" if comments_elements is None else comments_elements.text

-        publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ')
+        publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, "%Y-%m-%dT%H:%M:%SZ")

-        res_dict = {
-            'template': 'paper.html',
-            'url': url,
-            'title': title,
-            'publishedDate': publishedDate,
-            'content': abstract,
-            'doi': doi,
-            'authors': authors,
-            'journal': journal,
-            'tags': tags,
-            'comments': comments,
-            'pdf_url': pdf_url,
-        }
+        res.add(
+            res.types.Paper(
+                url=url,
+                title=title,
+                publishedDate=publishedDate,
+                content=abstract,
+                doi=doi,
+                authors=authors,
+                journal=journal,
+                tags=tags,
+                comments=comments,
+                pdf_url=pdf_url,
+            )
+        )

-        results.append(res_dict)
-
-    return results
+    return res
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -490,7 +490,6 @@ engines:
  - name: arxiv
    engine: arxiv
    shortcut: arx
-    timeout: 4.0

  - name: ask
    engine: ask