From 6c3fb9e42b1f383d648b576af810adb32d4839aa Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Wed, 10 Sep 2025 16:25:46 +0200 Subject: [PATCH] [mod] arXiv engine: revision of the engine (Paper result) Revision of the engine / use of the result type Paper as well as other typifications. Signed-off-by: Markus Heiser --- docs/dev/engines/online/arxiv.rst | 8 ++ searx/engines/arxiv.py | 139 +++++++++++++++++------------- searx/settings.yml | 1 - 3 files changed, 87 insertions(+), 61 deletions(-) create mode 100644 docs/dev/engines/online/arxiv.rst diff --git a/docs/dev/engines/online/arxiv.rst b/docs/dev/engines/online/arxiv.rst new file mode 100644 index 000000000..59676c6e6 --- /dev/null +++ b/docs/dev/engines/online/arxiv.rst @@ -0,0 +1,8 @@ +.. _arxiv engine: + +===== +arXiv +===== + +.. automodule:: searx.engines.arxiv + :members: diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py index 39fcb1a34..c6fbb71a7 100644 --- a/searx/engines/arxiv.py +++ b/searx/engines/arxiv.py @@ -1,110 +1,129 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -"""ArXiV (Scientific preprints) +"""arXiv is a free distribution service and an open-access archive for nearly +2.4 million scholarly articles in the fields of physics, mathematics, computer +science, quantitative biology, quantitative finance, statistics, electrical +engineering and systems science, and economics. +The engine uses the `arXiv API`_. + +.. _arXiv API: https://info.arxiv.org/help/api/user-manual.html """ +import typing as t + from datetime import datetime +from urllib.parse import urlencode from lxml import etree from lxml.etree import XPath from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex +from searx.result_types import EngineResults + +if t.TYPE_CHECKING: + from searx.extended_types import SXNG_Response + from searx.search.processors import OnlineParams -# about about = { - "website": 'https://arxiv.org', - "wikidata_id": 'Q118398', - "official_api_documentation": 'https://arxiv.org/help/api', + "website": "https://arxiv.org", + "wikidata_id": "Q118398", + "official_api_documentation": "https://info.arxiv.org/help/api/user-manual.html", "use_official_api": True, "require_api_key": False, - "results": 'XML-RSS', + "results": "XML-RSS", } -categories = ['science', 'scientific publications'] +categories = ["science", "scientific publications"] paging = True +arxiv_max_results = 10 +arxiv_search_prefix = "all" +"""Search fields, for more details see, `Details of Query Construction`_. -base_url = ( - 'https://export.arxiv.org/api/query?search_query=all:' + '{query}&start={offset}&max_results={number_of_results}' -) +.. _Details of Query Construction: + https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction +""" -# engine dependent config -number_of_results = 10 +base_url = "https://export.arxiv.org/api/query" +"""`arXiv API`_ URL, for more details see Query-Interface_ + +.. _Query-Interface: https://info.arxiv.org/help/api/user-manual.html#_query_interface +""" -# xpaths arxiv_namespaces = { "atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom", } -xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces) -xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces) -xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces) -xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces) -xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces) -xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces) -xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces) -xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces) -xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces) -xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces) -xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces) +xpath_entry = XPath("//atom:entry", namespaces=arxiv_namespaces) +xpath_title = XPath(".//atom:title", namespaces=arxiv_namespaces) +xpath_id = XPath(".//atom:id", namespaces=arxiv_namespaces) +xpath_summary = XPath(".//atom:summary", namespaces=arxiv_namespaces) +xpath_author_name = XPath(".//atom:author/atom:name", namespaces=arxiv_namespaces) +xpath_doi = XPath(".//arxiv:doi", namespaces=arxiv_namespaces) +xpath_pdf = XPath(".//atom:link[@title='pdf']", namespaces=arxiv_namespaces) +xpath_published = XPath(".//atom:published", namespaces=arxiv_namespaces) +xpath_journal = XPath(".//arxiv:journal_ref", namespaces=arxiv_namespaces) +xpath_category = XPath(".//atom:category/@term", namespaces=arxiv_namespaces) +xpath_comment = XPath("./arxiv:comment", namespaces=arxiv_namespaces) -def request(query, params): - # basic search - offset = (params['pageno'] - 1) * number_of_results +def request(query: str, params: "OnlineParams") -> None: - string_args = {'query': query, 'offset': offset, 'number_of_results': number_of_results} - - params['url'] = base_url.format(**string_args) - - return params + args = { + "search_query": f"{arxiv_search_prefix}:{query}", + "start": (params["pageno"] - 1) * arxiv_max_results, + "max_results": arxiv_max_results, + } + params["url"] = f"{base_url}?{urlencode(args)}" -def response(resp): - results = [] +def response(resp: "SXNG_Response") -> EngineResults: + + res = EngineResults() + dom = etree.fromstring(resp.content) for entry in eval_xpath_list(dom, xpath_entry): - title = eval_xpath_getindex(entry, xpath_title, 0).text - url = eval_xpath_getindex(entry, xpath_id, 0).text - abstract = eval_xpath_getindex(entry, xpath_summary, 0).text + title: str = eval_xpath_getindex(entry, xpath_title, 0).text - authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)] + url: str = eval_xpath_getindex(entry, xpath_id, 0).text + abstract: str = eval_xpath_getindex(entry, xpath_summary, 0).text + + authors: list[str] = [author.text for author in eval_xpath_list(entry, xpath_author_name)] # doi doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None) - doi = None if doi_element is None else doi_element.text + doi: str = "" if doi_element is None else doi_element.text # pdf pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None) - pdf_url = None if pdf_element is None else pdf_element.attrib.get('href') + pdf_url: str = "" if pdf_element is None else pdf_element.attrib.get("href") # journal journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None) - journal = None if journal_element is None else journal_element.text + journal: str = "" if journal_element is None else journal_element.text # tags tag_elements = eval_xpath(entry, xpath_category) - tags = [str(tag) for tag in tag_elements] + tags: list[str] = [str(tag) for tag in tag_elements] # comments comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None) - comments = None if comments_elements is None else comments_elements.text + comments: str = "" if comments_elements is None else comments_elements.text - publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ') + publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, "%Y-%m-%dT%H:%M:%SZ") - res_dict = { - 'template': 'paper.html', - 'url': url, - 'title': title, - 'publishedDate': publishedDate, - 'content': abstract, - 'doi': doi, - 'authors': authors, - 'journal': journal, - 'tags': tags, - 'comments': comments, - 'pdf_url': pdf_url, - } + res.add( + res.types.Paper( + url=url, + title=title, + publishedDate=publishedDate, + content=abstract, + doi=doi, + authors=authors, + journal=journal, + tags=tags, + comments=comments, + pdf_url=pdf_url, + ) + ) - results.append(res_dict) - - return results + return res diff --git a/searx/settings.yml b/searx/settings.yml index c3dee3173..3b77bfe09 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -490,7 +490,6 @@ engines: - name: arxiv engine: arxiv shortcut: arx - timeout: 4.0 - name: ask engine: ask