From 599d9488c5e363fd01ec9170a5fea795c3f09f5d Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Wed, 10 Sep 2025 16:39:24 +0200
Subject: [PATCH] [mod] Google Scholar engine: revision of the engine (Paper
 result)

Revision of the engine / use of the result type Paper as well as other
type annotations.

Signed-off-by: Markus Heiser
---
 searx/engines/google.py         |  15 +-
 searx/engines/google_scholar.py | 265 +++++++++++++++++---------------
 2 files changed, 152 insertions(+), 128 deletions(-)

diff --git a/searx/engines/google.py b/searx/engines/google.py
index 171aca2f4..1660032e0 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -11,6 +11,8 @@ engines:
 
 """
 
+import typing as t
+
 import re
 import random
 import string
@@ -28,8 +30,10 @@ from searx.exceptions import SearxEngineCaptchaException
 from searx.enginelib.traits import EngineTraits
 from searx.result_types import EngineResults
 
+if t.TYPE_CHECKING:
+    from searx.extended_types import SXNG_Response
+    from searx.search.processors import OnlineParams
 
-# about
 about = {
     "website": 'https://www.google.com',
     "wikidata_id": 'Q9366',
@@ -89,7 +93,7 @@ def ui_async(start: int) -> str:
     return ",".join([arc_id, use_ac, _fmt])
 
 
-def get_google_info(params, eng_traits):
+def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[str, t.Any]:
     """Composing various (language) properties for the google engines (:ref:`google
     API`).
 
@@ -144,7 +148,7 @@ def get_google_info(params, eng_traits):
 
     """
 
-    ret_val = {
+    ret_val: dict[str, t.Any] = {
         'language': None,
         'country': None,
         'subdomain': None,
@@ -273,7 +277,7 @@ def detect_google_sorry(resp):
         raise SearxEngineCaptchaException()
 
 
-def request(query, params):
+def request(query: str, params: "OnlineParams") -> None:
     """Google search request"""
     # pylint: disable=line-too-long
     start = (params['pageno'] - 1) * 10
@@ -317,7 +321,6 @@ def request(query, params):
 
     params['cookies'] = google_info['cookies']
     params['headers'].update(google_info['headers'])
-    return params
 
 
 # =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA
@@ -341,7 +344,7 @@ def parse_data_images(text: str):
     return data_image_map
 
 
-def response(resp) -> EngineResults:
+def response(resp: "SXNG_Response"):
     """Get response from google's search request"""
     # pylint: disable=too-many-branches, too-many-statements
     detect_google_sorry(resp)
diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py
index 5420a5415..8a82b36ee 100644
--- a/searx/engines/google_scholar.py
+++ b/searx/engines/google_scholar.py
@@ -1,12 +1,29 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""This is the implementation of the Google Scholar engine.
+"""Google Scholar is a freely accessible web search engine that indexes the full
+text or metadata of scholarly literature across an array of publishing formats
+and disciplines.
 
 Compared to other Google services the Scholar engine has a simple GET REST-API
-and there does not exists `async` API.  Even though the API slightly vintage we
-can make use of the :ref:`google API` to assemble the arguments of the GET
+and there is no ``async`` API.  Even though the API is slightly vintage,
+we can make use of the :ref:`google API` to assemble the arguments of the GET
 request.
+
+Configuration
+=============
+
+.. code:: yaml
+
+  - name: google scholar
+    engine: google_scholar
+    shortcut: gos
+
+Implementations
+===============
+
 """
 
+import typing as t
+
 from urllib.parse import urlencode
 from datetime import datetime
 from lxml import html
@@ -16,6 +33,7 @@ from searx.utils import (
     eval_xpath_getindex,
     eval_xpath_list,
     extract_text,
+    ElementType,
 )
 
 from searx.exceptions import SearxEngineCaptchaException
@@ -26,18 +44,23 @@ from searx.engines.google import (
     time_range_dict,
 )
 
-# about
+from searx.result_types import EngineResults
+
+if t.TYPE_CHECKING:
+    from searx.extended_types import SXNG_Response
+    from searx.search.processors import OnlineParams
+
 about = {
-    "website": 'https://scholar.google.com',
-    "wikidata_id": 'Q494817',
-    "official_api_documentation": 'https://developers.google.com/custom-search',
+    "website": "https://scholar.google.com",
+    "wikidata_id": "Q494817",
+    "official_api_documentation": "https://developers.google.com/custom-search",
     "use_official_api": False,
     "require_api_key": False,
-    "results": 'HTML',
+    "results": "HTML",
 }
 
 # engine dependent config
-categories = ['science', 'scientific publications']
+categories = ["science", "scientific publications"]
 paging = True
 max_page = 50
 """`Google max 50 pages`_
@@ -50,9 +73,97 @@ safesearch = False
 send_accept_language_header = True
 
 
-def time_range_args(params):
+def request(query: str, params: "OnlineParams") -> None:
+    """Google-Scholar search request"""
+
+    google_info = get_google_info(params, traits)
+    # subdomain is: scholar.google.xy
+    google_info["subdomain"] = google_info["subdomain"].replace("www.", "scholar.")
+
+    args = {
+        "q": query,
+        **google_info["params"],
+        "start": (params["pageno"] - 1) * 10,
+        "as_sdt": "2007",  # include patents / to disable set "0,5"
+        "as_vis": "0",  # include citations / to disable set "1"
+    }
+    args.update(time_range_args(params))
+
+    params["url"] = "https://" + google_info["subdomain"] + "/scholar?" + urlencode(args)
+    params["cookies"] = google_info["cookies"]
+    params["headers"].update(google_info["headers"])
+
+
+def response(resp: "SXNG_Response") -> EngineResults:  # pylint: disable=too-many-locals
+    """Parse response from Google Scholar"""
+
+    res = EngineResults()
+    dom = html.fromstring(resp.text)
+    detect_google_captcha(dom)
+
+    # parse results
+    for result in eval_xpath_list(dom, "//div[@data-rp]"):
+
+        title = extract_text(eval_xpath(result, ".//h3[1]//a"))
+        if not title:
+            # this is a [ZITATION] block
+            continue
+
+        pub_type: str = extract_text(eval_xpath(result, ".//span[@class='gs_ctg2']")) or ""
+        if pub_type:
+            pub_type = pub_type[1:-1].lower()
+
+        url: str = eval_xpath_getindex(result, ".//h3[1]//a/@href", 0)
+        content: str = extract_text(eval_xpath(result, ".//div[@class='gs_rs']")) or ""
+        authors, journal, publisher, publishedDate = parse_gs_a(
+            extract_text(eval_xpath(result, ".//div[@class='gs_a']"))
+        )
+        if publisher in url:
+            publisher = ""
+
+        # cited by
+        comments: str = (
+            extract_text(eval_xpath(result, ".//div[@class='gs_fl']/a[starts-with(@href,'/scholar?cites=')]")) or ""
+        )
+
+        # link to the html or pdf document
+        html_url: str = ""
+        pdf_url: str = ""
+        doc_url = eval_xpath_getindex(result, ".//div[@class='gs_or_ggsm']/a/@href", 0, default=None)
+        doc_type = extract_text(eval_xpath(result, ".//span[@class='gs_ctg2']"))
+        if doc_type == "[PDF]":
+            pdf_url = doc_url
+        else:
+            html_url = doc_url
+
+        res.add(
+            res.types.Paper(
+                type=pub_type,
+                url=url,
+                title=title,
+                authors=authors,
+                publisher=publisher,
+                journal=journal,
+                publishedDate=publishedDate,
+                content=content,
+                comments=comments,
+                html_url=html_url,
+                pdf_url=pdf_url,
+            )
+        )
+
+    # parse suggestion
+    for suggestion in eval_xpath(dom, "//div[contains(@class, 'gs_qsuggest_wrap')]//li//a"):
+        res.add(res.types.LegacyResult(suggestion=extract_text(suggestion)))
+
+    for correction in eval_xpath(dom, "//div[@class='gs_r gs_pda']/a"):
+        res.add(res.types.LegacyResult(correction=extract_text(correction)))
+    return res
+
+
+def time_range_args(params: "OnlineParams") -> dict[str, int]:
     """Returns a dictionary with a time range arguments based on
-    ``params['time_range']``.
+    ``params["time_range"]``.
 
     Google Scholar supports a detailed search by year.  Searching by *last
     month* or *last week* (as offered by SearXNG) is uncommon for scientific
     publications.
 
     To limit the result list when the users selects a range, all the SearXNG
     ranges (*day*, *week*, *month*, *year*) are mapped to *year*.  If no range
-    is set an empty dictionary of arguments is returned.  Example; when
-    user selects a time range (current year minus one in 2022):
+    is set an empty dictionary of arguments is returned.
+
+    Example: when the user selects a time range and the current year is 2025,
+    the result is limited to the previous year (current year minus one):
 
     .. code:: python
 
-        { 'as_ylo' : 2021 }
+        { "as_ylo" : 2024 }
 
     """
-    ret_val = {}
-    if params['time_range'] in time_range_dict:
-        ret_val['as_ylo'] = datetime.now().year - 1
+    ret_val: dict[str, int] = {}
+    if params["time_range"] in time_range_dict:
+        ret_val["as_ylo"] = datetime.now().year - 1
     return ret_val
 
 
-def detect_google_captcha(dom):
+def detect_google_captcha(dom: ElementType):
     """In case of CAPTCHA Google Scholar open its own *not a Robot* dialog and
     is not redirected to ``sorry.google.com``.
""" @@ -82,29 +195,7 @@ def detect_google_captcha(dom): raise SearxEngineCaptchaException() -def request(query, params): - """Google-Scholar search request""" - - google_info = get_google_info(params, traits) - # subdomain is: scholar.google.xy - google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.") - - args = { - 'q': query, - **google_info['params'], - 'start': (params['pageno'] - 1) * 10, - 'as_sdt': '2007', # include patents / to disable set '0,5' - 'as_vis': '0', # include citations / to disable set '1' - } - args.update(time_range_args(params)) - - params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args) - params['cookies'] = google_info['cookies'] - params['headers'].update(google_info['headers']) - return params - - -def parse_gs_a(text: str | None): +def parse_gs_a(text: str | None) -> tuple[list[str], str, str, datetime | None]: """Parse the text written in green. Possible formats: @@ -113,98 +204,28 @@ def parse_gs_a(text: str | None): * "{authors} - {publisher}" """ if text is None or text == "": - return None, None, None, None + return [], "", "", None - s_text = text.split(' - ') - authors = s_text[0].split(', ') - publisher = s_text[-1] + s_text = text.split(" - ") + authors: list[str] = s_text[0].split(", ") + publisher: str = s_text[-1] if len(s_text) != 3: - return authors, None, publisher, None + return authors, "", publisher, None # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}" # get journal and year - journal_year = s_text[1].split(', ') + journal_year = s_text[1].split(", ") # journal is optional and may contains some coma if len(journal_year) > 1: - journal = ', '.join(journal_year[0:-1]) - if journal == '…': - journal = None + journal: str = ", ".join(journal_year[0:-1]) + if journal == "…": + journal = "" else: - journal = None + journal = "" # year year = journal_year[-1] try: - publishedDate = datetime.strptime(year.strip(), '%Y') + publishedDate = datetime.strptime(year.strip(), "%Y") except ValueError: publishedDate = None return authors, journal, publisher, publishedDate - - -def response(resp): # pylint: disable=too-many-locals - """Parse response from Google Scholar""" - results = [] - - # convert the text to dom - dom = html.fromstring(resp.text) - detect_google_captcha(dom) - - # parse results - for result in eval_xpath_list(dom, '//div[@data-rp]'): - - title = extract_text(eval_xpath(result, './/h3[1]//a')) - - if not title: - # this is a [ZITATION] block - continue - - pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]')) - if pub_type: - pub_type = pub_type[1:-1].lower() - - url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0) - content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]')) - authors, journal, publisher, publishedDate = parse_gs_a( - extract_text(eval_xpath(result, './/div[@class="gs_a"]')) - ) - if publisher in url: - publisher = None - - # cited by - comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]')) - - # link to the html or pdf document - html_url = None - pdf_url = None - doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None) - doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]')) - if doc_type == "[PDF]": - pdf_url = doc_url - else: - html_url = doc_url - - results.append( - { - 'template': 'paper.html', - 'type': pub_type, - 'url': url, - 'title': title, - 'authors': authors, - 
-                'publisher': publisher,
-                'journal': journal,
-                'publishedDate': publishedDate,
-                'content': content,
-                'comments': comments,
-                'html_url': html_url,
-                'pdf_url': pdf_url,
-            }
-        )
-
-    # parse suggestion
-    for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'):
-        # append suggestion
-        results.append({'suggestion': extract_text(suggestion)})
-
-    for correction in eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a'):
-        results.append({'correction': extract_text(correction)})
-
-    return results
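A small usage sketch (not part of the patch itself) of what the revised
``parse_gs_a()`` returns for the formats listed in its docstring; the author,
journal and publisher strings below are invented for illustration:

.. code:: python

    from datetime import datetime

    from searx.engines.google_scholar import parse_gs_a

    # "{authors} - {journal}, {year} - {publisher}"
    authors, journal, publisher, published = parse_gs_a(
        "J Doe, M Mustermann - Journal of Examples, 2021 - example.org"
    )
    assert authors == ["J Doe", "M Mustermann"]
    assert journal == "Journal of Examples"
    assert publisher == "example.org"
    assert published == datetime(2021, 1, 1)

    # "{authors} - {year} - {publisher}": the journal is now an empty string,
    # no longer None
    assert parse_gs_a("J Doe - 2021 - example.org") == (
        ["J Doe"],
        "",
        "example.org",
        datetime(2021, 1, 1),
    )

    # empty green line: empty defaults instead of a tuple of None values
    assert parse_gs_a(None) == ([], "", "", None)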