From 1ac396133677d4b4171bad89885a5cdacce51d83 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Fri, 11 Jun 2021 16:06:36 +0200 Subject: [PATCH] [mod] google - get_lang_info add documentation & comments BTW: remove obsolete log messages from google engine Signed-off-by: Markus Heiser --- searx/engines/google.py | 110 ++++++++++++++++++++++++++++++---------- 1 file changed, 83 insertions(+), 27 deletions(-) diff --git a/searx/engines/google.py b/searx/engines/google.py index 1659a9a40..841212e09 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -134,42 +134,113 @@ spelling_suggestion_xpath = '//div[@class="med"]/p/a' def get_lang_info(params, lang_list, custom_aliases, supported_any_language): - ret_val = {} + """Composing various language properties for the google engines. + + This function is called by the various google engines (google itself, + google-images, -news, -scholar, -videos). + + :param dict params: request parameters of the engine + + :param dict lang_list: supported languages of the engine + :py:obj:`ENGINES_LANGUAGES[engine-name]` + + :param dict custom_aliases: custom aliases for non-standard language codes + (used when calling :py:func:`searx.utils.match_language`) + + :param bool supported_any_language: When a language is not specified, the + language interpretation is left up to Google to decide how the search + results should be delivered. This argument is ``True`` for the google + engine and ``False`` for the other engines (google-images, -news, + -scholar, -videos). + + :rtype: dict + :returns: + Py-Dictionary with the key/value pairs: + + language: + Return value from :py:func:`searx.utils.match_language` + + country: + The country code (e.g. US, AT, CA, FR, DE ..) + + subdomain: + Google subdomain :py:obj:`google_domains` that fits to the country + code. + + params: + Py-Dictionary with additional request arguments (can be passed to + :py:func:`urllib.parse.urlencode`). 
+ + headers: + Py-Dictionary with additional HTTP headers (can be passed to + request's headers) + """ + ret_val = { + 'language' : None, + 'country' : None, + 'subdomain' : None, + 'params' : {}, + 'headers' : {}, + } + + # language ... _lang = params['language'] _any_language = _lang.lower() == 'all' if _any_language: _lang = 'en-US' - language = match_language(_lang, lang_list, custom_aliases) ret_val['language'] = language - # the requested language from params (en, en-US, de, de-AT, fr, fr-CA, ...) - _l = _lang.split('-') + # country ... - # the country code (US, AT, CA) + _l = _lang.split('-') if len(_l) == 2: country = _l[1] else: country = _l[0].upper() if country == 'EN': country = 'US' - ret_val['country'] = country - # the combination (en-US, en-EN, de-DE, de-AU, fr-FR, fr-FR) - lang_country = '%s-%s' % (language, country) + # subdomain ... - # subdomain ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com') - ret_val['params'] = {} - ret_val['headers'] = {} + # params & headers + + lang_country = '%s-%s' % (language, country) # (en-US, en-EN, de-DE, de-AU, fr-FR ..) + + # hl parameter: + # https://developers.google.com/custom-search/docs/xml_results#hlsp The + # Interface Language: + # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages + + ret_val['params']['hl'] = lang_list.get(lang_country, language) + + # lr parameter: + # The lr (language restrict) parameter restricts search results to + # documents written in a particular language. + # https://developers.google.com/custom-search/docs/xml_results#lrsp + # Language Collection Values: + # https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections if _any_language and supported_any_language: - # based on whoogle + + # interpretation is left up to Google (based on whoogle) + # + # - add parameter ``source=lnt`` + # - don't use parameter ``lr`` + # - don't add an ``Accept-Language`` HTTP header. 
+ ret_val['params']['source'] = 'lnt' + else: + + # restricts search results to documents written in a particular + # language. + ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language) + # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5 ret_val['headers']['Accept-Language'] = ','.join([ lang_country, @@ -178,18 +249,6 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language): '*;q=0.5', ]) - # lr parameter: - # https://developers.google.com/custom-search/docs/xml_results#lrsp - # Language Collection Values: - # https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections - ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language) - - ret_val['params']['hl'] = lang_list.get(lang_country, language) - - # hl parameter: - # https://developers.google.com/custom-search/docs/xml_results#hlsp The - # Interface Language: - # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages return ret_val def detect_google_sorry(resp): @@ -220,11 +279,8 @@ def request(query, params): query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) if params['safesearch']: query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) - - logger.debug("query_url --> %s", query_url) params['url'] = query_url - logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language')) params['headers'].update(lang_info['headers']) params['headers']['Accept'] = ( 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'