From 9b6ffed061570e2540e219d66e5100a1572c07c8 Mon Sep 17 00:00:00 2001 From: Marc Abonce Seguin Date: Thu, 25 Feb 2021 23:20:50 -0700 Subject: [PATCH 1/2] fix fetch_languages for bing Bing has a list of regions that it supports and some of these regions may have more than one possible language. In some cases, like Switzerland, these languages are always shown as options, so there is no issue. But in other cases, like Andorra, Bing will only show one language at a time, either the region's default or the request's language if the latter is supported by that region. For example, if the HTTP request is in French, Andorra will appear as fr-AD but if the same page is requested in any other language Andorra will appear as ca-AD. This is especially a problem when Bing assumes that the request is in English because it overrides enough language codes to make several major languages like Arabic disappear from the languages.py file. To avoid that issue, I set the Accept-Language header to a language that's only supported in one region to hopefully avoid these overrides. 
--- searx/engines/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 80d5d18fc..9ece10964 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -27,7 +27,7 @@ from searx import settings from searx import logger from searx.data import ENGINES_LANGUAGES from searx.poolrequests import get, get_proxy_cycles -from searx.utils import load_module, match_language, get_engine_from_settings +from searx.utils import load_module, match_language, get_engine_from_settings, gen_useragent logger = logger.getChild('engines') @@ -131,8 +131,12 @@ def load_engine(engine_data): # assign language fetching method if auxiliary method exists if hasattr(engine, '_fetch_supported_languages'): + headers = { + 'User-Agent': gen_useragent(), + 'Accept-Language': 'ja-JP,ja;q=0.8,en-US;q=0.5,en;q=0.3', # bing needs a non-English language + } setattr(engine, 'fetch_supported_languages', - lambda: engine._fetch_supported_languages(get(engine.supported_languages_url))) + lambda: engine._fetch_supported_languages(get(engine.supported_languages_url, headers=headers))) engine.stats = { 'sent_search_count': 0, # sent search From d6681fd33b31cd058f2ff15e035828251773acae Mon Sep 17 00:00:00 2001 From: Marc Abonce Seguin Date: Thu, 25 Feb 2021 23:49:15 -0700 Subject: [PATCH 2/2] remove articles number from engines_languages.json --- searx/engines/wikipedia.py | 2 +- utils/fetch_languages.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 2adfefa69..da867c81e 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -106,6 +106,6 @@ def _fetch_supported_languages(resp): articles = int(td[4].xpath('./a/b')[0].text.replace(',', '')) # exclude languages with too few articles if articles >= 100: - supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles} + 
supported_languages[code] = {"name": name, "english_name": english_name} return supported_languages diff --git a/utils/fetch_languages.py b/utils/fetch_languages.py index 453693136..582e0ae00 100644 --- a/utils/fetch_languages.py +++ b/utils/fetch_languages.py @@ -2,8 +2,7 @@ # This script generates languages.py from intersecting each engine's supported languages. # -# Output files (engines_languages.json and languages.py) -# are written in current directory to avoid overwriting in case something goes wrong. +# Output files: searx/data/engines_languages.json and searx/languages.py import json from pathlib import Path