Merge pull request #1446 from MarcAbonce/language_aliases_fix

[fix] Fix queries in Hebrew and Norwegian so they give results in the right language
This commit is contained in:
Noémi Ványi 2019-01-07 20:38:05 +01:00 committed by GitHub
commit 491792c1a5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 27248 additions and 26983 deletions

File diff suppressed because it is too large. (Load Diff)

View File

@@ -113,7 +113,6 @@ def load_engine(engine_data):
iso_lang not in getattr(engine, 'supported_languages'): iso_lang not in getattr(engine, 'supported_languages'):
language_aliases[iso_lang] = engine_lang language_aliases[iso_lang] = engine_lang
if language_aliases:
setattr(engine, 'language_aliases', language_aliases) setattr(engine, 'language_aliases', language_aliases)
# assign language fetching method if auxiliary method exists # assign language fetching method if auxiliary method exists

View File

@@ -55,7 +55,7 @@ def request(query, params):
query=urlencode({'q': query}), query=urlencode({'q': query}),
offset=offset) offset=offset)
language = match_language(params['language'], supported_languages).lower() language = match_language(params['language'], supported_languages, language_aliases).lower()
params['cookies']['SRCHHPGUSR'] = \ params['cookies']['SRCHHPGUSR'] = \
'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')

View File

@@ -48,7 +48,7 @@ def request(query, params):
'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
# language cookie # language cookie
language = match_language(params['language'], supported_languages).lower() language = match_language(params['language'], supported_languages, language_aliases).lower()
params['cookies']['_EDGE_S'] = 'mkt=' + language + '&F=1' params['cookies']['_EDGE_S'] = 'mkt=' + language + '&F=1'
# query and paging # query and paging

View File

@@ -166,7 +166,7 @@ def extract_text_from_dom(result, xpath):
def request(query, params): def request(query, params):
offset = (params['pageno'] - 1) * 10 offset = (params['pageno'] - 1) * 10
language = match_language(params['language'], supported_languages) language = match_language(params['language'], supported_languages, language_aliases)
language_array = language.split('-') language_array = language.split('-')
if params['language'].find('-') > 0: if params['language'].find('-') > 0:
country = params['language'].split('-')[1] country = params['language'].split('-')[1]
@@ -381,10 +381,10 @@ def attributes_to_html(attributes):
def _fetch_supported_languages(resp): def _fetch_supported_languages(resp):
supported_languages = {} supported_languages = {}
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
options = dom.xpath('//table//td/font/label/span') options = dom.xpath('//*[@id="langSec"]//input[@name="lr"]')
for option in options: for option in options:
code = option.xpath('./@id')[0][1:] code = option.xpath('./@value')[0].split('_')[-1]
name = option.text.title() name = option.xpath('./@data-name')[0].title()
supported_languages[code] = {"name": name} supported_languages[code] = {"name": name}
return supported_languages return supported_languages

View File

@@ -51,7 +51,7 @@ def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}), params['url'] = search_url.format(query=urlencode({'q': query}),
search_options=urlencode(search_options)) search_options=urlencode(search_options))
language = match_language(params['language'], supported_languages).split('-')[0] language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
if language: if language:
params['url'] += '&lr=lang_' + language params['url'] += '&lr=lang_' + language

View File

@@ -46,7 +46,7 @@ def request(query, params):
offset=offset) offset=offset)
# add language tag # add language tag
language = match_language(params['language'], supported_languages) language = match_language(params['language'], supported_languages, language_aliases)
params['url'] += '&locale=' + language.replace('-', '_').lower() params['url'] += '&locale=' + language.replace('-', '_').lower()
return params return params

View File

@@ -36,7 +36,7 @@ regex_img_url_remove_start = re.compile(b'^https?://i\.swisscows\.ch/\?link=')
# do search-request # do search-request
def request(query, params): def request(query, params):
region = match_language(params['language'], supported_languages) region = match_language(params['language'], supported_languages, language_aliases)
ui_language = region.split('-')[0] ui_language = region.split('-')[0]
search_path = search_string.format( search_path = search_string.format(

View File

@@ -68,7 +68,7 @@ def response(resp):
html = fromstring(resp.text) html = fromstring(resp.text)
search_results = html.xpath(wikidata_ids_xpath) search_results = html.xpath(wikidata_ids_xpath)
language = match_language(resp.search_params['language'], supported_languages).split('-')[0] language = match_language(resp.search_params['language'], supported_languages, language_aliases).split('-')[0]
# TODO: make requests asynchronous to avoid timeout when result_count > 1 # TODO: make requests asynchronous to avoid timeout when result_count > 1
for search_result in search_results[:result_count]: for search_result in search_results[:result_count]:

View File

@@ -31,7 +31,7 @@ supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
# set language in base_url # set language in base_url
def url_lang(lang): def url_lang(lang):
return match_language(lang, supported_languages).split('-')[0] return match_language(lang, supported_languages, language_aliases).split('-')[0]
# do search-request # do search-request

View File

@@ -9,6 +9,7 @@ class TestBingImagesEngine(SearxTestCase):
def test_request(self): def test_request(self):
bing_images.supported_languages = ['fr-FR', 'en-US'] bing_images.supported_languages = ['fr-FR', 'en-US']
bing_images.language_aliases = {}
query = 'test_query' query = 'test_query'
dicto = defaultdict(dict) dicto = defaultdict(dict)
dicto['pageno'] = 1 dicto['pageno'] = 1

View File

@@ -9,6 +9,7 @@ class TestBingVideosEngine(SearxTestCase):
def test_request(self): def test_request(self):
bing_videos.supported_languages = ['fr-FR', 'en-US'] bing_videos.supported_languages = ['fr-FR', 'en-US']
bing_videos.language_aliases = {}
query = 'test_query' query = 'test_query'
dicto = defaultdict(dict) dicto = defaultdict(dict)
dicto['pageno'] = 1 dicto['pageno'] = 1

View File

@@ -15,7 +15,8 @@ class TestGoogleEngine(SearxTestCase):
return response return response
def test_request(self): def test_request(self):
google.supported_languages = ['en', 'fr', 'zh-CN'] google.supported_languages = ['en', 'fr', 'zh-CN', 'iw']
google.language_aliases = {'he': 'iw'}
query = 'test_query' query = 'test_query'
dicto = defaultdict(dict) dicto = defaultdict(dict)
@@ -41,6 +42,12 @@ class TestGoogleEngine(SearxTestCase):
self.assertIn('zh-CN', params['url']) self.assertIn('zh-CN', params['url'])
self.assertIn('zh-CN', params['headers']['Accept-Language']) self.assertIn('zh-CN', params['headers']['Accept-Language'])
dicto['language'] = 'he'
params = google.request(query, dicto)
self.assertIn('google.com', params['url'])
self.assertIn('iw', params['url'])
self.assertIn('iw', params['headers']['Accept-Language'])
def test_response(self): def test_response(self):
self.assertRaises(AttributeError, google.response, None) self.assertRaises(AttributeError, google.response, None)
self.assertRaises(AttributeError, google.response, []) self.assertRaises(AttributeError, google.response, [])
@@ -198,29 +205,13 @@ class TestGoogleEngine(SearxTestCase):
html = u""" html = u"""
<html> <html>
<body> <body>
<table> <div id="langSec">
<tbody> <div>
<tr> <input name="lr" data-name="english" value="lang_en" />
<td> <input name="lr" data-name="中文 (简体)" value="lang_zh-CN" />
<font> <input name="lr" data-name="中文 (繁體)" value="lang_zh-TW" />
<label> </div>
<span id="ten">English</span> </div>
</label>
</font>
</td>
<td>
<font>
<label>
<span id="tzh-CN">中文 (简体)</span>
</label>
<label>
<span id="tzh-TW">中文 (繁體)</span>
</label>
</font>
</td>
</tr>
</tbody>
</table>
</body> </body>
</html> </html>
""" """

View File

@@ -10,6 +10,7 @@ class TestGoogleNewsEngine(SearxTestCase):
def test_request(self): def test_request(self):
google_news.supported_languages = ['en-US', 'fr-FR'] google_news.supported_languages = ['en-US', 'fr-FR']
google_news.language_aliases = {}
query = 'test_query' query = 'test_query'
dicto = defaultdict(dict) dicto = defaultdict(dict)
dicto['pageno'] = 1 dicto['pageno'] = 1

View File

@@ -8,6 +8,7 @@ class TestQwantEngine(SearxTestCase):
def test_request(self): def test_request(self):
qwant.supported_languages = ['en-US', 'fr-CA', 'fr-FR'] qwant.supported_languages = ['en-US', 'fr-CA', 'fr-FR']
qwant.language_aliases = {}
query = 'test_query' query = 'test_query'
dicto = defaultdict(dict) dicto = defaultdict(dict)
dicto['pageno'] = 0 dicto['pageno'] = 0

View File

@@ -8,6 +8,7 @@ class TestSwisscowsEngine(SearxTestCase):
def test_request(self): def test_request(self):
swisscows.supported_languages = ['de-AT', 'de-DE'] swisscows.supported_languages = ['de-AT', 'de-DE']
swisscows.language_aliases = {}
query = 'test_query' query = 'test_query'
dicto = defaultdict(dict) dicto = defaultdict(dict)
dicto['pageno'] = 1 dicto['pageno'] = 1

View File

@@ -27,6 +27,7 @@ class TestWikidataEngine(SearxTestCase):
self.assertRaises(AttributeError, wikidata.response, '[]') self.assertRaises(AttributeError, wikidata.response, '[]')
wikidata.supported_languages = ['en', 'es'] wikidata.supported_languages = ['en', 'es']
wikidata.language_aliases = {}
response = mock.Mock(text='<html></html>', search_params={"language": "en"}) response = mock.Mock(text='<html></html>', search_params={"language": "en"})
self.assertEqual(wikidata.response(response), []) self.assertEqual(wikidata.response(response), [])

View File

@@ -8,7 +8,8 @@ from searx.testing import SearxTestCase
class TestWikipediaEngine(SearxTestCase): class TestWikipediaEngine(SearxTestCase):
def test_request(self): def test_request(self):
wikipedia.supported_languages = ['fr', 'en'] wikipedia.supported_languages = ['fr', 'en', 'no']
wikipedia.language_aliases = {'nb': 'no'}
query = 'test_query' query = 'test_query'
dicto = defaultdict(dict) dicto = defaultdict(dict)
@@ -25,9 +26,13 @@ class TestWikipediaEngine(SearxTestCase):
self.assertIn('Test_Query', params['url']) self.assertIn('Test_Query', params['url'])
self.assertNotIn('test_query', params['url']) self.assertNotIn('test_query', params['url'])
dicto['language'] = 'nb'
params = wikipedia.request(query, dicto)
self.assertIn('no.wikipedia.org', params['url'])
dicto['language'] = 'xx' dicto['language'] = 'xx'
params = wikipedia.request(query, dicto) params = wikipedia.request(query, dicto)
self.assertIn('en', params['url']) self.assertIn('en.wikipedia.org', params['url'])
def test_response(self): def test_response(self):
dicto = defaultdict(dict) dicto = defaultdict(dict)