[fix] engine yahoo: HTML tags are included in result titles

- https://github.com/searxng/searxng/issues/3790

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus 2024-09-01 18:53:56 +02:00 committed by Bnyro
parent 94a1f39bde
commit 21bfb4996e
1 changed file with 12 additions and 3 deletions

View File

@@ -16,6 +16,7 @@ from searx.utils import (
eval_xpath_getindex, eval_xpath_getindex,
eval_xpath_list, eval_xpath_list,
extract_text, extract_text,
html_to_text,
) )
from searx.enginelib.traits import EngineTraits from searx.enginelib.traits import EngineTraits
@@ -133,12 +134,20 @@ def response(resp):
url = parse_url(url) url = parse_url(url)
title = eval_xpath_getindex(result, './/h3//a/@aria-label', 0, default='') title = eval_xpath_getindex(result, './/h3//a/@aria-label', 0, default='')
title = extract_text(title) title: str = extract_text(title)
content = eval_xpath_getindex(result, './/div[contains(@class, "compText")]', 0, default='') content = eval_xpath_getindex(result, './/div[contains(@class, "compText")]', 0, default='')
content = extract_text(content, allow_none=True) content: str = extract_text(content, allow_none=True)
# append result # append result
results.append({'url': url, 'title': title, 'content': content}) results.append(
{
'url': url,
# title sometimes contains HTML tags / see
# https://github.com/searxng/searxng/issues/3790
'title': " ".join(html_to_text(title).strip().split()),
'content': " ".join(html_to_text(content).strip().split()),
}
)
for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]//table//a'): for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]//table//a'):
# append suggestion # append suggestion