[fix] startpage engine: XPath expressions adapted for new HTML layout

Startpage has changed its HTML layout: classes like ``w-gl__result__main`` no
longer exist, and the structure of the result items has been slightly
changed.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2024-05-04 08:45:42 +02:00 committed by Markus Heiser
parent d577817646
commit dbed8da284
1 changed file with 5 additions and 11 deletions

View File

@ -142,9 +142,6 @@ search_url = base_url + '/sp/search'
# specific xpath variables # specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# not ads: div[@class="result"] are the direct childs of div[@id="results"] # not ads: div[@class="result"] are the direct childs of div[@id="results"]
results_xpath = '//div[@class="w-gl__result__main"]'
link_xpath = './/a[@class="w-gl__result-title result-link"]'
content_xpath = './/p[@class="w-gl__description"]'
search_form_xpath = '//form[@id="search"]' search_form_xpath = '//form[@id="search"]'
"""XPath of Startpage's origin search form """XPath of Startpage's origin search form
@ -334,8 +331,8 @@ def _response_cat_web(dom):
results = [] results = []
# parse results # parse results
for result in eval_xpath(dom, results_xpath): for result in eval_xpath(dom, '//div[@class="w-gl"]/div[contains(@class, "result")]'):
links = eval_xpath(result, link_xpath) links = eval_xpath(result, './/a[contains(@class, "result-title result-link")]')
if not links: if not links:
continue continue
link = links[0] link = links[0]
@ -349,12 +346,9 @@ def _response_cat_web(dom):
if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url): if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
continue continue
title = extract_text(link) title = extract_text(eval_xpath(link, 'h2'))
content = eval_xpath(result, './/p[contains(@class, "description")]')
if eval_xpath(result, content_xpath): content = extract_text(content, allow_none=True) or ''
content: str = extract_text(eval_xpath(result, content_xpath)) # type: ignore
else:
content = ''
published_date = None published_date = None