mirror of https://github.com/searxng/searxng.git
[fix] engine: google has changed the layout of its response
Since 28. March google has changed its response, this patch fixes the google engine to scrap out the results & images from the new designed response. closes: https://github.com/searxng/searxng/issues/2287 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
1498202b0b
commit
6f9e678346
|
@ -59,13 +59,10 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
|
||||||
# specific xpath variables
|
# specific xpath variables
|
||||||
# ------------------------
|
# ------------------------
|
||||||
|
|
||||||
results_xpath = './/div[@data-sokoban-container]'
|
results_xpath = './/div[contains(@jscontroller, "SC7lYd")]'
|
||||||
title_xpath = './/a/h3[1]'
|
title_xpath = './/a/h3[1]'
|
||||||
href_xpath = './/a[h3]/@href'
|
href_xpath = './/a[h3]/@href'
|
||||||
content_xpath = './/div[@data-content-feature]'
|
content_xpath = './/div[@data-sncf]'
|
||||||
|
|
||||||
# google *sections* are no usual *results*, we ignore them
|
|
||||||
g_section_with_header = './g-section-with-header'
|
|
||||||
|
|
||||||
# Suggestions are links placed in a *card-section*, we extract only the text
|
# Suggestions are links placed in a *card-section*, we extract only the text
|
||||||
# from the links not the links itself.
|
# from the links not the links itself.
|
||||||
|
@ -303,21 +300,18 @@ def request(query, params):
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
|
||||||
# (function(){var s=' ...
|
# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;
|
||||||
# ... DX/Ff5XSpSgdU32xSlKDJ//9k\x3d';var ii=['dimg_21'];_setImagesSrc(ii,s);})();
|
# ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26;
|
||||||
RE_DATA_IMAGE = re.compile(r"'(data:image[^']*)'[^']*ii=\['([^']*)'\];_setImagesSrc")
|
RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')
|
||||||
|
|
||||||
|
|
||||||
def _parse_data_images(dom):
|
def _parse_data_images(dom):
|
||||||
data_image_map = {}
|
data_image_map = {}
|
||||||
for _script in eval_xpath_list(dom, "//script[@nonce]"):
|
for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()):
|
||||||
script = _script.text
|
end_pos = data_image.rfind('=')
|
||||||
if not script:
|
if end_pos > 0:
|
||||||
continue
|
data_image = data_image[: end_pos + 1]
|
||||||
script = RE_DATA_IMAGE.search(script)
|
data_image_map[img_id] = data_image
|
||||||
if not script:
|
|
||||||
continue
|
|
||||||
data_image_map[script.group(2)] = script.group(1).replace(r'\x3d', '=')
|
|
||||||
logger.debug('data:image objects --> %s', list(data_image_map.keys()))
|
logger.debug('data:image objects --> %s', list(data_image_map.keys()))
|
||||||
return data_image_map
|
return data_image_map
|
||||||
|
|
||||||
|
@ -331,11 +325,7 @@ def response(resp):
|
||||||
|
|
||||||
# convert the text to dom
|
# convert the text to dom
|
||||||
dom = html.fromstring(resp.text)
|
dom = html.fromstring(resp.text)
|
||||||
|
data_image_map = _parse_data_images(dom)
|
||||||
data_image_map = {}
|
|
||||||
if '_fmt:html' in UI_ASYNC:
|
|
||||||
# in this format images are embedded by a bse64 encoded 'data:image'
|
|
||||||
data_image_map = _parse_data_images(dom)
|
|
||||||
|
|
||||||
# results --> answer
|
# results --> answer
|
||||||
answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
|
answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
|
||||||
|
@ -349,11 +339,6 @@ def response(resp):
|
||||||
|
|
||||||
for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks
|
for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks
|
||||||
|
|
||||||
# google *sections*
|
|
||||||
if extract_text(eval_xpath(result, g_section_with_header)):
|
|
||||||
logger.debug("ignoring <g-section-with-header>")
|
|
||||||
continue
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
|
title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
|
||||||
if title_tag is None:
|
if title_tag is None:
|
||||||
|
@ -361,33 +346,29 @@ def response(resp):
|
||||||
logger.debug('ignoring item from the result_xpath list: missing title')
|
logger.debug('ignoring item from the result_xpath list: missing title')
|
||||||
continue
|
continue
|
||||||
title = extract_text(title_tag)
|
title = extract_text(title_tag)
|
||||||
|
|
||||||
url = eval_xpath_getindex(result, href_xpath, 0, None)
|
url = eval_xpath_getindex(result, href_xpath, 0, None)
|
||||||
if url is None:
|
if url is None:
|
||||||
|
logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
content = []
|
content_nodes = eval_xpath(result, content_xpath)
|
||||||
img_list = []
|
content = extract_text(content_nodes)
|
||||||
for content_feature in eval_xpath(result, content_xpath):
|
|
||||||
val = content_feature.attrib['data-content-feature']
|
|
||||||
if val in ['1', '2']:
|
|
||||||
txt = extract_text(content_feature, allow_none=True)
|
|
||||||
if txt:
|
|
||||||
content.append(txt)
|
|
||||||
elif '0' in val:
|
|
||||||
img = content_feature.xpath('.//img/@src')
|
|
||||||
if img:
|
|
||||||
img = img[0]
|
|
||||||
if img.startswith('data:image'):
|
|
||||||
img_id = content_feature.xpath('.//img/@id')
|
|
||||||
if img_id:
|
|
||||||
img = data_image_map.get(img_id[0])
|
|
||||||
img_list.append(img)
|
|
||||||
|
|
||||||
if not content:
|
if not content:
|
||||||
logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
|
logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
|
||||||
continue
|
continue
|
||||||
content = ' / '.join(content)
|
|
||||||
img_src = img_list[0] if img_list else None
|
img_src = content_nodes[0].xpath('.//img/@src')
|
||||||
|
if img_src:
|
||||||
|
img_src = img_src[0]
|
||||||
|
if img_src.startswith('data:image'):
|
||||||
|
img_id = content_nodes[0].xpath('.//img/@id')
|
||||||
|
if img_id:
|
||||||
|
img_src = data_image_map.get(img_id[0])
|
||||||
|
else:
|
||||||
|
img_src = None
|
||||||
|
|
||||||
results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src})
|
results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src})
|
||||||
|
|
||||||
except Exception as e: # pylint: disable=broad-except
|
except Exception as e: # pylint: disable=broad-except
|
||||||
|
|
Loading…
Reference in New Issue