From 8ae66a7eaaa0cbfdc4a7c095e36a308dbc5eedc8 Mon Sep 17 00:00:00 2001
From: Fmstrat
Date: Tue, 13 Aug 2024 19:37:28 -0400
Subject: [PATCH 1/2] update for enhanced google answers

---
 searx/engines/google.py             | 71 ++++++++++++++++++++++++-----
 searx/templates/simple/results.html |  6 ++-
 2 files changed, 65 insertions(+), 12 deletions(-)

diff --git a/searx/engines/google.py b/searx/engines/google.py
index f9215783c..71ac741b4 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -320,6 +320,51 @@ def _parse_data_images(dom):
     return data_image_map
 
 
+def _eval_answers(results, dom, xpath):
+    answer_list = eval_xpath(dom, xpath)
+    drop_elements = [
+        './/div[@class="nnFGuf"]',
+        './/script',  # Scripts like the calculator
+        './/table[@class="HOoTuc"]',  # The actual calculator controls
+        './/table[@class="ElumCf"]',  # The actual calculator buttons
+        './/div[@class="mDRaHd"]',  # Instructions with links
+        './/span[@class="W7GCoc CNbPnc"]',  # Feedback
+        './/*[@style="display:none"]',  # Hidden elements
+    ]
+    for item in answer_list:
+        for element in eval_xpath(item, ' | '.join(drop_elements)):
+            element.drop_tree()
+        extracted_table_html = None
+        is_safe = False
+        table_elements = eval_xpath(item, './/table')
+        if table_elements:
+            extracted_table = table_elements[0]
+            extracted_table.attrib.clear()
+            for element in extracted_table.xpath('.//*'):
+                element.attrib.clear()
+            extracted_table.set('cellpadding', '2')
+            extracted_table.set('cellspacing', '2')
+            extracted_table.set('border', '0')
+            extracted_table_html = html.tostring(extracted_table, pretty_print=True, encoding='unicode')
+            is_safe = True
+        for element in eval_xpath(item, './/table'):  # Drop all remaining tables
+            element.drop_tree()
+        answer_content = extract_text(item)
+        if extracted_table_html:
+            answer_content += '<br><br>' + extracted_table_html
+        url = (eval_xpath(item, '../..//a/@href') + [None])[0]
+        if url and url.startswith('/search?'):  # If the answer is a Google search link, don't use it
+            url = None
+        results.append(
+            {
+                'answer': answer_content,
+                'url': url,
+                'safe': is_safe,
+            }
+        )
+    return results
+
+
 def response(resp):
     """Get response from google's search request"""
     # pylint: disable=too-many-branches, too-many-statements
@@ -331,17 +376,21 @@ def response(resp):
     dom = html.fromstring(resp.text)
     data_image_map = _parse_data_images(dom)
 
-    # results --> answer
-    answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
-    for item in answer_list:
-        for bubble in eval_xpath(item, './/div[@class="nnFGuf"]'):
-            bubble.drop_tree()
-        results.append(
-            {
-                'answer': extract_text(item),
-                'url': (eval_xpath(item, '../..//a/@href') + [None])[0],
-            }
-        )
+    results = _eval_answers(results, dom, '//div[contains(@class, "card-section")]')  # Look for cards first
+    if not results:
+        results = _eval_answers(results, dom, '//div[contains(@class, "LGOjhe")]')  # Look for rendered answers next
+    # Look for JS DOM-encoded string answers last
+    if not results:
+        pattern = r"'\\x3c.*?\\x3e'"  # DOM-encoded strings that Google's JS pushes into the DOM
+        matches = re.findall(pattern, resp.text)
+        for match in matches:
+            decoded_html = match.encode().decode('unicode_escape')
+            encoded_dom = html.fromstring(decoded_html)
+            sub_doms = eval_xpath(encoded_dom, '//div[contains(@class, "LGOjhe")]')
+            if sub_doms:
+                if '<b>' in decoded_html:  # Main answers start with a bold element
+                    results = _eval_answers(results, encoded_dom, '//div[contains(@class, "LGOjhe")]')
+                break  # Only the first JS-encoded answer is considered, and only if it contains a bold element
 
     # parse results
 
diff --git a/searx/templates/simple/results.html b/searx/templates/simple/results.html
index 52b474c77..68c1fe6a9 100644
--- a/searx/templates/simple/results.html
+++ b/searx/templates/simple/results.html
@@ -23,7 +23,11 @@
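A minimal, self-contained sketch of the JS-encoded answer path added to response() in the second google.py hunk. The AF_initDataCallback payload and the sample markup are invented for illustration, and plain lxml xpath() stands in for searx's eval_xpath helper; only the regex, the unicode_escape decoding, and the "LGOjhe" lookup are taken from the patch.

# Standalone sketch of decoding Google's \xNN-escaped answer fragments.
# The sample payload below is hypothetical; real responses embed much larger fragments.
import re

from lxml import html

raw_response = r"""
AF_initDataCallback({data: ['\x3cdiv class\x3d"LGOjhe"\x3e\x3cb\x3e42\x3c/b\x3e kilometers\x3c/div\x3e']});
"""

# Single-quoted strings that start with an escaped '<' and end with an escaped '>'.
pattern = r"'\\x3c.*?\\x3e'"

for match in re.findall(pattern, raw_response):
    # Undo the \xNN escaping to obtain real HTML markup.
    decoded_html = match.encode().decode('unicode_escape')
    encoded_dom = html.fromstring(decoded_html)
    answers = encoded_dom.xpath('//div[contains(@class, "LGOjhe")]')
    if answers and '<b>' in decoded_html:  # main answers start with a bold element
        print(answers[0].text_content().strip())  # -> 42 kilometers
    break  # as in the patch, only the first JS-encoded fragment is considered

Note that in the patch the 'safe' flag is only set after the extracted table has had every attribute cleared, so a template consuming these results can presumably render that table HTML directly while continuing to escape answers that did not go through the cleanup.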