Compare commits

...

3 Commits

Author              SHA1        Message                                                             Date
Alexandre Flament   1a7b6872b5  Merge pull request #1792 from unixfox/google-images-internal-api   2022-09-21 19:50:38 +02:00
                                (use the internal API for google images)
Markus Heiser       cf7ee67f71  [mod] google-images: slightly improvements of the engine           2022-09-21 18:59:55 +02:00
                                (Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>)
Emilien Devos       df5f8d0e8e  use the internal API for google images                             2022-09-20 22:52:38 +02:00
1 changed file with 49 additions and 157 deletions

searx/engines/google_images.py

@@ -1,28 +1,20 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
-"""This is the implementation of the google images engine.
-
-.. admonition:: Content-Security-Policy (CSP)
-
-   This engine needs to allow images from the `data URLs`_ (prefixed with the
-   ``data:`` scheme)::
-
-     Header set Content-Security-Policy "img-src 'self' data: ;"
-
-.. _data URLs:
-   https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
+"""This is the implementation of the google images engine using the google
+internal API used the Google Go Android app.
+
+This internal API offer results in
+
+- JSON (_fmt:json)
+- Protobuf (_fmt:pb)
+- Protobuf compressed? (_fmt:pc)
+- HTML (_fmt:html)
+- Protobuf encoded in JSON (_fmt:jspb).
 """
 
-import re
-from urllib.parse import urlencode, unquote
-
-from lxml import html
-
-from searx.utils import (
-    eval_xpath,
-    eval_xpath_list,
-    eval_xpath_getindex,
-    extract_text,
-)
+from urllib.parse import urlencode
+from json import loads
 
 from searx.engines.google import (
     get_lang_info,
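Note on the new docstring: the _fmt values it lists are selected through the 'async' URL parameter that the request() hunk further below assembles. A minimal sketch of that selector, assuming the same '_fmt:<format>,p:1,ijn:<page>' layout used in the diff (the build_async_param helper is illustrative, not part of the engine):

    def build_async_param(fmt: str = 'json', page: int = 1) -> str:
        """Compose the 'async' value that picks the internal API format.

        fmt is one of json, pb, pc, html, jspb (see the docstring above);
        ijn selects the result page, which is how paging works here.
        """
        return f'_fmt:{fmt},p:1,ijn:{page}'

    # e.g. build_async_param('json', 2) == '_fmt:json,p:1,ijn:2'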
@@ -42,12 +34,12 @@ about = {
     "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
-    "results": 'HTML',
+    "results": 'JSON',
 }
 
 # engine dependent config
 categories = ['images', 'web']
-paging = False
+paging = True
 use_locale_domain = True
 time_range_support = True
 safesearch = True
@@ -56,74 +48,8 @@ send_accept_language_header = True
 
 filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
 
-
-def scrap_out_thumbs(dom):
-    """Scrap out thumbnail data from <script> tags."""
-    ret_val = {}
-    for script in eval_xpath(dom, '//script[contains(., "_setImgSrc(")]'):
-        _script = script.text
-        # _setImgSrc('0','data:image\/jpeg;base64,\/9j\/4AAQSkZJR ....');
-        _thumb_no, _img_data = _script[len("_setImgSrc(") : -2].split(",", 1)
-        _thumb_no = _thumb_no.replace("'", "")
-        _img_data = _img_data.replace("'", "")
-        _img_data = _img_data.replace(r"\/", r"/")
-        ret_val[_thumb_no] = _img_data.replace(r"\x3d", "=")
-    return ret_val
-
-
-# [0, "-H96xjSoW5DsgM", ["https://encrypted-tbn0.gstatic.com/images?q...", 155, 324]
-# , ["https://assets.cdn.moviepilot.de/files/d3bf..", 576, 1200],
-_RE_JS_IMAGE_URL = re.compile(
-    r'"'
-    r'([^"]*)'  # -H96xjSoW5DsgM
-    r'",\s*\["'
-    r'https://[^\.]*\.gstatic.com/images[^"]*'  # https://encrypted-tbn0.gstatic.com/images?q...
-    r'[^\[]*\["'
-    r'(https?://[^"]*)'  # https://assets.cdn.moviepilot.de/files/d3bf...
-)
-
-
-def parse_urls_img_from_js(dom):
-    # There are two HTML script tags starting with a JS function
-    # 'AF_initDataCallback(...)'
-    #
-    # <script nonce="zscm+Ab/JzBk1Qd4GY6wGQ">
-    #     AF_initDataCallback({key: 'ds:0', hash: '1', data:[], sideChannel: {}});
-    # </script>
-    # <script nonce="zscm+Ab/JzBk1Qd4GY6wGQ">
-    #     AF_initDataCallback({key: 'ds:1', hash: '2', data:[null,[[["online_chips",[["the big",
-    #     ["https://encrypted-tbn0.gstatic.com/images?q...",null,null,true,[null,0],f
-    #     ...
-    # </script>
-    #
-    # The second script contains the URLs of the images.
-    # The AF_initDataCallback(..) is called with very large dictionary, that
-    # looks like JSON but it is not JSON since it contains JS variables and
-    # constants like 'null' (we can't use a JSON parser for).
-    #
-    # The alternative is to parse the entire <script> and find all image URLs by
-    # a regular expression.
-    img_src_script = eval_xpath_getindex(dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text
-    data_id_to_img_url = {}
-    for data_id, url in _RE_JS_IMAGE_URL.findall(img_src_script):
-        data_id_to_img_url[data_id] = url
-    return data_id_to_img_url
-
-
-def get_img_url_by_data_id(data_id_to_img_url, img_node):
-    """Get full image URL by @data-id from parent element."""
-    data_id = eval_xpath_getindex(img_node, '../../../@data-id', 0)
-    img_url = data_id_to_img_url.get(data_id, '')
-    img_url = unquote(img_url.replace(r'\u00', r'%'))
-    return img_url
-
-
 def request(query, params):
-    """Google-Video search request"""
+    """Google-Image search request"""
 
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
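For context on what is deleted here: _RE_JS_IMAGE_URL mapped a thumbnail's data-id to the full-size image URL inside the AF_initDataCallback payload. A standalone sketch of that lookup, run against an abbreviated payload shaped like the sample quoted in the removed comments (the payload string is invented for illustration):

    import re

    # Identical to the removed _RE_JS_IMAGE_URL pattern.
    _RE_JS_IMAGE_URL = re.compile(
        r'"'
        r'([^"]*)'  # data-id, e.g. -H96xjSoW5DsgM
        r'",\s*\["'
        r'https://[^\.]*\.gstatic.com/images[^"]*'  # thumbnail URL (not captured)
        r'[^\[]*\["'
        r'(https?://[^"]*)'  # full-size image URL (captured)
    )

    # Invented payload following the structure quoted in the removed comments.
    payload = (
        '[0, "-H96xjSoW5DsgM", ["https://encrypted-tbn0.gstatic.com/images?q=tbn", 155, 324]'
        ', ["https://assets.cdn.moviepilot.de/files/d3bf.jpg", 576, 1200],'
    )

    print(dict(_RE_JS_IMAGE_URL.findall(payload)))
    # {'-H96xjSoW5DsgM': 'https://assets.cdn.moviepilot.de/files/d3bf.jpg'}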
@@ -132,7 +58,17 @@ def request(query, params):
         + lang_info['subdomain']
         + '/search'
         + "?"
-        + urlencode({'q': query, 'tbm': "isch", **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'num': 30})
+        + urlencode(
+            {
+                'q': query,
+                'tbm': "isch",
+                **lang_info['params'],
+                'ie': "utf8",
+                'oe': "utf8",
+                'asearch': 'isch',
+                'async': '_fmt:json,p:1,ijn:' + str(params['pageno']),
+            }
+        )
     )
 
     if params['time_range'] in time_range_dict:
@@ -141,9 +77,9 @@
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
     params['url'] = query_url
 
-    params['cookies']['CONSENT'] = "YES+"
     params['headers'].update(lang_info['headers'])
-    params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
+    params['headers']['User-Agent'] = 'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12; US) gzip'
+    params['headers']['Accept'] = '*/*'
     return params
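Taken together, request() now produces a URL for Google's internal endpoint rather than the regular result page. A rough sketch of the outcome for a first-page query, with hypothetical stand-ins for lang_info['subdomain'] and lang_info['params']:

    from urllib.parse import urlencode

    # Hypothetical stand-ins for lang_info['subdomain'] / lang_info['params'].
    subdomain = 'www.google.com'
    lang_params = {'hl': 'en', 'lr': 'lang_en'}

    query_url = (
        'https://' + subdomain + '/search?'
        + urlencode({
            'q': 'searxng',
            'tbm': 'isch',                    # image search vertical
            **lang_params,
            'ie': 'utf8',
            'oe': 'utf8',
            'asearch': 'isch',                # ask for the internal/async variant
            'async': '_fmt:json,p:1,ijn:1',   # JSON format, page number 1
        })
    )
    print(query_url)
    # https://www.google.com/search?q=searxng&tbm=isch&hl=en&lr=lang_en
    #   &ie=utf8&oe=utf8&asearch=isch&async=_fmt%3Ajson%2Cp%3A1%2Cijn%3A1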
@@ -153,78 +89,34 @@ def response(resp):
 
     detect_google_sorry(resp)
 
-    # convert the text to dom
-    dom = html.fromstring(resp.text)
-    img_bas64_map = scrap_out_thumbs(dom)
-    data_id_to_img_url = parse_urls_img_from_js(dom)
+    json_start = resp.text.find('{"ischj":')
+    json_data = loads(resp.text[json_start:])
 
-    # parse results
-    #
-    # root element::
-    #     <div id="islmp" ..>
-    # result div per image::
-    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
-    #       The data-id matches to a item in a json-data structure in::
-    #           <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
-    #       In this structure the link to the origin PNG, JPG or whatever is given
-    # first link per image-div contains a <img> with the data-iid for bas64 encoded image data::
-    #     <img class="rg_i Q4LuWd" data-iid="0"
-    # second link per image-div is the target link::
-    #     <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
-    # the second link also contains two div tags with the *description* and *publisher*::
-    #     <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
-    #     <div class="fxgdke">en.wikipedia.org</div>
-
-    root = eval_xpath(dom, '//div[@id="islmp"]')
-    if not root:
-        logger.error("did not find root element id='islmp'")
-        return results
-
-    root = root[0]
-    for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'):
-
-        img_alt = eval_xpath_getindex(img_node, '@alt', 0)
-
-        img_base64_id = eval_xpath(img_node, '@data-iid')
-        if img_base64_id:
-            img_base64_id = img_base64_id[0]
-            thumbnail_src = img_bas64_map[img_base64_id]
-        else:
-            thumbnail_src = eval_xpath(img_node, '@src')
-            if not thumbnail_src:
-                thumbnail_src = eval_xpath(img_node, '@data-src')
-            if thumbnail_src:
-                thumbnail_src = thumbnail_src[0]
-            else:
-                thumbnail_src = ''
-
-        link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0)
-        url = eval_xpath_getindex(link_node, '@href', 0, None)
-        if url is None:
-            logger.error("missing @href in node: %s", html.tostring(link_node))
-            continue
-
-        pub_nodes = eval_xpath(link_node, './div/div')
-        pub_descr = img_alt
-        pub_source = ''
-        if pub_nodes:
-            pub_descr = extract_text(pub_nodes[0])
-            pub_source = extract_text(pub_nodes[1])
-
-        src_url = get_img_url_by_data_id(data_id_to_img_url, img_node)
-        if not src_url:
-            src_url = thumbnail_src
-
-        results.append(
-            {
-                'url': url,
-                'title': img_alt,
-                'content': pub_descr,
-                'source': pub_source,
-                'img_src': src_url,
-                'thumbnail_src': thumbnail_src,
-                'template': 'images.html',
-            }
-        )
+    for item in json_data["ischj"]["metadata"]:
+        result_item = {
+            'url': item["result"]["referrer_url"],
+            'title': item["result"]["page_title"],
+            'content': item["text_in_grid"]["snippet"],
+            'source': item["result"]["site_title"],
+            'img_format': f'{item["original_image"]["width"]} x {item["original_image"]["height"]}',
+            'img_src': item["original_image"]["url"],
+            'thumbnail_src': item["thumbnail"]["url"],
+            'template': 'images.html',
+        }
+
+        author = item["result"].get('iptc', {}).get('creator')
+        if author:
+            result_item['author'] = ', '.join(author)
+
+        copyright_notice = item["result"].get('iptc', {}).get('copyright_notice')
+        if copyright_notice:
+            result_item['source'] += ' / ' + copyright_notice
+
+        file_size = item.get('gsa', {}).get('file_size')
+        if file_size:
+            result_item['source'] += ' (%s)' % file_size
+
+        results.append(result_item)
 
     return results
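The rewritten response() no longer parses HTML at all: it slices the JSON document out of the response body and walks json_data["ischj"]["metadata"]. A self-contained sketch of that flow with an invented single-entry payload (only the key paths come from the diff above; all values are made up):

    from json import loads

    # Invented payload; real responses prefix the JSON with non-JSON bytes,
    # which is why response() searches for '{"ischj":' first.
    raw = (
        ')]}\'\n'
        '{"ischj": {"metadata": [{'
        '"result": {"referrer_url": "https://example.org/page", '
        '"page_title": "Example image", "site_title": "example.org"}, '
        '"text_in_grid": {"snippet": "An example snippet"}, '
        '"original_image": {"url": "https://example.org/img.jpg", '
        '"width": 800, "height": 600}, '
        '"thumbnail": {"url": "https://example.org/thumb.jpg"}}]}}'
    )

    json_start = raw.find('{"ischj":')
    json_data = loads(raw[json_start:])

    for item in json_data["ischj"]["metadata"]:
        print(item["result"]["page_title"], '->', item["original_image"]["url"])
    # Example image -> https://example.org/img.jpg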