mirror of https://github.com/searxng/searxng.git
[fix] engine google-News: fix decoding of URLs
Google-News returns internal links in which the original URL is encoded as a URL-safe base64 string (RFC 4648, section 5). Closes: https://github.com/searxng/searxng/issues/1959 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
7592d85982
commit
8de8070ed9
|
@ -27,10 +27,8 @@ The google news API ignores some parameters from the common :ref:`google API`:
|
||||||
|
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
import binascii
|
|
||||||
import re
|
|
||||||
from urllib.parse import urlencode
|
from urllib.parse import urlencode
|
||||||
from base64 import b64decode
|
import base64
|
||||||
from lxml import html
|
from lxml import html
|
||||||
import babel
|
import babel
|
||||||
|
|
||||||
|
@ -144,34 +142,17 @@ def response(resp):
|
||||||
|
|
||||||
for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):
|
for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):
|
||||||
|
|
||||||
# The first <a> tag in the <article> contains the link to the
|
# The first <a> tag in the <article> contains the link to the article
|
||||||
# article The href attribute of the <a> is a google internal link,
|
# The href attribute of the <a> tag is a Google-internal link that we have
|
||||||
# we can't use. The real link is hidden in the jslog attribute:
|
# to decode
|
||||||
#
|
|
||||||
# <a ...
|
|
||||||
# jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
|
|
||||||
# href="./articles/CAIiENu3nGS...?hl=en-US&gl=US&ceid=US%3Aen"
|
|
||||||
# ... />
|
|
||||||
|
|
||||||
jslog = eval_xpath_getindex(result, './article/a/@jslog', 0)
|
href = eval_xpath_getindex(result, './article/a/@href', 0)
|
||||||
url = re.findall('http[^;]*', jslog)
|
href = href.split('?')[0]
|
||||||
if url:
|
href = href.split('/')[-1]
|
||||||
url = url[0]
|
href = base64.urlsafe_b64decode(href + '====')
|
||||||
else:
|
href = href[4:].split(b'\xd2')[0]
|
||||||
# The real URL is base64 encoded in the json attribute:
|
href = href.decode()
|
||||||
# jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"
|
|
||||||
jslog = jslog.split(";")[1].split(':')[1].strip()
|
|
||||||
try:
|
|
||||||
padding = (4 - (len(jslog) % 4)) * "="
|
|
||||||
jslog = b64decode(jslog + padding)
|
|
||||||
except binascii.Error:
|
|
||||||
# URL can't be read, skip this result
|
|
||||||
continue
|
|
||||||
|
|
||||||
# now we have : b'[null, ... null,"https://www.cnn.com/.../index.html"]'
|
|
||||||
url = re.findall('http[^;"]*', str(jslog))[0]
|
|
||||||
|
|
||||||
# the first <h3> tag in the <article> contains the title of the link
|
|
||||||
title = extract_text(eval_xpath(result, './article/h3[1]'))
|
title = extract_text(eval_xpath(result, './article/h3[1]'))
|
||||||
|
|
||||||
# The pub_date is mostly a string like 'yesterday', not a real
|
# The pub_date is mostly a string like 'yesterday', not a real
|
||||||
|
@ -189,7 +170,7 @@ def response(resp):
|
||||||
|
|
||||||
results.append(
|
results.append(
|
||||||
{
|
{
|
||||||
'url': url,
|
'url': href,
|
||||||
'title': title,
|
'title': title,
|
||||||
'content': content,
|
'content': content,
|
||||||
'img_src': img_src,
|
'img_src': img_src,
|
||||||
|
|
Loading…
Reference in New Issue