[mod] replace js_variable_to_python by js_obj_str_to_python (#2792) (#5477)

This patch is based on PR #2792 (old PR from 2023)

- js_obj_str_to_python handles more cases
- bring in tests from chompjs
- comment out the tests that do not pass

The tests from chompjs give some overview of what is not implemented.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser
2025-11-25 12:51:08 +01:00
committed by GitHub
parent 0ee78c19dd
commit 54a97e1043
6 changed files with 410 additions and 61 deletions

View File

@@ -50,7 +50,7 @@ def response(resp):
pos = script.index(end_tag) + len(end_tag) - 1
script = script[:pos]
json_resp = utils.js_variable_to_python(script)
json_resp = utils.js_obj_str_to_python(script)
results = []

View File

@@ -134,7 +134,7 @@ from searx.utils import (
eval_xpath,
eval_xpath_list,
eval_xpath_getindex,
js_variable_to_python,
js_obj_str_to_python,
get_embeded_stream_url,
)
from searx.enginelib.traits import EngineTraits
@@ -262,7 +262,7 @@ def response(resp: SXNG_Response) -> EngineResults:
# data: [{type:"data",data: .... ["q","goggles_id"],route:1,url:1}}]
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
js_object = "[{" + extr(resp.text, "data: [{", "}}],") + "}}]"
json_data = js_variable_to_python(js_object)
json_data = js_obj_str_to_python(js_object)
# json_data is a list and at the second position (0,1) in this list we find the "response" data we need ..
json_resp = json_data[1]['data']['body']['response']
@@ -439,9 +439,9 @@ def fetch_traits(engine_traits: EngineTraits):
resp = get('https://search.brave.com/settings')
if not resp.ok: # type: ignore
if not resp.ok:
print("ERROR: response from Brave is not OK.")
dom = html.fromstring(resp.text) # type: ignore
dom = html.fromstring(resp.text)
for option in dom.xpath('//section//option[@value="en-us"]/../option'):
@@ -468,12 +468,12 @@ def fetch_traits(engine_traits: EngineTraits):
resp = get('https://cdn.search.brave.com/serp/v2/_app/immutable/chunks/parameters.734c106a.js')
if not resp.ok: # type: ignore
if not resp.ok:
print("ERROR: response from Brave is not OK.")
country_js = resp.text[resp.text.index("options:{all") + len('options:') :] # type: ignore
country_js = resp.text[resp.text.index("options:{all") + len('options:') :]
country_js = country_js[: country_js.index("},k={default")]
country_tags = js_variable_to_python(country_js)
country_tags = js_obj_str_to_python(country_js)
for k, v in country_tags.items():
if k == 'all':

View File

@@ -407,7 +407,7 @@ def fetch_traits(engine_traits: EngineTraits):
"""
# pylint: disable=too-many-branches, too-many-statements, disable=import-outside-toplevel
from searx.utils import js_variable_to_python
from searx.utils import js_obj_str_to_python
# fetch regions
@@ -455,7 +455,7 @@ def fetch_traits(engine_traits: EngineTraits):
js_code = extr(resp.text, 'languages:', ',regions') # type: ignore
languages = js_variable_to_python(js_code)
languages: dict[str, str] = js_obj_str_to_python(js_code)
for eng_lang, name in languages.items():
if eng_lang == 'wt_WT':

View File

@@ -15,7 +15,7 @@ from searx.utils import (
extr,
html_to_text,
parse_duration_string,
js_variable_to_python,
js_obj_str_to_python,
get_embeded_stream_url,
)
@@ -125,7 +125,7 @@ def parse_images(data):
match = extr(data, '<script>var imageSearchTabData=', '</script>')
if match:
json = js_variable_to_python(match.strip())
json = js_obj_str_to_python(match.strip())
items = json.get('content', {}).get('items', [])
for item in items:

View File

@@ -49,9 +49,14 @@ _BLOCKED_TAGS = ('script', 'style')
_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)')
_JS_DECIMAL_RE = re.compile(r":\s*\.")
_JS_STRING_DELIMITERS = re.compile(r'(["\'`])')
_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])([\$_\w][\$_\w0-9]*)(:)')
_JS_VOID_OR_UNDEFINED_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)|undefined')
_JS_DECIMAL_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]*)\.([0-9_]*)")
_JS_DECIMAL2_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]+)")
_JS_EXTRA_COMA_RE = re.compile(r"\s*,\s*([\]\}])")
_JS_STRING_ESCAPE_RE = re.compile(r'\\(.)')
_JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
_XPATH_CACHE: dict[str, XPath] = {}
_LANG_TO_LC_CACHE: dict[str, dict[str, str]] = {}
@@ -741,12 +746,53 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo
return None
def js_variable_to_python(js_variable: str) -> t.Any:
def _j2p_process_escape(match: re.Match[str]) -> str:
    r"""Translate one JS string escape sequence into its JSON equivalent.

    ``match`` comes from ``_JS_STRING_ESCAPE_RE``: a backslash followed by a
    single character, which is captured in group 1.

    * a character listed in ``_JSON_PASSTHROUGH_ESCAPES`` keeps its backslash,
    * ``\x`` is rewritten to the prefix ``\u00`` so that ``\xHH`` becomes
      ``\u00HH``,
    * a backslash in front of a newline is dropped together with the newline,
    * for any other character only the backslash is dropped.

    :param match: match object whose group 1 is the escaped character.
    :return: the replacement text for the whole escape sequence.
    """
    # The pattern has exactly one capture group and it always holds one
    # character, so there is no second group to fall back to (asking for it
    # would raise IndexError).
    escaped = match.group(1)

    if escaped in _JSON_PASSTHROUGH_ESCAPES:
        # JSON knows this escape too: keep it as it is
        return "\\" + escaped
    if escaped == "x":
        # JSON has no \xHH, but \u00HH denotes the same code point
        return "\\u00"
    if escaped == "\n":
        # line continuation; only reachable when the pattern is compiled in a
        # way that lets "." match a newline
        return ""
    # unknown to JSON (e.g. an escaped simple quote): keep the bare character
    return escaped
def _j2p_decimal(match: re.Match[str]) -> str:
    """Rebuild a JS float literal so that it is a valid JSON number.

    The match provides four groups: the character in front of the number,
    the optional minus sign, the integer part and the fractional part.
    Underscores are removed from both parts and an empty part is replaced by
    ``0``; whitespace around the sign is not carried over.
    """
    prefix, sign, int_part, frac_part = match.group(1, 2, 3, 4)
    int_digits = int_part.replace("_", "") or "0"
    frac_digits = frac_part.replace("_", "") or "0"
    return "".join((prefix, sign, int_digits, ".", frac_digits))
def _j2p_decimal2(match: re.Match[str]) -> str:
    """Remove the numeric separators (``_``) from a JS integer literal.

    The match provides three groups: the character in front of the number
    (``[``, ``,`` or ``:``), the optional minus sign and a run of digits and
    underscores.  Whitespace around the sign is not carried over.

    A run that starts with an underscore is not a number but the beginning of
    an identifier (for example the key in ``{a: 1, _key: 2}``); such a match is
    returned unchanged, otherwise the identifier would lose its leading
    underscore.

    :param match: match object with the three groups described above.
    :return: the replacement text for the whole match.
    """
    digits = match.group(3)
    if digits.startswith("_"):
        # identifier, not a numeric literal: leave the text untouched
        return match.group(0)
    return match.group(1) + match.group(2) + digits.replace("_", "")
def js_obj_str_to_python(js_obj_str: str) -> t.Any:
    """Parse the source text of a JavaScript object literal into Python data.

    The text is first rewritten to JSON by :py:obj:`js_obj_str_to_json_str`
    and the result is then decoded with :py:obj:`json.loads`.  Not every
    JavaScript construct is covered; chompjs has a more complete
    implementation.

    :raises ValueError: if the rewritten text is empty or is not valid JSON.
    """
    json_str = js_obj_str_to_json_str(js_obj_str)

    if json_str == "":
        raise ValueError("js_obj_str can't be an empty string")

    try:
        value = json.loads(json_str)
    except json.JSONDecodeError as exc:
        # the rewritten text is logged to make the failing conversion traceable
        logger.debug("Internal error: js_obj_str_to_python creates invalid JSON:\n%s", json_str)
        raise ValueError("js_obj_str_to_python creates invalid JSON") from exc
    return value
def js_obj_str_to_json_str(js_obj_str: str) -> str:
if not isinstance(js_obj_str, str):
raise ValueError("js_obj_str must be of type str")
if js_obj_str == "":
raise ValueError("js_obj_str can't be an empty string")
# when in_string is not None, it contains the character that has opened the string
# either simple quote or double quote
in_string = None
@@ -754,61 +800,78 @@ def js_variable_to_python(js_variable: str) -> t.Any:
# r"""{ a:"f\"irst", c:'sec"ond'}"""
# becomes
# ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
parts = re.split(r'(["\'])', js_variable)
# previous part (to check the escape character antislash)
previous_p = ""
parts = _JS_STRING_DELIMITERS.split(js_obj_str)
# does the previous part ends with a backslash?
blackslash_just_before = False
for i, p in enumerate(parts):
# parse characters inside a ECMA string
if in_string:
# we are in a JS string: replace the colon by a temporary character
# so quote_keys_regex doesn't have to deal with colon inside the JS strings
parts[i] = parts[i].replace(':', chr(1))
if in_string == "'":
# the JS string is delimited by simple quote.
# This is not supported by JSON.
# simple quote delimited string are converted to double quote delimited string
# here, inside a JS string, we escape the double quote
parts[i] = parts[i].replace('"', r'\"')
# deal with delimiters and escape character
if not in_string and p in ('"', "'"):
# we are not in string
# but p is double or simple quote
# that's the start of a new string
# replace simple quote by double quote
# (JSON doesn't support simple quote)
parts[i] = '"'
in_string = p
continue
if p == in_string:
# we are in a string and the current part MAY close the string
if len(previous_p) > 0 and previous_p[-1] == '\\':
# there is an antislash just before: the ECMA string continue
continue
# the current p close the string
# replace simple quote by double quote
parts[i] = '"'
if p == in_string and not blackslash_just_before:
# * the current part matches the character which has opened the string
# * there is no antislash just before
# --> the current part close the current string
in_string = None
# replace simple quote and ` by double quote
# since JSON supports only double quote for string
parts[i] = '"'
if not in_string:
# replace void 0 by null
elif in_string:
# --> we are in a JS string
# replace the colon by a temporary character
# so _JS_QUOTE_KEYS_RE doesn't have to deal with colon inside the JS strings
p = p.replace(':', chr(1))
# replace JS escape sequences by JSON escape sequences
p = _JS_STRING_ESCAPE_RE.sub(_j2p_process_escape, p)
# the JS string is delimited by simple quote.
# This is not supported by JSON.
# simple quote delimited string are converted to double quote delimited string
# here, inside a JS string, we escape the double quote
if in_string == "'":
p = p.replace('"', r'\"')
parts[i] = p
# deal with the sequence blackslash then quote
# since js_obj_str splits on quote, we detect this case:
# * the previous part ends with a black slash
# * the current part is a single quote
# when detected the blackslash is removed on the previous part
if blackslash_just_before and p[:1] == "'":
parts[i - 1] = parts[i - 1][:-1]
elif in_string is None and p in ('"', "'", "`"):
# we are not in string but p is string delimiter
# --> that's the start of a new string
in_string = p
# replace simple quote by double quote
# since JSON supports only double quote for string
parts[i] = '"'
elif in_string is None:
# we are not in a string
# replace by null these values:
# * void 0
# * void(0)
# * undefined
# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
# we are sure there is no string in p
parts[i] = _JS_VOID_RE.sub("null", p)
# update previous_p
previous_p = p
p = _JS_VOID_OR_UNDEFINED_RE.sub("null", p)
# make sure there is a leading zero in front of float
p = _JS_DECIMAL_RE.sub(_j2p_decimal, p)
p = _JS_DECIMAL2_RE.sub(_j2p_decimal2, p)
# remove extra coma in a list or an object
# for example [1,2,3,] becomes [1,2,3]
p = _JS_EXTRA_COMA_RE.sub(lambda match: match.group(1), p)
parts[i] = p
# update for the next iteration
blackslash_just_before = len(p) > 0 and p[-1] == '\\'
# join the string
s = ''.join(parts)
# add quote around the key
# add quote arround the key
# { a: 12 }
# becomes
# { "a": 12 }
s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
s = _JS_DECIMAL_RE.sub(":0.", s)
# replace the surogate character by colon
s = s.replace(chr(1), ':')
# load the JSON and return the result
return json.loads(s)
# replace the surogate character by colon and strip whitespaces
s = s.replace(chr(1), ':').strip()
return s
def parse_duration_string(duration_str: str) -> timedelta | None: