diff options
| author | Markus Heiser <markus.heiser@darmarIT.de> | 2025-11-25 12:51:08 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-11-25 12:51:08 +0100 |
| commit | 54a97e10431c1cdae910d3b37074a63eda7100fc (patch) | |
| tree | 3769dc3a7eae53ad96660cf47dfc476b8c26963d /searx | |
| parent | 0ee78c19dd5403560f14a13bbedeb75318ace45d (diff) | |
[mod] replace js_variable_to_python by js_obj_str_to_python (#2792) (#5477)
This patch is based on PR #2792 (old PR from 2023)
- js_obj_str_to_python handles more cases
- bring tests from chompjs ..
- comment out the tests that do not pass
The tests from chompjs give some overview of what is not implemented.
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx')
| -rw-r--r-- | searx/engines/ask.py | 2 | ||||
| -rw-r--r-- | searx/engines/brave.py | 14 | ||||
| -rw-r--r-- | searx/engines/duckduckgo.py | 4 | ||||
| -rw-r--r-- | searx/engines/naver.py | 4 | ||||
| -rw-r--r-- | searx/utils.py | 155 |
5 files changed, 121 insertions, 58 deletions
diff --git a/searx/engines/ask.py b/searx/engines/ask.py index aeaf6136a..ca1a2110d 100644 --- a/searx/engines/ask.py +++ b/searx/engines/ask.py @@ -50,7 +50,7 @@ def response(resp): pos = script.index(end_tag) + len(end_tag) - 1 script = script[:pos] - json_resp = utils.js_variable_to_python(script) + json_resp = utils.js_obj_str_to_python(script) results = [] diff --git a/searx/engines/brave.py b/searx/engines/brave.py index fe8fb616b..75775d1ec 100644 --- a/searx/engines/brave.py +++ b/searx/engines/brave.py @@ -134,7 +134,7 @@ from searx.utils import ( eval_xpath, eval_xpath_list, eval_xpath_getindex, - js_variable_to_python, + js_obj_str_to_python, get_embeded_stream_url, ) from searx.enginelib.traits import EngineTraits @@ -262,7 +262,7 @@ def response(resp: SXNG_Response) -> EngineResults: # data: [{type:"data",data: .... ["q","goggles_id"],route:1,url:1}}] # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ js_object = "[{" + extr(resp.text, "data: [{", "}}],") + "}}]" - json_data = js_variable_to_python(js_object) + json_data = js_obj_str_to_python(js_object) # json_data is a list and at the second position (0,1) in this list we find the "response" data we need .. 
json_resp = json_data[1]['data']['body']['response'] @@ -439,9 +439,9 @@ def fetch_traits(engine_traits: EngineTraits): resp = get('https://search.brave.com/settings') - if not resp.ok: # type: ignore + if not resp.ok: print("ERROR: response from Brave is not OK.") - dom = html.fromstring(resp.text) # type: ignore + dom = html.fromstring(resp.text) for option in dom.xpath('//section//option[@value="en-us"]/../option'): @@ -468,12 +468,12 @@ def fetch_traits(engine_traits: EngineTraits): resp = get('https://cdn.search.brave.com/serp/v2/_app/immutable/chunks/parameters.734c106a.js') - if not resp.ok: # type: ignore + if not resp.ok: print("ERROR: response from Brave is not OK.") - country_js = resp.text[resp.text.index("options:{all") + len('options:') :] # type: ignore + country_js = resp.text[resp.text.index("options:{all") + len('options:') :] country_js = country_js[: country_js.index("},k={default")] - country_tags = js_variable_to_python(country_js) + country_tags = js_obj_str_to_python(country_js) for k, v in country_tags.items(): if k == 'all': diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 208eaf46e..dcf6543d6 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -407,7 +407,7 @@ def fetch_traits(engine_traits: EngineTraits): """ # pylint: disable=too-many-branches, too-many-statements, disable=import-outside-toplevel - from searx.utils import js_variable_to_python + from searx.utils import js_obj_str_to_python # fetch regions @@ -455,7 +455,7 @@ def fetch_traits(engine_traits: EngineTraits): js_code = extr(resp.text, 'languages:', ',regions') # type: ignore - languages = js_variable_to_python(js_code) + languages: dict[str, str] = js_obj_str_to_python(js_code) for eng_lang, name in languages.items(): if eng_lang == 'wt_WT': diff --git a/searx/engines/naver.py b/searx/engines/naver.py index 5c331cd20..c715bc319 100644 --- a/searx/engines/naver.py +++ b/searx/engines/naver.py @@ -15,7 +15,7 @@ from 
searx.utils import ( extr, html_to_text, parse_duration_string, - js_variable_to_python, + js_obj_str_to_python, get_embeded_stream_url, ) @@ -125,7 +125,7 @@ def parse_images(data): match = extr(data, '<script>var imageSearchTabData=', '</script>') if match: - json = js_variable_to_python(match.strip()) + json = js_obj_str_to_python(match.strip()) items = json.get('content', {}).get('items', []) for item in items: diff --git a/searx/utils.py b/searx/utils.py index 3d7d6e715..4bbf8fb94 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -49,9 +49,14 @@ _BLOCKED_TAGS = ('script', 'style') _ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE) _ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE) -_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)') -_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)') -_JS_DECIMAL_RE = re.compile(r":\s*\.") +_JS_STRING_DELIMITERS = re.compile(r'(["\'`])') +_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])([\$_\w][\$_\w0-9]*)(:)') +_JS_VOID_OR_UNDEFINED_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)|undefined') +_JS_DECIMAL_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]*)\.([0-9_]*)") +_JS_DECIMAL2_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]+)") +_JS_EXTRA_COMA_RE = re.compile(r"\s*,\s*([\]\}])") +_JS_STRING_ESCAPE_RE = re.compile(r'\\(.)') +_JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu' _XPATH_CACHE: dict[str, XPath] = {} _LANG_TO_LC_CACHE: dict[str, dict[str, str]] = {} @@ -741,12 +746,53 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo return None -def js_variable_to_python(js_variable: str) -> t.Any: +def _j2p_process_escape(match: re.Match[str]) -> str: + # deal with ECMA escape characters + _escape = match.group(1) or match.group(2) + return ( + Rf'\{_escape}' + if _escape in _JSON_PASSTHROUGH_ESCAPES + else R'\u00' if _escape == 'x' else '' if _escape == '\n' else _escape + ) + + +def _j2p_decimal(match: re.Match[str]) -> str: + return ( + match.group(1) + + 
match.group(2) + + (match.group(3).replace("_", "") or "0") + + "." + + (match.group(4).replace("_", "") or "0") + ) + + +def _j2p_decimal2(match: re.Match[str]) -> str: + return match.group(1) + match.group(2) + match.group(3).replace("_", "") + + +def js_obj_str_to_python(js_obj_str: str) -> t.Any: """Convert a javascript variable into JSON and then load the value It does not deal with all cases, but it is good enough for now. chompjs has a better implementation. """ + s = js_obj_str_to_json_str(js_obj_str) + # load the JSON and return the result + if s == "": + raise ValueError("js_obj_str can't be an empty string") + try: + return json.loads(s) + except json.JSONDecodeError as e: + logger.debug("Internal error: js_obj_str_to_python creates invalid JSON:\n%s", s) + raise ValueError("js_obj_str_to_python creates invalid JSON") from e + + +def js_obj_str_to_json_str(js_obj_str: str) -> str: + if not isinstance(js_obj_str, str): + raise ValueError("js_obj_str must be of type str") + if js_obj_str == "": + raise ValueError("js_obj_str can't be an empty string") + # when in_string is not None, it contains the character that has opened the string # either simple quote or double quote in_string = None @@ -754,61 +800,78 @@ def js_variable_to_python(js_variable: str) -> t.Any: # r"""{ a:"f\"irst", c:'sec"ond'}""" # becomes # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}'] - parts = re.split(r'(["\'])', js_variable) - # previous part (to check the escape character antislash) - previous_p = "" + parts = _JS_STRING_DELIMITERS.split(js_obj_str) + # does the previous part ends with a backslash? 
+ blackslash_just_before = False for i, p in enumerate(parts): - # parse characters inside a ECMA string - if in_string: - # we are in a JS string: replace the colon by a temporary character - # so quote_keys_regex doesn't have to deal with colon inside the JS strings - parts[i] = parts[i].replace(':', chr(1)) - if in_string == "'": - # the JS string is delimited by simple quote. - # This is not supported by JSON. - # simple quote delimited string are converted to double quote delimited string - # here, inside a JS string, we escape the double quote - parts[i] = parts[i].replace('"', r'\"') - - # deal with delimiters and escape character - if not in_string and p in ('"', "'"): - # we are not in string - # but p is double or simple quote - # that's the start of a new string - # replace simple quote by double quote - # (JSON doesn't support simple quote) + if p == in_string and not blackslash_just_before: + # * the current part matches the character which has opened the string + # * there is no antislash just before + # --> the current part close the current string + in_string = None + # replace simple quote and ` by double quote + # since JSON supports only double quote for string parts[i] = '"' + + elif in_string: + # --> we are in a JS string + # replace the colon by a temporary character + # so _JS_QUOTE_KEYS_RE doesn't have to deal with colon inside the JS strings + p = p.replace(':', chr(1)) + # replace JS escape sequences by JSON escape sequences + p = _JS_STRING_ESCAPE_RE.sub(_j2p_process_escape, p) + # the JS string is delimited by simple quote. + # This is not supported by JSON. 
+ # simple quote delimited string are converted to double quote delimited string + # here, inside a JS string, we escape the double quote + if in_string == "'": + p = p.replace('"', r'\"') + parts[i] = p + # deal with the sequence blackslash then quote + # since js_obj_str splits on quote, we detect this case: + # * the previous part ends with a black slash + # * the current part is a single quote + # when detected the blackslash is removed on the previous part + if blackslash_just_before and p[:1] == "'": + parts[i - 1] = parts[i - 1][:-1] + + elif in_string is None and p in ('"', "'", "`"): + # we are not in string but p is string delimiter + # --> that's the start of a new string in_string = p - continue - if p == in_string: - # we are in a string and the current part MAY close the string - if len(previous_p) > 0 and previous_p[-1] == '\\': - # there is an antislash just before: the ECMA string continue - continue - # the current p close the string # replace simple quote by double quote + # since JSON supports only double quote for string parts[i] = '"' - in_string = None - if not in_string: - # replace void 0 by null + elif in_string is None: + # we are not in a string + # replace by null these values: + # * void 0 + # * void(0) + # * undefined # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void - # we are sure there is no string in p - parts[i] = _JS_VOID_RE.sub("null", p) - # update previous_p - previous_p = p + p = _JS_VOID_OR_UNDEFINED_RE.sub("null", p) + # make sure there is a leading zero in front of float + p = _JS_DECIMAL_RE.sub(_j2p_decimal, p) + p = _JS_DECIMAL2_RE.sub(_j2p_decimal2, p) + # remove extra coma in a list or an object + # for example [1,2,3,] becomes [1,2,3] + p = _JS_EXTRA_COMA_RE.sub(lambda match: match.group(1), p) + parts[i] = p + + # update for the next iteration + blackslash_just_before = len(p) > 0 and p[-1] == '\\' + # join the string s = ''.join(parts) - # add quote around the key + # add quote 
arround the key # { a: 12 } # becomes # { "a": 12 } s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s) - s = _JS_DECIMAL_RE.sub(":0.", s) - # replace the surogate character by colon - s = s.replace(chr(1), ':') - # load the JSON and return the result - return json.loads(s) + # replace the surogate character by colon and strip whitespaces + s = s.replace(chr(1), ':').strip() + return s def parse_duration_string(duration_str: str) -> timedelta | None: |