Source code for util.formatting.json_parsing

import json
import re



[docs]
def parse_json_from_markdown(markdown_text: str) -> dict:
    """
    Parse JSON from markdown text.

    The first fenced code block (with or without a ``json`` language tag) is
    used when one is present; otherwise the entire text is treated as JSON.
    Before decoding, the text is repaired as follows:

    * control characters other than tab, line feed and carriage return are
      removed;
    * trailing commas before a closing ``}`` or ``]`` are removed;
    * newline characters inside JSON string literals (keys and values) are
      replaced with "<br>".

    Args:
        markdown_text (str): Markdown text.

    Returns:
        The value decoded by ``json.loads``. The annotation says ``dict``,
        but any other top-level JSON value is returned as decoded.

    Raises:
        json.JSONDecodeError: If JSON parsing fails. The error and the
            original markdown text are printed before re-raising.

    Notes:
        The content of every string literal is re-encoded with
        ``json.dumps``, so a backslash already present in the input comes
        back as a literal backslash in the parsed value (for example an
        escaped quote keeps its backslash).
        NOTE(review): confirm that this is the intended handling of escapes.
    """
    try:
        # Extract JSON block if present, otherwise use the entire text
        fence = re.search(r"```(?:json)?\s*(.*?)\s*```", markdown_text, re.DOTALL)
        json_text = fence.group(1) if fence else markdown_text
        json_text = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F]", "", json_text).strip()

        def repair_token(token):
            text = token.group(0)
            if not text.startswith('"'):
                # A structural trailing comma: drop it.
                return ""
            # A string literal; text includes the surrounding quotes.
            inner = text[1:-1].replace("\n", r"<br>")
            return json.dumps(inner)

        # String literals and trailing commas are handled in a single pass.
        # Because a whole string literal is consumed as one token, a comma
        # that sits inside a string (e.g. "x, ]") is never mistaken for a
        # trailing comma and the string content is left intact.
        json_text = re.sub(
            r'"[^"\\]*(?:\\.[^"\\]*)*"|,(?=\s*[}\]])',
            repair_token,
            json_text,
        )

        return json.loads(json_text)
    except json.JSONDecodeError as e:
        print(f"Error while parsing JSON: {e}")
        print(f"Markdown text: {markdown_text}")
        # Bare raise keeps the original traceback.
        raise



# Default whitelist of characters; has_invalid_characters falls back to it
# when the caller does not supply its own set of allowed characters.
DEFAULT_ALLOWED_CHARS = (
    # ASCII letters, digits, whitespace and "!"
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 \t\n\r!"
    # remaining ASCII punctuation
    "\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
    # accented Latin letters, lowercase
    "äöüßñáéíóúàâæçèêëîïôœùûãõ"
    # accented Latin letters, uppercase
    "ÄÖÜÑÁÉÍÓÚÀÂÆÇÈÊËÎÏÔŒÙÛÃÕ"
    # mathematical symbols
    "±×÷∞∑√πΔ∫≈≠≤≥"
    # Greek alphabet, lowercase
    "αβγδεζηθικλμνξοπρστυφχψω"
    # Greek alphabet, uppercase
    "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ"
)



[docs]
def has_invalid_characters(text: str, allowed_chars: str | None = None) -> bool:
    allowed = allowed_chars or DEFAULT_ALLOWED_CHARS
    return any(c not in allowed for c in text)