import json
import re
[docs]
def parse_json_from_markdown(markdown_text: str) -> dict:
    """
    Parse JSON from markdown text.

    The first fenced code block (with or without a ``json`` tag) is used when
    one is present; otherwise the entire text is parsed. Before decoding, the
    text is repaired so that common near-JSON output becomes valid JSON:

    * control characters other than tab, line feed and carriage return are
      dropped;
    * trailing commas before a closing ``}`` or ``]`` are removed, but only
      outside of string literals, so string contents are never altered;
    * raw newline characters inside JSON strings are replaced with "<br>",
      and raw tabs / carriage returns are written as ``\\t`` / ``\\r`` escapes;
    * valid JSON escape sequences inside strings are left untouched, while a
      backslash that does not start a valid escape is kept as a literal
      backslash.

    Args:
        markdown_text (str): Markdown text.

    Returns:
        The decoded JSON value, i.e. whatever ``json.loads`` produces for the
        repaired text.

    Raises:
        json.JSONDecodeError: If JSON parsing fails.
    """
    # Extract JSON block if present, otherwise use the entire text
    fence = re.search(r"```(?:json)?\s*(.*?)\s*```", markdown_text, re.DOTALL)
    json_text = fence.group(1) if fence else markdown_text
    # Tab, LF and CR survive this step on purpose: outside strings they are
    # legal whitespace, inside strings they are repaired below.
    json_text = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F]", "", json_text).strip()

    # Replacement text (JSON source, not decoded value) for characters that
    # may not appear raw inside a JSON string, plus the stray backslash.
    raw_char_fixes = {"\n": "<br>", "\r": "\\r", "\t": "\\t", "\\": "\\\\"}

    def repair_string_piece(piece):
        # Valid escape sequences are not in the table and pass through as-is.
        found = piece.group(0)
        return raw_char_fixes.get(found, found)

    def repair_token(token):
        closer = token.group(1)
        if closer is not None:
            # Trailing comma: keep only the closing brace/bracket.
            return closer
        # String literal: the first alternative consumes valid escapes whole
        # so that only stray backslashes and raw \n, \r, \t reach the table.
        return re.sub(
            r'\\(?:u[0-9a-fA-F]{4}|["\\/bfnrt])|[\\\n\r\t]',
            repair_string_piece,
            token.group(0),
        )

    # One pass over the text: string literals (keys and values) are matched
    # first at each position, so a comma inside a string is never mistaken
    # for a trailing comma.
    json_text = re.sub(
        r'"[^"\\]*(?:\\.[^"\\]*)*"|,\s*([}\]])',
        repair_token,
        json_text,
        flags=re.DOTALL,
    )

    try:
        return json.loads(json_text)
    except json.JSONDecodeError as e:
        print(f"Error while parsing JSON: {e}")
        print(f"Markdown text: {markdown_text}")
        raise
# Characters accepted when no explicit allow-list is supplied: ASCII letters,
# digits, whitespace and punctuation, then accented Latin letters, a few math
# symbols, and the lower- and upper-case Greek alphabet.
DEFAULT_ALLOWED_CHARS = "".join(
    (
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 \t\n\r!",
        "\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
        "äöüßñáéíóúàâæçèêëîïôœùûãõ",
        "ÄÖÜÑÁÉÍÓÚÀÂÆÇÈÊËÎÏÔŒÙÛÃÕ",
        "±×÷∞∑√πΔ∫≈≠≤≥",
        "αβγδεζηθικλμνξοπρστυφχψω",
        "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ",
    )
)
[docs]
def has_invalid_characters(text: str, allowed_chars: str | None = None) -> bool:
allowed = allowed_chars or DEFAULT_ALLOWED_CHARS
return any(c not in allowed for c in text)