Source code for util.merge_datasets

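# Merge several JSON dataset files into one: assign a fresh UUID and a version to every item, shuffle the items, and write them to a single output file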

import json
import random
import uuid
from dataclasses import dataclass
from pathlib import Path


@dataclass
class FileInformation:
    """Describe one input JSON file together with the version of its items."""

    # Location of the JSON file to read.
    file_path: Path
    # Value stored under the "version" key of every item read from the file.
    version: int
def merge_json_and_assign_uuid(files: list[FileInformation], output: Path):
    """Merge JSON files into one shuffled JSON file with fresh ids.

    Every item of every input file gets a newly generated UUID4 string under
    the ``"id"`` key (replacing any id it already had) and the ``version`` of
    the file it came from under the ``"version"`` key. The combined list is
    shuffled with ``random.shuffle``, its length is printed, and the list is
    written to *output* as JSON indented by 4 spaces.

    Args:
        files: The input files and the version to stamp on their items. Each
            file is expected to contain a JSON array of objects.
        output: Destination path. It must not exist yet.

    Raises:
        FileExistsError: If *output* already exists. Nothing is read or
            written in that case.
    """
    # Fail fast: refuse an existing output before loading any input, so no
    # work is wasted on a run that cannot be saved.
    if output.exists():
        raise FileExistsError(f"Output file {output} already exists")

    merged = []
    for file_information in files:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_information.file_path, encoding="utf-8") as file:
            data = json.load(file)
        for item in data:
            item["id"] = str(uuid.uuid4())
            item["version"] = file_information.version
            merged.append(item)

    random.shuffle(merged)
    print(f"Total number of items: {len(merged)}")

    # "x" creates the file exclusively: if it appeared after the check above,
    # open() raises FileExistsError instead of silently overwriting it.
    with open(output, "x", encoding="utf-8") as out:
        json.dump(merged, out, indent=4)
if __name__ == "__main__":
    # Each source file is listed exactly once: a repeated path would be merged
    # a second time, with every one of its items receiving a different UUID.
    files = [
        FileInformation(Path("../../data/dev/ticket-dataset-4_3_0-prod.json"), 4),
        FileInformation(Path("../../data/dev/ticket-dataset-4_31_0-prod.json"), 4),
        FileInformation(Path("../../data/dev/ticket-dataset-4_35_0-prod.json"), 4),
        FileInformation(Path("../../data/dev/ticket-dataset-4_37_5.json"), 4),
        FileInformation(Path("../../data/dev/ticket-dataset-4_37_7.json"), 4),
        FileInformation(Path("../../data/dev/ticket-dataset-4_37_9.json"), 4),
        FileInformation(Path("../../data/dev/ticket-dataset-4_37_10.json"), 4),
    ]
    # Paths are relative to the current working directory of the run.
    output = Path("../../data/pre-release/dataset-tickets-multi-lang3.json")
    merge_json_and_assign_uuid(files, output)