import json
import re

# Compiled once: matches any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_RUN = re.compile(r"\s+")


class DatasetCleaner:
    """Normalize the text fields of a dataset and export them as JSON.

    The wrapped ``dataset`` is indexed by split name (``dataset[split]``);
    iterating a split must yield examples that support ``.get(key, default)``
    (e.g. plain dicts). A HuggingFace dataset object or similar is expected.
    """

    def __init__(self, dataset):
        """Store the dataset to be cleaned.

        Args:
            dataset: HuggingFace dataset object or similar, indexable by
                split name.
        """
        self.dataset = dataset

    def clean_text(self, text):
        """Collapse whitespace runs to a single space and trim both ends.

        Args:
            text: The string to clean. ``None`` is treated as empty text,
                because an example may carry a key whose value is null, in
                which case ``example.get(key, "")`` still returns ``None``.

        Returns:
            The cleaned string (``""`` when ``text`` is ``None``).
        """
        if text is None:
            return ""
        return _WHITESPACE_RUN.sub(" ", text).strip()

    def export_to_json(self, split="train", output_file="cleaned_dataset.json"):
        """Clean one split of the dataset and write it to a JSON file.

        Each example is reduced to its ``"Context"`` and ``"Response"``
        fields, both passed through :meth:`clean_text`; a missing or null
        field becomes an empty string. The output is a JSON list written as
        UTF-8 with non-ASCII characters preserved, and a summary line is
        printed once the file has been written.

        Args:
            split: Which split to use ('train', 'test', 'validation', etc.).
            output_file: Path of the JSON file to write.
        """
        cleaned_data = [
            {
                "Context": self.clean_text(example.get("Context", "")),
                "Response": self.clean_text(example.get("Response", "")),
            }
            for example in self.dataset[split]
        ]

        with open(output_file, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(cleaned_data, f, indent=2, ensure_ascii=False)

        print(f"✅ Saved cleaned data to {output_file} ({len(cleaned_data)} entries)")