Spaces:
Sleeping
Sleeping
import json | |
import re | |
class DatasetCleaner:
    """Normalize whitespace in a dataset's text fields and export them as JSON.

    The wrapped dataset is only required to support ``dataset[split]``
    returning an iterable of mapping-like examples that provide ``.get``.
    """

    # Compiled once: matches any run of whitespace (spaces, tabs, newlines).
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(self, dataset):
        """Store the dataset to clean.

        Args:
            dataset: HuggingFace dataset object or similar; it is indexed by
                split name in :meth:`export_to_json`.
        """
        self.dataset = dataset

    def clean_text(self, text):
        """Collapse whitespace runs to single spaces and trim both ends.

        Args:
            text: The string to clean. ``None`` is treated as an empty
                string, because ``example.get(key, "")`` still yields
                ``None`` when the key is present with a null value.

        Returns:
            The cleaned string (``""`` for ``None`` or whitespace-only input).
        """
        if text is None:
            return ""
        return self._WHITESPACE_RE.sub(" ", text).strip()

    def export_to_json(self, split="train", output_file="cleaned_dataset.json"):
        """Clean the ``Context``/``Response`` fields of a split and save as JSON.

        Args:
            split: Which split to use ('train', 'test', 'validation', etc.).
            output_file: Name of the file to save; it is overwritten.

        The output is a JSON list of objects with the keys ``"Context"`` and
        ``"Response"``. Missing or null fields are written as empty strings.
        """
        cleaned_data = [
            {
                "Context": self.clean_text(example.get("Context", "")),
                "Response": self.clean_text(example.get("Response", "")),
            }
            for example in self.dataset[split]
        ]

        # ensure_ascii=False keeps non-ASCII text readable in the output file,
        # which is why the file is opened explicitly as UTF-8.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(cleaned_data, f, indent=2, ensure_ascii=False)

        print(f"β Saved cleaned data to {output_file} ({len(cleaned_data)} entries)")