Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| import random | |
| import requests | |
| from datasets import load_dataset, Dataset, DatasetDict | |
# Hub repository that receives every config pushed by this script.
path = "pminervini/HaluEval"
API_URL = f"https://datasets-server.huggingface.co/splits?dataset={path}"

# Ask the datasets-server which configs are already listed for the repository.
# The result is informational only (the gate that used it is disabled below),
# so a network or decoding failure must not abort the upload.
try:
    response = requests.get(API_URL, timeout=30)
    res_json = response.json()
except (requests.RequestException, ValueError):
    # ValueError covers a non-JSON body on requests versions whose
    # JSONDecodeError does not derive from RequestException.
    res_json = {}

gold_splits = {"dialogue", "qa", "summarization", "general"}

# A payload without a "splits" key is treated as "nothing published yet".
available_splits = (
    {split["config"] for split in res_json["splits"]}
    if "splits" in res_json
    else set()
)

# Loaded DatasetDict per task name; each one exposes a single "data" split.
name_to_ds = dict()

# sorted() keeps the upload order stable across runs; plain set iteration
# order varies with string hash randomisation.
for name in sorted(gold_splits):
    ds = load_dataset("json", data_files={"data": f"data/{name}_data.json"})
    name_to_ds[name] = ds
    # NOTE: the `name not in available_splits` gate is intentionally disabled,
    # so every config is pushed on each run, whether or not it already exists.
    ds.push_to_hub(path, config_name=name)
def list_to_dict(lst: list) -> dict:
    """Transpose a list of row dicts into a dict of column lists.

    Args:
        lst: Rows, each a mapping from column name to value.

    Returns:
        A dict mapping every key seen in ``lst`` to the list of its values,
        in row order. Keys appear in first-seen order. An empty ``lst``
        yields an empty dict.

    Note:
        A value is appended only for rows that contain the key, so a key
        missing from some rows produces a column shorter than ``len(lst)``.
    """
    res = dict()
    for entry in lst:
        for k, v in entry.items():
            # setdefault creates the column on first sight of the key;
            # append avoids building a throwaway one-element list per value.
            res.setdefault(k, []).append(v)
    return res
# Per-task layout of a generated sample:
#   (fields copied unchanged from the source entry, output field name).
# The output field is filled from the source entry's "hallucinated_<field>"
# or "right_<field>" key, depending on the random draw.
_SAMPLE_SCHEMAS = {
    "qa": (("knowledge", "question"), "answer"),
    "dialogue": (("knowledge", "dialogue_history"), "response"),
    "summarization": (("document",), "summary"),
}

# Build one "<name>_samples" config per task, excluding "general".
# sorted() keeps the processing order stable across runs.
for name in sorted(gold_splits - {"general"}):
    if name not in _SAMPLE_SCHEMAS:
        # Explicit raise rather than assert: asserts are stripped under -O.
        raise ValueError(f"no sample schema defined for split {name!r}")
    copied_fields, target_field = _SAMPLE_SCHEMAS[name]

    # Reseed for every task so each one draws the same label sequence
    # regardless of the order in which tasks are processed.
    random.seed(42)
    ds = name_to_ds[name]
    new_entry_lst = []
    for entry in ds["data"]:
        # One draw per entry decides which of the paired texts is kept.
        is_hallucinated = random.random() > 0.5
        variant = "hallucinated" if is_hallucinated else "right"

        # Insertion order fixes the column order of the pushed dataset:
        # copied fields, then the chosen text, then the label.
        new_entry = {field: entry[field] for field in copied_fields}
        new_entry[target_field] = entry[f"{variant}_{target_field}"]
        new_entry["hallucination"] = "yes" if is_hallucinated else "no"
        new_entry_lst.append(new_entry)

    new_ds_map = list_to_dict(new_entry_lst)
    new_ds = Dataset.from_dict(new_ds_map)
    new_dsd = DatasetDict({"data": new_ds})
    new_dsd.push_to_hub(path, config_name=f"{name}_samples")