"""
Post-process raw LLM responses into a JSONL dataset for Llama fine-tuning.

Input:  JSON records of query/answer pairs collected from the LLM.
Output: JSONL files in the chat "conversations" format used for Llama fine-tuning.
"""

import copy
import json
import os


def save_data(data, dataset_name, model_name):
    """Write the processed records to a JSONL file, one record per line."""
    file_dir = f"../data/finetune/{dataset_name}"
    os.makedirs(file_dir, exist_ok=True)
    file_path = os.path.join(file_dir, f"llama_ft_{model_name}.jsonl")

    with open(file_path, "w") as f:
        for d in data:
            f.write(json.dumps(d) + "\n")

    print(f"Data saved to {file_path}")


def process(sample, dataset_name, model_name):
    """Convert a list of LLM query/answer records into chat-format samples
    and save them for Llama fine-tuning."""
    output_format = {
        "conversations": [
            {"role": "user", "content": ""},
            {"role": "assistant", "content": ""},
        ]
    }
    ft_list = []
    for record in sample:
        output = copy.deepcopy(output_format)
        output["conversations"][0]["content"] = record["query"]
        # Answers are not guaranteed to be strings (e.g. numeric answers),
        # so coerce them before writing.
        output["conversations"][1]["content"] = str(record["answer"])
        ft_list.append(output)

    save_data(ft_list, dataset_name, model_name)


if __name__ == "__main__":
    dataset_name_list = ["mag"]
    model_names = [
        "gpt-4o-mini-20240718",
        "o3-mini-2025-01-31",
        "gpt-o1-2024-12-17",
        "gpt-4o-2024-05-13",
        "gpt-4o-mini-20240718",
        "gpt35-1106",
    ]

    for dataset_name in dataset_name_list:
        for model_name in model_names:
            # Resolve the input file relative to this script's parent directory.
            current_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            file_path = os.path.join(current_dir, f"finetune/{dataset_name}/1000.json")
            with open(file_path, "r") as f:
                sample = json.load(f)

            process(sample, dataset_name, model_name)
            print(f"Processing {model_name} for {dataset_name} is done.")
            # NOTE: this break means only the first entry in model_names is
            # processed; the input file does not depend on model_name.
            break
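
# Usage sketch (the script filename is hypothetical): run from this file's own
# directory, e.g.
#
#   python postprocess_finetune.py
#
# Inputs are resolved relative to this file's parent directory
# (<parent>/finetune/<dataset>/1000.json), while save_data() writes relative to
# the current working directory ("../data/finetune/<dataset>/"), so the two
# paths only line up when the script is run from its own directory.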