"""
Desc: This file is used to get the training data from the LLM
"""
import argparse
import json
import os
import sys
from pathlib import Path

import openai
from openai import AzureOpenAI

# Get the absolute path of the current script and add the project root
# to the system path so that local modules resolve
current_file = Path(__file__).resolve()
project_root = current_file.parents[3]
sys.path.append(str(project_root))

from stark_qa import load_qa
from prompts import prompts
"""
MAG:
sys_content: 478/query
output: 45/query
input: 25/query
1000 queries
total price:
1. o1: $13.29
2. o3mini: $0.97
3. deepseek-chat: $0.24
4. deepseek-reasoner: $0.49
Amazon:
sys_content: 478/query
"""
# get the system prompt for different datasets
def get_sys_content(dataset_name):
    """
    input:
        dataset_name: the name of the dataset
    output:
        sys_content: the system prompt for the dataset
    """
    sys_content = prompts(dataset_name)
    return sys_content
# get the response from the llm
def get_response(sys_content, user_content):
    """
    input:
        sys_content: the system prompt
        user_content: the user query
    output:
        response: the text content of the llm's reply
    """
    messages = [
        {"role": "system", "content": sys_content},
        {"role": "user", "content": user_content},
    ]
    # `client` and `parameters` are globals set in the __main__ block below;
    # both backends read the model name from parameters['azure']['model']
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=parameters['azure']['model'],
        # temperature=0,
        seed=576879897,
    )
    response = chat_completion.choices[0].message.content
    return response
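# Example usage (illustrative; assumes `client` and `parameters` have been
# initialized as in the __main__ block below):
#   rg_text = get_response(get_sys_content('mag'),
#                          'Which papers on graph neural networks cite X?')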
# save the outputs to a json file
def save_json(data, dataset_name):
    """
    input:
        data: the data to be saved
        dataset_name: the name of the dataset
    """
    # build the output path under the project root rather than a
    # user-specific absolute path
    file_dir = project_root / "data" / "finetune" / dataset_name
    os.makedirs(file_dir, exist_ok=True)
    file_path = file_dir / f"1000_{parameters['azure']['model']}.json"
    with open(file_path, 'w') as f:
        json.dump(data, f, indent=4)
    print(f"Saved to {file_path}")
# get the reasoning graphs for a dataset
def get_rg(dataset_name):
    """
    input:
        dataset_name: the name of the dataset
    output:
        saves 1000 query/reasoning-graph pairs to a json file
    """
    # get the system prompt for the dataset
    sys_content = get_sys_content(dataset_name)
    # get the qa dataset
    qa = load_qa(dataset_name)
    train_qa = qa.get_subset('train')
    # sample 1000 queries from the training set; iterate over up to 1500 so
    # that parse failures can be skipped and still leave 1000 usable pairs
    pair_list = []
    failure_count = 0
    for i in range(1500):
        query, q_id, ans_ids, _ = train_qa[i]
        # call the llm to get the reasoning graph
        response = get_response(sys_content, query)
        print(response)
        # process the response
        if dataset_name == 'prime':
            output = {
                "Triplets": [],
                "Restriction": [],
                "Target": ""
            }
            try:
                lines = response.split('\n')
                triplets_raw = lines[0].replace('Triplets:', '').strip()
                output['Triplets'] = json.loads(triplets_raw)
                restriction_raw = lines[1].replace('Restriction:', '').strip()
                output['Restriction'] = json.loads(restriction_raw)
                output['Target'] = lines[2].replace('Target:', '').strip()
            except (json.JSONDecodeError, IndexError):
                # the reply did not follow the expected format; skip it
                failure_count += 1
                continue
        elif dataset_name in ('mag', 'amazon'):
            output = {
                "Metapath": "",
                "Restriction": [],
            }
            try:
                lines = response.split('\n')
                output['Metapath'] = lines[0].replace('Metapath:', '').strip()
                restriction_raw = lines[1].replace('Restriction:', '').strip()
                output['Restriction'] = json.loads(restriction_raw)
            except (json.JSONDecodeError, IndexError):
                failure_count += 1
                continue
        else:
            raise ValueError('The dataset is not supported')
        pair = {'query': query, 'answer': output}
        pair_list.append(pair)
        if len(pair_list) == 1000:
            break
    # save the output to a json file
    save_json(pair_list, dataset_name)
    print(f"Failure count: {failure_count}")
if __name__ == '__main__':
    # Argument parser setup
    parser = argparse.ArgumentParser(description="Load LLM parameters and initialize API clients.")
    # Dataset name
    parser.add_argument("--dataset_name", type=str, required=True,
                        choices=["mag", "amazon", "prime"],
                        help="Specify the dataset to use.")
    # Model selection
    parser.add_argument("--model", type=str, required=True,
                        choices=["gpt-4o-mini-20240718", "gpt-4o-2024-05-13",
                                 "deepseek-reasoner", "gpt-o1-2024-12-17",
                                 "o3-mini-2025-01-31"],
                        help="Specify the model to use.")
    # Azure API parameters
    parser.add_argument("--azure_api_key", type=str, default=None, help="Azure API Key")
    parser.add_argument("--azure_endpoint", type=str, default=None, help="Azure API Endpoint")
    parser.add_argument("--azure_api_version", type=str, default=None, help="Azure API Version")
    # OpenAI API parameters
    parser.add_argument("--openai_api_key", type=str, default=None, help="OpenAI API Key")
    parser.add_argument("--openai_endpoint", type=str, default=None, help="OpenAI API Endpoint")
    args = parser.parse_args()
    # Initialize the parameters dictionary; get_response and save_json read
    # the model name from parameters['azure']['model'], so it is stored
    # under both backends regardless of which client is used
    parameters = {
        "azure": {
            "api_key": args.azure_api_key,
            "azure_endpoint": args.azure_endpoint,
            "api_version": args.azure_api_version,
            "model": args.model,
        },
        "openai": {
            "api_key": args.openai_api_key,
            "endpoint": args.openai_endpoint,
            "model": args.model,
        }
    }
    # Use the OpenAI client if an OpenAI key is given, otherwise fall back to Azure
    if parameters["openai"]["api_key"]:
        client = openai.OpenAI(
            base_url=parameters["openai"]["endpoint"],
            api_key=parameters["openai"]["api_key"],
        )
    else:
        client = AzureOpenAI(
            azure_endpoint=parameters["azure"]["azure_endpoint"],
            api_key=parameters["azure"]["api_key"],
            api_version=parameters["azure"]["api_version"],
        )
    get_rg(args.dataset_name)
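# Example invocation (key/endpoint values are placeholders, and the script
# file name is whatever this file is saved as):
#   python get_training_data.py --dataset_name mag --model o3-mini-2025-01-31 \
#       --azure_api_key <KEY> --azure_endpoint https://<resource>.openai.azure.com \
#       --azure_api_version <VERSION>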