Question Answering
Transformers
Safetensors
File size: 6,865 Bytes
7bf4b88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
"""
Desc: This file is used to get the training data from the LLM

"""
import sys
from pathlib import Path

# Get the absolute path of the current script
current_file = Path(__file__).resolve()
project_root = current_file.parents[3]

# Add the project root to the system path
sys.path.append(str(project_root))

from stark_qa import load_qa

import argparse
import os
from openai import AzureOpenAI
import json
import openai
from prompts import prompts






"""

MAG:
sys_content: 478/query
output: 45/query
input: 25/query
1000 queries

total price:
    1. o1: $13.29
    2. o3mini: $0.97
    3. deepseek-chat: $0.24
    4. deepseek-reasoner: $0.49
    
Amazon:
sys_content: 478/query

"""

# get the prompt for different datasets
def get_sys_content(dataset_name):
    """Return the system prompt for the given dataset.

    Args:
        dataset_name: the name of the dataset.

    Returns:
        The system-prompt string produced by the ``prompts`` helper.
    """
    return prompts(dataset_name)

# get the response from the llm
def get_response(sys_content, user_content):
    """Send a system + user message pair to the chat endpoint and return the reply.

    Args:
        sys_content: system prompt for the conversation.
        user_content: the user query to send.

    Returns:
        The content string of the first completion choice.

    NOTE: relies on the module-level ``client`` and ``parameters`` globals
    that are initialized in the ``__main__`` block.
    """
    conversation = [
        {"role": "system", "content": sys_content},
        {"role": "user", "content": user_content},
    ]

    completion = client.chat.completions.create(
        messages=conversation,
        model=parameters['azure']['model'],  # parameters['azure']['model'], parameters['openai']['model']
        # temperature=0,
        seed=576879897,  # fixed seed for reproducible sampling
    )

    return completion.choices[0].message.content

# save the outputs to json file
def save_json(data, dataset_name,
              base_dir="/home/yongjia/dgl/Yongjia/MOE/Reasoner/data/finetune"):
    """Dump *data* as pretty-printed JSON under the finetune data directory.

    Args:
        data: JSON-serializable object to save.
        dataset_name: the name of the dataset; used as a sub-directory.
        base_dir: root output directory. Defaults to the original hard-coded
            location so existing callers are unaffected, but can now be
            overridden instead of editing the source.

    NOTE: reads the module-level ``parameters`` global (set in ``__main__``)
    to embed the model name in the output file name.
    """
    file_dir = os.path.join(base_dir, dataset_name)
    os.makedirs(file_dir, exist_ok=True)
    # file name records the sample size (1000) and the producing model
    file_path = os.path.join(file_dir, f"1000_{parameters['azure']['model']}.json")

    with open(file_path, 'w') as f:
        json.dump(data, f, indent=4)
    print(f"Saved to {file_path}")

# get the reasoning graphs for a dataset
def get_rg(dataset_name):
    """
        input: 
            dataset_name: the name of the dataset
        output:
            rg: the reasoning graph for the dataset
    """
    
    # get the prompt for the dataset
    sys_content = get_sys_content(dataset_name)
    
    # get qa dataset 
    qa = load_qa(dataset_name)
    train_qa = qa.get_subset('train')
    
    # we sample 1000 queries from the training set
    pair_list = []
    failure_count = 0
    for i in range(1500):
        query, q_id, ans_ids, _ = train_qa[i]
    
        # call the llm to get the reasoning graph
        response = get_response(sys_content, query)
        print(response)
        
        # process the response
    
        if dataset_name == 'prime':
            output = {
                "Triplets":[],
                "Restriction": [],
                "Target": ""
            }
            
            try:
                response = response.split('\n')
                triplets_raw = response[0].replace('Triplets:', '').strip()
                triplets = json.loads(triplets_raw)
                output['Triplets'] = triplets
                
                restriction_raw = response[1].replace('Restriction:', '').strip()
                restriction = json.loads(restriction_raw)
                output['Restriction'] = restriction
                
                target = response[2].replace('Target:', '').strip()
                output['Target'] = target
            except:
                failure_count += 1
                continue
            
        elif dataset_name == 'mag' or dataset_name == 'amazon':
            output = {
                "Metapath": "",
                "Restriction": [],
            }
            
            try:
                response = response.split('\n')
                metapath = response[0].replace('Metapath:', '').strip()
                output['Metapath'] = metapath
                
                restriction_raw = response[1].replace('Restriction:', '').strip()
                restriction = json.loads(restriction_raw)
                output['Restriction'] = restriction
            except:
                failure_count += 1
                continue
            
        else:
            raise ValueError('The dataset is not supported')
        
        pair = {'query': query, 'answer': output}

        pair_list.append(pair)
        
        if len(pair_list) == 1000:
            break
            
    # save the output to json file
    save_json(pair_list, dataset_name)
    print(f"Failure count: {failure_count}")
    

if __name__ == '__main__':    
    # Argument parser setup
    parser = argparse.ArgumentParser(description="Load LLM parameters and initialize API clients.")
    
    # Dataset name
    parser.add_argument("--dataset_name", type=str, required=True, 
                        choices=["mag", "amazon", "prime"], 
                        help="Specify the dataset to use.")

    # Model selection
    parser.add_argument("--model", type=str, required=True, 
                        choices=["gpt-4o-mini-20240718", "gpt-4o-2024-05-13", 
                                "deepseek-reasoner", "gpt-o1-2024-12-17", 
                                "o3-mini-2025-01-31"],
                        help="Specify the model to use.")

    # Azure API parameters
    parser.add_argument("--azure_api_key", type=str, default=None, help="Azure API Key")
    parser.add_argument("--azure_endpoint", type=str, default=None, help="Azure API Endpoint")
    parser.add_argument("--azure_api_version", type=str, default=None, help="Azure API Version")

    # OpenAI API parameters
    parser.add_argument("--openai_api_key", type=str, default=None, help="OpenAI API Key")
    parser.add_argument("--openai_endpoint", type=str, default=None, help="OpenAI API Endpoint")

    args = parser.parse_args()

    # Initialize parameters dictionary.
    # NOTE(review): `parameters` is read as a module-level global by
    # get_response() and save_json(), so its name and structure must not change.
    # NOTE(review): args.model is never stored here, so
    # parameters['azure']['model'] is undefined downstream — looks like a bug;
    # confirm whether "model": args.model should be added to the "azure" dict.
    parameters = {
        "azure": {
            "api_key": args.azure_api_key,
            "azure_endpoint": args.azure_endpoint,
            "api_version": args.azure_api_version,
        },
        "openai": {
            "api_key": args.openai_api_key,
            "endpoint": args.openai_endpoint,
        }
    }


    # Determine which API client to use: prefer OpenAI-compatible endpoint if
    # an OpenAI key was supplied, otherwise fall back to Azure OpenAI.
    # `client` is also a module-level global consumed by get_response().
    if parameters["openai"]["api_key"]:
        client = openai.OpenAI(
            base_url=parameters["openai"]["endpoint"],
            api_key=parameters["openai"]["api_key"],
        )
    else:
        client = AzureOpenAI(
            azure_endpoint=parameters["azure"]["azure_endpoint"],
            api_key=parameters["azure"]["api_key"],
            api_version=parameters["azure"]["api_version"],
        )
    
    # Run the full pipeline: query the LLM, parse responses, save JSON.
    get_rg(args.dataset_name)