import re
import json
import jsonlines

from openai import OpenAI
|
|
def batch_eval(query_file, result1_file, result2_file, output_file_path):
    """Build an OpenAI Batch API job that has an LLM judge compare two sets of answers."""
    client = OpenAI()
|
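    # Pull every "- Question N: ..." line out of the query file.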
    with open(query_file, "r") as f:
        data = f.read()

    queries = re.findall(r"- Question \d+: (.+)", data)
|
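    # Load both result files and keep just the answer text from each record.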
    with open(result1_file, "r") as f:
        answers1 = json.load(f)
    answers1 = [i["result"] for i in answers1]

    with open(result2_file, "r") as f:
        answers2 = json.load(f)
    answers2 = [i["result"] for i in answers2]
|
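    # Build one judging request per question, pairing the two candidate answers.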
    requests = []
    for i, (query, answer1, answer2) in enumerate(zip(queries, answers1, answers2)):
        sys_prompt = """
        ---Role---
        You are an expert tasked with evaluating two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**.
        """
|
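        # The user prompt carries the question, both answers, and the required JSON output format.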
        prompt = f"""
        You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**.

        - **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question?
        - **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question?
        - **Empowerment**: How well does the answer help the reader understand and make informed judgments about the topic?

        For each criterion, choose the better answer (either Answer 1 or Answer 2) and explain why. Then, select an overall winner based on these three categories.

        Here is the question:
        {query}

        Here are the two answers:

        **Answer 1:**
        {answer1}

        **Answer 2:**
        {answer2}

        Evaluate both answers using the three criteria listed above and provide detailed explanations for each criterion.

        Output your evaluation in the following JSON format:

        {{
            "Comprehensiveness": {{
                "Winner": "[Answer 1 or Answer 2]",
                "Explanation": "[Provide explanation here]"
            }},
            "Diversity": {{
                "Winner": "[Answer 1 or Answer 2]",
                "Explanation": "[Provide explanation here]"
            }},
            "Empowerment": {{
                "Winner": "[Answer 1 or Answer 2]",
                "Explanation": "[Provide explanation here]"
            }},
            "Overall Winner": {{
                "Winner": "[Answer 1 or Answer 2]",
                "Explanation": "[Summarize why this answer is the overall winner based on the three criteria]"
            }}
        }}
        """
|
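        # One Batch API request: a unique custom_id plus a standard chat-completions payload.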
        request_data = {
            "custom_id": f"request-{i+1}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-mini",
                "messages": [
                    {"role": "system", "content": sys_prompt},
                    {"role": "user", "content": prompt},
                ],
            },
        }

        requests.append(request_data)
|
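    # The Batch API takes its input as a JSON Lines file, one request per line.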
    with jsonlines.open(output_file_path, mode="w") as writer:
        for request in requests:
            writer.write(request)

    print(f"Batch API requests written to {output_file_path}")
|
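    # Upload the request file, then start a batch job with a 24-hour completion window.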
    batch_input_file = client.files.create(
        file=open(output_file_path, "rb"), purpose="batch"
    )
    batch_input_file_id = batch_input_file.id

    batch = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={"description": "nightly eval job"},
    )

    print(f"Batch {batch.id} has been created.")
|
|
if __name__ == "__main__":
    # Example invocation; the file names below are placeholders, point them at your own files.
    batch_eval(
        "queries.txt",
        "result1.json",
        "result2.json",
        "batch_requests.jsonl",
    )
|
|