Spaces:
Running
Running
File size: 4,944 Bytes
5d57406 6de388e 5d57406 6de388e 5d57406 6de388e 5d57406 6de388e 5d57406 6de388e a77e097 6de388e a3012a1 6de388e a3012a1 6de388e a3012a1 6de388e a3012a1 6de388e 5d57406 a3012a1 6de388e a3012a1 6de388e 5aa60a6 6de388e 5d57406 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
import json
import os
import glob
import argparse
import csv
def chatgpt_json(merge_file):
    """Compute the mean rating of every dataset as a percentage.

    Args:
        merge_file: UTF-8 encoded bytes holding a mapping shaped like
            ``{dataset_name: {sample_id: {dimension: [result, ...]}}}``,
            where every ``result`` carries a numeric ``'rating'`` entry.

    Returns:
        dict: ``dataset_name`` -> mean rating times 100, rounded to two
        decimals. A dataset that contains no results scores ``0.0``.
    """
    text = merge_file.decode("utf-8")
    try:
        # SECURITY: this payload used to be passed to eval(), which executes
        # arbitrary code embedded in the input. json.loads parses it safely
        # and understands true/false/null in every position.
        merge_data = json.loads(text)
    except json.JSONDecodeError:
        # Legacy path for Python-literal style payloads (e.g. single-quoted
        # strings): keep the old token patching, but evaluate literals only.
        import ast

        for token in ("true", "false", "null"):
            text = text.replace(f': {token},', f': "{token}",')
        merge_data = ast.literal_eval(text)

    dataset_scores_dict = {}
    for dataset_name, dataset_results in merge_data.items():
        ratings = [
            result['rating']
            for sample_results in dataset_results.values()
            for results in sample_results.values()
            for result in results
        ]
        # Guard against an empty dataset instead of dividing by zero.
        if ratings:
            score = round(sum(ratings) / len(ratings) * 100, 2)
        else:
            score = 0.0
        dataset_scores_dict[dataset_name] = score
    return dataset_scores_dict
def compute_scores(merge_file):
    """Build a two-row score table from merged rating results.

    Args:
        merge_file: UTF-8 encoded bytes holding a mapping shaped like
            ``{dataset_name: {sample_id: {dimension: [result, ...]}}}``,
            where every ``result`` carries a numeric ``'rating'`` entry and
            ``dimension`` is one of ``action``, ``speed``, ``direction``,
            ``order`` or ``attribute_change``.

    Returns:
        list: ``[header_row, value_row]`` with 30 columns each: the overall
        average, one average per dimension, one average per dataset, and
        then one score for every (dimension, dataset) pair. Scores are mean
        ratings times 100, rounded to two decimals; a group without any
        results scores ``0.0``.

    Raises:
        KeyError: if a result uses an unknown dimension, or if one of the
            datasets ``multi-choice``, ``yes_no``, ``caption_matching`` or
            ``captioning`` is missing from the input.
    """
    text = merge_file.decode("utf-8")
    try:
        # SECURITY: this payload used to be passed to eval(), which executes
        # arbitrary code embedded in the input. json.loads parses it safely
        # and understands true/false/null in every position.
        merge_data = json.loads(text)
    except json.JSONDecodeError:
        # Legacy path for Python-literal style payloads (e.g. single-quoted
        # strings): keep the old token patching, but evaluate literals only.
        import ast

        for token in ("true", "false", "null"):
            text = text.replace(f': {token},', f': "{token}",')
        merge_data = ast.literal_eval(text)

    def percent(correct, total):
        # An empty group has no defined mean; report 0.0 instead of raising
        # ZeroDivisionError.
        return round(correct / total * 100, 2) if total else 0.0

    # Key -> column label, listed in the column order of the output table.
    dim_labels = {
        'action': "Action",
        'direction': "Direction",
        'speed': "Speed",
        'order': "Event Order",
        'attribute_change': "Attribute Change",
    }
    task_labels = {
        'multi-choice': "Multi-Choice",
        'yes_no': "Yes/No",
        'caption_matching': "Caption Matching",
        'captioning': "Caption Generation",
    }

    # 'avg' is the pseudo-dimension that aggregates a whole dataset.
    eval_dims = [*dim_labels, 'avg']
    dim_correct = dict.fromkeys(dim_labels, 0)
    dim_total = dict.fromkeys(dim_labels, 0)
    total_correct, total_num = 0, 0
    dataset_scores_dict = {}

    for dataset_name, dataset_results in merge_data.items():
        dataset_correct = dict.fromkeys(eval_dims, 0)
        dataset_num = dict.fromkeys(eval_dims, 0)
        for sample_results in dataset_results.values():
            for dim, results in sample_results.items():
                for result in results:
                    rating = result['rating']
                    dataset_correct['avg'] += rating
                    dataset_correct[dim] += rating
                    dim_correct[dim] += rating
                    dataset_num['avg'] += 1
                    dataset_num[dim] += 1
                    dim_total[dim] += 1
        total_correct += dataset_correct['avg']
        total_num += dataset_num['avg']
        for dim in eval_dims:
            dataset_scores_dict[f"{dim}_{dataset_name}"] = percent(
                dataset_correct[dim], dataset_num[dim]
            )

    for dim in dim_correct:
        dataset_scores_dict[f"avg_{dim}"] = percent(dim_correct[dim], dim_total[dim])
    dataset_scores_dict["avg_all"] = percent(total_correct, total_num)

    # Header labels and score keys are generated from the same two mappings
    # so that both rows always stay aligned column by column.
    header = ["Avg. All"]
    header += [f"Avg. {label}" for label in dim_labels.values()]
    header += [f"Avg. {label}" for label in task_labels.values()]
    header += [
        f"{dim_label}. {task_label}"
        for dim_label in dim_labels.values()
        for task_label in task_labels.values()
    ]

    score_keys = ["avg_all"]
    score_keys += [f"avg_{dim}" for dim in dim_labels]
    score_keys += [f"avg_{task}" for task in task_labels]
    score_keys += [f"{dim}_{task}" for dim in dim_labels for task in task_labels]

    data = [header, [dataset_scores_dict[key] for key in score_keys]]
    return data
|