File size: 4,092 Bytes
3fb43f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import json
import os
import glob
import argparse
import csv


def chatgpt_json(merge_file, correct_answer_file='file/ANSWER.json'):
    """Compute the per-dataset accuracy (in percent) of the predicted choices.

    Args:
        merge_file: UTF-8 encoded bytes (an already decoded ``str`` is also
            accepted) holding a mapping of the form
            ``{dataset_name: {question_id: {"output_chatgpt_choice": ...}}}``,
            written either as JSON or as a Python literal.
        correct_answer_file: Path of the JSON ground-truth file, shaped as
            ``{dataset_name: {question_id: {"answer": ...}}}``.

    Returns:
        A dict mapping each dataset name found in ``merge_file`` to its
        accuracy percentage rounded to two decimals. A dataset without any
        entries scores ``0.0``.

    Raises:
        ValueError, SyntaxError: The payload is neither valid JSON nor a
            Python literal.
        KeyError: A dataset or question id is missing from the ground truth,
            or an entry lacks ``"output_chatgpt_choice"``.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    if isinstance(merge_file, (bytes, bytearray)):
        merge_text = merge_file.decode("utf-8")
    else:
        merge_text = merge_file

    # SECURITY: the payload comes from outside the program, so it must never
    # be passed to eval(). Try strict JSON first, then fall back to
    # ast.literal_eval, which only accepts literals and cannot run code.
    try:
        merge_data = json.loads(merge_text)
    except ValueError:
        merge_data = ast.literal_eval(merge_text)

    with open(correct_answer_file, 'r', encoding='utf-8') as f:
        correct_answer_data = json.load(f)

    dataset_scores_dict = {}
    for dataset_name, item in merge_data.items():
        total_nums = len(item)
        if not total_nums:
            # Nothing was answered for this dataset; avoid dividing by zero.
            dataset_scores_dict[dataset_name] = 0.0
            continue

        answers = correct_answer_data[dataset_name]
        correct = 0
        for question_id, sub_item in item.items():
            if sub_item['output_chatgpt_choice'] == answers[question_id]['answer']:
                correct += 1

        dataset_scores_dict[dataset_name] = round(correct / total_nums * 100, 2)
    return dataset_scores_dict


def compute_scores(merge_file):
    """Build the two-row score table for one submission.

    The per-dataset accuracies returned by ``chatgpt_json(merge_file)`` are
    combined into three weighted category averages, and the overall score is
    the plain mean of those three averages.

    Args:
        merge_file: Payload forwarded unchanged to ``chatgpt_json``.

    Returns:
        A list of two lists: the column headers, and the matching values
        (overall average, the three category averages, then every
        per-dataset score in header order).

    Raises:
        KeyError: A weighted dataset is absent from the per-dataset scores.
    """
    per_dataset = chatgpt_json(merge_file)

    # (result key, {dataset name: weight}) for each category, in report order.
    categories = (
        # Video-exclusive Understanding
        ('Exclusive_understanding', {
            "ActivityNet": 1,
            "MSVD": 1,
            "MSRVTT": 1,
            "TGIF": 1,
            "Youcook2": 1,
            "Ucfcrime": 1,
            "MOT": 0.5,
        }),
        # Prior Knowledge-based Question-answer
        ('Prior_Knowledge', {
            "TVQA": 1,
            "MV": 1,
            "NBA": 1,
        }),
        # Comprehension and Decision-making
        ('Comprehension_and_Decision-making', {
            "Driving-exam": 0.5,
            "Driving-decision-making": 1,
            "SQA3D": 1,
        }),
    )

    category_scores = []
    for _, weights in categories:
        denominator = sum(weights.values())
        weighted_mean = 0
        # Explicit accumulation keeps the floating-point summation order fixed.
        for name, weight in weights.items():
            weighted_mean += weight * per_dataset[name] / denominator
        category_scores.append(weighted_mean)

    for (result_key, _), value in zip(categories, category_scores):
        per_dataset[result_key] = value

    # The final score is the unweighted mean of the three category averages.
    final_score = sum(category_scores) / 3
    per_dataset['final_score'] = final_score

    header = [
        "Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
        "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime",
        "MOT", "TVQA", "MV", "NBA", "Driving-exam", "Driving-decision-making", "SQA3D",
    ]
    # The first four columns are aggregates; the rest are dataset names that
    # double as keys into the per-dataset score dict.
    values = [final_score, *category_scores]
    values.extend(per_dataset[column] for column in header[4:])

    return [header, values]