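"""Evaluate LLM-generated caption predictions against ground-truth answers.

Computes sentence-level BLEU-1/2/3/4 (NLTK) plus corpus-level ROUGE-L and CIDEr
(pycocoevalcap) for each (dataset, model) pair configured in test().
"""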
import itertools
import json

from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.rouge.rouge import Rouge

def load_json(filename):
    with open(filename, 'r') as file:
        return json.load(file)

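# Expected input layout (inferred from the parsing in extract_answers below; treat this
# as an assumption about the JSON files, not a documented schema):
#   llm_file: [{"id": ..., "answer": "<predicted caption>"}, ...]
#   gt_file:  [{"id": ..., "conversations": [{"from": "human", "value": ...},
#                                            {"from": "gpt", "value": "<reference caption>"}]}, ...]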
def extract_answers(llm_data, gt_data):
    """Map sample id -> predicted answer, and sample id -> first ground-truth ('gpt') reply."""
    llm_answers = {item['id']: item['answer'] for item in llm_data}
    gt_answers = {
        item['id']: next(conv['value'] for conv in item['conversations'] if conv['from'] == 'gpt')
        for item in gt_data
    }
    return llm_answers, gt_answers

def compute_bleu_scores(reference, hypothesis):
    """Return sentence-level BLEU-1..4 for a single tokenized (reference, hypothesis) pair."""
    smooth_fn = SmoothingFunction().method1
    weights = [
        (1, 0, 0, 0),              # BLEU-1
        (0.5, 0.5, 0, 0),          # BLEU-2
        (1 / 3, 1 / 3, 1 / 3, 0),  # BLEU-3
        (0.25, 0.25, 0.25, 0.25),  # BLEU-4
    ]
    return [sentence_bleu([reference], hypothesis, weights=w, smoothing_function=smooth_fn)
            for w in weights]

def compute_rouge_scores(references, hypotheses):
    """Return the corpus-level ROUGE-L score; compute_score returns (average, per-sample array)."""
    rouge = Rouge()
    average_score, _ = rouge.compute_score(references, hypotheses)
    return average_score

def compute_cider_scores(references, hypotheses):
    """Return the corpus-level CIDEr score; compute_score returns (average, per-sample array)."""
    cider = Cider()
    average_score, _ = cider.compute_score(references, hypotheses)
    return average_score

def main(llm_file, gt_file):
    llm_data = load_json(llm_file)
    gt_data = load_json(gt_file)

    llm_answers, gt_answers = extract_answers(llm_data, gt_data)

    bleu_scores = {i: [] for i in range(4)}
    references = {}
    hypotheses = {}

    for sample_id in llm_answers:
        if sample_id not in gt_answers:
            print(f"ID {sample_id} not found in ground truth data.")
            continue
        hypothesis = llm_answers[sample_id].split()
        reference = gt_answers[sample_id].split()
        bleu = compute_bleu_scores(reference, hypothesis)
        for i in range(4):
            bleu_scores[i].append(bleu[i])
        # pycocoevalcap expects {id: [caption, ...]} for both references and hypotheses.
        references[sample_id] = [gt_answers[sample_id]]
        hypotheses[sample_id] = [llm_answers[sample_id]]

    # ROUGE-L and CIDEr come back as corpus-level averages; only BLEU needs averaging here.
    avg_bleu_scores = [sum(scores) / len(scores) for scores in bleu_scores.values()]
    rouge_score = compute_rouge_scores(references, hypotheses)
    cider_score = compute_cider_scores(references, hypotheses)

    print(f"Average BLEU-1: {avg_bleu_scores[0]:.4f}")
    print(f"Average BLEU-2: {avg_bleu_scores[1]:.4f}")
    print(f"Average BLEU-3: {avg_bleu_scores[2]:.4f}")
    print(f"Average BLEU-4: {avg_bleu_scores[3]:.4f}")
    print(f"Average ROUGE-L: {rouge_score:.4f}")
    print(f"Average CIDEr: {cider_score:.4f}")


def test():
    llm_types = ['finetune', 'ori']
    # Ground-truth annotation files per dataset; replace with your own paths.
    data_gt = {
        'pwiseg': '/mnt1/wjl/InternLM-XComposer/output/GT/pwiseg_caption_instruct_0712_test.json',
        '4dor': '/mnt1/lyc/llava_finetune/data_json/4dor_caption_instruct_0711_test.json',
    }
    for dt, lt in itertools.product(data_gt, llm_types):
        print(f'[INFO] data {dt} llm {lt}')
        llm_file = f'/mnt1/lyc/llava_finetune/eval_output/results_{dt}_{lt}/preds_description.json'
        gt_file = data_gt[dt]
        main(llm_file, gt_file)
        print()
        print()

if __name__ == '__main__':
    test()