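"""Evaluate LLM-generated descriptions against ground-truth answers.

For each prediction/ground-truth JSON pair, computes average sentence-level
BLEU-1..4 (NLTK) plus corpus ROUGE-L and CIDEr (pycocoevalcap) over the
samples whose ids appear in both files.
"""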
import itertools
import json

from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.rouge.rouge import Rouge


def load_json(filename):
    with open(filename, 'r') as file:
        return json.load(file)


def extract_answers(llm_data, gt_data):
    llm_answers = {item['id']: item['answer'] for item in llm_data}
    gt_answers = {
        item['id']: [conv['value'] for conv in item['conversations'] if conv['from'] == 'gpt'][0]
        for item in gt_data
    }
    return llm_answers, gt_answers
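
# Record shapes assumed by extract_answers (inferred from the field accesses
# above, not from a documented schema):
#   predictions:  [{"id": ..., "answer": "..."}, ...]
#   ground truth: [{"id": ..., "conversations": [{"from": "gpt", "value": "..."}, ...]}, ...]
# Only the first 'gpt' turn of each conversation is used as the reference.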


def compute_bleu_scores(reference, hypothesis):
    # method1 smoothing keeps BLEU from collapsing to zero when a
    # higher-order n-gram has no overlap with the reference.
    smooth_fn = SmoothingFunction().method1
    # Cumulative weights for BLEU-1 through BLEU-4.
    weights = [
        (1, 0, 0, 0),
        (0.5, 0.5, 0, 0),
        (1 / 3, 1 / 3, 1 / 3, 0),
        (0.25, 0.25, 0.25, 0.25),
    ]
    return [
        sentence_bleu([reference], hypothesis, weights=w, smoothing_function=smooth_fn)
        for w in weights
    ]
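
# Note: these are sentence-level BLEU values; main() averages them over
# samples, which is not identical to corpus-level BLEU but is a common proxy.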


def compute_rouge_scores(references, hypotheses):
    # compute_score(gts, res) returns (corpus_average, per_item_scores).
    rouge = Rouge()
    average_score, _ = rouge.compute_score(references, hypotheses)
    return average_score


def compute_cider_scores(references, hypotheses):
    cider = Cider()
    average_score, _ = cider.compute_score(references, hypotheses)
    return average_score
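
# Both scorers above take dicts mapping sample id -> list of strings (a single
# hypothesis per id, one or more references) and discard the per-item scores,
# keeping only the corpus average.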


def main(llm_file, gt_file):
    llm_data = load_json(llm_file)
    gt_data = load_json(gt_file)

    llm_answers, gt_answers = extract_answers(llm_data, gt_data)

    bleu_scores = {i: [] for i in range(4)}
    references = {}
    hypotheses = {}

    for sample_id in llm_answers:
        if sample_id not in gt_answers:
            print(f"ID {sample_id} not found in ground truth data.")
            continue
        # Whitespace tokenisation for sentence-level BLEU.
        hypothesis = llm_answers[sample_id].split()
        reference = gt_answers[sample_id].split()
        bleu = compute_bleu_scores(reference, hypothesis)
        for i in range(4):
            bleu_scores[i].append(bleu[i])
        references[sample_id] = [gt_answers[sample_id]]
        hypotheses[sample_id] = [llm_answers[sample_id]]

    if not references:
        print("No ids shared between predictions and ground truth; nothing to score.")
        return

    rouge_scores = compute_rouge_scores(references, hypotheses)
    cider_scores = compute_cider_scores(references, hypotheses)

    avg_bleu_scores = [sum(scores) / len(scores) for scores in bleu_scores.values()]

    print(f"Average BLEU-1: {avg_bleu_scores[0]:.4f}")
    print(f"Average BLEU-2: {avg_bleu_scores[1]:.4f}")
    print(f"Average BLEU-3: {avg_bleu_scores[2]:.4f}")
    print(f"Average BLEU-4: {avg_bleu_scores[3]:.4f}")
    print(f"Average ROUGE-L: {rouge_scores:.4f}")
    print(f"Average CIDEr: {cider_scores:.4f}")


def test():
    llm_type = ['finetune', 'ori']
    data_gt = {
        'pwiseg': '/mnt1/wjl/InternLM-XComposer/output/GT/pwiseg_caption_instruct_0712_test.json',
        '4dor': '/mnt1/lyc/llava_finetune/data_json/4dor_caption_instruct_0711_test.json',
    }
    for dt, lt in itertools.product(data_gt, llm_type):
        print(f'[INFO] data {dt} llm {lt}')
        llm_file = f'/mnt1/lyc/llava_finetune/eval_output/results_{dt}_{lt}/preds_description.json'
        gt_file = data_gt[dt]
        main(llm_file, gt_file)
        print()
        print()


if __name__ == '__main__':
    test()