Update: integrate llama3 into finetuning code
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- accelerator_config/gpu_1_config.yml +15 -0
- accelerator_config/gpu_4_config.yml +16 -0
- dataset/SurgDataset.py +2 -0
- eval_output/4dor_count_eval.json +0 -0
- eval_output/4dor_count_eval_llama3_llava_finetune.json +0 -0
- eval_output/4dor_count_eval_llama3_llava_ori.json +0 -0
- eval_output/4dor_phase_eval.json +0 -0
- eval_output/pwiseg_count_eval_llama3_llava.json +0 -0
- eval_output/pwiseg_count_eval_llama3_llava_finetune.json +0 -0
- eval_output/results_4dor_finetune/preds.json +0 -0
- eval_output/results_4dor_finetune/preds_count.json +0 -0
- eval_output/results_4dor_finetune/preds_description.json +0 -0
- eval_output/results_4dor_ori/preds.json +0 -0
- eval_output/results_4dor_ori/preds_count.json +0 -0
- eval_output/results_4dor_ori/preds_description.json +0 -0
- eval_output/results_4dor_self_finetune_llava/preds.json +0 -0
- eval_output/results_4dor_self_finetune_llava/preds_classification.json +0 -0
- eval_output/results_4dor_self_finetune_llava/preds_count.json +0 -0
- eval_output/results_4dor_self_finetune_llava/preds_description.json +0 -0
- eval_output/results_4dor_self_finetune_llava/preds_phase.json +0 -0
- eval_output/results_pwiseg_finetune/preds.json +0 -0
- eval_output/results_pwiseg_finetune/preds_count.json +0 -0
- eval_output/results_pwiseg_finetune/preds_description.json +0 -0
- eval_output/results_pwiseg_ori/preds.json +0 -0
- eval_output/results_pwiseg_ori/preds_count.json +0 -0
- eval_output/results_pwiseg_ori/preds_description.json +0 -0
- eval_scripts/caption_eval.py +92 -0
- eval_scripts/count_eval.py +237 -0
- eval_scripts/phase_eval.py +179 -0
- run_finetune_llava.py +64 -49
- scripts/convert_gqa_for_eval.py +18 -0
- scripts/convert_mmbench_for_submission.py +27 -0
- scripts/convert_mmvet_for_eval.py +18 -0
- scripts/convert_seed_for_submission.py +74 -0
- scripts/convert_sqa_to_llava.py +88 -0
- scripts/convert_sqa_to_llava_base_prompt.py +334 -0
- scripts/convert_vizwiz_for_submission.py +47 -0
- scripts/convert_vqav2_for_submission.py +56 -0
- scripts/extract_mm_projector.py +47 -0
- scripts/finetune.sh +48 -0
- scripts/finetune/test_llava.sh +7 -2
- scripts/finetune_full_schedule.sh +48 -0
- scripts/finetune_lora.sh +49 -0
- scripts/finetune_lora_my.sh +49 -0
- scripts/finetune_qlora.sh +50 -0
- scripts/finetune_sqa.sh +36 -0
- scripts/merge_lora_weights.py +22 -0
- scripts/pretrain.sh +46 -0
- scripts/pretrain_xformers.sh +44 -0
- scripts/sqa_eval_batch.sh +13 -0
accelerator_config/gpu_1_config.yml
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
compute_environment: LOCAL_MACHINE
|
2 |
+
distributed_type: 'NO'
|
3 |
+
downcast_bf16: 'no'
|
4 |
+
gpu_ids: '0'
|
5 |
+
machine_rank: 0
|
6 |
+
main_training_function: main
|
7 |
+
mixed_precision: 'no'
|
8 |
+
num_machines: 1
|
9 |
+
num_processes: 1
|
10 |
+
rdzv_backend: static
|
11 |
+
same_network: true
|
12 |
+
tpu_env: []
|
13 |
+
tpu_use_cluster: false
|
14 |
+
tpu_use_sudo: false
|
15 |
+
use_cpu: false
|
accelerator_config/gpu_4_config.yml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
compute_environment: LOCAL_MACHINE
|
2 |
+
distributed_type: MULTI_GPU
|
3 |
+
downcast_bf16: 'no'
|
4 |
+
gpu_ids: 0,1,2,3
|
5 |
+
machine_rank: 0
|
6 |
+
main_training_function: main
|
7 |
+
mixed_precision: 'no'
|
8 |
+
num_machines: 1
|
9 |
+
num_processes: 4
|
10 |
+
rdzv_backend: static
|
11 |
+
same_network: true
|
12 |
+
tpu_env: []
|
13 |
+
tpu_use_cluster: false
|
14 |
+
tpu_use_sudo: false
|
15 |
+
use_cpu: false
|
16 |
+
main_process_port: 29600
|
dataset/SurgDataset.py
CHANGED
@@ -28,6 +28,8 @@ class SurgDataset(Dataset):
|
|
28 |
if os.path.isfile(args.data_path):
|
29 |
with open(args.data_path) as f:
|
30 |
self.data_json = json.load(f)
|
|
|
|
|
31 |
else:
|
32 |
self.data_json_path = os.path.join(args.data_path, 'test.json')
|
33 |
if os.path.isfile(self.data_json_path):
|
|
|
28 |
if os.path.isfile(args.data_path):
|
29 |
with open(args.data_path) as f:
|
30 |
self.data_json = json.load(f)
|
31 |
+
if len(self.data_json) > 200:
|
32 |
+
self.data_json = self.data_json[:200]
|
33 |
else:
|
34 |
self.data_json_path = os.path.join(args.data_path, 'test.json')
|
35 |
if os.path.isfile(self.data_json_path):
|
eval_output/4dor_count_eval.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/4dor_count_eval_llama3_llava_finetune.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/4dor_count_eval_llama3_llava_ori.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/4dor_phase_eval.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/pwiseg_count_eval_llama3_llava.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/pwiseg_count_eval_llama3_llava_finetune.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/results_4dor_finetune/preds.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/results_4dor_finetune/preds_count.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/results_4dor_finetune/preds_description.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/results_4dor_ori/preds.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/results_4dor_ori/preds_count.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/results_4dor_ori/preds_description.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/results_4dor_self_finetune_llava/preds.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/results_4dor_self_finetune_llava/preds_classification.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/results_4dor_self_finetune_llava/preds_count.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/results_4dor_self_finetune_llava/preds_description.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/results_4dor_self_finetune_llava/preds_phase.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/results_pwiseg_finetune/preds.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/results_pwiseg_finetune/preds_count.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/results_pwiseg_finetune/preds_description.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/results_pwiseg_ori/preds.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/results_pwiseg_ori/preds_count.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_output/results_pwiseg_ori/preds_description.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval_scripts/caption_eval.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
|
3 |
+
from pycocoevalcap.rouge.rouge import Rouge
|
4 |
+
from pycocoevalcap.cider.cider import Cider
|
5 |
+
import itertools
|
6 |
+
def load_json(filename):
    """Load and return the JSON document stored at ``filename``.

    The file is decoded explicitly as UTF-8 so that non-ASCII text is read
    the same way regardless of the platform's default locale encoding.

    Args:
        filename: path of the JSON file to read.

    Returns:
        The deserialized JSON value.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)
|
9 |
+
|
10 |
+
def extract_answers(llm_data, gt_data):
    """Index predictions and reference answers by sample id.

    Args:
        llm_data: iterable of dicts with ``'id'`` and ``'answer'`` keys.
        gt_data: iterable of dicts with ``'id'`` and ``'conversations'`` keys,
            where each conversation turn is a dict with ``'from'`` and
            ``'value'`` keys.

    Returns:
        ``(llm_answers, gt_answers)``: two dicts mapping id -> answer text.
        The reference answer is the value of the first turn whose ``'from'``
        is ``'gpt'``. Ground-truth items without such a turn are left out
        instead of raising ``IndexError``.
    """
    llm_answers = {item['id']: item['answer'] for item in llm_data}

    gt_answers = {}
    for item in gt_data:
        replies = [turn['value'] for turn in item['conversations'] if turn['from'] == 'gpt']
        if replies:
            gt_answers[item['id']] = replies[0]

    return llm_answers, gt_answers
|
14 |
+
|
15 |
+
def compute_bleu_scores(reference, hypothesis):
    """Return sentence-level BLEU-1 through BLEU-4 for one tokenized pair.

    Args:
        reference: list of tokens of the reference sentence.
        hypothesis: list of tokens of the predicted sentence.

    Returns:
        A list of four scores, ordered BLEU-1, BLEU-2, BLEU-3, BLEU-4.
    """
    smooth_fn = SmoothingFunction().method1
    # Uniform n-gram weights. BLEU-3 uses exact thirds so that the weights
    # sum to 1 (0.33 * 3 == 0.99 would slightly bias the score).
    weights = [
        (1, 0, 0, 0),              # BLEU-1
        (1 / 2, 1 / 2, 0, 0),      # BLEU-2
        (1 / 3, 1 / 3, 1 / 3, 0),  # BLEU-3
        (1 / 4, 1 / 4, 1 / 4, 1 / 4),  # BLEU-4
    ]
    return [
        sentence_bleu([reference], hypothesis, weights=w, smoothing_function=smooth_fn)
        for w in weights
    ]
|
25 |
+
|
26 |
+
def compute_rouge_scores(references, hypotheses):
    """Score ``hypotheses`` against ``references`` with the ``Rouge`` scorer.

    Both arguments are passed straight to ``Rouge.compute_score``; only the
    first element of the pair it returns is handed back to the caller.
    """
    overall, _details = Rouge().compute_score(references, hypotheses)
    return overall
|
30 |
+
|
31 |
+
def compute_cider_scores(references, hypotheses):
    """Score ``hypotheses`` against ``references`` with the ``Cider`` scorer.

    Both arguments are passed straight to ``Cider.compute_score``; only the
    first element of the pair it returns is handed back to the caller.
    """
    overall, _details = Cider().compute_score(references, hypotheses)
    return overall
|
35 |
+
|
36 |
+
def main(llm_file, gt_file):
    """Print average BLEU-1..4, ROUGE-L and CIDEr for one prediction file.

    Args:
        llm_file: JSON file of predictions (items with ``id`` and ``answer``).
        gt_file: JSON file of ground-truth conversations.

    Predictions whose id is missing from the ground truth are reported and
    skipped. If nothing matches, a message is printed and no scores are
    computed (previously this ended in a ZeroDivisionError).
    """
    llm_data = load_json(llm_file)
    gt_data = load_json(gt_file)

    llm_answers, gt_answers = extract_answers(llm_data, gt_data)

    bleu_scores = {i: [] for i in range(4)}
    references = {}
    hypotheses = {}

    for sample_id, prediction in llm_answers.items():
        if sample_id not in gt_answers:
            print(f"ID {sample_id} not found in ground truth data.")
            continue

        target = gt_answers[sample_id]
        # BLEU works on whitespace tokens; the ROUGE/CIDEr scorers receive
        # the raw strings wrapped in single-element lists.
        bleu = compute_bleu_scores(target.split(), prediction.split())
        for i in range(4):
            bleu_scores[i].append(bleu[i])
        references[sample_id] = [target]
        hypotheses[sample_id] = [prediction]

    if not references:
        print("No prediction ids matched the ground truth; nothing to score.")
        return

    rouge_scores = compute_rouge_scores(references, hypotheses)
    cider_scores = compute_cider_scores(references, hypotheses)

    avg_bleu_scores = [sum(scores) / len(scores) for scores in bleu_scores.values()]

    print(f"Average BLEU-1: {avg_bleu_scores[0]:.4f}")
    print(f"Average BLEU-2: {avg_bleu_scores[1]:.4f}")
    print(f"Average BLEU-3: {avg_bleu_scores[2]:.4f}")
    print(f"Average BLEU-4: {avg_bleu_scores[3]:.4f}")
    print(f"Average ROUGE-L: {rouge_scores:.4f}")
    print(f"Average CIDEr: {cider_scores:.4f}")
|
73 |
+
|
74 |
+
|
75 |
+
def test():
    """Run ``main`` for every dataset / model-variant combination.

    Iterates the ground-truth datasets in ``data_gt`` crossed with the model
    variants in ``llm_type`` and scores each ``preds_description.json`` file.
    The paths are hard-coded to the authors' machine.
    """
    llm_type = ['finetune', 'ori']
    data_gt = {'pwiseg': '/mnt1/wjl/InternLM-XComposer/output/GT/pwiseg_caption_instruct_0712_test.json',
               '4dor': '/mnt1/lyc/llava_finetune/data_json/4dor_caption_instruct_0711_test.json'
               }
    for dt, lt in itertools.product(data_gt, llm_type):
        print(f'[INFO] data {dt} llm {lt}')
        llm_file = f'/mnt1/lyc/llava_finetune/eval_output/results_{dt}_{lt}/preds_description.json'
        gt_file = data_gt[dt]
        main(llm_file, gt_file)
        print()
        print()
|
89 |
+
|
90 |
+
if __name__ == '__main__':
|
91 |
+
test()
|
92 |
+
|
eval_scripts/count_eval.py
ADDED
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from openai import OpenAI
|
3 |
+
import re
|
4 |
+
|
5 |
+
|
6 |
+
def load_json(filename):
    """Load and return the JSON document stored at ``filename``.

    The file is decoded explicitly as UTF-8 so that non-ASCII text is read
    the same way regardless of the platform's default locale encoding.

    Args:
        filename: path of the JSON file to read.

    Returns:
        The deserialized JSON value.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)
|
9 |
+
|
10 |
+
|
11 |
+
# SECURITY: the API key must never be committed to the repository. It is read
# from the OPENAI_API_KEY environment variable (KeyError if unset). The key
# that was previously hard-coded here should be revoked.
# OPENAI_BASE_URL optionally overrides the proxy endpoint used by default.
import os

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"],
                base_url=os.environ.get("OPENAI_BASE_URL", "https://vip.yi-zhan.top/v1"))
|
13 |
+
|
14 |
+
def get_result(prompt, model="gpt-4o-2024-05-13", temperature=0.8):
    """Send ``prompt`` to the judge model and return the text of its reply.

    Args:
        prompt: full user prompt, sent as a single text part.
        model: chat model name; defaults to the model used so far.
        temperature: sampling temperature; the default of 0.8 keeps the
            existing behaviour. Pass 0 for more repeatable verdicts.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": [{"type": "text", "text": prompt}]},
        ],
        stream=False,
        temperature=temperature,
    )
    return response.choices[0].message.content
|
29 |
+
|
30 |
+
|
31 |
+
def create_prompt(question, llm_answer, gt_answer):
    """Build the judge prompt for one counting question.

    Args:
        question: the question that was asked.
        llm_answer: the answer produced by the model under evaluation.
        gt_answer: the annotated reference answer.

    Returns:
        The prompt text with the three values substituted in.

    The output-format section shows a complete JSON object with quoted
    string values, because the reply parser looks for ``"answer": "..."``
    and ``"reason": "..."``. The literal braces are doubled so that
    ``str.format`` leaves them in place.
    """
    template = """
## Role
You are a judge, tasked with determining whether the answers provided by other large language models are consistent with the annotated data, especially in terms of numerical accuracy.

## Question
```json
{question}
```

## LLM Answer
```json
{llm_answer}
```

## Annotated Answer
```json
{gt_answer}
```

## Task
For a given Question, evaluate whether the LLM Answer is consistent with the Annotated Answer. If it is, please answer yes and give a reason. If it is not, please answer no and give a reason.

## Constraints
- Your response should be divided into two parts: 'answer' and 'reason'. The 'answer' should be either 'yes' or 'no', indicating whether the large language model's prediction aligns with the annotated information, particularly in terms of quantities. The 'reason' should provide the rationale for your answer.
- When evaluating the accuracy of the large language model's prediction, please pay close attention to the counting of quantities in the model's response and whether it matches the quantities provided in the standard information.
- output format is a json dict as follows:
{{
    "reason": "<reason>",
    "answer": "<yes or no>"
}}

Take a deep breath and start your answer step by step.
"""

    return template.format(question=question,
                           llm_answer=llm_answer,
                           gt_answer=gt_answer)
|
69 |
+
|
70 |
+
# def extract_answer(response_text):
|
71 |
+
# pattern = r'"answer":\s*"([^"]+)"'
|
72 |
+
# match = re.search(pattern, response_text)
|
73 |
+
# print(match)
|
74 |
+
# if match:
|
75 |
+
# return match.group(1).lower() == 'yes'
|
76 |
+
# return False
|
77 |
+
|
78 |
+
def extract_answer(json_string):
    """Pull the judge's verdict and rationale out of its raw reply.

    The reply is scanned for ``"answer": "..."`` and ``"reason": "..."``
    pairs rather than parsed as a whole, so surrounding prose or Markdown
    fences are tolerated.

    Args:
        json_string: raw text returned by the judge model. ``None`` or an
            empty string is treated as "nothing found".

    Returns:
        ``(answer, reason)``; either element is ``None`` when its field is
        absent from the reply.
    """
    text = json_string or ""

    def _field(key):
        # Allow backslash escapes inside the value so that an escaped quote
        # (\") does not cut the match short.
        match = re.search(r'"%s":\s*"((?:[^"\\]|\\.)+)"' % key, text, re.DOTALL)
        if match is None:
            return None
        raw = match.group(1)
        try:
            # Decode JSON escapes; strict=False accepts literal newlines.
            return json.loads('"%s"' % raw, strict=False)
        except ValueError:
            # Not a valid JSON string body: hand back the text as matched.
            return raw

    return _field("answer"), _field("reason")
|
88 |
+
|
89 |
+
|
90 |
+
def main(llm_file, gt_file, out_file):
    """Judge every prediction against its annotation and save the verdicts.

    Args:
        llm_file: JSON file of predictions (items with ``id``, ``question``
            and ``answer``).
        gt_file: JSON file of ground truth; the reference answer is read from
            ``conversations[1]["value"]`` of the item with the same ``id``.
        out_file: path the verdict list is written to. It is rewritten after
            every judged sample so partial progress survives an interruption.

    Returns:
        A list of ``{"id", "answer", "reason"}`` dicts, one per judged sample.
        The raw judge reply is no longer appended next to each record.
    """
    llm_data = load_json(llm_file)
    gt_data = load_json(gt_file)

    QA_dict = {item["id"]: {} for item in llm_data}

    for item in llm_data:
        qid = item["id"]
        QA_dict[qid]["question"] = item["question"]
        QA_dict[qid]["llm_answer"] = item["answer"]

    for item in gt_data:
        qid = item["id"]
        if qid in QA_dict:
            QA_dict[qid]["gt_answer"] = item["conversations"][1]["value"]

    compares = []
    correct_ans = 0
    total = len(QA_dict)
    for ix, (qid, item) in enumerate(QA_dict.items()):
        if "gt_answer" not in item:
            # No annotation for this prediction: skip it rather than abort
            # the whole run with a KeyError.
            print("break", item)
            continue

        prompt = create_prompt(item["question"], item["llm_answer"], item["gt_answer"])

        try:
            compare = get_result(prompt=prompt)
        except Exception as exc:
            # A failed request skips the sample. Ctrl-C still propagates,
            # which a bare ``except:`` would have swallowed.
            print("break", item, exc)
            continue

        answer, reason = extract_answer(compare)
        compares.append({"id": qid, "answer": answer, "reason": reason})
        with open(out_file, 'w', encoding='utf-8') as f:
            json.dump(compares, f, indent=4)

        # An unparseable reply (answer is None) counts as wrong.
        if answer is not None and answer.lower() == 'yes':
            correct_ans = correct_ans + 1
            print(f"#correct \n answer:{answer},\n reason:{reason}")
        else:
            print(f"#wrong \n answer:{answer},\n reason:{reason}")

        print(f"[step {ix}, correct {correct_ans}, total {total}, rate {correct_ans/total}]")

    return compares
|
138 |
+
|
139 |
+
|
140 |
+
|
141 |
+
if __name__ == "__main__":
|
142 |
+
|
143 |
+
#################################################
|
144 |
+
# 4dor count #
|
145 |
+
#################################################
|
146 |
+
|
147 |
+
# # intern fintuned: [step 200, correct 75, total 200, rate 0.375]
|
148 |
+
# llm_file = '/mnt1/wjl/InternLM-XComposer/output/finetune_0712_pwi+4dor_epoch2/internlm-xcomposer2-vl-7b/results/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
149 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/4dor_count_instruct_0711_test.json' # 替换为你的 GT 文件路径
|
150 |
+
# out_file = '/mnt1/wjl/InternLM-XComposer/output/finetune_0712_pwi+4dor_epoch2/internlm-xcomposer2-vl-7b/results_eval/4dor_count_instruct_0711_test_compare.json' # 替换为你的 LLM 预测文件路径
|
151 |
+
# compares = main(llm_file, gt_file, out_file)
|
152 |
+
|
153 |
+
# # intern origin: step 199, correct 18, total 200, rate 0.09
|
154 |
+
# llm_file = '/mnt1/wjl/InternLM-XComposer/output/finetune_0712_pwi+4dor_epoch2/internlm-xcomposer2-vl-7b/origin_results/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
155 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/4dor_count_instruct_0711_test.json' # 替换为你的 GT 文件路径
|
156 |
+
# out_file = '/mnt1/wjl/InternLM-XComposer/output/finetune_0712_pwi+4dor_epoch2/internlm-xcomposer2-vl-7b/origin_results_eval/4dor_count_instruct_0711_test_compare.json' # 替换为你的 LLM 预测文件路径
|
157 |
+
# compares = main(llm_file, gt_file, out_file)
|
158 |
+
|
159 |
+
# llava 7b fintuned: [step 199, correct 111, total 200, rate 0.555]
|
160 |
+
# llm_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-7b-task-lora-2024-07-14-08/results/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
161 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/4dor_count_instruct_0711_test.json' # 替换为你的 GT 文件路径
|
162 |
+
# out_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-7b-task-lora-2024-07-14-08/results_eval/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
163 |
+
# compares = main(llm_file, gt_file, out_file)
|
164 |
+
|
165 |
+
# ## llava 7b origin: [step 199, correct 44, total 200, rate 0.22]
|
166 |
+
# llm_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-7b-orign-results/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
167 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/4dor_count_instruct_0711_test.json' # 替换为你的 GT 文件路径
|
168 |
+
# out_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-7b-orign-results-eval/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
169 |
+
# compares = main(llm_file, gt_file, out_file)
|
170 |
+
|
171 |
+
## llava 13b fintuned: [step 199, correct 125, total 200, rate 0.625]
|
172 |
+
# llm_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-13b-task-lora-2024-07-14-07/results/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
173 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/4dor_count_instruct_0711_test.json' # 替换为你的 GT 文件路径
|
174 |
+
# out_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-13b-task-lora-2024-07-14-07/results_eval/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
175 |
+
# compares = main(llm_file, gt_file, out_file)
|
176 |
+
|
177 |
+
# ## llava 13b origin: [step 199, correct 16, total 200, rate 0.08]
|
178 |
+
# llm_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-13b-origin-results/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
179 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/4dor_count_instruct_0711_test.json' # 替换为你的 GT 文件路径
|
180 |
+
# out_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-13b-origin-results-eval/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
181 |
+
# compares = main(llm_file, gt_file, out_file)
|
182 |
+
|
183 |
+
#################################################
|
184 |
+
# pwi count #
|
185 |
+
#################################################
|
186 |
+
|
187 |
+
# intern fintuned: step 199, correct 60, total 200, rate 0.3
|
188 |
+
# llm_file = '/mnt1/wjl/InternLM-XComposer/output/finetune_0712_pwi+4dor_epoch2/internlm-xcomposer2-vl-7b/results/pwiseg_count_instruct_0712_test_results.json' # 替换为你的 LLM 预测文件路径
|
189 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/pwiseg_count_instruct_0712_test.json' # 替换为你的 GT 文件路径
|
190 |
+
# out_file = '/mnt1/wjl/InternLM-XComposer/output/finetune_0712_pwi+4dor_epoch2/internlm-xcomposer2-vl-7b/results_eval/pwiseg_count_instruct_0712_test_compare.json' # 替换为你的 LLM 预测文件路径
|
191 |
+
# compares = main(llm_file, gt_file, out_file)
|
192 |
+
|
193 |
+
# # intern origin: step 199, correct 22, total 200, rate 0.11
|
194 |
+
# llm_file = '/mnt1/wjl/InternLM-XComposer/output/finetune_0712_pwi+4dor_epoch2/internlm-xcomposer2-vl-7b/origin_results/pwiseg_count_instruct_0712_test_results.json' # 替换为你的 LLM 预测文件路径
|
195 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/pwiseg_count_instruct_0712_test.json' # 替换为你的 GT 文件路径
|
196 |
+
# out_file = '/mnt1/wjl/InternLM-XComposer/output/finetune_0712_pwi+4dor_epoch2/internlm-xcomposer2-vl-7b/origin_results_eval/pwiseg_count_instruct_0712_test_results.json' # 替换为你的 LLM 预测文件路径
|
197 |
+
# compares = main(llm_file, gt_file, out_file)
|
198 |
+
|
199 |
+
# llava 7b fintuned: step 198, correct 140, total 200, rate 0.7
|
200 |
+
# llm_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-7b-task-lora-2024-07-14-08/results/pwiseg_count_instruct_0712_test_results.json' # 替换为你的 LLM 预测文件路径
|
201 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/pwiseg_count_instruct_0712_test.json' # 替换为你的 GT 文件路径
|
202 |
+
# out_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-7b-task-lora-2024-07-14-08/results_eval/pwiseg_count_instruct_0712_test_results.json' # 替换为你的 LLM 预测文件路径
|
203 |
+
# compares = main(llm_file, gt_file, out_file)
|
204 |
+
|
205 |
+
# ## llava 7b origin: step 199, correct 12, total 200, rate 0.06
|
206 |
+
# llm_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-7b-orign-results/pwiseg_count_instruct_0712_test_results.json' # 替换为你的 LLM 预测文件路径
|
207 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/pwiseg_count_instruct_0712_test.json' # 替换为你的 GT 文件路径
|
208 |
+
# out_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-7b-orign-results-eval/pwiseg_count_instruct_0712_test_results.json' # 替换为你的 LLM 预测文件路径
|
209 |
+
# compares = main(llm_file, gt_file, out_file)
|
210 |
+
|
211 |
+
# # # llava 13b fintuned: step 199, correct 142, total 200, rate 0.71
|
212 |
+
# llm_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-13b-task-lora-2024-07-14-07/results/pwiseg_count_instruct_0712_test_results.json' # 替换为你的 LLM 预测文件路径
|
213 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/pwiseg_count_instruct_0712_test.json' # 替换为你的 GT 文件路径
|
214 |
+
# out_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-13b-task-lora-2024-07-14-07/pwiseg_count_instruct_0712_test_results.json' # 替换为你的 LLM 预测文件路径
|
215 |
+
# compares = main(llm_file, gt_file, out_file)
|
216 |
+
|
217 |
+
## llava 13b origin: [step 199, correct 142, total 200, rate 0.71]
|
218 |
+
# llm_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-13b-origin-results/pwiseg_count_instruct_0712_test_results.json' # 替换为你的 LLM 预测文件路径
|
219 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/pwiseg_count_instruct_0712_test.json' # 替换为你的 GT 文件路径
|
220 |
+
# out_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-13b-origin-results-eval/pwiseg_count_instruct_0712_test_results.json' # 替换为你的 LLM 预测文件路径
|
221 |
+
# compares = main(llm_file, gt_file, out_file)
|
222 |
+
|
223 |
+
## LLaVA-NeXT
|
224 |
+
llm_file = '/mnt1/lyc/llava_finetune/eval_output/results_pwiseg_ori/preds_count.json' # 替换为你的 LLM 预测文件路径
|
225 |
+
gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/pwiseg_count_instruct_0712_test.json' # 替换为你的 GT 文件路径
|
226 |
+
out_file = '/mnt1/lyc/llava_finetune/eval_output/pwiseg_count_eval_llama3_llava.json' # 替换为你的 LLM 预测文件路径
|
227 |
+
|
228 |
+
|
229 |
+
# llm_file = '/mnt1/lyc/llava_finetune/eval_output/results_4dor_ori/preds_count.json' # 替换为你的 LLM 预测文件路径
|
230 |
+
# gt_file = '/mnt1/lyc/llava_finetune/data_json/4dor_count_instruct_0711_test.json' # 替换为你的 GT 文件路径
|
231 |
+
# out_file = '/mnt1/lyc/llava_finetune/eval_output/4dor_count_eval_llama3_llava_ori.json' # 替换为你的 LLM 预测文件路径
|
232 |
+
|
233 |
+
compares = main(llm_file, gt_file, out_file)
|
234 |
+
|
235 |
+
|
236 |
+
|
237 |
+
|
eval_scripts/phase_eval.py
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from openai import OpenAI
|
3 |
+
import re
|
4 |
+
|
5 |
+
|
6 |
+
def load_json(filename):
    """Load and return the JSON document stored at ``filename``.

    The file is decoded explicitly as UTF-8 so that non-ASCII text is read
    the same way regardless of the platform's default locale encoding.

    Args:
        filename: path of the JSON file to read.

    Returns:
        The deserialized JSON value.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)
|
9 |
+
|
10 |
+
|
11 |
+
# SECURITY: the API key must never be committed to the repository. It is read
# from the OPENAI_API_KEY environment variable (KeyError if unset). The key
# that was previously hard-coded here should be revoked.
# OPENAI_BASE_URL optionally overrides the proxy endpoint used by default.
import os

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"],
                base_url=os.environ.get("OPENAI_BASE_URL", "https://vip.yi-zhan.top/v1"))
|
13 |
+
|
14 |
+
def get_result(prompt, model="gpt-4o-2024-05-13", temperature=0.8):
    """Send ``prompt`` to the judge model and return the text of its reply.

    Args:
        prompt: full user prompt, sent as a single text part.
        model: chat model name; defaults to the model used so far.
        temperature: sampling temperature; the default of 0.8 keeps the
            existing behaviour. Pass 0 for more repeatable verdicts.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": [{"type": "text", "text": prompt}]},
        ],
        stream=False,
        temperature=temperature,
    )
    return response.choices[0].message.content
|
29 |
+
|
30 |
+
|
31 |
+
def create_prompt(question, llm_answer, gt_answer):
    """Build the judge prompt for one surgical-phase question.

    Args:
        question: the question that was asked.
        llm_answer: the answer produced by the model under evaluation.
        gt_answer: the annotated reference answer.

    Returns:
        The prompt text with the three values substituted in.

    The field names are spelled in lowercase and the output-format section
    shows a complete JSON object with quoted string values, because the
    reply parser looks for ``"answer": "..."`` and ``"reason": "..."``.
    The literal braces are doubled so that ``str.format`` leaves them in place.
    """
    template = """
## Role
You are a fair judge, comparing the LLM answer with the annotated answer, and evaluating whether the answer is accurate about the understanding of the surgical stage.

## Question
```json
{question}
```

## LLM answer
```json
{llm_answer}
```

## Annotated answer
```json
{gt_answer}
```

## Task
For the given question, evaluate whether the LLM answer is consistent with the annotated answer. If yes, answer yes and give a reason. If no, answer no and give a reason.

## Constraints
- Your answer should be divided into two parts: "answer" and "reason". "answer" should be "yes" or "no", indicating whether the large language model's prediction is consistent with the annotation information. "reason" should provide the reason for your answer.
- When evaluating the accuracy of the LLM's prediction, pay close attention to whether the model's answer is accurate about the understanding of the surgical phase, including whether the surgical stage is correctly identified and whether the operation suggestion given is appropriate.
- Output format is a JSON dictionary, as shown below:
{{
    "reason": "<reason>",
    "answer": "<yes or no>"
}}

Take a deep breath and start answering step by step.
"""

    return template.format(question=question,
                           llm_answer=llm_answer,
                           gt_answer=gt_answer)
|
70 |
+
|
71 |
+
|
72 |
+
def extract_answer(json_string):
    """Pull the judge's verdict and rationale out of its raw reply.

    The reply is scanned for ``"answer": "..."`` and ``"reason": "..."``
    pairs rather than parsed as a whole, so surrounding prose or Markdown
    fences are tolerated.

    Args:
        json_string: raw text returned by the judge model. ``None`` or an
            empty string is treated as "nothing found".

    Returns:
        ``(answer, reason)``; either element is ``None`` when its field is
        absent from the reply.
    """
    text = json_string or ""

    def _field(key):
        # Allow backslash escapes inside the value so that an escaped quote
        # (\") does not cut the match short.
        match = re.search(r'"%s":\s*"((?:[^"\\]|\\.)+)"' % key, text, re.DOTALL)
        if match is None:
            return None
        raw = match.group(1)
        try:
            # Decode JSON escapes; strict=False accepts literal newlines.
            return json.loads('"%s"' % raw, strict=False)
        except ValueError:
            # Not a valid JSON string body: hand back the text as matched.
            return raw

    return _field("answer"), _field("reason")
|
82 |
+
|
83 |
+
|
84 |
+
def main(llm_file, gt_file, out_file):
    """Judge every prediction against its annotation and save the verdicts.

    Args:
        llm_file: JSON file of predictions (items with ``id``, ``question``
            and ``answer``).
        gt_file: JSON file of ground truth; the reference answer is read from
            ``conversations[1]["value"]`` of the item with the same ``id``.
        out_file: path the verdict list is written to. It is rewritten after
            every judged sample so partial progress survives an interruption.

    Returns:
        A list of ``{"id", "answer", "reason"}`` dicts, one per judged sample.
        The raw judge reply is no longer appended next to each record.
    """
    llm_data = load_json(llm_file)
    gt_data = load_json(gt_file)

    QA_dict = {item["id"]: {} for item in llm_data}

    for item in llm_data:
        qid = item["id"]
        QA_dict[qid]["question"] = item["question"]
        QA_dict[qid]["llm_answer"] = item["answer"]

    for item in gt_data:
        qid = item["id"]
        if qid in QA_dict:
            QA_dict[qid]["gt_answer"] = item["conversations"][1]["value"]

    compares = []
    correct_ans = 0
    total = len(QA_dict)
    for ix, (qid, item) in enumerate(QA_dict.items()):
        if "gt_answer" not in item:
            # No annotation for this prediction: skip it rather than abort
            # the whole run with a KeyError.
            print("break", item)
            continue

        prompt = create_prompt(item["question"], item["llm_answer"], item["gt_answer"])

        try:
            compare = get_result(prompt=prompt)
        except Exception as exc:
            # A failed request skips the sample. Ctrl-C still propagates,
            # which a bare ``except:`` would have swallowed.
            print("break", item, exc)
            continue

        answer, reason = extract_answer(compare)
        compares.append({"id": qid, "answer": answer, "reason": reason})
        with open(out_file, 'w', encoding='utf-8') as f:
            json.dump(compares, f, indent=4)

        # An unparseable reply (answer is None) counts as wrong.
        if answer is not None and answer.lower() == 'yes':
            correct_ans = correct_ans + 1
            print(f"#correct \n answer:{answer},\n reason:{reason}")
        else:
            print(f"#wrong \n answer:{answer},\n reason:{reason}")

        print(f"[step {ix}, correct {correct_ans}, total {total}, rate {correct_ans/total}]")

    return compares
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
if __name__ == "__main__":
|
136 |
+
|
137 |
+
# # intern fintuned: step 199, correct 42, total 200, rate 0.21
|
138 |
+
# llm_file = '/mnt1/wjl/InternLM-XComposer/output/finetune_0712_pwi+4dor_epoch2/internlm-xcomposer2-vl-7b/results/4dor_phase_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
139 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/4dor_phase_instruct_0711_test.json' # 替换为你的 GT 文件路径
|
140 |
+
# out_file = '/mnt1/wjl/InternLM-XComposer/output/finetune_0712_pwi+4dor_epoch2/internlm-xcomposer2-vl-7b/results_eval/4dor_phase_instruct_0711_test_compare.json' # 替换为你的 LLM 预测文件路径
|
141 |
+
# compares = main(llm_file, gt_file, out_file)
|
142 |
+
|
143 |
+
# intern origin:
|
144 |
+
# llm_file = '/mnt1/wjl/InternLM-XComposer/output/finetune_0712_pwi+4dor_epoch2/internlm-xcomposer2-vl-7b/origin_results/4dor_phase_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
145 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/4dor_phase_instruct_0711_test.json' # 替换为你的 GT 文件路径
|
146 |
+
# out_file = '/mnt1/wjl/InternLM-XComposer/output/finetune_0712_pwi+4dor_epoch2/internlm-xcomposer2-vl-7b/origin_results_eval/4dor_phase_instruct_0711_test_compare.json' # 替换为你的 LLM 预测文件路径
|
147 |
+
# compares = main(llm_file, gt_file, out_file)
|
148 |
+
|
149 |
+
# llava 7b fintuned: [step 199, correct 111, total 200, rate 0.555]
|
150 |
+
# llm_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-7b-task-lora-2024-07-14-08/results/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
151 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/4dor_count_instruct_0711_test.json' # 替换为你的 GT 文件路径
|
152 |
+
# out_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-7b-task-lora-2024-07-14-08/results_eval/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
153 |
+
# compares = main(llm_file, gt_file, out_file)
|
154 |
+
|
155 |
+
# ## llava 7b origin: [step 199, correct 44, total 200, rate 0.22]
|
156 |
+
# llm_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-7b-orign-results/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
157 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/4dor_count_instruct_0711_test.json' # 替换为你的 GT 文件路径
|
158 |
+
# out_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-7b-orign-results-eval/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
159 |
+
# compares = main(llm_file, gt_file, out_file)
|
160 |
+
|
161 |
+
## llava 13b fintuned: [step 199, correct 125, total 200, rate 0.625]
|
162 |
+
# llm_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-13b-task-lora-2024-07-14-07/results/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
163 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/4dor_count_instruct_0711_test.json' # 替换为你的 GT 文件路径
|
164 |
+
# out_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-13b-task-lora-2024-07-14-07/results_eval/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
165 |
+
# compares = main(llm_file, gt_file, out_file)
|
166 |
+
|
167 |
+
# ## llava 13b origin
|
168 |
+
# llm_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-13b-origin-results/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
169 |
+
# gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/4dor_count_instruct_0711_test.json' # 替换为你的 GT 文件路径
|
170 |
+
# out_file = '/mnt1/wjl/LLaVA/checkpoints/llava-v1.5-13b-origin-results-eval/4dor_count_instruct_0711_test_results.json' # 替换为你的 LLM 预测文件路径
|
171 |
+
# compares = main(llm_file, gt_file, out_file)
|
172 |
+
|
173 |
+
|
174 |
+
llm_file = '/mnt1/lyc/llava_finetune/results_4dor/preds_phase.json' # 替换为你的 LLM 预测文件路径
|
175 |
+
gt_file = '/mnt1/lyc/llava_finetune/data_json/4dor_phase_instruct_0711_test.json' # 替换为你的 GT 文件路径
|
176 |
+
out_file = '/mnt1/lyc/llava_finetune/eval_output/4dor_phase_eval.json' # 替换为你的 LLM 预测文件路径
|
177 |
+
compares = main(llm_file, gt_file, out_file)
|
178 |
+
|
179 |
+
|
run_finetune_llava.py
CHANGED
@@ -2,7 +2,7 @@ from llava.model.builder import load_pretrained_model
|
|
2 |
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
|
3 |
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
|
4 |
from llava.conversation import conv_templates, SeparatorStyle
|
5 |
-
from peft import LoraConfig, get_peft_model
|
6 |
from PIL import Image
|
7 |
import requests
|
8 |
import copy
|
@@ -10,7 +10,7 @@ import torch
|
|
10 |
import argparse
|
11 |
from dataset.SurgDataset import SurgDataset
|
12 |
from accelerate import Accelerator
|
13 |
-
from
|
14 |
import os
|
15 |
from tqdm import tqdm
|
16 |
import json
|
@@ -38,9 +38,10 @@ def parse_args():
|
|
38 |
parser.add_argument('--step_size', type=int, default=300)
|
39 |
parser.add_argument('--gamma', type=float, default=0.95, help='gemma value of scheduler')
|
40 |
parser.add_argument('--num_epochs', type=int, default=1000)
|
41 |
-
|
42 |
parser.add_argument('--test', action='store_true')
|
43 |
-
parser.add_argument('--
|
|
|
44 |
parser.add_argument('--output_dir', type=str, default='4dor_output', help='output file path, which will store output text.')
|
45 |
return parser.parse_args()
|
46 |
def main():
|
@@ -48,7 +49,9 @@ def main():
|
|
48 |
accelerator = Accelerator(project_dir=os.path.join(args.ckpt_dir, args.model_name),
|
49 |
log_with="wandb" if args.wandb else None,
|
50 |
gradient_accumulation_steps=args.gradient_accumulation_steps)
|
|
|
51 |
if args.wandb:
|
|
|
52 |
accelerator.init_trackers(
|
53 |
project_name=args.wandb_project,
|
54 |
config=args,
|
@@ -57,55 +60,67 @@ def main():
|
|
57 |
accelerator.print("[Info] Using wandb for logging...")
|
58 |
pretrained = "lmms-lab/llama3-llava-next-8b"
|
59 |
model_name = "llava_llama3"
|
60 |
-
tokenizer,
|
61 |
-
tokenizer.pad_token_id = tokenizer.eos_token_id
|
62 |
-
|
63 |
-
|
|
|
|
|
|
|
64 |
train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size, num_workers=4)
|
65 |
test_dataloader = torch.utils.data.DataLoader(test_dataset, shuffle=True, batch_size=args.batch_size, num_workers=4)
|
66 |
-
|
67 |
-
|
|
|
68 |
param.requires_grad = False
|
69 |
-
|
70 |
-
print(f'[INFO] Using LoRA ...')
|
71 |
-
peft_config = LoraConfig(
|
72 |
-
lora_alpha=args.lora_rank,
|
73 |
-
lora_dropout=0.05,
|
74 |
-
r=args.lora_rank,
|
75 |
-
bias="none",
|
76 |
-
task_type="CAUSAL_LM",
|
77 |
-
target_modules=[
|
78 |
-
"q_proj",
|
79 |
-
"k_proj",
|
80 |
-
"v_proj",
|
81 |
-
"o_proj",
|
82 |
-
"gate_proj",
|
83 |
-
"up_proj",
|
84 |
-
"down_proj",
|
85 |
-
"lm_head",
|
86 |
-
],
|
87 |
-
)
|
88 |
-
lora_llm = get_peft_model(model, peft_config)
|
89 |
-
model = lora_llm.model
|
90 |
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
print(f'[INFO] Creating Model ...')
|
93 |
-
|
94 |
-
|
95 |
optimizer = torch.optim.AdamW(train_params, lr=args.lr, eps=1e-7)
|
96 |
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=len(train_dataloader) * args.step_size // args.gradient_accumulation_steps, gamma=args.gamma)
|
97 |
-
if args.checkpoint_path is not None:
|
98 |
-
# Stupid way to load lora...
|
99 |
-
# TODO: update this!
|
100 |
-
print(f'[INFO] Load checkpoint...')
|
101 |
-
whole_model = torch.load(os.path.join(args.checkpoint_path, 'pytorch_model.bin'), map_location='cpu')
|
102 |
-
train_model.load_state_dict(whole_model)
|
103 |
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
if args.test:
|
106 |
# testing code
|
107 |
accelerator.print(f'[INFO] Start testing...')
|
108 |
-
|
109 |
with torch.no_grad():
|
110 |
os.makedirs(args.output_dir, exist_ok=True)
|
111 |
output_list = []
|
@@ -116,7 +131,7 @@ def main():
|
|
116 |
image_sizes = image_sizes[0]
|
117 |
if len(image_sizes) != args.batch_size:
|
118 |
image_sizes = [torch.cat(image_sizes)]
|
119 |
-
output =
|
120 |
text_output = tokenizer.batch_decode(output, skip_special_tokens=True)
|
121 |
output_data = raw_data
|
122 |
output_data.update({'answer': text_output, 'question': question})
|
@@ -146,7 +161,7 @@ def main():
|
|
146 |
# initialize epoch-level metrics
|
147 |
accelerator.print(f'[INFO] Start training...')
|
148 |
for epoch in tqdm(range(args.num_epochs)):
|
149 |
-
|
150 |
total_train_loss = 0
|
151 |
for i, batch in enumerate(train_dataloader):
|
152 |
optimizer.zero_grad()
|
@@ -155,10 +170,10 @@ def main():
|
|
155 |
image_sizes = image_sizes[0]
|
156 |
if len(image_sizes) != args.batch_size:
|
157 |
image_sizes = [torch.cat(image_sizes)]
|
158 |
-
output =
|
159 |
loss = output.loss
|
160 |
# Accelerator requires all params to involve gradient descend. This 'dummy loss' can avoid this issue.
|
161 |
-
for param in
|
162 |
loss += param.sum() * 0.0
|
163 |
accelerator.backward(loss)
|
164 |
optimizer.step()
|
@@ -177,7 +192,7 @@ def main():
|
|
177 |
total_test_loss = None
|
178 |
if epoch % args.eval_interval == 0:
|
179 |
total_test_loss = 0
|
180 |
-
|
181 |
with torch.no_grad():
|
182 |
for i, batch in enumerate(test_dataloader):
|
183 |
raw_data, question, answer, image, image_sizes = batch
|
@@ -185,7 +200,7 @@ def main():
|
|
185 |
image_sizes = image_sizes[0]
|
186 |
if len(image_sizes) != args.batch_size:
|
187 |
image_sizes = [torch.cat(image_sizes)]
|
188 |
-
output =
|
189 |
text_output = tokenizer.batch_decode(output, skip_special_tokens=True)
|
190 |
if i % 100 == 0:
|
191 |
img_id = raw_data[0]['id']
|
@@ -198,7 +213,7 @@ def main():
|
|
198 |
save_model_dir = os.path.join(args.ckpt_dir, args.model_name, 'checkpoints', f'checkpoint_{epoch:05d}')
|
199 |
lora_save_dir = os.path.join(args.ckpt_dir, args.model_name, 'lora')
|
200 |
accelerator.save_state(save_model_dir, safe_serialization=False, total_limit=5)
|
201 |
-
unwrapped_model = accelerator.unwrap_model(
|
202 |
unwrapped_model.model.save_pretrained(
|
203 |
lora_save_dir,
|
204 |
save_function=accelerator.save,
|
|
|
2 |
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
|
3 |
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
|
4 |
from llava.conversation import conv_templates, SeparatorStyle
|
5 |
+
from peft import LoraConfig, get_peft_model, PeftModel
|
6 |
from PIL import Image
|
7 |
import requests
|
8 |
import copy
|
|
|
10 |
import argparse
|
11 |
from dataset.SurgDataset import SurgDataset
|
12 |
from accelerate import Accelerator
|
13 |
+
from llava.model.SurgLLaVA import SurgLLaVA
|
14 |
import os
|
15 |
from tqdm import tqdm
|
16 |
import json
|
|
|
38 |
parser.add_argument('--step_size', type=int, default=300)
|
39 |
parser.add_argument('--gamma', type=float, default=0.95, help='gemma value of scheduler')
|
40 |
parser.add_argument('--num_epochs', type=int, default=1000)
|
41 |
+
parser.add_argument('--lora', action='store_true', help='Use LoRA if True')
|
42 |
parser.add_argument('--test', action='store_true')
|
43 |
+
parser.add_argument('--lora_ckpt_path', type=str, default=None)
|
44 |
+
parser.add_argument('--ckpt_path', type=str, default=None)
|
45 |
parser.add_argument('--output_dir', type=str, default='4dor_output', help='output file path, which will store output text.')
|
46 |
return parser.parse_args()
|
47 |
def main():
|
|
|
49 |
accelerator = Accelerator(project_dir=os.path.join(args.ckpt_dir, args.model_name),
|
50 |
log_with="wandb" if args.wandb else None,
|
51 |
gradient_accumulation_steps=args.gradient_accumulation_steps)
|
52 |
+
|
53 |
if args.wandb:
|
54 |
+
print(f'[INFO] Using wandb for logging...')
|
55 |
accelerator.init_trackers(
|
56 |
project_name=args.wandb_project,
|
57 |
config=args,
|
|
|
60 |
accelerator.print("[Info] Using wandb for logging...")
|
61 |
pretrained = "lmms-lab/llama3-llava-next-8b"
|
62 |
model_name = "llava_llama3"
|
63 |
+
tokenizer, llm_model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map='cuda') # Add any other thing you want to pass in llava_model_args
|
64 |
+
# tokenizer.pad_token_id = tokenizer.eos_token_id
|
65 |
+
if tokenizer.pad_token is None:
|
66 |
+
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
|
67 |
+
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
|
68 |
+
train_dataset = SurgDataset(args, image_processor, llm_model.config, mode='train')
|
69 |
+
test_dataset = SurgDataset(args, image_processor, llm_model.config, mode='test')
|
70 |
train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size, num_workers=4)
|
71 |
test_dataloader = torch.utils.data.DataLoader(test_dataset, shuffle=True, batch_size=args.batch_size, num_workers=4)
|
72 |
+
|
73 |
+
print(f'[INFO] Freezing llm model')
|
74 |
+
for param in llm_model.parameters():
|
75 |
param.requires_grad = False
|
76 |
+
llm_model.eval()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
|
78 |
+
if args.lora:
|
79 |
+
if args.lora_ckpt_path is not None:
|
80 |
+
print(f'[INFO] Loading LoRA model checkpoint...')
|
81 |
+
llm_model = PeftModel.from_pretrained(llm_model, './model_ckpt/llama3-llava-next-8b-task-lora')
|
82 |
+
llm_model = llm_model.merge_and_unload()
|
83 |
+
else:
|
84 |
+
print(f'[INFO] Creating LoRA ...')
|
85 |
+
peft_config = LoraConfig(
|
86 |
+
lora_alpha=args.lora_rank,
|
87 |
+
lora_dropout=0.05,
|
88 |
+
r=args.lora_rank,
|
89 |
+
bias="none",
|
90 |
+
task_type="CAUSAL_LM",
|
91 |
+
target_modules=[
|
92 |
+
"q_proj",
|
93 |
+
"k_proj",
|
94 |
+
"v_proj",
|
95 |
+
"o_proj",
|
96 |
+
"gate_proj",
|
97 |
+
"up_proj",
|
98 |
+
"down_proj",
|
99 |
+
"lm_head",
|
100 |
+
],
|
101 |
+
)
|
102 |
+
lora_llm = get_peft_model(llm_model, peft_config)
|
103 |
+
llm_model = lora_llm.model
|
104 |
+
|
105 |
+
|
106 |
+
train_params = llm_model.parameters()
|
107 |
print(f'[INFO] Creating Model ...')
|
108 |
+
model = SurgLLaVA(args, llm_model, tokenizer)
|
109 |
+
model = model.to(torch.bfloat16)
|
110 |
optimizer = torch.optim.AdamW(train_params, lr=args.lr, eps=1e-7)
|
111 |
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=len(train_dataloader) * args.step_size // args.gradient_accumulation_steps, gamma=args.gamma)
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
|
113 |
+
if args.ckpt_path is not None:
|
114 |
+
print(f'[INFO] Load whole pretrained checkpoint...')
|
115 |
+
whole_model = torch.load(os.path.join(args.ckpt_path, 'pytorch_model.bin'), map_location='cpu')
|
116 |
+
model.load_state_dict(whole_model)
|
117 |
+
|
118 |
+
print(f'[INFO] Preparing accelerator...')
|
119 |
+
model, tokenizer, optimizer, scheduler, train_dataloader, test_dataloader = accelerator.prepare(model, tokenizer, optimizer, scheduler, train_dataloader, test_dataloader)
|
120 |
if args.test:
|
121 |
# testing code
|
122 |
accelerator.print(f'[INFO] Start testing...')
|
123 |
+
model.eval()
|
124 |
with torch.no_grad():
|
125 |
os.makedirs(args.output_dir, exist_ok=True)
|
126 |
output_list = []
|
|
|
131 |
image_sizes = image_sizes[0]
|
132 |
if len(image_sizes) != args.batch_size:
|
133 |
image_sizes = [torch.cat(image_sizes)]
|
134 |
+
output = model(image, image_sizes, question)
|
135 |
text_output = tokenizer.batch_decode(output, skip_special_tokens=True)
|
136 |
output_data = raw_data
|
137 |
output_data.update({'answer': text_output, 'question': question})
|
|
|
161 |
# initialize epoch-level metrics
|
162 |
accelerator.print(f'[INFO] Start training...')
|
163 |
for epoch in tqdm(range(args.num_epochs)):
|
164 |
+
model.train()
|
165 |
total_train_loss = 0
|
166 |
for i, batch in enumerate(train_dataloader):
|
167 |
optimizer.zero_grad()
|
|
|
170 |
image_sizes = image_sizes[0]
|
171 |
if len(image_sizes) != args.batch_size:
|
172 |
image_sizes = [torch.cat(image_sizes)]
|
173 |
+
output = model(image, image_sizes, question, answer)
|
174 |
loss = output.loss
|
175 |
# Accelerator requires all params to involve gradient descend. This 'dummy loss' can avoid this issue.
|
176 |
+
for param in model.parameters():
|
177 |
loss += param.sum() * 0.0
|
178 |
accelerator.backward(loss)
|
179 |
optimizer.step()
|
|
|
192 |
total_test_loss = None
|
193 |
if epoch % args.eval_interval == 0:
|
194 |
total_test_loss = 0
|
195 |
+
model.eval()
|
196 |
with torch.no_grad():
|
197 |
for i, batch in enumerate(test_dataloader):
|
198 |
raw_data, question, answer, image, image_sizes = batch
|
|
|
200 |
image_sizes = image_sizes[0]
|
201 |
if len(image_sizes) != args.batch_size:
|
202 |
image_sizes = [torch.cat(image_sizes)]
|
203 |
+
output = model(image, image_sizes, question, )
|
204 |
text_output = tokenizer.batch_decode(output, skip_special_tokens=True)
|
205 |
if i % 100 == 0:
|
206 |
img_id = raw_data[0]['id']
|
|
|
213 |
save_model_dir = os.path.join(args.ckpt_dir, args.model_name, 'checkpoints', f'checkpoint_{epoch:05d}')
|
214 |
lora_save_dir = os.path.join(args.ckpt_dir, args.model_name, 'lora')
|
215 |
accelerator.save_state(save_model_dir, safe_serialization=False, total_limit=5)
|
216 |
+
unwrapped_model = accelerator.unwrap_model(model)
|
217 |
unwrapped_model.model.save_pretrained(
|
218 |
lora_save_dir,
|
219 |
save_function=accelerator.save,
|
scripts/convert_gqa_for_eval.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import argparse
|
4 |
+
|
5 |
+
# Convert a JSONL predictions file (--src) into a single JSON list (--dst) of
# {"questionId", "prediction"} records.
parser = argparse.ArgumentParser()
parser.add_argument("--src", type=str)
parser.add_argument("--dst", type=str)
args = parser.parse_args()

all_answers = []
# Read the predictions inside a context manager so the handle is closed
# deterministically (the original iterated over a bare open() call).
with open(args.src) as src_file:
    for line in src_file:
        res = json.loads(line)
        question_id = res['question_id']
        # Normalise the answer: drop trailing periods and lower-case it.
        text = res['text'].rstrip('.').lower()
        all_answers.append({"questionId": question_id, "prediction": text})

with open(args.dst, 'w') as f:
    json.dump(all_answers, f)
|
scripts/convert_mmbench_for_submission.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import argparse
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
def get_args():
    """Parse the command line of the MMBench submission converter.

    Four string options are declared, all of them required:
    --annotation-file, --result-dir, --upload-dir and --experiment.

    Returns:
        The argparse namespace produced from ``sys.argv``.
    """
    cli = argparse.ArgumentParser()
    for flag in ("--annotation-file", "--result-dir", "--upload-dir", "--experiment"):
        cli.add_argument(flag, type=str, required=True)
    return cli.parse_args()
|
14 |
+
|
15 |
+
if __name__ == "__main__":
    args = get_args()

    # The annotation file is read as a delimited table.
    df = pd.read_table(args.annotation_file)

    # Work on a copy with the columns not needed for upload removed, and add
    # an empty 'prediction' column to be filled from the results file.
    cur_df = df.copy()
    cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category'])
    cur_df.insert(6, 'prediction', None)

    # Each line of <result-dir>/<experiment>.jsonl is one prediction object.
    # The context manager closes the file; the original left the handle open.
    with open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")) as pred_file:
        for pred in pred_file:
            pred = json.loads(pred)
            # Match rows through the annotation table's 'index' column.
            cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text']

    cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl')
|
scripts/convert_mmvet_for_eval.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import argparse
|
4 |
+
|
5 |
+
# Convert a JSONL predictions file (--src) into one JSON object (--dst) that
# maps "v1_<question_id>" to the predicted text.
parser = argparse.ArgumentParser()
parser.add_argument("--src", type=str)
parser.add_argument("--dst", type=str)
args = parser.parse_args()

cur_result = {}

# Read inside a context manager so the source handle is closed
# deterministically (the original iterated over a bare open() call).
with open(args.src) as src_file:
    for line in src_file:
        data = json.loads(line)
        qid = data['question_id']
        cur_result[f'v1_{qid}'] = data['text']

with open(args.dst, 'w') as f:
    json.dump(cur_result, f, indent=2)
|
scripts/convert_seed_for_submission.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import argparse
|
4 |
+
|
5 |
+
|
6 |
+
def get_args():
    """Parse the command line of the SEED submission converter.

    Three optional string options are declared:
    --annotation-file, --result-file and --result-upload-file.

    Returns:
        The argparse namespace produced from ``sys.argv``.
    """
    cli = argparse.ArgumentParser()
    for flag in ("--annotation-file", "--result-file", "--result-upload-file"):
        cli.add_argument(flag, type=str)
    return cli.parse_args()
|
12 |
+
|
13 |
+
|
14 |
+
def eval_single(result_file, eval_only_type=None):
    """Score a JSONL predictions file against the loaded annotations.

    Reads the module-level ``data`` (annotation dict with a 'questions' list)
    and ``ques_type_id_to_name`` (question-type id -> display name), both of
    which are set up by the script's ``__main__`` section.

    Args:
        result_file: Path to a JSONL file; every line is an object carrying
            at least 'question_id' and 'text'.
        eval_only_type: When given, only questions whose 'data_type' equals
            this value are scored and only the aggregate accuracy is printed.
            When ``None``, per-type accuracies are printed as well.

    Returns:
        Dict mapping each prediction's 'question_id' to its parsed row.
    """
    results = {}
    # Context manager so the predictions file is always closed.
    with open(result_file) as result_fp:
        for line in result_fp:
            row = json.loads(line)
            results[row['question_id']] = row

    type_counts = {}
    correct_counts = {}
    for question_data in data['questions']:
        if eval_only_type is not None and question_data['data_type'] != eval_only_type:
            continue
        data_type = question_data['question_type_id']
        type_counts[data_type] = type_counts.get(data_type, 0) + 1
        # Make sure every counted type has an entry. Previously a type whose
        # answered questions were all wrong had none and raised KeyError below.
        correct_counts.setdefault(data_type, 0)
        # Prediction ids may be stored as ints; fall back to the raw id when
        # the annotation id is not numeric.
        try:
            question_id = int(question_data['question_id'])
        except (TypeError, ValueError):
            question_id = question_data['question_id']
        if question_id not in results:
            # A missing prediction counts as a wrong answer.
            continue
        row = results[question_id]
        if row['text'] == question_data['answer']:
            correct_counts[data_type] += 1

    total_count = 0
    total_correct = 0
    for data_type in sorted(type_counts):
        accuracy = correct_counts[data_type] / type_counts[data_type] * 100
        if eval_only_type is None:
            print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%")

        total_count += type_counts[data_type]
        total_correct += correct_counts[data_type]

    # Guard against an empty selection (e.g. no question of the requested
    # data type), which previously raised ZeroDivisionError.
    total_accuracy = total_correct / total_count * 100 if total_count else 0.0
    if eval_only_type is None:
        print(f"Total accuracy: {total_accuracy:.2f}%")
    else:
        print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%")

    return results
|
54 |
+
|
55 |
+
if __name__ == "__main__":
    args = get_args()
    # Load the annotations; the context manager closes the file (the original
    # passed a bare open() to json.load and never closed it).
    with open(args.annotation_file) as annotation_fp:
        data = json.load(annotation_fp)
    # Invert the name -> id mapping so results can be printed by type name.
    ques_type_id_to_name = {type_id: name for name, type_id in data['question_type'].items()}

    # Full evaluation first (its parsed predictions are reused below), then
    # the per-data-type aggregates.
    results = eval_single(args.result_file)
    eval_single(args.result_file, eval_only_type='image')
    eval_single(args.result_file, eval_only_type='video')

    # Write one JSON line per annotated question for upload.
    with open(args.result_upload_file, 'w') as fp:
        for question in data['questions']:
            qid = question['question_id']
            # Predictions may be keyed by the raw id or by its int form.
            if qid in results:
                result = results[qid]
            else:
                result = results[int(qid)]
            fp.write(json.dumps({
                'question_id': qid,
                'prediction': result['text']
            }) + '\n')
|
scripts/convert_sqa_to_llava.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import fire
|
4 |
+
import re
|
5 |
+
from convert_sqa_to_llava_base_prompt import build_prompt_chatbot
|
6 |
+
|
7 |
+
|
8 |
+
def convert_to_llava(base_dir, split, prompt_format="QCM-LEA"):
    """Write one split of the problems as LLaVA-style conversation records.

    Reads ``pid_splits.json`` and ``problems.json`` from ``base_dir``, builds
    the (input, output) pairs with ``build_prompt_chatbot`` and dumps the
    records to ``llava_{split}_{prompt_format}.json`` in ``base_dir``.

    Args:
        base_dir: Directory holding the two input JSON files; also receives
            the output file.
        split: Key into ``pid_splits.json`` selecting the problem ids.
        prompt_format: Format string forwarded to ``build_prompt_chatbot``.
    """
    # Context managers close the input files; the original leaked the handles.
    with open(os.path.join(base_dir, "pid_splits.json")) as splits_fp:
        split_indices = json.load(splits_fp)[split]
    with open(os.path.join(base_dir, "problems.json")) as problems_fp:
        problems = json.load(problems_fp)

    split_problems = build_prompt_chatbot(
        problems, split_indices, prompt_format,
        use_caption=False, is_test=False)

    question_label = 'Question: '
    answer_label = 'Answer: '

    target_format = []
    for prob_id, (prompt, response) in split_problems.items():
        # Strip only the leading label. The original used str.replace, which
        # also removed any later occurrence of the label inside the text.
        if prompt.startswith(question_label):
            prompt = prompt[len(question_label):]
        if response.startswith(answer_label):
            response = response[len(answer_label):]

        raw_prob_data = problems[prob_id]
        if raw_prob_data['image'] is None:
            # Text-only problem: no 'image' key and no <image> placeholder.
            target_format.append({
                "id": prob_id,
                "conversations": [
                    {'from': 'human', 'value': f"{prompt}"},
                    {'from': 'gpt', 'value': f"{response}"},
                ],
            })
        else:
            # The image path is stored relative to the problem-id directory.
            target_format.append({
                "id": prob_id,
                "image": os.path.join(prob_id, raw_prob_data['image']),
                "conversations": [
                    {'from': 'human', 'value': f"{prompt}\n<image>"},
                    {'from': 'gpt', 'value': f"{response}"},
                ],
            })

    print(f'Number of samples: {len(target_format)}')

    with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f:
        json.dump(target_format, f, indent=2)
|
47 |
+
|
48 |
+
|
49 |
+
def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"):
    """Write one split of the problems as instruction/output JSON lines.

    Reads ``pid_splits.json`` and ``problems.json`` from ``base_dir``, builds
    the (input, output) pairs with ``build_prompt_chatbot`` and writes one
    JSON object per problem to ``scienceqa_{split}_{prompt_format}.jsonl``
    in ``base_dir``.

    Args:
        base_dir: Directory holding the two input JSON files; also receives
            the output file.
        split: Key into ``pid_splits.json`` selecting the problem ids.
        prompt_format: Format string forwarded to ``build_prompt_chatbot``.
    """
    # Context managers close the input files; the original leaked the handles.
    with open(os.path.join(base_dir, "pid_splits.json")) as splits_fp:
        split_indices = json.load(splits_fp)[split]
    with open(os.path.join(base_dir, "problems.json")) as problems_fp:
        problems = json.load(problems_fp)

    split_problems = build_prompt_chatbot(
        problems, split_indices, prompt_format,
        use_caption=False, is_test=False)

    question_label = 'Question: '
    answer_label = 'Answer: '

    # 'with' guarantees the writer is closed even if a record fails part-way.
    with open(os.path.join(base_dir, f"scienceqa_{split}_{prompt_format}.jsonl"), "w") as writer:
        for prob_id, (prompt, response) in split_problems.items():
            # Strip only the leading label. The original used str.replace,
            # which also removed any later occurrence inside the text.
            if prompt.startswith(question_label):
                prompt = prompt[len(question_label):]
            if response.startswith(answer_label):
                response = response[len(answer_label):]

            raw_prob_data = problems[prob_id]
            if raw_prob_data['image'] is None:
                # Text-only problem: no 'image' key and no <image> placeholder.
                data = {
                    "id": prob_id,
                    "instruction": f"{prompt}",
                    "output": f"{response}",
                }
            else:
                # The image path is stored relative to the problem-id directory.
                data = {
                    "id": prob_id,
                    "image": os.path.join(prob_id, raw_prob_data['image']),
                    "instruction": f"{prompt}\n<image>",
                    "output": f"{response}",
                }
            writer.write(json.dumps(data) + '\n')
|
81 |
+
|
82 |
+
|
83 |
+
def main(task, **kwargs):
    """Look up the module-level callable named ``task`` and call it with ``kwargs``.

    Raises:
        KeyError: if no module-level name ``task`` exists.
    """
    handler = globals()[task]
    handler(**kwargs)
|
85 |
+
|
86 |
+
|
87 |
+
# Script entry point: hand main() to fire so it is driven from the command line.
if __name__ == "__main__":
|
88 |
+
fire.Fire(main)
|
scripts/convert_sqa_to_llava_base_prompt.py
ADDED
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def get_question_text(problem):
    """Return the 'question' entry of a problem record."""
    return problem['question']
|
4 |
+
|
5 |
+
|
6 |
+
def get_context_text(problem, use_caption):
    """Return the context string for a problem.

    The 'hint' entry is joined with the 'caption' entry (only read when
    ``use_caption`` is true) by a single space, and the result is stripped.
    An empty result is reported as "N/A".
    """
    pieces = [problem['hint'], problem['caption'] if use_caption else ""]
    merged = " ".join(pieces).strip()
    return merged if merged != "" else "N/A"
|
13 |
+
|
14 |
+
|
15 |
+
def get_choice_text(probelm, options):
    """Render the problem's choices as "(<option>) <choice>" items joined by spaces.

    ``options[i]`` supplies the label for the i-th entry of
    ``probelm['choices']``.
    """
    labelled = [
        "({}) {}".format(options[idx], choice)
        for idx, choice in enumerate(probelm['choices'])
    ]
    return " ".join(labelled)
|
23 |
+
|
24 |
+
|
25 |
+
def get_answer(problem, options):
    """Return the option label selected by the problem's 'answer' index."""
    answer_index = problem['answer']
    return options[answer_index]
|
27 |
+
|
28 |
+
|
29 |
+
def get_lecture_text(problem):
    """Return the problem's 'lecture' with newlines escaped as a literal backslash-n.

    Per the original note, escaping the newlines lets GPT-3 generate the
    lecture with more tokens.
    """
    return problem['lecture'].replace("\n", "\\n")
|
33 |
+
|
34 |
+
|
35 |
+
def get_solution_text(problem):
    """Return the problem's 'solution' with newlines escaped as a literal backslash-n.

    Per the original note, escaping the newlines lets GPT-3 generate the
    solution with more tokens.
    """
    return problem['solution'].replace("\n", "\\n")
|
39 |
+
|
40 |
+
|
41 |
+
def create_one_example_chatbot(format, question, context, choice, answer, lecture, solution, test_example=True):
    """Build the (input, output) text pair for one chatbot-style example.

    Args:
        format: "<input_format>-<output_format>", e.g. "QCM-LEA".
        question, context, choice, answer, lecture, solution: Text pieces
            interpolated into the selected templates.
        test_example: When true the output is the bare prompt "Answer:" and
            the output format is not consulted.

    Returns:
        Tuple ``(input, output)``; both strings are stripped and a trailing
        "BECAUSE:" label is removed.

    Raises:
        ValueError: If the input format is unknown or, when ``test_example``
            is false, the output format is unknown. (The original fell
            through and failed with UnboundLocalError.)
    """
    input_format, output_format = format.split("-")

    ## Inputs
    input_templates = {
        "CQM": f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n",
        "QCM": f"Question: {question}\nContext: {context}\nOptions: {choice}\n",
        # upper bound experiment
        "QCML": f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n",
        "QCME": f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n",
        "QCMLE": f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n",
        "QCLM": f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n",
        "QCEM": f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n",
        "QCLEM": f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n",
    }
    if input_format not in input_templates:
        raise ValueError(f"Unknown input format: {input_format!r}")
    prompt = input_templates[input_format]

    # Outputs
    if test_example:
        response = "Answer:"
    elif output_format == 'LEPA':
        # Lecture and solution sections are emitted only when non-blank.
        response = ''
        if len(lecture.strip()) > 0:
            response += f"LECTURE: {lecture}\n"
        if len(solution.strip()) > 0:
            response += f"SOLUTION: {solution}\n"
        response += '###\n'
        response += f"ANSWER: {answer}."
    else:
        output_templates = {
            'A': f"Answer: The answer is {answer}.",
            # NOTE(review): 'AL' uses the solution and 'AE' the lecture, the
            # reverse of every other format's letters. Kept as in the
            # original -- confirm before changing.
            'AL': f"Answer: The answer is {answer}. BECAUSE: {solution}",
            'AE': f"Answer: The answer is {answer}. BECAUSE: {lecture}",
            'ALE': f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}",
            'AEL': f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}",
            'LA': f"Answer: {lecture} The answer is {answer}.",
            'EA': f"Answer: {solution} The answer is {answer}.",
            'LEA': f"Answer: {lecture} {solution} The answer is {answer}.",
            'ELA': f"Answer: {solution} {lecture} The answer is {answer}.",
        }
        if output_format not in output_templates:
            raise ValueError(f"Unknown output format: {output_format!r}")
        response = output_templates[output_format]

    # Collapse the double spaces left behind by empty template fields. As
    # written the original replaced a single space with a single space,
    # which did nothing.
    prompt = prompt.replace("  ", " ").strip()
    response = response.replace("  ", " ").strip()
    # A dangling "BECAUSE:" means the explanation field was empty; drop it.
    if prompt.endswith("BECAUSE:"):
        prompt = prompt.replace("BECAUSE:", "").strip()
    if response.endswith("BECAUSE:"):
        response = response.replace("BECAUSE:", "").strip()
    return prompt, response
|
104 |
+
|
105 |
+
|
106 |
+
def create_one_example(format, question, context, choice, answer, lecture, solution, test_example=True):
    """Build the single prompt string (input followed by output) for one example.

    Args:
        format: "<input_format>-<output_format>", e.g. "QCM-A".
        question, context, choice, answer, lecture, solution: Text pieces
            interpolated into the selected templates.
        test_example: When true the output part is the bare prompt "Answer:"
            and the output format is not consulted.

    Returns:
        The concatenated text, stripped, with a trailing "BECAUSE:" label
        removed.

    Raises:
        ValueError: If the input format is unknown or, when ``test_example``
            is false, the output format is unknown. (The original fell
            through and failed with UnboundLocalError.)
    """
    input_format, output_format = format.split("-")

    ## Inputs
    input_templates = {
        "CQM": f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n",
        "QCM": f"Question: {question}\nContext: {context}\nOptions: {choice}\n",
        # upper bound experiment
        "QCML": f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n",
        "QCME": f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n",
        "QCMLE": f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n",
        "QCLM": f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n",
        "QCEM": f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n",
        "QCLEM": f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n",
    }
    if input_format not in input_templates:
        raise ValueError(f"Unknown input format: {input_format!r}")
    prompt = input_templates[input_format]

    # Outputs
    if test_example:
        response = "Answer:"
    else:
        output_templates = {
            'A': f"Answer: The answer is {answer}.",
            # NOTE(review): 'AL' uses the solution and 'AE' the lecture, the
            # reverse of every other format's letters. Kept as in the
            # original -- confirm before changing.
            'AL': f"Answer: The answer is {answer}. BECAUSE: {solution}",
            'AE': f"Answer: The answer is {answer}. BECAUSE: {lecture}",
            'ALE': f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}",
            'AEL': f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}",
            'LA': f"Answer: {lecture} The answer is {answer}.",
            'EA': f"Answer: {solution} The answer is {answer}.",
            'LEA': f"Answer: {lecture} {solution} The answer is {answer}.",
            'ELA': f"Answer: {solution} {lecture} The answer is {answer}.",
        }
        if output_format not in output_templates:
            raise ValueError(f"Unknown output format: {output_format!r}")
        response = output_templates[output_format]

    text = prompt + response
    # Collapse the double spaces left behind by empty template fields. As
    # written the original replaced a single space with a single space,
    # which did nothing.
    text = text.replace("  ", " ").strip()
    # A dangling "BECAUSE:" means the explanation field was empty; drop it.
    if text.endswith("BECAUSE:"):
        text = text.replace("BECAUSE:", "").strip()
    return text
|
159 |
+
|
160 |
+
|
161 |
+
|
162 |
+
def create_one_example_gpt4(format, question, context, choice, answer, lecture, solution, test_example=True):
    """Build one chat-style (user, assistant) message pair for a problem.

    Args:
        format: ``"<input_format>-<output_format>"``, e.g. ``"QCM-A"``.
            The input part selects how question/context/options (and
            optionally lecture/solution) are laid out; the output part
            selects how the answer text is laid out.
        question, context, choice, answer, lecture, solution: text fields
            interpolated into the selected templates.
        test_example: when true the assistant message is just ``"Answer:"``
            and the output format is not consulted.

    Returns:
        A ``(user_prompt, assistant_prompt)`` tuple of
        ``{"role": ..., "content": ...}`` dicts.

    Raises:
        ValueError: if the input format is unknown, or if the output format
            is unknown while ``test_example`` is false.
    """
    input_format, output_format = format.split("-")

    input_templates = {
        "CQM": "Context: {context}\nQuestion: {question}\nOptions: {choice}\n",
        "QCM": "Question: {question}\nContext: {context}\nOptions: {choice}\n",
        # upper bound experiment
        "QCML": "Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n",
        "QCME": "Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n",
        "QCMLE": "Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n",
        "QCLM": "Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n",
        "QCEM": "Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n",
        "QCLEM": "Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n",
    }
    # NOTE: 'AL' emits the solution and 'AE' emits the lecture; this mapping
    # is kept exactly as it was so existing prompts do not change.
    output_templates = {
        "A": "Answer: The answer is {answer}.",
        "AL": "Answer: The answer is {answer}. BECAUSE: {solution}",
        "AE": "Answer: The answer is {answer}. BECAUSE: {lecture}",
        "ALE": "Answer: The answer is {answer}. BECAUSE: {lecture} {solution}",
        "AEL": "Answer: The answer is {answer}. BECAUSE: {solution} {lecture}",
        "LA": "Answer: {lecture} The answer is {answer}.",
        "EA": "Answer: {solution} The answer is {answer}.",
        "LEA": "Answer: {lecture} {solution} The answer is {answer}.",
        "ELA": "Answer: {solution} {lecture} The answer is {answer}.",
    }
    fields = {
        "question": question,
        "context": context,
        "choice": choice,
        "answer": answer,
        "lecture": lecture,
        "solution": solution,
    }

    ## Inputs
    if input_format not in input_templates:
        raise ValueError(f"unknown input format: {input_format!r}")
    user_text = input_templates[input_format].format(**fields)

    # Outputs
    if test_example:
        assistant_text = "Answer:"
    elif output_format in output_templates:
        assistant_text = output_templates[output_format].format(**fields)
    else:
        raise ValueError(f"unknown output format: {output_format!r}")

    # Collapse the double spaces left behind when a field is empty.
    user_text = user_text.replace("  ", " ").strip()
    assistant_text = assistant_text.replace("  ", " ").strip()

    # Drop only a dangling trailing marker (lecture/solution were empty);
    # occurrences inside the text itself are left untouched.
    marker = "BECAUSE:"
    if assistant_text.endswith(marker):
        assistant_text = assistant_text[:-len(marker)].strip()

    user_prompt = {"role": "user", "content": f"Can you explain {user_text}?"}
    assistant_prompt = {"role": "assistant", "content": f"{assistant_text}"}

    return user_prompt, assistant_prompt
|
219 |
+
|
220 |
+
|
221 |
+
def build_prompt_chatbot(problems, shot_qids, prompt_format, use_caption=False, options=None, is_test=False):
    """Build one chatbot-style example per question id.

    Args:
        problems: mapping from question id to a problem record.
        shot_qids: iterable of question ids to render.
        prompt_format: format string forwarded to
            ``create_one_example_chatbot``.
        use_caption: forwarded to ``get_context_text``.
        options: option labels forwarded to ``get_choice_text`` and
            ``get_answer``; defaults to ``["A", "B", "C", "D", "E"]``.
        is_test: forwarded as ``test_example``.

    Returns:
        A dict mapping each question id to its rendered example.
    """
    # A fresh list per call avoids sharing one mutable default across calls.
    if options is None:
        options = ["A", "B", "C", "D", "E"]

    examples = {}

    for qid in shot_qids:
        problem = problems[qid]
        question = get_question_text(problem)
        context = get_context_text(problem, use_caption)
        choice = get_choice_text(problem, options)
        answer = get_answer(problem, options)
        # Turn the literal two-character sequence backslash-n into a newline.
        lecture = get_lecture_text(problem).replace('\\n', '\n')
        solution = get_solution_text(problem).replace('\\n', '\n')

        examples[qid] = create_one_example_chatbot(prompt_format,
                                                   question,
                                                   context,
                                                   choice,
                                                   answer,
                                                   lecture,
                                                   solution,
                                                   test_example=is_test)
    return examples
|
242 |
+
|
243 |
+
|
244 |
+
def build_prompt(problems, shot_qids, test_qid, args):
    """Assemble an n-shot text prompt that ends with the test question.

    Each id in ``shot_qids`` is rendered as a solved example
    (``test_example=False``); ``test_qid`` is rendered last with
    ``test_example=True``. The rendered pieces are joined by blank lines.
    Reads ``args.use_caption``, ``args.options`` and ``args.prompt_format``.
    """

    def render(qid, as_test):
        # Gather the text fields of one problem and format them.
        record = problems[qid]
        question = get_question_text(record)
        context = get_context_text(record, args.use_caption)
        choice = get_choice_text(record, args.options)
        answer = get_answer(record, args.options)
        lecture = get_lecture_text(record)
        solution = get_solution_text(record)
        return create_one_example(args.prompt_format,
                                  question,
                                  context,
                                  choice,
                                  answer,
                                  lecture,
                                  solution,
                                  test_example=as_test)

    # n-shot training examples, then the test example
    pieces = [render(qid, False) for qid in shot_qids]
    pieces.append(render(test_qid, True))

    # create the prompt input
    return '\n\n'.join(pieces)
|
289 |
+
|
290 |
+
|
291 |
+
def build_prompt_gpt4(problems, shot_qids, test_qid, args):
    """Assemble a chat message list: system prompt, n shots, test question.

    Every id in ``shot_qids`` contributes a user/assistant message pair built
    with ``test_example=False``; ``test_qid`` contributes a final pair built
    with ``test_example=True``. Reads ``args.use_caption``, ``args.options``
    and ``args.prompt_format``.
    """

    def message_pair(qid, as_test):
        # Gather the text fields of one problem and turn them into messages.
        record = problems[qid]
        question = get_question_text(record)
        context = get_context_text(record, args.use_caption)
        choice = get_choice_text(record, args.options)
        answer = get_answer(record, args.options)
        lecture = get_lecture_text(record)
        solution = get_solution_text(record)
        return create_one_example_gpt4(args.prompt_format,
                                       question,
                                       context,
                                       choice,
                                       answer,
                                       lecture,
                                       solution,
                                       test_example=as_test)

    messages = [{"role": "system", "content": "You are a helpful assistant."}]

    # n-shot training examples
    for qid in shot_qids:
        messages.extend(message_pair(qid, False))

    # test example
    messages.extend(message_pair(test_qid, True))

    return messages
|
scripts/convert_vizwiz_for_submission.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import argparse
|
3 |
+
import json
|
4 |
+
|
5 |
+
from llava.eval.m4c_evaluator import EvalAIAnswerProcessor
|
6 |
+
|
7 |
+
|
8 |
+
def parse_args():
    """Parse the command line: three required string path options."""
    cli = argparse.ArgumentParser()
    for flag in ('--annotation-file', '--result-file', '--result-upload-file'):
        cli.add_argument(flag, type=str, required=True)
    return cli.parse_args()
|
14 |
+
|
15 |
+
|
16 |
+
if __name__ == '__main__':
    # Convert a JSONL file of model predictions into the JSON list of
    # {'image', 'answer'} records written to --result-upload-file.

    args = parse_args()

    # dirname is '' for a bare file name, and os.makedirs('') would raise.
    upload_dir = os.path.dirname(args.result_upload_file)
    if upload_dir:
        os.makedirs(upload_dir, exist_ok=True)

    # Read predictions; malformed JSON lines are counted and skipped.
    results = []
    error_line = 0
    with open(args.result_file) as result_fh:
        for line in result_fh:
            try:
                results.append(json.loads(line))
            except json.JSONDecodeError:
                error_line += 1
    results = {x['question_id']: x['text'] for x in results}

    with open(args.annotation_file) as annotation_fh:
        test_split = [json.loads(line) for line in annotation_fh]

    print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}')

    all_answers = []

    answer_processor = EvalAIAnswerProcessor()

    for x in test_split:
        # Every annotated question must have a prediction. An explicit raise
        # is used because assert statements are stripped under `python -O`.
        if x['question_id'] not in results:
            raise KeyError(f"no prediction for question_id {x['question_id']!r}")
        all_answers.append({
            'image': x['image'],
            'answer': answer_processor(results[x['question_id']])
        })

    with open(args.result_upload_file, 'w') as f:
        json.dump(all_answers, f)
|
scripts/convert_vqav2_for_submission.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import argparse
|
3 |
+
import json
|
4 |
+
|
5 |
+
from llava.eval.m4c_evaluator import EvalAIAnswerProcessor
|
6 |
+
|
7 |
+
|
8 |
+
def parse_args():
    """Parse the command line: optional --dir plus required --ckpt/--split."""
    cli = argparse.ArgumentParser()
    cli.add_argument('--dir', type=str, default="./playground/data/eval/vqav2")
    for flag in ('--ckpt', '--split'):
        cli.add_argument(flag, type=str, required=True)
    return cli.parse_args()
|
14 |
+
|
15 |
+
|
16 |
+
if __name__ == '__main__':
    # Convert merged JSONL predictions into the JSON list of
    # {'question_id', 'answer'} records written under answers_upload/.

    args = parse_args()

    src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl')
    test_split_path = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl')
    dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json')
    os.makedirs(os.path.dirname(dst), exist_ok=True)

    # Read predictions; malformed JSON lines are counted and skipped.
    results = []
    error_line = 0
    with open(src) as src_fh:
        for line in src_fh:
            try:
                results.append(json.loads(line))
            except json.JSONDecodeError:
                error_line += 1

    results = {x['question_id']: x['text'] for x in results}

    with open(test_split_path) as split_fh:
        test_split = [json.loads(line) for line in split_fh]

    print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}')

    all_answers = []

    answer_processor = EvalAIAnswerProcessor()

    for x in test_split:
        # Questions without a prediction get an empty answer so the output
        # still has one entry per question in the split.
        if x['question_id'] not in results:
            answer = ''
        else:
            answer = answer_processor(results[x['question_id']])
        all_answers.append({
            'question_id': x['question_id'],
            'answer': answer
        })

    # Write through the managed handle; the file is opened only once.
    with open(dst, 'w') as f:
        json.dump(all_answers, f)
|
scripts/extract_mm_projector.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This is just a utility that I use to extract the projector for quantized models.
|
3 |
+
It is NOT necessary at all to train, or run inference/serve demos.
|
4 |
+
Use this script ONLY if you fully understand its implications.
|
5 |
+
"""
|
6 |
+
|
7 |
+
|
8 |
+
import os
|
9 |
+
import argparse
|
10 |
+
import torch
|
11 |
+
import json
|
12 |
+
from collections import defaultdict
|
13 |
+
|
14 |
+
|
15 |
+
def parse_args():
    """Parse the optional --model-path and --output string options."""
    cli = argparse.ArgumentParser(description='Extract MMProjector weights')
    for flag, help_text in (('--model-path', 'model folder'), ('--output', 'output file')):
        cli.add_argument(flag, type=str, help=help_text)
    return cli.parse_args()
|
21 |
+
|
22 |
+
|
23 |
+
if __name__ == '__main__':
    # Collect every weight whose name contains 'mm_projector' from the
    # checkpoint file(s) under --model-path and save them to --output.
    args = parse_args()

    keys_to_match = ['mm_projector']
    ckpt_to_key = defaultdict(list)
    # Checkpoints already read while discovering keys, reused below so the
    # same file is not loaded twice.
    preloaded = {}
    try:
        index_path = os.path.join(args.model_path, 'pytorch_model.bin.index.json')
        with open(index_path) as index_fh:
            model_indices = json.load(index_fh)
    except FileNotFoundError:
        # Smaller models or model checkpoints saved by DeepSpeed.
        v = 'pytorch_model.bin'
        preloaded[v] = torch.load(os.path.join(args.model_path, v), map_location='cpu')
        for k in preloaded[v]:
            if any(key_match in k for key_match in keys_to_match):
                ckpt_to_key[v].append(k)
    else:
        # The index maps each weight name to the checkpoint file holding it.
        for k, v in model_indices['weight_map'].items():
            if any(key_match in k for key_match in keys_to_match):
                ckpt_to_key[v].append(k)

    loaded_weights = {}

    for ckpt_name, weight_keys in ckpt_to_key.items():
        ckpt = preloaded.pop(ckpt_name, None)
        if ckpt is None:
            ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu')
        for k in weight_keys:
            loaded_weights[k] = ckpt[k]

    torch.save(loaded_weights, args.output)
|
scripts/finetune.sh
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash

# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!

# Uncomment and set the following variables correspondingly to run this script
# (or export them in the environment before invoking it):

################## VICUNA ##################
# PROMPT_VERSION=v1
# MODEL_VERSION="vicuna-v1-3-7b"
################## VICUNA ##################

################## LLaMA-2 ##################
# PROMPT_VERSION="llava_llama_2"
# MODEL_VERSION="llama-2-7b-chat"
################## LLaMA-2 ##################

# Fail fast instead of launching with empty checkpoint paths and a
# --version flag that has no value.
: "${PROMPT_VERSION:?set PROMPT_VERSION (see the blocks above) before running}"
: "${MODEL_VERSION:?set MODEL_VERSION (see the blocks above) before running}"

deepspeed llava/train/train_mem.py \
    --deepspeed ./scripts/zero2.json \
    --model_name_or_path "./checkpoints/$MODEL_VERSION" \
    --version "$PROMPT_VERSION" \
    --data_path ./playground/data/llava_instruct_80k.json \
    --image_folder /path/to/coco/train2017 \
    --vision_tower openai/clip-vit-large-patch14 \
    --pretrain_mm_mlp_adapter "./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin" \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --bf16 True \
    --output_dir "./checkpoints/llava-$MODEL_VERSION-finetune" \
    --num_train_epochs 1 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 50000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb
|
scripts/finetune/test_llava.sh
CHANGED
@@ -1,4 +1,9 @@
|
|
1 |
-
accelerate launch --config_file ./
|
2 |
run_finetune_llava.py \
|
3 |
--test \
|
4 |
-
--
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Run run_finetune_llava.py in --test mode on the PWISeg caption test set,
# using the 4-GPU accelerate configuration.
accelerate launch --config_file ./accelerator_config/gpu_4_config.yml \
    run_finetune_llava.py \
    --test \
    --data_path /mnt1/wjl/InternLM-XComposer/output/GT/pwiseg_caption_instruct_0712_test.json \
    --output_dir ./eval_output/results_pwiseg_ori

# Optional arguments and alternative datasets. To use one, append it to the
# command above and add a trailing "\" to the line before it.
# --lora_ckpt_path /mnt1/lyc/llava_finetune/model_ckpt/llama3-llava-next-8b-task-lora
# --ckpt_path ./model_ckpt/llava3_mix_instr/checkpoints/checkpoint_00003
# --data_path /mnt1/lyc/llava_finetune/data_json/4dor_caption_instruct_0711_test.json
# --data_path /mnt1/wjl/InternLM-XComposer/output/GT/pwiseg_count_instruct_0712_test.json
|
scripts/finetune_full_schedule.sh
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash

# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!

# Uncomment and set the following variables correspondingly to run this script
# (or export them in the environment before invoking it):

################## VICUNA ##################
# PROMPT_VERSION=v1
# MODEL_VERSION="vicuna-v1-3-7b"
################## VICUNA ##################

################## LLaMA-2 ##################
# PROMPT_VERSION="llava_llama_2"
# MODEL_VERSION="llama-2-7b-chat"
################## LLaMA-2 ##################

# Fail fast instead of launching with empty checkpoint paths and a
# --version flag that has no value.
: "${PROMPT_VERSION:?set PROMPT_VERSION (see the blocks above) before running}"
: "${MODEL_VERSION:?set MODEL_VERSION (see the blocks above) before running}"

deepspeed llava/train/train_mem.py \
    --deepspeed ./scripts/zero2.json \
    --model_name_or_path "./checkpoints/$MODEL_VERSION" \
    --version "$PROMPT_VERSION" \
    --data_path ./playground/data/llava_instruct_158k.json \
    --image_folder /path/to/coco/train2017 \
    --vision_tower openai/clip-vit-large-patch14 \
    --pretrain_mm_mlp_adapter "./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin" \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --bf16 True \
    --output_dir "./checkpoints/llava-$MODEL_VERSION-finetune" \
    --num_train_epochs 3 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 50000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb
|
scripts/finetune_lora.sh
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash

# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!

# Uncomment and set the following variables correspondingly to run this script
# (or export them in the environment before invoking it):

################## VICUNA ##################
# PROMPT_VERSION=v1
# MODEL_VERSION="vicuna-v1-3-7b"
################## VICUNA ##################

################## LLaMA-2 ##################
# PROMPT_VERSION="llava_llama_2"
# MODEL_VERSION="llama-2-7b-chat"
################## LLaMA-2 ##################

# Fail fast instead of launching with empty checkpoint paths and a
# --version flag that has no value.
: "${PROMPT_VERSION:?set PROMPT_VERSION (see the blocks above) before running}"
: "${MODEL_VERSION:?set MODEL_VERSION (see the blocks above) before running}"

deepspeed llava/train/train_mem.py \
    --deepspeed ./scripts/zero2.json \
    --lora_enable True \
    --model_name_or_path "./checkpoints/$MODEL_VERSION" \
    --version "$PROMPT_VERSION" \
    --data_path ./playground/data/llava_instruct_80k.json \
    --image_folder /path/to/coco/train2017 \
    --vision_tower openai/clip-vit-large-patch14 \
    --pretrain_mm_mlp_adapter "./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin" \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --bf16 True \
    --output_dir "./checkpoints/llava-$MODEL_VERSION-finetune_lora" \
    --num_train_epochs 1 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 50000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --lazy_preprocess True \
    --dataloader_num_workers 4 \
    --report_to wandb
|
scripts/finetune_lora_my.sh
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash

# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!

# LoRA fine-tuning on the 4D-OR instruction data with the LLaMA-2 settings.
# To switch to Vicuna, replace the two assignments below with:
#   PROMPT_VERSION=v1
#   MODEL_VERSION="vicuna-v1-3-7b"

PROMPT_VERSION="llava_llama_2"
MODEL_VERSION="llama-2-7b-chat"

# Arguments are collected in an array and passed through unchanged.
train_args=(
    --deepspeed ./scripts/zero2.json
    --lora_enable True
    --model_name_or_path "./checkpoints/$MODEL_VERSION"
    --version "$PROMPT_VERSION"
    --data_path /mnt1/wjl/InternLM-XComposer/data/4D-OR-instruct/llava_3d_0503_train.json
    --image_folder /mnt1/wjl/InternLM-XComposer/data/4D-OR-MV
    --vision_tower openai/clip-vit-large-patch14
    --pretrain_mm_mlp_adapter "./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin"
    --mm_vision_select_layer -2
    --mm_use_im_start_end False
    --mm_use_im_patch_token False
    --bf16 True
    --output_dir "./checkpoints/llava-$MODEL_VERSION-finetune_lora"
    --num_train_epochs 1
    --per_device_train_batch_size 16
    --per_device_eval_batch_size 4
    --gradient_accumulation_steps 1
    --evaluation_strategy "no"
    --save_strategy "steps"
    --save_steps 50000
    --save_total_limit 1
    --learning_rate 2e-5
    --weight_decay 0.
    --warmup_ratio 0.03
    --lr_scheduler_type "cosine"
    --logging_steps 1
    --tf32 True
    --model_max_length 2048
    --gradient_checkpointing True
    --lazy_preprocess True
    --dataloader_num_workers 4
    --report_to wandb
)

deepspeed llava/train/train_mem.py "${train_args[@]}"
|
scripts/finetune_qlora.sh
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash

# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!

# Uncomment and set the following variables correspondingly to run this script
# (or export them in the environment before invoking it):

################## VICUNA ##################
# PROMPT_VERSION=v1
# MODEL_VERSION="vicuna-v1-3-7b"
################## VICUNA ##################

################## LLaMA-2 ##################
# PROMPT_VERSION="llava_llama_2"
# MODEL_VERSION="llama-2-7b-chat"
################## LLaMA-2 ##################

# Fail fast instead of launching with empty checkpoint paths and a
# --version flag that has no value.
: "${PROMPT_VERSION:?set PROMPT_VERSION (see the blocks above) before running}"
: "${MODEL_VERSION:?set MODEL_VERSION (see the blocks above) before running}"

deepspeed llava/train/train_mem.py \
    --deepspeed ./scripts/zero2.json \
    --lora_enable True \
    --bits 4 \
    --model_name_or_path "./checkpoints/$MODEL_VERSION" \
    --version "$PROMPT_VERSION" \
    --data_path ./playground/data/llava_instruct_80k.json \
    --image_folder /path/to/coco/train2017 \
    --vision_tower openai/clip-vit-large-patch14 \
    --pretrain_mm_mlp_adapter "./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin" \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --bf16 True \
    --output_dir "./checkpoints/llava-$MODEL_VERSION-finetune_lora" \
    --num_train_epochs 1 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 50000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --lazy_preprocess True \
    --dataloader_num_workers 4 \
    --report_to wandb
|
scripts/finetune_sqa.sh
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash

# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!

# PROMPT_VERSION is not defined in this script and must come from the
# environment. Fail fast rather than passing --version with no value.
: "${PROMPT_VERSION:?export PROMPT_VERSION before running this script}"

deepspeed llava/train/train_mem.py \
    --deepspeed ./scripts/zero2.json \
    --model_name_or_path lmsys/vicuna-13b-v1.3 \
    --version "$PROMPT_VERSION" \
    --data_path /Data/ScienceQA/data/scienceqa/llava_train_QCM-LEA.json \
    --image_folder /Data/ScienceQA/data/scienceqa/images/train \
    --vision_tower openai/clip-vit-large-patch14 \
    --pretrain_mm_mlp_adapter ./checkpoints/huggingface/liuhaotian/llava-pretrain-vicuna-13b-v1.3/mm_projector.bin \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --bf16 True \
    --output_dir ./checkpoints/llava-vicuna-13b-v1.3-pretrain_lcs558k_plain-ScienceQA_QCM_LEA-12e \
    --num_train_epochs 12 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 50000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb
|
scripts/merge_lora_weights.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
from llava.model.builder import load_pretrained_model
|
3 |
+
from llava.mm_utils import get_model_name_from_path
|
4 |
+
|
5 |
+
|
6 |
+
def merge_lora(args):
    """Load a model and tokenizer on CPU and save both to one directory.

    Reads ``args.model_path``, ``args.model_base`` and
    ``args.save_model_path``.
    NOTE(review): presumably ``load_pretrained_model`` applies the LoRA
    weights in ``model_path`` on top of ``model_base`` - confirm against
    ``llava.model.builder``.
    """
    name = get_model_name_from_path(args.model_path)
    # The image processor and context length are returned but not needed here.
    tokenizer, model, _image_processor, _context_len = load_pretrained_model(
        args.model_path,
        args.model_base,
        name,
        device_map='cpu',
    )

    for artifact in (model, tokenizer):
        artifact.save_pretrained(args.save_model_path)
|
12 |
+
|
13 |
+
|
14 |
+
if __name__ == "__main__":
    # All three paths are required string options.
    cli = argparse.ArgumentParser()
    for flag in ("--model-path", "--model-base", "--save-model-path"):
        cli.add_argument(flag, type=str, required=True)

    merge_lora(cli.parse_args())
|
scripts/pretrain.sh
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash

# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!

# Uncomment and set the following variables correspondingly to run this script
# (or export MODEL_VERSION in the environment before invoking it):

# MODEL_VERSION=vicuna-v1-3-7b
# MODEL_VERSION=llama-2-7b-chat

########### DO NOT CHANGE ###########
########### USE THIS FOR BOTH ###########
PROMPT_VERSION=plain
########### DO NOT CHANGE ###########

# Fail fast instead of launching with empty checkpoint paths.
: "${MODEL_VERSION:?set MODEL_VERSION (see the options above) before running}"

deepspeed llava/train/train_mem.py \
    --deepspeed ./scripts/zero2.json \
    --model_name_or_path "./checkpoints/$MODEL_VERSION" \
    --version "$PROMPT_VERSION" \
    --data_path /path/to/pretrain_data.json \
    --image_folder /path/to/images \
    --vision_tower openai/clip-vit-large-patch14 \
    --tune_mm_mlp_adapter True \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --bf16 True \
    --output_dir "./checkpoints/llava-$MODEL_VERSION-pretrain" \
    --num_train_epochs 1 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 24000 \
    --save_total_limit 1 \
    --learning_rate 2e-3 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb
|
scripts/pretrain_xformers.sh
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash

# Uncomment and set the following variables correspondingly to run this script
# (or export MODEL_VERSION in the environment before invoking it):

# MODEL_VERSION=vicuna-v1-3-7b
# MODEL_VERSION=llama-2-7b-chat

########### DO NOT CHANGE ###########
########### USE THIS FOR BOTH ###########
PROMPT_VERSION=plain
########### DO NOT CHANGE ###########

# Fail fast instead of launching with empty checkpoint paths.
: "${MODEL_VERSION:?set MODEL_VERSION (see the options above) before running}"

deepspeed llava/train/train_xformers.py \
    --deepspeed ./scripts/zero2.json \
    --model_name_or_path "./checkpoints/$MODEL_VERSION" \
    --version "$PROMPT_VERSION" \
    --data_path /path/to/pretrain_data.json \
    --image_folder /path/to/images \
    --vision_tower openai/clip-vit-large-patch14 \
    --tune_mm_mlp_adapter True \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --bf16 False \
    --output_dir "./checkpoints/llava-$MODEL_VERSION-pretrain" \
    --num_train_epochs 1 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 24000 \
    --save_total_limit 1 \
    --learning_rate 2e-3 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 False \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb
|
scripts/sqa_eval_batch.sh
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash

# Launch one ScienceQA evaluation job per chunk, each with its own
# CUDA_VISIBLE_DEVICES index. Jobs are started in the background (&) and
# this script does not wait for them to finish.

CHUNKS=8
# Loop bound follows CHUNKS so the two cannot drift apart.
for ((IDX = 0; IDX < CHUNKS; IDX++)); do
    # Braces are required: "$CHUNKS_$IDX" would expand the unset variable
    # CHUNKS_ and drop the chunk count from the file name.
    CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \
        --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \
        --question-file ~/haotian/datasets/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \
        --image-folder ~/haotian/datasets/ScienceQA/data/scienceqa/images/test \
        --answers-file "./test_llava-13b-chunk${CHUNKS}_${IDX}.jsonl" \
        --num-chunks "$CHUNKS" \
        --chunk-idx "$IDX" \
        --conv-mode llava_v1 &
done
|