lyclyc52 committed on
Commit
157f5b2
·
1 Parent(s): 085da01

Update: integrate llama3 into finetuning code

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. accelerator_config/gpu_1_config.yml +15 -0
  2. accelerator_config/gpu_4_config.yml +16 -0
  3. dataset/SurgDataset.py +2 -0
  4. eval_output/4dor_count_eval.json +0 -0
  5. eval_output/4dor_count_eval_llama3_llava_finetune.json +0 -0
  6. eval_output/4dor_count_eval_llama3_llava_ori.json +0 -0
  7. eval_output/4dor_phase_eval.json +0 -0
  8. eval_output/pwiseg_count_eval_llama3_llava.json +0 -0
  9. eval_output/pwiseg_count_eval_llama3_llava_finetune.json +0 -0
  10. eval_output/results_4dor_finetune/preds.json +0 -0
  11. eval_output/results_4dor_finetune/preds_count.json +0 -0
  12. eval_output/results_4dor_finetune/preds_description.json +0 -0
  13. eval_output/results_4dor_ori/preds.json +0 -0
  14. eval_output/results_4dor_ori/preds_count.json +0 -0
  15. eval_output/results_4dor_ori/preds_description.json +0 -0
  16. eval_output/results_4dor_self_finetune_llava/preds.json +0 -0
  17. eval_output/results_4dor_self_finetune_llava/preds_classification.json +0 -0
  18. eval_output/results_4dor_self_finetune_llava/preds_count.json +0 -0
  19. eval_output/results_4dor_self_finetune_llava/preds_description.json +0 -0
  20. eval_output/results_4dor_self_finetune_llava/preds_phase.json +0 -0
  21. eval_output/results_pwiseg_finetune/preds.json +0 -0
  22. eval_output/results_pwiseg_finetune/preds_count.json +0 -0
  23. eval_output/results_pwiseg_finetune/preds_description.json +0 -0
  24. eval_output/results_pwiseg_ori/preds.json +0 -0
  25. eval_output/results_pwiseg_ori/preds_count.json +0 -0
  26. eval_output/results_pwiseg_ori/preds_description.json +0 -0
  27. eval_scripts/caption_eval.py +92 -0
  28. eval_scripts/count_eval.py +237 -0
  29. eval_scripts/phase_eval.py +179 -0
  30. run_finetune_llava.py +64 -49
  31. scripts/convert_gqa_for_eval.py +18 -0
  32. scripts/convert_mmbench_for_submission.py +27 -0
  33. scripts/convert_mmvet_for_eval.py +18 -0
  34. scripts/convert_seed_for_submission.py +74 -0
  35. scripts/convert_sqa_to_llava.py +88 -0
  36. scripts/convert_sqa_to_llava_base_prompt.py +334 -0
  37. scripts/convert_vizwiz_for_submission.py +47 -0
  38. scripts/convert_vqav2_for_submission.py +56 -0
  39. scripts/extract_mm_projector.py +47 -0
  40. scripts/finetune.sh +48 -0
  41. scripts/finetune/test_llava.sh +7 -2
  42. scripts/finetune_full_schedule.sh +48 -0
  43. scripts/finetune_lora.sh +49 -0
  44. scripts/finetune_lora_my.sh +49 -0
  45. scripts/finetune_qlora.sh +50 -0
  46. scripts/finetune_sqa.sh +36 -0
  47. scripts/merge_lora_weights.py +22 -0
  48. scripts/pretrain.sh +46 -0
  49. scripts/pretrain_xformers.sh +44 -0
  50. scripts/sqa_eval_batch.sh +13 -0
accelerator_config/gpu_1_config.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ distributed_type: 'NO'
3
+ downcast_bf16: 'no'
4
+ gpu_ids: '0'
5
+ machine_rank: 0
6
+ main_training_function: main
7
+ mixed_precision: 'no'
8
+ num_machines: 1
9
+ num_processes: 1
10
+ rdzv_backend: static
11
+ same_network: true
12
+ tpu_env: []
13
+ tpu_use_cluster: false
14
+ tpu_use_sudo: false
15
+ use_cpu: false
accelerator_config/gpu_4_config.yml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ distributed_type: MULTI_GPU
3
+ downcast_bf16: 'no'
4
+ gpu_ids: 0,1,2,3
5
+ machine_rank: 0
6
+ main_training_function: main
7
+ mixed_precision: 'no'
8
+ num_machines: 1
9
+ num_processes: 4
10
+ rdzv_backend: static
11
+ same_network: true
12
+ tpu_env: []
13
+ tpu_use_cluster: false
14
+ tpu_use_sudo: false
15
+ use_cpu: false
16
+ main_process_port: 29600
dataset/SurgDataset.py CHANGED
@@ -28,6 +28,8 @@ class SurgDataset(Dataset):
28
  if os.path.isfile(args.data_path):
29
  with open(args.data_path) as f:
30
  self.data_json = json.load(f)
 
 
31
  else:
32
  self.data_json_path = os.path.join(args.data_path, 'test.json')
33
  if os.path.isfile(self.data_json_path):
 
28
  if os.path.isfile(args.data_path):
29
  with open(args.data_path) as f:
30
  self.data_json = json.load(f)
31
+ if len(self.data_json) > 200:
32
+ self.data_json = self.data_json[:200]
33
  else:
34
  self.data_json_path = os.path.join(args.data_path, 'test.json')
35
  if os.path.isfile(self.data_json_path):
eval_output/4dor_count_eval.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/4dor_count_eval_llama3_llava_finetune.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/4dor_count_eval_llama3_llava_ori.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/4dor_phase_eval.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/pwiseg_count_eval_llama3_llava.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/pwiseg_count_eval_llama3_llava_finetune.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/results_4dor_finetune/preds.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/results_4dor_finetune/preds_count.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/results_4dor_finetune/preds_description.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/results_4dor_ori/preds.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/results_4dor_ori/preds_count.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/results_4dor_ori/preds_description.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/results_4dor_self_finetune_llava/preds.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/results_4dor_self_finetune_llava/preds_classification.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/results_4dor_self_finetune_llava/preds_count.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/results_4dor_self_finetune_llava/preds_description.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/results_4dor_self_finetune_llava/preds_phase.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/results_pwiseg_finetune/preds.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/results_pwiseg_finetune/preds_count.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/results_pwiseg_finetune/preds_description.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/results_pwiseg_ori/preds.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/results_pwiseg_ori/preds_count.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_output/results_pwiseg_ori/preds_description.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_scripts/caption_eval.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
3
+ from pycocoevalcap.rouge.rouge import Rouge
4
+ from pycocoevalcap.cider.cider import Cider
5
+ import itertools
6
def load_json(filename):
    """Load and return the parsed JSON content of ``filename``.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)
9
+
10
def extract_answers(llm_data, gt_data):
    """Index model answers and ground-truth answers by sample id.

    Args:
        llm_data: iterable of dicts with ``'id'`` and ``'answer'`` keys.
        gt_data: iterable of dicts with ``'id'`` and ``'conversations'`` keys;
            every conversation turn is a dict with ``'from'`` and ``'value'``.

    Returns:
        ``(llm_answers, gt_answers)``: two dicts mapping id -> answer text.
        The ground-truth answer is the first turn whose ``'from'`` equals
        ``'gpt'``.  Samples without such a turn are left out of
        ``gt_answers`` instead of raising ``IndexError``.
    """
    llm_answers = {item['id']: item['answer'] for item in llm_data}

    gt_answers = {}
    for item in gt_data:
        # Stop at the first 'gpt' turn instead of materialising all of them.
        reply = next(
            (turn['value'] for turn in item['conversations'] if turn['from'] == 'gpt'),
            None,
        )
        if reply is not None:
            gt_answers[item['id']] = reply
    return llm_answers, gt_answers
14
+
15
def compute_bleu_scores(reference, hypothesis):
    """Return smoothed sentence-level BLEU-1..BLEU-4 for a single sample.

    Args:
        reference: token list of the ground-truth sentence.
        hypothesis: token list of the predicted sentence.

    Returns:
        A list of four scores ordered BLEU-1, BLEU-2, BLEU-3, BLEU-4, each
        computed by ``sentence_bleu`` with ``SmoothingFunction().method1``.
    """
    smooth_fn = SmoothingFunction().method1
    # Uniform weights over the first k n-gram orders.  BLEU-3 uses an exact
    # 1/3 so the weights sum to 1 (0.33 * 3 would only sum to 0.99).
    weights = [
        (1, 0, 0, 0),               # BLEU-1
        (0.5, 0.5, 0, 0),           # BLEU-2
        (1 / 3, 1 / 3, 1 / 3, 0),   # BLEU-3
        (0.25, 0.25, 0.25, 0.25),   # BLEU-4
    ]
    return [
        sentence_bleu([reference], hypothesis, weights=w, smoothing_function=smooth_fn)
        for w in weights
    ]
25
+
26
def compute_rouge_scores(references, hypotheses):
    """Score ``hypotheses`` against ``references`` with pycocoevalcap's Rouge.

    Both arguments are dicts keyed by sample id whose values are lists of
    sentences (``main`` passes one-element lists).

    Returns:
        The first element of ``Rouge.compute_score``'s result; the second
        element is discarded.  ``main`` formats this value with ``:.4f``, so
        it is presumably the corpus-level average ROUGE-L -- confirm against
        the pycocoevalcap sources.
    """
    average_score, _ = Rouge().compute_score(references, hypotheses)
    return average_score
30
+
31
def compute_cider_scores(references, hypotheses):
    """Score ``hypotheses`` against ``references`` with pycocoevalcap's Cider.

    Both arguments are dicts keyed by sample id whose values are lists of
    sentences (``main`` passes one-element lists).

    Returns:
        The first element of ``Cider.compute_score``'s result; the second
        element is discarded.  ``main`` formats this value with ``:.4f``, so
        it is presumably the corpus-level average CIDEr -- confirm against
        the pycocoevalcap sources.
    """
    average_score, _ = Cider().compute_score(references, hypotheses)
    return average_score
35
+
36
def main(llm_file, gt_file):
    """Print average BLEU-1..4, ROUGE-L and CIDEr for one prediction file.

    Args:
        llm_file: path to a JSON list of predictions (``id`` / ``answer``).
        gt_file: path to the ground-truth JSON (``id`` / ``conversations``).

    Predictions whose id is missing from the ground truth are reported and
    skipped.  When no prediction can be scored at all, a message is printed
    and the function returns early instead of dividing by zero.
    """
    llm_data = load_json(llm_file)
    gt_data = load_json(gt_file)

    llm_answers, gt_answers = extract_answers(llm_data, gt_data)

    bleu_scores = {i: [] for i in range(4)}
    references = {}
    hypotheses = {}

    for sample_id, llm_answer in llm_answers.items():
        if sample_id not in gt_answers:
            print(f"ID {sample_id} not found in ground truth data.")
            continue

        gt_answer = gt_answers[sample_id]
        # BLEU works on whitespace tokens; ROUGE / CIDEr receive raw sentences.
        bleu = compute_bleu_scores(gt_answer.split(), llm_answer.split())
        for i, score in enumerate(bleu):
            bleu_scores[i].append(score)
        references[sample_id] = [gt_answer]
        hypotheses[sample_id] = [llm_answer]

    if not references:
        print("No prediction matched the ground truth; nothing to score.")
        return

    rouge_scores = compute_rouge_scores(references, hypotheses)
    cider_scores = compute_cider_scores(references, hypotheses)

    avg_bleu_scores = [sum(scores) / len(scores) for scores in bleu_scores.values()]

    print(f"Average BLEU-1: {avg_bleu_scores[0]:.4f}")
    print(f"Average BLEU-2: {avg_bleu_scores[1]:.4f}")
    print(f"Average BLEU-3: {avg_bleu_scores[2]:.4f}")
    print(f"Average BLEU-4: {avg_bleu_scores[3]:.4f}")
    print(f"Average ROUGE-L: {rouge_scores:.4f}")
    print(f"Average CIDEr: {cider_scores:.4f}")
73
+
74
+
75
def test():
    """Run the caption evaluation for every dataset / model combination.

    Iterates over the datasets in ``data_gt`` and the model variants in
    ``llm_type`` and calls ``main`` on the matching prediction file.  All
    paths are hard-coded absolute paths; edit them before running elsewhere.
    """
    llm_type = ['finetune', 'ori']
    data_gt = {
        'pwiseg': '/mnt1/wjl/InternLM-XComposer/output/GT/pwiseg_caption_instruct_0712_test.json',
        '4dor': '/mnt1/lyc/llava_finetune/data_json/4dor_caption_instruct_0711_test.json',
    }
    for dt, lt in itertools.product(data_gt, llm_type):
        print(f'[INFO] data {dt} llm {lt}')
        llm_file = f'/mnt1/lyc/llava_finetune/eval_output/results_{dt}_{lt}/preds_description.json'
        gt_file = data_gt[dt]  # replace with your ground-truth file path
        main(llm_file, gt_file)
        print()
        print()


if __name__ == '__main__':
    test()
92
+
eval_scripts/count_eval.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from openai import OpenAI
3
+ import re
4
+
5
+
6
def load_json(filename):
    """Load and return the parsed JSON content of ``filename``.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)
9
+
10
+
11
# Credentials come from the environment so that no secret lives in the
# repository.  The key that used to be hard-coded on this line was committed
# in plain text and should be treated as compromised (revoke / rotate it).
#   OPENAI_API_KEY  - required
#   OPENAI_BASE_URL - optional, defaults to the endpoint used so far
import os

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),
                base_url=os.environ.get("OPENAI_BASE_URL", "https://vip.yi-zhan.top/v1"))
13
+
14
def get_result(prompt):
    """Send ``prompt`` to the judge model and return the reply text.

    Issues one non-streaming chat completion through the module-level
    ``client`` (model ``gpt-4o-2024-05-13``) with a fixed system message and
    ``prompt`` as the only user text part.

    Returns:
        ``message.content`` of the first choice.

    NOTE(review): ``temperature=0.8`` makes the judge's verdict vary between
    runs; consider a lower value if reproducible scores are required.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": [{"type": "text", "text": prompt}]},
    ]
    response = client.chat.completions.create(
        model="gpt-4o-2024-05-13",
        messages=messages,
        stream=False,
        temperature=0.8,
    )
    return response.choices[0].message.content
29
+
30
+
31
def create_prompt(question, llm_answer, gt_answer):
    """Build the judge prompt for one counting question.

    Args:
        question: the question that was asked.
        llm_answer: the evaluated model's answer.
        gt_answer: the annotated (ground-truth) answer.

    Returns:
        The prompt template with the three values substituted.  Only the
        template is parsed by ``str.format``, so braces inside the values are
        inserted verbatim.
    """
    # NOTE(review): leading whitespace of the template lines could not be
    # recovered from the diff view; lines are placed at column 0.
    template = """
## Role
You are a judge, tasked with determining whether the answers provided by other large language models are consistent with the annotated data, especially in terms of numerical accuracy.

## Question
```json
{question}
```

## LLM Answer
```json
{llm_answer}
```

## Annotated Answer
```json
{gt_answer}
```

## Task
For a given Question, evaluate whether the LLM Answer is consistent with the Annotated Answer. If it is, please answer yes and give a reason.If it is not, please answer no and give a reason.

## Constraints
- Your response should be divided into two parts: 'answer' and 'reason'. The 'answer' should be either 'yes' or 'no', indicating whether the large language model's prediction aligns with the annotated information, particularly in terms of quantities. The 'reason' should provide the rationale for your answer.
- When evaluating the accuracy of the large language model's prediction, please pay close attention to the counting of quantities in the model's response and whether it matches the quantities provided in the standard information.
- output format is a json dict as follows:
"reason": reason,
"answer": answer

Take a deep breath and start your answer step by step.
"""

    prompt = template.format(question=question,
                             llm_answer=llm_answer,
                             gt_answer=gt_answer)
    return prompt
69
+
70
+ # def extract_answer(response_text):
71
+ # pattern = r'"answer":\s*"([^"]+)"'
72
+ # match = re.search(pattern, response_text)
73
+ # print(match)
74
+ # if match:
75
+ # return match.group(1).lower() == 'yes'
76
+ # return False
77
+
78
def extract_answer(json_string):
    """Pull the ``answer`` and ``reason`` string values out of a judge reply.

    The reply is searched with regular expressions rather than parsed as
    JSON, so surrounding prose or code fences are tolerated.  Key names are
    matched case-insensitively, and backslash-escaped characters inside a
    value (for example an escaped double quote) no longer end the match.

    Args:
        json_string: raw reply text.

    Returns:
        ``(answer, reason)``.  Each element is the captured text exactly as
        it appears in the reply (escape sequences are not decoded), or
        ``None`` when the key is absent or its value is empty.
    """
    # A quoted value: any run of non-quote, non-backslash characters or
    # backslash-escaped characters.  DOTALL lets an escaped newline match.
    value = r'"((?:[^"\\]|\\.)+)"'
    flags = re.IGNORECASE | re.DOTALL
    answer_match = re.search(r'"answer"\s*:\s*' + value, json_string, flags)
    reason_match = re.search(r'"reason"\s*:\s*' + value, json_string, flags)

    answer = answer_match.group(1) if answer_match else None
    reason = reason_match.group(1) if reason_match else None

    return answer, reason
88
+
89
+
90
def main(llm_file, gt_file, out_file):
    """Judge every prediction against its annotation and save the verdicts.

    Args:
        llm_file: path to a JSON list of predictions with ``id``,
            ``question`` and ``answer`` keys.
        gt_file: path to the ground-truth JSON list with ``id`` and
            ``conversations`` keys.
        out_file: path the verdict list is written to.  It is rewritten after
            every judged sample so partial results survive an interruption.

    Returns:
        A list of ``{"id", "answer", "reason"}`` dicts, one per sample for
        which the judge returned a reply.

    Samples without a ground-truth entry and samples whose judge request
    fails are reported and skipped.  A reply without a parsable ``answer``
    is recorded with ``None`` values and counted as wrong.
    """
    llm_data = load_json(llm_file)
    gt_data = load_json(gt_file)

    QA_dict = {
        item["id"]: {"question": item["question"], "llm_answer": item["answer"]}
        for item in llm_data
    }

    for item in gt_data:
        qid = item["id"]
        if qid in QA_dict:
            # assumes the annotated answer is the second conversation turn
            # -- TODO confirm against the dataset format
            QA_dict[qid]["gt_answer"] = item["conversations"][1]["value"]

    compares = []
    correct_ans = 0
    for ix, (qid, item) in enumerate(QA_dict.items()):
        if "gt_answer" not in item:
            print(f"skip {qid}: no ground-truth entry")
            continue

        prompt = create_prompt(item["question"], item["llm_answer"], item["gt_answer"])

        # Only the remote call is best-effort; everything else should fail loudly.
        try:
            compare = get_result(prompt=prompt)
        except Exception as exc:
            print("break", item, exc)
            continue

        answer, reason = extract_answer(compare or "")
        compares.append({"id": qid, "answer": answer, "reason": reason})
        with open(out_file, 'w') as f:
            json.dump(compares, f, indent=4)

        if answer is not None and answer.lower() == 'yes':
            correct_ans = correct_ans + 1
            print(f"#correct \n answer:{answer},\n reason:{reason}")
        else:
            print(f"#wrong \n answer:{answer},\n reason:{reason}")

        print(f"[step {ix}, correct {correct_ans}, total {len(QA_dict)}, rate {correct_ans/len(QA_dict)}")

    return compares
138
+
139
+
140
+
141
if __name__ == "__main__":
    # Results recorded from earlier runs of this script, condensed from the
    # commented-out run configurations that used to live here.
    #
    # 4dor count
    #   internlm-xcomposer2-vl-7b, fine-tuned : correct 75 / 200,  rate 0.375
    #   internlm-xcomposer2-vl-7b, original   : correct 18 / 200,  rate 0.09
    #   llava-v1.5-7b task-lora, fine-tuned   : correct 111 / 200, rate 0.555
    #   llava-v1.5-7b, original               : correct 44 / 200,  rate 0.22
    #   llava-v1.5-13b task-lora, fine-tuned  : correct 125 / 200, rate 0.625
    #   llava-v1.5-13b, original              : correct 16 / 200,  rate 0.08
    #
    # pwiseg count
    #   internlm-xcomposer2-vl-7b, fine-tuned : correct 60 / 200,  rate 0.3
    #   internlm-xcomposer2-vl-7b, original   : correct 22 / 200,  rate 0.11
    #   llava-v1.5-7b task-lora, fine-tuned   : correct 140 / 200, rate 0.7
    #   llava-v1.5-7b, original               : correct 12 / 200,  rate 0.06
    #   llava-v1.5-13b task-lora, fine-tuned  : correct 142 / 200, rate 0.71
    #   llava-v1.5-13b, original              : correct 142 / 200, rate 0.71
    #     NOTE(review): identical to the fine-tuned 13b entry -- possibly a
    #     copy-paste slip in the original note; confirm before quoting.

    # LLaVA-NeXT, pwiseg count (active configuration).
    llm_file = '/mnt1/lyc/llava_finetune/eval_output/results_pwiseg_ori/preds_count.json'  # LLM prediction file
    gt_file = '/mnt1/wjl/InternLM-XComposer/output/GT/pwiseg_count_instruct_0712_test.json'  # ground-truth file
    out_file = '/mnt1/lyc/llava_finetune/eval_output/pwiseg_count_eval_llama3_llava.json'  # judge output file

    # LLaVA-NeXT, 4dor count (alternative configuration).
    # llm_file = '/mnt1/lyc/llava_finetune/eval_output/results_4dor_ori/preds_count.json'
    # gt_file = '/mnt1/lyc/llava_finetune/data_json/4dor_count_instruct_0711_test.json'
    # out_file = '/mnt1/lyc/llava_finetune/eval_output/4dor_count_eval_llama3_llava_ori.json'

    compares = main(llm_file, gt_file, out_file)
234
+
235
+
236
+
237
+
eval_scripts/phase_eval.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from openai import OpenAI
3
+ import re
4
+
5
+
6
def load_json(filename):
    """Load and return the parsed JSON content of ``filename``.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)
9
+
10
+
11
# Credentials come from the environment so that no secret lives in the
# repository.  The key that used to be hard-coded on this line was committed
# in plain text and should be treated as compromised (revoke / rotate it).
#   OPENAI_API_KEY  - required
#   OPENAI_BASE_URL - optional, defaults to the endpoint used so far
import os

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),
                base_url=os.environ.get("OPENAI_BASE_URL", "https://vip.yi-zhan.top/v1"))
13
+
14
def get_result(prompt):
    """Send ``prompt`` to the judge model and return the reply text.

    Issues one non-streaming chat completion through the module-level
    ``client`` (model ``gpt-4o-2024-05-13``) with a fixed system message and
    ``prompt`` as the only user text part.

    Returns:
        ``message.content`` of the first choice.

    NOTE(review): ``temperature=0.8`` makes the judge's verdict vary between
    runs; consider a lower value if reproducible scores are required.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": [{"type": "text", "text": prompt}]},
    ]
    response = client.chat.completions.create(
        model="gpt-4o-2024-05-13",
        messages=messages,
        stream=False,
        temperature=0.8,
    )
    return response.choices[0].message.content
29
+
30
+
31
def create_prompt(question, llm_answer, gt_answer):
    """Build the judge prompt for one surgical-phase question.

    Args:
        question: the question that was asked.
        llm_answer: the evaluated model's answer.
        gt_answer: the annotated (ground-truth) answer.

    Returns:
        The prompt template with the three values substituted.  Only the
        template is parsed by ``str.format``, so braces inside the values are
        inserted verbatim.
    """
    # NOTE(review): leading whitespace of the template lines could not be
    # recovered from the diff view; lines are placed at column 0.
    template = """
## Role
You are a fair judge, comparing the LLM answer with the annotated answer, and evaluating whether the answer is accurate about the understanding of the surgical stage.

## Question
```json
{question}
```

## LLM answer
```json
{llm_answer}
```

## Annotated answer
```json
{gt_answer}
```

## Task
For the given question, evaluate whether the LLM answer is consistent with the annotated answer. If yes, answer yes and give a reason. If no, answer no and give a reason.

## Constraints
- Your answer should be divided into two parts: "Answer" and "Reason". "Answer" should be "Yes" or "No", indicating whether the large language model's prediction is consistent with the annotation information. "Reason" should provide the reason for your answer.
- When evaluating the accuracy of the LLM's prediction, pay close attention to whether the model's answer is accurate about the understanding of the surgical phase, including whether the surgical stage is correctly identified and whether the operation suggestion given is appropriate.
- Output format is json Dictionary, as shown below:
"reason":reason,
"answer":answer

Take a deep breath and start answering step by step.
"""

    prompt = template.format(question=question,
                             llm_answer=llm_answer,
                             gt_answer=gt_answer)
    return prompt
70
+
71
+
72
def extract_answer(json_string):
    """Pull the ``answer`` and ``reason`` string values out of a judge reply.

    The reply is searched with regular expressions rather than parsed as
    JSON, so surrounding prose or code fences are tolerated.  Key names are
    matched case-insensitively (the prompt in this file asks for "Answer" /
    "Reason"), and backslash-escaped characters inside a value (for example
    an escaped double quote) no longer end the match.

    Args:
        json_string: raw reply text.

    Returns:
        ``(answer, reason)``.  Each element is the captured text exactly as
        it appears in the reply (escape sequences are not decoded), or
        ``None`` when the key is absent or its value is empty.
    """
    # A quoted value: any run of non-quote, non-backslash characters or
    # backslash-escaped characters.  DOTALL lets an escaped newline match.
    value = r'"((?:[^"\\]|\\.)+)"'
    flags = re.IGNORECASE | re.DOTALL
    answer_match = re.search(r'"answer"\s*:\s*' + value, json_string, flags)
    reason_match = re.search(r'"reason"\s*:\s*' + value, json_string, flags)

    answer = answer_match.group(1) if answer_match else None
    reason = reason_match.group(1) if reason_match else None

    return answer, reason
82
+
83
+
84
def main(llm_file, gt_file, out_file):
    """Judge every prediction against its annotation and save the verdicts.

    Args:
        llm_file: path to a JSON list of predictions with ``id``,
            ``question`` and ``answer`` keys.
        gt_file: path to the ground-truth JSON list with ``id`` and
            ``conversations`` keys.
        out_file: path the verdict list is written to.  It is rewritten after
            every judged sample so partial results survive an interruption.

    Returns:
        A list of ``{"id", "answer", "reason"}`` dicts, one per sample for
        which the judge returned a reply.

    Samples without a ground-truth entry and samples whose judge request
    fails are reported and skipped.  A reply without a parsable ``answer``
    is recorded with ``None`` values and counted as wrong.
    """
    llm_data = load_json(llm_file)
    gt_data = load_json(gt_file)

    QA_dict = {
        item["id"]: {"question": item["question"], "llm_answer": item["answer"]}
        for item in llm_data
    }

    for item in gt_data:
        qid = item["id"]
        if qid in QA_dict:
            # assumes the annotated answer is the second conversation turn
            # -- TODO confirm against the dataset format
            QA_dict[qid]["gt_answer"] = item["conversations"][1]["value"]

    compares = []
    correct_ans = 0
    for ix, (qid, item) in enumerate(QA_dict.items()):
        if "gt_answer" not in item:
            print(f"skip {qid}: no ground-truth entry")
            continue

        prompt = create_prompt(item["question"], item["llm_answer"], item["gt_answer"])

        # Only the remote call is best-effort; everything else should fail loudly.
        try:
            compare = get_result(prompt=prompt)
        except Exception as exc:
            print("break", item, exc)
            continue

        answer, reason = extract_answer(compare or "")
        compares.append({"id": qid, "answer": answer, "reason": reason})
        with open(out_file, 'w') as f:
            json.dump(compares, f, indent=4)

        if answer is not None and answer.lower() == 'yes':
            correct_ans = correct_ans + 1
            print(f"#correct \n answer:{answer},\n reason:{reason}")
        else:
            print(f"#wrong \n answer:{answer},\n reason:{reason}")

        print(f"[step {ix}, correct {correct_ans}, total {len(QA_dict)}, rate {correct_ans/len(QA_dict)}")

    return compares
132
+
133
+
134
+
135
if __name__ == "__main__":
    # Results recorded from earlier runs, condensed from the commented-out
    # run configurations that used to live here.
    #
    # 4dor phase
    #   internlm-xcomposer2-vl-7b, fine-tuned : correct 42 / 200, rate 0.21
    #   internlm-xcomposer2-vl-7b, original   : no result recorded
    #
    # NOTE(review): the remaining entries pointed at 4dor *count* files and
    # carry the same numbers as the count evaluation script, so they look
    # copied from there rather than measured on the phase task -- confirm.
    #   llava-v1.5-7b task-lora, fine-tuned   : correct 111 / 200, rate 0.555
    #   llava-v1.5-7b, original               : correct 44 / 200,  rate 0.22
    #   llava-v1.5-13b task-lora, fine-tuned  : correct 125 / 200, rate 0.625
    #   llava-v1.5-13b, original              : no result recorded

    llm_file = '/mnt1/lyc/llava_finetune/results_4dor/preds_phase.json'  # LLM prediction file
    gt_file = '/mnt1/lyc/llava_finetune/data_json/4dor_phase_instruct_0711_test.json'  # ground-truth file
    out_file = '/mnt1/lyc/llava_finetune/eval_output/4dor_phase_eval.json'  # judge output file
    compares = main(llm_file, gt_file, out_file)
178
+
179
+
run_finetune_llava.py CHANGED
@@ -2,7 +2,7 @@ from llava.model.builder import load_pretrained_model
2
  from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
3
  from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
4
  from llava.conversation import conv_templates, SeparatorStyle
5
- from peft import LoraConfig, get_peft_model
6
  from PIL import Image
7
  import requests
8
  import copy
@@ -10,7 +10,7 @@ import torch
10
  import argparse
11
  from dataset.SurgDataset import SurgDataset
12
  from accelerate import Accelerator
13
- from models.SurgLLaVA import SurgLLaVA
14
  import os
15
  from tqdm import tqdm
16
  import json
@@ -38,9 +38,10 @@ def parse_args():
38
  parser.add_argument('--step_size', type=int, default=300)
39
  parser.add_argument('--gamma', type=float, default=0.95, help='gemma value of scheduler')
40
  parser.add_argument('--num_epochs', type=int, default=1000)
41
-
42
  parser.add_argument('--test', action='store_true')
43
- parser.add_argument('--checkpoint_path', type=str, default=None)
 
44
  parser.add_argument('--output_dir', type=str, default='4dor_output', help='output file path, which will store output text.')
45
  return parser.parse_args()
46
  def main():
@@ -48,7 +49,9 @@ def main():
48
  accelerator = Accelerator(project_dir=os.path.join(args.ckpt_dir, args.model_name),
49
  log_with="wandb" if args.wandb else None,
50
  gradient_accumulation_steps=args.gradient_accumulation_steps)
 
51
  if args.wandb:
 
52
  accelerator.init_trackers(
53
  project_name=args.wandb_project,
54
  config=args,
@@ -57,55 +60,67 @@ def main():
57
  accelerator.print("[Info] Using wandb for logging...")
58
  pretrained = "lmms-lab/llama3-llava-next-8b"
59
  model_name = "llava_llama3"
60
- tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map='cuda') # Add any other thing you want to pass in llava_model_args
61
- tokenizer.pad_token_id = tokenizer.eos_token_id
62
- train_dataset = SurgDataset(args, image_processor, model.config, mode='train')
63
- test_dataset = SurgDataset(args, image_processor, model.config, mode='test')
 
 
 
64
  train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size, num_workers=4)
65
  test_dataloader = torch.utils.data.DataLoader(test_dataset, shuffle=True, batch_size=args.batch_size, num_workers=4)
66
-
67
- for param in model.parameters():
 
68
  param.requires_grad = False
69
- model.eval()
70
- print(f'[INFO] Using LoRA ...')
71
- peft_config = LoraConfig(
72
- lora_alpha=args.lora_rank,
73
- lora_dropout=0.05,
74
- r=args.lora_rank,
75
- bias="none",
76
- task_type="CAUSAL_LM",
77
- target_modules=[
78
- "q_proj",
79
- "k_proj",
80
- "v_proj",
81
- "o_proj",
82
- "gate_proj",
83
- "up_proj",
84
- "down_proj",
85
- "lm_head",
86
- ],
87
- )
88
- lora_llm = get_peft_model(model, peft_config)
89
- model = lora_llm.model
90
 
91
- train_params = model.parameters()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  print(f'[INFO] Creating Model ...')
93
- train_model = SurgLLaVA(args, model, tokenizer)
94
- train_model = train_model.to(torch.bfloat16)
95
  optimizer = torch.optim.AdamW(train_params, lr=args.lr, eps=1e-7)
96
  scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=len(train_dataloader) * args.step_size // args.gradient_accumulation_steps, gamma=args.gamma)
97
- if args.checkpoint_path is not None:
98
- # Stupid way to load lora...
99
- # TODO: update this!
100
- print(f'[INFO] Load checkpoint...')
101
- whole_model = torch.load(os.path.join(args.checkpoint_path, 'pytorch_model.bin'), map_location='cpu')
102
- train_model.load_state_dict(whole_model)
103
 
104
- train_model, tokenizer, optimizer, scheduler, train_dataloader, test_dataloader = accelerator.prepare(train_model, tokenizer, optimizer, scheduler, train_dataloader, test_dataloader)
 
 
 
 
 
 
105
  if args.test:
106
  # testing code
107
  accelerator.print(f'[INFO] Start testing...')
108
- train_model.eval()
109
  with torch.no_grad():
110
  os.makedirs(args.output_dir, exist_ok=True)
111
  output_list = []
@@ -116,7 +131,7 @@ def main():
116
  image_sizes = image_sizes[0]
117
  if len(image_sizes) != args.batch_size:
118
  image_sizes = [torch.cat(image_sizes)]
119
- output = train_model(image, image_sizes, question)
120
  text_output = tokenizer.batch_decode(output, skip_special_tokens=True)
121
  output_data = raw_data
122
  output_data.update({'answer': text_output, 'question': question})
@@ -146,7 +161,7 @@ def main():
146
  # initialize epoch-level metrics
147
  accelerator.print(f'[INFO] Start training...')
148
  for epoch in tqdm(range(args.num_epochs)):
149
- train_model.train()
150
  total_train_loss = 0
151
  for i, batch in enumerate(train_dataloader):
152
  optimizer.zero_grad()
@@ -155,10 +170,10 @@ def main():
155
  image_sizes = image_sizes[0]
156
  if len(image_sizes) != args.batch_size:
157
  image_sizes = [torch.cat(image_sizes)]
158
- output = train_model(image, image_sizes, question, answer)
159
  loss = output.loss
160
  # Accelerator requires all params to involve gradient descend. This 'dummy loss' can avoid this issue.
161
- for param in train_model.parameters():
162
  loss += param.sum() * 0.0
163
  accelerator.backward(loss)
164
  optimizer.step()
@@ -177,7 +192,7 @@ def main():
177
  total_test_loss = None
178
  if epoch % args.eval_interval == 0:
179
  total_test_loss = 0
180
- train_model.eval()
181
  with torch.no_grad():
182
  for i, batch in enumerate(test_dataloader):
183
  raw_data, question, answer, image, image_sizes = batch
@@ -185,7 +200,7 @@ def main():
185
  image_sizes = image_sizes[0]
186
  if len(image_sizes) != args.batch_size:
187
  image_sizes = [torch.cat(image_sizes)]
188
- output = train_model(image, image_sizes, question, )
189
  text_output = tokenizer.batch_decode(output, skip_special_tokens=True)
190
  if i % 100 == 0:
191
  img_id = raw_data[0]['id']
@@ -198,7 +213,7 @@ def main():
198
  save_model_dir = os.path.join(args.ckpt_dir, args.model_name, 'checkpoints', f'checkpoint_{epoch:05d}')
199
  lora_save_dir = os.path.join(args.ckpt_dir, args.model_name, 'lora')
200
  accelerator.save_state(save_model_dir, safe_serialization=False, total_limit=5)
201
- unwrapped_model = accelerator.unwrap_model(train_model)
202
  unwrapped_model.model.save_pretrained(
203
  lora_save_dir,
204
  save_function=accelerator.save,
 
2
  from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
3
  from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
4
  from llava.conversation import conv_templates, SeparatorStyle
5
+ from peft import LoraConfig, get_peft_model, PeftModel
6
  from PIL import Image
7
  import requests
8
  import copy
 
10
  import argparse
11
  from dataset.SurgDataset import SurgDataset
12
  from accelerate import Accelerator
13
+ from llava.model.SurgLLaVA import SurgLLaVA
14
  import os
15
  from tqdm import tqdm
16
  import json
 
38
  parser.add_argument('--step_size', type=int, default=300)
39
  parser.add_argument('--gamma', type=float, default=0.95, help='gemma value of scheduler')
40
  parser.add_argument('--num_epochs', type=int, default=1000)
41
+ parser.add_argument('--lora', action='store_true', help='Use LoRA if True')
42
  parser.add_argument('--test', action='store_true')
43
+ parser.add_argument('--lora_ckpt_path', type=str, default=None)
44
+ parser.add_argument('--ckpt_path', type=str, default=None)
45
  parser.add_argument('--output_dir', type=str, default='4dor_output', help='output file path, which will store output text.')
46
  return parser.parse_args()
47
  def main():
 
49
  accelerator = Accelerator(project_dir=os.path.join(args.ckpt_dir, args.model_name),
50
  log_with="wandb" if args.wandb else None,
51
  gradient_accumulation_steps=args.gradient_accumulation_steps)
52
+
53
  if args.wandb:
54
+ print(f'[INFO] Using wandb for logging...')
55
  accelerator.init_trackers(
56
  project_name=args.wandb_project,
57
  config=args,
 
60
  accelerator.print("[Info] Using wandb for logging...")
61
  pretrained = "lmms-lab/llama3-llava-next-8b"
62
  model_name = "llava_llama3"
63
+ tokenizer, llm_model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map='cuda') # Add any other thing you want to pass in llava_model_args
64
+ # tokenizer.pad_token_id = tokenizer.eos_token_id
65
+ if tokenizer.pad_token is None:
66
+ tokenizer.add_special_tokens({'pad_token': '[PAD]'})
67
+ tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
68
+ train_dataset = SurgDataset(args, image_processor, llm_model.config, mode='train')
69
+ test_dataset = SurgDataset(args, image_processor, llm_model.config, mode='test')
70
  train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size, num_workers=4)
71
  test_dataloader = torch.utils.data.DataLoader(test_dataset, shuffle=True, batch_size=args.batch_size, num_workers=4)
72
+
73
+ print(f'[INFO] Freezing llm model')
74
+ for param in llm_model.parameters():
75
  param.requires_grad = False
76
+ llm_model.eval()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
+ if args.lora:
79
+ if args.lora_ckpt_path is not None:
80
+ print(f'[INFO] Loading LoRA model checkpoint...')
81
+ llm_model = PeftModel.from_pretrained(llm_model, './model_ckpt/llama3-llava-next-8b-task-lora')
82
+ llm_model = llm_model.merge_and_unload()
83
+ else:
84
+ print(f'[INFO] Creating LoRA ...')
85
+ peft_config = LoraConfig(
86
+ lora_alpha=args.lora_rank,
87
+ lora_dropout=0.05,
88
+ r=args.lora_rank,
89
+ bias="none",
90
+ task_type="CAUSAL_LM",
91
+ target_modules=[
92
+ "q_proj",
93
+ "k_proj",
94
+ "v_proj",
95
+ "o_proj",
96
+ "gate_proj",
97
+ "up_proj",
98
+ "down_proj",
99
+ "lm_head",
100
+ ],
101
+ )
102
+ lora_llm = get_peft_model(llm_model, peft_config)
103
+ llm_model = lora_llm.model
104
+
105
+
106
+ train_params = llm_model.parameters()
107
  print(f'[INFO] Creating Model ...')
108
+ model = SurgLLaVA(args, llm_model, tokenizer)
109
+ model = model.to(torch.bfloat16)
110
  optimizer = torch.optim.AdamW(train_params, lr=args.lr, eps=1e-7)
111
  scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=len(train_dataloader) * args.step_size // args.gradient_accumulation_steps, gamma=args.gamma)
 
 
 
 
 
 
112
 
113
+ if args.ckpt_path is not None:
114
+ print(f'[INFO] Load whole pretrained checkpoint...')
115
+ whole_model = torch.load(os.path.join(args.ckpt_path, 'pytorch_model.bin'), map_location='cpu')
116
+ model.load_state_dict(whole_model)
117
+
118
+ print(f'[INFO] Preparing accelerator...')
119
+ model, tokenizer, optimizer, scheduler, train_dataloader, test_dataloader = accelerator.prepare(model, tokenizer, optimizer, scheduler, train_dataloader, test_dataloader)
120
  if args.test:
121
  # testing code
122
  accelerator.print(f'[INFO] Start testing...')
123
+ model.eval()
124
  with torch.no_grad():
125
  os.makedirs(args.output_dir, exist_ok=True)
126
  output_list = []
 
131
  image_sizes = image_sizes[0]
132
  if len(image_sizes) != args.batch_size:
133
  image_sizes = [torch.cat(image_sizes)]
134
+ output = model(image, image_sizes, question)
135
  text_output = tokenizer.batch_decode(output, skip_special_tokens=True)
136
  output_data = raw_data
137
  output_data.update({'answer': text_output, 'question': question})
 
161
  # initialize epoch-level metrics
162
  accelerator.print(f'[INFO] Start training...')
163
  for epoch in tqdm(range(args.num_epochs)):
164
+ model.train()
165
  total_train_loss = 0
166
  for i, batch in enumerate(train_dataloader):
167
  optimizer.zero_grad()
 
170
  image_sizes = image_sizes[0]
171
  if len(image_sizes) != args.batch_size:
172
  image_sizes = [torch.cat(image_sizes)]
173
+ output = model(image, image_sizes, question, answer)
174
  loss = output.loss
175
  # Accelerator requires all params to involve gradient descend. This 'dummy loss' can avoid this issue.
176
+ for param in model.parameters():
177
  loss += param.sum() * 0.0
178
  accelerator.backward(loss)
179
  optimizer.step()
 
192
  total_test_loss = None
193
  if epoch % args.eval_interval == 0:
194
  total_test_loss = 0
195
+ model.eval()
196
  with torch.no_grad():
197
  for i, batch in enumerate(test_dataloader):
198
  raw_data, question, answer, image, image_sizes = batch
 
200
  image_sizes = image_sizes[0]
201
  if len(image_sizes) != args.batch_size:
202
  image_sizes = [torch.cat(image_sizes)]
203
+ output = model(image, image_sizes, question, )
204
  text_output = tokenizer.batch_decode(output, skip_special_tokens=True)
205
  if i % 100 == 0:
206
  img_id = raw_data[0]['id']
 
213
  save_model_dir = os.path.join(args.ckpt_dir, args.model_name, 'checkpoints', f'checkpoint_{epoch:05d}')
214
  lora_save_dir = os.path.join(args.ckpt_dir, args.model_name, 'lora')
215
  accelerator.save_state(save_model_dir, safe_serialization=False, total_limit=5)
216
+ unwrapped_model = accelerator.unwrap_model(model)
217
  unwrapped_model.model.save_pretrained(
218
  lora_save_dir,
219
  save_function=accelerator.save,
scripts/convert_gqa_for_eval.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+
5
# Convert a JSON-lines prediction file (--src) into a single JSON list of
# {"questionId", "prediction"} records written to --dst.
parser = argparse.ArgumentParser()
parser.add_argument("--src", type=str)
parser.add_argument("--dst", type=str)
args = parser.parse_args()

all_answers = []
# Read inside a `with` block so the source handle is closed deterministically.
with open(args.src) as src_file:
    for line in src_file:
        res = json.loads(line)
        question_id = res['question_id']
        # Normalise the prediction: drop trailing periods and lower-case it.
        text = res['text'].rstrip('.').lower()
        all_answers.append({"questionId": question_id, "prediction": text})

with open(args.dst, 'w') as f:
    json.dump(all_answers, f)
scripts/convert_mmbench_for_submission.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ import pandas as pd
5
+
6
def get_args():
    """Parse the command-line options of the MMBench submission converter.

    Returns:
        The ``argparse.Namespace`` holding ``annotation_file``, ``result_dir``,
        ``upload_dir`` and ``experiment``; every option is a required string.
    """
    cli = argparse.ArgumentParser()
    for flag in ("--annotation-file", "--result-dir", "--upload-dir", "--experiment"):
        cli.add_argument(flag, type=str, required=True)
    return cli.parse_args()
14
+
15
if __name__ == "__main__":
    # Fill an MMBench annotation table with model predictions and export it
    # as an .xlsx file for upload.
    args = get_args()

    df = pd.read_table(args.annotation_file)

    cur_df = df.copy()
    # Drop the columns that are not part of the uploaded sheet.
    cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category'])
    cur_df.insert(6, 'prediction', None)
    # Read the predictions inside a `with` block so the handle is closed.
    with open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")) as pred_file:
        for pred in pred_file:
            pred = json.loads(pred)
            # Rows are matched on the annotation table's 'index' column.
            cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text']

    cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl')
scripts/convert_mmvet_for_eval.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+
5
# Convert a JSON-lines prediction file (--src) into the {"v1_<qid>": text}
# mapping written as indented JSON to --dst.
parser = argparse.ArgumentParser()
parser.add_argument("--src", type=str)
parser.add_argument("--dst", type=str)
args = parser.parse_args()

cur_result = {}

# Read inside a `with` block so the source handle is closed deterministically.
with open(args.src) as src_file:
    for line in src_file:
        data = json.loads(line)
        qid = data['question_id']
        cur_result[f'v1_{qid}'] = data['text']

with open(args.dst, 'w') as f:
    json.dump(cur_result, f, indent=2)
scripts/convert_seed_for_submission.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+
5
+
6
def get_args():
    """Parse the command-line options of the SEED submission converter.

    Returns:
        The ``argparse.Namespace`` holding ``annotation_file``,
        ``result_file`` and ``result_upload_file``; each is an optional
        string that defaults to ``None``.
    """
    cli = argparse.ArgumentParser()
    for flag in ("--annotation-file", "--result-file", "--result-upload-file"):
        cli.add_argument(flag, type=str)
    return cli.parse_args()
12
+
13
+
14
def eval_single(result_file, eval_only_type=None):
    """Score a JSON-lines prediction file against the loaded annotations.

    Reads the module-level ``data`` (its ``'questions'`` list) and, when
    ``eval_only_type`` is ``None``, ``ques_type_id_to_name`` for the
    per-type report. Accuracy figures are printed, not returned.

    Args:
        result_file: Path to a JSON-lines file whose rows carry
            ``'question_id'`` and ``'text'``.
        eval_only_type: When given, only questions whose ``'data_type'``
            equals this value are scored.

    Returns:
        dict mapping each ``question_id`` found in ``result_file`` to its
        parsed row.
    """
    results = {}
    with open(result_file) as result_fp:
        for line in result_fp:
            row = json.loads(line)
            results[row['question_id']] = row

    type_counts = {}
    correct_counts = {}
    for question_data in data['questions']:
        if eval_only_type is not None and question_data['data_type'] != eval_only_type:
            continue
        data_type = question_data['question_type_id']
        type_counts[data_type] = type_counts.get(data_type, 0) + 1
        # Seed every counted type with 0 so a type with no correct answers
        # cannot raise KeyError in the report loop below.
        correct_counts.setdefault(data_type, 0)
        # Prediction ids may be ints while annotation ids are strings; try
        # the integer form first and fall back to the raw id.
        try:
            question_id = int(question_data['question_id'])
        except (TypeError, ValueError):
            question_id = question_data['question_id']
        if question_id not in results:
            continue
        if results[question_id]['text'] == question_data['answer']:
            correct_counts[data_type] += 1

    total_count = 0
    total_correct = 0
    for data_type in sorted(type_counts.keys()):
        accuracy = correct_counts[data_type] / type_counts[data_type] * 100
        if eval_only_type is None:
            print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%")

        total_count += type_counts[data_type]
        total_correct += correct_counts[data_type]

    # No question matched the filter: report 0% instead of dividing by zero.
    total_accuracy = total_correct / total_count * 100 if total_count else 0.0
    if eval_only_type is None:
        print(f"Total accuracy: {total_accuracy:.2f}%")
    else:
        print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%")

    return results
54
+
55
if __name__ == "__main__":
    args = get_args()
    # Load the annotations inside a `with` block so the handle is closed.
    with open(args.annotation_file) as annotation_fp:
        data = json.load(annotation_fp)
    # Invert the name -> id table so per-type results print by name.
    ques_type_id_to_name = {id:n for n,id in data['question_type'].items()}

    results = eval_single(args.result_file)
    eval_single(args.result_file, eval_only_type='image')
    eval_single(args.result_file, eval_only_type='video')

    with open(args.result_upload_file, 'w') as fp:
        for question in data['questions']:
            qid = question['question_id']
            # Predictions may be keyed by the raw id or by its integer form.
            if qid in results:
                result = results[qid]
            else:
                result = results[int(qid)]
            fp.write(json.dumps({
                'question_id': qid,
                'prediction': result['text']
            }) + '\n')
scripts/convert_sqa_to_llava.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import fire
4
+ import re
5
+ from convert_sqa_to_llava_base_prompt import build_prompt_chatbot
6
+
7
+
8
def convert_to_llava(base_dir, split, prompt_format="QCM-LEA"):
    """Write one split as a LLaVA-style conversation JSON file.

    Reads ``pid_splits.json`` and ``problems.json`` from ``base_dir``,
    renders every problem of ``split`` with ``build_prompt_chatbot`` and
    writes ``llava_{split}_{prompt_format}.json`` back into ``base_dir``.

    Args:
        base_dir: Directory holding the input files; also receives the output.
        split: Key into ``pid_splits.json`` selecting the problem ids.
        prompt_format: ``"<input>-<output>"`` format code passed through to
            ``build_prompt_chatbot``.
    """
    with open(os.path.join(base_dir, "pid_splits.json")) as f:
        split_indices = json.load(f)[split]
    with open(os.path.join(base_dir, "problems.json")) as f:
        problems = json.load(f)

    split_problems = build_prompt_chatbot(
        problems, split_indices, prompt_format,
        use_caption=False, is_test=False)

    question_label = 'Question: '
    answer_label = 'Answer: '

    target_format = []
    for prob_id, (prompt, response) in split_problems.items():
        # Strip only the leading label; str.replace would also delete any
        # later occurrence inside the question or answer text.
        if prompt.startswith(question_label):
            prompt = prompt[len(question_label):]
        if response.startswith(answer_label):
            response = response[len(answer_label):]

        raw_prob_data = problems[prob_id]
        if raw_prob_data['image'] is None:
            target_format.append({
                "id": prob_id,
                "conversations": [
                    {'from': 'human', 'value': f"{prompt}"},
                    {'from': 'gpt', 'value': f"{response}"},
                ],
            })

        else:
            # Problems with an image get the image path and an <image> tag
            # appended to the human turn.
            target_format.append({
                "id": prob_id,
                "image": os.path.join(prob_id, raw_prob_data['image']),
                "conversations": [
                    {'from': 'human', 'value': f"{prompt}\n<image>"},
                    {'from': 'gpt', 'value': f"{response}"},
                ],
            })

    print(f'Number of samples: {len(target_format)}')

    with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f:
        json.dump(target_format, f, indent=2)
47
+
48
+
49
def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"):
    """Write one split as an instruction/output JSON-lines file.

    Reads ``pid_splits.json`` and ``problems.json`` from ``base_dir``,
    renders every problem of ``split`` with ``build_prompt_chatbot`` and
    writes ``scienceqa_{split}_{prompt_format}.jsonl`` back into ``base_dir``.

    Args:
        base_dir: Directory holding the input files; also receives the output.
        split: Key into ``pid_splits.json`` selecting the problem ids.
        prompt_format: ``"<input>-<output>"`` format code passed through to
            ``build_prompt_chatbot``.
    """
    with open(os.path.join(base_dir, "pid_splits.json")) as f:
        split_indices = json.load(f)[split]
    with open(os.path.join(base_dir, "problems.json")) as f:
        problems = json.load(f)

    split_problems = build_prompt_chatbot(
        problems, split_indices, prompt_format,
        use_caption=False, is_test=False)

    question_label = 'Question: '
    answer_label = 'Answer: '

    # The `with` block closes the writer even if a record fails to serialise.
    with open(os.path.join(base_dir, f"scienceqa_{split}_{prompt_format}.jsonl"), "w") as writer:
        for prob_id, (prompt, response) in split_problems.items():
            # Strip only the leading label; str.replace would also delete
            # any later occurrence inside the question or answer text.
            if prompt.startswith(question_label):
                prompt = prompt[len(question_label):]
            if response.startswith(answer_label):
                response = response[len(answer_label):]

            raw_prob_data = problems[prob_id]
            if raw_prob_data['image'] is None:
                data = {
                    "id": prob_id,
                    "instruction": f"{prompt}",
                    "output": f"{response}",
                }

            else:
                # Problems with an image get the image path and an <image>
                # tag appended to the instruction.
                data = {
                    "id": prob_id,
                    "image": os.path.join(prob_id, raw_prob_data['image']),
                    "instruction": f"{prompt}\n<image>",
                    "output": f"{response}",
                }
            writer.write(json.dumps(data) + '\n')
81
+
82
+
83
def main(task, **kwargs):
    """Call the module-level callable named *task*, forwarding ``**kwargs``.

    A name that is not defined at module level raises ``KeyError``.
    """
    handler = globals()[task]
    handler(**kwargs)
85
+
86
+
87
+ if __name__ == "__main__":
88
+ fire.Fire(main)
scripts/convert_sqa_to_llava_base_prompt.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def get_question_text(problem):
    """Return the value stored under ``problem['question']``."""
    return problem['question']
4
+
5
+
6
def get_context_text(problem, use_caption):
    """Build the context string for a problem.

    Joins ``problem['hint']`` and, when ``use_caption`` is truthy,
    ``problem['caption']`` with a single space and strips the result.
    Returns ``"N/A"`` when nothing is left.
    """
    hint = problem['hint']
    # The caption is only looked up when requested, so it may be absent.
    caption = problem['caption'] if use_caption else ""
    merged = " ".join((hint, caption)).strip()
    return "N/A" if merged == "" else merged
13
+
14
+
15
def get_choice_text(probelm, options):
    """Render the answer choices as ``"(A) first (B) second ..."``.

    Each entry of ``probelm['choices']`` is labelled with the entry of
    ``options`` at the same position; the labelled choices are joined with
    single spaces.
    """
    labelled = [
        "({}) {}".format(options[idx], choice)
        for idx, choice in enumerate(probelm['choices'])
    ]
    return " ".join(labelled)
23
+
24
+
25
def get_answer(problem, options):
    """Return the entry of ``options`` indexed by ``problem['answer']``."""
    answer_index = problem['answer']
    return options[answer_index]
27
+
28
+
29
def get_lecture_text(problem):
    """Return ``problem['lecture']`` with each newline escaped as ``\\n``."""
    # Newlines become the two-character sequence backslash + n; the original
    # note says this lets GPT-3 generate the lecture with more tokens.
    return "\\n".join(problem['lecture'].split("\n"))
33
+
34
+
35
def get_solution_text(problem):
    """Return ``problem['solution']`` with each newline escaped as ``\\n``."""
    # Newlines become the two-character sequence backslash + n; the original
    # note says this lets GPT-3 generate the solution with more tokens.
    return "\\n".join(problem['solution'].split("\n"))
39
+
40
+
41
# Prompt layouts keyed by the input half of an "<input>-<output>" format code.
_INPUT_TEMPLATES = {
    "CQM": "Context: {context}\nQuestion: {question}\nOptions: {choice}\n",
    "QCM": "Question: {question}\nContext: {context}\nOptions: {choice}\n",
    # upper bound experiment
    "QCML": "Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n",
    "QCME": "Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n",
    "QCMLE": "Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n",
    "QCLM": "Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n",
    "QCEM": "Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n",
    "QCLEM": "Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n",
}

# Answer layouts keyed by the output half of the format code.
# NOTE(review): 'AL' renders the solution and 'AE' the lecture, the reverse of
# 'LA'/'EA'. Kept exactly as found -- confirm whether this is intended.
_OUTPUT_TEMPLATES = {
    'A': "Answer: The answer is {answer}.",
    'AL': "Answer: The answer is {answer}. BECAUSE: {solution}",
    'AE': "Answer: The answer is {answer}. BECAUSE: {lecture}",
    'ALE': "Answer: The answer is {answer}. BECAUSE: {lecture} {solution}",
    'AEL': "Answer: The answer is {answer}. BECAUSE: {solution} {lecture}",
    'LA': "Answer: {lecture} The answer is {answer}.",
    'EA': "Answer: {solution} The answer is {answer}.",
    'LEA': "Answer: {lecture} {solution} The answer is {answer}.",
    'ELA': "Answer: {solution} {lecture} The answer is {answer}.",
}

_BECAUSE_MARKER = "BECAUSE:"


def _render_input(input_format, question, context, choice, lecture, solution):
    """Fill the prompt template selected by *input_format*."""
    if input_format not in _INPUT_TEMPLATES:
        raise ValueError(f"unsupported input format: {input_format!r}")
    return _INPUT_TEMPLATES[input_format].format(
        question=question, context=context, choice=choice,
        lecture=lecture, solution=solution)


def _render_output(output_format, answer, lecture, solution, test_example):
    """Fill the answer template, or return the bare cue for a test example."""
    if test_example:
        return "Answer:"
    if output_format not in _OUTPUT_TEMPLATES:
        raise ValueError(f"unsupported output format: {output_format!r}")
    return _OUTPUT_TEMPLATES[output_format].format(
        answer=answer, lecture=lecture, solution=solution)


def _render_lepa_output(answer, lecture, solution):
    """Build the LEPA answer: optional lecture/solution, separator, answer."""
    pieces = []
    if lecture.strip():
        pieces.append(f"LECTURE: {lecture}\n")
    if solution.strip():
        pieces.append(f"SOLUTION: {solution}\n")
    pieces.append('###\n')
    pieces.append(f"ANSWER: {answer}.")
    return "".join(pieces)


def _squeeze(text):
    """Collapse each double space to one (single pass) and strip the ends."""
    # Empty template fields leave two adjacent spaces behind. The pattern is
    # spelled `" " * 2` so whitespace-normalising tools cannot turn it into
    # a no-op single-space replacement.
    return text.replace(" " * 2, " ").strip()


def _drop_trailing_because(text):
    """Remove a dangling ``BECAUSE:`` suffix left by an empty explanation."""
    if text.endswith(_BECAUSE_MARKER):
        # Cut only the suffix; the same marker inside the text is kept.
        return text[:-len(_BECAUSE_MARKER)].strip()
    return text


def create_one_example_chatbot(format, question, context, choice, answer, lecture, solution, test_example=True):
    """Render one problem as a separate (prompt, response) pair.

    Args:
        format: ``"<input>-<output>"`` code, e.g. ``"QCM-LEA"``. Besides the
            shared output codes this variant also accepts ``LEPA``.
        question, context, choice, answer, lecture, solution: Text fields
            substituted into the templates.
        test_example: When true the response is just ``"Answer:"``.

    Returns:
        Tuple ``(input, output)`` of cleaned-up strings.

    Raises:
        ValueError: If either half of ``format`` is not a known code.
    """
    input_format, output_format = format.split("-")

    prompt = _render_input(input_format, question, context, choice, lecture, solution)
    if output_format == 'LEPA' and not test_example:
        response = _render_lepa_output(answer, lecture, solution)
    else:
        response = _render_output(output_format, answer, lecture, solution, test_example)

    prompt = _drop_trailing_because(_squeeze(prompt))
    response = _drop_trailing_because(_squeeze(response))
    return prompt, response


def create_one_example(format, question, context, choice, answer, lecture, solution, test_example=True):
    """Render one problem as a single prompt string (input followed by output).

    Args:
        format: ``"<input>-<output>"`` code, e.g. ``"QCM-A"``.
        question, context, choice, answer, lecture, solution: Text fields
            substituted into the templates.
        test_example: When true the output part is just ``"Answer:"``.

    Returns:
        The concatenated, cleaned-up text.

    Raises:
        ValueError: If either half of ``format`` is not a known code.
    """
    input_format, output_format = format.split("-")

    prompt = _render_input(input_format, question, context, choice, lecture, solution)
    response = _render_output(output_format, answer, lecture, solution, test_example)

    return _drop_trailing_because(_squeeze(prompt + response))


def create_one_example_gpt4(format, question, context, choice, answer, lecture, solution, test_example=True):
    """Render one problem as a pair of chat messages.

    Args:
        format: ``"<input>-<output>"`` code, e.g. ``"QCM-EA"``.
        question, context, choice, answer, lecture, solution: Text fields
            substituted into the templates.
        test_example: When true the assistant content is just ``"Answer:"``.

    Returns:
        Tuple ``(user_prompt, assistant_prompt)`` of ``{"role", "content"}``
        dicts.

    Raises:
        ValueError: If either half of ``format`` is not a known code.
    """
    input_format, output_format = format.split("-")

    prompt = _squeeze(_render_input(input_format, question, context, choice, lecture, solution))
    # Only the output has its dangling marker removed in this variant.
    response = _drop_trailing_because(
        _squeeze(_render_output(output_format, answer, lecture, solution, test_example)))

    user_prompt = {"role": "user", "content": f"Can you explain {prompt}?"}
    assistant_prompt = {"role": "assistant", "content": f"{response}"}

    return user_prompt, assistant_prompt
219
+
220
+
221
def build_prompt_chatbot(problems, shot_qids, prompt_format, use_caption=False, options=["A", "B", "C", "D", "E"], is_test=False):
    """Render every listed problem as a chatbot (input, output) pair.

    Args:
        problems: Mapping from question id to problem dict.
        shot_qids: Iterable of question ids to render.
        prompt_format: ``"<input>-<output>"`` format code.
        use_caption: Whether the image caption is added to the context.
        options: Labels assigned to the answer choices, in order.
        is_test: Forwarded as ``test_example`` to the renderer.

    Returns:
        dict mapping each question id to the pair returned by
        ``create_one_example_chatbot``.
    """
    def render(problem):
        # The lecture/solution getters escape newlines; turn them back into
        # real newlines for the chatbot format.
        return create_one_example_chatbot(
            prompt_format,
            get_question_text(problem),
            get_context_text(problem, use_caption),
            get_choice_text(problem, options),
            get_answer(problem, options),
            get_lecture_text(problem).replace('\\n', '\n'),
            get_solution_text(problem).replace('\\n', '\n'),
            test_example=is_test,
        )

    return {qid: render(problems[qid]) for qid in shot_qids}
242
+
243
+
244
def build_prompt(problems, shot_qids, test_qid, args):
    """Assemble an n-shot text prompt that ends with the test question.

    Args:
        problems: Mapping from question id to problem dict.
        shot_qids: Ids of the worked examples placed before the test question.
        test_qid: Id of the question left for the model to answer.
        args: Object providing ``prompt_format``, ``use_caption`` and
            ``options``.

    Returns:
        The rendered examples joined by blank lines, test example last.
    """
    def render(qid, as_test):
        problem = problems[qid]
        return create_one_example(
            args.prompt_format,
            get_question_text(problem),
            get_context_text(problem, args.use_caption),
            get_choice_text(problem, args.options),
            get_answer(problem, args.options),
            get_lecture_text(problem),
            get_solution_text(problem),
            test_example=as_test,
        )

    # n-shot training examples first, then the test example.
    rendered = [render(qid, False) for qid in shot_qids]
    rendered.append(render(test_qid, True))

    return '\n\n'.join(rendered)
289
+
290
+
291
def build_prompt_gpt4(problems, shot_qids, test_qid, args):
    """Assemble an n-shot chat message list that ends with the test question.

    Args:
        problems: Mapping from question id to problem dict.
        shot_qids: Ids of the worked examples placed before the test question.
        test_qid: Id of the question left for the model to answer.
        args: Object providing ``prompt_format``, ``use_caption`` and
            ``options``.

    Returns:
        List of ``{"role", "content"}`` dicts: one system message, then a
        user/assistant pair per shot, then the pair for the test question.
    """
    def render(qid, as_test):
        problem = problems[qid]
        return create_one_example_gpt4(
            args.prompt_format,
            get_question_text(problem),
            get_context_text(problem, args.use_caption),
            get_choice_text(problem, args.options),
            get_answer(problem, args.options),
            get_lecture_text(problem),
            get_solution_text(problem),
            test_example=as_test,
        )

    messages = [{"role": "system", "content": "You are a helpful assistant."}]

    # Each rendered example is a (user, assistant) pair; extend adds both.
    for qid in shot_qids:
        messages.extend(render(qid, False))

    # The test example's assistant message is appended as well.
    messages.extend(render(test_qid, True))

    return messages
scripts/convert_vizwiz_for_submission.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+
5
+ from llava.eval.m4c_evaluator import EvalAIAnswerProcessor
6
+
7
+
8
def parse_args():
    """Parse the command line: three mandatory string options.

    ``--annotation-file``, ``--result-file`` and ``--result-upload-file`` are
    all required; they are exposed on the returned namespace as
    ``annotation_file``, ``result_file`` and ``result_upload_file``.
    """
    cli = argparse.ArgumentParser()
    for flag in ('--annotation-file', '--result-file', '--result-upload-file'):
        cli.add_argument(flag, type=str, required=True)
    return cli.parse_args()
14
+
15
+
16
if __name__ == '__main__':
    # Read per-question predictions (JSONL), pair them with the annotation
    # split (JSONL) and write one JSON list of {'image', 'answer'} records.

    args = parse_args()

    # dirname is '' when the upload file sits in the current directory, and
    # os.makedirs('') raises FileNotFoundError - only create real directories.
    upload_dir = os.path.dirname(args.result_upload_file)
    if upload_dir:
        os.makedirs(upload_dir, exist_ok=True)

    results = []
    error_line = 0
    with open(args.result_file) as result_fp:
        for line in result_fp:
            try:
                results.append(json.loads(line))
            except ValueError:
                # json.JSONDecodeError is a ValueError: count malformed lines
                # and keep going, but let unrelated errors surface.
                error_line += 1
    results = {x['question_id']: x['text'] for x in results}

    with open(args.annotation_file) as annotation_fp:
        test_split = [json.loads(line) for line in annotation_fp]

    print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}')

    all_answers = []

    answer_processor = EvalAIAnswerProcessor()

    for x in test_split:
        # Every annotated question must have a prediction.
        assert x['question_id'] in results, f"no prediction for question_id {x['question_id']}"
        all_answers.append({
            'image': x['image'],
            'answer': answer_processor(results[x['question_id']])
        })

    with open(args.result_upload_file, 'w') as f:
        json.dump(all_answers, f)
scripts/convert_vqav2_for_submission.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+
5
+ from llava.eval.m4c_evaluator import EvalAIAnswerProcessor
6
+
7
+
8
def parse_args():
    """Parse the command line.

    ``--dir`` is optional and defaults to ``./playground/data/eval/vqav2``;
    ``--ckpt`` and ``--split`` are required. All three are strings.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--dir', type=str, default="./playground/data/eval/vqav2")
    for mandatory_flag in ('--ckpt', '--split'):
        cli.add_argument(mandatory_flag, type=str, required=True)
    return cli.parse_args()
14
+
15
+
16
if __name__ == '__main__':
    # Read <dir>/answers/<split>/<ckpt>/merge.jsonl, pair it with the test
    # split listing and write <dir>/answers_upload/<split>/<ckpt>.json as one
    # JSON list of {'question_id', 'answer'} records.

    args = parse_args()

    src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl')
    test_split_path = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl')
    dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json')
    os.makedirs(os.path.dirname(dst), exist_ok=True)

    results = []
    error_line = 0
    with open(src) as src_fp:
        for line in src_fp:
            try:
                results.append(json.loads(line))
            except ValueError:
                # json.JSONDecodeError is a ValueError: count malformed lines
                # and keep going, but let unrelated errors surface.
                error_line += 1

    results = {x['question_id']: x['text'] for x in results}
    with open(test_split_path) as split_fp:
        test_split = [json.loads(line) for line in split_fp]

    print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}')

    all_answers = []

    answer_processor = EvalAIAnswerProcessor()

    for x in test_split:
        if x['question_id'] not in results:
            # Questions without a prediction are still emitted, with an
            # empty answer.
            all_answers.append({
                'question_id': x['question_id'],
                'answer': ''
            })
        else:
            all_answers.append({
                'question_id': x['question_id'],
                'answer': answer_processor(results[x['question_id']])
            })

    # Write through the handle owned by the with-block; the original opened
    # dst a second time inside json.dump and never closed that handle.
    with open(dst, 'w') as f:
        json.dump(all_answers, f)
scripts/extract_mm_projector.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This is just a utility that I use to extract the projector for quantized models.
3
+ It is NOT necessary at all to train, or run inference/serve demos.
4
+ Use this script ONLY if you fully understand its implications.
5
+ """
6
+
7
+
8
+ import os
9
+ import argparse
10
+ import torch
11
+ import json
12
+ from collections import defaultdict
13
+
14
+
15
def parse_args():
    """Parse the command line for the projector-extraction script.

    Both options are required: the script joins ``model_path`` into file
    paths and saves to ``output`` unconditionally, so omitting either one
    previously surfaced only later as a ``TypeError`` on a ``None`` value.

    Returns:
        Namespace with ``model_path`` (model folder) and ``output``
        (output file).
    """
    parser = argparse.ArgumentParser(description='Extract MMProjector weights')
    parser.add_argument('--model-path', type=str, required=True, help='model folder')
    parser.add_argument('--output', type=str, required=True, help='output file')
    args = parser.parse_args()
    return args
21
+
22
+
23
if __name__ == '__main__':
    # Collect every weight whose key contains one of keys_to_match from the
    # checkpoint file(s) under args.model_path and save them to args.output.
    args = parse_args()

    keys_to_match = ['mm_projector']
    ckpt_to_key = defaultdict(list)
    # Checkpoints already loaded while listing keys, reused below so the
    # single-file case is not read from disk twice.
    preloaded = {}
    index_path = os.path.join(args.model_path, 'pytorch_model.bin.index.json')
    try:
        with open(index_path) as index_fp:
            model_indices = json.load(index_fp)
    except FileNotFoundError:
        # Smaller models or model checkpoints saved by DeepSpeed.
        v = 'pytorch_model.bin'
        preloaded[v] = torch.load(os.path.join(args.model_path, v), map_location='cpu')
        for k in preloaded[v]:
            if any(key_match in k for key_match in keys_to_match):
                ckpt_to_key[v].append(k)
    else:
        # The index maps each weight key to the checkpoint file holding it.
        for k, v in model_indices['weight_map'].items():
            if any(key_match in k for key_match in keys_to_match):
                ckpt_to_key[v].append(k)

    loaded_weights = {}

    for ckpt_name, weight_keys in ckpt_to_key.items():
        ckpt = preloaded.get(ckpt_name)
        if ckpt is None:
            ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu')
        for k in weight_keys:
            loaded_weights[k] = ckpt[k]

    torch.save(loaded_weights, args.output)
scripts/finetune.sh ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!

# Launches llava/train/train_mem.py under deepspeed with ./scripts/zero2.json.
# Uncomment and set the following variables correspondingly to run this script
# (or export them before calling it):

################## VICUNA ##################
# PROMPT_VERSION=v1
# MODEL_VERSION="vicuna-v1-3-7b"
################## VICUNA ##################

################## LLaMA-2 ##################
# PROMPT_VERSION="llava_llama_2"
# MODEL_VERSION="llama-2-7b-chat"
################## LLaMA-2 ##################

# Stop here with a clear message if either variable is missing; otherwise the
# command below would receive an empty --version value and a bare
# ./checkpoints/ model path.
: "${PROMPT_VERSION:?PROMPT_VERSION is not set - uncomment one of the blocks above}"
: "${MODEL_VERSION:?MODEL_VERSION is not set - uncomment one of the blocks above}"

deepspeed llava/train/train_mem.py \
    --deepspeed ./scripts/zero2.json \
    --model_name_or_path "./checkpoints/${MODEL_VERSION}" \
    --version "${PROMPT_VERSION}" \
    --data_path ./playground/data/llava_instruct_80k.json \
    --image_folder /path/to/coco/train2017 \
    --vision_tower openai/clip-vit-large-patch14 \
    --pretrain_mm_mlp_adapter "./checkpoints/llava-${MODEL_VERSION}-pretrain/mm_projector.bin" \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --bf16 True \
    --output_dir "./checkpoints/llava-${MODEL_VERSION}-finetune" \
    --num_train_epochs 1 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 50000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb
scripts/finetune/test_llava.sh CHANGED
@@ -1,4 +1,9 @@
1
- accelerate launch --config_file ./hf_config/single_gpu_config.yml \
2
  run_finetune_llava.py \
3
  --test \
4
- --checkpoint_path ./model_ckpt/llava3_mix_instr/checkpoints/checkpoint_00003 \
 
 
 
 
 
 
1
+ accelerate launch --config_file ./accelerator_config/gpu_4_config.yml \
2
  run_finetune_llava.py \
3
  --test \
4
+ --data_path /mnt1/wjl/InternLM-XComposer/output/GT/pwiseg_caption_instruct_0712_test.json \
5
+ --output_dir ./eval_output/results_pwiseg_ori \
6
+ # --lora_ckpt_path /mnt1/lyc/llava_finetune/model_ckpt/llama3-llava-next-8b-task-lora \
7
+ # --ckpt_path ./model_ckpt/llava3_mix_instr/checkpoints/checkpoint_00003 \
8
+ # --data_path /mnt1/lyc/llava_finetune/data_json/4dor_caption_instruct_0711_test.json \
9
+ # --data_path /mnt1/wjl/InternLM-XComposer/output/GT/pwiseg_count_instruct_0712_test.json
scripts/finetune_full_schedule.sh ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!

# Launches llava/train/train_mem.py under deepspeed with ./scripts/zero2.json,
# using llava_instruct_158k.json for 3 epochs.
# Uncomment and set the following variables correspondingly to run this script
# (or export them before calling it):

################## VICUNA ##################
# PROMPT_VERSION=v1
# MODEL_VERSION="vicuna-v1-3-7b"
################## VICUNA ##################

################## LLaMA-2 ##################
# PROMPT_VERSION="llava_llama_2"
# MODEL_VERSION="llama-2-7b-chat"
################## LLaMA-2 ##################

# Stop here with a clear message if either variable is missing; otherwise the
# command below would receive an empty --version value and a bare
# ./checkpoints/ model path.
: "${PROMPT_VERSION:?PROMPT_VERSION is not set - uncomment one of the blocks above}"
: "${MODEL_VERSION:?MODEL_VERSION is not set - uncomment one of the blocks above}"

deepspeed llava/train/train_mem.py \
    --deepspeed ./scripts/zero2.json \
    --model_name_or_path "./checkpoints/${MODEL_VERSION}" \
    --version "${PROMPT_VERSION}" \
    --data_path ./playground/data/llava_instruct_158k.json \
    --image_folder /path/to/coco/train2017 \
    --vision_tower openai/clip-vit-large-patch14 \
    --pretrain_mm_mlp_adapter "./checkpoints/llava-${MODEL_VERSION}-pretrain/mm_projector.bin" \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --bf16 True \
    --output_dir "./checkpoints/llava-${MODEL_VERSION}-finetune" \
    --num_train_epochs 3 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 50000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb
scripts/finetune_lora.sh ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!

# Launches llava/train/train_mem.py under deepspeed with ./scripts/zero2.json
# and --lora_enable True.
# Uncomment and set the following variables correspondingly to run this script
# (or export them before calling it):

################## VICUNA ##################
# PROMPT_VERSION=v1
# MODEL_VERSION="vicuna-v1-3-7b"
################## VICUNA ##################

################## LLaMA-2 ##################
# PROMPT_VERSION="llava_llama_2"
# MODEL_VERSION="llama-2-7b-chat"
################## LLaMA-2 ##################

# Stop here with a clear message if either variable is missing; otherwise the
# command below would receive an empty --version value and a bare
# ./checkpoints/ model path.
: "${PROMPT_VERSION:?PROMPT_VERSION is not set - uncomment one of the blocks above}"
: "${MODEL_VERSION:?MODEL_VERSION is not set - uncomment one of the blocks above}"

deepspeed llava/train/train_mem.py \
    --deepspeed ./scripts/zero2.json \
    --lora_enable True \
    --model_name_or_path "./checkpoints/${MODEL_VERSION}" \
    --version "${PROMPT_VERSION}" \
    --data_path ./playground/data/llava_instruct_80k.json \
    --image_folder /path/to/coco/train2017 \
    --vision_tower openai/clip-vit-large-patch14 \
    --pretrain_mm_mlp_adapter "./checkpoints/llava-${MODEL_VERSION}-pretrain/mm_projector.bin" \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --bf16 True \
    --output_dir "./checkpoints/llava-${MODEL_VERSION}-finetune_lora" \
    --num_train_epochs 1 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 50000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --lazy_preprocess True \
    --dataloader_num_workers 4 \
    --report_to wandb
scripts/finetune_lora_my.sh ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!

# LoRA fine-tuning launcher for the 4D-OR instruction data. The LLaMA-2
# settings below are active; the Vicuna pair is kept for reference.

################## VICUNA ##################
# PROMPT_VERSION=v1
# MODEL_VERSION="vicuna-v1-3-7b"
################## VICUNA ##################

################## LLaMA-2 ##################
PROMPT_VERSION="llava_llama_2"
MODEL_VERSION="llama-2-7b-chat"
################## LLaMA-2 ##################

# Training flags, passed to train_mem.py in this order.
train_args=(
    --deepspeed ./scripts/zero2.json
    --lora_enable True
    --model_name_or_path "./checkpoints/${MODEL_VERSION}"
    --version "${PROMPT_VERSION}"
    --data_path /mnt1/wjl/InternLM-XComposer/data/4D-OR-instruct/llava_3d_0503_train.json
    --image_folder /mnt1/wjl/InternLM-XComposer/data/4D-OR-MV
    --vision_tower openai/clip-vit-large-patch14
    --pretrain_mm_mlp_adapter "./checkpoints/llava-${MODEL_VERSION}-pretrain/mm_projector.bin"
    --mm_vision_select_layer -2
    --mm_use_im_start_end False
    --mm_use_im_patch_token False
    --bf16 True
    --output_dir "./checkpoints/llava-${MODEL_VERSION}-finetune_lora"
    --num_train_epochs 1
    --per_device_train_batch_size 16
    --per_device_eval_batch_size 4
    --gradient_accumulation_steps 1
    --evaluation_strategy "no"
    --save_strategy "steps"
    --save_steps 50000
    --save_total_limit 1
    --learning_rate 2e-5
    --weight_decay 0.
    --warmup_ratio 0.03
    --lr_scheduler_type "cosine"
    --logging_steps 1
    --tf32 True
    --model_max_length 2048
    --gradient_checkpointing True
    --lazy_preprocess True
    --dataloader_num_workers 4
    --report_to wandb
)

deepspeed llava/train/train_mem.py "${train_args[@]}"
scripts/finetune_qlora.sh ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!

# Launches llava/train/train_mem.py under deepspeed with ./scripts/zero2.json,
# --lora_enable True and --bits 4.
# Uncomment and set the following variables correspondingly to run this script
# (or export them before calling it):

################## VICUNA ##################
# PROMPT_VERSION=v1
# MODEL_VERSION="vicuna-v1-3-7b"
################## VICUNA ##################

################## LLaMA-2 ##################
# PROMPT_VERSION="llava_llama_2"
# MODEL_VERSION="llama-2-7b-chat"
################## LLaMA-2 ##################

# Stop here with a clear message if either variable is missing; otherwise the
# command below would receive an empty --version value and a bare
# ./checkpoints/ model path.
: "${PROMPT_VERSION:?PROMPT_VERSION is not set - uncomment one of the blocks above}"
: "${MODEL_VERSION:?MODEL_VERSION is not set - uncomment one of the blocks above}"

deepspeed llava/train/train_mem.py \
    --deepspeed ./scripts/zero2.json \
    --lora_enable True \
    --bits 4 \
    --model_name_or_path "./checkpoints/${MODEL_VERSION}" \
    --version "${PROMPT_VERSION}" \
    --data_path ./playground/data/llava_instruct_80k.json \
    --image_folder /path/to/coco/train2017 \
    --vision_tower openai/clip-vit-large-patch14 \
    --pretrain_mm_mlp_adapter "./checkpoints/llava-${MODEL_VERSION}-pretrain/mm_projector.bin" \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --bf16 True \
    --output_dir "./checkpoints/llava-${MODEL_VERSION}-finetune_lora" \
    --num_train_epochs 1 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 50000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --lazy_preprocess True \
    --dataloader_num_workers 4 \
    --report_to wandb
scripts/finetune_sqa.sh ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!

# Fine-tunes lmsys/vicuna-13b-v1.3 on the ScienceQA QCM-LEA training file for
# 12 epochs via llava/train/train_mem.py under deepspeed.

# PROMPT_VERSION is consumed below but never assigned in this script, so it
# must come from the environment. Fail early instead of passing an empty
# --version value.
# NOTE(review): the sibling fine-tuning scripts pair Vicuna with
# PROMPT_VERSION=v1 - confirm that is the intended value here.
: "${PROMPT_VERSION:?PROMPT_VERSION is not set - export it before running this script}"

deepspeed llava/train/train_mem.py \
    --deepspeed ./scripts/zero2.json \
    --model_name_or_path lmsys/vicuna-13b-v1.3 \
    --version "${PROMPT_VERSION}" \
    --data_path /Data/ScienceQA/data/scienceqa/llava_train_QCM-LEA.json \
    --image_folder /Data/ScienceQA/data/scienceqa/images/train \
    --vision_tower openai/clip-vit-large-patch14 \
    --pretrain_mm_mlp_adapter ./checkpoints/huggingface/liuhaotian/llava-pretrain-vicuna-13b-v1.3/mm_projector.bin \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --bf16 True \
    --output_dir ./checkpoints/llava-vicuna-13b-v1.3-pretrain_lcs558k_plain-ScienceQA_QCM_LEA-12e \
    --num_train_epochs 12 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 50000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb
scripts/merge_lora_weights.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from llava.model.builder import load_pretrained_model
3
+ from llava.mm_utils import get_model_name_from_path
4
+
5
+
6
def merge_lora(args):
    """Load a model and write it, with its tokenizer, to a new directory.

    The model is loaded on CPU from ``args.model_path`` together with
    ``args.model_base``; the model and tokenizer are then saved to
    ``args.save_model_path``.
    NOTE(review): presumably ``load_pretrained_model`` folds the LoRA weights
    into the base model - confirm in ``llava.model.builder``.
    """
    name = get_model_name_from_path(args.model_path)
    loaded = load_pretrained_model(args.model_path, args.model_base, name, device_map='cpu')
    # Only the tokenizer and the model are needed; the image processor and
    # context length are ignored.
    tokenizer, model, _image_processor, _context_len = loaded

    for artifact in (model, tokenizer):
        artifact.save_pretrained(args.save_model_path)
12
+
13
+
14
if __name__ == "__main__":
    # All three options are mandatory strings; the parsed namespace is handed
    # straight to merge_lora.
    cli = argparse.ArgumentParser()
    for option in ("--model-path", "--model-base", "--save-model-path"):
        cli.add_argument(option, type=str, required=True)

    merge_lora(cli.parse_args())
scripts/pretrain.sh ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!

# Launches llava/train/train_mem.py under deepspeed with
# --tune_mm_mlp_adapter True.
# Uncomment and set the following variables correspondingly to run this script
# (or export MODEL_VERSION before calling it):

# MODEL_VERSION=vicuna-v1-3-7b
# MODEL_VERSION=llama-2-7b-chat

########### DO NOT CHANGE ###########
########### USE THIS FOR BOTH ###########
PROMPT_VERSION=plain
########### DO NOT CHANGE ###########

# Stop here with a clear message if MODEL_VERSION is missing; otherwise the
# command below would point at a bare ./checkpoints/ model path.
: "${MODEL_VERSION:?MODEL_VERSION is not set - uncomment one of the lines above}"

deepspeed llava/train/train_mem.py \
    --deepspeed ./scripts/zero2.json \
    --model_name_or_path "./checkpoints/${MODEL_VERSION}" \
    --version "${PROMPT_VERSION}" \
    --data_path /path/to/pretrain_data.json \
    --image_folder /path/to/images \
    --vision_tower openai/clip-vit-large-patch14 \
    --tune_mm_mlp_adapter True \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --bf16 True \
    --output_dir "./checkpoints/llava-${MODEL_VERSION}-pretrain" \
    --num_train_epochs 1 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 24000 \
    --save_total_limit 1 \
    --learning_rate 2e-3 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb
scripts/pretrain_xformers.sh ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# Launches llava/train/train_xformers.py under deepspeed with
# --tune_mm_mlp_adapter True, bf16/tf32 disabled, and a per-device batch of 4
# with 4 gradient-accumulation steps.
# Uncomment and set the following variables correspondingly to run this script
# (or export MODEL_VERSION before calling it):

# MODEL_VERSION=vicuna-v1-3-7b
# MODEL_VERSION=llama-2-7b-chat

########### DO NOT CHANGE ###########
########### USE THIS FOR BOTH ###########
PROMPT_VERSION=plain
########### DO NOT CHANGE ###########

# Stop here with a clear message if MODEL_VERSION is missing; otherwise the
# command below would point at a bare ./checkpoints/ model path.
: "${MODEL_VERSION:?MODEL_VERSION is not set - uncomment one of the lines above}"

deepspeed llava/train/train_xformers.py \
    --deepspeed ./scripts/zero2.json \
    --model_name_or_path "./checkpoints/${MODEL_VERSION}" \
    --version "${PROMPT_VERSION}" \
    --data_path /path/to/pretrain_data.json \
    --image_folder /path/to/images \
    --vision_tower openai/clip-vit-large-patch14 \
    --tune_mm_mlp_adapter True \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --bf16 False \
    --output_dir "./checkpoints/llava-${MODEL_VERSION}-pretrain" \
    --num_train_epochs 1 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 24000 \
    --save_total_limit 1 \
    --learning_rate 2e-3 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 False \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb
scripts/sqa_eval_batch.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# Starts one background llava.eval.model_vqa_science process per chunk; each
# process sees only the GPU whose index equals its chunk index.
# The script returns as soon as all processes are launched.

CHUNKS=8
# Derive the loop bound from CHUNKS so the two cannot drift apart.
for ((IDX = 0; IDX < CHUNKS; IDX++)); do
    # The answers file was written as "chunk$CHUNKS_$IDX": bash reads that as
    # the variable CHUNKS_ (normally unset, so empty) followed by IDX, giving
    # "chunk<IDX>". That resulting name is kept, but spelled explicitly so it
    # no longer depends on an unrelated variable or breaks under `set -u`.
    # NOTE(review): presumably a later gather step reads chunk<IDX>.jsonl -
    # confirm before switching to a "chunk<CHUNKS>_<IDX>" scheme.
    CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \
        --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \
        --question-file ~/haotian/datasets/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \
        --image-folder ~/haotian/datasets/ScienceQA/data/scienceqa/images/test \
        --answers-file "./test_llava-13b-chunk${IDX}.jsonl" \
        --num-chunks "$CHUNKS" \
        --chunk-idx "$IDX" \
        --conv-mode llava_v1 &
done