|
import argparse
import json
import os

import torch
from qwen_vl_utils import process_vision_info
from tqdm import tqdm
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
|
|
|
def read_json(file_path: "str | os.PathLike[str]") -> object:
    """Load and return the JSON document stored at *file_path*.

    Args:
        file_path: Path of the JSON file to read. It is decoded as UTF-8.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data
|
|
|
def write_json(file_path, data):
    """Serialize *data* as pretty-printed UTF-8 JSON into *file_path*.

    The document is first written to a sibling ``<file_path>.tmp`` file and
    then moved over the destination with ``os.replace``. Opening the
    destination directly in ``'w'`` mode truncates it immediately, so a crash
    or a serialization error in the middle of a write would destroy the
    previously saved content; going through a temporary file keeps the old
    file intact until the new one is complete.

    Args:
        file_path: Destination path (``str``, ``bytes`` or path-like).
        data: Any JSON-serializable value.

    Raises:
        TypeError: If *data* contains a value ``json`` cannot serialize.
        OSError: If the temporary file cannot be written or moved in place.
    """
    target = os.fspath(file_path)
    # Build the temp name with the same string type as the target path.
    suffix = b".tmp" if isinstance(target, bytes) else ".tmp"
    tmp_path = target + suffix
    try:
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=4)
        os.replace(tmp_path, target)
    finally:
        # After a successful replace the temp file no longer exists; this only
        # cleans up the partial file left behind by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
|
|
|
|
|
# Report how many CUDA devices are visible to this process.
print(torch.cuda.device_count())

# Command-line options. Flag names, types and defaults are part of the
# script's interface and are kept exactly as they were.
parser = argparse.ArgumentParser(
    description="Generate text with a Qwen2.5-VL checkpoint for a slice of a JSON dataset."
)
parser.add_argument(
    "--model_path",
    type=str,
    default="/home/zbz5349/WorkSpace/aigeeks/Qwen2.5-VL/ckpt_7B",
    help="Checkpoint directory (or model id) to load the model and processor from.",
)
parser.add_argument(
    "--begin",
    type=int,
    default=0,
    help="Index of the first dataset entry to process.",
)
parser.add_argument(
    "--end",
    type=int,
    default=4635,
    help="Index one past the last dataset entry to process.",
)
parser.add_argument(
    "--batch_size",
    type=int,
    default=16,
    help="Number of dataset entries sent to the model per generate() call.",
)
parser.add_argument(
    "--data_path",
    type=str,
    default="/home/zbz5349/WorkSpace/aigeeks/Qwen2.5-VL/magicbrush_dataset/dataset.json",
    help="JSON file containing the input dataset.",
)
parser.add_argument(
    "--prompt_path",
    type=str,
    default="/home/zbz5349/WorkSpace/aigeeks/Qwen2.5-VL/magicbrush_dataset/gen.json",
    help="JSON file the generated records are written to.",
)

args = parser.parse_args()
|
# Load the vision-language model from the checkpoint given on the command
# line: bfloat16 weights, FlashAttention-2 attention kernels, and automatic
# placement of the weights on the available devices.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    args.model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

# The processor is loaded from the same checkpoint so that its chat template
# and preprocessing match the model.
processor = AutoProcessor.from_pretrained(args.model_path)
print(model.device)
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Batched generation over data[begin:end]; results are saved to json_path.
# ---------------------------------------------------------------------------
data = read_json(args.data_path)
save_data = []  # one output record per processed sample, in dataset order
begin = args.begin
# Clamp to the dataset size: slicing past the end would otherwise yield empty
# batches that get handed to the processor and the model.
end = min(args.end, len(data))
batch_size = args.batch_size
json_path = args.prompt_path

MAX_NEW_TOKENS = 128   # generation budget per sample
CHECKPOINT_EVERY = 4   # write partial results every N batches

# Prompts in a batch differ in length, so they get padded. For decoder-only
# generation the padding must be on the left, otherwise pad tokens end up
# between the prompt and the generated continuation.
processor.tokenizer.padding_side = "left"


def _generate_batch(batch):
    """Return one decoded model answer per message in *batch*, in order.

    Each dataset entry is treated as its own single-turn conversation.
    Passing the whole batch to the chat template as one conversation would
    merge all entries into a single multi-turn prompt and yield only one
    answer for the entire batch.
    """
    conversations = [[message] for message in batch]
    texts = [
        processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        for conversation in conversations
    ]
    image_inputs, video_inputs = process_vision_info(conversations)
    inputs = processor(
        text=texts,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
    # generate() returns prompt + continuation; drop the prompt part.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    return processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )


def _build_record(message, result):
    """Build the output record for one dataset entry and its generated text.

    Reads the image reference from ``message['content'][0]`` and the prompt
    text from ``message['content'][1]``.
    """
    return {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": message['content'][0]['image'],
            },
            {
                "type": "text",
                "text": message['content'][1]['text'],
            },
        ],
        # Kept as a one-element list: that is the shape batch_decode's output
        # had when it was stored directly, so existing readers keep working.
        "result": [result],
    }


for batch_number, batch_idx in enumerate(tqdm(range(begin, end, batch_size))):
    batch = data[batch_idx:min(batch_idx + batch_size, end)]
    print(len(batch))

    output_texts = _generate_batch(batch)
    # Save a record for every sample of the batch, not just the last one.
    save_data.extend(
        _build_record(message, result)
        for message, result in zip(batch, output_texts)
    )

    # Checkpoint on the batch count rather than the dataset offset, so the
    # cadence does not depend on --begin / --batch_size.
    if batch_number % CHECKPOINT_EVERY == 0:
        write_json(json_path, save_data)
        print(len(save_data))

# Final write so the batches after the last checkpoint are persisted as well.
write_json(json_path, save_data)
|
|
|
|