import json
import os
from tqdm import tqdm
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
def read_json(file_path):
    """Load and return the parsed contents of a JSON file."""
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

def write_json(file_path, data):
    """Dump `data` to a JSON file, keeping non-ASCII characters readable."""
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
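# Example round-trip for the two helpers above (a minimal sketch;
# "demo.json" is a hypothetical path used only for illustration):
#   write_json("demo.json", {"status": "ok"})
#   assert read_json("demo.json")["status"] == "ok"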
# default: Load the model on the available device(s)
model_path = '/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/albus/ICCV_2025/qvq/models/QVQ-72B-Preview'
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path, torch_dtype="auto", device_map="auto"
)
# default processor
processor = AutoProcessor.from_pretrained(model_path)
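# Suggested addition (not in the original script): decoder-only models
# generate more reliably in batches when prompts are left-padded, so the
# prompt tokens sit flush against the newly generated ones. Assumes the
# processor exposes its tokenizer as `processor.tokenizer`, which is
# standard for Hugging Face processors.
processor.tokenizer.padding_side = "left"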
# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
#processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview", min_pixels=min_pixels, max_pixels=max_pixels)
# import argparse
# parser = argparse.ArgumentParser(description="Process a dataset with specific index range.")
# parser.add_argument("--batch_size", type=int, default=1, help="batch size")
# # parser.add_argument("--index", type=int, default=0, help="index")
# args = parser.parse_args()
folder = "/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/albus/ICCV_2025/qvq/dataset"
file_names = os.listdir(folder)
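# Defensive filtering (an added assumption: the dataset folder may contain
# non-image entries). Keep common image extensions and sort so the batch
# order is deterministic across runs.
file_names = sorted(
    f for f in file_names if f.lower().endswith((".png", ".jpg", ".jpeg", ".webp"))
)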
num_image = len(file_names)
begin, end, batch_size = 0, num_image, 6
print(f"begin : {begin}, end : {end}, batch_size : {batch_size}")
messages = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."}
        ],
    },
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/QVQ/demo.png",
            },
            {"type": "text", "text": "Please describe in detail the content of the picture."},
        ],
    },
]
# Preparation for inference
ans = []
counter = 0
save_path = "output_final.json"
for batch_idx in tqdm(range(begin, end, batch_size)):
    up = min(batch_idx + batch_size, end)
    batch = file_names[batch_idx:up]
    print(f"data index range : {batch_idx} ~ {up}")
    text_batch, image_inputs_batch = [], []
    for file_name in batch:
        img_path = os.path.join(folder, file_name)
        # Point the user turn at the current image before rendering the prompt.
        messages[1]["content"][0]["image"] = img_path
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        text_batch.append(text)
        # video_inputs is None for image-only conversations, so only the
        # image inputs are collected for the batch.
        image_inputs, video_inputs = process_vision_info(messages)
        image_inputs_batch.append(image_inputs)
    inputs = processor(
        text=text_batch,
        images=image_inputs_batch,
        videos=None,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")
    # Inference: generate outputs for the whole batch at once.
    generated_ids = model.generate(**inputs, max_new_tokens=8192)
    # Strip the prompt tokens so only the newly generated text is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    ans.append(output_text)
    counter = counter + 1
    # Checkpoint every 10 batches, and again on the final stretch of the dataset.
    if counter % 10 == 0 or up + 10 >= end:
        print(f"Saving data at iteration {counter}")
        write_json(save_path, ans)
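# Final save outside the loop (an added safety net): persists everything
# collected even if the run ends between periodic checkpoints.
write_json(save_path, ans)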