import copy
import json
import os

from tqdm import tqdm
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

def read_json(file_path): 
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

def write_json(file_path, data):
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

# default: Load the model on the available device(s)
model_path = '/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/albus/ICCV_2025/qvq/models/QVQ-72B-Preview'
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path, torch_dtype="auto", device_map="auto"
)

# default processor
processor = AutoProcessor.from_pretrained(model_path)

# The default range for the number of visual tokens per image is 4-16384.
# Set min_pixels and max_pixels to your needs, e.g. a token budget of
# 256-1280 per image, to balance speed and memory usage:
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview", min_pixels=min_pixels, max_pixels=max_pixels)
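
# A minimal sketch of the bounded variant described above, using the local
# model_path instead of the Hub ID. `bounded_processor` is an illustrative
# name and is not used below; 28*28 pixels correspond to one visual token.
bounded_processor = AutoProcessor.from_pretrained(
    model_path, min_pixels=256 * 28 * 28, max_pixels=1280 * 28 * 28
)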

folder = "/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/albus/ICCV_2025/qvq/dataset"

# Assumes the dataset folder contains only image files.
file_names = os.listdir(folder)
num_images = len(file_names)

begin, end, batch_size = 0, num_images, 6
print(f"begin : {begin}, end : {end}, batch_size : {batch_size}")
# Chat template shared across samples; the "image" field below is a
# placeholder that is overwritten with each local image path in the loop.
messages = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."}
        ],
    },
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/QVQ/demo.png",
            },
            {"type": "text", "text": "Please describe in detail the content of the picture."},
        ],
    },
]

# Preparation for inference
save_path = "output_final.json"
ans = []
counter = 0
for batch_idx in tqdm(range(begin, end, batch_size)):
    up = min(batch_idx + batch_size, end)
    batch = file_names[batch_idx:up]
    print(f"data index range : {batch_idx} ~ {up}")

    # Build one copy of the chat template per image, pointing the image
    # placeholder at the local file.
    batch_messages = []
    for name in batch:
        sample = copy.deepcopy(messages)
        sample[1]["content"][0]["image"] = os.path.join(folder, name)
        batch_messages.append(sample)

    text_batch = [
        processor.apply_chat_template(m, tokenize=False, add_generation_prompt=True)
        for m in batch_messages
    ]
    # process_vision_info accepts a list of conversations and returns the
    # images flattened across the batch (videos are unused here).
    image_inputs, video_inputs = process_vision_info(batch_messages)
    inputs = processor(
        text=text_batch,
        images=image_inputs,
        videos=None,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Inference: generate the output for the whole batch.
    generated_ids = model.generate(**inputs, max_new_tokens=8192)
    # Strip the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    ans.append(output_text)

    # Checkpoint every 10 batches, and on each batch within 10 files of the end.
    counter += 1
    if counter % 10 == 0 or up + 10 >= end:
        print(f"Saving data at batch {counter}")
        write_json(save_path, ans)
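
# Optional sanity check, a minimal sketch assuming the structure written
# above (a JSON list of batches, each a list of decoded strings): reload
# the saved answers and flatten them into one list of descriptions.
results = read_json(save_path)
descriptions = [text for batch in results for text in batch]
print(f"{len(descriptions)} descriptions saved to {save_path}")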