Wendy-Fly committed
Commit 3946de7 · verified · 1 Parent(s): 118a0aa

Upload infer_qwen2_vl.py with huggingface_hub

Files changed (1)
  1. infer_qwen2_vl.py +80 -0
infer_qwen2_vl.py ADDED
@@ -0,0 +1,80 @@
+ from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+ from qwen_vl_utils import process_vision_info
+
+
+ # default: Load the model on the available device(s)
+ model_path = '/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/albus/ICCV_2025/qvq/models/QVQ-72B-Preview'
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
+     model_path, torch_dtype="auto", device_map="auto"
+ )
+
+ # default processor
+ processor = AutoProcessor.from_pretrained(model_path)
+
+ # The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
+ # min_pixels = 256*28*28
+ # max_pixels = 1280*28*28
+ # processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview", min_pixels=min_pixels, max_pixels=max_pixels)
+
+ import glob
+ from PIL import Image
+
+ folder = "/Users/baixuehai/Downloads/images"
+
+ images = []
+ for img_path in glob.glob(f"{folder}/*.jpe"):  # matches only files with a ".jpe" extension
+     img = Image.open(img_path)
+     images.append(img)
+ print(len(images))
+ messages = [
+     {
+         "role": "system",
+         "content": [
+             {"type": "text", "text": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."}
+         ],
+     },
+     {
+         "role": "user",
+         "content": [
+             {
+                 "type": "image",
+                 "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/QVQ/demo.png",
+             },
+             {"type": "text", "text": "详细描述图片中的内容"},  # "Describe the content of the image in detail"
+         ],
+     }
+ ]
+ from tqdm import tqdm
+ # Preparation for inference
+ ans = []
+ for img in tqdm(images):
+     messages[1]["content"][0]["image"] = img  # swap in the current local image for this iteration
+
+     text = processor.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+     image_inputs, video_inputs = process_vision_info(messages)
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt",
+     )
+     inputs = inputs.to("cuda")
+
+     # Inference: Generation of the output
+     generated_ids = model.generate(**inputs, max_new_tokens=8192)
+     generated_ids_trimmed = [
+         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+     ]
+     output_text = processor.batch_decode(
+         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+     )
+     ans.append(output_text)
+     # print(output_text)
+
+ import json
+ output_json = "output.json"
+ with open(output_json, "w", encoding="utf-8") as f:
+     json.dump(ans, f, ensure_ascii=False, indent=4)
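
A note on the saved output: processor.batch_decode returns a list, so each element appended to ans (and therefore each entry in output.json) is itself a one-element list. A minimal sketch of reading the file back and flattening it, assuming the script above has already written output.json to the working directory:

import json

# Load the answers written by infer_qwen2_vl.py and unwrap the
# one-element lists produced by processor.batch_decode.
with open("output.json", "r", encoding="utf-8") as f:
    ans = json.load(f)

answers = [entry[0] if isinstance(entry, list) and entry else entry for entry in ans]
for i, text in enumerate(answers):
    print(f"--- answer {i} ---")
    print(text)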