Wendy-Fly
/

Truck2

Model card Files Files and versions Community

Wendy-Fly committed on Feb 6

Commit

3946de7

verified ·

1 Parent(s): 118a0aa

Upload infer_qwen2_vl.py with huggingface_hub

Browse files

Files changed (1) hide show

infer_qwen2_vl.py +80 -0

infer_qwen2_vl.py ADDED Viewed

	@@ -0,0 +1,80 @@

+from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info
+# default: Load the model on the available device(s)
+model_path = '/inspire/hdd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/albus/ICCV_2025/qvq/models/QVQ-72B-Preview'
+model = Qwen2VLForConditionalGeneration.from_pretrained(
+    model_path, torch_dtype="auto", device_map="auto"
+)
+# default processer
+processor = AutoProcessor.from_pretrained(model_path)
+# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
+# min_pixels = 256*28*28
+# max_pixels = 1280*28*28
+#processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview", min_pixels=min_pixels, max_pixels=max_pixels)
+import glob
+from PIL import Image
+folder = "/Users/baixuehai/Downloads/images"
+images = []
+for img_path in glob.glob(f"{folder}/*.jpe"):
+    img = Image.open(img_path)
+    images.append(img)
+print(len(images))
+messages = [
+    {
+        "role": "system",
+        "content": [
+            {"type": "text", "text": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."}
+        ],
+    },
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/QVQ/demo.png",
+            },
+            {"type": "text", "text": "详细描述图片中的内容"},
+        ],
+    }
+]
+from tqdm import tqdm
+# Preparation for inference
+ans = []
+for img in tqdm(images):
+    messages[1]["content"][0]["image"] = img
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+    inputs = inputs.to("cuda")
+    # Inference: Generation of the output
+    generated_ids = model.generate(**inputs, max_new_tokens=8192)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+    ans.append(output_text)
+    #print(output_text)
+import json
+output_json = "output.json"
+with open(output_json,"w", encoding="utf-8")as f:
+    json.dump(ans,f,ensure_ascii=False, indent=4)