# test_vision_model.py (glm_9b_visual): standalone test of Glm4vVisionModel
import torch
from PIL import Image
from transformers import AutoImageProcessor
from transformers.models.glm4v.modeling_glm4v import Glm4vVisionModel, Glm4vVisionConfig
from safetensors.torch import load_file

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Local paths (adjust to your environment).
image_path = "/home/alic-li/图片/黛玉训狗_红楼之家_饺子和布莱恩的木石前盟.jpg"
vision_model_path = "/mnt/69043a6d-b152-4bd1-be10-e1130af6487f/glm_9b_visual/"
image_processor = AutoImageProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking", trust_remote_code=True)

# Build the standalone vision tower from its config and load the exported weights.
config = Glm4vVisionConfig.from_pretrained(vision_model_path)
vision_model = Glm4vVisionModel(config).to(device)
vision_model.eval()  # inference only

state_dict = load_file(f"{vision_model_path}/model.safetensors")
missing_keys, unexpected_keys = vision_model.load_state_dict(state_dict, strict=False)
print("Missing keys:", missing_keys)
print("Unexpected keys:", unexpected_keys)

# Preprocess the image; the processor returns flattened patches plus their (t, h, w) grid.
image = Image.open(image_path).convert("RGB")
inputs = image_processor(images=image, return_tensors="pt").to(device)
pixel_values = inputs["pixel_values"]
grid_thw = inputs["image_grid_thw"]  # this is the correct patch layout!
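
# Optional sanity check (assumption: the GLM-4.1V processor flattens patches the
# Qwen2-VL way, i.e. pixel_values has one row per patch and grid_thw holds [t, h, w]):
num_patches = int(grid_thw.prod(dim=-1).sum().item())
assert pixel_values.shape[0] == num_patches, (
    f"pixel_values rows ({pixel_values.shape[0]}) != t*h*w total ({num_patches})"
)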

with torch.no_grad():
    outputs = vision_model(
        hidden_states=pixel_values,
        grid_thw=grid_thw,
    )
print("Image embedding shape:", outputs.shape)
print("First 5 features:", outputs[:, :5].cpu().numpy())
print("All features:", outputs.cpu().numpy())