import torch
from PIL import Image
from transformers import AutoImageProcessor
from transformers.models.glm4v.modeling_glm4v import Glm4vVisionModel, Glm4vVisionConfig
from safetensors.torch import load_file

device = "cuda:0" if torch.cuda.is_available() else "cpu"

image_path = "/home/alic-li/图片/黛玉训狗_红楼之家_饺子和布莱恩的木石前盟.jpg"
vision_model_path = "/mnt/69043a6d-b152-4bd1-be10-e1130af6487f/glm_9b_visual/"

# Image processor comes from the full GLM-4.1V checkpoint;
# the vision config and weights come from the extracted vision tower.
image_processor = AutoImageProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking", trust_remote_code=True)
config = Glm4vVisionConfig.from_pretrained(vision_model_path)
vision_model = Glm4vVisionModel(config).to(device)
vision_model.eval()

# Load the standalone vision weights; strict=False tolerates key mismatches,
# which are printed below for inspection.
state_dict = load_file(f"{vision_model_path}/model.safetensors")
missing_keys, unexpected_keys = vision_model.load_state_dict(state_dict, strict=False)
print("Missing keys:", missing_keys)
print("Unexpected keys:", unexpected_keys)

image = Image.open(image_path).convert("RGB")
inputs = image_processor(images=image, return_tensors="pt").to(device)
pixel_values = inputs["pixel_values"]
grid_thw = inputs["image_grid_thw"]  # this is the correct patch layout!

with torch.no_grad():
    outputs = vision_model(
        hidden_states=pixel_values,
        grid_thw=grid_thw,
    )

print("Image embedding shape:", outputs.shape)
print("First 5 features:", outputs[:, :5].cpu().numpy())
print("All features:", outputs.cpu().numpy())