|
import torch |
|
from PIL import Image |
|
from transformers import AutoImageProcessor |
|
from transformers.models.glm4v.modeling_glm4v import Glm4vVisionModel, Glm4vVisionConfig |
|
from safetensors.torch import load_file |
|
|
|
# Select the first CUDA device when one is present; otherwise run on CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Test image and the directory holding the extracted GLM-4.1V vision-tower weights.
image_path = "/home/alic-li/图片/黛玉训狗_红楼之家_饺子和布莱恩的木石前盟.jpg"
vision_model_path = "/mnt/69043a6d-b152-4bd1-be10-e1130af6487f/glm_9b_visual/"
|
|
|
# Image processor comes from the hub; the vision tower is rebuilt from a local
# config + safetensors checkpoint extracted from the full GLM-4.1V model.
image_processor = AutoImageProcessor.from_pretrained(
    "THUDM/GLM-4.1V-9B-Thinking", trust_remote_code=True
)

config = Glm4vVisionConfig.from_pretrained(vision_model_path)
vision_model = Glm4vVisionModel(config).to(device)
# FIX: switch to inference mode — without eval(), dropout / stochastic-depth
# layers stay active and the forward pass below is non-deterministic.
vision_model.eval()

# strict=False tolerates key mismatches between the standalone vision tower and
# the extracted checkpoint; print both lists so a silently mis-loaded
# checkpoint does not go unnoticed.
state_dict = load_file(f"{vision_model_path}/model.safetensors")
missing_keys, unexpected_keys = vision_model.load_state_dict(state_dict, strict=False)
print("Missing keys:", missing_keys)
print("Unexpected keys:", unexpected_keys)
|
|
|
# Decode the image and run it through the processor to get model-ready tensors.
image = Image.open(image_path).convert("RGB")
processed = image_processor(images=image, return_tensors="pt").to(device)

# The vision tower consumes flattened patches plus the (t, h, w) patch grid.
pixel_values = processed["pixel_values"]
grid_thw = processed["image_grid_thw"]
|
|
|
# Forward pass without building an autograd graph.
with torch.no_grad():
    outputs = vision_model(hidden_states=pixel_values, grid_thw=grid_thw)

# Move to host memory once and reuse the array for both printouts.
features = outputs.cpu().numpy()
print("Image embedding shape:", outputs.shape)
print("First 5 features:", features[:, :5])
print("All features:", features)
|
|