Upload 3 files
glm_9b_visual/config.json
ADDED
@@ -0,0 +1,24 @@
+{
+  "architectures": [
+    "Glm4vVisionModel"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "depth": 24,
+  "hidden_act": "silu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 1536,
+  "image_size": 336,
+  "in_channels": 3,
+  "initializer_range": 0.02,
+  "intermediate_size": 13696,
+  "model_type": "glm4v",
+  "num_heads": 12,
+  "out_hidden_size": 4096,
+  "patch_size": 14,
+  "rms_norm_eps": 1e-05,
+  "spatial_merge_size": 2,
+  "temporal_patch_size": 2,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.54.0.dev0"
+}
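
Quick sanity check on these numbers: a 336x336 image at patch_size 14 gives a 24x24 patch grid, and spatial_merge_size 2 folds each 2x2 block of patches into one output token. A minimal sketch of that arithmetic, assuming the Qwen2-VL-style patch layout that glm4v follows:

# Token-count arithmetic implied by the config above; the 2x2 merge
# behavior is an assumption based on the Qwen2-VL-style layout glm4v uses.
image_size = 336
patch_size = 14
spatial_merge_size = 2

patches_per_side = image_size // patch_size            # 24
num_patches = patches_per_side ** 2                    # 576 patches in
num_tokens = num_patches // spatial_merge_size ** 2    # 144 merged tokens out
print(num_patches, num_tokens)                         # 576 144

Each of those tokens is projected from hidden_size 1536 up to out_hidden_size 4096, matching the language model's embedding width.
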
glm_9b_visual/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3e1189a52ab8c389a8ed82cb7d0fed8d026f6f9a7a8343746259b62146a201e
+size 1785015304
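
Note this is a Git LFS pointer, not the ~1.8 GB weights themselves; a clone made without git-lfs will leave exactly this three-line file on disk. A minimal sketch to verify a local copy against the pointer's oid and size (the local path is a placeholder; oid and size are taken verbatim from the pointer above):

# Sanity-check a local download against the LFS pointer.
import hashlib
import os

path = "glm_9b_visual/model.safetensors"  # hypothetical local path
expected_oid = "b3e1189a52ab8c389a8ed82cb7d0fed8d026f6f9a7a8343746259b62146a201e"
expected_size = 1785015304

assert os.path.getsize(path) == expected_size, "size mismatch: got the LFS pointer instead of the weights?"
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == expected_oid, "sha256 mismatch"
print("model.safetensors matches the LFS pointer")
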
glm_9b_visual/test_vision_model.py
ADDED
@@ -0,0 +1,36 @@
+import torch
+from PIL import Image
+from transformers import AutoImageProcessor
+from transformers.models.glm4v.modeling_glm4v import Glm4vVisionModel, Glm4vVisionConfig
+from safetensors.torch import load_file
+
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+image_path = "/home/alic-li/图片/黛玉训狗_红楼之家_饺子和布莱恩的木石前盟.jpg"
+vision_model_path = "/mnt/69043a6d-b152-4bd1-be10-e1130af6487f/glm_9b_visual/"
+
+image_processor = AutoImageProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking", trust_remote_code=True)
+
+config = Glm4vVisionConfig.from_pretrained(vision_model_path)
+vision_model = Glm4vVisionModel(config).to(device)
+
+state_dict = load_file(f"{vision_model_path}/model.safetensors")
+missing_keys, unexpected_keys = vision_model.load_state_dict(state_dict, strict=False)
+print("Missing keys:", missing_keys)
+print("Unexpected keys:", unexpected_keys)
+
+image = Image.open(image_path).convert("RGB")
+inputs = image_processor(images=image, return_tensors="pt").to(device)
+
+pixel_values = inputs["pixel_values"]
+grid_thw = inputs["image_grid_thw"]  # this, not pixel_values, is the correct patch layout!
+
+with torch.no_grad():
+    outputs = vision_model(
+        hidden_states=pixel_values,
+        grid_thw=grid_thw
+    )
+
+print("Image embedding shape:", outputs.shape)
+print("First 5 features:", outputs[:, :5].cpu().numpy())
+print("All features:", outputs.cpu().numpy())
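
For reference, a hypothetical sketch of how a standalone checkpoint like this one could be carved out of the full model. The `Glm4vForConditionalGeneration` class exists in recent transformers releases, but the `.model.visual` attribute path is an assumption based on the Qwen2-VL-style layout; verify it against your installed version before relying on it:

# Hypothetical extraction script for producing glm_9b_visual/ from the
# full checkpoint; `.model.visual` is an assumed attribute path.
import torch
from transformers import Glm4vForConditionalGeneration
from safetensors.torch import save_file

full_model = Glm4vForConditionalGeneration.from_pretrained(
    "THUDM/GLM-4.1V-9B-Thinking", torch_dtype=torch.bfloat16
)
visual = full_model.model.visual  # assumed location of the vision tower
visual.config.save_pretrained("glm_9b_visual")
save_file(visual.state_dict(), "glm_9b_visual/model.safetensors")

If that layout holds, load_state_dict in test_vision_model.py above should report empty missing/unexpected key lists.
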