Commit 95b3a4d (verified) by jamessyx
Parent(s): 36b91e1

Upload 27 files

src/Qwen-encoder-1.5B/added_tokens.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "<|endoftext|>": 151643,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644
+ }
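The three entries above register Qwen's chat-control tokens at fixed IDs. As a quick sanity check, a minimal sketch assuming the transformers library and that this folder is loaded from a local path (the path below is illustrative):

from transformers import AutoTokenizer

# Point this at wherever the uploaded folder is checked out.
tok = AutoTokenizer.from_pretrained("src/Qwen-encoder-1.5B")
for t in ("<|endoftext|>", "<|im_start|>", "<|im_end|>"):
    print(t, tok.convert_tokens_to_ids(t))   # expect 151643, 151644, 151645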
src/Qwen-encoder-1.5B/config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "_name_or_path": "/home/BiomikeeNew/werent4/llm2vec/output/mntp/qwen_0.5/checkpoint-120000",
+   "architectures": [
+     "Qwen2ForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 1536,
+   "initializer_range": 0.02,
+   "intermediate_size": 8960,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 28,
+   "model_type": "qwen2",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 2,
+   "rms_norm_eps": 1e-06,
+   "rope_theta": 1000000.0,
+   "sliding_window": 32768,
+   "tie_word_embeddings": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.40.2",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151936
+ }
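The config describes a standard Qwen2 causal-LM backbone (28 layers, hidden size 1536, grouped-query attention with 2 KV heads); the _name_or_path field only records the MNTP training checkpoint it was exported from. A minimal loading sketch, assuming transformers >= 4.40 and a local copy of this folder (path illustrative):

from transformers import AutoConfig, AutoModelForCausalLM

cfg = AutoConfig.from_pretrained("src/Qwen-encoder-1.5B")
print(cfg.model_type, cfg.hidden_size, cfg.num_hidden_layers)   # qwen2 1536 28

model = AutoModelForCausalLM.from_pretrained("src/Qwen-encoder-1.5B")
print(f"{sum(p.numel() for p in model.parameters()):,} parameters")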
src/Qwen-encoder-1.5B/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "transformers_version": "4.40.2"
+ }
src/Qwen-encoder-1.5B/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
src/Qwen-encoder-1.5B/model.safetensors.index.json ADDED
@@ -0,0 +1,345 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 6174857216
4
+ },
5
+ "weight_map": {
6
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
7
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
8
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
9
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
10
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
11
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
16
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
17
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
18
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
19
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
20
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
21
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
23
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
26
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
28
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
29
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
30
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
31
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
32
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
33
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
36
+ "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
37
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
38
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
40
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
41
+ "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
42
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
43
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
44
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
45
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
46
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
47
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
48
+ "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
49
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
50
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
51
+ "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
52
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
53
+ "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
54
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
55
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
56
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
57
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
58
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
59
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
60
+ "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
61
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
62
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
63
+ "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
64
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
65
+ "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
66
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
67
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
68
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
69
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
70
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
71
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
72
+ "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
73
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
74
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
75
+ "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
76
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
77
+ "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
78
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
79
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
80
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
81
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
82
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
83
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
84
+ "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
85
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
86
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
87
+ "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
88
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
89
+ "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
90
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
91
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
92
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
93
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
94
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
95
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
96
+ "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
97
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
98
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
99
+ "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
100
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
101
+ "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
102
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
103
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
104
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
105
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
106
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
107
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
108
+ "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
109
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
110
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
111
+ "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
112
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
113
+ "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
114
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
115
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
116
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
117
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
118
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
119
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
120
+ "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
121
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
122
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
123
+ "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
124
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
125
+ "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
126
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
127
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
128
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
129
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
130
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
131
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
132
+ "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
133
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
134
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
135
+ "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
136
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
137
+ "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
138
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
139
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
140
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
141
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
142
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
143
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
144
+ "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
145
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
146
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
147
+ "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
148
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
149
+ "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
150
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
151
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
152
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
153
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
154
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
155
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
156
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
157
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
158
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
159
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
160
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
161
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
162
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
163
+ "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
164
+ "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
165
+ "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
166
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
167
+ "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
168
+ "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
169
+ "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
170
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
171
+ "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
172
+ "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
173
+ "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
174
+ "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
175
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
176
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
177
+ "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
178
+ "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
179
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
180
+ "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
181
+ "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
182
+ "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
183
+ "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
184
+ "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
185
+ "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
186
+ "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
187
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
188
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
189
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
190
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
191
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
192
+ "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
193
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
194
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
195
+ "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
196
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
197
+ "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
198
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
199
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
200
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
201
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
202
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
203
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
204
+ "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
205
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
206
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
207
+ "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
208
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
209
+ "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
210
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
211
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
212
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
213
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
214
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
215
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
216
+ "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
217
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
218
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
219
+ "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
220
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
221
+ "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
222
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
223
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
224
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
225
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
226
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
227
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
228
+ "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
229
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
230
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
231
+ "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
232
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
233
+ "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
234
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
235
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
236
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
237
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
238
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
239
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
240
+ "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
241
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
242
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
243
+ "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
244
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
245
+ "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
246
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
247
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
248
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
249
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
250
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
251
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
252
+ "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
253
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
254
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
255
+ "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
256
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
257
+ "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
258
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
259
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
260
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
261
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
262
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
263
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
264
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
265
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
266
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
267
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
268
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
269
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
270
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
271
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
272
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
273
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
274
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
275
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
276
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
277
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
278
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
279
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
280
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
281
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
282
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
283
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
284
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
285
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
286
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
287
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
288
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
289
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
290
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
291
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
292
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
293
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
294
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
295
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
296
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
297
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
298
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
299
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
300
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
301
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
302
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
303
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
304
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
305
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
306
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
307
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
308
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
309
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
310
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
311
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
312
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
313
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
314
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
315
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
316
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
317
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
318
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
319
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
320
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
321
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
322
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
323
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
324
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
325
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
326
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
327
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
328
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
329
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
330
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
331
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
332
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
333
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
334
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
335
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
336
+ "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
337
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
338
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
339
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
340
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
341
+ "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
342
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
343
+ "model.norm.weight": "model-00002-of-00002.safetensors"
344
+ }
345
+ }
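The index above maps every parameter name to the safetensors shard that stores it (total_size is the combined byte count of both shards). A small sketch, assuming a local copy of the file, that regroups the mapping by shard:

import json
from collections import defaultdict

# Illustrative path to the index file shown above.
with open("src/Qwen-encoder-1.5B/model.safetensors.index.json") as f:
    index = json.load(f)

shards = defaultdict(list)
for name, shard in index["weight_map"].items():
    shards[shard].append(name)

print(index["metadata"]["total_size"])      # 6174857216 bytes in total
for shard, names in sorted(shards.items()):
    print(shard, len(names), "tensors")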
src/Qwen-encoder-1.5B/special_tokens_map.json ADDED
@@ -0,0 +1,21 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": "_",
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
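Note the unusual mask_token "_": Qwen2's tokenizer has no native mask token, so an ordinary vocabulary entry (ID 62, listed in tokenizer_config.json below) is repurposed to mark masked positions, consistent with llm2vec-style MNTP training. A quick check, again assuming transformers and an illustrative local path:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("src/Qwen-encoder-1.5B")
print(repr(tok.mask_token), tok.mask_token_id)     # '_' 62
print(tok.eos_token, tok.pad_token)                # <|im_end|> <|endoftext|>
print(tok.additional_special_tokens)               # ['<|im_start|>', '<|im_end|>']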
src/Qwen-encoder-1.5B/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
src/Qwen-encoder-1.5B/tokenizer_config.json ADDED
@@ -0,0 +1,52 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "62": {
+       "content": "_",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": null,
+   "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "mask_token": "_",
+   "model_max_length": 32768,
+   "pad_token": "<|endoftext|>",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
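The chat_template above is the standard ChatML format used by Qwen; when the first message is not a system turn, a default system prompt is injected. A minimal rendering sketch (path and message content are illustrative):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("src/Qwen-encoder-1.5B")
messages = [{"role": "user", "content": "Describe this pathology slide."}]
text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Describe this pathology slide.<|im_end|>
# <|im_start|>assistant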
src/Qwen-encoder-1.5B/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
src/open_clip/__init__.py ADDED
@@ -0,0 +1,13 @@
+ from .version import __version__
+
+ from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
+ from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss
+ from .factory import list_models, add_model_config, get_model_config, load_checkpoint
+ from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \
+     convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype, get_input_dtype, \
+     get_model_tokenize_cfg, get_model_preprocess_cfg, set_model_preprocess_cfg
+ from .openai import load_openai_model, list_openai_models
+ from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \
+     get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
+ from .tokenizer import SimpleTokenizer, tokenize, decode
+ from .transform import image_transform, AugmentationCfg
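This __init__.py re-exports the factory and tokenizer API, so downstream code only needs package-level imports. Typical usage, sketched with an illustrative model name and pretrained tag (list_models() and list_pretrained() report what is actually registered):

import open_clip

model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-B-32', pretrained='laion2b_s34b_b79k')   # names are illustrative
tokenizer = open_clip.get_tokenizer('ViT-B-32')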
src/open_clip/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
+ size 1356917
src/open_clip/constants.py ADDED
@@ -0,0 +1,6 @@
+ OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+ OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
+ IMAGENET_STD = (0.229, 0.224, 0.225)
+ INCEPTION_MEAN = (0.5, 0.5, 0.5)
+ INCEPTION_STD = (0.5, 0.5, 0.5)
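These are the per-channel normalization statistics used by the image transforms. A minimal sketch of how they plug into a torchvision pipeline (224 is an assumed image size; the real pipeline lives in transform.py):

from torchvision import transforms
from open_clip.constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD

# CLIP-style preprocessing: resize, center crop, convert to tensor, normalize.
preprocess = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
])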
src/open_clip/convert.py ADDED
@@ -0,0 +1,190 @@
1
+ """ Conversion functions for 3rd part state-dicts and non-torch native checkpoint formats.
2
+ """
3
+ from typing import Union
4
+
5
+ import torch
6
+ import numpy as np
7
+
8
+ from .model import CLIP, CustomTextCLIP
9
+ from .transformer import TextTransformer, Transformer
10
+
11
+
12
+ @torch.no_grad()
13
+ def load_big_vision_weights(model: CustomTextCLIP, checkpoint_path: str):
14
+ """ Load weights from .npz checkpoints for official Google big_vision image-text models
15
+
16
+ Currently the SigLIP source models are supported and a CustomTextCLIP destination model
17
+ w/ timm image encoder.
18
+ """
19
+ from timm.layers import resample_patch_embed, resample_abs_pos_embed
20
+
21
+ def _n2p(w, t=True):
22
+ if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
23
+ w = w.flatten()
24
+ if t:
25
+ if w.ndim == 4:
26
+ w = w.transpose([3, 2, 0, 1])
27
+ elif w.ndim == 3:
28
+ w = w.transpose([2, 0, 1])
29
+ elif w.ndim == 2:
30
+ w = w.transpose([1, 0])
31
+ return torch.from_numpy(w)
32
+
33
+ w = np.load(checkpoint_path)
34
+ interpolation = 'bilinear'
35
+ antialias = False
36
+
37
+ def _convert_timm_img(module, prefix):
38
+ embed_conv_w = _n2p(w[f'{prefix}embedding/kernel'])
39
+ if embed_conv_w.shape[-2:] != module.patch_embed.proj.weight.shape[-2:]:
40
+ embed_conv_w = resample_patch_embed(
41
+ embed_conv_w,
42
+ module.patch_embed.proj.weight.shape[-2:],
43
+ interpolation=interpolation,
44
+ antialias=antialias,
45
+ verbose=True,
46
+ )
47
+ module.patch_embed.proj.weight.copy_(embed_conv_w)
48
+ module.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias']))
49
+
50
+ if module.cls_token is not None:
51
+ module.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False))
52
+
53
+ pos_embed_w = _n2p(w[f'{prefix}pos_embedding'], t=False)
54
+ if pos_embed_w.shape != module.pos_embed.shape:
55
+ assert False, f'{pos_embed_w.shape}, {module.pos_embed.shape}'
56
+ num_prefix_tokens = 0 if getattr(module, 'no_embed_class', False) else getattr(module, 'num_prefix_tokens', 1)
57
+ pos_embed_w = resample_abs_pos_embed( # resize pos embedding when different size from pretrained weights
58
+ pos_embed_w,
59
+ new_size=module.patch_embed.grid_size,
60
+ num_prefix_tokens=num_prefix_tokens,
61
+ interpolation=interpolation,
62
+ antialias=antialias,
63
+ verbose=True,
64
+ )
65
+ module.pos_embed.copy_(pos_embed_w)
66
+
67
+ mha_sub, b_sub, ln1_sub = (0, 0, 1)
68
+ for i, block in enumerate(module.blocks.children()):
69
+ block_prefix = f'{prefix}Transformer/encoderblock_{i}/'
70
+ mha_prefix = block_prefix + f'MultiHeadDotProductAttention_{mha_sub}/'
71
+ block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
72
+ block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
73
+ block.attn.qkv.weight.copy_(torch.cat([
74
+ _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('query', 'key', 'value')]))
75
+ block.attn.qkv.bias.copy_(torch.cat([
76
+ _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('query', 'key', 'value')]))
77
+ block.attn.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
78
+ block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
79
+ for r in range(2):
80
+ getattr(block.mlp, f'fc{r + 1}').weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_{b_sub}/Dense_{r}/kernel']))
81
+ getattr(block.mlp, f'fc{r + 1}').bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_{b_sub}/Dense_{r}/bias']))
82
+ block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_{ln1_sub}/scale']))
83
+ block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_{ln1_sub}/bias']))
84
+
85
+ module.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale']))
86
+ module.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias']))
87
+
88
+ if module.attn_pool is not None:
89
+ block_prefix = f'{prefix}MAPHead_0/'
90
+ mha_prefix = block_prefix + f'MultiHeadDotProductAttention_0/'
91
+ module.attn_pool.latent.copy_(_n2p(w[f'{block_prefix}probe'], t=False))
92
+ module.attn_pool.q.weight.copy_(_n2p(w[f'{mha_prefix}query/kernel'], t=False).flatten(1).T)
93
+ module.attn_pool.q.bias.copy_(_n2p(w[f'{mha_prefix}query/bias'], t=False).reshape(-1))
94
+ module.attn_pool.kv.weight.copy_(torch.cat([
95
+ _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('key', 'value')]))
96
+ module.attn_pool.kv.bias.copy_(torch.cat([
97
+ _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('key', 'value')]))
98
+ module.attn_pool.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
99
+ module.attn_pool.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
100
+ module.attn_pool.norm.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
101
+ module.attn_pool.norm.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
102
+ for r in range(2):
103
+ getattr(module.attn_pool.mlp, f'fc{r + 1}').weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_{r}/kernel']))
104
+ getattr(module.attn_pool.mlp, f'fc{r + 1}').bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_{r}/bias']))
105
+
106
+ def _convert_openclip_transformer(module: Transformer, prefix):
107
+ for i, block in enumerate(module.resblocks.children()):
108
+ block_prefix = f'{prefix}encoderblock_{i}/'
109
+ mha_prefix = block_prefix + f'MultiHeadDotProductAttention_0/'
110
+ block.ln_1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
111
+ block.ln_1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
112
+ block.attn.in_proj_weight.copy_(torch.cat([
113
+ _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('query', 'key', 'value')]))
114
+ block.attn.in_proj_bias.copy_(torch.cat([
115
+ _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('query', 'key', 'value')]))
116
+ block.attn.out_proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
117
+ block.attn.out_proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
118
+ block.ln_2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_1/scale']))
119
+ block.ln_2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_1/bias']))
120
+ block.mlp.c_fc.weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_0/kernel']))
121
+ block.mlp.c_fc.bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_0/bias']))
122
+ block.mlp.c_proj.weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_1/kernel']))
123
+ block.mlp.c_proj.bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_1/bias']))
124
+
125
+ def _convert_openclip_txt(module: TextTransformer, prefix):
126
+ module.token_embedding.weight.copy_(_n2p(w[f'{prefix}Embed_0/embedding'], t=False))
127
+ pos_embed_w = _n2p(w[f'{prefix}pos_embedding'], t=False).squeeze(0)
128
+ module.positional_embedding.copy_(pos_embed_w)
129
+ _convert_openclip_transformer(module.transformer, prefix=prefix + 'Encoder_0/')
130
+ module.ln_final.weight.copy_(_n2p(w[f'{prefix}Encoder_0/encoder_norm/scale']))
131
+ module.ln_final.bias.copy_(_n2p(w[f'{prefix}Encoder_0/encoder_norm/bias']))
132
+ module.text_projection.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
133
+ module.text_projection.bias.copy_(_n2p(w[f'{prefix}head/bias']))
134
+
135
+ _convert_timm_img(model.visual.trunk, 'params/img/')
136
+ _convert_openclip_txt(model.text, 'params/txt/')
137
+ model.logit_bias.copy_(_n2p(w['params/b'])[0])
138
+ model.logit_scale.copy_(_n2p(w['params/t'])[0])
139
+
140
+
141
+ @torch.no_grad()
142
+ def convert_mobile_clip_state_dict(model: CustomTextCLIP, state_dict, fastvit = True):
143
+
144
+ def _convert_timm_img(state_dict):
145
+ if fastvit:
146
+ from timm.models.fastvit import checkpoint_filter_fn
147
+ else:
148
+ from timm.models.vision_transformer_hybrid import checkpoint_filter_fn
149
+ timm_state_dict = checkpoint_filter_fn(state_dict, model.visual.trunk)
150
+ timm_state_dict = {'visual.trunk.' + k: v for k, v in timm_state_dict.items()}
151
+ return timm_state_dict
152
+
153
+ def _convert_openclip_txt(state_dict, prefix='text_encoder.'):
154
+ text_dict = {}
155
+ for k, v in state_dict.items():
156
+ if not k.startswith(prefix):
157
+ continue
158
+ k = k.replace(prefix, '')
159
+ k = k.replace('projection_layer', 'text_projection')
160
+ k = k.replace('embedding_layer', 'token_embedding')
161
+ if k.startswith('positional_embedding.pos_embed.pos_embed'):
162
+ k = k.replace('positional_embedding.pos_embed.pos_embed', 'positional_embedding')
163
+ v = v.squeeze()
164
+ k = k.replace('final_layer_norm', 'ln_final')
165
+ k = k.replace('pre_norm_mha.0', 'ln_1')
166
+ k = k.replace('pre_norm_mha.1', 'attn')
167
+ k = k.replace('pre_norm_ffn.0', 'ln_2')
168
+ k = k.replace('pre_norm_ffn.1', 'mlp.c_fc')
169
+ k = k.replace('pre_norm_ffn.4', 'mlp.c_proj')
170
+ k = k.replace('qkv_proj.weight', 'in_proj_weight')
171
+ k = k.replace('qkv_proj.bias', 'in_proj_bias')
172
+ k = k.replace('transformer.', 'transformer.resblocks.')
173
+ text_dict['text.' + k] = v
174
+ return text_dict
175
+
176
+ image_dict = _convert_timm_img(state_dict)
177
+ text_dict = _convert_openclip_txt(state_dict)
178
+ out_dict = {**image_dict, **text_dict}
179
+ out_dict['logit_scale'] = state_dict['logit_scale']
180
+ return out_dict
181
+
182
+
183
+ def convert_state_dict(model: Union[CustomTextCLIP, CLIP], state_dict):
184
+ if 'image_encoder.model.patch_embed.0.rbr_conv.0.conv.weight' in state_dict:
185
+ # Apple MobileCLIP s1 & s2 state_dicts (s0 and b not currently supported)
186
+ state_dict = convert_mobile_clip_state_dict(model, state_dict)
187
+ if 'image_encoder.model.patch_emb.0.block.conv.weight' in state_dict:
188
+ # convert b model
189
+ state_dict = convert_mobile_clip_state_dict(model, state_dict, fastvit=False)
190
+ return state_dict
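convert_state_dict above detects third-party checkpoints (Apple MobileCLIP state dicts; big_vision / SigLIP .npz files are handled separately by load_big_vision_weights) and rewrites their keys to open_clip naming. It is normally invoked indirectly through load_checkpoint in factory.py; a hedged sketch of that path, with illustrative model and file names:

from open_clip.factory import create_model, load_checkpoint

model = create_model('ViT-B-16-SigLIP')           # illustrative timm-based config name
load_checkpoint(model, 'webli_siglip_b16.npz')    # .npz is routed to load_big_vision_weights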
src/open_clip/factory.py ADDED
@@ -0,0 +1,475 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import re
5
+ from copy import deepcopy
6
+ from dataclasses import asdict
7
+ from pathlib import Path
8
+ from typing import Any, Dict, Optional, Tuple, Union
9
+
10
+ import torch
11
+
12
+ from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
13
+ from .convert import convert_state_dict
14
+ from .model import CLIP, CustomTextCLIP, convert_weights_to_lp, convert_to_custom_text_state_dict,\
15
+ resize_pos_embed, get_cast_dtype, resize_text_pos_embed, set_model_preprocess_cfg
16
+ from .openai import load_openai_model
17
+ from .pretrained import is_pretrained_cfg, get_pretrained_cfg, download_pretrained,\
18
+ list_pretrained_tags_by_model, download_pretrained_from_hf
19
+ from .transform import image_transform_v2, AugmentationCfg, PreprocessCfg, merge_preprocess_dict, merge_preprocess_kwargs
20
+ from .tokenizer import HFTokenizer, SimpleTokenizer, DEFAULT_CONTEXT_LENGTH
21
+
22
+ HF_HUB_PREFIX = 'hf-hub:'
23
+ _MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"]
24
+ _MODEL_CONFIGS = {} # directory (model_name: config) of model architecture configs
25
+
26
+
27
+ def _natural_key(string_):
28
+ return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())]
29
+
30
+
31
+ def _rescan_model_configs():
32
+ global _MODEL_CONFIGS
33
+
34
+ config_ext = ('.json',)
35
+ config_files = []
36
+ for config_path in _MODEL_CONFIG_PATHS:
37
+ if config_path.is_file() and config_path.suffix in config_ext:
38
+ config_files.append(config_path)
39
+ elif config_path.is_dir():
40
+ for ext in config_ext:
41
+ config_files.extend(config_path.glob(f'*{ext}'))
42
+
43
+ for cf in config_files:
44
+ with open(cf, 'r') as f:
45
+ model_cfg = json.load(f)
46
+ if all(a in model_cfg for a in ('embed_dim', 'vision_cfg', 'text_cfg')):
47
+ _MODEL_CONFIGS[cf.stem] = model_cfg
48
+
49
+ _MODEL_CONFIGS = {k: v for k, v in sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0]))}
50
+
51
+
52
+ _rescan_model_configs() # initial populate of model config registry
53
+
54
+
55
+ def list_models():
56
+ """ enumerate available model architectures based on config files """
57
+ return list(_MODEL_CONFIGS.keys())
58
+
59
+
60
+ def add_model_config(path):
61
+ """ add model config path or file and update registry """
62
+ if not isinstance(path, Path):
63
+ path = Path(path)
64
+ _MODEL_CONFIG_PATHS.append(path)
65
+ _rescan_model_configs()
66
+
67
+
68
+ def get_model_config(model_name):
69
+ if model_name in _MODEL_CONFIGS:
70
+ return deepcopy(_MODEL_CONFIGS[model_name])
71
+ else:
72
+ return None
73
+
74
+
75
+ def _get_hf_config(model_id, cache_dir=None):
76
+ config_path = download_pretrained_from_hf(model_id, filename='open_clip_config.json', cache_dir=cache_dir)
77
+ with open(config_path, 'r', encoding='utf-8') as f:
78
+ config = json.load(f)
79
+ return config
80
+
81
+
82
+ def get_tokenizer(
83
+ model_name: str = '',
84
+ context_length: Optional[int] = None,
85
+ **kwargs,
86
+ ):
87
+ if model_name.startswith(HF_HUB_PREFIX):
88
+ model_name = model_name[len(HF_HUB_PREFIX):]
89
+ try:
90
+ config = _get_hf_config(model_name)['model_cfg']
91
+ except Exception:
92
+ tokenizer = HFTokenizer(
93
+ model_name,
94
+ context_length=context_length or DEFAULT_CONTEXT_LENGTH,
95
+ **kwargs,
96
+ )
97
+ return tokenizer
98
+ else:
99
+ config = get_model_config(model_name)
100
+ assert config is not None, f"No valid model config found for {model_name}."
101
+
102
+ text_config = config.get('text_cfg', {})
103
+ if 'tokenizer_kwargs' in text_config:
104
+ tokenizer_kwargs = dict(text_config['tokenizer_kwargs'], **kwargs)
105
+ else:
106
+ tokenizer_kwargs = kwargs
107
+
108
+ if context_length is None:
109
+ context_length = text_config.get('context_length', DEFAULT_CONTEXT_LENGTH)
110
+
111
+ if 'hf_tokenizer_name' in text_config:
112
+ tokenizer = HFTokenizer(
113
+ text_config['hf_tokenizer_name'],
114
+ context_length=context_length,
115
+ **tokenizer_kwargs,
116
+ )
117
+ else:
118
+ tokenizer = SimpleTokenizer(
119
+ context_length=context_length,
120
+ **tokenizer_kwargs,
121
+ )
122
+
123
+ return tokenizer
124
+
125
+
126
+ def load_state_dict(checkpoint_path: str, map_location='cpu'):
127
+ checkpoint = torch.load(checkpoint_path, map_location=map_location)
128
+ if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
129
+ state_dict = checkpoint['state_dict']
130
+ elif isinstance(checkpoint, torch.jit.ScriptModule):
131
+ state_dict = checkpoint.state_dict()
132
+ for key in ["input_resolution", "context_length", "vocab_size"]:
133
+ state_dict.pop(key, None)
134
+ else:
135
+ state_dict = checkpoint
136
+ if next(iter(state_dict.items()))[0].startswith('module'):
137
+ state_dict = {k[7:]: v for k, v in state_dict.items()}
138
+ return state_dict
139
+
140
+
141
+ def load_checkpoint(
142
+ model: Union[CLIP, CustomTextCLIP],
143
+ checkpoint_path: str,
144
+ strict: bool = True,
145
+ ):
146
+ if Path(checkpoint_path).suffix in ('.npz', '.npy'):
147
+ # Separate path loading numpy big_vision (SigLIP) weights
148
+ from open_clip.convert import load_big_vision_weights
149
+ load_big_vision_weights(model, checkpoint_path)
150
+ return {}
151
+
152
+ state_dict = load_state_dict(checkpoint_path)
153
+
154
+ # Detect & convert 3rd party state_dicts -> open_clip
155
+ state_dict = convert_state_dict(model, state_dict)
156
+
157
+ # Detect old format and make compatible with new format
158
+ if 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'):
159
+ state_dict = convert_to_custom_text_state_dict(state_dict)
160
+
161
+ # If loading a non-SigLIP model for SigLIP training. See https://github.com/mlfoundations/open_clip/issues/712
162
+ if 'logit_bias' not in state_dict and model.logit_bias is not None:
163
+ state_dict["logit_bias"] = torch.zeros_like(state_dict["logit_scale"])
164
+
165
+ # Certain text transformers no longer expect position_ids after transformers==4.31
166
+ position_id_key = 'text.transformer.embeddings.position_ids'
167
+ if position_id_key in state_dict and not hasattr(model, position_id_key):
168
+ del state_dict[position_id_key]
169
+
170
+ # resize_pos_embed(state_dict, model)
171
+ resize_text_pos_embed(state_dict, model)
172
+
173
+ # Finally, load the massaged state_dict into model
174
+ incompatible_keys = model.load_state_dict(state_dict, strict=strict)
175
+ # incompatible_keys = []
176
+
177
+ return incompatible_keys
178
+
179
+
180
+ def create_model(
181
+ model_name: str,
182
+ pretrained: Optional[str] = None,
183
+ precision: str = 'fp32',
184
+ device: Union[str, torch.device] = 'cpu',
185
+ jit: bool = False,
186
+ force_quick_gelu: bool = False,
187
+ force_custom_text: bool = False,
188
+ force_patch_dropout: Optional[float] = None,
189
+ force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
190
+ force_preprocess_cfg: Optional[Dict[str, Any]] = None,
191
+ pretrained_image: bool = False,
192
+ pretrained_hf: bool = True,
193
+ cache_dir: Optional[str] = None,
194
+ output_dict: Optional[bool] = None,
195
+ require_pretrained: bool = False,
196
+ **model_kwargs,
197
+ ):
198
+ force_preprocess_cfg = force_preprocess_cfg or {}
199
+ preprocess_cfg = asdict(PreprocessCfg())
200
+ has_hf_hub_prefix = model_name.startswith(HF_HUB_PREFIX)
201
+ if has_hf_hub_prefix:
202
+ model_id = model_name[len(HF_HUB_PREFIX):]
203
+ checkpoint_path = download_pretrained_from_hf(model_id, cache_dir=cache_dir)
204
+ config = _get_hf_config(model_id, cache_dir)
205
+ preprocess_cfg = merge_preprocess_dict(preprocess_cfg, config['preprocess_cfg'])
206
+ model_cfg = config['model_cfg']
207
+ pretrained_hf = False # override, no need to load original HF text weights
208
+ else:
209
+ model_name = model_name.replace('/', '-') # for callers using old naming with / in ViT names
210
+ checkpoint_path = None
211
+ model_cfg = None
212
+
213
+ if isinstance(device, str):
214
+ device = torch.device(device)
215
+
216
+ if pretrained and pretrained.lower() == 'openai':
217
+ logging.info(f'Loading pretrained {model_name} from OpenAI.')
218
+ model = load_openai_model(
219
+ model_name,
220
+ precision=precision,
221
+ device=device,
222
+ cache_dir=cache_dir,
223
+ )
224
+ else:
225
+ model_cfg = model_cfg or get_model_config(model_name)
226
+ if model_cfg is not None:
227
+ logging.info(f'Loaded {model_name} model config.')
228
+ else:
229
+ logging.error(f'Model config for {model_name} not found; available models {list_models()}.')
230
+ raise RuntimeError(f'Model config for {model_name} not found.')
231
+
232
+ if force_quick_gelu:
233
+ # override for use of QuickGELU on non-OpenAI transformer models
234
+ model_cfg["quick_gelu"] = True
235
+
236
+ if force_patch_dropout is not None:
237
+ # override the default patch dropout value
238
+ model_cfg["vision_cfg"]["patch_dropout"] = force_patch_dropout
239
+
240
+ if force_image_size is not None:
241
+ # override model config's image size
242
+ model_cfg["vision_cfg"]["image_size"] = force_image_size
243
+
244
+ is_timm_model = 'timm_model_name' in model_cfg.get('vision_cfg', {})
245
+ if pretrained_image:
246
+ if is_timm_model:
247
+ # pretrained weight loading for timm models set via vision_cfg
248
+ model_cfg['vision_cfg']['timm_model_pretrained'] = True
249
+ else:
250
+ assert False, 'pretrained image towers currently only supported for timm models'
251
+
252
+ # cast_dtype set for fp16 and bf16 (manual mixed-precision), not set for 'amp' or 'pure' modes
253
+ cast_dtype = get_cast_dtype(precision)
254
+ is_hf_model = 'hf_model_name' in model_cfg.get('text_cfg', {})
255
+ if is_hf_model:
256
+ # load pretrained weights for HF text model IFF no CLIP weights being loaded
257
+ model_cfg['text_cfg']['hf_model_pretrained'] = pretrained_hf and not pretrained
258
+ custom_text = model_cfg.pop('custom_text', False) or force_custom_text or is_hf_model
259
+
260
+ model_cfg = dict(model_cfg, **model_kwargs) # merge cfg dict w/ kwargs (kwargs overrides cfg)
261
+ if custom_text:
262
+ if "multimodal_cfg" in model_cfg:
263
+ model = CoCa(**model_cfg, cast_dtype=cast_dtype)
264
+ else:
265
+ model = CustomTextCLIP(**model_cfg, cast_dtype=cast_dtype)
266
+ else:
267
+ model = CLIP(**model_cfg, cast_dtype=cast_dtype)
268
+
269
+ if precision in ("fp16", "bf16"):
270
+ dtype = torch.float16 if 'fp16' in precision else torch.bfloat16
271
+ # manual mixed precision that matches original OpenAI behaviour
272
+ if is_timm_model:
273
+ # FIXME this is a bit janky, create timm based model in low-precision and
274
+ # then cast only LayerNormFp32 instances back to float32 so they don't break.
275
+ # Why? The convert_weights_to_lp fn only works with native models.
276
+ model.to(device=device, dtype=dtype)
277
+ from .transformer import LayerNormFp32
278
+
279
+ def _convert_ln(m):
280
+ if isinstance(m, LayerNormFp32):
281
+ m.weight.data = m.weight.data.to(torch.float32)
282
+ m.bias.data = m.bias.data.to(torch.float32)
283
+ model.apply(_convert_ln)
284
+ else:
285
+ model.to(device=device)
286
+ convert_weights_to_lp(model, dtype=dtype)
287
+ elif precision in ("pure_fp16", "pure_bf16"):
288
+ dtype = torch.float16 if 'fp16' in precision else torch.bfloat16
289
+ model.to(device=device, dtype=dtype)
290
+ else:
291
+ model.to(device=device)
292
+
293
+ pretrained_loaded = False
294
+ if pretrained:
295
+ checkpoint_path = ''
296
+ pretrained_cfg = get_pretrained_cfg(model_name, pretrained)
297
+ if pretrained_cfg:
298
+ checkpoint_path = download_pretrained(pretrained_cfg, cache_dir=cache_dir)
299
+ preprocess_cfg = merge_preprocess_dict(preprocess_cfg, pretrained_cfg)
300
+ elif os.path.exists(pretrained):
301
+ checkpoint_path = pretrained
302
+
303
+ if checkpoint_path:
304
+ logging.info(f'Loading pretrained {model_name} weights ({pretrained}).')
305
+ load_checkpoint(model, checkpoint_path)
306
+ else:
307
+ error_str = (
308
+ f'Pretrained weights ({pretrained}) not found for model {model_name}.'
309
+ f' Available pretrained tags ({list_pretrained_tags_by_model(model_name)}).')
310
+ logging.warning(error_str)
311
+ raise RuntimeError(error_str)
312
+ pretrained_loaded = True
313
+ elif has_hf_hub_prefix:
314
+ logging.info(f'Loading pretrained {model_name} weights ({checkpoint_path}).')
315
+ load_checkpoint(model, checkpoint_path)
316
+ pretrained_loaded = True
317
+
318
+ if require_pretrained and not pretrained_loaded:
319
+ # callers of create_model_from_pretrained always expect pretrained weights
320
+ raise RuntimeError(
321
+ f'Pretrained weights were required for (model: {model_name}, pretrained: {pretrained}) but not loaded.')
322
+
323
+ if output_dict and hasattr(model, "output_dict"):
324
+ model.output_dict = True
325
+
326
+ if jit:
327
+ model = torch.jit.script(model)
328
+
329
+ # set image preprocessing configuration in model attributes for convenience
330
+ if getattr(model.visual, 'image_size', None) is not None:
331
+ # use image_size set on model creation (via config or force_image_size arg)
332
+ force_preprocess_cfg['size'] = model.visual.image_size
333
+ set_model_preprocess_cfg(model, merge_preprocess_dict(preprocess_cfg, force_preprocess_cfg))
334
+
335
+ return model
336
+
337
+
338
+ def create_loss(args):
339
+ if args.distill:
340
+ return DistillClipLoss(
341
+ local_loss=args.local_loss,
342
+ gather_with_grad=args.gather_with_grad,
343
+ cache_labels=True,
344
+ rank=args.rank,
345
+ world_size=args.world_size,
346
+ use_horovod=args.horovod,
347
+ )
348
+ elif "coca" in args.model.lower():
349
+ return CoCaLoss(
350
+ caption_loss_weight=args.coca_caption_loss_weight,
351
+ clip_loss_weight=args.coca_contrastive_loss_weight,
352
+ local_loss=args.local_loss,
353
+ gather_with_grad=args.gather_with_grad,
354
+ cache_labels=True,
355
+ rank=args.rank,
356
+ world_size=args.world_size,
357
+ use_horovod=args.horovod,
358
+ )
359
+ elif args.siglip:
360
+ assert not args.horovod, "Horovod not currently supported for SigLip"
361
+ return SigLipLoss(
362
+ rank=args.rank,
363
+ world_size=args.world_size,
364
+ )
365
+ return ClipLoss(
366
+ local_loss=args.local_loss,
367
+ gather_with_grad=args.gather_with_grad,
368
+ cache_labels=True,
369
+ rank=args.rank,
370
+ world_size=args.world_size,
371
+ use_horovod=args.horovod,
372
+ )
373
+
374
+
375
+ def create_model_and_transforms(
376
+ model_name: str,
377
+ pretrained: Optional[str] = None,
378
+ precision: str = 'fp32',
379
+ device: Union[str, torch.device] = 'cpu',
380
+ jit: bool = False,
381
+ force_quick_gelu: bool = False,
382
+ force_custom_text: bool = False,
383
+ force_patch_dropout: Optional[float] = None,
384
+ force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
385
+ image_mean: Optional[Tuple[float, ...]] = None,
386
+ image_std: Optional[Tuple[float, ...]] = None,
387
+ image_interpolation: Optional[str] = None,
388
+ image_resize_mode: Optional[str] = None, # only effective for inference
389
+ aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
390
+ pretrained_image: bool = False,
391
+ pretrained_hf: bool = True,
392
+ cache_dir: Optional[str] = None,
393
+ output_dict: Optional[bool] = None,
394
+ **model_kwargs,
395
+ ):
396
+ force_preprocess_cfg = merge_preprocess_kwargs(
397
+ {}, mean=image_mean, std=image_std, interpolation=image_interpolation, resize_mode=image_resize_mode)
398
+
399
+ model = create_model(
400
+ model_name,
401
+ pretrained,
402
+ precision=precision,
403
+ device=device,
404
+ jit=jit,
405
+ force_quick_gelu=force_quick_gelu,
406
+ force_custom_text=force_custom_text,
407
+ force_patch_dropout=force_patch_dropout,
408
+ force_image_size=force_image_size,
409
+ force_preprocess_cfg=force_preprocess_cfg,
410
+ pretrained_image=pretrained_image,
411
+ pretrained_hf=pretrained_hf,
412
+ cache_dir=cache_dir,
413
+ output_dict=output_dict,
414
+ **model_kwargs,
415
+ )
416
+
417
+ pp_cfg = PreprocessCfg(**model.visual.preprocess_cfg)
418
+
419
+ preprocess_train = image_transform_v2(
420
+ pp_cfg,
421
+ is_train=True,
422
+ aug_cfg=aug_cfg,
423
+ )
424
+ preprocess_val = image_transform_v2(
425
+ pp_cfg,
426
+ is_train=False,
427
+ )
428
+
429
+ return model, preprocess_train, preprocess_val
430
+
431
+
432
+ def create_model_from_pretrained(
433
+ model_name: str,
434
+ pretrained: Optional[str] = None,
435
+ precision: str = 'fp32',
436
+ device: Union[str, torch.device] = 'cpu',
437
+ jit: bool = False,
438
+ force_quick_gelu: bool = False,
439
+ force_custom_text: bool = False,
440
+ force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
441
+ image_mean: Optional[Tuple[float, ...]] = None,
442
+ image_std: Optional[Tuple[float, ...]] = None,
443
+ image_interpolation: Optional[str] = None,
444
+ image_resize_mode: Optional[str] = None, # only effective for inference
445
+ return_transform: bool = True,
446
+ cache_dir: Optional[str] = None,
447
+ **model_kwargs,
448
+ ):
449
+ force_preprocess_cfg = merge_preprocess_kwargs(
450
+ {}, mean=image_mean, std=image_std, interpolation=image_interpolation, resize_mode=image_resize_mode)
451
+
452
+ model = create_model(
453
+ model_name,
454
+ pretrained,
455
+ precision=precision,
456
+ device=device,
457
+ jit=jit,
458
+ force_quick_gelu=force_quick_gelu,
459
+ force_custom_text=force_custom_text,
460
+ force_image_size=force_image_size,
461
+ force_preprocess_cfg=force_preprocess_cfg,
462
+ cache_dir=cache_dir,
463
+ require_pretrained=True,
464
+ **model_kwargs,
465
+ )
466
+
467
+ if not return_transform:
468
+ return model
469
+
470
+ preprocess = image_transform_v2(
471
+ PreprocessCfg(**model.visual.preprocess_cfg),
472
+ is_train=False,
473
+ )
474
+
475
+ return model, preprocess
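A usage sketch for the two factory entry points above. It assumes the package is importable as open_clip and exports these functions as in upstream open_clip; the checkpoint path in the commented-out call is a placeholder, and note that the CLIP class added in model.py below additionally needs the ../Qwen-encoder-1.5B config and the Virchow2 timm config to be resolvable when the model is built.

import open_clip

# build a model plus train/val preprocessing transforms (no pretrained weights required)
model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms(
    'ViT-L-14-336',      # matches the model config JSON added later in this commit
    pretrained=None,     # or a local checkpoint path / pretrained tag
    precision='fp32',
    device='cpu',
)

# or insist on pretrained weights and get only the inference transform
# model, preprocess = open_clip.create_model_from_pretrained('ViT-L-14-336', pretrained='/path/to/checkpoint.pt')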
src/open_clip/hf_configs.py ADDED
@@ -0,0 +1,67 @@
1
+ # HF architecture dict:
2
+ arch_dict = {
3
+ # https://huggingface.co/docs/transformers/model_doc/roberta#roberta
4
+ "roberta": {
5
+ "config_names": {
6
+ "context_length": "max_position_embeddings",
7
+ "vocab_size": "vocab_size",
8
+ "width": "hidden_size",
9
+ "heads": "num_attention_heads",
10
+ "layers": "num_hidden_layers",
11
+ "layer_attr": "layer",
12
+ "token_embeddings_attr": "embeddings"
13
+ },
14
+ "pooler": "mean_pooler",
15
+ },
16
+ # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig
17
+ "xlm-roberta": {
18
+ "config_names": {
19
+ "context_length": "max_position_embeddings",
20
+ "vocab_size": "vocab_size",
21
+ "width": "hidden_size",
22
+ "heads": "num_attention_heads",
23
+ "layers": "num_hidden_layers",
24
+ "layer_attr": "layer",
25
+ "token_embeddings_attr": "embeddings"
26
+ },
27
+ "pooler": "mean_pooler",
28
+ },
29
+ # https://huggingface.co/docs/transformers/model_doc/mt5#mt5
30
+ "mt5": {
31
+ "config_names": {
32
+ # unlimited seqlen
33
+ # https://github.com/google-research/text-to-text-transfer-transformer/issues/273
34
+ # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374
35
+ "context_length": "",
36
+ "vocab_size": "vocab_size",
37
+ "width": "d_model",
38
+ "heads": "num_heads",
39
+ "layers": "num_layers",
40
+ "layer_attr": "block",
41
+ "token_embeddings_attr": "embed_tokens"
42
+ },
43
+ "pooler": "mean_pooler",
44
+ },
45
+ # https://huggingface.co/docs/transformers/model_doc/bert
46
+ "bert": {
47
+ "config_names": {
48
+ "context_length": "max_position_embeddings",
49
+ "vocab_size": "vocab_size",
50
+ "width": "hidden_size",
51
+ "heads": "num_attention_heads",
52
+ "layers": "num_hidden_layers",
53
+ },
54
+ "pooler": "cls_pooler",
55
+ },
56
+ # https://huggingface.co/docs/transformers/model_doc/m2m_100
57
+ "m2m_100": {
58
+ "config_names": {
59
+ "context_length": "max_position_embeddings",
60
+ "vocab_size": "vocab_size",
61
+ "width": "d_model",
62
+ "heads": "encoder_attention_heads",
63
+ "layers": "encoder_layers",
64
+ },
65
+ "pooler": "cls_pooler",
66
+ },
67
+ }
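A short sketch of how the mapping above is consumed: attribute names are looked up per model_type so the HF adapter can read equivalent hyperparameters from differently named config fields. It assumes the package is importable as open_clip, and 'roberta-base' is just an illustrative checkpoint name.

from transformers import AutoConfig

from open_clip.hf_configs import arch_dict

config = AutoConfig.from_pretrained('roberta-base')
names = arch_dict[config.model_type]['config_names']
width = getattr(config, names['width'])    # -> config.hidden_size for RoBERTa
layers = getattr(config, names['layers'])  # -> config.num_hidden_layers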
src/open_clip/hf_model.py ADDED
@@ -0,0 +1,193 @@
1
+ """ huggingface model adapter
2
+
3
+ Wraps HuggingFace transformers (https://github.com/huggingface/transformers) models for use as a text tower in CLIP model.
4
+ """
5
+ import re
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch import TensorType
10
+
11
+ try:
12
+ import transformers
13
+ from transformers import AutoModel, AutoTokenizer, AutoConfig, PretrainedConfig
14
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, \
15
+ BaseModelOutputWithPoolingAndCrossAttentions
16
+ except ImportError as e:
17
+ transformers = None
18
+
19
+
20
+ class BaseModelOutput:
21
+ pass
22
+
23
+
24
+ class PretrainedConfig:
25
+ pass
26
+
27
+ from .hf_configs import arch_dict
28
+
29
+
30
+ # utils
31
+ def _camel2snake(s):
32
+ return re.sub(r'(?<!^)(?=[A-Z])', '_', s).lower()
33
+
34
+
35
+ # TODO: ?last - for gpt-like models
36
+ _POOLERS = {}
37
+
38
+
39
+ def register_pooler(cls):
40
+ """Decorator registering pooler class"""
41
+ _POOLERS[_camel2snake(cls.__name__)] = cls
42
+ return cls
43
+
44
+
45
+ @register_pooler
46
+ class MeanPooler(nn.Module):
47
+ """Mean pooling"""
48
+
49
+ def forward(self, x: BaseModelOutput, attention_mask: TensorType):
50
+ masked_output = x.last_hidden_state * attention_mask.unsqueeze(-1)
51
+ return masked_output.sum(dim=1) / attention_mask.sum(-1, keepdim=True)
52
+
53
+
54
+ @register_pooler
55
+ class MaxPooler(nn.Module):
56
+ """Max pooling"""
57
+
58
+ def forward(self, x: BaseModelOutput, attention_mask: TensorType):
59
+ masked_output = x.last_hidden_state.masked_fill(attention_mask.unsqueeze(-1) == 0, -torch.inf) # mask out padding positions before taking the max
60
+ return masked_output.max(1).values
61
+
62
+
63
+ @register_pooler
64
+ class ClsPooler(nn.Module):
65
+ """CLS token pooling"""
66
+
67
+ def __init__(self, use_pooler_output=True):
68
+ super().__init__()
69
+ self.cls_token_position = 0
70
+ self.use_pooler_output = use_pooler_output
71
+
72
+ def forward(self, x: BaseModelOutput, attention_mask: TensorType):
73
+ if (self.use_pooler_output and
74
+ isinstance(x, (BaseModelOutputWithPooling, BaseModelOutputWithPoolingAndCrossAttentions)) and
75
+ (x.pooler_output is not None)
76
+ ):
77
+ return x.pooler_output
78
+
79
+ return x.last_hidden_state[:, self.cls_token_position, :]
80
+
81
+
82
+ @register_pooler
83
+ class ClsLastHiddenStatePooler(nn.Module):
84
+ """CLS token pooling
85
+ NOTE: this is equivalent to ClsPooler above with use_pooler_output=False
86
+ """
87
+
88
+ def __init__(self):
89
+ super().__init__()
90
+ self.cls_token_position = 0
91
+
92
+ def forward(self, x: BaseModelOutput, attention_mask: TensorType):
93
+ return x.last_hidden_state[:, self.cls_token_position, :]
94
+
95
+
96
+ class HFTextEncoder(nn.Module):
97
+ """HuggingFace model adapter"""
98
+ output_tokens: torch.jit.Final[bool]
99
+
100
+ def __init__(
101
+ self,
102
+ model_name_or_path: str,
103
+ output_dim: int,
104
+ config: PretrainedConfig = None,
105
+ pooler_type: str = None,
106
+ proj_type: str = None,
107
+ pretrained: bool = True,
108
+ output_tokens: bool = False,
109
+ ):
110
+ super().__init__()
111
+ self.output_tokens = output_tokens
112
+ self.output_dim = output_dim
113
+
114
+ # TODO: find better way to get this information
115
+ uses_transformer_pooler = (pooler_type == "cls_pooler")
116
+
117
+ if transformers is None:
118
+ raise RuntimeError("Please `pip install transformers` to use pre-trained HuggingFace models")
119
+ if config is None:
120
+ self.config = AutoConfig.from_pretrained(model_name_or_path)
121
+ create_func, model_args = (AutoModel.from_pretrained, model_name_or_path) if pretrained else (
122
+ AutoModel.from_config, self.config)
123
+ # TODO: do all model configs have this attribute? PretrainedConfig does so yes??
124
+ if hasattr(self.config, "is_encoder_decoder") and self.config.is_encoder_decoder:
125
+ self.transformer = create_func(model_args)
126
+ self.transformer = self.transformer.encoder
127
+ else:
128
+ self.transformer = create_func(model_args, add_pooling_layer=uses_transformer_pooler)
129
+ else:
130
+ self.config = config
131
+ self.transformer = AutoModel.from_config(config)
132
+ if pooler_type is None: # get default arch pooler
133
+ pooler_type = (arch_dict[self.config.model_type]["pooler"])
134
+
135
+ # FIXME downstream users of OpenCLIP models use these attr, need to verify valid across all models
136
+ self.vocab_size = getattr(self.config, 'vocab_size', 0)
137
+ self.context_length = getattr(self.config, 'max_position_embeddings', 0)
138
+
139
+ self.pooler = _POOLERS[pooler_type]()
140
+
141
+ d_model = getattr(self.config, arch_dict[self.config.model_type]["config_names"]["width"])
142
+ if (d_model == output_dim) and (proj_type is None): # do we always need a proj?
143
+ self.proj = nn.Identity()
144
+ elif proj_type == 'linear':
145
+ self.proj = nn.Linear(d_model, output_dim, bias=False)
146
+ elif proj_type == 'mlp':
147
+ hidden_size = (d_model + output_dim) // 2
148
+ self.proj = nn.Sequential(
149
+ nn.Linear(d_model, hidden_size, bias=False),
150
+ nn.GELU(),
151
+ nn.Linear(hidden_size, output_dim, bias=False),
152
+ )
153
+
154
+ def forward(self, x: TensorType):
155
+ attn_mask = (x != self.config.pad_token_id).long()
156
+ out = self.transformer(input_ids=x, attention_mask=attn_mask)
157
+ pooled_out = self.pooler(out, attn_mask)
158
+ projected = self.proj(pooled_out)
159
+
160
+ seq_len = out.last_hidden_state.shape[1]
161
+ tokens = (
162
+ out.last_hidden_state[:, torch.arange(seq_len) != self.pooler.cls_token_position, :]
163
+ if type(self.pooler) == ClsPooler
164
+ else out.last_hidden_state
165
+ )
166
+
167
+ if self.output_tokens:
168
+ return projected, tokens
169
+ return projected
170
+
171
+ def lock(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True):
172
+ if not unlocked_layers: # full freezing
173
+ for n, p in self.transformer.named_parameters():
174
+ p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False
175
+ return
176
+
177
+ encoder = self.transformer.encoder if hasattr(self.transformer, 'encoder') else self.transformer
178
+ layer_list = getattr(encoder, arch_dict[self.config.model_type]["config_names"]["layer_attr"])
179
+ print(f"Unlocking {unlocked_layers}/{len(layer_list) + 1} layers of hf model")
180
+ embeddings = getattr(
181
+ self.transformer, arch_dict[self.config.model_type]["config_names"]["token_embeddings_attr"])
182
+ modules = [embeddings, *layer_list][:-unlocked_layers]
183
+ # freeze layers
184
+ for module in modules:
185
+ for n, p in module.named_parameters():
186
+ p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False
187
+
188
+ @torch.jit.ignore
189
+ def set_grad_checkpointing(self, enable=True):
190
+ self.transformer.gradient_checkpointing_enable()
191
+
192
+ def init_parameters(self):
193
+ pass
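To illustrate the pooler registry and the mean pooling used as the default for HF text towers, here is a small self-contained sketch; it fakes the transformer output with a SimpleNamespace so no HF checkpoint is needed, and assumes the package is importable as open_clip.

import torch
from types import SimpleNamespace

from open_clip.hf_model import _POOLERS, MeanPooler

print(sorted(_POOLERS))  # ['cls_last_hidden_state_pooler', 'cls_pooler', 'max_pooler', 'mean_pooler']

hidden = torch.randn(2, 4, 8)                      # (batch, seq_len, width)
mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])  # 1 = real token, 0 = padding
pooled = MeanPooler()(SimpleNamespace(last_hidden_state=hidden), mask)
print(pooled.shape)                                # torch.Size([2, 8])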
src/open_clip/model.py ADDED
@@ -0,0 +1,782 @@
1
+ """ CLIP Model
2
+
3
+ Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
4
+ """
5
+ import copy
6
+ import logging
7
+ import math
8
+ from dataclasses import dataclass
9
+ from typing import Any, Dict, Optional, Tuple, Union
10
+ import timm
11
+ from timm.data import resolve_data_config
12
+ from timm.data.transforms_factory import create_transform
13
+ from timm.layers import SwiGLUPacked
14
+ import numpy as np
15
+ import torch
16
+ import torch.nn.functional as F
17
+ from torch import nn
18
+ from torch.utils.checkpoint import checkpoint
19
+ from functools import partial
20
+ from transformers import AutoTokenizer, AutoModel, AutoConfig
21
+ from llm2vec.models import Qwen2BiModel
22
+
23
+ # from .hf_configs import arch_dict
24
+ from .hf_model import HFTextEncoder
25
+ from .timm_model import TimmModel
26
+ from .transformer import LayerNormFp32, LayerNorm, QuickGELU, Attention, VisionTransformer, TextTransformer,\
27
+ text_global_pool
28
+ from .utils import to_2tuple
29
+
30
+
31
+ @dataclass
32
+ class CLIPVisionCfg:
33
+ layers: Union[Tuple[int, int, int, int], int] = 12
34
+ width: int = 768
35
+ head_width: int = 64
36
+ mlp_ratio: float = 4.0
37
+ patch_size: int = 16
38
+ image_size: Union[Tuple[int, int], int] = 224
39
+
40
+ ls_init_value: Optional[float] = None # layer scale initial value
41
+ patch_dropout: float = 0. # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
42
+ attentional_pool: bool = False # whether to use attentional pooler in the last embedding layer (overrides pool_type)
43
+ attn_pooler_queries: int = 256 # n_queries for attentional pooler
44
+ attn_pooler_heads: int = 8 # n heads for attentional_pooling
45
+ no_ln_pre: bool = False # disable pre transformer LayerNorm
46
+ pos_embed_type: str = 'learnable'
47
+ final_ln_after_pool: bool = False # apply final LayerNorm after pooling
48
+ pool_type: str = 'tok'
49
+ output_tokens: bool = False
50
+ act_kwargs: Optional[dict] = None
51
+ norm_kwargs: Optional[dict] = None
52
+
53
+ timm_model_name: Optional[str] = None # a valid model name overrides layers, width, patch_size
54
+ timm_model_pretrained: bool = False # use (imagenet) pretrained weights for named model
55
+ timm_pool: str = 'avg' # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
56
+ timm_proj: str = 'linear' # linear projection for timm model output ('linear', 'mlp', '')
57
+ timm_proj_bias: bool = False # enable bias final projection
58
+ timm_drop: float = 0. # head dropout
59
+ timm_drop_path: Optional[float] = None # backbone stochastic depth
60
+
61
+
62
+ @dataclass
63
+ class CLIPTextCfg:
64
+ context_length: int = 77
65
+ vocab_size: int = 49408
66
+ hf_tokenizer_name: Optional[str] = None
67
+ tokenizer_kwargs: Optional[dict] = None
68
+
69
+ width: int = 512
70
+ heads: int = 8
71
+ layers: int = 12
72
+ mlp_ratio: float = 4.0
73
+ ls_init_value: Optional[float] = None # layer scale initial value
74
+ embed_cls: bool = False
75
+ pad_id: int = 0
76
+ no_causal_mask: bool = False # disable causal masking
77
+ final_ln_after_pool: bool = False # apply final LayerNorm after pooling
78
+ pool_type: str = 'argmax'
79
+ proj_bias: bool = False
80
+ output_tokens: bool = False
81
+ act_kwargs: dict = None
82
+ norm_kwargs: dict = None
83
+
84
+ # HuggingFace specific text tower config
85
+ hf_model_name: Optional[str] = None
86
+ hf_model_pretrained: bool = True
87
+ hf_proj_type: str = 'mlp'
88
+ hf_pooler_type: str = 'mean_pooler' # pooling applied to the HF text model output ('mean_pooler', 'cls_pooler', ...)
89
+
90
+
91
+ def get_cast_dtype(precision: str):
92
+ cast_dtype = None
93
+ if precision == 'bf16':
94
+ cast_dtype = torch.bfloat16
95
+ elif precision == 'fp16':
96
+ cast_dtype = torch.float16
97
+ return cast_dtype
98
+
99
+
100
+ def get_input_dtype(precision: str):
101
+ input_dtype = None
102
+ if precision in ('bf16', 'pure_bf16'):
103
+ input_dtype = torch.bfloat16
104
+ elif precision in ('fp16', 'pure_fp16'):
105
+ input_dtype = torch.float16
106
+ return input_dtype
107
+
108
+
109
+ def _build_vision_tower(
110
+ embed_dim: int,
111
+ vision_cfg: CLIPVisionCfg,
112
+ quick_gelu: bool = False,
113
+ cast_dtype: Optional[torch.dtype] = None
114
+ ):
115
+ if isinstance(vision_cfg, dict):
116
+ vision_cfg = CLIPVisionCfg(**vision_cfg)
117
+
118
+ # OpenAI models are pretrained w/ QuickGELU but native nn.GELU is both faster and more
119
+ # memory efficient in recent PyTorch releases (>= 1.10).
120
+ # NOTE: timm models always use native GELU regardless of quick_gelu flag.
121
+ act_layer = QuickGELU if quick_gelu else nn.GELU
122
+
123
+ if vision_cfg.timm_model_name:
124
+ visual = TimmModel(
125
+ vision_cfg.timm_model_name,
126
+ pretrained=vision_cfg.timm_model_pretrained,
127
+ pool=vision_cfg.timm_pool,
128
+ proj=vision_cfg.timm_proj,
129
+ proj_bias=vision_cfg.timm_proj_bias,
130
+ drop=vision_cfg.timm_drop,
131
+ drop_path=vision_cfg.timm_drop_path,
132
+ patch_drop=vision_cfg.patch_dropout if vision_cfg.patch_dropout > 0 else None,
133
+ embed_dim=embed_dim,
134
+ image_size=vision_cfg.image_size,
135
+ )
136
+ elif isinstance(vision_cfg.layers, (tuple, list)):
137
+ vision_heads = vision_cfg.width * 32 // vision_cfg.head_width
138
+ visual = ModifiedResNet(
139
+ layers=vision_cfg.layers,
140
+ output_dim=embed_dim,
141
+ heads=vision_heads,
142
+ image_size=vision_cfg.image_size,
143
+ width=vision_cfg.width,
144
+ )
145
+ else:
146
+ vision_heads = vision_cfg.width // vision_cfg.head_width
147
+ norm_layer = LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm
148
+ if vision_cfg.norm_kwargs:
149
+ norm_layer = partial(norm_layer, **vision_cfg.norm_kwargs)
150
+ if vision_cfg.act_kwargs is not None:
151
+ act_layer = partial(act_layer, **vision_cfg.act_kwargs)
152
+
153
+ visual = VisionTransformer(
154
+ image_size=vision_cfg.image_size,
155
+ patch_size=vision_cfg.patch_size,
156
+ width=vision_cfg.width,
157
+ layers=vision_cfg.layers,
158
+ heads=vision_heads,
159
+ mlp_ratio=vision_cfg.mlp_ratio,
160
+ ls_init_value=vision_cfg.ls_init_value,
161
+ patch_dropout=vision_cfg.patch_dropout,
162
+ attentional_pool=vision_cfg.attentional_pool,
163
+ attn_pooler_queries=vision_cfg.attn_pooler_queries,
164
+ attn_pooler_heads=vision_cfg.attn_pooler_heads,
165
+ pos_embed_type=vision_cfg.pos_embed_type,
166
+ no_ln_pre=vision_cfg.no_ln_pre,
167
+ final_ln_after_pool=vision_cfg.final_ln_after_pool,
168
+ pool_type=vision_cfg.pool_type,
169
+ output_tokens=vision_cfg.output_tokens,
170
+ output_dim=embed_dim,
171
+ act_layer=act_layer,
172
+ norm_layer=norm_layer,
173
+ )
174
+
175
+ return visual
176
+
177
+
178
+ def _build_text_tower(
179
+ embed_dim: int,
180
+ text_cfg: CLIPTextCfg,
181
+ quick_gelu: bool = False,
182
+ cast_dtype: Optional[torch.dtype] = None,
183
+ ):
184
+ if isinstance(text_cfg, dict):
185
+ text_cfg = CLIPTextCfg(**text_cfg)
186
+
187
+ if text_cfg.hf_model_name:
188
+ text = HFTextEncoder(
189
+ text_cfg.hf_model_name,
190
+ output_dim=embed_dim,
191
+ proj_type=text_cfg.hf_proj_type,
192
+ pooler_type=text_cfg.hf_pooler_type,
193
+ pretrained=text_cfg.hf_model_pretrained,
194
+ output_tokens=text_cfg.output_tokens,
195
+ )
196
+ else:
197
+ act_layer = QuickGELU if quick_gelu else nn.GELU
198
+ norm_layer = LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm
199
+ if text_cfg.norm_kwargs:
200
+ norm_layer = partial(norm_layer, **text_cfg.norm_kwargs)
201
+ if text_cfg.act_kwargs is not None:
202
+ act_layer = partial(act_layer, **text_cfg.act_kwargs)
203
+
204
+ text = TextTransformer(
205
+ context_length=text_cfg.context_length,
206
+ vocab_size=text_cfg.vocab_size,
207
+ width=text_cfg.width,
208
+ heads=text_cfg.heads,
209
+ layers=text_cfg.layers,
210
+ mlp_ratio=text_cfg.mlp_ratio,
211
+ ls_init_value=text_cfg.ls_init_value,
212
+ output_dim=embed_dim,
213
+ embed_cls=text_cfg.embed_cls,
214
+ no_causal_mask=text_cfg.no_causal_mask,
215
+ pad_id=text_cfg.pad_id,
216
+ pool_type=text_cfg.pool_type,
217
+ proj_bias=text_cfg.proj_bias,
218
+ output_tokens=text_cfg.output_tokens,
219
+ act_layer=act_layer,
220
+ norm_layer=norm_layer,
221
+ )
222
+ return text
223
+
224
+ def resize_pos_embed(state_dict, interpolation: str = 'bicubic', antialias: bool = True):
225
+ # Rescale the grid of position embeddings when loading from state_dict
226
+
227
+
228
+ old_pos_embed = state_dict.get('pos_embed', None)
230
+ if old_pos_embed is None:
231
+ print('No positional embedding found in state_dict')
232
+ return
+ old_pos_embed = old_pos_embed[0] # drop the leading batch dim only after the None check
232
+ grid_size = to_2tuple([336 // 14, 336 // 14])
233
+ extra_tokens = 5 # FIXME detect different token configs (ie no class token, or more)
234
+ new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
235
+ if new_seq_len == old_pos_embed.shape[0]:
236
+ print('Positional embedding grid-size matches model, no need to resize')
237
+ return
238
+
239
+ if extra_tokens:
240
+ pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
241
+ else:
242
+ pos_emb_tok, pos_emb_img = None, old_pos_embed
243
+
244
+ old_grid_size = to_2tuple(int(math.sqrt(len(pos_emb_img))))
245
+
246
+ print(f'Resizing position embedding grid-size from {old_grid_size} to {grid_size}')
247
+ pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
248
+ pos_emb_img = F.interpolate(
249
+ pos_emb_img,
250
+ size=grid_size,
251
+ mode=interpolation,
252
+ antialias=antialias,
253
+ align_corners=False,
254
+ )
255
+ pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)
256
+ if pos_emb_tok is not None:
257
+ # import pdb
258
+ # pdb.set_trace()
259
+ new_pos_embed = torch.cat([pos_emb_tok.unsqueeze(0), pos_emb_img], dim=1)
260
+ else:
261
+ new_pos_embed = pos_emb_img
262
+ state_dict['pos_embed'] = new_pos_embed
263
+ return state_dict
264
+
265
+ class CLIP(nn.Module):
266
+ output_dict: torch.jit.Final[bool]
267
+
268
+ def __init__(
269
+ self,
270
+ embed_dim: int,
271
+ vision_cfg: CLIPVisionCfg,
272
+ text_cfg: CLIPTextCfg,
273
+ quick_gelu: bool = False,
274
+ init_logit_scale: float = np.log(1 / 0.07),
275
+ init_logit_bias: Optional[float] = None,
276
+ cast_dtype: Optional[torch.dtype] = None,
277
+ output_dict: bool = False,
278
+ ):
279
+ super().__init__()
280
+ self.output_dict = output_dict
281
+ self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
282
+ model = timm.create_model("hf-hub:paige-ai/Virchow2", pretrained=False, mlp_layer=SwiGLUPacked, patch_size=14, img_size=336, # override patch_size / img_size for Virchow2
283
+ act_layer=torch.nn.SiLU)
284
+ self.visual2 = model
285
+
286
+ config = AutoConfig.from_pretrained("../Qwen-encoder-1.5B")
287
+
288
+ # initialize the model structure (without loading pretrained weights)
289
+ self.text = Qwen2BiModel(config)
290
+ self.proj = nn.Linear(1536, 3328) # 2048+1280
291
+ self.logit_scale = nn.Parameter(torch.ones([]) * init_logit_scale)
292
+ if init_logit_bias is not None:
293
+ self.logit_bias = nn.Parameter(torch.ones([]) * init_logit_bias)
294
+ else:
295
+ self.logit_bias = None
296
+
297
+ def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False):
298
+ # lock image tower as per LiT - https://arxiv.org/abs/2111.07991
299
+ self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats)
300
+
301
+ def lock_text_tower(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True):
302
+
303
+ if not unlocked_layers: # full freezing
304
+ for n, p in self.text.named_parameters(): # the text tower here is self.text (Qwen2BiModel), not self.transformer
305
+ p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False
306
+ return
307
+
308
+ encoder = self.text.encoder if hasattr(self.text, 'encoder') else self.text
309
+ layer_list = encoder.layers # assumes a Qwen2-style model exposing its decoder blocks as `.layers` (the arch_dict lookup from hf_configs has no qwen2 entry)
310
+ print(f"Unlocking {unlocked_layers}/{len(layer_list) + 1} layers of hf model")
311
+ embeddings = getattr(
312
+ self.text, "embed_tokens") # Qwen2-style models keep their token embeddings in `embed_tokens`
313
+ modules = [embeddings, *layer_list][:-unlocked_layers]
314
+ # freeze layers
315
+ for module in modules:
316
+ for n, p in module.named_parameters():
317
+ p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False
318
+
319
+ ## lock position embedding (legacy CLIP text-tower attribute; not present on the Qwen2 text tower)
320
+ if getattr(self, 'positional_embedding', None) is not None:
321
+ self.positional_embedding.requires_grad = False
322
+ ## lock token embedding (legacy attribute)
323
+ if getattr(self, 'token_embedding', None) is not None:
324
+ self.token_embedding.requires_grad = False
325
+ ## lock text projection (legacy attribute)
+ if getattr(self, 'text_projection', None) is not None:
+ self.text_projection.requires_grad = False
326
+ @torch.jit.ignore
327
+ def set_grad_checkpointing(self, enable=True):
328
+ self.visual.set_grad_checkpointing(enable)
329
+ self.visual2.set_grad_checkpointing(enable)
330
+ # self.transformer.grad_checkpointing = enable
331
+ self.text._set_gradient_checkpointing(enable)
332
+
333
+ def encode_image(self, image, normalize: bool = False):
334
+ features = self.visual(image)
335
+ features2 = self.visual2(image)
336
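+ # Virchow2 output layout (per its model card): index 0 is the class token, indices 1-4 are register tokens, patch tokens start at index 5; concatenating the class token with the mean patch token gives the recommended 2560-dim embedding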
+ features2 = torch.cat([features2[:, 0, :], features2[:, 5:, :].mean(1)], dim=-1)
337
+ features = torch.cat([features, features2], dim=-1)
338
+ return F.normalize(features, dim=-1) if normalize else features
339
+
340
+ def encode_text(self, text2, normalize: bool = False):
341
+ features = self.text(**text2)
342
+ ### mask attention
343
+ last_hidden_states = features.last_hidden_state # (batch_size, sequence_length, hidden_size)
344
+ attention_mask = text2['attention_mask'] # (batch_size, sequence_length)
345
+ # (batch_size, sequence_length, 1)
346
+ attention_mask = attention_mask.unsqueeze(-1).float()
347
+ masked_hidden_states = last_hidden_states * attention_mask
348
+ # (batch_size, 1, 1)
349
+ valid_token_count = attention_mask.sum(dim=1, keepdim=True)
350
+ # (batch_size, hidden_size)
351
+ features = masked_hidden_states.sum(dim=1) / valid_token_count.squeeze(1)
352
+ features = self.proj(features)
353
+ return F.normalize(features, dim=-1) if normalize else features
354
+
355
+ def get_logits(self, image, text):
356
+ image_features = self.encode_image(image, normalize=True)
357
+ text_features = self.encode_text(text, normalize=True)
358
+ image_logits = self.logit_scale.exp() * image_features @ text_features.T
359
+ if self.logit_bias is not None:
360
+ image_logits += self.logit_bias
361
+ text_logits = image_logits.T
362
+ return image_logits, text_logits
363
+
364
+ def forward(
365
+ self,
366
+ image: Optional[torch.Tensor] = None,
367
+ text: Optional[torch.Tensor] = None,
368
+ ):
369
+ image_features = self.encode_image(image, normalize=True) if image is not None else None
370
+ text_features = self.encode_text(text, normalize=True) if text is not None else None
371
+
372
+ if self.output_dict:
373
+ out_dict = {
374
+ "image_features": image_features,
375
+ "text_features": text_features,
376
+ "logit_scale": self.logit_scale.exp()
377
+ }
378
+ if self.logit_bias is not None:
379
+ out_dict['logit_bias'] = self.logit_bias
380
+ return out_dict
381
+
382
+ if self.logit_bias is not None:
383
+ return image_features, text_features, self.logit_scale.exp(), self.logit_bias
384
+ return image_features, text_features, self.logit_scale.exp()
385
+
386
+
387
+ class CustomTextCLIP(nn.Module):
388
+ output_dict: torch.jit.Final[bool]
389
+
390
+ def __init__(
391
+ self,
392
+ embed_dim: int,
393
+ vision_cfg: CLIPVisionCfg,
394
+ text_cfg: CLIPTextCfg,
395
+ quick_gelu: bool = False,
396
+ init_logit_scale: float = np.log(1 / 0.07),
397
+ init_logit_bias: Optional[float] = None,
398
+ cast_dtype: Optional[torch.dtype] = None,
399
+ output_dict: bool = False,
400
+ ):
401
+ super().__init__()
402
+ self.output_dict = output_dict
403
+ self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
404
+ self.text = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype)
405
+ self.context_length = self.text.context_length
406
+ self.vocab_size = self.text.vocab_size
407
+ self.logit_scale = nn.Parameter(torch.ones([]) * init_logit_scale)
408
+ if init_logit_bias is not None:
409
+ self.logit_bias = nn.Parameter(torch.ones([]) * init_logit_bias)
410
+ else:
411
+ self.logit_bias = None
412
+
413
+ def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False):
414
+ # lock image tower as per LiT - https://arxiv.org/abs/2111.07991
415
+ self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats)
416
+
417
+ def lock_text_tower(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True):
418
+ self.text.lock(unlocked_layers, freeze_layer_norm)
419
+
420
+ @torch.jit.ignore
421
+ def set_grad_checkpointing(self, enable=True):
422
+ self.visual.set_grad_checkpointing(enable)
423
+ self.text.set_grad_checkpointing(enable)
424
+
425
+ def encode_image(self, image, normalize: bool = False):
426
+ features = self.visual(image)
427
+ return F.normalize(features, dim=-1) if normalize else features
428
+
429
+ def encode_text(self, text, normalize: bool = False):
430
+ features = self.text(text)
431
+ return F.normalize(features, dim=-1) if normalize else features
432
+
433
+ def get_logits(self, image, text):
434
+ image_features = self.encode_image(image, normalize=True)
435
+ text_features = self.encode_text(text, normalize=True)
436
+ image_logits = self.logit_scale.exp() * image_features @ text_features.T
437
+ if self.logit_bias is not None:
438
+ image_logits += self.logit_bias
439
+ text_logits = image_logits.T
440
+ return image_logits, text_logits
441
+
442
+ def forward(
443
+ self,
444
+ image: Optional[torch.Tensor] = None,
445
+ text: Optional[torch.Tensor] = None,
446
+ ):
447
+ image_features = self.encode_image(image, normalize=True) if image is not None else None
448
+ text_features = self.encode_text(text, normalize=True) if text is not None else None
449
+
450
+ if self.output_dict:
451
+ out_dict = {
452
+ "image_features": image_features,
453
+ "text_features": text_features,
454
+ "logit_scale": self.logit_scale.exp()
455
+ }
456
+ if self.logit_bias is not None:
457
+ out_dict['logit_bias'] = self.logit_bias
458
+ return out_dict
459
+
460
+ if self.logit_bias is not None:
461
+ return image_features, text_features, self.logit_scale.exp(), self.logit_bias
462
+ return image_features, text_features, self.logit_scale.exp()
463
+
464
+
465
+
466
+
467
+
468
+ class CustomCLIP(nn.Module):
469
+ output_dict: torch.jit.Final[bool]
470
+
471
+ def __init__(
472
+ self,
473
+ embed_dim: int,
474
+ vision_cfg: CLIPVisionCfg,
475
+ text_cfg: CLIPTextCfg,
476
+ quick_gelu: bool = False,
477
+ init_logit_scale: float = np.log(1 / 0.07),
478
+ init_logit_bias: Optional[float] = None,
479
+ cast_dtype: Optional[torch.dtype] = None,
480
+ output_dict: bool = False,
481
+ ):
482
+ super().__init__()
483
+ self.output_dict = output_dict
484
+ model = timm.create_model('hf_hub:paige-ai/Virchow2', pretrained=False)
485
+
486
+ # load the locally saved model weights
487
+ checkpoint_path = "/sunyuxuan/project/2024/model/vision_encoder/pathology/virchow2/pytorch_model.bin"
488
+ state_dict = torch.load(checkpoint_path, map_location="cpu")
489
+ model.load_state_dict(state_dict)
490
+ # import pdb
491
+ # pdb.set_trace()
492
+
493
+
494
+ self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
495
+ self.text = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype)
496
+ self.context_length = self.text.context_length
497
+ self.vocab_size = self.text.vocab_size
498
+ self.logit_scale = nn.Parameter(torch.ones([]) * init_logit_scale)
499
+ if init_logit_bias is not None:
500
+ self.logit_bias = nn.Parameter(torch.ones([]) * init_logit_bias)
501
+ else:
502
+ self.logit_bias = None
503
+
504
+ def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False):
505
+ # lock image tower as per LiT - https://arxiv.org/abs/2111.07991
506
+ self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats)
507
+
508
+ def lock_text_tower(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True):
509
+ self.text.lock(unlocked_layers, freeze_layer_norm)
510
+
511
+ @torch.jit.ignore
512
+ def set_grad_checkpointing(self, enable=True):
513
+ self.visual.set_grad_checkpointing(enable)
514
+ self.text.set_grad_checkpointing(enable)
515
+
516
+ def encode_image(self, image, normalize: bool = False):
517
+ features = self.visual(image)
518
+ return F.normalize(features, dim=-1) if normalize else features
519
+
520
+ def encode_text(self, text, normalize: bool = False):
521
+ features = self.text(text)
522
+ return F.normalize(features, dim=-1) if normalize else features
523
+
524
+ def get_logits(self, image, text):
525
+ image_features = self.encode_image(image, normalize=True)
526
+ text_features = self.encode_text(text, normalize=True)
527
+ image_logits = self.logit_scale.exp() * image_features @ text_features.T
528
+ if self.logit_bias is not None:
529
+ image_logits += self.logit_bias
530
+ text_logits = image_logits.T
531
+ return image_logits, text_logits
532
+
533
+ def forward(
534
+ self,
535
+ image: Optional[torch.Tensor] = None,
536
+ text: Optional[torch.Tensor] = None,
537
+ ):
538
+ image_features = self.encode_image(image, normalize=True) if image is not None else None
539
+ text_features = self.encode_text(text, normalize=True) if text is not None else None
540
+
541
+ if self.output_dict:
542
+ out_dict = {
543
+ "image_features": image_features,
544
+ "text_features": text_features,
545
+ "logit_scale": self.logit_scale.exp()
546
+ }
547
+ if self.logit_bias is not None:
548
+ out_dict['logit_bias'] = self.logit_bias
549
+ return out_dict
550
+
551
+ if self.logit_bias is not None:
552
+ return image_features, text_features, self.logit_scale.exp(), self.logit_bias
553
+ return image_features, text_features, self.logit_scale.exp()
554
+
555
+
556
+ def convert_weights_to_lp(model: nn.Module, dtype=torch.float16):
557
+ """Convert applicable model parameters to low-precision (bf16 or fp16)"""
558
+
559
+ def _convert_weights(l):
560
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
561
+ l.weight.data = l.weight.data.to(dtype)
562
+ if l.bias is not None:
563
+ l.bias.data = l.bias.data.to(dtype)
564
+
565
+ if isinstance(l, (nn.MultiheadAttention, Attention)):
566
+ for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
567
+ tensor = getattr(l, attr)
568
+ if tensor is not None:
569
+ tensor.data = tensor.data.to(dtype)
570
+
571
+ if isinstance(l, (CLIP, TextTransformer)):
572
+ # convert text nn.Parameter projections
573
+ attr = getattr(l, "text_projection", None)
574
+ if attr is not None:
575
+ attr.data = attr.data.to(dtype)
576
+
577
+ if isinstance(l, VisionTransformer):
578
+ # convert vision nn.Parameter projections
579
+ attr = getattr(l, "proj", None)
580
+ if attr is not None:
581
+ attr.data = attr.data.to(dtype)
582
+
583
+ model.apply(_convert_weights)
584
+
585
+
586
+ convert_weights_to_fp16 = convert_weights_to_lp # backwards compat
587
+
588
+
589
+ # used to maintain checkpoint compatibility
590
+ def convert_to_custom_text_state_dict(state_dict: dict):
591
+ if 'text_projection' in state_dict:
592
+ # old format state_dict, move text tower -> .text
593
+ new_state_dict = {}
594
+ for k, v in state_dict.items():
595
+ if any(k.startswith(p) for p in (
596
+ 'text_projection',
597
+ 'positional_embedding',
598
+ 'token_embedding',
599
+ 'transformer',
600
+ 'ln_final',
601
+ )):
602
+ k = 'text.' + k
603
+ new_state_dict[k] = v
604
+ return new_state_dict
605
+ return state_dict
606
+
607
+
608
+ def build_model_from_openai_state_dict(
609
+ state_dict: dict,
610
+ quick_gelu=True,
611
+ cast_dtype=torch.float16,
612
+ ):
613
+ vit = "visual.proj" in state_dict
614
+
615
+ if vit:
616
+ vision_width = state_dict["visual.conv1.weight"].shape[0]
617
+ vision_layers = len(
618
+ [k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
619
+ vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
620
+ grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
621
+ image_size = vision_patch_size * grid_size
622
+ else:
623
+ counts: list = [
624
+ len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
625
+ vision_layers = tuple(counts)
626
+ vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
627
+ output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
628
+ vision_patch_size = None
629
+ assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
630
+ image_size = output_width * 32
631
+
632
+ embed_dim = state_dict["text_projection"].shape[1]
633
+ context_length = state_dict["positional_embedding"].shape[0]
634
+ vocab_size = state_dict["token_embedding.weight"].shape[0]
635
+ transformer_width = state_dict["ln_final.weight"].shape[0]
636
+ transformer_heads = transformer_width // 64
637
+ transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks")))
638
+
639
+ vision_cfg = CLIPVisionCfg(
640
+ layers=vision_layers,
641
+ width=vision_width,
642
+ patch_size=vision_patch_size,
643
+ image_size=image_size,
644
+ )
645
+ text_cfg = CLIPTextCfg(
646
+ context_length=context_length,
647
+ vocab_size=vocab_size,
648
+ width=transformer_width,
649
+ heads=transformer_heads,
650
+ layers=transformer_layers,
651
+ )
652
+ model = CLIP(
653
+ embed_dim,
654
+ vision_cfg=vision_cfg,
655
+ text_cfg=text_cfg,
656
+ quick_gelu=quick_gelu, # OpenAI models were trained with QuickGELU
657
+ cast_dtype=cast_dtype,
658
+ )
659
+
660
+ for key in ["input_resolution", "context_length", "vocab_size"]:
661
+ state_dict.pop(key, None)
662
+ convert_weights_to_fp16(model) # OpenAI state dicts are partially converted to float16
663
+ model.load_state_dict(state_dict, strict=True)
664
+ return model.eval()
665
+
666
+
667
+ def trace_model(model, batch_size=256, device=torch.device('cpu')):
668
+ model.eval()
669
+ image_size = model.visual.image_size
670
+ example_images = torch.ones((batch_size, 3, image_size, image_size), device=device)
671
+ example_text = torch.zeros((batch_size, model.context_length), dtype=torch.int, device=device)
672
+ model = torch.jit.trace_module(
673
+ model,
674
+ inputs=dict(
675
+ forward=(example_images, example_text),
676
+ encode_text=(example_text,),
677
+ encode_image=(example_images,)
678
+ ))
679
+ model.visual.image_size = image_size
680
+ return model
681
+ #
682
+ #
683
+ # def resize_pos_embed(state_dict, model, interpolation: str = 'bicubic', antialias: bool = True):
684
+ # # Rescale the grid of position embeddings when loading from state_dict
685
+ # old_pos_embed = state_dict.get('visual.positional_embedding', None)
686
+ # if old_pos_embed is None or not hasattr(model.visual, 'grid_size'):
687
+ # return
688
+ # grid_size = to_2tuple(model.visual.grid_size)
689
+ # extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more)
690
+ # new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
691
+ # if new_seq_len == old_pos_embed.shape[0]:
692
+ # return
693
+ #
694
+ # if extra_tokens:
695
+ # pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
696
+ # else:
697
+ # pos_emb_tok, pos_emb_img = None, old_pos_embed
698
+ # old_grid_size = to_2tuple(int(math.sqrt(len(pos_emb_img))))
699
+ #
700
+ # logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)
701
+ # pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
702
+ # pos_emb_img = F.interpolate(
703
+ # pos_emb_img,
704
+ # size=grid_size,
705
+ # mode=interpolation,
706
+ # antialias=antialias,
707
+ # align_corners=False,
708
+ # )
709
+ # pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
710
+ # if pos_emb_tok is not None:
711
+ # new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
712
+ # else:
713
+ # new_pos_embed = pos_emb_img
714
+ # state_dict['visual.positional_embedding'] = new_pos_embed
715
+
716
+
717
+ def resize_text_pos_embed(state_dict, model, interpolation: str = 'linear', antialias: bool = False):
718
+ old_pos_embed = state_dict.get('positional_embedding', None)
719
+ if old_pos_embed is None:
720
+ return
721
+ # FIXME add support for text cls_token
722
+ model_pos_embed = getattr(model, 'positional_embedding', None)
723
+ if model_pos_embed is None:
724
+ model_pos_embed = getattr(model.text, 'positional_embedding', None)
725
+
726
+ old_num_pos = old_pos_embed.shape[0]
727
+ old_width = old_pos_embed.shape[1]
728
+ num_pos = model_pos_embed.shape[0]
729
+ width = model_pos_embed.shape[1]
730
+ assert old_width == width, 'text pos_embed width changed!'
731
+ if old_num_pos == num_pos:
732
+ return
733
+
734
+ logging.info('Resizing text position embedding num_pos from %s to %s', old_num_pos, num_pos)
735
+ old_pos_embed = old_pos_embed.reshape(1, old_num_pos, old_width).permute(0, 2, 1)
736
+ old_pos_embed = F.interpolate(
737
+ old_pos_embed,
738
+ size=num_pos,
739
+ mode=interpolation,
740
+ antialias=antialias,
741
+ align_corners=False,
742
+ )
743
+ old_pos_embed = old_pos_embed.permute(0, 2, 1)[0]
744
+ new_pos_embed = old_pos_embed
745
+
746
+ state_dict['positional_embedding'] = new_pos_embed
747
+
748
+
749
+ def get_model_preprocess_cfg(model):
750
+ module = getattr(model, 'visual', model)
751
+ preprocess_cfg = getattr(module, 'preprocess_cfg', {})
752
+ if not preprocess_cfg:
753
+ # use separate legacy attributes if preprocess_cfg dict not found
754
+ size = getattr(module, 'image_size')
755
+ if size is not None:
756
+ preprocess_cfg['size'] = size
757
+ mean = getattr(module, 'image_mean', None)
758
+ if mean is not None:
759
+ preprocess_cfg['mean'] = mean
760
+ std = getattr(module, 'image_std', None)
761
+ if std is not None:
762
+ preprocess_cfg['std'] = std
763
+ return preprocess_cfg
764
+
765
+
766
+ def set_model_preprocess_cfg(model, preprocess_cfg: Dict[str, Any]):
767
+ module = getattr(model, 'visual', model)
768
+ module.image_mean = preprocess_cfg['mean'] # legacy attribute, keeping for bwd compat
769
+ module.image_std = preprocess_cfg['std'] # legacy attribute, keeping for bwd compat
770
+ module.preprocess_cfg = copy.deepcopy(preprocess_cfg) # new attr, package all pp cfg as dict
771
+
772
+
773
+ def get_model_tokenize_cfg(model):
774
+ module = getattr(model, 'text', model)
775
+ cfg = {}
776
+ context_length = getattr(module, 'context_length', None)
777
+ if context_length is not None:
778
+ cfg['context_length'] = context_length
779
+ vocab_size = getattr(module, 'vocab_size', None)
780
+ if vocab_size is not None:
781
+ cfg['vocab_size'] = vocab_size
782
+ return cfg
src/open_clip/model_configs/ViT-L-14-336.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "image_size": 336,
5
+ "layers": 24,
6
+ "width": 1024,
7
+ "patch_size": 14
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 768,
13
+ "heads": 12,
14
+ "layers": 12
15
+ }
16
+ }
src/open_clip/openai.py ADDED
@@ -0,0 +1,90 @@
1
+ """ OpenAI pretrained model functions
2
+
3
+ Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
4
+ """
5
+
6
+ import os
7
+ import warnings
8
+ from typing import List, Optional, Union
9
+
10
+ import torch
11
+
12
+ from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
13
+ from .model import build_model_from_openai_state_dict, convert_weights_to_lp, get_cast_dtype
14
+ from .pretrained import get_pretrained_url, list_pretrained_models_by_tag, download_pretrained_from_url
15
+
16
+ __all__ = ["list_openai_models", "load_openai_model"]
17
+
18
+
19
+ def list_openai_models() -> List[str]:
20
+ """Returns the names of available CLIP models"""
21
+ return list_pretrained_models_by_tag('openai')
22
+
23
+
24
+ def load_openai_model(
25
+ name: str,
26
+ precision: Optional[str] = None,
27
+ device: Optional[Union[str, torch.device]] = None,
28
+ cache_dir: Optional[str] = None,
29
+ ):
30
+ """Load a CLIP model
31
+
32
+ Parameters
33
+ ----------
34
+ name : str
35
+ A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
36
+ precision: str
37
+ Model precision, if None defaults to 'fp32' if device == 'cpu' else 'fp16'.
38
+ device : Union[str, torch.device]
39
+ The device to put the loaded model
40
+ cache_dir : Optional[str]
41
+ The directory to cache the downloaded model weights
42
+
43
+ Returns
44
+ -------
45
+ model : torch.nn.Module
46
+ The CLIP model
47
+ preprocess : Callable[[PIL.Image], torch.Tensor]
48
+ A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
49
+ """
50
+ if device is None:
51
+ device = "cuda" if torch.cuda.is_available() else "cpu"
52
+ if precision is None:
53
+ precision = 'fp32' if device == 'cpu' else 'fp16'
54
+
55
+ if get_pretrained_url(name, 'openai'):
56
+ model_path = download_pretrained_from_url(get_pretrained_url(name, 'openai'), cache_dir=cache_dir)
57
+ elif os.path.isfile(name):
58
+ model_path = name
59
+ else:
60
+ raise RuntimeError(f"Model {name} not found; available models = {list_openai_models()}")
61
+
62
+ try:
63
+ # loading JIT archive
64
+ model = torch.jit.load(model_path, map_location="cpu").eval()
65
+ state_dict = None
66
+ except RuntimeError:
67
+ # loading saved state dict
68
+ state_dict = torch.load(model_path, map_location="cpu")
69
+
70
+ # Build a non-jit model from the OpenAI jitted model state dict
71
+ cast_dtype = get_cast_dtype(precision)
72
+ try:
73
+ model = build_model_from_openai_state_dict(state_dict or model.state_dict(), cast_dtype=cast_dtype)
74
+ except KeyError:
75
+ sd = {k[7:]: v for k, v in state_dict["state_dict"].items()}
76
+ model = build_model_from_openai_state_dict(sd, cast_dtype=cast_dtype)
77
+
78
+ # model from OpenAI state dict is in manually cast fp16 mode, must be converted for AMP/fp32/bf16 use
79
+ model = model.to(device)
80
+ # FIXME support pure fp16/bf16 precision modes
81
+ if precision != 'fp16':
82
+ model.float()
83
+ if precision == 'bf16':
84
+ # for bf16, convert back to low-precision
85
+ convert_weights_to_lp(model, dtype=torch.bfloat16)
86
+
87
+ # add mean / std attributes for consistency with OpenCLIP models
88
+ model.visual.image_mean = OPENAI_DATASET_MEAN
89
+ model.visual.image_std = OPENAI_DATASET_STD
90
+ return model
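A usage sketch for the OpenAI loader above; it downloads the referenced checkpoint on first use and assumes the pretrained registry in pretrained.py (below) carries an 'openai' tag for the chosen model name.

from open_clip.openai import list_openai_models, load_openai_model

print(list_openai_models())  # model names that have an 'openai' pretrained tag
model = load_openai_model('ViT-L-14-336', precision='fp32', device='cpu')
model.eval()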
src/open_clip/pos_embed.py ADDED
@@ -0,0 +1,96 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # --------------------------------------------------------
7
+ # Position embedding utils
8
+ # --------------------------------------------------------
9
+
10
+ import numpy as np
11
+
12
+ import torch
13
+
14
+ # --------------------------------------------------------
15
+ # 2D sine-cosine position embedding
16
+ # References:
17
+ # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
18
+ # MoCo v3: https://github.com/facebookresearch/moco-v3
19
+ # --------------------------------------------------------
20
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
21
+ """
22
+ grid_size: int of the grid height and width
23
+ return:
24
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
25
+ """
26
+ grid_h = np.arange(grid_size, dtype=np.float32)
27
+ grid_w = np.arange(grid_size, dtype=np.float32)
28
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
29
+ grid = np.stack(grid, axis=0)
30
+
31
+ grid = grid.reshape([2, 1, grid_size, grid_size])
32
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
33
+ if cls_token:
34
+ pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
35
+ return pos_embed
36
+
37
+
38
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
39
+ assert embed_dim % 2 == 0
40
+
41
+ # use half of dimensions to encode grid_h
42
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
43
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
44
+
45
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
46
+ return emb
47
+
48
+
49
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
50
+ """
51
+ embed_dim: output dimension for each position
52
+ pos: a list of positions to be encoded: size (M,)
53
+ out: (M, D)
54
+ """
55
+ assert embed_dim % 2 == 0
56
+ omega = np.arange(embed_dim // 2, dtype=float)
57
+ omega /= embed_dim / 2.
58
+ omega = 1. / 10000**omega # (D/2,)
59
+
60
+ pos = pos.reshape(-1) # (M,)
61
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
62
+
63
+ emb_sin = np.sin(out) # (M, D/2)
64
+ emb_cos = np.cos(out) # (M, D/2)
65
+
66
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
67
+ return emb
68
+
69
+
70
+ # --------------------------------------------------------
71
+ # Interpolate position embeddings for high-resolution
72
+ # References:
73
+ # DeiT: https://github.com/facebookresearch/deit
74
+ # --------------------------------------------------------
75
+ def interpolate_pos_embed(model, checkpoint_model):
76
+ if 'pos_embed' in checkpoint_model:
77
+ pos_embed_checkpoint = checkpoint_model['pos_embed']
78
+ embedding_size = pos_embed_checkpoint.shape[-1]
79
+ num_patches = model.patch_embed.num_patches
80
+ num_extra_tokens = model.pos_embed.shape[-2] - num_patches
81
+ # height (== width) for the checkpoint position embedding
82
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
83
+ # height (== width) for the new position embedding
84
+ new_size = int(num_patches ** 0.5)
85
+ # class_token and dist_token are kept unchanged
86
+ if orig_size != new_size:
87
+ print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
88
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
89
+ # only the position tokens are interpolated
90
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
91
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
92
+ pos_tokens = torch.nn.functional.interpolate(
93
+ pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
94
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
95
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
96
+ checkpoint_model['pos_embed'] = new_pos_embed
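A minimal usage sketch for the helpers above (an illustrative example, not part of the uploaded file; it assumes the two functions above are in scope and uses a hypothetical 16x16 patch grid with a 768-dim embedding):
import numpy as np
# Build a fixed 2D sine-cosine table for a 16x16 patch grid, with a leading cls-token slot.
pos = get_2d_sincos_pos_embed(embed_dim=768, grid_size=16, cls_token=True)
print(pos.shape)  # (1 + 16*16, 768)
# Typical use: copy the table into a ViT's positional embedding parameter, e.g.
# model.pos_embed.data.copy_(torch.from_numpy(pos).float().unsqueeze(0))
# When loading a checkpoint trained at a different resolution, resize its table first:
# checkpoint = torch.load('checkpoint.pth', map_location='cpu')  # hypothetical path
# interpolate_pos_embed(model, checkpoint['model'])
# model.load_state_dict(checkpoint['model'], strict=False)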
src/open_clip/pretrained.py ADDED
@@ -0,0 +1,655 @@
1
+ import hashlib
2
+ import os
3
+ import urllib
4
+ import warnings
5
+ from functools import partial
6
+ from typing import Dict, Union
7
+
8
+ from tqdm import tqdm
9
+
10
+ from .constants import (
11
+ IMAGENET_MEAN,
12
+ IMAGENET_STD,
13
+ INCEPTION_MEAN,
14
+ INCEPTION_STD,
15
+ OPENAI_DATASET_MEAN,
16
+ OPENAI_DATASET_STD,
17
+ )
18
+ from .version import __version__
19
+
20
+ try:
21
+ from huggingface_hub import hf_hub_download
22
+ hf_hub_download = partial(hf_hub_download, library_name="open_clip", library_version=__version__)
23
+ _has_hf_hub = True
24
+ except ImportError:
25
+ hf_hub_download = None
26
+ _has_hf_hub = False
27
+
28
+
29
+ def _pcfg(url='', hf_hub='', **kwargs):
30
+ # OpenAI / OpenCLIP defaults
31
+ return {
32
+ 'url': url,
33
+ 'hf_hub': hf_hub,
34
+ 'mean': OPENAI_DATASET_MEAN,
35
+ 'std': OPENAI_DATASET_STD,
36
+ 'interpolation': 'bicubic',
37
+ 'resize_mode': 'shortest',
38
+ **kwargs,
39
+ }
40
+
41
+
42
+ def _slpcfg(url='', hf_hub='', **kwargs):
43
+ # SigLIP defaults
44
+ return {
45
+ 'url': url,
46
+ 'hf_hub': hf_hub,
47
+ 'mean': INCEPTION_MEAN,
48
+ 'std': INCEPTION_STD,
49
+ 'interpolation': 'bicubic',
50
+ 'resize_mode': 'squash',
51
+ **kwargs,
52
+ }
53
+
54
+
55
+ def _apcfg(url='', hf_hub='', **kwargs):
56
+ # CLIPA defaults
57
+ return {
58
+ 'url': url,
59
+ 'hf_hub': hf_hub,
60
+ 'mean': IMAGENET_MEAN,
61
+ 'std': IMAGENET_STD,
62
+ 'interpolation': 'bilinear',
63
+ 'resize_mode': 'squash',
64
+ **kwargs,
65
+ }
66
+
67
+
68
+ def _mccfg(url='', hf_hub='', **kwargs):
69
+ # MobileCLIP
70
+ return {
71
+ 'url': url,
72
+ 'hf_hub': hf_hub,
73
+ 'mean': (0., 0., 0.),
74
+ 'std': (1., 1., 1.),
75
+ 'interpolation': 'bilinear',
76
+ 'resize_mode': 'shortest',
77
+ **kwargs,
78
+ }
79
+
80
+
81
+
82
+ _RN50 = dict(
83
+ openai=_pcfg(
84
+ "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt"),
85
+ yfcc15m=_pcfg(
86
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-yfcc15m-455df137.pt"),
87
+ cc12m=_pcfg(
88
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-cc12m-f000538c.pt"),
89
+ )
90
+
91
+ _RN50_quickgelu = dict(
92
+ openai=_pcfg(
93
+ "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt"),
94
+ yfcc15m=_pcfg(
95
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-yfcc15m-455df137.pt"),
96
+ cc12m=_pcfg(
97
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-cc12m-f000538c.pt"),
98
+ )
99
+
100
+ _RN101 = dict(
101
+ openai=_pcfg(
102
+ "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt"),
103
+ yfcc15m=_pcfg(
104
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn101-quickgelu-yfcc15m-3e04b30e.pt"),
105
+ )
106
+
107
+ _RN101_quickgelu = dict(
108
+ openai=_pcfg(
109
+ "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt"),
110
+ yfcc15m=_pcfg(
111
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn101-quickgelu-yfcc15m-3e04b30e.pt"),
112
+ )
113
+
114
+ _RN50x4 = dict(
115
+ openai=_pcfg(
116
+ "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt"),
117
+ )
118
+
119
+ _RN50x16 = dict(
120
+ openai=_pcfg(
121
+ "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt"),
122
+ )
123
+
124
+ _RN50x64 = dict(
125
+ openai=_pcfg(
126
+ "https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt"),
127
+ )
128
+
129
+ _VITB32 = dict(
130
+ openai=_pcfg(
131
+ "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt"),
132
+ laion400m_e31=_pcfg(
133
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e31-d867053b.pt"),
134
+ laion400m_e32=_pcfg(
135
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e32-46683a32.pt"),
136
+ laion2b_e16=_pcfg(
137
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-laion2b_e16-af8dbd0c.pth"),
138
+ laion2b_s34b_b79k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-laion2B-s34B-b79K/'),
139
+ # DataComp-XL models
140
+ datacomp_xl_s13b_b90k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K/'),
141
+ # DataComp-M models
142
+ datacomp_m_s128m_b4k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-DataComp.M-s128M-b4K/'),
143
+ commonpool_m_clip_s128m_b4k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-CommonPool.M.clip-s128M-b4K/'),
144
+ commonpool_m_laion_s128m_b4k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-CommonPool.M.laion-s128M-b4K/'),
145
+ commonpool_m_image_s128m_b4k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-CommonPool.M.image-s128M-b4K/'),
146
+ commonpool_m_text_s128m_b4k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-CommonPool.M.text-s128M-b4K/'),
147
+ commonpool_m_basic_s128m_b4k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-CommonPool.M.basic-s128M-b4K/'),
148
+ commonpool_m_s128m_b4k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-CommonPool.M-s128M-b4K/'),
149
+ # DataComp-S models
150
+ datacomp_s_s13m_b4k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-DataComp.S-s13M-b4K/'),
151
+ commonpool_s_clip_s13m_b4k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-CommonPool.S.clip-s13M-b4K/'),
152
+ commonpool_s_laion_s13m_b4k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-CommonPool.S.laion-s13M-b4K/'),
153
+ commonpool_s_image_s13m_b4k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-CommonPool.S.image-s13M-b4K/'),
154
+ commonpool_s_text_s13m_b4k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-CommonPool.S.text-s13M-b4K/'),
155
+ commonpool_s_basic_s13m_b4k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-CommonPool.S.basic-s13M-b4K/'),
156
+ commonpool_s_s13m_b4k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-CommonPool.S-s13M-b4K/'),
157
+ )
158
+
159
+ _VITB32_quickgelu = dict(
160
+ openai=_pcfg(
161
+ "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt"),
162
+ laion400m_e31=_pcfg(
163
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e31-d867053b.pt"),
164
+ laion400m_e32=_pcfg(
165
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e32-46683a32.pt"),
166
+ metaclip_400m=_pcfg(
167
+ "https://dl.fbaipublicfiles.com/MMPT/metaclip/b32_400m.pt"),
168
+ metaclip_fullcc=_pcfg(
169
+ "https://dl.fbaipublicfiles.com/MMPT/metaclip/b32_fullcc2.5b.pt"),
170
+ )
171
+
172
+ _VITB32_256 = dict(
173
+ datacomp_s34b_b86k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K/'),
174
+ )
175
+
176
+ _VITB16 = dict(
177
+ openai=_pcfg(
178
+ "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt"),
179
+ laion400m_e31=_pcfg(
180
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16-laion400m_e31-00efa78f.pt"),
181
+ laion400m_e32=_pcfg(
182
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16-laion400m_e32-55e67d44.pt"),
183
+ laion2b_s34b_b88k=_pcfg(hf_hub='laion/CLIP-ViT-B-16-laion2B-s34B-b88K/'),
184
+ # DataComp-XL models
185
+ datacomp_xl_s13b_b90k=_pcfg(hf_hub='laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K/'),
186
+ # DataComp-L models
187
+ datacomp_l_s1b_b8k=_pcfg(hf_hub='laion/CLIP-ViT-B-16-DataComp.L-s1B-b8K/'),
188
+ commonpool_l_clip_s1b_b8k=_pcfg(hf_hub='laion/CLIP-ViT-B-16-CommonPool.L.clip-s1B-b8K/'),
189
+ commonpool_l_laion_s1b_b8k=_pcfg(hf_hub='laion/CLIP-ViT-B-16-CommonPool.L.laion-s1B-b8K/'),
190
+ commonpool_l_image_s1b_b8k=_pcfg(hf_hub='laion/CLIP-ViT-B-16-CommonPool.L.image-s1B-b8K/'),
191
+ commonpool_l_text_s1b_b8k=_pcfg(hf_hub='laion/CLIP-ViT-B-16-CommonPool.L.text-s1B-b8K/'),
192
+ commonpool_l_basic_s1b_b8k=_pcfg(hf_hub='laion/CLIP-ViT-B-16-CommonPool.L.basic-s1B-b8K/'),
193
+ commonpool_l_s1b_b8k=_pcfg(hf_hub='laion/CLIP-ViT-B-16-CommonPool.L-s1B-b8K/'),
194
+ # DFN
195
+ dfn2b=_pcfg(hf_hub='apple/DFN2B-CLIP-ViT-B-16/')
196
+ )
197
+
198
+ _VITB16_quickgelu = dict(
199
+ metaclip_400m=_pcfg(
200
+ "https://dl.fbaipublicfiles.com/MMPT/metaclip/b16_400m.pt"),
201
+ metaclip_fullcc=_pcfg(
202
+ "https://dl.fbaipublicfiles.com/MMPT/metaclip/b16_fullcc2.5b.pt"),
203
+ )
204
+
205
+ _VITB16_PLUS_240 = dict(
206
+ laion400m_e31=_pcfg(
207
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16_plus_240-laion400m_e31-8fb26589.pt"),
208
+ laion400m_e32=_pcfg(
209
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16_plus_240-laion400m_e32-699c4b84.pt"),
210
+ )
211
+
212
+ _VITL14 = dict(
213
+ openai=_pcfg(
214
+ "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt"),
215
+ laion400m_e31=_pcfg(
216
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_l_14-laion400m_e31-69988bb6.pt"),
217
+ laion400m_e32=_pcfg(
218
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_l_14-laion400m_e32-3d133497.pt"),
219
+ laion2b_s32b_b82k=_pcfg(
220
+ hf_hub='laion/CLIP-ViT-L-14-laion2B-s32B-b82K/',
221
+ mean=INCEPTION_MEAN, std=INCEPTION_STD),
222
+ # DataComp-XL models
223
+ datacomp_xl_s13b_b90k=_pcfg(hf_hub='laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K/'),
224
+ commonpool_xl_clip_s13b_b90k=_pcfg(hf_hub='laion/CLIP-ViT-L-14-CommonPool.XL.clip-s13B-b90K/'),
225
+ commonpool_xl_laion_s13b_b90k=_pcfg(hf_hub='laion/CLIP-ViT-L-14-CommonPool.XL.laion-s13B-b90K/'),
226
+ commonpool_xl_s13b_b90k=_pcfg(hf_hub='laion/CLIP-ViT-L-14-CommonPool.XL-s13B-b90K/'),
227
+ )
228
+
229
+ _VITL14_quickgelu = dict(
230
+ metaclip_400m=_pcfg(
231
+ "https://dl.fbaipublicfiles.com/MMPT/metaclip/l14_400m.pt"),
232
+ metaclip_fullcc=_pcfg(
233
+ "https://dl.fbaipublicfiles.com/MMPT/metaclip/l14_fullcc2.5b.pt"),
234
+ dfn2b=_pcfg(hf_hub='apple/DFN2B-CLIP-ViT-L-14/'),
235
+ )
236
+
237
+ _VITL14_336 = dict(
238
+ openai=_pcfg(
239
+ "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt"),
240
+ )
241
+
242
+ _VITH14 = dict(
243
+ laion2b_s32b_b79k=_pcfg(hf_hub='laion/CLIP-ViT-H-14-laion2B-s32B-b79K/'),
244
+ )
245
+
246
+ _VITH14_quickgelu = dict(
247
+ metaclip_fullcc=_pcfg(
248
+ "https://dl.fbaipublicfiles.com/MMPT/metaclip/h14_fullcc2.5b.pt"),
249
+ dfn5b=_pcfg(
250
+ hf_hub='apple/DFN5B-CLIP-ViT-H-14/',
251
+ interpolation="bicubic",
252
+ resize_mode="squash"
253
+ ),
254
+ )
255
+
256
+ _VITH14_378_quickgelu = dict(
257
+ dfn5b=_pcfg(
258
+ hf_hub='apple/DFN5B-CLIP-ViT-H-14-378/',
259
+ interpolation="bicubic",
260
+ resize_mode="squash"
261
+ ),
262
+ )
263
+
264
+ _VITg14 = dict(
265
+ laion2b_s12b_b42k=_pcfg(hf_hub='laion/CLIP-ViT-g-14-laion2B-s12B-b42K/'),
266
+ laion2b_s34b_b88k=_pcfg(hf_hub='laion/CLIP-ViT-g-14-laion2B-s34B-b88K/'),
267
+ )
268
+
269
+ _VITbigG14 = dict(
270
+ laion2b_s39b_b160k=_pcfg(hf_hub='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k/'),
271
+ )
272
+
273
+ _robertaViTB32 = dict(
274
+ laion2b_s12b_b32k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-roberta-base-laion2B-s12B-b32k/'),
275
+ )
276
+
277
+ _xlmRobertaBaseViTB32 = dict(
278
+ laion5b_s13b_b90k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-xlm-roberta-base-laion5B-s13B-b90k/'),
279
+ )
280
+
281
+ _xlmRobertaLargeFrozenViTH14 = dict(
282
+ frozen_laion5b_s13b_b90k=_pcfg(hf_hub='laion/CLIP-ViT-H-14-frozen-xlm-roberta-large-laion5B-s13B-b90k/'),
283
+ )
284
+
285
+ _convnext_base = dict(
286
+ laion400m_s13b_b51k=_pcfg(hf_hub='laion/CLIP-convnext_base-laion400M-s13B-b51K/'),
287
+ )
288
+
289
+ _convnext_base_w = dict(
290
+ laion2b_s13b_b82k=_pcfg(hf_hub='laion/CLIP-convnext_base_w-laion2B-s13B-b82K/'),
291
+ laion2b_s13b_b82k_augreg=_pcfg(hf_hub='laion/CLIP-convnext_base_w-laion2B-s13B-b82K-augreg/'),
292
+ laion_aesthetic_s13b_b82k=_pcfg(hf_hub='laion/CLIP-convnext_base_w-laion_aesthetic-s13B-b82K/'),
293
+ )
294
+
295
+ _convnext_base_w_320 = dict(
296
+ laion_aesthetic_s13b_b82k=_pcfg(hf_hub='laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K/'),
297
+ laion_aesthetic_s13b_b82k_augreg=_pcfg(hf_hub='laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K-augreg/'),
298
+ )
299
+
300
+ _convnext_large_d = dict(
301
+ laion2b_s26b_b102k_augreg=_pcfg(hf_hub='laion/CLIP-convnext_large_d.laion2B-s26B-b102K-augreg/'),
302
+ )
303
+
304
+ _convnext_large_d_320 = dict(
305
+ laion2b_s29b_b131k_ft=_pcfg(hf_hub='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft/'),
306
+ laion2b_s29b_b131k_ft_soup=_pcfg(hf_hub='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup/'),
307
+ )
308
+
309
+ _convnext_xxlarge = dict(
310
+ laion2b_s34b_b82k_augreg=_pcfg(hf_hub='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg/'),
311
+ laion2b_s34b_b82k_augreg_rewind=_pcfg(hf_hub='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-rewind/'),
312
+ laion2b_s34b_b82k_augreg_soup=_pcfg(hf_hub='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup/'),
313
+ )
314
+
315
+ _coca_VITB32 = dict(
316
+ laion2b_s13b_b90k=_pcfg(hf_hub='laion/CoCa-ViT-B-32-laion2B-s13B-b90k/'),
317
+ mscoco_finetuned_laion2b_s13b_b90k=_pcfg(hf_hub='laion/mscoco_finetuned_CoCa-ViT-B-32-laion2B-s13B-b90k/')
318
+ )
319
+
320
+ _coca_VITL14 = dict(
321
+ laion2b_s13b_b90k=_pcfg(hf_hub='laion/CoCa-ViT-L-14-laion2B-s13B-b90k/'),
322
+ mscoco_finetuned_laion2b_s13b_b90k=_pcfg(hf_hub='laion/mscoco_finetuned_CoCa-ViT-L-14-laion2B-s13B-b90k/')
323
+ )
324
+
325
+
326
+ _PRETRAINED = {
327
+ "RN50": _RN50,
328
+ "RN50-quickgelu": _RN50_quickgelu,
329
+ "RN101": _RN101,
330
+ "RN101-quickgelu": _RN101_quickgelu,
331
+ "RN50x4": _RN50x4,
332
+ "RN50x16": _RN50x16,
333
+ "RN50x64": _RN50x64,
334
+
335
+ "ViT-B-32": _VITB32,
336
+ "ViT-B-32-256": _VITB32_256,
337
+ "ViT-B-32-quickgelu": _VITB32_quickgelu,
338
+ "ViT-B-16": _VITB16,
339
+ "ViT-B-16-quickgelu": _VITB16_quickgelu,
340
+ "ViT-B-16-plus-240": _VITB16_PLUS_240,
341
+ "ViT-L-14": _VITL14,
342
+ "ViT-L-14-quickgelu": _VITL14_quickgelu,
343
+ "ViT-L-14-336": _VITL14_336,
344
+ "ViT-H-14": _VITH14,
345
+ "ViT-H-14-quickgelu": _VITH14_quickgelu,
346
+ "ViT-H-14-378-quickgelu": _VITH14_378_quickgelu,
347
+ "ViT-g-14": _VITg14,
348
+ "ViT-bigG-14": _VITbigG14,
349
+
350
+ "roberta-ViT-B-32": _robertaViTB32,
351
+ "xlm-roberta-base-ViT-B-32": _xlmRobertaBaseViTB32,
352
+ "xlm-roberta-large-ViT-H-14": _xlmRobertaLargeFrozenViTH14,
353
+
354
+ "convnext_base": _convnext_base,
355
+ "convnext_base_w": _convnext_base_w,
356
+ "convnext_base_w_320": _convnext_base_w_320,
357
+ "convnext_large_d": _convnext_large_d,
358
+ "convnext_large_d_320": _convnext_large_d_320,
359
+ "convnext_xxlarge": _convnext_xxlarge,
360
+
361
+ "coca_ViT-B-32": _coca_VITB32,
362
+ "coca_ViT-L-14": _coca_VITL14,
363
+
364
+ "EVA01-g-14": dict(
365
+ # from QuanSun/EVA-CLIP/EVA01_CLIP_g_14_psz14_s11B.pt
366
+ laion400m_s11b_b41k=_pcfg(hf_hub='timm/eva_giant_patch14_clip_224.laion400m_s11b_b41k/'),
367
+ ),
368
+ "EVA01-g-14-plus": dict(
369
+ # from QuanSun/EVA-CLIP/EVA01_CLIP_g_14_plus_psz14_s11B.pt
370
+ merged2b_s11b_b114k=_pcfg(hf_hub='timm/eva_giant_patch14_plus_clip_224.merged2b_s11b_b114k/'),
371
+ ),
372
+ "EVA02-B-16": dict(
373
+ # from QuanSun/EVA-CLIP/EVA02_CLIP_B_psz16_s8B.pt
374
+ merged2b_s8b_b131k=_pcfg(hf_hub='timm/eva02_base_patch16_clip_224.merged2b_s8b_b131k/'),
375
+ ),
376
+ "EVA02-L-14": dict(
377
+ # from QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_s4B.pt
378
+ merged2b_s4b_b131k=_pcfg(hf_hub='timm/eva02_large_patch14_clip_224.merged2b_s4b_b131k/'),
379
+ ),
380
+ "EVA02-L-14-336": dict(
381
+ # from QuanSun/EVA-CLIP/EVA02_CLIP_L_336_psz14_s6B.pt
382
+ merged2b_s6b_b61k=_pcfg(hf_hub='timm/eva02_large_patch14_clip_336.merged2b_s6b_b61k/'),
383
+ ),
384
+ "EVA02-E-14": dict(
385
+ # from QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_s4B.pt
386
+ laion2b_s4b_b115k=_pcfg(hf_hub='timm/eva02_enormous_patch14_clip_224.laion2b_s4b_b115k/'),
387
+ ),
388
+ "EVA02-E-14-plus": dict(
389
+ # from QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt
390
+ laion2b_s9b_b144k=_pcfg(hf_hub='timm/eva02_enormous_patch14_plus_clip_224.laion2b_s9b_b144k/'),
391
+ ),
392
+
393
+ "ViT-B-16-SigLIP": dict(
394
+ webli=_slpcfg(hf_hub='timm/ViT-B-16-SigLIP/'),
395
+ ),
396
+ "ViT-B-16-SigLIP-256": dict(
397
+ webli=_slpcfg(hf_hub='timm/ViT-B-16-SigLIP-256/'),
398
+ ),
399
+ "ViT-B-16-SigLIP-i18n-256": dict(
400
+ webli=_slpcfg(hf_hub='timm/ViT-B-16-SigLIP-i18n-256/'),
401
+ ),
402
+ "ViT-B-16-SigLIP-384": dict(
403
+ webli=_slpcfg(hf_hub='timm/ViT-B-16-SigLIP-384/'),
404
+ ),
405
+ "ViT-B-16-SigLIP-512": dict(
406
+ webli=_slpcfg(hf_hub='timm/ViT-B-16-SigLIP-512/'),
407
+ ),
408
+ "ViT-L-16-SigLIP-256": dict(
409
+ webli=_slpcfg(hf_hub='timm/ViT-L-16-SigLIP-256/'),
410
+ ),
411
+ "ViT-L-16-SigLIP-384": dict(
412
+ webli=_slpcfg(hf_hub='timm/ViT-L-16-SigLIP-384/'),
413
+ ),
414
+ "ViT-SO400M-14-SigLIP": dict(
415
+ webli=_slpcfg(hf_hub='timm/ViT-SO400M-14-SigLIP/'),
416
+ ),
417
+ "ViT-SO400M-14-SigLIP-384": dict(
418
+ webli=_slpcfg(hf_hub='timm/ViT-SO400M-14-SigLIP-384/'),
419
+ ),
420
+
421
+ "ViT-L-14-CLIPA": dict(
422
+ datacomp1b=_apcfg(hf_hub='UCSC-VLAA/ViT-L-14-CLIPA-datacomp1B/'),
423
+ ),
424
+ "ViT-L-14-CLIPA-336": dict(
425
+ datacomp1b=_apcfg(hf_hub='UCSC-VLAA/ViT-L-14-CLIPA-336-datacomp1B/'),
426
+ ),
427
+ "ViT-H-14-CLIPA": dict(
428
+ datacomp1b=_apcfg(hf_hub='UCSC-VLAA/ViT-H-14-CLIPA-datacomp1B/'),
429
+ ),
430
+ "ViT-H-14-CLIPA-336": dict(
431
+ laion2b=_apcfg(hf_hub='UCSC-VLAA/ViT-H-14-CLIPA-336-laion2B/'),
432
+ datacomp1b=_apcfg(hf_hub='UCSC-VLAA/ViT-H-14-CLIPA-336-datacomp1B/'),
433
+ ),
434
+ "ViT-bigG-14-CLIPA": dict(
435
+ datacomp1b=_apcfg(hf_hub='UCSC-VLAA/ViT-bigG-14-CLIPA-datacomp1B/'),
436
+ ),
437
+ "ViT-bigG-14-CLIPA-336": dict(
438
+ datacomp1b=_apcfg(hf_hub='UCSC-VLAA/ViT-bigG-14-CLIPA-336-datacomp1B/'),
439
+ ),
440
+
441
+ "nllb-clip-base": dict(
442
+ v1=_pcfg(hf_hub='visheratin/nllb-clip-base-oc/'),
443
+ ),
444
+ "nllb-clip-large": dict(
445
+ v1=_pcfg(hf_hub='visheratin/nllb-clip-large-oc/'),
446
+ ),
447
+
448
+ "nllb-clip-base-siglip": dict(
449
+ v1=_slpcfg(hf_hub='visheratin/nllb-clip-base-siglip/'),
450
+ mrl=_slpcfg(hf_hub='visheratin/nllb-siglip-mrl-base/'),
451
+ ),
452
+ "nllb-clip-large-siglip": dict(
453
+ v1=_slpcfg(hf_hub='visheratin/nllb-clip-large-siglip/'),
454
+ mrl=_slpcfg(hf_hub='visheratin/nllb-siglip-mrl-large/'),
455
+ ),
456
+
457
+ "MobileCLIP-S1": dict(
458
+ datacompdr=_mccfg(hf_hub='apple/MobileCLIP-S1-OpenCLIP/')),
459
+ "MobileCLIP-S2": dict(
460
+ datacompdr=_mccfg(hf_hub='apple/MobileCLIP-S2-OpenCLIP/')),
461
+ "MobileCLIP-B": dict(
462
+ datacompdr=_mccfg(hf_hub='apple/MobileCLIP-B-OpenCLIP/'),
463
+ datacompdr_lt=_mccfg(hf_hub='apple/MobileCLIP-B-LT-OpenCLIP/'),
464
+ ),
465
+
466
+ "ViTamin-S": dict(
467
+ datacomp1b=_pcfg(hf_hub='jienengchen/ViTamin-S/pytorch_model.bin'),
468
+ ),
469
+ "ViTamin-S-LTT": dict(
470
+ datacomp1b=_pcfg(hf_hub='jienengchen/ViTamin-S-LTT/pytorch_model.bin'),
471
+ ),
472
+ "ViTamin-B": dict(
473
+ datacomp1b=_pcfg(hf_hub='jienengchen/ViTamin-B/pytorch_model.bin'),
474
+ ),
475
+ "ViTamin-B-LTT": dict(
476
+ datacomp1b=_pcfg(hf_hub='jienengchen/ViTamin-B-LTT/pytorch_model.bin'),
477
+ ),
478
+ "ViTamin-L": dict(
479
+ datacomp1b=_pcfg(hf_hub='jienengchen/ViTamin-L-224px/pytorch_model.bin'),
480
+ ),
481
+ "ViTamin-L-256": dict(
482
+ datacomp1b=_pcfg(hf_hub='jienengchen/ViTamin-L-256px/pytorch_model.bin'),
483
+ ),
484
+ "ViTamin-L-336": dict(
485
+ datacomp1b=_pcfg(hf_hub='jienengchen/ViTamin-L-336px/pytorch_model.bin'),
486
+ ),
487
+ "ViTamin-L-384": dict(
488
+ datacomp1b=_pcfg(hf_hub='jienengchen/ViTamin-L-384px/pytorch_model.bin'),
489
+ ),
490
+ "ViTamin-L2": dict(
491
+ datacomp1b=_pcfg(hf_hub='jienengchen/ViTamin-L2-224px/pytorch_model.bin'),
492
+ ),
493
+ "ViTamin-L2-256": dict(
494
+ datacomp1b=_pcfg(hf_hub='jienengchen/ViTamin-L2-256px/pytorch_model.bin'),
495
+ ),
496
+ "ViTamin-L2-336": dict(
497
+ datacomp1b=_pcfg(hf_hub='jienengchen/ViTamin-L2-336px/pytorch_model.bin'),
498
+ ),
499
+ "ViTamin-L2-384": dict(
500
+ datacomp1b=_pcfg(hf_hub='jienengchen/ViTamin-L2-384px/pytorch_model.bin'),
501
+ ),
502
+ "ViTamin-XL-256": dict(
503
+ datacomp1b=_pcfg(hf_hub='jienengchen/ViTamin-XL-256px/pytorch_model.bin'),
504
+ ),
505
+ "ViTamin-XL-336": dict(
506
+ datacomp1b=_pcfg(hf_hub='jienengchen/ViTamin-XL-336px/pytorch_model.bin'),
507
+ ),
508
+ "ViTamin-XL-384": dict(
509
+ datacomp1b=_pcfg(hf_hub='jienengchen/ViTamin-XL-384px/pytorch_model.bin'),
510
+ ),
511
+ }
512
+
513
+
514
+ def _clean_tag(tag: str):
515
+ # normalize pretrained tags
516
+ return tag.lower().replace('-', '_')
517
+
518
+
519
+ def list_pretrained(as_str: bool = False):
520
+ """ returns list of pretrained models
521
+ Returns a tuple (model_name, pretrain_tag) by default or 'name:tag' if as_str == True
522
+ """
523
+ return [':'.join([k, t]) if as_str else (k, t) for k in _PRETRAINED.keys() for t in _PRETRAINED[k].keys()]
524
+
525
+
526
+ def list_pretrained_models_by_tag(tag: str):
527
+ """ return all models having the specified pretrain tag """
528
+ models = []
529
+ tag = _clean_tag(tag)
530
+ for k in _PRETRAINED.keys():
531
+ if tag in _PRETRAINED[k]:
532
+ models.append(k)
533
+ return models
534
+
535
+
536
+ def list_pretrained_tags_by_model(model: str):
537
+ """ return all pretrain tags for the specified model architecture """
538
+ tags = []
539
+ if model in _PRETRAINED:
540
+ tags.extend(_PRETRAINED[model].keys())
541
+ return tags
542
+
543
+
544
+ def is_pretrained_cfg(model: str, tag: str):
545
+ if model not in _PRETRAINED:
546
+ return False
547
+ return _clean_tag(tag) in _PRETRAINED[model]
548
+
549
+
550
+ def get_pretrained_cfg(model: str, tag: str):
551
+ if model not in _PRETRAINED:
552
+ return {}
553
+ model_pretrained = _PRETRAINED[model]
554
+ return model_pretrained.get(_clean_tag(tag), {})
555
+
556
+
557
+ def get_pretrained_url(model: str, tag: str):
558
+ cfg = get_pretrained_cfg(model, _clean_tag(tag))
559
+ return cfg.get('url', '')
560
+
561
+
562
+ def download_pretrained_from_url(
563
+ url: str,
564
+ cache_dir: Union[str, None] = None,
565
+ ):
566
+ if not cache_dir:
567
+ cache_dir = os.path.expanduser("~/.cache/clip")
568
+ os.makedirs(cache_dir, exist_ok=True)
569
+ filename = os.path.basename(url)
570
+
571
+ if 'openaipublic' in url:
572
+ expected_sha256 = url.split("/")[-2]
573
+ elif 'mlfoundations' in url:
574
+ expected_sha256 = os.path.splitext(filename)[0].split("-")[-1]
575
+ else:
576
+ expected_sha256 = ''
577
+
578
+ download_target = os.path.join(cache_dir, filename)
579
+
580
+ if os.path.exists(download_target) and not os.path.isfile(download_target):
581
+ raise RuntimeError(f"{download_target} exists and is not a regular file")
582
+
583
+ if os.path.isfile(download_target):
584
+ if expected_sha256:
585
+ if hashlib.sha256(open(download_target, "rb").read()).hexdigest().startswith(expected_sha256):
586
+ return download_target
587
+ else:
588
+ warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
589
+ else:
590
+ return download_target
591
+
592
+ with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
593
+ with tqdm(total=int(source.headers.get("Content-Length")), ncols=80, unit='iB', unit_scale=True) as loop:
594
+ while True:
595
+ buffer = source.read(8192)
596
+ if not buffer:
597
+ break
598
+
599
+ output.write(buffer)
600
+ loop.update(len(buffer))
601
+
602
+ if expected_sha256 and not hashlib.sha256(open(download_target, "rb").read()).hexdigest().startswith(expected_sha256):
603
+ raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not not match")
604
+
605
+ return download_target
606
+
607
+
608
+ def has_hf_hub(necessary=False):
609
+ if not _has_hf_hub and necessary:
610
+ # if no HF Hub module installed, and it is necessary to continue, raise error
611
+ raise RuntimeError(
612
+ 'Hugging Face hub model specified but package not installed. Run `pip install huggingface_hub`.')
613
+ return _has_hf_hub
614
+
615
+
616
+ def download_pretrained_from_hf(
617
+ model_id: str,
618
+ filename: str = 'open_clip_pytorch_model.bin',
619
+ revision=None,
620
+ cache_dir: Union[str, None] = None,
621
+ ):
622
+ has_hf_hub(True)
623
+ cached_file = hf_hub_download(model_id, filename, revision=revision, cache_dir=cache_dir)
624
+ return cached_file
625
+
626
+
627
+ def download_pretrained(
628
+ cfg: Dict,
629
+ force_hf_hub: bool = False,
630
+ cache_dir: Union[str, None] = None,
631
+ ):
632
+ target = ''
633
+ if not cfg:
634
+ return target
635
+
636
+ download_url = cfg.get('url', '')
637
+ download_hf_hub = cfg.get('hf_hub', '')
638
+ if download_hf_hub and force_hf_hub:
639
+ # use HF hub even if url exists
640
+ download_url = ''
641
+
642
+ if download_url:
643
+ target = download_pretrained_from_url(download_url, cache_dir=cache_dir)
644
+ elif download_hf_hub:
645
+ has_hf_hub(True)
646
+ # we assume the hf_hub entries in pretrained config combine model_id + filename in
647
+ # 'org/model_name/filename.pt' form. To specify just the model id w/o filename and
648
+ # use 'open_clip_pytorch_model.bin' default, there must be a trailing slash 'org/model_name/'.
649
+ model_id, filename = os.path.split(download_hf_hub)
650
+ if filename:
651
+ target = download_pretrained_from_hf(model_id, filename=filename, cache_dir=cache_dir)
652
+ else:
653
+ target = download_pretrained_from_hf(model_id, cache_dir=cache_dir)
654
+
655
+ return target
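A short usage sketch for the pretrained registry helpers above (a hedged example, not part of the upload; it assumes this file is importable as open_clip.pretrained and that network access is available for the commented download step):
from open_clip.pretrained import (
    list_pretrained,
    list_pretrained_tags_by_model,
    get_pretrained_cfg,
    download_pretrained,
)
# Enumerate known (architecture, tag) pairs, or just the tags for one architecture.
print(list_pretrained(as_str=True)[:3])           # e.g. ['RN50:openai', ...]
print(list_pretrained_tags_by_model('ViT-B-32'))
# Look up a config dict (url/hf_hub plus preprocessing mean/std, interpolation, resize mode).
cfg = get_pretrained_cfg('ViT-B-32', 'laion2b_s34b_b79k')
print(cfg['hf_hub'], cfg['mean'], cfg['std'])
# Resolve the config to a local checkpoint path (downloads on first use).
# path = download_pretrained(cfg)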
src/open_clip/timm_model.py ADDED
@@ -0,0 +1,152 @@
1
+ """ timm model adapter
2
+
3
+ Wraps timm (https://github.com/rwightman/pytorch-image-models) models for use as a vision tower in a CLIP model.
4
+ """
5
+ import logging
6
+ from collections import OrderedDict
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+ try:
12
+ import timm
13
+ from timm.models.layers import Mlp, to_2tuple
14
+ try:
15
+ # old timm imports < 0.8.1
16
+ from timm.models.layers.attention_pool2d import RotAttentionPool2d
17
+ from timm.models.layers.attention_pool2d import AttentionPool2d as AbsAttentionPool2d
18
+ except ImportError:
19
+ # new timm imports >= 0.8.1
20
+ from timm.layers import RotAttentionPool2d
21
+ from timm.layers import AttentionPool2d as AbsAttentionPool2d
22
+ except ImportError:
23
+ timm = None
24
+
25
+ from .utils import freeze_batch_norm_2d
26
+
27
+
28
+ class TimmModel(nn.Module):
29
+ """ timm model adapter
30
+ """
31
+
32
+ def __init__(
33
+ self,
34
+ model_name,
35
+ embed_dim,
36
+ image_size=224,
37
+ pool='avg',
38
+ proj='linear',
39
+ proj_bias=False,
40
+ drop=0.,
41
+ drop_path=None,
42
+ patch_drop=None,
43
+ pretrained=False,
44
+ ):
45
+ super().__init__()
46
+ if timm is None:
47
+ raise RuntimeError("Please `pip install timm` to use timm models.")
48
+ self.image_size = to_2tuple(image_size)
49
+
50
+ # setup kwargs that may not be common across all models
51
+ timm_kwargs = {}
52
+ if drop_path is not None:
53
+ timm_kwargs['drop_path_rate'] = drop_path
54
+ if patch_drop is not None:
55
+ timm_kwargs['patch_drop_rate'] = patch_drop
56
+
57
+ custom_pool = pool in ('abs_attn', 'rot_attn')
58
+ if proj:
59
+ assert proj in ("linear", "mlp", "none")
60
+ extra_proj = proj in ("linear", "mlp")
61
+ if not extra_proj and not custom_pool:
62
+ # use network classifier head as projection if no proj specified and no custom pooling used
63
+ # if projection is explicitly set to "none", features are passed through from the network trunk
64
+ proj_dim = 0 if proj == 'none' else embed_dim
65
+ self.trunk = timm.create_model(
66
+ model_name,
67
+ num_classes=proj_dim,
68
+ global_pool=pool,
69
+ pretrained=pretrained,
70
+ **timm_kwargs,
71
+ )
72
+ prev_chs = embed_dim
73
+ else:
74
+ self.trunk = timm.create_model(
75
+ model_name,
76
+ pretrained=pretrained,
77
+ **timm_kwargs,
78
+ )
79
+ feat_size = self.trunk.default_cfg.get('pool_size', None)
80
+ feature_ndim = 1 if not feat_size else 2
81
+ if custom_pool:
82
+ assert feature_ndim == 2
83
+ # if attn pooling used, remove both classifier and default pool
84
+ self.trunk.reset_classifier(0, global_pool='')
85
+ else:
86
+ # reset global pool if pool config set, otherwise leave as network default
87
+ reset_kwargs = dict(global_pool=pool) if pool else {}
88
+ self.trunk.reset_classifier(0, **reset_kwargs)
89
+ prev_chs = self.trunk.num_features
90
+
91
+ head_layers = OrderedDict()
92
+
93
+ # Add custom pooling to head
94
+ if pool == 'abs_attn':
95
+ head_layers['pool'] = AbsAttentionPool2d(prev_chs, feat_size=feat_size, out_features=embed_dim)
96
+ prev_chs = embed_dim
97
+ elif pool == 'rot_attn':
98
+ head_layers['pool'] = RotAttentionPool2d(prev_chs, out_features=embed_dim)
99
+ prev_chs = embed_dim
100
+
101
+ # NOTE attention pool ends with a projection layer, so proj should usually be set to '' if such pooling is used
102
+ if proj == 'linear':
103
+ head_layers['drop'] = nn.Dropout(drop)
104
+ head_layers['proj'] = nn.Linear(prev_chs, embed_dim, bias=proj_bias)
105
+ elif proj == 'mlp':
106
+ head_layers['mlp'] = Mlp(prev_chs, 2 * embed_dim, embed_dim, drop=(drop, 0), bias=(True, proj_bias))
107
+
108
+ self.head = nn.Sequential(head_layers)
109
+
110
+ def lock(self, unlocked_groups=0, freeze_bn_stats=False):
111
+ """ lock modules
112
+ Args:
113
+ unlocked_groups (int): leave last n layer groups unlocked (default: 0)
114
+ """
115
+ if not unlocked_groups:
116
+ # lock full model
117
+ for param in self.trunk.parameters():
118
+ param.requires_grad = False
119
+ if freeze_bn_stats:
120
+ freeze_batch_norm_2d(self.trunk)
121
+ else:
122
+ # NOTE: partial freeze requires latest timm (master) branch and is subject to change
123
+ try:
124
+ # FIXME import here until API stable and in an official release
125
+ from timm.models.helpers import group_parameters, group_modules
126
+ except ImportError:
127
+ raise RuntimeError(
128
+ 'Please install latest timm `pip install git+https://github.com/rwightman/pytorch-image-models`')
129
+ matcher = self.trunk.group_matcher()
130
+ gparams = group_parameters(self.trunk, matcher)
131
+ max_layer_id = max(gparams.keys())
132
+ max_layer_id = max_layer_id - unlocked_groups
133
+ for group_idx in range(max_layer_id + 1):
134
+ group = gparams[group_idx]
135
+ for param in group:
136
+ self.trunk.get_parameter(param).requires_grad = False
137
+ if freeze_bn_stats:
138
+ gmodules = group_modules(self.trunk, matcher, reverse=True)
139
+ gmodules = {k for k, v in gmodules.items() if v <= max_layer_id}
140
+ freeze_batch_norm_2d(self.trunk, gmodules)
141
+
142
+ @torch.jit.ignore
143
+ def set_grad_checkpointing(self, enable=True):
144
+ try:
145
+ self.trunk.set_grad_checkpointing(enable)
146
+ except Exception as e:
147
+ logging.warning('grad checkpointing not supported for this timm image tower, continuing without...')
148
+
149
+ def forward(self, x):
150
+ x = self.trunk(x)
151
+ x = self.head(x)
152
+ return x
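A brief sketch of wrapping a timm backbone as a CLIP vision tower with the adapter above (hedged: assumes timm is installed; the vit_base_patch16_224 model name and 512-dim embedding are illustrative choices, not values fixed by this file):
import torch
from open_clip.timm_model import TimmModel
# Wrap a timm ViT trunk and project its pooled features to a 512-dim CLIP embedding space.
tower = TimmModel(
    'vit_base_patch16_224',
    embed_dim=512,
    image_size=224,
    pool='',          # keep the trunk's default pooling
    proj='linear',    # add a dropout + linear projection head
    pretrained=False,
)
tower.lock(unlocked_groups=0, freeze_bn_stats=False)  # freeze the whole trunk
with torch.no_grad():
    feats = tower(torch.randn(2, 3, 224, 224))
print(feats.shape)  # torch.Size([2, 512])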
src/open_clip/tokenizer.py ADDED
@@ -0,0 +1,517 @@
1
+ """ CLIP tokenizer
2
+
3
+ Copied from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
4
+ """
5
+ import gzip
6
+ import html
7
+ import os
8
+ import random
9
+ import string
10
+ from functools import lru_cache, partial
11
+ from typing import Callable, List, Optional, Union
12
+ import warnings
13
+
14
+ import ftfy
15
+ import numpy as np
16
+ import regex as re
17
+ import torch
18
+
19
+ # https://stackoverflow.com/q/62691279
20
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
21
+ _nltk_init = False
22
+
23
+ DEFAULT_CONTEXT_LENGTH = 77 # default context length for OpenAI CLIP
24
+
25
+
26
+ @lru_cache()
27
+ def default_bpe():
28
+ return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
29
+
30
+
31
+ @lru_cache()
32
+ def bytes_to_unicode():
33
+ """
34
+ Returns list of utf-8 bytes and a corresponding list of unicode strings.
35
+ The reversible bpe codes work on unicode strings.
36
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
37
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
38
+ This is a significant percentage of your normal, say, 32K bpe vocab.
39
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
40
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
41
+ """
42
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
43
+ cs = bs[:]
44
+ n = 0
45
+ for b in range(2**8):
46
+ if b not in bs:
47
+ bs.append(b)
48
+ cs.append(2**8+n)
49
+ n += 1
50
+ cs = [chr(n) for n in cs]
51
+ return dict(zip(bs, cs))
52
+
53
+
54
+ def get_pairs(word):
55
+ """Return set of symbol pairs in a word.
56
+ Word is represented as tuple of symbols (symbols being variable-length strings).
57
+ """
58
+ pairs = set()
59
+ prev_char = word[0]
60
+ for char in word[1:]:
61
+ pairs.add((prev_char, char))
62
+ prev_char = char
63
+ return pairs
64
+
65
+
66
+ def basic_clean(text):
67
+ text = ftfy.fix_text(text)
68
+ text = html.unescape(html.unescape(text))
69
+ return text.strip()
70
+
71
+
72
+ def whitespace_clean(text):
73
+ text = " ".join(text.split())
74
+ text = text.strip()
75
+ return text
76
+
77
+
78
+ def _clean_canonicalize(x):
79
+ # basic, remove whitespace, remove punctuation, lower case
80
+ return canonicalize_text(basic_clean(x))
81
+
82
+
83
+ def _clean_lower(x):
84
+ # basic, remove whitespace, lower case
85
+ return whitespace_clean(basic_clean(x)).lower()
86
+
87
+
88
+ def _clean_whitespace(x):
89
+ # basic, remove whitespace
90
+ return whitespace_clean(basic_clean(x))
91
+
92
+
93
+ def get_clean_fn(type: str):
94
+ if type == 'canonicalize':
95
+ return _clean_canonicalize
96
+ elif type == 'lower':
97
+ return _clean_lower
98
+ elif type == 'whitespace':
99
+ return _clean_whitespace
100
+ else:
101
+ assert False, f"Invalid clean function ({type})."
102
+
103
+
104
+ def canonicalize_text(
105
+ text,
106
+ *,
107
+ keep_punctuation_exact_string=None,
108
+ trans_punctuation: dict = str.maketrans("", "", string.punctuation),
109
+ ):
110
+ """Returns canonicalized `text` (lowercase and punctuation removed).
111
+
112
+ From: https://github.com/google-research/big_vision/blob/53f18caf27a9419231bbf08d3388b07671616d3d/big_vision/evaluators/proj/image_text/prompt_engineering.py#L94
113
+
114
+ Args:
115
+ text: string to be canonicalized.
116
+ keep_punctuation_exact_string: If provided, then this exact string is kept.
117
+ For example providing '{}' will keep any occurrences of '{}' (but will
118
+ still remove '{' and '}' that appear separately).
119
+ """
120
+ text = text.replace("_", " ")
121
+ if keep_punctuation_exact_string:
122
+ text = keep_punctuation_exact_string.join(
123
+ part.translate(trans_punctuation)
124
+ for part in text.split(keep_punctuation_exact_string)
125
+ )
126
+ else:
127
+ text = text.translate(trans_punctuation)
128
+ text = text.lower()
129
+ text = " ".join(text.split())
130
+ return text.strip()
131
+
132
+
133
+ class SimpleTokenizer(object):
134
+ def __init__(
135
+ self,
136
+ bpe_path: str = default_bpe(),
137
+ additional_special_tokens: Optional[List[str]] = None,
138
+ context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH,
139
+ clean: str = 'lower',
140
+ reduction_mask: str = ''
141
+ ):
142
+ self.byte_encoder = bytes_to_unicode()
143
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
144
+ merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
145
+ merges = merges[1:49152-256-2+1]
146
+ merges = [tuple(merge.split()) for merge in merges]
147
+ vocab = list(bytes_to_unicode().values())
148
+ vocab = vocab + [v+'</w>' for v in vocab]
149
+ for merge in merges:
150
+ vocab.append(''.join(merge))
151
+ special_tokens = ['<start_of_text>', '<end_of_text>']
152
+ if additional_special_tokens:
153
+ special_tokens += additional_special_tokens
154
+ vocab.extend(special_tokens)
155
+ self.encoder = dict(zip(vocab, range(len(vocab))))
156
+ self.decoder = {v: k for k, v in self.encoder.items()}
157
+ self.bpe_ranks = dict(zip(merges, range(len(merges))))
158
+ self.cache = {t:t for t in special_tokens}
159
+ special = "|".join(special_tokens)
160
+ self.pat = re.compile(
161
+ special + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
162
+ re.IGNORECASE,
163
+ )
164
+ self.vocab_size = len(self.encoder)
165
+ self.all_special_ids = [self.encoder[t] for t in special_tokens]
166
+ self.sot_token_id = self.all_special_ids[0]
167
+ self.eot_token_id = self.all_special_ids[1]
168
+ self.context_length = context_length
169
+ self.clean_fn = get_clean_fn(clean)
170
+ self.reduction_fn = get_reduction_mask_fn(reduction_mask) if reduction_mask else None
171
+
172
+ def bpe(self, token):
173
+ if token in self.cache:
174
+ return self.cache[token]
175
+ word = tuple(token[:-1]) + ( token[-1] + '</w>',)
176
+ pairs = get_pairs(word)
177
+
178
+ if not pairs:
179
+ return token+'</w>'
180
+
181
+ while True:
182
+ bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
183
+ if bigram not in self.bpe_ranks:
184
+ break
185
+ first, second = bigram
186
+ new_word = []
187
+ i = 0
188
+ while i < len(word):
189
+ try:
190
+ j = word.index(first, i)
191
+ new_word.extend(word[i:j])
192
+ i = j
193
+ except Exception:
194
+ new_word.extend(word[i:])
195
+ break
196
+
197
+ if word[i] == first and i < len(word)-1 and word[i+1] == second:
198
+ new_word.append(first+second)
199
+ i += 2
200
+ else:
201
+ new_word.append(word[i])
202
+ i += 1
203
+ new_word = tuple(new_word)
204
+ word = new_word
205
+ if len(word) == 1:
206
+ break
207
+ else:
208
+ pairs = get_pairs(word)
209
+ word = ' '.join(word)
210
+ self.cache[token] = word
211
+ return word
212
+
213
+ def encode(self, text):
214
+ bpe_tokens = []
215
+ text = self.clean_fn(text)
216
+ for token in re.findall(self.pat, text):
217
+ token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
218
+ bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
219
+ return bpe_tokens
220
+
221
+ def decode(self, tokens):
222
+ text = ''.join([self.decoder[token] for token in tokens])
223
+ text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
224
+ return text
225
+
226
+ def __call__(self, texts: Union[str, List[str]], context_length: Optional[int] = None) -> torch.LongTensor:
227
+ """ Returns the tokenized representation of given input string(s)
228
+
229
+ Parameters
230
+ ----------
231
+ texts : Union[str, List[str]]
232
+ An input string or a list of input strings to tokenize
233
+ context_length : int
234
+ The context length to use; all CLIP models use 77 as the context length
235
+
236
+ Returns
237
+ -------
238
+ A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
239
+ """
240
+ if isinstance(texts, str):
241
+ texts = [texts]
242
+
243
+ context_length = context_length or self.context_length
244
+ assert context_length, 'Please set a valid context length'
245
+
246
+ if self.reduction_fn is not None:
247
+ # use reduction strategy for tokenize if set, otherwise default to truncation below
248
+ return self.reduction_fn(
249
+ texts,
250
+ context_length=context_length,
251
+ sot_token_id=self.sot_token_id,
252
+ eot_token_id=self.eot_token_id,
253
+ encode_fn=self.encode,
254
+ )
255
+
256
+ all_tokens = [[self.sot_token_id] + self.encode(text) + [self.eot_token_id] for text in texts]
257
+ result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
258
+
259
+ for i, tokens in enumerate(all_tokens):
260
+ if len(tokens) > context_length:
261
+ tokens = tokens[:context_length] # Truncate
262
+ tokens[-1] = self.eot_token_id
263
+ result[i, :len(tokens)] = torch.tensor(tokens)
264
+
265
+ return result
266
+
267
+
268
+ _tokenizer = SimpleTokenizer()
269
+
270
+
271
+ def decode(output_ids: torch.Tensor):
272
+ output_ids = output_ids.cpu().numpy()
273
+ return _tokenizer.decode(output_ids)
274
+
275
+
276
+ def tokenize(texts: Union[str, List[str]], context_length: int = DEFAULT_CONTEXT_LENGTH) -> torch.LongTensor:
277
+ return _tokenizer(texts, context_length=context_length)
278
+
279
+
280
+ def random_mask_tokenize(
281
+ texts: Union[str, List[str]],
282
+ context_length: int,
283
+ sot_token_id: int,
284
+ eot_token_id: int,
285
+ encode_fn: Callable,
286
+ shuffle: bool = False,
287
+ ):
288
+ all_tokens = [encode_fn(text) for text in texts]
289
+ result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
290
+
291
+ for i, tokens in enumerate(all_tokens):
292
+ tokens = torch.tensor(tokens)
293
+ num_tokens = len(tokens)
294
+ if num_tokens > context_length - 2: # 2 for sot and eot token
295
+ num_keep = context_length - 2
296
+ indices = torch.randperm(len(tokens))
297
+ indices = indices[:num_keep]
298
+ if not shuffle:
299
+ indices = indices.msort()
300
+ tokens = tokens[indices]
301
+ num_tokens = num_keep
302
+ result[i, 0] = sot_token_id
303
+ result[i, 1:num_tokens + 1] = tokens
304
+ result[i, num_tokens + 1] = eot_token_id
305
+
306
+ return result
307
+
308
+
309
+ def simple_mask_tokenize(
310
+ texts: Union[str, List[str]],
311
+ context_length: int,
312
+ sot_token_id: int,
313
+ eot_token_id: int,
314
+ encode_fn: Callable,
315
+ ):
316
+ all_tokens = [encode_fn(text) for text in texts]
317
+ result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
318
+
319
+ for i, tokens in enumerate(all_tokens):
320
+ num_tokens = len(tokens)
321
+ if num_tokens > context_length - 2: # 2 for sot and eot token
322
+ num_keep = context_length - 2
323
+ start_index = random.randint(0, num_tokens - num_keep) # high is incl
324
+ tokens = tokens[start_index: start_index + num_keep]
325
+ tokens = [sot_token_id] + tokens + [eot_token_id]
326
+ result[i, :len(tokens)] = torch.tensor(tokens)
327
+
328
+ return result
329
+
330
+
331
+ def syntax_mask_tokenize(
332
+ texts: Union[str, List[str]],
333
+ context_length: int,
334
+ sot_token_id: int,
335
+ eot_token_id: int,
336
+ encode_fn: Callable,
337
+ ) -> torch.LongTensor:
338
+ """ Returns the tokenized representation of given input string(s).
339
+ Applies syntax masking before tokenizing.
340
+ """
341
+ import nltk
342
+ global _nltk_init
343
+ if not _nltk_init:
344
+ # run them for the first time
345
+ nltk.download('punkt')
346
+ nltk.download('averaged_perceptron_tagger')
347
+ _nltk_init = True
348
+
349
+ def get_order(x):
350
+ if x.startswith('NN'):
351
+ return 1
352
+ elif x.startswith('JJ'):
353
+ return 2
354
+ elif x.startswith('VB'):
355
+ return 3
356
+ else:
357
+ return 4
358
+
359
+ # syntax masking
360
+ new_texts = []
361
+ for text in texts:
362
+ list_tokens = nltk.tokenize.word_tokenize(text)
363
+ pos_tags = nltk.pos_tag(list_tokens)
364
+ # sample the words by get_order method
365
+ order_list = [get_order(tag) for _, tag in pos_tags]
366
+ sorted_ids = np.argsort(np.array(order_list))
367
+ sampled_ids = sorted(sorted_ids[:context_length - 2]) # need 2 slots for sot and eot tokens
368
+ sampled_tokens = np.take(np.array(list_tokens), sampled_ids, axis=0) # sample the tokens
369
+
370
+ new_text = ''
371
+ for token in sampled_tokens:
372
+ new_text = new_text + str(token) + ' '
373
+ new_text = new_text.strip()
374
+ new_texts.append(new_text)
375
+ texts = new_texts
376
+
377
+ all_tokens = [[sot_token_id] + encode_fn(text) + [eot_token_id] for text in texts]
378
+ result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
379
+
380
+ for i, tokens in enumerate(all_tokens):
381
+ # still need to truncate first because some words produce multiple tokens
382
+ if len(tokens) > context_length:
383
+ tokens = tokens[:context_length] # Truncate
384
+ tokens[-1] = eot_token_id
385
+ result[i, :len(tokens)] = torch.tensor(tokens)
386
+
387
+ return result
388
+
389
+
390
+ def get_reduction_mask_fn(type: str):
391
+ """ Choose strategy for dropping (masking) tokens to achieve target context length"""
392
+ assert type in ('simple', 'random', 'shuffle', 'syntax')
393
+ if type == 'simple':
394
+ return simple_mask_tokenize # randomly select block [start:end]
395
+ elif type == 'random':
396
+ return random_mask_tokenize # randomly drop tokens (keep order)
397
+ elif type == 'shuffle':
398
+ return partial(random_mask_tokenize, shuffle=True) # randomly drop tokens (shuffle order)
399
+ elif type == 'syntax':
400
+ return syntax_mask_tokenize # randomly drop prioritized by syntax
401
+
402
+
403
+ class HFTokenizer:
404
+ """HuggingFace tokenizer wrapper"""
405
+
406
+ def __init__(
407
+ self,
408
+ tokenizer_name: str,
409
+ context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH,
410
+ clean: str = 'whitespace',
411
+ strip_sep_token: bool = False,
412
+ language: Optional[str] = None,
413
+ **kwargs
414
+ ):
415
+ from transformers import AutoTokenizer
416
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, **kwargs)
417
+ set_lang_fn = getattr(self.tokenizer, 'set_src_lang_special_tokens', None)
418
+ if callable(set_lang_fn):
419
+ self.set_lang_fn = set_lang_fn
420
+ if language is not None:
421
+ self.set_language(language)
422
+ self.context_length = context_length
423
+ self.clean_fn = get_clean_fn(clean)
424
+ self.strip_sep_token = strip_sep_token
425
+
426
+ def save_pretrained(self, dest):
427
+ self.tokenizer.save_pretrained(dest)
428
+
429
+ def __call__(self, texts: Union[str, List[str]], context_length: Optional[int] = None) -> torch.Tensor:
430
+ # same cleaning as for default tokenizer, except lowercasing
431
+ # adding lower (for case-sensitive tokenizers) will make it more robust but less sensitive to nuance
432
+ if isinstance(texts, str):
433
+ texts = [texts]
434
+
435
+ context_length = context_length or self.context_length
436
+ assert context_length, 'Please set a valid context length in class init or call.'
437
+
438
+ texts = [self.clean_fn(text) for text in texts]
439
+ input_ids = self.tokenizer.batch_encode_plus(
440
+ texts,
441
+ return_tensors='pt',
442
+ max_length=context_length,
443
+ padding='max_length',
444
+ truncation=True,
445
+ ).input_ids
446
+
447
+ if self.strip_sep_token:
448
+ input_ids = torch.where(
449
+ input_ids == self.tokenizer.sep_token_id,
450
+ torch.zeros_like(input_ids),
451
+ input_ids,
452
+ )
453
+
454
+ return input_ids
455
+
456
+ def set_language(self, src_lang):
457
+ if hasattr(self, 'set_lang_fn'):
458
+ self.set_lang_fn(src_lang)
459
+ else:
460
+ warnings.warn('Cannot set language for the tokenizer.')
461
+
462
+
463
+ class SigLipTokenizer:
464
+ """HuggingFace tokenizer wrapper for SigLIP T5 compatible sentencepiece vocabs
465
+ """
466
+ VOCAB_FILES = {
467
+ # english, vocab_size=32_000
468
+ "c4-en": "http://storage.googleapis.com/t5-data/vocabs/cc_en.32000/sentencepiece.model",
469
+ # used in multilingual models (mT5, PaLI), vocab_size=250_000
470
+ "mc4": "http://storage.googleapis.com/t5-data/vocabs/mc4.250000.100extra/sentencepiece.model",
471
+ }
472
+
473
+ def __init__(
474
+ self,
475
+ tokenizer_name: str,
476
+ context_length: Optional[int] = 64,
477
+ ):
478
+ from transformers import T5TokenizerFast
479
+
480
+ if tokenizer_name in self.VOCAB_FILES:
481
+ # FIXME temporary hack?
482
+ import tempfile
483
+
484
+ import fsspec
485
+ vocab_file = self.VOCAB_FILES[tokenizer_name]
486
+ with tempfile.NamedTemporaryFile('wb') as dst:
487
+ with fsspec.open(vocab_file, 'rb') as src:
488
+ dst.write(src.read())
489
+ self.tokenizer = T5TokenizerFast(dst.name, legacy=False)
490
+ else:
491
+ self.tokenizer = T5TokenizerFast(tokenizer_name, legacy=False)
492
+
493
+ self.tokenizer.pad_token_id = 1
494
+ self.tokenizer.eos_token_id = 1
495
+ self.context_length = context_length
496
+
497
+ def save_pretrained(self, dest):
498
+ self.tokenizer.save_pretrained(dest)
499
+
500
+ def __call__(self, texts: Union[str, List[str]], context_length: Optional[int] = None) -> torch.Tensor:
501
+ # same cleaning as for default tokenizer, except lowercasing
502
+ # adding lower (for case-sensitive tokenizers) will make it more robust but less sensitive to nuance
503
+ if isinstance(texts, str):
504
+ texts = [texts]
505
+
506
+ context_length = context_length or self.context_length
507
+ assert context_length, 'Please set a valid context length in class init or call.'
508
+
509
+ texts = [canonicalize_text(basic_clean(text)) for text in texts]
510
+ output = self.tokenizer(
511
+ texts,
512
+ return_tensors='pt',
513
+ max_length=context_length,
514
+ padding='max_length',
515
+ truncation=True,
516
+ )
517
+ return output.input_ids
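A small sketch of the tokenizer entry points above (hedged: the captions are arbitrary and the module path assumes the package layout of this upload):
from open_clip.tokenizer import SimpleTokenizer, tokenize, decode
# Module-level convenience function backed by the default BPE vocab (context length 77).
ids = tokenize(["a photo of a cat", "a diagram"])
print(ids.shape)        # torch.Size([2, 77])
# The class form exposes context length and cleaning options explicitly.
tok = SimpleTokenizer(context_length=32, clean='lower')
ids32 = tok("A very long caption " * 20)   # over-length input is truncated, eot token kept
print(ids32.shape)      # torch.Size([1, 32])
# Round-trip the non-padding token ids of the first caption back to text.
print(decode(ids[0][ids[0] > 0]))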
src/open_clip/transform.py ADDED
@@ -0,0 +1,407 @@
1
+ import numbers
2
+ import random
3
+ import warnings
4
+ from dataclasses import dataclass, asdict
5
+ from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
6
+
7
+ import torch
8
+ import torchvision.transforms.functional as F
9
+ from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \
10
+ CenterCrop, ColorJitter, Grayscale
11
+
12
+ from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
13
+ from .utils import to_2tuple
14
+
15
+
16
+ @dataclass
17
+ class PreprocessCfg:
18
+ size: Union[int, Tuple[int, int]] = 224
19
+ mode: str = 'RGB'
20
+ mean: Tuple[float, ...] = OPENAI_DATASET_MEAN
21
+ std: Tuple[float, ...] = OPENAI_DATASET_STD
22
+ interpolation: str = 'bicubic'
23
+ resize_mode: str = 'shortest'
24
+ fill_color: int = 0
25
+
26
+ def __post_init__(self):
27
+ assert self.mode in ('RGB',)
28
+
29
+ @property
30
+ def num_channels(self):
31
+ return 3
32
+
33
+ @property
34
+ def input_size(self):
35
+ return (self.num_channels,) + to_2tuple(self.size)
36
+
37
+ _PREPROCESS_KEYS = set(asdict(PreprocessCfg()).keys())
38
+
39
+
40
+ def merge_preprocess_dict(
41
+ base: Union[PreprocessCfg, Dict],
42
+ overlay: Dict,
43
+ ):
44
+ """ Merge overlay key-value pairs on top of base preprocess cfg or dict.
45
+ Input dicts are filtered based on PreprocessCfg fields.
46
+ """
47
+ if isinstance(base, PreprocessCfg):
48
+ base_clean = asdict(base)
49
+ else:
50
+ base_clean = {k: v for k, v in base.items() if k in _PREPROCESS_KEYS}
51
+ if overlay:
52
+ overlay_clean = {k: v for k, v in overlay.items() if k in _PREPROCESS_KEYS and v is not None}
53
+ base_clean.update(overlay_clean)
54
+ return base_clean
55
+
56
+
57
+ def merge_preprocess_kwargs(base: PreprocessCfg, **kwargs):
58
+ return merge_preprocess_dict(base, kwargs)
59
+
60
+
61
+ @dataclass
62
+ class AugmentationCfg:
63
+ scale: Tuple[float, float] = (0.9, 1.0)
64
+ ratio: Optional[Tuple[float, float]] = None
65
+ color_jitter: Optional[Union[float, Tuple[float, float, float], Tuple[float, float, float, float]]] = None
66
+ re_prob: Optional[float] = None
67
+ re_count: Optional[int] = None
68
+ use_timm: bool = False
69
+
70
+ # params for simclr_jitter_gray
71
+ color_jitter_prob: Optional[float] = None
72
+ gray_scale_prob: Optional[float] = None
73
+
74
+
75
+ def _setup_size(size, error_msg):
76
+ if isinstance(size, numbers.Number):
77
+ return int(size), int(size)
78
+
79
+ if isinstance(size, Sequence) and len(size) == 1:
80
+ return size[0], size[0]
81
+
82
+ if len(size) != 2:
83
+ raise ValueError(error_msg)
84
+
85
+ return size
86
+
87
+
88
+ class ResizeKeepRatio:
89
+ """ Resize and Keep Ratio
90
+
91
+ Copy & paste from `timm`
92
+ """
93
+
94
+ def __init__(
95
+ self,
96
+ size,
97
+ longest=0.,
98
+ interpolation=InterpolationMode.BICUBIC,
99
+ random_scale_prob=0.,
100
+ random_scale_range=(0.85, 1.05),
101
+ random_aspect_prob=0.,
102
+ random_aspect_range=(0.9, 1.11)
103
+ ):
104
+ if isinstance(size, (list, tuple)):
105
+ self.size = tuple(size)
106
+ else:
107
+ self.size = (size, size)
108
+ self.interpolation = interpolation
109
+ self.longest = float(longest) # [0, 1] where 0 == shortest edge, 1 == longest
110
+ self.random_scale_prob = random_scale_prob
111
+ self.random_scale_range = random_scale_range
112
+ self.random_aspect_prob = random_aspect_prob
113
+ self.random_aspect_range = random_aspect_range
114
+
115
+ @staticmethod
116
+ def get_params(
117
+ img,
118
+ target_size,
119
+ longest,
120
+ random_scale_prob=0.,
121
+ random_scale_range=(0.85, 1.05),
122
+ random_aspect_prob=0.,
123
+ random_aspect_range=(0.9, 1.11)
124
+ ):
125
+ """Get parameters
126
+ """
127
+ source_size = img.size[::-1] # h, w
128
+ h, w = source_size
129
+ target_h, target_w = target_size
130
+ ratio_h = h / target_h
131
+ ratio_w = w / target_w
132
+ ratio = max(ratio_h, ratio_w) * longest + min(ratio_h, ratio_w) * (1. - longest)
133
+ if random_scale_prob > 0 and random.random() < random_scale_prob:
134
+ ratio_factor = random.uniform(random_scale_range[0], random_scale_range[1])
135
+ ratio_factor = (ratio_factor, ratio_factor)
136
+ else:
137
+ ratio_factor = (1., 1.)
138
+ if random_aspect_prob > 0 and random.random() < random_aspect_prob:
139
+ aspect_factor = random.uniform(random_aspect_range[0], random_aspect_range[1])
140
+ ratio_factor = (ratio_factor[0] / aspect_factor, ratio_factor[1] * aspect_factor)
141
+ size = [round(x * f / ratio) for x, f in zip(source_size, ratio_factor)]
142
+ return size
143
+
144
+ def __call__(self, img):
145
+ """
146
+ Args:
147
+ img (PIL Image): Image to be cropped and resized.
148
+
149
+ Returns:
150
+ PIL Image: Resized, padded to at least target size, possibly cropped to exactly target size
151
+ """
152
+ size = self.get_params(
153
+ img, self.size, self.longest,
154
+ self.random_scale_prob, self.random_scale_range,
155
+ self.random_aspect_prob, self.random_aspect_range
156
+ )
157
+ img = F.resize(img, size, self.interpolation)
158
+ return img
159
+
160
+ def __repr__(self):
161
+ format_string = self.__class__.__name__ + '(size={0}'.format(self.size)
162
+ format_string += f', interpolation={self.interpolation}'
163
+ format_string += f', longest={self.longest:.3f})'
164
+ return format_string
165
+
166
+
167
+ def center_crop_or_pad(img: torch.Tensor, output_size: List[int], fill=0) -> torch.Tensor:
168
+ """Center crops and/or pads the given image.
169
+ If the image is torch Tensor, it is expected
170
+ to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
171
+ If image size is smaller than output size along any edge, image is padded with 0 and then center cropped.
172
+
173
+ Args:
174
+ img (PIL Image or Tensor): Image to be cropped.
175
+ output_size (sequence or int): (height, width) of the crop box. If int or sequence with single int,
176
+ it is used for both directions.
177
+ fill (int, Tuple[int]): Padding color
178
+
179
+ Returns:
180
+ PIL Image or Tensor: Cropped image.
181
+ """
182
+ if isinstance(output_size, numbers.Number):
183
+ output_size = (int(output_size), int(output_size))
184
+ elif isinstance(output_size, (tuple, list)) and len(output_size) == 1:
185
+ output_size = (output_size[0], output_size[0])
186
+
187
+ _, image_height, image_width = F.get_dimensions(img)
188
+ crop_height, crop_width = output_size
189
+
190
+ if crop_width > image_width or crop_height > image_height:
191
+ padding_ltrb = [
192
+ (crop_width - image_width) // 2 if crop_width > image_width else 0,
193
+ (crop_height - image_height) // 2 if crop_height > image_height else 0,
194
+ (crop_width - image_width + 1) // 2 if crop_width > image_width else 0,
195
+ (crop_height - image_height + 1) // 2 if crop_height > image_height else 0,
196
+ ]
197
+ img = F.pad(img, padding_ltrb, fill=fill)
198
+ _, image_height, image_width = F.get_dimensions(img)
199
+ if crop_width == image_width and crop_height == image_height:
200
+ return img
201
+
202
+ crop_top = int(round((image_height - crop_height) / 2.0))
203
+ crop_left = int(round((image_width - crop_width) / 2.0))
204
+ return F.crop(img, crop_top, crop_left, crop_height, crop_width)
205
+
206
+
207
+ class CenterCropOrPad(torch.nn.Module):
208
+ """Crops the given image at the center.
209
+ If the image is torch Tensor, it is expected
210
+ to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
211
+ If image size is smaller than output size along any edge, image is padded with 0 and then center cropped.
212
+
213
+ Args:
214
+ size (sequence or int): Desired output size of the crop. If size is an
215
+ int instead of sequence like (h, w), a square crop (size, size) is
216
+ made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
217
+ """
218
+
219
+ def __init__(self, size, fill=0):
220
+ super().__init__()
221
+ self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
222
+ self.fill = fill
223
+
224
+ def forward(self, img):
225
+ """
226
+ Args:
227
+ img (PIL Image or Tensor): Image to be cropped.
228
+
229
+ Returns:
230
+ PIL Image or Tensor: Cropped image.
231
+ """
232
+ return center_crop_or_pad(img, self.size, fill=self.fill)
233
+
234
+ def __repr__(self) -> str:
235
+ return f"{self.__class__.__name__}(size={self.size})"
236
+
237
+
238
+ def _convert_to_rgb(image):
239
+ return image.convert('RGB')
240
+
241
+
242
+ class color_jitter(object):
243
+ """
244
+ Apply Color Jitter to the PIL image with a specified probability.
245
+ """
246
+ def __init__(self, brightness=0., contrast=0., saturation=0., hue=0., p=0.8):
247
+ assert 0. <= p <= 1.
248
+ self.p = p
249
+ self.transf = ColorJitter(brightness=brightness, contrast=contrast, saturation=saturation, hue=hue)
250
+
251
+ def __call__(self, img):
252
+ if random.random() < self.p:
253
+ return self.transf(img)
254
+ else:
255
+ return img
256
+
257
+
258
+ class gray_scale(object):
259
+ """
260
+ Apply Gray Scale to the PIL image with a specified probability.
261
+ """
262
+ def __init__(self, p=0.2):
263
+ assert 0. <= p <= 1.
264
+ self.p = p
265
+ self.transf = Grayscale(num_output_channels=3)
266
+
267
+ def __call__(self, img):
268
+ if random.random() < self.p:
269
+ return self.transf(img)
270
+ else:
271
+ return img
272
+
273
+
274
+ def image_transform(
275
+ image_size: Union[int, Tuple[int, int]],
276
+ is_train: bool,
277
+ mean: Optional[Tuple[float, ...]] = None,
278
+ std: Optional[Tuple[float, ...]] = None,
279
+ resize_mode: Optional[str] = None,
280
+ interpolation: Optional[str] = None,
281
+ fill_color: int = 0,
282
+ aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
283
+ ):
284
+ mean = mean or OPENAI_DATASET_MEAN
285
+ if not isinstance(mean, (list, tuple)):
286
+ mean = (mean,) * 3
287
+
288
+ std = std or OPENAI_DATASET_STD
289
+ if not isinstance(std, (list, tuple)):
290
+ std = (std,) * 3
291
+
292
+ interpolation = interpolation or 'bicubic'
293
+ assert interpolation in ['bicubic', 'bilinear', 'random']
294
+ # NOTE 'random' has no torchvision InterpolationMode equivalent, so it falls back to BICUBIC outside the timm training path
295
+ interpolation_mode = InterpolationMode.BILINEAR if interpolation == 'bilinear' else InterpolationMode.BICUBIC
296
+
297
+ resize_mode = resize_mode or 'shortest'
298
+ assert resize_mode in ('shortest', 'longest', 'squash')
299
+
300
+ if isinstance(aug_cfg, dict):
301
+ aug_cfg = AugmentationCfg(**aug_cfg)
302
+ else:
303
+ aug_cfg = aug_cfg or AugmentationCfg()
304
+
305
+ normalize = Normalize(mean=mean, std=std)
306
+
307
+ if is_train:
308
+ aug_cfg_dict = {k: v for k, v in asdict(aug_cfg).items() if v is not None}
309
+ use_timm = aug_cfg_dict.pop('use_timm', False)
310
+ if use_timm:
311
+ from timm.data import create_transform # timm can still be optional
312
+ if isinstance(image_size, (tuple, list)):
313
+ assert len(image_size) >= 2
314
+ input_size = (3,) + image_size[-2:]
315
+ else:
316
+ input_size = (3, image_size, image_size)
317
+
318
+ aug_cfg_dict.setdefault('color_jitter', None) # disable by default
319
+ # drop extra non-timm items
320
+ aug_cfg_dict.pop('color_jitter_prob', None)
321
+ aug_cfg_dict.pop('gray_scale_prob', None)
322
+
323
+ train_transform = create_transform(
324
+ input_size=input_size,
325
+ is_training=True,
326
+ hflip=0.,
327
+ mean=mean,
328
+ std=std,
329
+ re_mode='pixel',
330
+ interpolation=interpolation,
331
+ **aug_cfg_dict,
332
+ )
333
+ else:
334
+ train_transform = [
335
+ RandomResizedCrop(
336
+ image_size,
337
+ scale=aug_cfg_dict.pop('scale'),
338
+ interpolation=InterpolationMode.BICUBIC,
339
+ ),
340
+ _convert_to_rgb,
341
+ ]
342
+ if aug_cfg.color_jitter_prob:
343
+ assert aug_cfg.color_jitter is not None and len(aug_cfg.color_jitter) == 4
344
+ train_transform.extend([
345
+ color_jitter(*aug_cfg.color_jitter, p=aug_cfg.color_jitter_prob)
346
+ ])
347
+ if aug_cfg.gray_scale_prob:
348
+ train_transform.extend([
349
+ gray_scale(aug_cfg.gray_scale_prob)
350
+ ])
351
+ train_transform.extend([
352
+ ToTensor(),
353
+ normalize,
354
+ ])
355
+ train_transform = Compose(train_transform)
356
+ if aug_cfg_dict:
357
+ warnings.warn(f'Unused augmentation cfg items, specify `use_timm` to use ({list(aug_cfg_dict.keys())}).')
358
+ return train_transform
359
+ else:
360
+ if resize_mode == 'longest':
361
+ transforms = [
362
+ ResizeKeepRatio(image_size, interpolation=interpolation_mode, longest=1),
363
+ CenterCropOrPad(image_size, fill=fill_color)
364
+ ]
365
+ elif resize_mode == 'squash':
366
+ if isinstance(image_size, int):
367
+ image_size = (image_size, image_size)
368
+ transforms = [
369
+ Resize(image_size, interpolation=interpolation_mode),
370
+ ]
371
+ else:
372
+ assert resize_mode == 'shortest'
373
+ if not isinstance(image_size, (tuple, list)):
374
+ image_size = (image_size, image_size)
375
+ if image_size[0] == image_size[1]:
376
+ # simple case, use torchvision built-in Resize w/ shortest edge mode (scalar size arg)
377
+ transforms = [
378
+ Resize(image_size[0], interpolation=interpolation_mode)
379
+ ]
380
+ else:
381
+ # resize shortest edge to matching target dim for non-square target
382
+ transforms = [ResizeKeepRatio(image_size)]
383
+ transforms += [CenterCrop(image_size)]
384
+
385
+ transforms.extend([
386
+ _convert_to_rgb,
387
+ ToTensor(),
388
+ normalize,
389
+ ])
390
+ return Compose(transforms)
391
+
392
+
393
+ def image_transform_v2(
394
+ cfg: PreprocessCfg,
395
+ is_train: bool,
396
+ aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
397
+ ):
398
+ return image_transform(
399
+ image_size=cfg.size,
400
+ is_train=is_train,
401
+ mean=cfg.mean,
402
+ std=cfg.std,
403
+ interpolation=cfg.interpolation,
404
+ resize_mode=cfg.resize_mode,
405
+ fill_color=cfg.fill_color,
406
+ aug_cfg=aug_cfg,
407
+ )
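
For reference, a minimal usage sketch of the preprocessing factory defined above (not part of the uploaded file; the import path and the sample image filename are assumptions):

    from PIL import Image
    from open_clip.transform import PreprocessCfg, image_transform_v2  # module path assumed

    cfg = PreprocessCfg(size=224, resize_mode='shortest', interpolation='bicubic')
    preprocess = image_transform_v2(cfg, is_train=False)  # eval pipeline: resize -> center crop -> ToTensor -> Normalize
    img = Image.open('example.jpg')                       # placeholder input image
    x = preprocess(img)                                   # torch.FloatTensor of shape (3, 224, 224)
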
src/open_clip/transformer.py ADDED
@@ -0,0 +1,908 @@
1
+ from collections import OrderedDict
2
+ import math
3
+ from typing import Callable, List, Optional, Sequence, Tuple, Union
4
+ from functools import partial
5
+
6
+ import torch
7
+ from torch import nn
8
+ from torch.nn import functional as F
9
+ from torch.utils.checkpoint import checkpoint
10
+
11
+ from .utils import to_2tuple
12
+ from .pos_embed import get_2d_sincos_pos_embed
13
+
14
+
15
+ class LayerNormFp32(nn.LayerNorm):
16
+ """Subclass torch's LayerNorm to handle fp16 (by casting to float32 and back)."""
17
+
18
+ def forward(self, x: torch.Tensor):
19
+ orig_type = x.dtype
20
+ x = F.layer_norm(x.to(torch.float32), self.normalized_shape, self.weight, self.bias, self.eps)
21
+ return x.to(orig_type)
22
+
23
+
24
+ class LayerNorm(nn.LayerNorm):
25
+ """Subclass torch's LayerNorm (with cast back to input dtype)."""
26
+
27
+ def forward(self, x: torch.Tensor):
28
+ orig_type = x.dtype
29
+ x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
30
+ return x.to(orig_type)
31
+
32
+
33
+ class QuickGELU(nn.Module):
34
+ # NOTE This is slower than nn.GELU or nn.SiLU and uses more GPU memory
35
+ def forward(self, x: torch.Tensor):
36
+ return x * torch.sigmoid(1.702 * x)
37
+
38
+
39
+ class LayerScale(nn.Module):
40
+ def __init__(self, dim, init_values=1e-5, inplace=False):
41
+ super().__init__()
42
+ self.inplace = inplace
43
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
44
+
45
+ def forward(self, x):
46
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
47
+
48
+
49
+ class PatchDropout(nn.Module):
50
+ """
51
+ https://arxiv.org/abs/2212.00794
52
+ """
53
+
54
+ def __init__(self, prob, exclude_first_token=True):
55
+ super().__init__()
56
+ assert 0 <= prob < 1.
57
+ self.prob = prob
58
+ self.exclude_first_token = exclude_first_token # exclude CLS token
59
+
60
+ def forward(self, x):
61
+ if not self.training or self.prob == 0.:
62
+ return x
63
+
64
+ if self.exclude_first_token:
65
+ cls_tokens, x = x[:, :1], x[:, 1:]
66
+ else:
67
+ cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
68
+
69
+ batch = x.size()[0]
70
+ num_tokens = x.size()[1]
71
+
72
+ batch_indices = torch.arange(batch)
73
+ batch_indices = batch_indices[..., None]
74
+
75
+ keep_prob = 1 - self.prob
76
+ num_patches_keep = max(1, int(num_tokens * keep_prob))
77
+
78
+ rand = torch.randn(batch, num_tokens)
79
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
80
+
81
+ x = x[batch_indices, patch_indices_keep]
82
+
83
+ if self.exclude_first_token:
84
+ x = torch.cat((cls_tokens, x), dim=1)
85
+
86
+ return x
87
+
88
+
89
+ class Attention(nn.Module):
90
+ def __init__(
91
+ self,
92
+ dim: int,
93
+ num_heads: int = 8,
94
+ qkv_bias: bool = True,
95
+ scaled_cosine: bool = False,
96
+ scale_heads: bool = False,
97
+ logit_scale_max: float = math.log(1. / 0.01),
98
+ batch_first: bool = True,
99
+ attn_drop: float = 0.,
100
+ proj_drop: float = 0.
101
+ ):
102
+ super().__init__()
103
+ self.scaled_cosine = scaled_cosine
104
+ self.scale_heads = scale_heads
105
+ assert dim % num_heads == 0, 'dim should be divisible by num_heads'
106
+ self.num_heads = num_heads
107
+ self.head_dim = dim // num_heads
108
+ self.scale = self.head_dim ** -0.5
109
+ self.logit_scale_max = logit_scale_max
110
+ self.batch_first = batch_first
111
+ self.use_fsdpa = hasattr(nn.functional, 'scaled_dot_product_attention')
112
+
113
+ # keeping in_proj in this form (instead of nn.Linear) to match weight scheme of original
114
+ self.in_proj_weight = nn.Parameter(torch.randn((dim * 3, dim)) * self.scale)
115
+ if qkv_bias:
116
+ self.in_proj_bias = nn.Parameter(torch.zeros(dim * 3))
117
+ else:
118
+ self.in_proj_bias = None
119
+
120
+ if self.scaled_cosine:
121
+ self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))))
122
+ else:
123
+ self.logit_scale = None
124
+ self.attn_drop = nn.Dropout(attn_drop)
125
+ if self.scale_heads:
126
+ self.head_scale = nn.Parameter(torch.ones((num_heads, 1, 1)))
127
+ else:
128
+ self.head_scale = None
129
+ self.out_proj = nn.Linear(dim, dim)
130
+ self.out_drop = nn.Dropout(proj_drop)
131
+
132
+ def forward(self, x, attn_mask: Optional[torch.Tensor] = None):
133
+ if self.batch_first:
134
+ x = x.transpose(0, 1)
135
+
136
+ L, N, C = x.shape
137
+ q, k, v = F.linear(x, self.in_proj_weight, self.in_proj_bias).chunk(3, dim=-1)
138
+ q = q.reshape(L, N * self.num_heads, -1).transpose(0, 1)
139
+ k = k.reshape(L, N * self.num_heads, -1).transpose(0, 1)
140
+ v = v.reshape(L, N * self.num_heads, -1).transpose(0, 1)
141
+
142
+ if attn_mask is not None and attn_mask.dtype == torch.bool:
143
+ new_attn_mask = torch.zeros_like(attn_mask, dtype=q.dtype)
144
+ new_attn_mask.masked_fill_(attn_mask, float("-inf"))
145
+ attn_mask = new_attn_mask
146
+
147
+ if self.logit_scale is not None:
148
+ attn = torch.bmm(F.normalize(q, dim=-1), F.normalize(k, dim=-1).transpose(-1, -2))
149
+ logit_scale = torch.clamp(self.logit_scale, max=self.logit_scale_max).exp()
150
+ attn = attn.view(N, self.num_heads, L, L) * logit_scale
151
+ attn = attn.view(-1, L, L)
152
+ if attn_mask is not None:
153
+ attn = attn + attn_mask
154
+ attn = attn.softmax(dim=-1)
155
+ attn = self.attn_drop(attn)
156
+ x = torch.bmm(attn, v)
157
+ else:
158
+ if self.use_fsdpa:
159
+ x = F.scaled_dot_product_attention(
160
+ q, k, v,
161
+ attn_mask=attn_mask,
162
+ dropout_p=self.attn_drop.p if self.training else 0.,
163
+ )
164
+ else:
165
+ q = q * self.scale
166
+ attn = torch.bmm(q, k.transpose(-1, -2))
167
+ if attn_mask is not None:
168
+ attn += attn_mask
169
+ attn = attn.softmax(dim=-1)
170
+ attn = self.attn_drop(attn)
171
+ x = torch.bmm(attn, v)
172
+
173
+ if self.head_scale is not None:
174
+ x = x.view(N, self.num_heads, L, C) * self.head_scale
175
+ x = x.view(-1, L, C)
176
+
177
+ x = x.transpose(0, 1).reshape(L, N, C)
178
+
179
+ if self.batch_first:
180
+ x = x.transpose(0, 1)
181
+
182
+ x = self.out_proj(x)
183
+ x = self.out_drop(x)
184
+ return x
185
+
186
+
187
+ class AttentionalPooler(nn.Module):
188
+ def __init__(
189
+ self,
190
+ d_model: int,
191
+ context_dim: int,
192
+ n_head: int = 8,
193
+ n_queries: int = 256,
194
+ norm_layer: Callable = LayerNorm,
195
+ ):
196
+ super().__init__()
197
+ self.query = nn.Parameter(torch.randn(n_queries, d_model))
198
+ self.attn = nn.MultiheadAttention(d_model, n_head, kdim=context_dim, vdim=context_dim, batch_first=True)
199
+ self.ln_q = norm_layer(d_model)
200
+ self.ln_k = norm_layer(context_dim)
201
+
202
+ def forward(self, x: torch.Tensor):
203
+ N = x.shape[0]
204
+ x = self.ln_k(x)
205
+ q = self.ln_q(self.query)
206
+ out = self.attn(q.unsqueeze(0).expand(N, -1, -1), x, x, need_weights=False)[0]
207
+ return out
208
+
209
+
210
+ class ResidualAttentionBlock(nn.Module):
211
+ def __init__(
212
+ self,
213
+ d_model: int,
214
+ n_head: int,
215
+ mlp_ratio: float = 4.0,
216
+ ls_init_value: float = None,
217
+ act_layer: Callable = nn.GELU,
218
+ norm_layer: Callable = LayerNorm,
219
+ is_cross_attention: bool = False,
220
+ batch_first: bool = True,
221
+ ):
222
+ super().__init__()
223
+
224
+ self.ln_1 = norm_layer(d_model)
225
+ self.attn = nn.MultiheadAttention(d_model, n_head, batch_first=batch_first)
226
+ self.ls_1 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()
227
+ if is_cross_attention:
228
+ self.ln_1_kv = norm_layer(d_model)
229
+
230
+ self.ln_2 = norm_layer(d_model)
231
+ mlp_width = int(d_model * mlp_ratio)
232
+ self.mlp = nn.Sequential(OrderedDict([
233
+ ("c_fc", nn.Linear(d_model, mlp_width)),
234
+ ("gelu", act_layer()),
235
+ ("c_proj", nn.Linear(mlp_width, d_model))
236
+ ]))
237
+ self.ls_2 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()
238
+
239
+ def attention(
240
+ self,
241
+ q_x: torch.Tensor,
242
+ k_x: Optional[torch.Tensor] = None,
243
+ v_x: Optional[torch.Tensor] = None,
244
+ attn_mask: Optional[torch.Tensor] = None,
245
+ ):
246
+ k_x = k_x if k_x is not None else q_x
247
+ v_x = v_x if v_x is not None else q_x
248
+
249
+ attn_mask = attn_mask.to(q_x.dtype) if attn_mask is not None else None
250
+ return self.attn(
251
+ q_x, k_x, v_x, need_weights=False, attn_mask=attn_mask
252
+ )[0]
253
+
254
+ def forward(
255
+ self,
256
+ q_x: torch.Tensor,
257
+ k_x: Optional[torch.Tensor] = None,
258
+ v_x: Optional[torch.Tensor] = None,
259
+ attn_mask: Optional[torch.Tensor] = None,
260
+ ):
261
+ k_x = self.ln_1_kv(k_x) if hasattr(self, "ln_1_kv") and k_x is not None else None
262
+ v_x = self.ln_1_kv(v_x) if hasattr(self, "ln_1_kv") and v_x is not None else None
263
+ x = q_x + self.ls_1(self.attention(q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask))
264
+ x = x + self.ls_2(self.mlp(self.ln_2(x)))
265
+ return x
266
+
267
+
268
+ class CustomResidualAttentionBlock(nn.Module):
269
+ def __init__(
270
+ self,
271
+ d_model: int,
272
+ n_head: int,
273
+ mlp_ratio: float = 4.0,
274
+ ls_init_value: float = None,
275
+ act_layer: Callable = nn.GELU,
276
+ norm_layer: Callable = LayerNorm,
277
+ scale_cosine_attn: bool = False,
278
+ scale_heads: bool = False,
279
+ scale_attn: bool = False,
280
+ scale_fc: bool = False,
281
+ batch_first: bool = True,
282
+ ):
283
+ super().__init__()
284
+
285
+ self.ln_1 = norm_layer(d_model)
286
+ self.attn = Attention(
287
+ d_model,
288
+ n_head,
289
+ scaled_cosine=scale_cosine_attn,
290
+ scale_heads=scale_heads,
291
+ batch_first=batch_first,
292
+ )
293
+ self.ln_attn = norm_layer(d_model) if scale_attn else nn.Identity()
294
+ self.ls_1 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()
295
+
296
+ self.ln_2 = norm_layer(d_model)
297
+ mlp_width = int(d_model * mlp_ratio)
298
+ self.mlp = nn.Sequential(OrderedDict([
299
+ ("c_fc", nn.Linear(d_model, mlp_width)),
300
+ ("gelu", act_layer()),
301
+ ('ln', norm_layer(mlp_width) if scale_fc else nn.Identity()),
302
+ ("c_proj", nn.Linear(mlp_width, d_model))
303
+ ]))
304
+ self.ls_2 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()
305
+
306
+ def get_reference_weight(self):
307
+ return self.mlp.c_fc.weight
308
+
309
+ def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
310
+ x = x + self.ls_1(self.ln_attn(self.attn(self.ln_1(x), attn_mask=attn_mask)))
311
+ x = x + self.ls_2(self.mlp(self.ln_2(x)))
312
+ return x
313
+
314
+
315
+ def _expand_token(token, batch_size: int):
316
+ return token.view(1, 1, -1).expand(batch_size, -1, -1)
317
+
318
+
319
+ class Transformer(nn.Module):
320
+ def __init__(
321
+ self,
322
+ width: int,
323
+ layers: int,
324
+ heads: int,
325
+ mlp_ratio: float = 4.0,
326
+ ls_init_value: float = None,
327
+ act_layer: Callable = nn.GELU,
328
+ norm_layer: Callable = LayerNorm,
329
+ batch_first: bool = True,
330
+ ):
331
+ super().__init__()
332
+ self.width = width
333
+ self.layers = layers
334
+ self.batch_first = batch_first
335
+ self.grad_checkpointing = False
336
+
337
+ self.resblocks = nn.ModuleList([
338
+ ResidualAttentionBlock(
339
+ width,
340
+ heads,
341
+ mlp_ratio,
342
+ ls_init_value=ls_init_value,
343
+ act_layer=act_layer,
344
+ norm_layer=norm_layer,
345
+ batch_first=batch_first,
346
+ )
347
+ for _ in range(layers)
348
+ ])
349
+
350
+ def get_cast_dtype(self) -> torch.dtype:
351
+ if hasattr(self.resblocks[0].mlp.c_fc, 'int8_original_dtype'):
352
+ return self.resblocks[0].mlp.c_fc.int8_original_dtype
353
+ return self.resblocks[0].mlp.c_fc.weight.dtype
354
+
355
+ def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
356
+ if not self.batch_first:
357
+ x = x.transpose(0, 1).contiguous() # NLD -> LND
358
+ for r in self.resblocks:
359
+ if self.grad_checkpointing and not torch.jit.is_scripting():
360
+ # TODO: handle kwargs https://github.com/pytorch/pytorch/issues/79887#issuecomment-1161758372
361
+ x = checkpoint(r, x, None, None, attn_mask)
362
+ else:
363
+ x = r(x, attn_mask=attn_mask)
364
+ if not self.batch_first:
365
+ x = x.transpose(0, 1) # LND -> NLD
366
+ return x
367
+
368
+
369
+ class CustomTransformer(nn.Module):
370
+ """ A custom transformer that can use different block types. """
371
+ def __init__(
372
+ self,
373
+ width: int,
374
+ layers: int,
375
+ heads: int,
376
+ mlp_ratio: float = 4.0,
377
+ ls_init_value: float = None,
378
+ act_layer: Callable = nn.GELU,
379
+ norm_layer: Callable = LayerNorm,
380
+ batch_first: bool = True,
381
+ block_types: Union[str, List[str]] = 'CustomResidualAttentionBlock',
382
+ ):
383
+ super().__init__()
384
+ self.width = width
385
+ self.layers = layers
386
+ self.batch_first = batch_first # run transformer stack in batch-first (N, L, D) order
387
+ self.grad_checkpointing = False
388
+
389
+ if isinstance(block_types, str):
390
+ block_types = [block_types] * layers
391
+ assert len(block_types) == layers
392
+
393
+ def _create_block(bt: str):
394
+ if bt == 'CustomResidualAttentionBlock':
395
+ return CustomResidualAttentionBlock(
396
+ width,
397
+ heads,
398
+ mlp_ratio=mlp_ratio,
399
+ ls_init_value=ls_init_value,
400
+ act_layer=act_layer,
401
+ norm_layer=norm_layer,
402
+ batch_first=batch_first,
403
+ )
404
+ else:
405
+ assert False
406
+
407
+ self.resblocks = nn.ModuleList([
408
+ _create_block(bt)
409
+ for bt in block_types
410
+ ])
411
+
412
+ def get_cast_dtype(self) -> torch.dtype:
413
+ weight = self.resblocks[0].get_reference_weight()
414
+ if hasattr(weight, 'int8_original_dtype'):
415
+ return weight.int8_original_dtype
416
+ return weight.dtype
417
+
418
+ def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
419
+ if not self.batch_first:
420
+ x = x.transpose(0, 1) # NLD -> LND
421
+
422
+ for r in self.resblocks:
423
+ if self.grad_checkpointing and not torch.jit.is_scripting():
424
+ # TODO: handle kwargs https://github.com/pytorch/pytorch/issues/79887#issuecomment-1161758372
425
+ x = checkpoint(r, x, None, None, attn_mask)
426
+ else:
427
+ x = r(x, attn_mask=attn_mask)
428
+
429
+ if not self.batch_first:
430
+ x = x.transpose(0, 1) # LND -> NLD
431
+ return x
432
+
433
+
434
+ class VisionTransformer(nn.Module):
435
+ output_tokens: torch.jit.Final[bool]
436
+
437
+ def __init__(
438
+ self,
439
+ image_size: int,
440
+ patch_size: int,
441
+ width: int,
442
+ layers: int,
443
+ heads: int,
444
+ mlp_ratio: float,
445
+ ls_init_value: float = None,
446
+ attentional_pool: bool = False,
447
+ attn_pooler_queries: int = 256,
448
+ attn_pooler_heads: int = 8,
449
+ output_dim: int = 512,
450
+ patch_dropout: float = 0.,
451
+ no_ln_pre: bool = False,
452
+ pos_embed_type: str = 'learnable',
453
+ pool_type: str = 'tok',
454
+ final_ln_after_pool: bool = False,
455
+ act_layer: Callable = nn.GELU,
456
+ norm_layer: Callable = LayerNorm,
457
+ output_tokens: bool = False,
458
+ ):
459
+ super().__init__()
460
+ assert pool_type in ('tok', 'avg', 'none')
461
+ self.output_tokens = output_tokens
462
+ image_height, image_width = self.image_size = to_2tuple(image_size)
463
+ patch_height, patch_width = self.patch_size = to_2tuple(patch_size)
464
+ self.grid_size = (image_height // patch_height, image_width // patch_width)
465
+ self.final_ln_after_pool = final_ln_after_pool # currently ignored w/ attn pool enabled
466
+ self.output_dim = output_dim
467
+
468
+ self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
469
+
470
+ # class embeddings and positional embeddings
471
+ scale = width ** -0.5
472
+ self.class_embedding = nn.Parameter(scale * torch.randn(width))
473
+ if pos_embed_type == 'learnable':
474
+ self.positional_embedding = nn.Parameter(
475
+ scale * torch.randn(self.grid_size[0] * self.grid_size[1] + 1, width))
476
+ elif pos_embed_type == 'sin_cos_2d':
477
+ # fixed sin-cos embedding
478
+ assert self.grid_size[0] == self.grid_size[1],\
479
+ 'currently sin cos 2d pos embedding only supports square input'
480
+ self.positional_embedding = nn.Parameter(
481
+ torch.zeros(self.grid_size[0] * self.grid_size[1] + 1, width), requires_grad=False)
482
+ pos_embed_type = get_2d_sincos_pos_embed(width, self.grid_size[0], cls_token=True)
483
+ self.positional_embedding.data.copy_(torch.from_numpy(pos_embed_type).float())
484
+ else:
485
+ raise ValueError
486
+
487
+ # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn
488
+ self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0. else nn.Identity()
489
+
490
+ self.ln_pre = nn.Identity() if no_ln_pre else norm_layer(width)
491
+ self.transformer = Transformer(
492
+ width,
493
+ layers,
494
+ heads,
495
+ mlp_ratio,
496
+ ls_init_value=ls_init_value,
497
+ act_layer=act_layer,
498
+ norm_layer=norm_layer,
499
+ )
500
+
501
+ if attentional_pool:
502
+ if isinstance(attentional_pool, str):
503
+ self.attn_pool_type = attentional_pool
504
+ self.pool_type = 'none'
505
+ if attentional_pool in ('parallel', 'cascade'):
506
+ self.attn_pool = AttentionalPooler(
507
+ output_dim,
508
+ width,
509
+ n_head=attn_pooler_heads,
510
+ n_queries=attn_pooler_queries,
511
+ )
512
+ self.attn_pool_contrastive = AttentionalPooler(
513
+ output_dim,
514
+ width,
515
+ n_head=attn_pooler_heads,
516
+ n_queries=1,
517
+ )
518
+ else:
519
+ assert False
520
+ else:
521
+ self.attn_pool_type = ''
522
+ self.pool_type = pool_type
523
+ self.attn_pool = AttentionalPooler(
524
+ output_dim,
525
+ width,
526
+ n_head=attn_pooler_heads,
527
+ n_queries=attn_pooler_queries,
528
+ )
529
+ self.attn_pool_contrastive = None
530
+ pool_dim = output_dim
531
+ else:
532
+ self.attn_pool = None
533
+ pool_dim = width
534
+ self.pool_type = pool_type
535
+
536
+ self.ln_post = norm_layer(pool_dim)
537
+ self.proj = nn.Parameter(scale * torch.randn(pool_dim, output_dim))
538
+
539
+ self.init_parameters()
540
+
541
+ def lock(self, unlocked_groups=0, freeze_bn_stats=False):
542
+ for param in self.parameters():
543
+ param.requires_grad = False
544
+
545
+ if unlocked_groups != 0:
546
+ groups = [
547
+ [
548
+ self.conv1,
549
+ self.class_embedding,
550
+ self.positional_embedding,
551
+ self.ln_pre,
552
+ ],
553
+ *self.transformer.resblocks[:-1],
554
+ [
555
+ self.transformer.resblocks[-1],
556
+ self.ln_post,
557
+ ],
558
+ self.proj,
559
+ ]
560
+
561
+ def _unlock(x):
562
+ if isinstance(x, Sequence):
563
+ for g in x:
564
+ _unlock(g)
565
+ else:
566
+ if isinstance(x, torch.nn.Parameter):
567
+ x.requires_grad = True
568
+ else:
569
+ for p in x.parameters():
570
+ p.requires_grad = True
571
+
572
+ _unlock(groups[-unlocked_groups:])
573
+
574
+ def init_parameters(self):
575
+ # FIXME OpenAI CLIP did not define an init for the VisualTransformer
576
+ # TODO experiment if default PyTorch init, below, or alternate init is best.
577
+
578
+ # nn.init.normal_(self.class_embedding, std=self.scale)
579
+ # nn.init.normal_(self.positional_embedding, std=self.scale)
580
+ #
581
+ # proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
582
+ # attn_std = self.transformer.width ** -0.5
583
+ # fc_std = (2 * self.transformer.width) ** -0.5
584
+ # for block in self.transformer.resblocks:
585
+ # nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
586
+ # nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
587
+ # nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
588
+ # nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
589
+ #
590
+ # if self.text_projection is not None:
591
+ # nn.init.normal_(self.text_projection, std=self.scale)
592
+ pass
593
+
594
+ @torch.jit.ignore
595
+ def set_grad_checkpointing(self, enable=True):
596
+ self.transformer.grad_checkpointing = enable
597
+
598
+ def _global_pool(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
599
+ if self.pool_type == 'avg':
600
+ pooled, tokens = x[:, 1:].mean(dim=1), x[:, 1:]
601
+ elif self.pool_type == 'tok':
602
+ pooled, tokens = x[:, 0], x[:, 1:]
603
+ else:
604
+ pooled = tokens = x
605
+
606
+ return pooled, tokens
607
+
608
+ def forward(self, x: torch.Tensor):
609
+ x = self.conv1(x) # shape = [*, width, grid, grid]
610
+ x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
611
+ x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
612
+
613
+ # class embeddings and positional embeddings
614
+ x = torch.cat([_expand_token(self.class_embedding, x.shape[0]).to(x.dtype), x], dim=1)
615
+ # shape = [*, grid ** 2 + 1, width]
616
+ x = x + self.positional_embedding.to(x.dtype)
617
+
618
+ x = self.patch_dropout(x)
619
+ x = self.ln_pre(x)
620
+ x = self.transformer(x)
621
+
622
+ if self.attn_pool is not None:
623
+ if self.attn_pool_contrastive is not None:
624
+ # This is untested, WIP pooling that should match paper
625
+ x = self.ln_post(x) # TBD LN first or separate one after each pool?
626
+ tokens = self.attn_pool(x)
627
+ if self.attn_pool_type == 'parallel':
628
+ pooled = self.attn_pool_contrastive(x)
629
+ else:
630
+ assert self.attn_pool_type == 'cascade'
631
+ pooled = self.attn_pool_contrastive(tokens)
632
+ else:
633
+ # this is the original OpenCLIP CoCa setup, does not match paper
634
+ x = self.attn_pool(x)
635
+ x = self.ln_post(x)
636
+ pooled, tokens = self._global_pool(x)
637
+ elif self.final_ln_after_pool:
638
+ pooled, tokens = self._global_pool(x)
639
+ pooled = self.ln_post(pooled)
640
+ else:
641
+ x = self.ln_post(x)
642
+ pooled, tokens = self._global_pool(x)
643
+
644
+ if self.proj is not None:
645
+ pooled = pooled @ self.proj
646
+
647
+ if self.output_tokens:
648
+ return pooled, tokens
649
+
650
+ return pooled
651
+
652
+
653
+ def text_global_pool(x, text: Optional[torch.Tensor] = None, pool_type: str = 'argmax'):
654
+ if pool_type == 'first':
655
+ pooled, tokens = x[:, 0], x[:, 1:]
656
+ elif pool_type == 'last':
657
+ pooled, tokens = x[:, -1], x[:, :-1]
658
+ elif pool_type == 'argmax':
659
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
660
+ assert text is not None
661
+ pooled, tokens = x[torch.arange(x.shape[0]), text.argmax(dim=-1)], x
662
+ else:
663
+ pooled = tokens = x
664
+
665
+ return pooled, tokens
666
+
667
+
668
+ class TextTransformer(nn.Module):
669
+ output_tokens: torch.jit.Final[bool]
670
+
671
+ def __init__(
672
+ self,
673
+ context_length: int = 77,
674
+ vocab_size: int = 49408,
675
+ width: int = 512,
676
+ heads: int = 8,
677
+ layers: int = 12,
678
+ mlp_ratio: float = 4.0,
679
+ ls_init_value: float = None,
680
+ output_dim: int = 512,
681
+ embed_cls: bool = False,
682
+ no_causal_mask: bool = False,
683
+ pad_id: int = 0,
684
+ pool_type: str = 'argmax',
685
+ proj_bias: bool = False,
686
+ act_layer: Callable = nn.GELU,
687
+ norm_layer: Callable = LayerNorm,
688
+ output_tokens: bool = False,
689
+ ):
690
+ super().__init__()
691
+ assert pool_type in ('first', 'last', 'argmax', 'none')
692
+ self.output_tokens = output_tokens
693
+ self.num_pos = self.context_length = context_length
694
+ self.vocab_size = vocab_size
695
+ self.width = width
696
+ self.output_dim = output_dim
697
+ self.heads = heads
698
+ self.pad_id = pad_id
699
+ self.pool_type = pool_type
700
+
701
+ self.token_embedding = nn.Embedding(vocab_size, width)
702
+ if embed_cls:
703
+ self.cls_emb = nn.Parameter(torch.empty(width))
704
+ self.num_pos += 1
705
+ else:
706
+ self.cls_emb = None
707
+ self.positional_embedding = nn.Parameter(torch.empty(self.num_pos, width))
708
+ self.transformer = Transformer(
709
+ width=width,
710
+ layers=layers,
711
+ heads=heads,
712
+ mlp_ratio=mlp_ratio,
713
+ ls_init_value=ls_init_value,
714
+ act_layer=act_layer,
715
+ norm_layer=norm_layer,
716
+ )
717
+ self.ln_final = norm_layer(width)
718
+
719
+ if no_causal_mask:
720
+ self.attn_mask = None
721
+ else:
722
+ self.register_buffer('attn_mask', self.build_causal_mask(), persistent=False)
723
+
724
+ if proj_bias:
725
+ self.text_projection = nn.Linear(width, output_dim)
726
+ else:
727
+ self.text_projection = nn.Parameter(torch.empty(width, output_dim))
728
+
729
+ self.init_parameters()
730
+
731
+ def init_parameters(self):
732
+ nn.init.normal_(self.token_embedding.weight, std=0.02)
733
+ nn.init.normal_(self.positional_embedding, std=0.01)
734
+ if self.cls_emb is not None:
735
+ nn.init.normal_(self.cls_emb, std=0.01)
736
+
737
+ proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
738
+ attn_std = self.transformer.width ** -0.5
739
+ fc_std = (2 * self.transformer.width) ** -0.5
740
+ for block in self.transformer.resblocks:
741
+ nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
742
+ nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
743
+ nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
744
+ nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
745
+
746
+ if self.text_projection is not None:
747
+ if isinstance(self.text_projection, nn.Linear):
748
+ nn.init.normal_(self.text_projection.weight, std=self.transformer.width ** -0.5)
749
+ if self.text_projection.bias is not None:
750
+ nn.init.zeros_(self.text_projection.bias)
751
+ else:
752
+ nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
753
+
754
+ @torch.jit.ignore
755
+ def set_grad_checkpointing(self, enable=True):
756
+ self.transformer.grad_checkpointing = enable
757
+
758
+ def build_causal_mask(self):
759
+ # lazily create causal attention mask, with full attention between the tokens
760
+ # pytorch uses additive attention mask; fill with -inf
761
+ mask = torch.empty(self.num_pos, self.num_pos)
762
+ mask.fill_(float("-inf"))
763
+ mask.triu_(1) # zero out the lower diagonal
764
+ return mask
765
+
766
+ def build_cls_mask(self, text, cast_dtype: torch.dtype):
767
+ cls_mask = (text != self.pad_id).unsqueeze(1)
768
+ cls_mask = F.pad(cls_mask, (1, 0, cls_mask.shape[2], 0), value=True)
769
+ additive_mask = torch.empty(cls_mask.shape, dtype=cast_dtype, device=cls_mask.device)
770
+ additive_mask.fill_(0)
771
+ additive_mask.masked_fill_(~cls_mask, float("-inf"))
772
+ additive_mask = torch.repeat_interleave(additive_mask, self.heads, 0)
773
+ return additive_mask
774
+
775
+ def forward(self, text):
776
+ cast_dtype = self.transformer.get_cast_dtype()
777
+ seq_len = text.shape[1]
778
+
779
+ x = self.token_embedding(text).to(cast_dtype) # [batch_size, n_ctx, d_model]
780
+ attn_mask = self.attn_mask
781
+ if self.cls_emb is not None:
782
+ seq_len += 1
783
+ x = torch.cat([x, _expand_token(self.cls_emb, x.shape[0])], dim=1)
784
+ cls_mask = self.build_cls_mask(text, cast_dtype)
785
+ if attn_mask is not None:
786
+ attn_mask = attn_mask[None, :seq_len, :seq_len] + cls_mask[:, :seq_len, :seq_len]
787
+
788
+ x = x + self.positional_embedding[:seq_len].to(cast_dtype)
789
+ x = self.transformer(x, attn_mask=attn_mask)
790
+
791
+ # x.shape = [batch_size, n_ctx, transformer.width]
792
+ if self.cls_emb is not None:
793
+ # presence of appended cls embed (CoCa) overrides pool_type, always take last token
794
+ pooled, tokens = text_global_pool(x, pool_type='last')
795
+ pooled = self.ln_final(pooled) # final LN applied after pooling in this case
796
+ else:
797
+ x = self.ln_final(x)
798
+ pooled, tokens = text_global_pool(x, text, pool_type=self.pool_type)
799
+
800
+ if self.text_projection is not None:
801
+ if isinstance(self.text_projection, nn.Linear):
802
+ pooled = self.text_projection(pooled)
803
+ else:
804
+ pooled = pooled @ self.text_projection
805
+
806
+ if self.output_tokens:
807
+ return pooled, tokens
808
+
809
+ return pooled
810
+
811
+
812
+ class MultimodalTransformer(Transformer):
813
+ def __init__(
814
+ self,
815
+ width: int,
816
+ layers: int,
817
+ heads: int,
818
+ context_length: int = 77,
819
+ mlp_ratio: float = 4.0,
820
+ ls_init_value: float = None,
821
+ act_layer: Callable = nn.GELU,
822
+ norm_layer: Callable = LayerNorm,
823
+ output_dim: int = 512,
824
+ batch_first: bool = True,
825
+ ):
826
+ super().__init__(
827
+ width=width,
828
+ layers=layers,
829
+ heads=heads,
830
+ mlp_ratio=mlp_ratio,
831
+ ls_init_value=ls_init_value,
832
+ act_layer=act_layer,
833
+ norm_layer=norm_layer,
834
+ batch_first=batch_first,
835
+ )
836
+ self.context_length = context_length
837
+ self.cross_attn = nn.ModuleList([
838
+ ResidualAttentionBlock(
839
+ width,
840
+ heads,
841
+ mlp_ratio,
842
+ ls_init_value=ls_init_value,
843
+ act_layer=act_layer,
844
+ norm_layer=norm_layer,
845
+ is_cross_attention=True,
846
+ batch_first=batch_first,
847
+ )
848
+ for _ in range(layers)
849
+ ])
850
+
851
+ self.register_buffer('attn_mask', self.build_attention_mask(), persistent=False)
852
+
853
+ self.ln_final = norm_layer(width)
854
+ self.text_projection = nn.Parameter(torch.empty(width, output_dim))
855
+
856
+ def init_parameters(self):
857
+ proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
858
+ attn_std = self.transformer.width ** -0.5
859
+ fc_std = (2 * self.transformer.width) ** -0.5
860
+ for block in self.transformer.resblocks:
861
+ nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
862
+ nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
863
+ nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
864
+ nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
865
+ for block in self.transformer.cross_attn:
866
+ nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
867
+ nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
868
+ nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
869
+ nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
870
+
871
+ if self.text_projection is not None:
872
+ nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
873
+
874
+ def build_attention_mask(self):
875
+ # lazily create causal attention mask, with full attention between the tokens
876
+ # pytorch uses additive attention mask; fill with -inf
877
+ mask = torch.empty(self.context_length, self.context_length)
878
+ mask.fill_(float("-inf"))
879
+ mask.triu_(1) # zero out the lower diagonal
880
+ return mask
881
+
882
+ def forward(self, image_embs, text_embs):
883
+ seq_len = text_embs.shape[1]
884
+ if not self.batch_first:
885
+ image_embs = image_embs.permute(1, 0, 2) # NLD -> LND
886
+ text_embs = text_embs.permute(1, 0, 2) # NLD -> LND
887
+
888
+ for resblock, cross_attn in zip(self.resblocks, self.cross_attn):
889
+ if self.grad_checkpointing and not torch.jit.is_scripting():
890
+ # TODO: handle kwargs https://github.com/pytorch/pytorch/issues/79887#issuecomment-1161758372
891
+ text_embs = checkpoint(resblock, text_embs, None, None, self.attn_mask[:seq_len, :seq_len])
892
+ text_embs = checkpoint(cross_attn, text_embs, image_embs, image_embs, None)
893
+ else:
894
+ text_embs = resblock(text_embs, attn_mask=self.attn_mask[:seq_len, :seq_len])
895
+ text_embs = cross_attn(text_embs, k_x=image_embs, v_x=image_embs)
896
+
897
+ if not self.batch_first:
898
+ text_embs = text_embs.permute(1, 0, 2) # LND -> NLD
899
+
900
+ out = self.ln_final(text_embs)
901
+ if self.text_projection is not None:
902
+ out = out @ self.text_projection
903
+
904
+ return out
905
+
906
+ @torch.jit.ignore
907
+ def set_grad_checkpointing(self, enable=True):
908
+ self.grad_checkpointing = enable
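
For reference, a minimal sketch exercising the towers defined above with dummy inputs (not part of the uploaded file; the import path and hyperparameters are illustrative assumptions, not this checkpoint's configuration):

    import torch
    from open_clip.transformer import TextTransformer, VisionTransformer  # module path assumed

    text_tower = TextTransformer(context_length=77, vocab_size=49408,
                                 width=512, heads=8, layers=12, output_dim=512)
    tokens = torch.randint(0, 49408, (2, 77))   # dummy token ids, batch of 2
    with torch.no_grad():
        text_features = text_tower(tokens)      # shape (2, 512); pooled at the argmax (EOT) position

    vision_tower = VisionTransformer(image_size=224, patch_size=16, width=768, layers=12,
                                     heads=12, mlp_ratio=4.0, output_dim=512)
    images = torch.randn(2, 3, 224, 224)
    with torch.no_grad():
        image_features = vision_tower(images)   # shape (2, 512); CLS-token pooling then projection
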
src/open_clip/utils.py ADDED
@@ -0,0 +1,89 @@
1
+ from itertools import repeat
2
+ import collections.abc
3
+
4
+ import torch
5
+ from torch import nn as nn
6
+ from torchvision.ops.misc import FrozenBatchNorm2d
7
+
8
+
9
+ def freeze_batch_norm_2d(module, module_match={}, name=''):
10
+ """
11
+ Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is
12
+ itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and
13
+ returned. Otherwise, the module is walked recursively and submodules are converted in place.
14
+
15
+ Args:
16
+ module (torch.nn.Module): Any PyTorch module.
17
+ module_match (dict): Dictionary of full module names to freeze (all if empty)
18
+ name (str): Full module name (prefix)
19
+
20
+ Returns:
21
+ torch.nn.Module: Resulting module
22
+
23
+ Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762
24
+ """
25
+ res = module
26
+ is_match = True
27
+ if module_match:
28
+ is_match = name in module_match
29
+ if is_match and isinstance(module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)):
30
+ res = FrozenBatchNorm2d(module.num_features)
31
+ res.num_features = module.num_features
32
+ res.affine = module.affine
33
+ if module.affine:
34
+ res.weight.data = module.weight.data.clone().detach()
35
+ res.bias.data = module.bias.data.clone().detach()
36
+ res.running_mean.data = module.running_mean.data
37
+ res.running_var.data = module.running_var.data
38
+ res.eps = module.eps
39
+ else:
40
+ for child_name, child in module.named_children():
41
+ full_child_name = '.'.join([name, child_name]) if name else child_name
42
+ new_child = freeze_batch_norm_2d(child, module_match, full_child_name)
43
+ if new_child is not child:
44
+ res.add_module(child_name, new_child)
45
+ return res
46
+
47
+
48
+ # From PyTorch internals
49
+ def _ntuple(n):
50
+ def parse(x):
51
+ if isinstance(x, collections.abc.Iterable):
52
+ return x
53
+ return tuple(repeat(x, n))
54
+ return parse
55
+
56
+
57
+ to_1tuple = _ntuple(1)
58
+ to_2tuple = _ntuple(2)
59
+ to_3tuple = _ntuple(3)
60
+ to_4tuple = _ntuple(4)
61
+ to_ntuple = lambda n, x: _ntuple(n)(x)
62
+
63
+ # Replaces all linear layers with linear_replacement
64
+ # TODO: add int8 support for other linear layers including attn and convnets
65
+ def replace_linear(model, linear_replacement, include_modules=['c_fc', 'c_proj'], copy_weights=True):
66
+ for name, module in model.named_children():
67
+ if len(list(module.children())) > 0:
68
+ replace_linear(module, linear_replacement, include_modules, copy_weights)
69
+
70
+ if isinstance(module, torch.nn.Linear) and name in include_modules:
71
+ old_module = model._modules[name]
72
+ model._modules[name] = linear_replacement(
73
+ module.in_features,
74
+ module.out_features,
75
+ module.bias is not None,
76
+ )
77
+ if copy_weights:
78
+ model._modules[name].weight.data.copy_(old_module.weight.data)
79
+ if model._modules[name].bias is not None:
80
+ model._modules[name].bias.data.copy_(old_module.bias)
81
+
82
+ return model
83
+
84
+ def convert_int8_model_to_inference_mode(model):
85
+ for m in model.modules():
86
+ if hasattr(m, 'prepare_for_eval'):
87
+ int8_original_dtype = m.weight.dtype
88
+ m.prepare_for_eval()
89
+ m.int8_original_dtype = int8_original_dtype
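
For reference, a short sketch of the helpers above in use (not part of the uploaded file; torchvision's resnet18 is only a stand-in for any model containing BatchNorm2d layers):

    from torchvision.models import resnet18
    from open_clip.utils import to_2tuple, freeze_batch_norm_2d  # module path assumed

    to_2tuple(224)          # -> (224, 224)
    to_2tuple((224, 320))   # iterables pass through unchanged

    model = freeze_batch_norm_2d(resnet18())  # every BatchNorm2d becomes a FrozenBatchNorm2d
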
src/open_clip/version.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = '2.26.1'