README
- README.md +17 -6
- config.json +61 -61

README.md
CHANGED
@@ -13,19 +13,30 @@ pipeline_tag: zero-shot-image-classification
 
 [mmE5: Improving Multimodal Multilingual Embeddings via High-quality Synthetic Data](https://arxiv.org/abs/2502.08468). Haonan Chen, Liang Wang, Nan Yang, Yutao Zhu, Ziliang Zhao, Furu Wei, Zhicheng Dou, arXiv 2025
 
-This model is trained based on [Qwen2.5-
+This model is trained based on [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct).
 
 [Github](https://github.com/haon-chen/mmcpt)
 
 ## Train/Eval Data
 - Continual Pre-training data:
-
-
+  - (1) Text-only corpus: [DCLM](https://huggingface.co/datasets/TIGER-Lab/DCLM)
+  - (2) Common multimodal corpus: PixelProse ([CommonPool](https://huggingface.co/datasets/TIGER-Lab/CommonPool), [CC12M](https://huggingface.co/datasets/TIGER-Lab/CC12M), and [RedCaps](https://huggingface.co/datasets/TIGER-Lab/RedCaps)), [MAmmoTH-VL-Instruct](https://huggingface.co/datasets/TIGER-Lab/MAmmoTH-VL-Instruct), and the training set of [MMEB](https://huggingface.co/datasets/TIGER-Lab/MMEB-eval)
+  - (3) Long-form document-level image understanding corpus: [DocMatix](https://huggingface.co/datasets/TIGER-Lab/DocMatix), VisRAG ([Synthetic](https://huggingface.co/datasets/TIGER-Lab/VisRAG-Synthetic), [In-domain](https://huggingface.co/datasets/TIGER-Lab/VisRAG-Real)), and the training set of [ColPali](https://huggingface.co/datasets/TIGER-Lab/ColPali)
+- Contrastive Learning data:
+  - (1) Long-form document-level multimodal pairs: VisRAG and the training set of [ViDoRe-v2](https://huggingface.co/collections/Haon-Chen/vidore-v2-full-683e7a451417d107337b45d2)
+  - (2) Common multimodal pairs: the training sets of [MMEB](https://huggingface.co/datasets/intfloat/mmE5-MMEB-hardneg) and [mmE5](https://huggingface.co/datasets/intfloat/mmE5-synthetic)
+  - (3) Text-only pairs: large-scale dense retrieval data from E5
+- Eval data:
+  - [MMEB](https://huggingface.co/datasets/TIGER-Lab/MMEB-eval)
+  - [ViDoRe-v2](https://huggingface.co/collections/Haon-Chen/vidore-v2-full-683e7a451417d107337b45d2)
 
 ## Experimental Results
-
+Performance on the MMEB and ViDoRe-v2 benchmarks:
 <img width="900" alt="abs" src="https://raw.githubusercontent.com/haon-chen/mmE5/refs/heads/main/figures/exp_result.jpg">
 
+<img width="900" alt="abs" src="https://raw.githubusercontent.com/haon-chen/mmE5/refs/heads/main/figures/exp_result_vidore.jpg">
+
 ## Usage
 
 ### Transformers

@@ -49,8 +60,8 @@ import torch.nn.functional as F
 
 def compute_similarity(q_reps, p_reps):
     return torch.matmul(q_reps, p_reps.transpose(0, 1))
 
-model_name = "
-processor_name = "Qwen/Qwen2.5-VL-
+model_name = "mmembed/mmE5-qwen25-3B"
+processor_name = "Qwen/Qwen2.5-VL-3B-Instruct"
 
 # Load Processor and Model
 processor = AutoProcessor.from_pretrained(processor_name)
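The diff shows only fragments of the Transformers usage snippet, so here is a minimal end-to-end sketch of how those pieces plausibly fit together. Several parts are assumptions rather than the repo's authoritative code: the `last_pooling` helper mirrors the last-token pooling used by earlier mmE5 releases, `trust_remote_code=True` is inferred from the custom `Qwen2_5ForEmbedding` class in config.json, the `<|vision_start|><|image_pad|><|vision_end|>` placeholder is the standard Qwen2.5-VL image-prompt format, and `example.jpg` is a stand-in path. The full snippet lives in the repo's README.

```python
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import AutoModel, AutoProcessor

def compute_similarity(q_reps, p_reps):
    return torch.matmul(q_reps, p_reps.transpose(0, 1))

def last_pooling(last_hidden_state, attention_mask, normalize=True):
    # Assumption: last-token pooling, as in earlier mmE5 releases.
    # Pick each sequence's final non-padding token (right padding assumed),
    # then L2-normalize so the matmul above behaves like cosine similarity.
    seq_last = attention_mask.sum(dim=1) - 1
    reps = last_hidden_state[torch.arange(last_hidden_state.size(0)), seq_last]
    return F.normalize(reps, p=2, dim=-1) if normalize else reps

model_name = "mmembed/mmE5-qwen25-3B"
processor_name = "Qwen/Qwen2.5-VL-3B-Instruct"

# Load Processor and Model (trust_remote_code is an assumption, since
# config.json declares a custom Qwen2_5ForEmbedding architecture)
processor = AutoProcessor.from_pretrained(processor_name)
model = AutoModel.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, trust_remote_code=True
).eval()

# Image+text query, using Qwen2.5-VL's standard image placeholder tokens
query = "<|vision_start|><|image_pad|><|vision_end|>Represent the given image."
q_in = processor(text=[query], images=[Image.open("example.jpg")],
                 return_tensors="pt", padding=True)
with torch.no_grad():
    q_hidden = model(**q_in, output_hidden_states=True).hidden_states[-1]
q_reps = last_pooling(q_hidden, q_in["attention_mask"])

# Text-only candidates, scored against the query
docs = ["A cat sleeping on a sofa.", "A city skyline at night."]
d_in = processor(text=docs, return_tensors="pt", padding=True)
with torch.no_grad():
    d_hidden = model(**d_in, output_hidden_states=True).hidden_states[-1]
p_reps = last_pooling(d_hidden, d_in["attention_mask"])

print(compute_similarity(q_reps, p_reps))  # one row of cosine scores per query
```

Because both sides are unit-normalized, `compute_similarity` returns cosine similarities, so the highest-scoring candidate in each row is the best match for that query.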
config.json
CHANGED
@@ -1,66 +1,66 @@
Updated config.json:

{
  "architectures": [
    "Qwen2_5ForEmbedding"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "fused_linear_ce": true,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "image_token_id": 151655,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 128000,
  "max_window_layers": 70,
  "model_type": "qwen2_5_vl",
  "num_attention_heads": 16,
  "num_hidden_layers": 36,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": {
    "mrope_section": [
      16,
      24,
      24
    ],
    "rope_type": "default",
    "type": "default"
  },
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.50.3",
  "use_cache": true,
  "use_sliding_window": false,
  "video_token_id": 151656,
  "vision_config": {
    "depth": 32,
    "fullatt_block_indexes": [
      7,
      15,
      23,
      31
    ],
    "hidden_act": "silu",
    "hidden_size": 1280,
    "in_channels": 3,
    "in_chans": 3,
    "intermediate_size": 3420,
    "model_type": "qwen2_5_vl",
    "num_heads": 16,
    "out_hidden_size": 2048,
    "patch_size": 14,
    "spatial_merge_size": 2,
    "spatial_patch_size": 14,
    "temporal_patch_size": 2,
    "tokens_per_second": 2,
    "torch_dtype": "bfloat16",
    "window_size": 112
  },
  "vision_end_token_id": 151653,
  "vision_start_token_id": 151652,
  "vision_token_id": 151654,
  "vocab_size": 151936
}
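The `vision_config` determines how many embedding-side tokens an image costs: the encoder cuts each frame into `patch_size` x `patch_size` (14 px) patches, and `spatial_merge_size` merges every 2x2 block of patches into one token projected to `out_hidden_size` (2048), which matches the language model's `hidden_size`. The sketch below is an illustrative calculation of that arithmetic, standard for Qwen2.5-VL-style vision towers; the helper is hypothetical, not code from this repo.

```python
# Token cost of a still image under this vision_config (illustrative only):
# 14x14-pixel patches, then 2x2 patch blocks merged into one LM-side token,
# i.e. roughly one embedding token per 28x28 pixels.
def image_token_count(height_px: int, width_px: int,
                      patch_size: int = 14, spatial_merge_size: int = 2) -> int:
    grid_h = height_px // patch_size            # patches along height
    grid_w = width_px // patch_size             # patches along width
    return (grid_h * grid_w) // spatial_merge_size ** 2

# A 448x448 image: 32x32 = 1024 patches -> 1024 / 4 = 256 embedding tokens.
print(image_token_count(448, 448))  # 256
```

For video inputs, `temporal_patch_size: 2` additionally groups frames in pairs before patching, halving the per-frame token count relative to the still-image case.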