Haon-Chen committed on
Commit 7ab971b
1 Parent(s): 2da4c87
Files changed (2):
  1. README.md +17 -6
  2. config.json +61 -61
README.md CHANGED
@@ -13,19 +13,30 @@ pipeline_tag: zero-shot-image-classification
 
  [mmE5: Improving Multimodal Multilingual Embeddings via High-quality Synthetic Data](https://arxiv.org/abs/2502.08468.pdf). Haonan Chen, Liang Wang, Nan Yang, Yutao Zhu, Ziliang Zhao, Furu Wei, Zhicheng Dou, arXiv 2025
 
- This model is trained based on [Qwen2.5-7B-VL-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-VL-Instruct).
+ This model is trained based on [Qwen2.5-3B-VL-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-VL-Instruct).
 
  [Github](https://github.com/haon-chen/mmcpt)
 
  ## Train/Eval Data
  - Continual Pre-training data:
- - Contrastive Learning data: https://huggingface.co/datasets/intfloat/mmE5-MMEB-hardneg, https://huggingface.co/datasets/intfloat/mmE5-synthetic
- - Eval data: https://huggingface.co/datasets/TIGER-Lab/MMEB-eval, https://huggingface.co/collections/Haon-Chen/vidore-v2-full-683e7a451417d107337b45d2
+   - (1) text-only corpus: [DCLM](https://huggingface.co/datasets/TIGER-Lab/DCLM)
+   - (2) common multimodal corpus: PixelProse ([CommonPool](https://huggingface.co/datasets/TIGER-Lab/CommonPool), [CC12M](https://huggingface.co/datasets/TIGER-Lab/CC12M), and [RedCaps](https://huggingface.co/datasets/TIGER-Lab/RedCaps)), [MAmmoTH-VL-Instruct](https://huggingface.co/datasets/TIGER-Lab/MAmmoTH-VL-Instruct), and the training set of [MMEB](https://huggingface.co/datasets/TIGER-Lab/MMEB-eval)
+   - (3) long-form document-level image understanding corpus: [DocMatix](https://huggingface.co/datasets/TIGER-Lab/DocMatix), VisRAG ([Synthetic](https://huggingface.co/datasets/TIGER-Lab/VisRAG-Synthetic), [In-domain](https://huggingface.co/datasets/TIGER-Lab/VisRAG-Real)), and the training set of [ColPali](https://huggingface.co/datasets/TIGER-Lab/ColPali)
+ - Contrastive Learning data:
+   - (1) Long-form document-level multimodal pairs: VisRAG and the training set of [ViDoRe-v2](https://huggingface.co/collections/Haon-Chen/vidore-v2-full-683e7a451417d107337b45d2).
+   - (2) Common multimodal pairs: The training sets of [MMEB](https://huggingface.co/datasets/intfloat/mmE5-MMEB-hardneg) and [mmE5](https://huggingface.co/datasets/intfloat/mmE5-synthetic).
+   - (3) Text-only pairs: Large-scale dense retrieval dataset from E5.
+ - Eval data:
+   - [MMEB](https://huggingface.co/datasets/TIGER-Lab/MMEB-eval)
+   - [ViDoRe-v2](https://huggingface.co/collections/Haon-Chen/vidore-v2-full-683e7a451417d107337b45d2)
 
  ## Experimental Results
- Our model achieves SOTA performance on the MMEB benchmark.
+ Performances on MMEB and ViDoRe-v2 benchmarks.
  <img width="900" alt="abs" src="https://raw.githubusercontent.com/haon-chen/mmE5/refs/heads/main/figures//exp_result.jpg">
 
+ <img width="900" alt="abs" src="https://raw.githubusercontent.com/haon-chen/mmE5/refs/heads/main/figures//exp_result_vidore.jpg">
+
+
  ## Usage
 
  ### Transformers
@@ -49,8 +60,8 @@ import torch.nn.functional as F
  def compute_similarity(q_reps, p_reps):
      return torch.matmul(q_reps, p_reps.transpose(0, 1))
 
- model_name = "Haon-Chen/mmE5-qwen25-7B"
- processor_name = "Qwen/Qwen2.5-VL-7B-Instruct"
+ model_name = "mmembed/mmE5-qwen25-3B"
+ processor_name = "Qwen/Qwen2.5-VL-3B-Instruct"
 
  # Load Processor and Model
  processor = AutoProcessor.from_pretrained(processor_name)
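The hunk above only swaps the checkpoint and processor names; the rest of the README's usage example is not part of this commit. Below is a minimal, hypothetical sketch of how those names typically feed into an embedding-and-scoring flow with `compute_similarity`. Loading via `AutoModel` with `trust_remote_code=True`, the vision placeholder string, and last-token pooling with L2 normalization are assumptions rather than details taken from this diff; the authoritative example lives in the model card itself.

```python
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import AutoProcessor, AutoModel

def compute_similarity(q_reps, p_reps):
    # Same scoring as in the README: dot products between (normalized) embeddings.
    return torch.matmul(q_reps, p_reps.transpose(0, 1))

model_name = "mmembed/mmE5-qwen25-3B"           # checkpoint name set by this commit
processor_name = "Qwen/Qwen2.5-VL-3B-Instruct"  # processor name set by this commit

# Load Processor and Model.
# "Qwen2_5ForEmbedding" is not a stock transformers class, so trust_remote_code=True
# is assumed here; use the repository's own loading code if it provides one.
processor = AutoProcessor.from_pretrained(processor_name)
model = AutoModel.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, trust_remote_code=True
).eval()

def embed(text, image=None):
    # Hypothetical prompt layout: the exact instruction/placeholder format is
    # defined in the model card, not in this diff.
    if image is not None:
        text = "<|vision_start|><|image_pad|><|vision_end|>" + text
    inputs = processor(
        text=[text],
        images=[image] if image is not None else None,
        return_tensors="pt",
    )
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    last_hidden = outputs.hidden_states[-1]
    # Assumed pooling: final token's hidden state, L2-normalized.
    return F.normalize(last_hidden[:, -1, :], dim=-1)

query_rep = embed("Find the page that explains the training data mixture.")
doc_rep = embed("A scanned page describing continual pre-training corpora.", Image.open("page.png"))
print(compute_similarity(query_rep, doc_rep))  # 1x1 similarity matrix
```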
config.json CHANGED
@@ -1,66 +1,66 @@
  {
- "architectures": [
- "Qwen2_5ForEmbedding"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 151643,
- "eos_token_id": 151645,
- "fused_linear_ce": true,
- "hidden_act": "silu",
- "hidden_size": 2048,
- "image_token_id": 151655,
- "initializer_range": 0.02,
- "intermediate_size": 11008,
- "max_position_embeddings": 128000,
- "max_window_layers": 70,
- "model_type": "qwen2_5_vl",
- "num_attention_heads": 16,
- "num_hidden_layers": 36,
- "num_key_value_heads": 2,
- "rms_norm_eps": 1e-06,
- "rope_scaling": {
- "mrope_section": [
- 16,
- 24,
- 24
- ],
- "rope_type": "default",
- "type": "default"
- },
- "rope_theta": 1000000.0,
- "sliding_window": 32768,
- "tie_word_embeddings": true,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.50.3",
- "use_cache": true,
- "use_sliding_window": false,
- "video_token_id": 151656,
- "vision_config": {
- "depth": 32,
- "fullatt_block_indexes": [
- 7,
- 15,
- 23,
- 31
+ "architectures": [
+ "Qwen2_5ForEmbedding"
  ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "fused_linear_ce": true,
  "hidden_act": "silu",
- "hidden_size": 1280,
- "in_channels": 3,
- "in_chans": 3,
- "intermediate_size": 3420,
+ "hidden_size": 2048,
+ "image_token_id": 151655,
+ "initializer_range": 0.02,
+ "intermediate_size": 11008,
+ "max_position_embeddings": 128000,
+ "max_window_layers": 70,
  "model_type": "qwen2_5_vl",
- "num_heads": 16,
- "out_hidden_size": 2048,
- "patch_size": 14,
- "spatial_merge_size": 2,
- "spatial_patch_size": 14,
- "temporal_patch_size": 2,
- "tokens_per_second": 2,
+ "num_attention_heads": 16,
+ "num_hidden_layers": 36,
+ "num_key_value_heads": 2,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": {
+ "mrope_section": [
+ 16,
+ 24,
+ 24
+ ],
+ "rope_type": "default",
+ "type": "default"
+ },
+ "rope_theta": 1000000.0,
+ "sliding_window": 32768,
+ "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
- "window_size": 112
- },
- "vision_end_token_id": 151653,
- "vision_start_token_id": 151652,
- "vision_token_id": 151654,
- "vocab_size": 151936
- }
+ "transformers_version": "4.50.3",
+ "use_cache": true,
+ "use_sliding_window": false,
+ "video_token_id": 151656,
+ "vision_config": {
+ "depth": 32,
+ "fullatt_block_indexes": [
+ 7,
+ 15,
+ 23,
+ 31
+ ],
+ "hidden_act": "silu",
+ "hidden_size": 1280,
+ "in_channels": 3,
+ "in_chans": 3,
+ "intermediate_size": 3420,
+ "model_type": "qwen2_5_vl",
+ "num_heads": 16,
+ "out_hidden_size": 2048,
+ "patch_size": 14,
+ "spatial_merge_size": 2,
+ "spatial_patch_size": 14,
+ "temporal_patch_size": 2,
+ "tokens_per_second": 2,
+ "torch_dtype": "bfloat16",
+ "window_size": 112
+ },
+ "vision_end_token_id": 151653,
+ "vision_start_token_id": 151652,
+ "vision_token_id": 151654,
+ "vocab_size": 151936
+ }
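The config.json rewrite above keeps every value unchanged. As a quick, hypothetical sanity check, the sketch below reads a local copy of the committed file (the path is an assumption) and verifies a few of the fields that appear in this diff.

```python
import json

# Load a local copy of the config.json committed above (path is an assumption).
with open("config.json") as f:
    cfg = json.load(f)

# Values below are taken directly from the diff.
assert cfg["architectures"] == ["Qwen2_5ForEmbedding"]
assert cfg["hidden_size"] == 2048 and cfg["num_hidden_layers"] == 36
# The vision tower projects into the language model's width (out_hidden_size == hidden_size).
assert cfg["vision_config"]["out_hidden_size"] == cfg["hidden_size"]
print(cfg["model_type"], cfg["torch_dtype"], cfg["rope_scaling"]["mrope_section"])
# -> qwen2_5_vl bfloat16 [16, 24, 24]
```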