---
tags:
- mmeb
- transformers
- sentence-transformers
language:
- en
- ar
- zh
- ko
- ru
- pl
- tr
- fr
library_name: transformers
license: mit
pipeline_tag: image-feature-extraction
---
## mmE5-mllama-11b-instruct
[mmE5: Improving Multimodal Multilingual Embeddings via High-quality Synthetic Data](https://arxiv.org/abs/2502.08468). Haonan Chen, Liang Wang, Nan Yang, Yutao Zhu, Ziliang Zhao, Furu Wei, Zhicheng Dou, arXiv 2025
This model is fine-tuned from [Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision).
[Github](https://github.com/haon-chen/mmE5)
## Train/Eval Data
- Train data: https://huggingface.co/datasets/intfloat/mmE5-MMEB-hardneg, https://huggingface.co/datasets/intfloat/mmE5-synthetic
- Eval data: https://huggingface.co/datasets/TIGER-Lab/MMEB-eval, https://huggingface.co/datasets/Haon-Chen/XTD-10
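For a quick look at the data, the sketch below (not part of the original card) uses the Hugging Face `datasets` library; several of these repositories expose multiple subsets, so list the available config names first and pick one from the dataset cards.
```python
from datasets import get_dataset_config_names

# List the available subsets of the training data before loading one of them.
print(get_dataset_config_names("intfloat/mmE5-MMEB-hardneg"))
print(get_dataset_config_names("intfloat/mmE5-synthetic"))

# Then load a specific subset, e.g.:
# from datasets import load_dataset
# subset = load_dataset("intfloat/mmE5-MMEB-hardneg", "<subset-name>", split="train")
```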
## Experimental Results
Our model achieves state-of-the-art performance on the MMEB benchmark.
<img width="900" alt="abs" src="https://raw.githubusercontent.com/haon-chen/mmE5/refs/heads/main/figures/exp_result.jpg">
## Usage
### Transformers
Below is an example we adapted from [VLM2Vec](https://huggingface.co/TIGER-Lab/VLM2Vec-Full).
```python
import torch
import requests
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor
# Pooling and Normalization
def last_pooling(last_hidden_state, attention_mask, normalize=True):
    # Take the hidden state of the last non-padding token as the sequence embedding
    sequence_lengths = attention_mask.sum(dim=1) - 1
    batch_size = last_hidden_state.shape[0]
    reps = last_hidden_state[torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths]
    if normalize:
        reps = torch.nn.functional.normalize(reps, p=2, dim=-1)
    return reps

def compute_similarity(q_reps, p_reps):
    return torch.matmul(q_reps, p_reps.transpose(0, 1))
model_name = "intfloat/mmE5-mllama-11b-instruct"
# Load Processor and Model
processor = AutoProcessor.from_pretrained(model_name)
model = MllamaForConditionalGeneration.from_pretrained(
    model_name, torch_dtype=torch.bfloat16
).to("cuda")
model.eval()
# Image + Text -> Text
image = Image.open(requests.get('https://github.com/haon-chen/mmE5/blob/main/figures/example.jpg?raw=true', stream=True).raw)
inputs = processor(text='<|image|><|begin_of_text|> Represent the given image with the following question: What is in the image', images=[image], return_tensors="pt").to("cuda")
qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])
string = 'A cat and a dog'
text_inputs = processor(text=string, return_tensors="pt").to("cuda")
tgt_output = last_pooling(model(**text_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], text_inputs['attention_mask'])
print(string, '=', compute_similarity(qry_output, tgt_output))
## A cat and a dog = tensor([[0.3965]], device='cuda:0', dtype=torch.bfloat16)
string = 'A cat and a tiger'
text_inputs = processor(text=string, return_tensors="pt").to("cuda")
tgt_output = last_pooling(model(**text_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], text_inputs['attention_mask'])
print(string, '=', compute_similarity(qry_output, tgt_output))
## A cat and a tiger = tensor([[0.3105]], device='cuda:0', dtype=torch.bfloat16)
# Text -> Image
inputs = processor(text='Find me an everyday image that matches the given caption: A cat and a dog.', return_tensors="pt").to("cuda")
qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])
string = '<|image|><|begin_of_text|> Represent the given image.'
tgt_inputs = processor(text=string, images=[image], return_tensors="pt").to("cuda")
tgt_output = last_pooling(model(**tgt_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], tgt_inputs['attention_mask'])
print(string, '=', compute_similarity(qry_output, tgt_output))
## <|image|><|begin_of_text|> Represent the given image. = tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
inputs = processor(text='Find me an everyday image that matches the given caption: A cat and a tiger.', return_tensors="pt").to("cuda")
qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])
string = '<|image|><|begin_of_text|> Represent the given image.'
tgt_inputs = processor(text=string, images=[image], return_tensors="pt").to("cuda")
tgt_output = last_pooling(model(**tgt_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], tgt_inputs['attention_mask'])
print(string, '=', compute_similarity(qry_output, tgt_output))
## <|image|><|begin_of_text|> Represent the given image. = tensor([[0.3887]], device='cuda:0', dtype=torch.bfloat16)
```
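The same pattern extends to ranking several candidate captions against one image query. The sketch below is not part of the original example; it assumes the `model`, `processor`, `last_pooling`, and `compute_similarity` definitions above are still in scope, `image` is the PIL image downloaded there, and the candidate list is purely illustrative.
```python
import torch

# Rank candidate captions against an image query and pick the best match.
candidates = ['A cat and a dog', 'A cat and a tiger', 'Two bicycles']

with torch.no_grad():
    qry_inputs = processor(
        text='<|image|><|begin_of_text|> Represent the given image with the following question: What is in the image',
        images=[image], return_tensors="pt"
    ).to("cuda")
    qry_rep = last_pooling(
        model(**qry_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1],
        qry_inputs['attention_mask'],
    )

    cand_reps = []
    for cand in candidates:
        cand_inputs = processor(text=cand, return_tensors="pt").to("cuda")
        cand_reps.append(last_pooling(
            model(**cand_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1],
            cand_inputs['attention_mask'],
        ))
    cand_reps = torch.cat(cand_reps, dim=0)

scores = compute_similarity(qry_rep, cand_reps)  # shape: (1, len(candidates))
print(candidates[scores.argmax(dim=-1).item()])
```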
### Sentence Transformers
You can also use Sentence Transformers, which abstracts away most of the pre- and post-processing.
```python
from sentence_transformers import SentenceTransformer
import requests
# Load the model
model = SentenceTransformer("intfloat/mmE5-mllama-11b-instruct", trust_remote_code=True)
# Download an example image of a cat and a dog
dog_cat_image_bytes = requests.get('https://github.com/haon-chen/mmE5/blob/main/figures/example.jpg?raw=true', stream=True).raw.read()
with open("cat_dog_example.jpg", "wb") as f:
    f.write(dog_cat_image_bytes)
# Image + Text -> Text
image_embeddings = model.encode([{
    "image": "cat_dog_example.jpg",
    "text": "Represent the given image with the following question: What is in the image",
}])
text_embeddings = model.encode([
    {"text": "A cat and a dog"},
    {"text": "A cat and a tiger"},
])
similarity = model.similarity(image_embeddings, text_embeddings)
print(similarity)
# tensor([[0.3967, 0.3090]])
# ✅ The first text is most similar to the image
# Text -> Image
image_embeddings = model.encode([
    {"image": dog_cat_image_bytes, "text": "Represent the given image."},
])
text_embeddings = model.encode([
    {"text": "Find me an everyday image that matches the given caption: A cat and a dog."},
    {"text": "Find me an everyday image that matches the given caption: A cat and a tiger."},
])
similarity = model.similarity(image_embeddings, text_embeddings)
print(similarity)
# tensor([[0.4250, 0.3896]])
# ✅ The first text is most similar to the image
```
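Since mmE5 is multilingual, the same Sentence Transformers pattern also works for cross-lingual matching. The sketch below is only an illustration, not from the original card; it assumes the model loaded as above and the downloaded `cat_dog_example.jpg`, and the French captions are arbitrary examples.
```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/mmE5-mllama-11b-instruct", trust_remote_code=True)

# Embed the example image and score captions written in another language against it
image_embeddings = model.encode([{
    "image": "cat_dog_example.jpg",
    "text": "Represent the given image.",
}])
text_embeddings = model.encode([
    {"text": "Un chat et un chien"},   # French: "A cat and a dog"
    {"text": "Un chat et un tigre"},   # French: "A cat and a tiger"
])
print(model.similarity(image_embeddings, text_embeddings))
```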
## Citation
```
@article{chen2025mmE5,
title={mmE5: Improving Multimodal Multilingual Embeddings via High-quality Synthetic Data},
author={Chen, Haonan and Wang, Liang and Yang, Nan and Zhu, Yutao and Zhao, Ziliang and Wei, Furu and Dou, Zhicheng},
journal={arXiv preprint arXiv:2502.08468},
year={2025}
}
```