model name refactoring (#9)
- refactor code (ed9e65ece65a6aa036374f03e5a11c6d6d38037d)
- update modeling (f6e6df119cb052a1bcbac7e497e1b4fefdeb7bb5)
- update readme (4f3aaec32267bf7d09fe740d647c74f98261a34e)
- README.md +2 -2
- config.json +11 -11
- colgranitevision_config.py → granite_vision_embedding_config.py +4 -2
- modeling_colgranitevision.py → modeling_granite_vision_embedding.py +6 -9
- preprocessor_config.json +1 -1
- processing_colgranitevision.py → processing_granite_vision_embedding.py +11 -11
- processor_config.json +2 -2
README.md
CHANGED

@@ -12,7 +12,7 @@ Granite-vision-3.3-2b-embedding is an efficient embedding model based on granite
 By removing the need for OCR-based text extraction, granite-vision-3.3-2b-embedding can help simplify and accelerate RAG pipelines.
 
 **Evaluations:**
-We evaluated granite-vision-3.3-2b-embedding alongside other top ColBERT-style multimodal embedding models in the 1B-4B parameter range on two benchmarks, Vidore2 and [Real-MM-RAG-Bench](https://arxiv.org/abs/2502.12342), which specifically target complex multimodal document retrieval tasks.
+We evaluated granite-vision-3.3-2b-embedding alongside other top ColBERT-style multimodal embedding models in the 1B-4B parameter range on two benchmarks, [Vidore2](https://github.com/illuin-tech/vidore-benchmark/) and [Real-MM-RAG-Bench](https://arxiv.org/abs/2502.12342) ([dataset](https://huggingface.co/collections/ibm-research/real-mm-rag-bench-67d2dc0ddf2dfafe66f09d34)), which specifically target complex multimodal document retrieval tasks.
 
 ## **NDCG@5 - ViDoRe V2**
 | Collection \ Model | ColPali-v1.3 | ColQwen2.5-v0.2 | ColNomic-3b | ColSmolvlm-v0.1 | granite-vision-3.3-2b-embedding |

@@ -102,7 +102,7 @@ print(f"📊 Similarity between image and text: {similarity.item():.4f}")
 print("=" * 50)
 ```
 ### Use granite-vision-embedding-3.3-2b for MM RAG
-For an example of MM-RAG using granite-vision-3.3-2b-embedding, refer to [this notebook](
+For an example of MM-RAG using granite-vision-3.3-2b-embedding, refer to [this notebook](https://github.com/ibm-granite/granite-vision-models/tree/main/cookbooks/GraniteVisionEmbedding_MM-RAG_Notebook).
 
 **Model Architecture:**
 The architecture of granite-vision-3.3-2b-embedding follows the [ColPali](https://arxiv.org/abs/2407.01449) approach and consists of the following components:
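The README points to the ColPali-style late-interaction retrieval that granite-vision-3.3-2b-embedding follows. Below is a minimal scoring sketch in plain PyTorch; the tensor shapes and the helper name are illustrative assumptions, not the repo's API, and only the 128-d embedding size comes from this model.

```python
# MaxSim late-interaction scoring sketch (ColPali-style), assuming 128-d multi-vector
# embeddings per token. Shapes and random inputs are for illustration only.
import torch

def maxsim_score(query_emb: torch.Tensor, doc_emb: torch.Tensor) -> torch.Tensor:
    """query_emb: (n_query_tokens, 128); doc_emb: (n_doc_tokens, 128)."""
    sim = query_emb @ doc_emb.T            # token-to-token similarity matrix
    return sim.max(dim=1).values.sum()     # best doc token per query token, summed

query_emb = torch.randn(16, 128)   # e.g. a short text query
doc_emb = torch.randn(729, 128)    # e.g. one document page image
print(f"MaxSim score: {maxsim_score(query_emb, doc_emb).item():.4f}")
```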
config.json
CHANGED

@@ -1,18 +1,18 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "ibm_granite/granite-vision-3.3-2b",
   "adapter_path": null,
-
-  "AutoModel": "
-  "AutoProcessor": "
-  "AutoConfig": "
+  "auto_map": {
+    "AutoModel": "modeling_granite_vision_embedding.GraniteVisionEmb",
+    "AutoProcessor": "processing_granite_vision_embedding.GraniteVisionEmbProcessor",
+    "AutoConfig": "granite_vision_embedding_config.GraniteVisionEmbConfig"
   },
   "architectures": [
-    "
+    "GraniteVisionEmb"
   ],
+  "base_image_feature_location": "last",
   "base_model": null,
   "emb_dim_doc": 128,
   "emb_dim_query": 128,
-  "base_image_feature_location": "last",
   "image_grid_pinpoints": [
     [
       384,

@@ -121,7 +121,7 @@
   ],
   "image_seq_length": 576,
   "image_token_index": 49155,
-  "model_type": "
+  "model_type": "granitevisionemb",
   "multimodal_projector_bias": true,
   "pretrained_language_model": "",
   "pretrained_vision_tower": "",

@@ -149,12 +149,12 @@
     "rms_norm_eps": 1e-05,
     "rope_theta": 300000,
     "tie_word_embeddings": true,
-    "torch_dtype": "
+    "torch_dtype": "bfloat16",
     "vocab_size": 49156
   },
   "tie_word_embeddings": true,
   "torch_dtype": "float32",
-  "transformers_version": "4.
+  "transformers_version": "4.49.0",
   "use_image_newline_parameter": true,
   "vision_config": {
     "_attn_implementation_autoset": true,

@@ -167,7 +167,7 @@
     "num_attention_heads": 16,
     "num_hidden_layers": 27,
     "patch_size": 14,
-    "torch_dtype": "
+    "torch_dtype": "bfloat16"
   },
   "vision_feature_layer": [
     -24,
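The new `auto_map` entries are what let `transformers` resolve the renamed modules when the repository is loaded with `trust_remote_code=True`. A minimal loading sketch; the hub repo id below is an assumption for illustration.

```python
# Loading sketch based on the auto_map entries above; the repo id is an assumed placeholder.
from transformers import AutoConfig, AutoModel, AutoProcessor

repo_id = "ibm-granite/granite-vision-3.3-2b-embedding"  # assumed hub id

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)        # GraniteVisionEmbConfig
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)          # GraniteVisionEmb
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)  # GraniteVisionEmbProcessor

print(config.model_type)   # "granitevisionemb"
print(config.emb_dim_doc)  # 128, per the config above
```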
colgranitevision_config.py → granite_vision_embedding_config.py
RENAMED

@@ -1,8 +1,8 @@
 from transformers import LlavaNextConfig
 
 
-class ColGraniteVisionConfig(LlavaNextConfig):
-    model_type = "
+class GraniteVisionEmbConfig(LlavaNextConfig):
+    model_type = "granitevisionemb"
 
     def __init__(self, **kwargs):
         self.base_model = kwargs.get("base_model", None)

@@ -11,3 +11,5 @@ class ColGraniteVisionConfig(LlavaNextConfig):
         self.base_image_feature_location = kwargs.get("base_image_feature_location", "last")
         self.adapter_path = kwargs.get("adapter_path", None)
         super().__init__(**kwargs)
+
+
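A quick sanity check of the renamed config class; this assumes a local checkout where the renamed file sits next to the script, and only exercises the defaults visible in the diff above.

```python
# Instantiates the renamed config class and checks the defaults set in its __init__ above.
from granite_vision_embedding_config import GraniteVisionEmbConfig

cfg = GraniteVisionEmbConfig()
assert cfg.model_type == "granitevisionemb"
assert cfg.base_model is None
assert cfg.base_image_feature_location == "last"  # default per the kwargs handling above
assert cfg.adapter_path is None
print("GraniteVisionEmbConfig defaults look as expected")
```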
modeling_colgranitevision.py → modeling_granite_vision_embedding.py
RENAMED

@@ -7,11 +7,10 @@ from transformers import LlavaNextPreTrainedModel
 from transformers.models.llava_next.modeling_llava_next import LlavaNextForConditionalGeneration
 from transformers.models.llava_next.modeling_llava_next import unpad_image, get_anyres_image_grid_shape
 
-from .
-
+from .granite_vision_embedding_config import GraniteVisionEmbConfig
 
 class LlavaNextWithCustomPacking(LlavaNextForConditionalGeneration):
-
+
     def pack_image_features(
         self,
         image_features,

@@ -93,15 +92,15 @@ class ColGraniteVision(LlavaNextPreTrainedModel):
         return image_features, feature_lens
 
 
-class ColGraniteVision(LlavaNextPreTrainedModel):
+class GraniteVisionEmb(LlavaNextPreTrainedModel):
     """
-
+    GraniteVisionEmb model implementation.
     """
 
     main_input_name: ClassVar[str] = "doc_input_ids"  # transformers-related
-    config_class = 
+    config_class = GraniteVisionEmbConfig
 
-    def __init__(self, config: 
+    def __init__(self, config: GraniteVisionEmbConfig):
         super().__init__(config=config)
 
         model = LlavaNextWithCustomPacking(config=config)

@@ -109,8 +108,6 @@ class ColGraniteVision(LlavaNextPreTrainedModel):
         self._tied_weights_keys = [f"model.language_model.{k}" for k in model.language_model._tied_weights_keys]
         self.model = model
 
-        # TODO: Wait for ColPali2 to create a ColPaliConfig to allow specifying the embedding dimension.
-        # We could do it now but it would break all the models trying to load the model from the checkpoint.
         self.dim = 128
         self.custom_text_proj = nn.Linear(self.model.config.text_config.hidden_size, self.dim)
 
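The renamed model keeps the ColPali-style head: `custom_text_proj` maps each token's hidden state down to `self.dim = 128`. A standalone sketch of that projection step; the hidden size, sequence length, and the per-token L2-normalization are assumptions borrowed from the usual ColPali recipe, not shown in this diff.

```python
# Standalone sketch of the multi-vector projection head defined in __init__ above.
# hidden_size and sequence length are illustrative; unit-normalization is an assumption.
import torch
from torch import nn

hidden_size, dim = 2048, 128                     # dim matches self.dim = 128 above
custom_text_proj = nn.Linear(hidden_size, dim)

last_hidden_state = torch.randn(1, 729, hidden_size)    # (batch, tokens, hidden)
proj = custom_text_proj(last_hidden_state)               # (batch, tokens, 128)
embeddings = proj / proj.norm(dim=-1, keepdim=True)      # unit-norm per token (assumed)
print(embeddings.shape)  # torch.Size([1, 729, 128])
```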
preprocessor_config.json
CHANGED

@@ -127,7 +127,7 @@
     0.5,
     0.5
   ],
-  "processor_class": "
+  "processor_class": "GraniteVisionEmbProcessor",
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
   "size": {
processing_colgranitevision.py → processing_granite_vision_embedding.py
RENAMED

@@ -21,7 +21,7 @@ def floor_by_factor(number: float, factor: int) -> int:
     return math.floor(number / factor) * factor
 
 
-class ColGraniteVisionProcessor(LlavaNextProcessor):
+class GraniteVisionEmbProcessor(LlavaNextProcessor):
     """
     Processor for ColPali.
     """

@@ -140,14 +140,14 @@ class ColGraniteVisionProcessor(LlavaNextProcessor):
             max_size=self.max_size,
             fill_color=0
         )
-
+
     def resize_and_pad_centered_to_long_side(
-
-
-
-
-
-
+            self,
+            image: Image.Image,
+            factor: int,
+            min_size: int,
+            max_size: int,
+            fill_color=0
     ) -> Image.Image:
         """
         Resizes and pads an image such that:

@@ -183,10 +183,10 @@ class ColGraniteVisionProcessor(LlavaNextProcessor):
 
         # Resize the image
         resized_image = image.resize((target_width, target_height), Image.LANCZOS)
-        final_image =resized_image.convert("RGB")
+        final_image = resized_image.convert("RGB")
 
         return final_image
-
+
     def resize_and_pad_centered(self,
                                 image: Image.Image,
                                 factor: int,

@@ -439,4 +439,4 @@ class ColGraniteVisionProcessor(LlavaNextProcessor):
         assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"
 
         scores = scores.to(torch.float32)
-        return scores
+        return scores
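A hedged usage sketch of the repaired `resize_and_pad_centered_to_long_side` signature. Loading from "." assumes a local checkout of this repository with its tokenizer and preprocessor files; the factor/min/max values are illustrative stand-ins, not the processor's own configuration.

```python
# Usage sketch for the helper whose signature is fixed above. Argument values are
# illustrative assumptions; inside the processor they come from its own settings.
from PIL import Image
from processing_granite_vision_embedding import GraniteVisionEmbProcessor

processor = GraniteVisionEmbProcessor.from_pretrained(".")  # local checkout of this repo
page = Image.new("RGB", (1200, 800), color="white")         # stand-in document page

resized = processor.resize_and_pad_centered_to_long_side(
    image=page,
    factor=14,       # illustrative: matches the vision tower's patch_size in config.json
    min_size=384,
    max_size=1536,
    fill_color=0,
)
print(resized.size)
```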
processor_config.json
CHANGED

@@ -1,6 +1,6 @@
 {
-  "processor_class": "
+  "processor_class": "GraniteVisionEmbProcessor",
   "auto_map": {
-    "AutoProcessor": "
+    "AutoProcessor": "processing_granite_vision_embedding.GraniteVisionEmbProcessor"
   }
 }