Adi Raz Goldfarb [email protected] committed
Commit ed9e65e · 1 Parent(s): 0f8595a

refactor code

config.json CHANGED
@@ -1,18 +1,18 @@
 {
-  "_name_or_path": "ibm-granite/granite-vision-3.3-2b",
+  "_name_or_path": "ibm_granite/granite-vision-3.3-2b",
   "adapter_path": null,
-  "auto_map": {
-    "AutoModel": "modeling_colgranitevision.ColGraniteVision",
-    "AutoProcessor": "processing_colgranitevision.ColGraniteVisionProcessor",
-    "AutoConfig": "colgranitevision_config.ColGraniteVisionConfig"
+  "auto_map": {
+    "AutoModel": "modeling_granite_vision_embedding.GraniteVisionEmb",
+    "AutoProcessor": "processing_granite_vision_embedding.GraniteVisionEmbProcessor",
+    "AutoConfig": "granite_vision_embedding_config.GraniteVisionEmbConfig"
   },
   "architectures": [
-    "ColGraniteVision"
+    "GraniteVisionEmb"
   ],
+  "base_image_feature_location": "last",
   "base_model": null,
   "emb_dim_doc": 128,
   "emb_dim_query": 128,
-  "base_image_feature_location": "last",
   "image_grid_pinpoints": [
     [
       384,
@@ -121,7 +121,7 @@
   ],
   "image_seq_length": 576,
   "image_token_index": 49155,
-  "model_type": "colgranitevision",
+  "model_type": "granitevisionemb",
   "multimodal_projector_bias": true,
   "pretrained_language_model": "",
   "pretrained_vision_tower": "",
@@ -149,12 +149,12 @@
     "rms_norm_eps": 1e-05,
     "rope_theta": 300000,
     "tie_word_embeddings": true,
-    "torch_dtype": "float32",
+    "torch_dtype": "bfloat16",
     "vocab_size": 49156
   },
   "tie_word_embeddings": true,
   "torch_dtype": "float32",
-  "transformers_version": "4.50.0.dev0",
+  "transformers_version": "4.49.0",
   "use_image_newline_parameter": true,
   "vision_config": {
     "_attn_implementation_autoset": true,
@@ -167,7 +167,7 @@
     "num_attention_heads": 16,
     "num_hidden_layers": 27,
     "patch_size": 14,
-    "torch_dtype": "float32"
+    "torch_dtype": "bfloat16"
   },
   "vision_feature_layer": [
     -24,
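
The auto_map block above is what lets transformers resolve the renamed custom classes directly from the repository's own Python files. A minimal loading sketch, assuming the checkpoint is published under a hub id like the one below (the exact repository name is an assumption):

    from transformers import AutoConfig, AutoModel, AutoProcessor

    # Hypothetical hub id; substitute the actual repository name.
    repo_id = "ibm-granite/granite-vision-embedding"

    # trust_remote_code=True lets transformers import GraniteVisionEmbConfig,
    # GraniteVisionEmb, and GraniteVisionEmbProcessor from the files named
    # in the auto_map entries of config.json.
    config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
    model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
    processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)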
colgranitevision_config.py → granite_vision_embedding_config.py RENAMED
@@ -1,8 +1,8 @@
 from transformers import LlavaNextConfig
 
 
-class ColGraniteVisionConfig(LlavaNextConfig):
-    model_type = "colgranitevision"
+class GraniteVisionEmbConfig(LlavaNextConfig):
+    model_type = "granitevisionemb"
 
     def __init__(self, **kwargs):
         self.base_model = kwargs.get("base_model", None)
@@ -11,3 +11,5 @@ class ColGraniteVisionConfig(LlavaNextConfig):
         self.base_image_feature_location = kwargs.get("base_image_feature_location", "last")
         self.adapter_path = kwargs.get("adapter_path", None)
         super().__init__(**kwargs)
+
+
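
Since GraniteVisionEmbConfig only reads its custom fields out of kwargs before deferring to LlavaNextConfig, it can be constructed directly. A small usage sketch, with defaults inferred from the __init__ shown above:

    from granite_vision_embedding_config import GraniteVisionEmbConfig

    cfg = GraniteVisionEmbConfig()
    print(cfg.model_type)                   # "granitevisionemb"
    print(cfg.base_model)                   # None unless passed as a kwarg
    print(cfg.base_image_feature_location)  # "last" by default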
modeling_colgranitevision.py → modeling_granite_vision_embedding.py RENAMED
@@ -7,11 +7,14 @@ from transformers import LlavaNextPreTrainedModel
 from transformers.models.llava_next.modeling_llava_next import LlavaNextForConditionalGeneration
 from transformers.models.llava_next.modeling_llava_next import unpad_image, get_anyres_image_grid_shape
 
-from .colgranitevision_config import ColGraniteVisionConfig
+try:
+    from .granite_vision_embedding_config import GraniteVisionEmbConfig
+except ImportError:
+    from granite_vision_embedding_config import GraniteVisionEmbConfig
 
 
 class LlavaNextWithCustomPacking(LlavaNextForConditionalGeneration):
-
+
     def pack_image_features(
         self,
         image_features,
@@ -93,15 +96,15 @@ class LlavaNextWithCustomPacking(LlavaNextForConditionalGeneration):
         return image_features, feature_lens
 
 
-class ColGraniteVision(LlavaNextPreTrainedModel):
+class GraniteVisionEmb(LlavaNextPreTrainedModel):
     """
-    ColGraniteVision model implementation.
+    GraniteVisionEmb model implementation.
     """
 
     main_input_name: ClassVar[str] = "doc_input_ids"  # transformers-related
-    config_class = ColGraniteVisionConfig
+    config_class = GraniteVisionEmbConfig
 
-    def __init__(self, config: ColGraniteVisionConfig):
+    def __init__(self, config: GraniteVisionEmbConfig):
         super().__init__(config=config)
 
         model = LlavaNextWithCustomPacking(config=config)
@@ -109,8 +112,6 @@ class ColGraniteVision(LlavaNextPreTrainedModel):
         self._tied_weights_keys = [f"model.language_model.{k}" for k in model.language_model._tied_weights_keys]
         self.model = model
 
-        # TODO: Wait for ColPali2 to create a ColPaliConfig to allow specifying the embedding dimension.
-        # We could do it now but it would break all the models trying to load the model from the checkpoint.
         self.dim = 128
         self.custom_text_proj = nn.Linear(self.model.config.text_config.hidden_size, self.dim)
 
preprocessor_config.json CHANGED
@@ -127,7 +127,7 @@
     0.5,
     0.5
   ],
-  "processor_class": "ColGraniteVisionProcessor",
+  "processor_class": "GraniteVisionEmbProcessor",
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
   "size": {
processing_colgranitevision.py → processing_granite_vision_embedding.py RENAMED
@@ -21,7 +21,7 @@ def floor_by_factor(number: float, factor: int) -> int:
     return math.floor(number / factor) * factor
 
 
-class ColGraniteVisionProcessor(LlavaNextProcessor):
+class GraniteVisionEmbProcessor(LlavaNextProcessor):
     """
     Processor for ColPali.
     """
@@ -140,14 +140,14 @@ class ColGraniteVisionProcessor(LlavaNextProcessor):
             max_size=self.max_size,
             fill_color=0
         )
-
+
     def resize_and_pad_centered_to_long_side(
-        self,
-        image: Image.Image,
-        factor: int,
-        min_size: int,
-        max_size: int,
-        fill_color=0
+        self,
+        image: Image.Image,
+        factor: int,
+        min_size: int,
+        max_size: int,
+        fill_color=0
     ) -> Image.Image:
         """
         Resizes and pads an image such that:
@@ -183,10 +183,10 @@ class ColGraniteVisionProcessor(LlavaNextProcessor):
 
         # Resize the image
         resized_image = image.resize((target_width, target_height), Image.LANCZOS)
-        final_image =resized_image.convert("RGB")
+        final_image = resized_image.convert("RGB")
 
         return final_image
-
+
     def resize_and_pad_centered(self,
                                 image: Image.Image,
                                 factor: int,
@@ -439,4 +439,4 @@ class ColGraniteVisionProcessor(LlavaNextProcessor):
         assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"
 
         scores = scores.to(torch.float32)
-        return scores
+        return scores
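
The final hunk shows the tail of a multi-vector scoring method: it asserts one score row per query and casts the result to float32. A hedged sketch of the MaxSim scoring rule that ColPali-style processors compute (the function name and shapes here are assumptions; the full method body is not part of this diff):

    import torch

    def maxsim_score(q_emb: torch.Tensor, d_emb: torch.Tensor) -> torch.Tensor:
        # q_emb: (q_len, dim) query token embeddings; d_emb: (d_len, dim) document
        # token embeddings, both L2-normalized. For each query token, take its best
        # matching document token, then sum over query tokens.
        sim = q_emb @ d_emb.T              # (q_len, d_len) cosine similarities
        return sim.max(dim=1).values.sum()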
processor_config.json CHANGED
@@ -1,6 +1,6 @@
 {
-  "processor_class": "ColGraniteVisionProcessor",
+  "processor_class": "GraniteVisionEmbProcessor",
   "auto_map": {
-    "AutoProcessor": "processing_colgranitevision.ColGraniteVisionProcessor"
+    "AutoProcessor": "processing_granite_vision_embedding.GraniteVisionEmbProcessor"
   }
 }