Upload folder using huggingface_hub
Browse files
- config.json +2 -2
- configuration_molmoact.py +2 -2
- modeling_molmoact.py +4 -4
config.json
CHANGED
@@ -20,11 +20,11 @@
|
|
20 |
]
|
21 |
},
|
22 |
"architectures": [
|
23 |
-
"MolmoActForConditionalActionGeneration"
|
24 |
],
|
25 |
"auto_map": {
|
26 |
"AutoConfig": "configuration_molmoact.MolmoActConfig",
|
27 |
-
"AutoModelForImageTextToText": "modeling_molmoact.MolmoActForConditionalActionGeneration"
|
28 |
},
|
29 |
"image_patch_id": 152066,
|
30 |
"initializer_range": 0.02,
|
|
|
20 |
]
|
21 |
},
|
22 |
"architectures": [
|
23 |
+
"MolmoActForActionReasoning"
|
24 |
],
|
25 |
"auto_map": {
|
26 |
"AutoConfig": "configuration_molmoact.MolmoActConfig",
|
27 |
+
"AutoModelForImageTextToText": "modeling_molmoact.MolmoActForActionReasoning"
|
28 |
},
|
29 |
"image_patch_id": 152066,
|
30 |
"initializer_range": 0.02,
|
configuration_molmoact.py
CHANGED
@@ -245,7 +245,7 @@ class MolmoActLlmConfig(PretrainedConfig):
|
|
245 |
|
246 |
class MolmoActConfig(PretrainedConfig):
|
247 |
r"""
|
248 |
-
This is the configuration class to store the configuration of a [`MolmoActForConditionalActionGeneration`].
|
249 |
It is used to instantiate an MolmoAct model according to the specified arguments, defining the model architecture.
|
250 |
|
251 |
Example:
|
@@ -266,7 +266,7 @@ class MolmoActConfig(PretrainedConfig):
|
|
266 |
>>> configuration = MolmoActConfig(vit_config, adapter_config, llm_config, image_patch_id=152069)
|
267 |
|
268 |
>>> # Initializing a model
|
269 |
-
>>> model = MolmoActForConditionalActionGeneration(configuration)
|
270 |
|
271 |
>>> # Accessing the model configuration
|
272 |
>>> configuration = model.config
|
|
|
245 |
|
246 |
class MolmoActConfig(PretrainedConfig):
|
247 |
r"""
|
248 |
+
This is the configuration class to store the configuration of a [`MolmoActForActionReasoning`].
|
249 |
It is used to instantiate an MolmoAct model according to the specified arguments, defining the model architecture.
|
250 |
|
251 |
Example:
|
|
|
266 |
>>> configuration = MolmoActConfig(vit_config, adapter_config, llm_config, image_patch_id=152069)
|
267 |
|
268 |
>>> # Initializing a model
|
269 |
+
>>> model = MolmoActForActionReasoning(configuration)
|
270 |
|
271 |
>>> # Accessing the model configuration
|
272 |
>>> configuration = model.config
|
modeling_molmoact.py
CHANGED
@@ -1787,7 +1787,7 @@ class MolmoActModel(MolmoActPreTrainedModel):
|
|
1787 |
"The MolmoAct model which consists of a vision backbone and a language model + lm head.",
|
1788 |
MOLMO_START_DOCSTRING,
|
1789 |
)
|
1790 |
-
class MolmoActForConditionalActionGeneration(MolmoActPreTrainedModel, GenerationMixin):
|
1791 |
_checkpoint_conversion_mapping = {}
|
1792 |
_tied_weights_keys = [] # Weights are not tied
|
1793 |
config_class = MolmoActConfig
|
@@ -1858,9 +1858,9 @@ class MolmoActForConditionalActionGeneration(MolmoActPreTrainedModel, Generation
|
|
1858 |
```python
|
1859 |
>>> from PIL import Image
|
1860 |
>>> import requests
|
1861 |
-
>>> from transformers import AutoProcessor, MolmoActForConditionalActionGeneration
|
1862 |
|
1863 |
-
>>> model = MolmoActForConditionalActionGeneration.from_pretrained("...")
|
1864 |
>>> processor = AutoProcessor.from_pretrained("...")
|
1865 |
|
1866 |
>>> prompt = "What's the content of the image?"
|
@@ -2096,5 +2096,5 @@ class MolmoActForConditionalActionGeneration(MolmoActPreTrainedModel, Generation
|
|
2096 |
|
2097 |
|
2098 |
# Always register for multi-modal features
|
2099 |
-
AutoModelForImageTextToText.register(MolmoActConfig, MolmoActForConditionalActionGeneration)
|
2100 |
AutoModelForCausalLM.register(MolmoActLlmConfig, MolmoActForCausalLM)
|
|
|
1787 |
"The MolmoAct model which consists of a vision backbone and a language model + lm head.",
|
1788 |
MOLMO_START_DOCSTRING,
|
1789 |
)
|
1790 |
+
class MolmoActForActionReasoning(MolmoActPreTrainedModel, GenerationMixin):
|
1791 |
_checkpoint_conversion_mapping = {}
|
1792 |
_tied_weights_keys = [] # Weights are not tied
|
1793 |
config_class = MolmoActConfig
|
|
|
1858 |
```python
|
1859 |
>>> from PIL import Image
|
1860 |
>>> import requests
|
1861 |
+
>>> from transformers import AutoProcessor, MolmoActForActionReasoning
|
1862 |
|
1863 |
+
>>> model = MolmoActForActionReasoning.from_pretrained("...")
|
1864 |
>>> processor = AutoProcessor.from_pretrained("...")
|
1865 |
|
1866 |
>>> prompt = "What's the content of the image?"
|
|
|
2096 |
|
2097 |
|
2098 |
# Always register for multi-modal features
|
2099 |
+
AutoModelForImageTextToText.register(MolmoActConfig, MolmoActForActionReasoning)
|
2100 |
AutoModelForCausalLM.register(MolmoActLlmConfig, MolmoActForCausalLM)
|