hqfang committed on
Commit
abec641
·
verified ·
1 Parent(s): c009752

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -20,11 +20,11 @@
20
  ]
21
  },
22
  "architectures": [
23
- "MolmoActForConditionalActionGeneration"
24
  ],
25
  "auto_map": {
26
  "AutoConfig": "configuration_molmoact.MolmoActConfig",
27
- "AutoModelForImageTextToText": "modeling_molmoact.MolmoActForConditionalActionGeneration"
28
  },
29
  "image_patch_id": 152066,
30
  "initializer_range": 0.02,
 
20
  ]
21
  },
22
  "architectures": [
23
+ "MolmoActForActionReasoning"
24
  ],
25
  "auto_map": {
26
  "AutoConfig": "configuration_molmoact.MolmoActConfig",
27
+ "AutoModelForImageTextToText": "modeling_molmoact.MolmoActForActionReasoning"
28
  },
29
  "image_patch_id": 152066,
30
  "initializer_range": 0.02,
configuration_molmoact.py CHANGED
@@ -245,7 +245,7 @@ class MolmoActLlmConfig(PretrainedConfig):
245
 
246
  class MolmoActConfig(PretrainedConfig):
247
  r"""
248
- This is the configuration class to store the configuration of a [`MolmoActForConditionalActionGeneration`].
249
  It is used to instantiate a MolmoAct model according to the specified arguments, defining the model architecture.
250
 
251
  Example:
@@ -266,7 +266,7 @@ class MolmoActConfig(PretrainedConfig):
266
  >>> configuration = MolmoActConfig(vit_config, adapter_config, llm_config, image_patch_id=152069)
267
 
268
  >>> # Initializing a model
269
- >>> model = MolmoActForConditionalActionGeneration(configuration)
270
 
271
  >>> # Accessing the model configuration
272
  >>> configuration = model.config
 
245
 
246
  class MolmoActConfig(PretrainedConfig):
247
  r"""
248
+ This is the configuration class to store the configuration of a [`MolmoActForActionReasoning`].
249
  It is used to instantiate a MolmoAct model according to the specified arguments, defining the model architecture.
250
 
251
  Example:
 
266
  >>> configuration = MolmoActConfig(vit_config, adapter_config, llm_config, image_patch_id=152069)
267
 
268
  >>> # Initializing a model
269
+ >>> model = MolmoActForActionReasoning(configuration)
270
 
271
  >>> # Accessing the model configuration
272
  >>> configuration = model.config
modeling_molmoact.py CHANGED
@@ -1787,7 +1787,7 @@ class MolmoActModel(MolmoActPreTrainedModel):
1787
  "The MolmoAct model which consists of a vision backbone and a language model + lm head.",
1788
  MOLMO_START_DOCSTRING,
1789
  )
1790
- class MolmoActForConditionalActionGeneration(MolmoActPreTrainedModel, GenerationMixin):
1791
  _checkpoint_conversion_mapping = {}
1792
  _tied_weights_keys = [] # Weights are not tied
1793
  config_class = MolmoActConfig
@@ -1858,9 +1858,9 @@ class MolmoActForConditionalActionGeneration(MolmoActPreTrainedModel, Generation
1858
  ```python
1859
  >>> from PIL import Image
1860
  >>> import requests
1861
- >>> from transformers import AutoProcessor, MolmoActForConditionalActionGeneration
1862
 
1863
- >>> model = MolmoActForConditionalActionGeneration.from_pretrained("...")
1864
  >>> processor = AutoProcessor.from_pretrained("...")
1865
 
1866
  >>> prompt = "What's the content of the image?"
@@ -2096,5 +2096,5 @@ class MolmoActForConditionalActionGeneration(MolmoActPreTrainedModel, Generation
2096
 
2097
 
2098
  # Always register for multi-modal features
2099
- AutoModelForImageTextToText.register(MolmoActConfig, MolmoActForConditionalActionGeneration)
2100
  AutoModelForCausalLM.register(MolmoActLlmConfig, MolmoActForCausalLM)
 
1787
  "The MolmoAct model which consists of a vision backbone and a language model + lm head.",
1788
  MOLMO_START_DOCSTRING,
1789
  )
1790
+ class MolmoActForActionReasoning(MolmoActPreTrainedModel, GenerationMixin):
1791
  _checkpoint_conversion_mapping = {}
1792
  _tied_weights_keys = [] # Weights are not tied
1793
  config_class = MolmoActConfig
 
1858
  ```python
1859
  >>> from PIL import Image
1860
  >>> import requests
1861
+ >>> from transformers import AutoProcessor, MolmoActForActionReasoning
1862
 
1863
+ >>> model = MolmoActForActionReasoning.from_pretrained("...")
1864
  >>> processor = AutoProcessor.from_pretrained("...")
1865
 
1866
  >>> prompt = "What's the content of the image?"
 
2096
 
2097
 
2098
  # Always register for multi-modal features
2099
+ AutoModelForImageTextToText.register(MolmoActConfig, MolmoActForActionReasoning)
2100
  AutoModelForCausalLM.register(MolmoActLlmConfig, MolmoActForCausalLM)