emova-qwen-2-5-7b-hf / configuration_emova.py
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""EMOVA model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
from transformers.models.auto import CONFIG_MAPPING
from .configuration_qwen2vit import Qwen2VLVisionConfig
logger = logging.get_logger(__name__)
class EMOVAConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`EMOVAForConditionalGeneration`]. It is used to instantiate an
EMOVA model according to the specified arguments, defining the model architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
        vision_config (`Union[Qwen2VLVisionConfig, dict]`, *optional*, defaults to `Qwen2VLVisionConfig`):
            The config object or dictionary of the vision backbone.
        mm_projector_config (`dict`, *optional*):
            The config object or dictionary of the multimodal projector.
        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
            The config object or dictionary of the text backbone.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        image_token_index (`int`, *optional*, defaults to 32000):
            The image token index to encode the image prompt.
        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
            The feature selection strategy used to select the vision feature from the vision backbone.
            Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
            If `"full"`, the full vision features are used.
        vision_feature_layer (`int`, *optional*, defaults to -2):
            The index of the layer to select the vision feature.
        is_native_resolution (`bool`, *optional*, defaults to `True`):
            Whether the vision backbone processes images at their native resolution. When `True`,
            `image_grid_pinpoints` is not used.
        image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`):
            A list of possible resolutions to use for processing high-resolution images. Each item in the list should be a tuple or list
            of the form `(height, width)`. Only used when `is_native_resolution` is `False`.
        use_image_newline_parameter (`bool`, *optional*, defaults to `False`):
            Whether the model uses an image-newline embedding parameter between rows of image features.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
    Example:
    ```python
    >>> from transformers import EMOVAForConditionalGeneration, EMOVAConfig, LlamaConfig
    >>> # Initializing a Llama text config
    >>> text_config = LlamaConfig()
    >>> # Initializing an EMOVA style configuration; leaving `vision_config` unset
    >>> # falls back to the default Qwen2-VL vision backbone settings
    >>> configuration = EMOVAConfig(text_config=text_config)
    >>> # Initializing a model from the configuration
    >>> model = EMOVAForConditionalGeneration(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
model_type = "emova"
is_composition = False
def __init__(
self,
vision_config=None,
mm_projector_config=None,
text_config=None,
ignore_index=-100,
image_token_index=32000,
vision_feature_select_strategy="default",
vision_feature_layer=-2,
is_native_resolution=True,
image_grid_pinpoints=None,
use_image_newline_parameter=False,
tie_word_embeddings=False,
**kwargs,
):
self.ignore_index = ignore_index
self.image_token_index = image_token_index
        if vision_feature_select_strategy not in ["default", "full"]:
            raise ValueError(
                "vision_feature_select_strategy should be one of 'default', 'full'. "
                f"Got: {vision_feature_select_strategy}"
            )
        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.vision_feature_layer = vision_feature_layer

        # Anyres-style grid pinpoints are only meaningful when the vision backbone
        # does not operate at native resolution.
        image_grid_pinpoints = (
            image_grid_pinpoints
            if image_grid_pinpoints is not None
            else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
        )
        self.is_native_resolution = is_native_resolution
        self.image_grid_pinpoints = image_grid_pinpoints if not self.is_native_resolution else None
        self.use_image_newline_parameter = use_image_newline_parameter
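        # Build the vision backbone config: accept a dict (e.g. loaded from config.json)
        # or fall back to the default Qwen2-VL ViT settings below.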
if isinstance(vision_config, dict):
vision_config = Qwen2VLVisionConfig(**vision_config)
elif vision_config is None:
vision_config = Qwen2VLVisionConfig(
depth=32,
embed_dim=1280,
hidden_act="quick_gelu",
hidden_size=3584,
in_channels=3,
in_chans=3,
mlp_ratio=4,
model_type="qwen2_vl",
num_heads=16,
patch_size=14,
spatial_merge_size=2,
spatial_patch_size=14,
temporal_patch_size=2,
initializer_range=0.02,
)
        # NOTE: a CLIP vision tower fallback (as in LLaVA-style configs) is intentionally
        # not used here; the default vision backbone is the Qwen2-VL ViT constructed above.
self.vision_config = vision_config
self.mm_projector_config = mm_projector_config
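        # Build the text backbone config: accept a dict (e.g. loaded from config.json)
        # or default to a Llama configuration.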
if isinstance(text_config, dict):
text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
elif text_config is None:
text_config = CONFIG_MAPPING["llama"]()
self.text_config = text_config
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
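

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): the values below are assumptions for
    # demonstration and do not reproduce the shipped EMOVA checkpoint settings. Run in
    # package context so the relative import of `Qwen2VLVisionConfig` above resolves.
    demo_config = EMOVAConfig(
        vision_config={"depth": 32, "embed_dim": 1280, "hidden_size": 3584, "num_heads": 16},
        text_config={"model_type": "qwen2"},
        image_token_index=32000,
        vision_feature_select_strategy="full",
    )
    print(type(demo_config.vision_config).__name__)  # Qwen2VLVisionConfig
    print(demo_config.text_config.model_type)        # qwen2
    print(demo_config.to_dict().keys())              # fields serialized into config.json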