File size: 7,079 Bytes

b3afdde

# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""EMOVA model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
from transformers.models.auto import CONFIG_MAPPING

from .configuration_qwen2vit import Qwen2VLVisionConfig

logger = logging.get_logger(__name__)


class EMOVAConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`EMOVAForConditionalGeneration`]. It is used to
    instantiate an EMOVA model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`Union[Qwen2VLVisionConfig, dict]`, *optional*):
            The config object or dictionary of the vision backbone. A dictionary is expanded into a
            `Qwen2VLVisionConfig`. If `None`, a `Qwen2VLVisionConfig` is built with `depth=32`, `embed_dim=1280`,
            `hidden_size=3584`, `num_heads=16`, `patch_size=14`, `spatial_merge_size=2` and
            `temporal_patch_size=2` (see `__init__` for the full list of values).
        mm_projector_config (*optional*):
            Configuration of the multimodal projector. It is stored on the config as given; no conversion or
            validation is performed here.
        text_config (`Union[PretrainedConfig, dict]`, *optional*):
            The config object or dictionary of the text backbone. A dictionary is resolved through
            `CONFIG_MAPPING` using its `"model_type"` key (`"llama"` when the key is absent). If `None`, the
            config class registered under `"llama"` is instantiated with its defaults.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        image_token_index (`int`, *optional*, defaults to 32000):
            The image token index to encode the image prompt.
        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
            Must be one of `"default"` or `"full"`, otherwise a `ValueError` is raised. The value is validated but
            currently not stored on the config.
        vision_feature_layer (`int`, *optional*, defaults to -2):
            Accepted but currently not stored on the config.
        is_native_resolution (`bool`, *optional*, defaults to `True`):
            Accepted but currently not stored on the config.
        image_grid_pinpoints (`List`, *optional*):
            Accepted but currently not stored on the config.
        use_image_newline_parameter (`bool`, *optional*, defaults to `False`):
            Accepted but currently not stored on the config.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied. Forwarded to
            [`PretrainedConfig`].

    Raises:
        ValueError: If `vision_feature_select_strategy` is neither `"default"` nor `"full"`.

    Example:

    ```python
    >>> from transformers import LlamaConfig

    >>> # Initializing a Llama config for the text backbone
    >>> text_config = LlamaConfig()

    >>> # Initializing a EMOVA style configuration (the default vision config is used).
    >>> # Pass sub-configs by keyword: the second positional parameter is `mm_projector_config`.
    >>> configuration = EMOVAConfig(text_config=text_config)

    >>> # Initializing a model from the style configuration
    >>> model = EMOVAForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "emova"
    is_composition = False

    def __init__(
        self,
        vision_config=None,
        mm_projector_config=None,
        text_config=None,
        ignore_index=-100,
        image_token_index=32000,
        vision_feature_select_strategy="default",
        vision_feature_layer=-2,
        is_native_resolution=True,
        image_grid_pinpoints=None,
        use_image_newline_parameter=False,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.image_token_index = image_token_index

        if vision_feature_select_strategy not in ["default", "full"]:
            raise ValueError(
                "vision_feature_select_strategy should be one of 'default', 'full'. "
                f"Got: {vision_feature_select_strategy}"
            )

        # NOTE: `vision_feature_select_strategy`, `vision_feature_layer`, `is_native_resolution`,
        # `image_grid_pinpoints` and `use_image_newline_parameter` are accepted by the signature
        # but intentionally not stored as attributes on this config.

        if isinstance(vision_config, dict):
            vision_config = Qwen2VLVisionConfig(**vision_config)
        elif vision_config is None:
            # Default vision backbone used when the caller supplies nothing.
            vision_config = Qwen2VLVisionConfig(
                depth=32,
                embed_dim=1280,
                hidden_act="quick_gelu",
                hidden_size=3584,
                in_channels=3,
                in_chans=3,
                mlp_ratio=4,
                model_type="qwen2_vl",
                num_heads=16,
                patch_size=14,
                spatial_merge_size=2,
                spatial_patch_size=14,
                temporal_patch_size=2,
                initializer_range=0.02,
            )
        self.vision_config = vision_config

        self.mm_projector_config = mm_projector_config

        if isinstance(text_config, dict):
            # Work on a shallow copy so the caller's dictionary is not modified when the
            # default "model_type" is filled in.
            text_config = dict(text_config)
            text_config.setdefault("model_type", "llama")
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            text_config = CONFIG_MAPPING["llama"]()
        self.text_config = text_config

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)