Spaces:
Runtime error
Runtime error
| # coding=utf-8 | |
| # Copyright 2024 The Emu team, BAAI and The HuggingFace Inc. team. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """ Emu3VisionVQ model configuration """ | |
| from typing import List | |
| from transformers.configuration_utils import PretrainedConfig | |
| from transformers.utils import logging | |
| logger = logging.get_logger(__name__) | |
class Emu3VisionVQConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Emu3VisionVQ`]. It is used to instantiate a
    video MoVQ model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a configuration to the VQ model presented in Emu3 paper.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        codebook_size (`int`, *optional*, defaults to 32768):
            Codebook size of the VQ model.
        embed_dim (`int`, *optional*, defaults to 4):
            Dimension of the quantized vector in codebook.
        z_channels (`int`, *optional*, defaults to 4):
            Dimension of the output channel of encoder and the input channel of decoder.
        double_z (`bool`, *optional*, defaults to `False`):
            Whether to double the output dim of the encoder.
        in_channels (`int`, *optional*, defaults to 3):
            Input channel of encoder.
        out_channels (`int`, *optional*, defaults to 3):
            Output channel of decoder.
        temporal_downsample_factor (`int`, *optional*, defaults to 4):
            Temporal downsample factor.
        ch (`int`, *optional*, defaults to 256):
            Basic channel number of the intermediate blocks.
        ch_mult (`List[int]`, *optional*, defaults to `[1, 2, 2, 4]`):
            Channel scaling factor of the intermediate blocks.
        num_res_blocks (`int`, *optional*, defaults to 2):
            Residual block number in each stage.
        attn_resolutions (`List[int]`, *optional*, defaults to `[3]`):
            Stage indices to apply attention.
        dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability.
        kwargs (*optional*):
            Additional keyword arguments forwarded unchanged to [`PretrainedConfig`].

    Example:

    ```python
    >>> from transformers import Emu3VisionVQConfig, Emu3VisionVQModel

    >>> # Initializing a video VQ model of Emu3 configuration
    >>> configuration = Emu3VisionVQConfig()

    >>> # Initializing a model from the Emu3 VQ model style configuration
    >>> model = Emu3VisionVQModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "Emu3VisionVQ"

    def __init__(
        self,
        codebook_size: int = 32768,
        embed_dim: int = 4,
        z_channels: int = 4,
        double_z: bool = False,
        in_channels: int = 3,
        out_channels: int = 3,
        temporal_downsample_factor: int = 4,
        ch: int = 256,
        ch_mult: List[int] = [1, 2, 2, 4],
        num_res_blocks: int = 2,
        attn_resolutions: List[int] = [3],
        dropout: float = 0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.codebook_size = codebook_size
        self.embed_dim = embed_dim
        self.z_channels = z_channels
        self.double_z = double_z
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.temporal_downsample_factor = temporal_downsample_factor
        self.ch = ch
        # The list defaults in the signature are created once, at function
        # definition time. Store a copy so that mutating one config's list
        # can never leak into the defaults seen by later instances (or into
        # a list object still owned by the caller). An explicit ``None`` is
        # passed through untouched, exactly as before.
        self.ch_mult = list(ch_mult) if ch_mult is not None else None
        self.num_res_blocks = num_res_blocks
        self.attn_resolutions = list(attn_resolutions) if attn_resolutions is not None else None
        self.dropout = dropout