""" VibeVoice_AcousticTokenizer model configuration""" from typing import Dict, List, Optional, Tuple from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging from transformers.models.qwen2.configuration_qwen2 import Qwen2Config logger = logging.get_logger(__name__) class VibeVoiceAcousticTokenizerConfig(PretrainedConfig): model_type = "vibevoice_acoustic_tokenizer" def __init__( self, channels: int = 1, corpus_normalize: float = 0.0, causal: bool = True, vae_dim: int = 64, fix_std: float = 0.5, std_dist_type: str = 'gaussian', # common mixer_layer: str = 'depthwise_conv', conv_norm: str = 'none', pad_mode: str = 'constant', disable_last_norm: bool = True, layernorm: str = 'RMSNorm', layernorm_eps: float = 1e-5, layernorm_elementwise_affine: bool = True, conv_bias: bool = True, layer_scale_init_value: float = 1e-6, weight_init_value: float = 1e-2, # encoder specific encoder_n_filters: int = 32, encoder_ratios: Optional[List[int]] = [8,5,5,4,2,2], encoder_depths: str = "3-3-3-3-3-3-8", # decoder specific decoder_n_filters: int = 32, decoder_ratios: Optional[List[int]] = None, # if None, same as encoder decoder_depths: Optional[str] = None, **kwargs ): super().__init__(**kwargs) self.channels = channels self.corpus_normalize = corpus_normalize self.causal = causal self.vae_dim = vae_dim self.fix_std = fix_std self.std_dist_type = std_dist_type # common parameters self.conv_norm = conv_norm self.pad_mode = pad_mode self.layernorm_eps = layernorm_eps self.disable_last_norm = disable_last_norm self.layernorm = layernorm self.layernorm_elementwise_affine = layernorm_elementwise_affine self.conv_bias = conv_bias self.layer_scale_init_value = layer_scale_init_value self.weight_init_value = weight_init_value self.mixer_layer = mixer_layer # encoder specific parameters self.encoder_n_filters = encoder_n_filters self.encoder_ratios = encoder_ratios self.encoder_depths = encoder_depths # decoder specific parameters self.decoder_ratios = decoder_ratios if decoder_ratios is not None else encoder_ratios self.decoder_n_filters = decoder_n_filters self.decoder_depths = decoder_depths class VibeVoiceSemanticTokenizerConfig(PretrainedConfig): model_type = "vibevoice_semantic_tokenizer" def __init__( self, channels: int = 1, corpus_normalize: float = 0.0, causal: bool = True, vae_dim: int = 64, fix_std: float = 0, std_dist_type: str = 'none', # common mixer_layer: str = 'depthwise_conv', conv_norm: str = 'none', pad_mode: str = 'constant', disable_last_norm: bool = True, layernorm: str = 'RMSNorm', layernorm_eps: float = 1e-5, layernorm_elementwise_affine: bool = True, conv_bias: bool = True, layer_scale_init_value: float = 1e-6, weight_init_value: float = 1e-2, # encoder specific encoder_n_filters: int = 32, encoder_ratios: Optional[List[int]] = [8,5,5,4,2,2], encoder_depths: str = "3-3-3-3-3-3-8", **kwargs ): super().__init__(**kwargs) self.channels = channels self.corpus_normalize = corpus_normalize self.causal = causal self.vae_dim = vae_dim self.fix_std = fix_std self.std_dist_type = std_dist_type # common parameters self.conv_norm = conv_norm self.pad_mode = pad_mode self.layernorm_eps = layernorm_eps self.disable_last_norm = disable_last_norm self.layernorm = layernorm self.layernorm_elementwise_affine = layernorm_elementwise_affine self.conv_bias = conv_bias self.layer_scale_init_value = layer_scale_init_value self.weight_init_value = weight_init_value self.mixer_layer = mixer_layer # encoder specific parameters self.encoder_n_filters = encoder_n_filters self.encoder_ratios = encoder_ratios self.encoder_depths = encoder_depths class VibeVoiceDiffusionHeadConfig(PretrainedConfig): model_type = "vibevoice_diffusion_head" def __init__( self, hidden_size=768, head_layers=4, head_ffn_ratio=3.0, rms_norm_eps=1e-5, latent_size=64, speech_vae_dim=None, prediction_type="v_prediction", diffusion_type="ddpm", ddpm_num_steps=1000, ddpm_num_inference_steps=20, ddpm_beta_schedule="cosine", ddpm_batch_mul=4, **kwargs ): self.hidden_size = hidden_size self.head_layers = head_layers self.head_ffn_ratio = head_ffn_ratio self.rms_norm_eps = rms_norm_eps self.latent_size = latent_size self.speech_vae_dim = speech_vae_dim self.prediction_type = prediction_type self.diffusion_type = diffusion_type self.ddpm_num_steps = ddpm_num_steps self.ddpm_num_inference_steps = ddpm_num_inference_steps self.ddpm_beta_schedule = ddpm_beta_schedule self.ddpm_batch_mul = ddpm_batch_mul super().__init__(**kwargs) class VibeVoiceConfig(PretrainedConfig): model_type = "vibevoice" is_composition = True sub_configs = { "acoustic_tokenizer_config": VibeVoiceAcousticTokenizerConfig, "semantic_tokenizer_config": VibeVoiceSemanticTokenizerConfig, "decoder_config": Qwen2Config, "diffusion_head_config": VibeVoiceDiffusionHeadConfig, } # keys_to_ignore_at_inference = ["past_key_values"] # Default tensor parallel plan for base model `Qwen2` base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", "layers.*.self_attn.v_proj": "colwise", "layers.*.self_attn.o_proj": "rowwise", "layers.*.mlp.gate_proj": "colwise", "layers.*.mlp.up_proj": "colwise", "layers.*.mlp.down_proj": "rowwise", } def __init__( self, acoustic_tokenizer_config=None, semantic_tokenizer_config=None, decoder_config=None, diffusion_head_config=None, **kwargs ): # kwargs["_attn_implementation"] = "flash_attention_2" kwargs["_attn_implementation_autoset"] = False if acoustic_tokenizer_config is None: self.acoustic_tokenizer_config = self.sub_configs["acoustic_tokenizer_config"]() elif isinstance(acoustic_tokenizer_config, dict): acoustic_tokenizer_config["model_type"] = "vibevoice_acoustic_tokenizer" self.acoustic_tokenizer_config = self.sub_configs["acoustic_tokenizer_config"](**acoustic_tokenizer_config) elif isinstance(acoustic_tokenizer_config, VibeVoiceAcousticTokenizerConfig): # If an instance of the config class is provided self.acoustic_tokenizer_config = acoustic_tokenizer_config if semantic_tokenizer_config is None: self.semantic_tokenizer_config = self.sub_configs["semantic_tokenizer_config"]() elif isinstance(semantic_tokenizer_config, dict): semantic_tokenizer_config["model_type"] = "vibevoice_semantic_tokenizer" self.semantic_tokenizer_config = self.sub_configs["semantic_tokenizer_config"](**semantic_tokenizer_config) elif isinstance(semantic_tokenizer_config, VibeVoiceSemanticTokenizerConfig): # If an instance of the config class is provided self.semantic_tokenizer_config = semantic_tokenizer_config if decoder_config is None: self.decoder_config = self.sub_configs["decoder_config"]() elif isinstance(decoder_config, dict): # If a dictionary is provided, instantiate the config class with it # self.decoder_config = self.sub_configs["decoder_config"](**decoder_config) if decoder_config.get("model_type", '') == "qwen2": self.decoder_config = Qwen2Config(**decoder_config) else: raise ValueError(f"Unsupported decoder model type: {decoder_config.get('model_type', '')}") elif isinstance(decoder_config, (Qwen2Config,)): # If an instance of the config class is provided self.decoder_config = decoder_config if diffusion_head_config is None: self.diffusion_head_config = self.sub_configs["diffusion_head_config"]() elif isinstance(diffusion_head_config, dict): diffusion_head_config["model_type"] = "vibevoice_diffusion_head" self.diffusion_head_config = self.sub_configs["diffusion_head_config"](**diffusion_head_config) elif isinstance(diffusion_head_config, VibeVoiceDiffusionHeadConfig): # If an instance of the config class is provided self.diffusion_head_config = diffusion_head_config # other parameters self.acoustic_vae_dim = getattr(self.acoustic_tokenizer_config, 'vae_dim', 64) self.semantic_vae_dim = getattr(self.semantic_tokenizer_config, 'vae_dim', 128) super().__init__(**kwargs) __all__ = [ "VibeVoiceAcousticTokenizerConfig", "VibeVoiceSemanticTokenizerConfig", "VibeVoiceDiffusionHeadConfig", "VibeVoiceConfig" ]