Spaces:
Running
on
Zero
Running
on
Zero
""" VibeVoice_AcousticTokenizer model configuration""" | |
from typing import Dict, List, Optional, Tuple | |
from transformers.configuration_utils import PretrainedConfig | |
from transformers.utils import logging | |
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config | |
logger = logging.get_logger(__name__) | |
class VibeVoiceAcousticTokenizerConfig(PretrainedConfig):
    """Configuration for the VibeVoice acoustic tokenizer.

    Every constructor argument is stored as an attribute of the same name.
    Any extra keyword arguments are forwarded to ``PretrainedConfig``.

    Args:
        channels, corpus_normalize, causal, vae_dim, fix_std, std_dist_type:
            General tokenizer settings, stored unchanged.
        mixer_layer, conv_norm, pad_mode, disable_last_norm, layernorm,
        layernorm_eps, layernorm_elementwise_affine, conv_bias,
        layer_scale_init_value, weight_init_value:
            Settings labelled "common" in the original source, stored
            unchanged.
        encoder_n_filters: Stored unchanged.
        encoder_ratios: List of ints (or ``None``). A copy is stored so the
            instance never shares a list with the signature default or the
            caller.
        encoder_depths: String such as ``"3-3-3-3-3-3-8"``
            (presumably dash-separated per-stage depths; confirm against
            the model code).
        decoder_n_filters: Stored unchanged.
        decoder_ratios: List of ints; when ``None`` a copy of
            ``encoder_ratios`` is used instead.
        decoder_depths: Optional string, stored unchanged (may stay ``None``).
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "vibevoice_acoustic_tokenizer"

    def __init__(
        self,
        channels: int = 1,
        corpus_normalize: float = 0.0,
        causal: bool = True,
        vae_dim: int = 64,
        fix_std: float = 0.5,
        std_dist_type: str = 'gaussian',
        # common
        mixer_layer: str = 'depthwise_conv',
        conv_norm: str = 'none',
        pad_mode: str = 'constant',
        disable_last_norm: bool = True,
        layernorm: str = 'RMSNorm',
        layernorm_eps: float = 1e-5,
        layernorm_elementwise_affine: bool = True,
        conv_bias: bool = True,
        layer_scale_init_value: float = 1e-6,
        weight_init_value: float = 1e-2,
        # encoder specific
        encoder_n_filters: int = 32,
        encoder_ratios: Optional[List[int]] = [8, 5, 5, 4, 2, 2],
        encoder_depths: str = "3-3-3-3-3-3-8",
        # decoder specific
        decoder_n_filters: int = 32,
        decoder_ratios: Optional[List[int]] = None,  # if None, same as encoder
        decoder_depths: Optional[str] = None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.channels = channels
        self.corpus_normalize = corpus_normalize
        self.causal = causal
        self.vae_dim = vae_dim
        self.fix_std = fix_std
        self.std_dist_type = std_dist_type

        # common parameters
        self.conv_norm = conv_norm
        self.pad_mode = pad_mode
        self.layernorm_eps = layernorm_eps
        self.disable_last_norm = disable_last_norm
        self.layernorm = layernorm
        self.layernorm_elementwise_affine = layernorm_elementwise_affine
        self.conv_bias = conv_bias
        self.layer_scale_init_value = layer_scale_init_value
        self.weight_init_value = weight_init_value
        self.mixer_layer = mixer_layer

        # encoder specific parameters
        self.encoder_n_filters = encoder_n_filters
        # Copy the list: the signature default is a single shared object, so
        # storing it by reference would let one instance's in-place edit leak
        # into every other default-constructed instance.
        self.encoder_ratios = list(encoder_ratios) if encoder_ratios is not None else None
        self.encoder_depths = encoder_depths

        # decoder specific parameters
        # Fall back to the encoder ratios, but as an independent copy so the
        # encoder and decoder attributes never alias the same list.
        ratios_source = decoder_ratios if decoder_ratios is not None else encoder_ratios
        self.decoder_ratios = list(ratios_source) if ratios_source is not None else None
        self.decoder_n_filters = decoder_n_filters
        self.decoder_depths = decoder_depths
class VibeVoiceSemanticTokenizerConfig(PretrainedConfig):
    """Configuration for the VibeVoice semantic tokenizer.

    Every constructor argument is stored as an attribute of the same name.
    Any extra keyword arguments are forwarded to ``PretrainedConfig``.
    Unlike the acoustic tokenizer config, this class takes no decoder
    arguments.

    Args:
        channels, corpus_normalize, causal, vae_dim, fix_std, std_dist_type:
            General tokenizer settings, stored unchanged.
        mixer_layer, conv_norm, pad_mode, disable_last_norm, layernorm,
        layernorm_eps, layernorm_elementwise_affine, conv_bias,
        layer_scale_init_value, weight_init_value:
            Settings labelled "common" in the original source, stored
            unchanged.
        encoder_n_filters: Stored unchanged.
        encoder_ratios: List of ints (or ``None``). A copy is stored so the
            instance never shares a list with the signature default or the
            caller.
        encoder_depths: String such as ``"3-3-3-3-3-3-8"``
            (presumably dash-separated per-stage depths; confirm against
            the model code).
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "vibevoice_semantic_tokenizer"

    def __init__(
        self,
        channels: int = 1,
        corpus_normalize: float = 0.0,
        causal: bool = True,
        vae_dim: int = 64,
        fix_std: float = 0,
        std_dist_type: str = 'none',
        # common
        mixer_layer: str = 'depthwise_conv',
        conv_norm: str = 'none',
        pad_mode: str = 'constant',
        disable_last_norm: bool = True,
        layernorm: str = 'RMSNorm',
        layernorm_eps: float = 1e-5,
        layernorm_elementwise_affine: bool = True,
        conv_bias: bool = True,
        layer_scale_init_value: float = 1e-6,
        weight_init_value: float = 1e-2,
        # encoder specific
        encoder_n_filters: int = 32,
        encoder_ratios: Optional[List[int]] = [8, 5, 5, 4, 2, 2],
        encoder_depths: str = "3-3-3-3-3-3-8",
        **kwargs
    ):
        super().__init__(**kwargs)
        self.channels = channels
        self.corpus_normalize = corpus_normalize
        self.causal = causal
        self.vae_dim = vae_dim
        self.fix_std = fix_std
        self.std_dist_type = std_dist_type

        # common parameters
        self.conv_norm = conv_norm
        self.pad_mode = pad_mode
        self.layernorm_eps = layernorm_eps
        self.disable_last_norm = disable_last_norm
        self.layernorm = layernorm
        self.layernorm_elementwise_affine = layernorm_elementwise_affine
        self.conv_bias = conv_bias
        self.layer_scale_init_value = layer_scale_init_value
        self.weight_init_value = weight_init_value
        self.mixer_layer = mixer_layer

        # encoder specific parameters
        self.encoder_n_filters = encoder_n_filters
        # Copy the list: the signature default is a single shared object, so
        # storing it by reference would let one instance's in-place edit leak
        # into every other default-constructed instance.
        self.encoder_ratios = list(encoder_ratios) if encoder_ratios is not None else None
        self.encoder_depths = encoder_depths
class VibeVoiceDiffusionHeadConfig(PretrainedConfig):
    """Configuration for the VibeVoice diffusion head.

    Each constructor argument is stored unchanged as an attribute of the
    same name; afterwards the remaining keyword arguments are handed to
    ``PretrainedConfig.__init__``.
    """

    model_type = "vibevoice_diffusion_head"

    def __init__(
        self,
        hidden_size: int = 768,
        head_layers: int = 4,
        head_ffn_ratio: float = 3.0,
        rms_norm_eps: float = 1e-5,
        latent_size: int = 64,
        speech_vae_dim: Optional[int] = None,
        prediction_type: str = "v_prediction",
        diffusion_type: str = "ddpm",
        ddpm_num_steps: int = 1000,
        ddpm_num_inference_steps: int = 20,
        ddpm_beta_schedule: str = "cosine",
        ddpm_batch_mul: int = 4,
        **kwargs
    ):
        # Head network settings.
        self.hidden_size = hidden_size
        self.head_layers = head_layers
        self.head_ffn_ratio = head_ffn_ratio
        self.rms_norm_eps = rms_norm_eps
        self.latent_size = latent_size
        self.speech_vae_dim = speech_vae_dim

        # Diffusion process settings.
        self.prediction_type = prediction_type
        self.diffusion_type = diffusion_type
        self.ddpm_num_steps = ddpm_num_steps
        self.ddpm_num_inference_steps = ddpm_num_inference_steps
        self.ddpm_beta_schedule = ddpm_beta_schedule
        self.ddpm_batch_mul = ddpm_batch_mul

        # The base initializer runs last, after the fields above are set.
        super().__init__(**kwargs)
class VibeVoiceConfig(PretrainedConfig):
    """Composite configuration bundling the VibeVoice sub-configurations.

    Holds four sub-configs: ``acoustic_tokenizer_config``,
    ``semantic_tokenizer_config``, ``decoder_config`` (a ``Qwen2Config``)
    and ``diffusion_head_config``. It also derives ``acoustic_vae_dim`` and
    ``semantic_vae_dim`` from the two tokenizer configs.

    Args:
        acoustic_tokenizer_config: ``None`` (use defaults), a dict of
            constructor arguments, or a ``VibeVoiceAcousticTokenizerConfig``.
        semantic_tokenizer_config: ``None``, a dict, or a
            ``VibeVoiceSemanticTokenizerConfig``.
        decoder_config: ``None``, a dict whose ``"model_type"`` is
            ``"qwen2"``, or a ``Qwen2Config``.
        diffusion_head_config: ``None``, a dict, or a
            ``VibeVoiceDiffusionHeadConfig``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: If ``decoder_config`` is a dict whose ``"model_type"``
            is not ``"qwen2"``.
        TypeError: If any sub-config argument is neither ``None``, a dict,
            nor an instance of the expected config class.
    """

    model_type = "vibevoice"
    is_composition = True
    sub_configs = {
        "acoustic_tokenizer_config": VibeVoiceAcousticTokenizerConfig,
        "semantic_tokenizer_config": VibeVoiceSemanticTokenizerConfig,
        "decoder_config": Qwen2Config,
        "diffusion_head_config": VibeVoiceDiffusionHeadConfig,
    }
    # keys_to_ignore_at_inference = ["past_key_values"]
    # Default tensor parallel plan for base model `Qwen2`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }

    @staticmethod
    def _build_sub_config(value, factory, accepted_type, model_type, arg_name):
        """Turn ``value`` (None, dict, or config instance) into a sub-config object."""
        if value is None:
            return factory()
        if isinstance(value, dict):
            # Build a merged copy instead of writing "model_type" into the
            # caller's dict, so the argument is left untouched.
            return factory(**{**value, "model_type": model_type})
        if isinstance(value, accepted_type):
            return value
        raise TypeError(
            f"{arg_name} must be None, a dict, or a {accepted_type.__name__} "
            f"instance, got {type(value).__name__}"
        )

    def __init__(
        self,
        acoustic_tokenizer_config=None,
        semantic_tokenizer_config=None,
        decoder_config=None,
        diffusion_head_config=None,
        **kwargs
    ):
        # kwargs["_attn_implementation"] = "flash_attention_2"
        # NOTE(review): presumably stops transformers from auto-selecting an
        # attention implementation for this composite config; confirm.
        kwargs["_attn_implementation_autoset"] = False

        self.acoustic_tokenizer_config = self._build_sub_config(
            acoustic_tokenizer_config,
            self.sub_configs["acoustic_tokenizer_config"],
            VibeVoiceAcousticTokenizerConfig,
            "vibevoice_acoustic_tokenizer",
            "acoustic_tokenizer_config",
        )

        self.semantic_tokenizer_config = self._build_sub_config(
            semantic_tokenizer_config,
            self.sub_configs["semantic_tokenizer_config"],
            VibeVoiceSemanticTokenizerConfig,
            "vibevoice_semantic_tokenizer",
            "semantic_tokenizer_config",
        )

        # The decoder is handled separately: a dict must already declare
        # model_type "qwen2" rather than having it injected.
        if decoder_config is None:
            self.decoder_config = self.sub_configs["decoder_config"]()
        elif isinstance(decoder_config, dict):
            decoder_type = decoder_config.get("model_type", '')
            if decoder_type != "qwen2":
                raise ValueError(f"Unsupported decoder model type: {decoder_type}")
            self.decoder_config = Qwen2Config(**decoder_config)
        elif isinstance(decoder_config, Qwen2Config):
            self.decoder_config = decoder_config
        else:
            raise TypeError(
                "decoder_config must be None, a dict, or a Qwen2Config "
                f"instance, got {type(decoder_config).__name__}"
            )

        self.diffusion_head_config = self._build_sub_config(
            diffusion_head_config,
            self.sub_configs["diffusion_head_config"],
            VibeVoiceDiffusionHeadConfig,
            "vibevoice_diffusion_head",
            "diffusion_head_config",
        )

        # other parameters
        # The fallbacks (64 / 128) only apply if a tokenizer config lacks a
        # `vae_dim` attribute.
        self.acoustic_vae_dim = getattr(self.acoustic_tokenizer_config, 'vae_dim', 64)
        self.semantic_vae_dim = getattr(self.semantic_tokenizer_config, 'vae_dim', 128)

        super().__init__(**kwargs)
# Public names exported by this module.
__all__ = [
    "VibeVoiceAcousticTokenizerConfig",
    "VibeVoiceSemanticTokenizerConfig",
    "VibeVoiceDiffusionHeadConfig",
    "VibeVoiceConfig",
]