File size: 6,522 Bytes
679b242 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
from transformers import PretrainedConfig, WhisperConfig
class BEATsConfig(PretrainedConfig):
def __init__(self, cfg=None):
# update the default values to BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt
self.input_patch_size: int = 16 # path size of patch embedding
self.embed_dim: int = 512 # patch embedding dimension
self.conv_bias: bool = False # include bias in conv encoder
self.encoder_layers: int = 12 # num encoder layers in the transformer
self.encoder_embed_dim: int = 768 # encoder embedding dimension
self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN
self.encoder_attention_heads: int = 12 # num encoder attention heads
self.activation_fn: str = "gelu" # activation function to use
self.layer_wise_gradient_decay_ratio: float = 0.6 # ratio for layer-wise gradient decay
self.layer_norm_first: bool = False # apply layernorm first in the transformer
self.deep_norm: bool = True # apply deep_norm first in the transformer
# dropouts
self.dropout: float = 0.0 # dropout probability for the transformer
self.attention_dropout: float = 0.0 # dropout probability for attention weights
self.activation_dropout: float = 0.0 # dropout probability after activation in FFN
self.encoder_layerdrop: float = 0.05 # probability of dropping a tarnsformer layer
self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr)
# positional embeddings
self.conv_pos: int = 128 # number of filters for convolutional positional embeddings
self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding
# relative position embedding
self.relative_position_embedding: bool = True # apply relative position embedding
self.num_buckets: int = 320 # number of buckets for relative position embedding
self.max_distance: int = 800 # maximum distance for relative position embedding
self.gru_rel_pos: bool = True # apply gated relative position embedding
# label predictor
self.finetuned_model: bool = True # whether the model is a fine-tuned model.
self.predictor_dropout: float = 0.0 # dropout probability for the predictor
self.predictor_class: int = 527 # target class number for the predictor
if cfg is not None:
self.update(cfg)
def update(self, cfg: dict):
self.__dict__.update(cfg)
class Typhoon2AudioConfig(PretrainedConfig):
model_type = "typhoon2audio"
def __init__(self, **kwargs):
# LLM -- Llama3
self.llama_base_model = "scb10x/typhoon-2-llama31-8b-instruct-beta-v1"
# Whisper
self.whisper_extractor_feature_size=128
self.whisper = WhisperConfig(
activation_dropout=0.0,
activation_function="gelu",
apply_spec_augment=True,
attention_dropout=0.0,
begin_suppress_tokens=[220, 50257],
bos_token_id=50257,
d_model=1280,
decoder_attention_heads=20,
decoder_ffn_dim=5120,
decoder_layerdrop=0.0,
decoder_layers=32,
decoder_start_token_id=50258,
dropout=0.0,
encoder_attention_heads=20,
encoder_ffn_dim=5120,
encoder_layerdrop=0.0,
encoder_layers=32,
eos_token_id=50257,
init_std=0.02,
mask_feature_length=64,
mask_feature_min_masks=0,
mask_feature_prob=0.1,
mask_time_length=10,
mask_time_min_masks=2,
mask_time_prob=0.1,
max_length=448,
max_source_positions=1500,
max_target_positions=448,
median_filter_width=7,
num_hidden_layers=32,
num_mel_bins=128,
pad_token_id=50256,
scale_embedding=False,
use_weighted_layer_sum=False,
vocab_size=51866,
)
# BEATs
self.beats = BEATsConfig()
# Speech QFormer
self.speech_qformer_token_num=1
self.speech_qformer_layer=2
self.second_per_frame=0.333333
self.second_stride=0.333333
# SpeechDecoder CTC
self.pretraining_tp = 1
self.ctc_decoder_config='(4,4096,32,11008)'
self.ctc_upsample_factor=25
self.ctc_loss_weight=1.0
self.unit_vocab_size=1000
self.speech_decoder_ignore_index=-100
self.attention_bias=False
self.attention_dropout=0.0
self.bos_token_id=128000
self.eos_token_id=128009
self.head_dim=128
self.hidden_act="silu"
self.hidden_size=4096
self.intermediate_size=14336
self.max_position_embeddings=131072
self.mlp_bias=False
self.num_attention_heads=32
self.num_hidden_layers=32
self.num_key_value_heads=8
self.rms_norm_eps=1e-05
self.rope_scaling={
"factor": 8.0,
"high_freq_factor": 4.0,
"low_freq_factor": 1.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3"
}
self.rope_theta=500000.0
self.vocab_size=128256
# Unit Vocoder (HiFiGAN)
self.vocoder_path = {
'repo_id': 'scb10x/unit-vocoder-gcp-th-v1-00206600',
'filename': 'checkpoint.pt'
}
self.vocoder_config = {
'resblock': 1,
'upsample_rates': [5, 4, 4, 2, 2],
'upsample_kernel_sizes': [11, 8, 8, 4, 4],
'upsample_initial_channel': 512,
'resblock_kernel_sizes': [3, 7, 11],
'resblock_dilation_sizes': [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
'num_embeddings': 1000,
'embedding_dim': 512,
'model_in_dim': 512,
'segment_size': 8960,
'code_hop_size': 320,
'num_mels': 80,
'num_freq': 1025,
'n_fft': 1024,
'hop_size': 256,
'win_size': 1024,
'sampling_rate': 16000,
'dur_prediction_weight': 1.0,
'dur_predictor_params': {
'encoder_embed_dim': 512,
'var_pred_hidden_dim': 512,
'var_pred_kernel_size': 3,
'var_pred_dropout': 0.5
}
}
super().__init__(**kwargs)
|