{ "architectures": [ "Typhoon2Audio2AudioForConditionalGeneration" ], "attention_bias": false, "attention_dropout": 0.0, "auto_map": { "AutoConfig": "configuration_typhoon2audio.Typhoon2AudioConfig", "AutoModel": "modeling_typhoon2audio.Typhoon2Audio2AudioForConditionalGeneration" }, "beats": { "model_type": "" }, "ctc_decoder_config": "(4,4096,32,11008)", "ctc_loss_weight": 1.0, "ctc_upsample_factor": 25, "head_dim": 128, "hidden_act": "silu", "hidden_size": 4096, "intermediate_size": 14336, "llama_base_model": "scb10x/typhoon-2-llama31-8b-instruct-beta-v1", "max_position_embeddings": 131072, "mlp_bias": false, "model_type": "typhoon2audio", "num_attention_heads": 32, "num_hidden_layers": 32, "num_key_value_heads": 8, "pretraining_tp": 1, "rms_norm_eps": 1e-05, "rope_scaling": { "factor": 8.0, "high_freq_factor": 4.0, "low_freq_factor": 1.0, "original_max_position_embeddings": 8192, "rope_type": "llama3" }, "rope_theta": 500000.0, "second_per_frame": 0.333333, "second_stride": 0.333333, "speech_decoder_ignore_index": -100, "speech_qformer_layer": 2, "speech_qformer_token_num": 1, "torch_dtype": "float16", "transformers_version": "4.45.0", "unit_vocab_size": 1000, "vocab_size": 128256, "vocoder_config": { "code_hop_size": 320, "dur_prediction_weight": 1.0, "dur_predictor_params": { "encoder_embed_dim": 512, "var_pred_dropout": 0.5, "var_pred_hidden_dim": 512, "var_pred_kernel_size": 3 }, "embedding_dim": 512, "hop_size": 256, "model_in_dim": 512, "n_fft": 1024, "num_embeddings": 1000, "num_freq": 1025, "num_mels": 80, "resblock": 1, "resblock_dilation_sizes": [ [ 1, 3, 5 ], [ 1, 3, 5 ], [ 1, 3, 5 ] ], "resblock_kernel_sizes": [ 3, 7, 11 ], "sampling_rate": 16000, "segment_size": 8960, "upsample_initial_channel": 512, "upsample_kernel_sizes": [ 11, 8, 8, 4, 4 ], "upsample_rates": [ 5, 4, 4, 2, 2 ], "win_size": 1024 }, "vocoder_path": { "filename": "checkpoint.pt", "repo_id": "scb10x/unit-vocoder-gcp-th-v1-00206600" }, "whisper": { "apply_spec_augment": true, "begin_suppress_tokens": [ 220, 50257 ], "bos_token_id": 50257, "d_model": 1280, "decoder_attention_heads": 20, "decoder_ffn_dim": 5120, "decoder_layers": 32, "decoder_start_token_id": 50258, "encoder_attention_heads": 20, "encoder_ffn_dim": 5120, "encoder_layers": 32, "eos_token_id": 50257, "mask_feature_length": 64, "mask_feature_prob": 0.1, "mask_time_prob": 0.1, "max_length": 448, "model_type": "whisper", "num_hidden_layers": 32, "num_mel_bins": 128, "vocab_size": 51866 }, "whisper_extractor_feature_size": 128 }