| { | |
| "config": { | |
| "audio_history_cfg": { | |
| "attn_qk_norm": true, | |
| "attn_scaled_cosine": false, | |
| "clip_attn_logit": null, | |
| "dropout_broadcast_dims": [ | |
| -2 | |
| ], | |
| "dropout_rate": 0.0, | |
| "droppath_rate": 0.0, | |
| "dtype": "float32", | |
| "emb_dim": 768, | |
| "float32_attention_logits": true, | |
| "head_dim": 64, | |
| "latents_size": 16, | |
| "layer_drop": 0.0, | |
| "max_frames": 8, | |
| "mlp_activations": [ | |
| "gelu" | |
| ], | |
| "mlp_dim": 2048, | |
| "num_heads": 12, | |
| "num_layers": 2, | |
| "resampler_type": "perceiver", | |
| "xattention_index": [ | |
| 0, | |
| 1 | |
| ], | |
| "xattn_qk_norm": true, | |
| "xattn_scaled_cosine": false | |
| }, | |
| "audio_vit_cfg": { | |
| "default_input_size": [ | |
| 256, | |
| 128 | |
| ], | |
| "dropout_broadcast_dims": [], | |
| "dropout_rate": 0.0, | |
| "dtype": "float32", | |
| "emb_dim": 768, | |
| "float32_attention_logits": true, | |
| "head_dim": 64, | |
| "mlp_activations": [ | |
| "gelu" | |
| ], | |
| "mlp_dim": 3072, | |
| "num_heads": 12, | |
| "num_layers": 11, | |
| "patch_size": 16, | |
| "pos_patch_size": 16, | |
| "transpose_input": true, | |
| "vit_embed": true | |
| }, | |
| "audio_vqgan": { | |
| "act_fn": "relu", | |
| "attention_dropout_rate": 0.0, | |
| "checkpoint_path": "", | |
| "decoder_head_dim": 64, | |
| "decoder_hidden_size": 512, | |
| "decoder_mlp_dim": 2048, | |
| "decoder_num_heads": 8, | |
| "decoder_num_layers": 8, | |
| "default_input_size": [ | |
| 128, | |
| 256 | |
| ], | |
| "dropout_rate": 0.0, | |
| "droppath_rate": 0.0, | |
| "dtype": "float32", | |
| "encoder_head_dim": 64, | |
| "encoder_hidden_size": 512, | |
| "encoder_mlp_dim": 2048, | |
| "encoder_num_heads": 8, | |
| "encoder_num_layers": 8, | |
| "output_channel": 1, | |
| "patch_size": [ | |
| 8, | |
| 8 | |
| ], | |
| "proj_dim": 32, | |
| "use_bias": false, | |
| "use_decoder": true, | |
| "vocab_size": 8192 | |
| }, | |
| "freeze_vit": true, | |
| "image_history_cfg": { | |
| "attn_qk_norm": true, | |
| "attn_scaled_cosine": false, | |
| "clip_attn_logit": null, | |
| "dropout_broadcast_dims": [ | |
| -2 | |
| ], | |
| "dropout_rate": 0.0, | |
| "droppath_rate": 0.0, | |
| "dtype": "float32", | |
| "emb_dim": 768, | |
| "float32_attention_logits": true, | |
| "head_dim": 64, | |
| "latents_size": 32, | |
| "layer_drop": 0.0, | |
| "max_frames": 8, | |
| "mlp_activations": [ | |
| "gelu" | |
| ], | |
| "mlp_dim": 2048, | |
| "num_heads": 12, | |
| "num_layers": 2, | |
| "resampler_type": "perceiver", | |
| "xattention_index": [ | |
| 0, | |
| 1 | |
| ], | |
| "xattn_qk_norm": true, | |
| "xattn_scaled_cosine": false | |
| }, | |
| "image_vit_cfg": { | |
| "default_input_size": [ | |
| 256, | |
| 256 | |
| ], | |
| "dropout_broadcast_dims": [], | |
| "dropout_rate": 0.0, | |
| "dtype": "float32", | |
| "emb_dim": 768, | |
| "float32_attention_logits": true, | |
| "head_dim": 64, | |
| "mlp_activations": [ | |
| "gelu" | |
| ], | |
| "mlp_dim": 3072, | |
| "num_heads": 12, | |
| "num_layers": 11, | |
| "num_pos": 197, | |
| "patch_size": 16, | |
| "pos_patch_size": 16 | |
| }, | |
| "image_vqgan": { | |
| "attn_resolutions": [ | |
| 32 | |
| ], | |
| "ch": 128, | |
| "ch_mult": [ | |
| 1, | |
| 2, | |
| 2, | |
| 4 | |
| ], | |
| "checkpoint_path": "", | |
| "default_input_size": [ | |
| 256, | |
| 256 | |
| ], | |
| "double_z": false, | |
| "dropout": 0, | |
| "dtype": "float32", | |
| "embed_dim": 4, | |
| "in_channels": 3, | |
| "n_embed": 16384, | |
| "num_res_blocks": 2, | |
| "out_ch": 3, | |
| "patch_size": [ | |
| 8, | |
| 8 | |
| ], | |
| "resolution": 256, | |
| "z_channels": 4 | |
| }, | |
| "input_modalities": [ | |
| "text", | |
| "image", | |
| "image_history", | |
| "audio", | |
| "audio_history" | |
| ], | |
| "sequence_length": { | |
| "audio_history_input_samples": 128, | |
| "audio_input_samples": 128, | |
| "image_history_input_samples": 256, | |
| "image_input_samples": 576, | |
| "is_training": true, | |
| "num_frames": 4 | |
| }, | |
| "t5_config": { | |
| "audio_history_pos_emb": "llama_rope", | |
| "audio_patch_size": 16, | |
| "audio_pos_emb": "llama_rope", | |
| "audio_vit_patch_size": 16, | |
| "audio_vocab_size": 8320, | |
| "dalle_attn_mask": true, | |
| "decoder_max_audio_length": 512, | |
| "decoder_max_image_length": 1024, | |
| "decoder_max_text_length": 512, | |
| "decoder_xattention_internval": 1, | |
| "default_audio_history_vit_size": [ | |
| 256, | |
| 128 | |
| ], | |
| "default_audio_size": [ | |
| 256, | |
| 128 | |
| ], | |
| "default_audio_vit_size": [ | |
| 256, | |
| 128 | |
| ], | |
| "default_image_history_vit_size": [ | |
| 256, | |
| 256 | |
| ], | |
| "default_image_size": [ | |
| 256, | |
| 256 | |
| ], | |
| "default_image_vit_size": [ | |
| 384, | |
| 384 | |
| ], | |
| "dropout_broadcast_dims": [ | |
| -2 | |
| ], | |
| "dropout_rate": 0.0, | |
| "dtype": "float32", | |
| "dynamic_unk_mask": true, | |
| "emb_dim": 1024, | |
| "encoder_max_audio_length": 128, | |
| "encoder_max_image_length": 576, | |
| "encoder_max_text_length": 512, | |
| "float32_attention_logits": true, | |
| "head_dim": 64, | |
| "image_history_pos_emb": "llama_rope", | |
| "image_patch_size": 16, | |
| "image_pos_emb": "llama_rope", | |
| "image_tokenizer_type": "vqgan", | |
| "image_vit_patch_size": 16, | |
| "image_vocab_size": 16512, | |
| "logits_via_embedding": true, | |
| "mlp_activations": [ | |
| "silu", | |
| "linear" | |
| ], | |
| "mlp_dim": 2816, | |
| "num_decoder_layers": 24, | |
| "num_encoder_layers": 24, | |
| "num_heads": 16, | |
| "qk_norm": true, | |
| "text_pos_emb": "llama_rope", | |
| "vocab_size": 33280 | |
| }, | |
| "target_modalities": [ | |
| "text", | |
| "image", | |
| "audio" | |
| ], | |
| "use_audio_history_vit": true, | |
| "use_audio_vit": true, | |
| "use_image_history_vit": true, | |
| "use_image_vit": true | |
| }, | |
| "sequence_length": { | |
| "audio_history_input_samples": 128, | |
| "audio_input_samples": 128, | |
| "image_history_input_samples": 256, | |
| "image_input_samples": 576, | |
| "is_training": true, | |
| "num_frames": 4 | |
| } | |
| } | |