MaxMilan1 committed on
Commit
0f211e9
·
1 Parent(s): c0bdbf4

add configs?

Browse files
configs/ae/video.yaml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ target: sgm.models.autoencoder.AutoencodingEngine
2
+ params:
3
+ loss_config:
4
+ target: torch.nn.Identity
5
+ regularizer_config:
6
+ target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
7
+ encoder_config:
8
+ target: sgm.modules.diffusionmodules.model.Encoder
9
+ params:
10
+ attn_type: vanilla
11
+ double_z: True
12
+ z_channels: 4
13
+ resolution: 256
14
+ in_channels: 3
15
+ out_ch: 3
16
+ ch: 128
17
+ ch_mult: [1, 2, 4, 4]
18
+ num_res_blocks: 2
19
+ attn_resolutions: []
20
+ dropout: 0.0
21
+ decoder_config:
22
+ target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
23
+ params:
24
+ attn_type: vanilla
25
+ double_z: True
26
+ z_channels: 4
27
+ resolution: 256
28
+ in_channels: 3
29
+ out_ch: 3
30
+ ch: 128
31
+ ch_mult: [1, 2, 4, 4]
32
+ num_res_blocks: 2
33
+ attn_resolutions: []
34
+ dropout: 0.0
35
+ video_kernel_size: [3, 1, 1]
configs/embedder/clip_image.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
2
+ params:
3
+ n_cond_frames: 1
4
+ n_copies: 1
5
+ open_clip_embedding_config:
6
+ target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
7
+ params:
8
+ freeze: True
configs/example_training/autoencoder/kl-f4/imagenet-attnfree-logvar.yaml ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 4.5e-6
3
+ target: sgm.models.autoencoder.AutoencodingEngine
4
+ params:
5
+ input_key: jpg
6
+ monitor: val/rec_loss
7
+
8
+ loss_config:
9
+ target: sgm.modules.autoencoding.losses.GeneralLPIPSWithDiscriminator
10
+ params:
11
+ perceptual_weight: 0.25
12
+ disc_start: 20001
13
+ disc_weight: 0.5
14
+ learn_logvar: True
15
+
16
+ regularization_weights:
17
+ kl_loss: 1.0
18
+
19
+ regularizer_config:
20
+ target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
21
+
22
+ encoder_config:
23
+ target: sgm.modules.diffusionmodules.model.Encoder
24
+ params:
25
+ attn_type: none
26
+ double_z: True
27
+ z_channels: 4
28
+ resolution: 256
29
+ in_channels: 3
30
+ out_ch: 3
31
+ ch: 128
32
+ ch_mult: [1, 2, 4]
33
+ num_res_blocks: 4
34
+ attn_resolutions: []
35
+ dropout: 0.0
36
+
37
+ decoder_config:
38
+ target: sgm.modules.diffusionmodules.model.Decoder
39
+ params: ${model.params.encoder_config.params}
40
+
41
+ data:
42
+ target: sgm.data.dataset.StableDataModuleFromConfig
43
+ params:
44
+ train:
45
+ datapipeline:
46
+ urls:
47
+ - DATA-PATH
48
+ pipeline_config:
49
+ shardshuffle: 10000
50
+ sample_shuffle: 10000
51
+
52
+ decoders:
53
+ - pil
54
+
55
+ postprocessors:
56
+ - target: sdata.mappers.TorchVisionImageTransforms
57
+ params:
58
+ key: jpg
59
+ transforms:
60
+ - target: torchvision.transforms.Resize
61
+ params:
62
+ size: 256
63
+ interpolation: 3
64
+ - target: torchvision.transforms.ToTensor
65
+ - target: sdata.mappers.Rescaler
66
+ - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare
67
+ params:
68
+ h_key: height
69
+ w_key: width
70
+
71
+ loader:
72
+ batch_size: 8
73
+ num_workers: 4
74
+
75
+
76
+ lightning:
77
+ strategy:
78
+ target: pytorch_lightning.strategies.DDPStrategy
79
+ params:
80
+ find_unused_parameters: True
81
+
82
+ modelcheckpoint:
83
+ params:
84
+ every_n_train_steps: 5000
85
+
86
+ callbacks:
87
+ metrics_over_trainsteps_checkpoint:
88
+ params:
89
+ every_n_train_steps: 50000
90
+
91
+ image_logger:
92
+ target: main.ImageLogger
93
+ params:
94
+ enable_autocast: False
95
+ batch_frequency: 1000
96
+ max_images: 8
97
+ increase_log_steps: True
98
+
99
+ trainer:
100
+ devices: 0,
101
+ limit_val_batches: 50
102
+ benchmark: True
103
+ accumulate_grad_batches: 1
104
+ val_check_interval: 10000
configs/example_training/autoencoder/kl-f4/imagenet-kl_f8_8chn.yaml ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 4.5e-6
3
+ target: sgm.models.autoencoder.AutoencodingEngine
4
+ params:
5
+ input_key: jpg
6
+ monitor: val/loss/rec
7
+ disc_start_iter: 0
8
+
9
+ encoder_config:
10
+ target: sgm.modules.diffusionmodules.model.Encoder
11
+ params:
12
+ attn_type: vanilla-xformers
13
+ double_z: true
14
+ z_channels: 8
15
+ resolution: 256
16
+ in_channels: 3
17
+ out_ch: 3
18
+ ch: 128
19
+ ch_mult: [1, 2, 4, 4]
20
+ num_res_blocks: 2
21
+ attn_resolutions: []
22
+ dropout: 0.0
23
+
24
+ decoder_config:
25
+ target: sgm.modules.diffusionmodules.model.Decoder
26
+ params: ${model.params.encoder_config.params}
27
+
28
+ regularizer_config:
29
+ target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
30
+
31
+ loss_config:
32
+ target: sgm.modules.autoencoding.losses.GeneralLPIPSWithDiscriminator
33
+ params:
34
+ perceptual_weight: 0.25
35
+ disc_start: 20001
36
+ disc_weight: 0.5
37
+ learn_logvar: True
38
+
39
+ regularization_weights:
40
+ kl_loss: 1.0
41
+
42
+ data:
43
+ target: sgm.data.dataset.StableDataModuleFromConfig
44
+ params:
45
+ train:
46
+ datapipeline:
47
+ urls:
48
+ - DATA-PATH
49
+ pipeline_config:
50
+ shardshuffle: 10000
51
+ sample_shuffle: 10000
52
+
53
+ decoders:
54
+ - pil
55
+
56
+ postprocessors:
57
+ - target: sdata.mappers.TorchVisionImageTransforms
58
+ params:
59
+ key: jpg
60
+ transforms:
61
+ - target: torchvision.transforms.Resize
62
+ params:
63
+ size: 256
64
+ interpolation: 3
65
+ - target: torchvision.transforms.ToTensor
66
+ - target: sdata.mappers.Rescaler
67
+ - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare
68
+ params:
69
+ h_key: height
70
+ w_key: width
71
+
72
+ loader:
73
+ batch_size: 8
74
+ num_workers: 4
75
+
76
+
77
+ lightning:
78
+ strategy:
79
+ target: pytorch_lightning.strategies.DDPStrategy
80
+ params:
81
+ find_unused_parameters: True
82
+
83
+ modelcheckpoint:
84
+ params:
85
+ every_n_train_steps: 5000
86
+
87
+ callbacks:
88
+ metrics_over_trainsteps_checkpoint:
89
+ params:
90
+ every_n_train_steps: 50000
91
+
92
+ image_logger:
93
+ target: main.ImageLogger
94
+ params:
95
+ enable_autocast: False
96
+ batch_frequency: 1000
97
+ max_images: 8
98
+ increase_log_steps: True
99
+
100
+ trainer:
101
+ devices: 0,
102
+ limit_val_batches: 50
103
+ benchmark: True
104
+ accumulate_grad_batches: 1
105
+ val_check_interval: 10000
configs/example_training/imagenet-f8_cond.yaml ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 1.0e-4
3
+ target: sgm.models.diffusion.DiffusionEngine
4
+ params:
5
+ scale_factor: 0.13025
6
+ disable_first_stage_autocast: True
7
+ log_keys:
8
+ - cls
9
+
10
+ scheduler_config:
11
+ target: sgm.lr_scheduler.LambdaLinearScheduler
12
+ params:
13
+ warm_up_steps: [10000]
14
+ cycle_lengths: [10000000000000]
15
+ f_start: [1.e-6]
16
+ f_max: [1.]
17
+ f_min: [1.]
18
+
19
+ denoiser_config:
20
+ target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
21
+ params:
22
+ num_idx: 1000
23
+
24
+ scaling_config:
25
+ target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
26
+ discretization_config:
27
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
28
+
29
+ network_config:
30
+ target: sgm.modules.diffusionmodules.openaimodel.UNetModel
31
+ params:
32
+ use_checkpoint: True
33
+ in_channels: 4
34
+ out_channels: 4
35
+ model_channels: 256
36
+ attention_resolutions: [1, 2, 4]
37
+ num_res_blocks: 2
38
+ channel_mult: [1, 2, 4]
39
+ num_head_channels: 64
40
+ num_classes: sequential
41
+ adm_in_channels: 1024
42
+ transformer_depth: 1
43
+ context_dim: 1024
44
+ spatial_transformer_attn_type: softmax-xformers
45
+
46
+ conditioner_config:
47
+ target: sgm.modules.GeneralConditioner
48
+ params:
49
+ emb_models:
50
+ - is_trainable: True
51
+ input_key: cls
52
+ ucg_rate: 0.2
53
+ target: sgm.modules.encoders.modules.ClassEmbedder
54
+ params:
55
+ add_sequence_dim: True
56
+ embed_dim: 1024
57
+ n_classes: 1000
58
+
59
+ - is_trainable: False
60
+ ucg_rate: 0.2
61
+ input_key: original_size_as_tuple
62
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
63
+ params:
64
+ outdim: 256
65
+
66
+ - is_trainable: False
67
+ input_key: crop_coords_top_left
68
+ ucg_rate: 0.2
69
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
70
+ params:
71
+ outdim: 256
72
+
73
+ first_stage_config:
74
+ target: sgm.models.autoencoder.AutoencoderKL
75
+ params:
76
+ ckpt_path: CKPT_PATH
77
+ embed_dim: 4
78
+ monitor: val/rec_loss
79
+ ddconfig:
80
+ attn_type: vanilla-xformers
81
+ double_z: true
82
+ z_channels: 4
83
+ resolution: 256
84
+ in_channels: 3
85
+ out_ch: 3
86
+ ch: 128
87
+ ch_mult: [1, 2, 4, 4]
88
+ num_res_blocks: 2
89
+ attn_resolutions: []
90
+ dropout: 0.0
91
+ lossconfig:
92
+ target: torch.nn.Identity
93
+
94
+ loss_fn_config:
95
+ target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
96
+ params:
97
+ loss_weighting_config:
98
+ target: sgm.modules.diffusionmodules.loss_weighting.EpsWeighting
99
+ sigma_sampler_config:
100
+ target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
101
+ params:
102
+ num_idx: 1000
103
+
104
+ discretization_config:
105
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
106
+
107
+ sampler_config:
108
+ target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
109
+ params:
110
+ num_steps: 50
111
+
112
+ discretization_config:
113
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
114
+
115
+ guider_config:
116
+ target: sgm.modules.diffusionmodules.guiders.VanillaCFG
117
+ params:
118
+ scale: 5.0
119
+
120
+ data:
121
+ target: sgm.data.dataset.StableDataModuleFromConfig
122
+ params:
123
+ train:
124
+ datapipeline:
125
+ urls:
126
+ # USER: adapt this path to the root of your custom dataset
127
+ - DATA_PATH
128
+ pipeline_config:
129
+ shardshuffle: 10000
130
+ sample_shuffle: 10000 # USER: you might wanna adapt depending on your available RAM
131
+
132
+ decoders:
133
+ - pil
134
+
135
+ postprocessors:
136
+ - target: sdata.mappers.TorchVisionImageTransforms
137
+ params:
138
+ key: jpg # USER: you might wanna adapt this for your custom dataset
139
+ transforms:
140
+ - target: torchvision.transforms.Resize
141
+ params:
142
+ size: 256
143
+ interpolation: 3
144
+ - target: torchvision.transforms.ToTensor
145
+ - target: sdata.mappers.Rescaler
146
+
147
+ - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare
148
+ params:
149
+ h_key: height # USER: you might wanna adapt this for your custom dataset
150
+ w_key: width # USER: you might wanna adapt this for your custom dataset
151
+
152
+ loader:
153
+ batch_size: 64
154
+ num_workers: 6
155
+
156
+ lightning:
157
+ modelcheckpoint:
158
+ params:
159
+ every_n_train_steps: 5000
160
+
161
+ callbacks:
162
+ metrics_over_trainsteps_checkpoint:
163
+ params:
164
+ every_n_train_steps: 25000
165
+
166
+ image_logger:
167
+ target: main.ImageLogger
168
+ params:
169
+ disabled: False
170
+ enable_autocast: False
171
+ batch_frequency: 1000
172
+ max_images: 8
173
+ increase_log_steps: True
174
+ log_first_step: False
175
+ log_images_kwargs:
176
+ use_ema_scope: False
177
+ N: 8
178
+ n_rows: 2
179
+
180
+ trainer:
181
+ devices: 0,
182
+ benchmark: True
183
+ num_sanity_val_steps: 0
184
+ accumulate_grad_batches: 1
185
+ max_epochs: 1000
configs/example_training/toy/cifar10_cond.yaml ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 1.0e-4
3
+ target: sgm.models.diffusion.DiffusionEngine
4
+ params:
5
+ denoiser_config:
6
+ target: sgm.modules.diffusionmodules.denoiser.Denoiser
7
+ params:
8
+ scaling_config:
9
+ target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
10
+ params:
11
+ sigma_data: 1.0
12
+
13
+ network_config:
14
+ target: sgm.modules.diffusionmodules.openaimodel.UNetModel
15
+ params:
16
+ in_channels: 3
17
+ out_channels: 3
18
+ model_channels: 32
19
+ attention_resolutions: []
20
+ num_res_blocks: 4
21
+ channel_mult: [1, 2, 2]
22
+ num_head_channels: 32
23
+ num_classes: sequential
24
+ adm_in_channels: 128
25
+
26
+ conditioner_config:
27
+ target: sgm.modules.GeneralConditioner
28
+ params:
29
+ emb_models:
30
+ - is_trainable: True
31
+ input_key: cls
32
+ ucg_rate: 0.2
33
+ target: sgm.modules.encoders.modules.ClassEmbedder
34
+ params:
35
+ embed_dim: 128
36
+ n_classes: 10
37
+
38
+ first_stage_config:
39
+ target: sgm.models.autoencoder.IdentityFirstStage
40
+
41
+ loss_fn_config:
42
+ target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
43
+ params:
44
+ loss_weighting_config:
45
+ target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
46
+ params:
47
+ sigma_data: 1.0
48
+ sigma_sampler_config:
49
+ target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
50
+
51
+ sampler_config:
52
+ target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
53
+ params:
54
+ num_steps: 50
55
+
56
+ discretization_config:
57
+ target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
58
+
59
+ guider_config:
60
+ target: sgm.modules.diffusionmodules.guiders.VanillaCFG
61
+ params:
62
+ scale: 3.0
63
+
64
+ data:
65
+ target: sgm.data.cifar10.CIFAR10Loader
66
+ params:
67
+ batch_size: 512
68
+ num_workers: 1
69
+
70
+ lightning:
71
+ modelcheckpoint:
72
+ params:
73
+ every_n_train_steps: 5000
74
+
75
+ callbacks:
76
+ metrics_over_trainsteps_checkpoint:
77
+ params:
78
+ every_n_train_steps: 25000
79
+
80
+ image_logger:
81
+ target: main.ImageLogger
82
+ params:
83
+ disabled: False
84
+ batch_frequency: 1000
85
+ max_images: 64
86
+ increase_log_steps: True
87
+ log_first_step: False
88
+ log_images_kwargs:
89
+ use_ema_scope: False
90
+ N: 64
91
+ n_rows: 8
92
+
93
+ trainer:
94
+ devices: 0,
95
+ benchmark: True
96
+ num_sanity_val_steps: 0
97
+ accumulate_grad_batches: 1
98
+ max_epochs: 20
configs/example_training/toy/mnist.yaml ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 1.0e-4
3
+ target: sgm.models.diffusion.DiffusionEngine
4
+ params:
5
+ denoiser_config:
6
+ target: sgm.modules.diffusionmodules.denoiser.Denoiser
7
+ params:
8
+ scaling_config:
9
+ target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
10
+ params:
11
+ sigma_data: 1.0
12
+
13
+ network_config:
14
+ target: sgm.modules.diffusionmodules.openaimodel.UNetModel
15
+ params:
16
+ in_channels: 1
17
+ out_channels: 1
18
+ model_channels: 32
19
+ attention_resolutions: []
20
+ num_res_blocks: 4
21
+ channel_mult: [1, 2, 2]
22
+ num_head_channels: 32
23
+
24
+ first_stage_config:
25
+ target: sgm.models.autoencoder.IdentityFirstStage
26
+
27
+ loss_fn_config:
28
+ target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
29
+ params:
30
+ loss_weighting_config:
31
+ target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
32
+ params:
33
+ sigma_data: 1.0
34
+ sigma_sampler_config:
35
+ target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
36
+
37
+ sampler_config:
38
+ target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
39
+ params:
40
+ num_steps: 50
41
+
42
+ discretization_config:
43
+ target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
44
+
45
+ data:
46
+ target: sgm.data.mnist.MNISTLoader
47
+ params:
48
+ batch_size: 512
49
+ num_workers: 1
50
+
51
+ lightning:
52
+ modelcheckpoint:
53
+ params:
54
+ every_n_train_steps: 5000
55
+
56
+ callbacks:
57
+ metrics_over_trainsteps_checkpoint:
58
+ params:
59
+ every_n_train_steps: 25000
60
+
61
+ image_logger:
62
+ target: main.ImageLogger
63
+ params:
64
+ disabled: False
65
+ batch_frequency: 1000
66
+ max_images: 64
67
+ increase_log_steps: False
68
+ log_first_step: False
69
+ log_images_kwargs:
70
+ use_ema_scope: False
71
+ N: 64
72
+ n_rows: 8
73
+
74
+ trainer:
75
+ devices: 0,
76
+ benchmark: True
77
+ num_sanity_val_steps: 0
78
+ accumulate_grad_batches: 1
79
+ max_epochs: 10
configs/example_training/toy/mnist_cond.yaml ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 1.0e-4
3
+ target: sgm.models.diffusion.DiffusionEngine
4
+ params:
5
+ denoiser_config:
6
+ target: sgm.modules.diffusionmodules.denoiser.Denoiser
7
+ params:
8
+ scaling_config:
9
+ target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
10
+ params:
11
+ sigma_data: 1.0
12
+
13
+ network_config:
14
+ target: sgm.modules.diffusionmodules.openaimodel.UNetModel
15
+ params:
16
+ in_channels: 1
17
+ out_channels: 1
18
+ model_channels: 32
19
+ attention_resolutions: []
20
+ num_res_blocks: 4
21
+ channel_mult: [1, 2, 2]
22
+ num_head_channels: 32
23
+ num_classes: sequential
24
+ adm_in_channels: 128
25
+
26
+ conditioner_config:
27
+ target: sgm.modules.GeneralConditioner
28
+ params:
29
+ emb_models:
30
+ - is_trainable: True
31
+ input_key: cls
32
+ ucg_rate: 0.2
33
+ target: sgm.modules.encoders.modules.ClassEmbedder
34
+ params:
35
+ embed_dim: 128
36
+ n_classes: 10
37
+
38
+ first_stage_config:
39
+ target: sgm.models.autoencoder.IdentityFirstStage
40
+
41
+ loss_fn_config:
42
+ target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
43
+ params:
44
+ loss_weighting_config:
45
+ target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
46
+ params:
47
+ sigma_data: 1.0
48
+ sigma_sampler_config:
49
+ target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
50
+
51
+ sampler_config:
52
+ target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
53
+ params:
54
+ num_steps: 50
55
+
56
+ discretization_config:
57
+ target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
58
+
59
+ guider_config:
60
+ target: sgm.modules.diffusionmodules.guiders.VanillaCFG
61
+ params:
62
+ scale: 3.0
63
+
64
+ data:
65
+ target: sgm.data.mnist.MNISTLoader
66
+ params:
67
+ batch_size: 512
68
+ num_workers: 1
69
+
70
+ lightning:
71
+ modelcheckpoint:
72
+ params:
73
+ every_n_train_steps: 5000
74
+
75
+ callbacks:
76
+ metrics_over_trainsteps_checkpoint:
77
+ params:
78
+ every_n_train_steps: 25000
79
+
80
+ image_logger:
81
+ target: main.ImageLogger
82
+ params:
83
+ disabled: False
84
+ batch_frequency: 1000
85
+ max_images: 16
86
+ increase_log_steps: True
87
+ log_first_step: False
88
+ log_images_kwargs:
89
+ use_ema_scope: False
90
+ N: 16
91
+ n_rows: 4
92
+
93
+ trainer:
94
+ devices: 0,
95
+ benchmark: True
96
+ num_sanity_val_steps: 0
97
+ accumulate_grad_batches: 1
98
+ max_epochs: 20
configs/example_training/toy/mnist_cond_discrete_eps.yaml ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 1.0e-4
3
+ target: sgm.models.diffusion.DiffusionEngine
4
+ params:
5
+ denoiser_config:
6
+ target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
7
+ params:
8
+ num_idx: 1000
9
+
10
+ scaling_config:
11
+ target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
12
+ discretization_config:
13
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
14
+
15
+ network_config:
16
+ target: sgm.modules.diffusionmodules.openaimodel.UNetModel
17
+ params:
18
+ in_channels: 1
19
+ out_channels: 1
20
+ model_channels: 32
21
+ attention_resolutions: []
22
+ num_res_blocks: 4
23
+ channel_mult: [1, 2, 2]
24
+ num_head_channels: 32
25
+ num_classes: sequential
26
+ adm_in_channels: 128
27
+
28
+ conditioner_config:
29
+ target: sgm.modules.GeneralConditioner
30
+ params:
31
+ emb_models:
32
+ - is_trainable: True
33
+ input_key: cls
34
+ ucg_rate: 0.2
35
+ target: sgm.modules.encoders.modules.ClassEmbedder
36
+ params:
37
+ embed_dim: 128
38
+ n_classes: 10
39
+
40
+ first_stage_config:
41
+ target: sgm.models.autoencoder.IdentityFirstStage
42
+
43
+ loss_fn_config:
44
+ target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
45
+ params:
46
+ loss_weighting_config:
47
+ target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
48
+ sigma_sampler_config:
49
+ target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
50
+ params:
51
+ num_idx: 1000
52
+
53
+ discretization_config:
54
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
55
+
56
+ sampler_config:
57
+ target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
58
+ params:
59
+ num_steps: 50
60
+
61
+ discretization_config:
62
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
63
+
64
+ guider_config:
65
+ target: sgm.modules.diffusionmodules.guiders.VanillaCFG
66
+ params:
67
+ scale: 5.0
68
+
69
+ data:
70
+ target: sgm.data.mnist.MNISTLoader
71
+ params:
72
+ batch_size: 512
73
+ num_workers: 1
74
+
75
+ lightning:
76
+ modelcheckpoint:
77
+ params:
78
+ every_n_train_steps: 5000
79
+
80
+ callbacks:
81
+ metrics_over_trainsteps_checkpoint:
82
+ params:
83
+ every_n_train_steps: 25000
84
+
85
+ image_logger:
86
+ target: main.ImageLogger
87
+ params:
88
+ disabled: False
89
+ batch_frequency: 1000
90
+ max_images: 16
91
+ increase_log_steps: True
92
+ log_first_step: False
93
+ log_images_kwargs:
94
+ use_ema_scope: False
95
+ N: 16
96
+ n_rows: 4
97
+
98
+ trainer:
99
+ devices: 0,
100
+ benchmark: True
101
+ num_sanity_val_steps: 0
102
+ accumulate_grad_batches: 1
103
+ max_epochs: 20
configs/example_training/toy/mnist_cond_l1_loss.yaml ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 1.0e-4
3
+ target: sgm.models.diffusion.DiffusionEngine
4
+ params:
5
+ denoiser_config:
6
+ target: sgm.modules.diffusionmodules.denoiser.Denoiser
7
+ params:
8
+ scaling_config:
9
+ target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
10
+ params:
11
+ sigma_data: 1.0
12
+
13
+ network_config:
14
+ target: sgm.modules.diffusionmodules.openaimodel.UNetModel
15
+ params:
16
+ in_channels: 1
17
+ out_channels: 1
18
+ model_channels: 32
19
+ attention_resolutions: []
20
+ num_res_blocks: 4
21
+ channel_mult: [1, 2, 2]
22
+ num_head_channels: 32
23
+ num_classes: sequential
24
+ adm_in_channels: 128
25
+
26
+ conditioner_config:
27
+ target: sgm.modules.GeneralConditioner
28
+ params:
29
+ emb_models:
30
+ - is_trainable: True
31
+ input_key: cls
32
+ ucg_rate: 0.2
33
+ target: sgm.modules.encoders.modules.ClassEmbedder
34
+ params:
35
+ embed_dim: 128
36
+ n_classes: 10
37
+
38
+ first_stage_config:
39
+ target: sgm.models.autoencoder.IdentityFirstStage
40
+
41
+ loss_fn_config:
42
+ target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
43
+ params:
44
+ loss_type: l1
45
+ loss_weighting_config:
46
+ target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
47
+ params:
48
+ sigma_data: 1.0
49
+ sigma_sampler_config:
50
+ target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
51
+
52
+ sampler_config:
53
+ target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
54
+ params:
55
+ num_steps: 50
56
+
57
+ discretization_config:
58
+ target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
59
+
60
+ guider_config:
61
+ target: sgm.modules.diffusionmodules.guiders.VanillaCFG
62
+ params:
63
+ scale: 3.0
64
+
65
+ data:
66
+ target: sgm.data.mnist.MNISTLoader
67
+ params:
68
+ batch_size: 512
69
+ num_workers: 1
70
+
71
+ lightning:
72
+ modelcheckpoint:
73
+ params:
74
+ every_n_train_steps: 5000
75
+
76
+ callbacks:
77
+ metrics_over_trainsteps_checkpoint:
78
+ params:
79
+ every_n_train_steps: 25000
80
+
81
+ image_logger:
82
+ target: main.ImageLogger
83
+ params:
84
+ disabled: False
85
+ batch_frequency: 1000
86
+ max_images: 64
87
+ increase_log_steps: True
88
+ log_first_step: False
89
+ log_images_kwargs:
90
+ use_ema_scope: False
91
+ N: 64
92
+ n_rows: 8
93
+
94
+ trainer:
95
+ devices: 0,
96
+ benchmark: True
97
+ num_sanity_val_steps: 0
98
+ accumulate_grad_batches: 1
99
+ max_epochs: 20
configs/example_training/toy/mnist_cond_with_ema.yaml ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 1.0e-4
3
+ target: sgm.models.diffusion.DiffusionEngine
4
+ params:
5
+ use_ema: True
6
+
7
+ denoiser_config:
8
+ target: sgm.modules.diffusionmodules.denoiser.Denoiser
9
+ params:
10
+ scaling_config:
11
+ target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
12
+ params:
13
+ sigma_data: 1.0
14
+
15
+ network_config:
16
+ target: sgm.modules.diffusionmodules.openaimodel.UNetModel
17
+ params:
18
+ in_channels: 1
19
+ out_channels: 1
20
+ model_channels: 32
21
+ attention_resolutions: []
22
+ num_res_blocks: 4
23
+ channel_mult: [1, 2, 2]
24
+ num_head_channels: 32
25
+ num_classes: sequential
26
+ adm_in_channels: 128
27
+
28
+ conditioner_config:
29
+ target: sgm.modules.GeneralConditioner
30
+ params:
31
+ emb_models:
32
+ - is_trainable: True
33
+ input_key: cls
34
+ ucg_rate: 0.2
35
+ target: sgm.modules.encoders.modules.ClassEmbedder
36
+ params:
37
+ embed_dim: 128
38
+ n_classes: 10
39
+
40
+ first_stage_config:
41
+ target: sgm.models.autoencoder.IdentityFirstStage
42
+
43
+ loss_fn_config:
44
+ target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
45
+ params:
46
+ loss_weighting_config:
47
+ target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
48
+ params:
49
+ sigma_data: 1.0
50
+ sigma_sampler_config:
51
+ target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
52
+
53
+ sampler_config:
54
+ target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
55
+ params:
56
+ num_steps: 50
57
+
58
+ discretization_config:
59
+ target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
60
+
61
+ guider_config:
62
+ target: sgm.modules.diffusionmodules.guiders.VanillaCFG
63
+ params:
64
+ scale: 3.0
65
+
66
+ data:
67
+ target: sgm.data.mnist.MNISTLoader
68
+ params:
69
+ batch_size: 512
70
+ num_workers: 1
71
+
72
+ lightning:
73
+ modelcheckpoint:
74
+ params:
75
+ every_n_train_steps: 5000
76
+
77
+ callbacks:
78
+ metrics_over_trainsteps_checkpoint:
79
+ params:
80
+ every_n_train_steps: 25000
81
+
82
+ image_logger:
83
+ target: main.ImageLogger
84
+ params:
85
+ disabled: False
86
+ batch_frequency: 1000
87
+ max_images: 64
88
+ increase_log_steps: True
89
+ log_first_step: False
90
+ log_images_kwargs:
91
+ use_ema_scope: False
92
+ N: 64
93
+ n_rows: 8
94
+
95
+ trainer:
96
+ devices: 0,
97
+ benchmark: True
98
+ num_sanity_val_steps: 0
99
+ accumulate_grad_batches: 1
100
+ max_epochs: 20
configs/example_training/txt2img-clipl-legacy-ucg-training.yaml ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 1.0e-4
3
+ target: sgm.models.diffusion.DiffusionEngine
4
+ params:
5
+ scale_factor: 0.13025
6
+ disable_first_stage_autocast: True
7
+ log_keys:
8
+ - txt
9
+
10
+ scheduler_config:
11
+ target: sgm.lr_scheduler.LambdaLinearScheduler
12
+ params:
13
+ warm_up_steps: [10000]
14
+ cycle_lengths: [10000000000000]
15
+ f_start: [1.e-6]
16
+ f_max: [1.]
17
+ f_min: [1.]
18
+
19
+ denoiser_config:
20
+ target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
21
+ params:
22
+ num_idx: 1000
23
+
24
+ scaling_config:
25
+ target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
26
+ discretization_config:
27
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
28
+
29
+ network_config:
30
+ target: sgm.modules.diffusionmodules.openaimodel.UNetModel
31
+ params:
32
+ use_checkpoint: True
33
+ in_channels: 4
34
+ out_channels: 4
35
+ model_channels: 320
36
+ attention_resolutions: [1, 2, 4]
37
+ num_res_blocks: 2
38
+ channel_mult: [1, 2, 4, 4]
39
+ num_head_channels: 64
40
+ num_classes: sequential
41
+ adm_in_channels: 1792
42
+ num_heads: 1
43
+ transformer_depth: 1
44
+ context_dim: 768
45
+ spatial_transformer_attn_type: softmax-xformers
46
+
47
+ conditioner_config:
48
+ target: sgm.modules.GeneralConditioner
49
+ params:
50
+ emb_models:
51
+ - is_trainable: True
52
+ input_key: txt
53
+ ucg_rate: 0.1
54
+ legacy_ucg_value: ""
55
+ target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
56
+ params:
57
+ always_return_pooled: True
58
+
59
+ - is_trainable: False
60
+ ucg_rate: 0.1
61
+ input_key: original_size_as_tuple
62
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
63
+ params:
64
+ outdim: 256
65
+
66
+ - is_trainable: False
67
+ input_key: crop_coords_top_left
68
+ ucg_rate: 0.1
69
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
70
+ params:
71
+ outdim: 256
72
+
73
+ first_stage_config:
74
+ target: sgm.models.autoencoder.AutoencoderKL
75
+ params:
76
+ ckpt_path: CKPT_PATH
77
+ embed_dim: 4
78
+ monitor: val/rec_loss
79
+ ddconfig:
80
+ attn_type: vanilla-xformers
81
+ double_z: true
82
+ z_channels: 4
83
+ resolution: 256
84
+ in_channels: 3
85
+ out_ch: 3
86
+ ch: 128
87
+ ch_mult: [ 1, 2, 4, 4 ]
88
+ num_res_blocks: 2
89
+ attn_resolutions: [ ]
90
+ dropout: 0.0
91
+ lossconfig:
92
+ target: torch.nn.Identity
93
+
94
+ loss_fn_config:
95
+ target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
96
+ params:
97
+ loss_weighting_config:
98
+ target: sgm.modules.diffusionmodules.loss_weighting.EpsWeighting
99
+ sigma_sampler_config:
100
+ target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
101
+ params:
102
+ num_idx: 1000
103
+
104
+ discretization_config:
105
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
106
+
107
+ sampler_config:
108
+ target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
109
+ params:
110
+ num_steps: 50
111
+
112
+ discretization_config:
113
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
114
+
115
+ guider_config:
116
+ target: sgm.modules.diffusionmodules.guiders.VanillaCFG
117
+ params:
118
+ scale: 7.5
119
+
120
+ data:
121
+ target: sgm.data.dataset.StableDataModuleFromConfig
122
+ params:
123
+ train:
124
+ datapipeline:
125
+ urls:
126
+ # USER: adapt this path to the root of your custom dataset
127
+ - DATA_PATH
128
+ pipeline_config:
129
+ shardshuffle: 10000
130
+ sample_shuffle: 10000 # USER: you might wanna adapt depending on your available RAM
131
+
132
+ decoders:
133
+ - pil
134
+
135
+ postprocessors:
136
+ - target: sdata.mappers.TorchVisionImageTransforms
137
+ params:
138
+ key: jpg # USER: you might wanna adapt this for your custom dataset
139
+ transforms:
140
+ - target: torchvision.transforms.Resize
141
+ params:
142
+ size: 256
143
+ interpolation: 3
144
+ - target: torchvision.transforms.ToTensor
145
+ - target: sdata.mappers.Rescaler
146
+ - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare
147
+ # USER: you might wanna use non-default parameters due to your custom dataset
148
+
149
+ loader:
150
+ batch_size: 64
151
+ num_workers: 6
152
+
153
+ lightning:
154
+ modelcheckpoint:
155
+ params:
156
+ every_n_train_steps: 5000
157
+
158
+ callbacks:
159
+ metrics_over_trainsteps_checkpoint:
160
+ params:
161
+ every_n_train_steps: 25000
162
+
163
+ image_logger:
164
+ target: main.ImageLogger
165
+ params:
166
+ disabled: False
167
+ enable_autocast: False
168
+ batch_frequency: 1000
169
+ max_images: 8
170
+ increase_log_steps: True
171
+ log_first_step: False
172
+ log_images_kwargs:
173
+ use_ema_scope: False
174
+ N: 8
175
+ n_rows: 2
176
+
177
+ trainer:
178
+ devices: 0,
179
+ benchmark: True
180
+ num_sanity_val_steps: 0
181
+ accumulate_grad_batches: 1
182
+ max_epochs: 1000
configs/example_training/txt2img-clipl.yaml ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 1.0e-4
3
+ target: sgm.models.diffusion.DiffusionEngine
4
+ params:
5
+ scale_factor: 0.13025
6
+ disable_first_stage_autocast: True
7
+ log_keys:
8
+ - txt
9
+
10
+ scheduler_config:
11
+ target: sgm.lr_scheduler.LambdaLinearScheduler
12
+ params:
13
+ warm_up_steps: [10000]
14
+ cycle_lengths: [10000000000000]
15
+ f_start: [1.e-6]
16
+ f_max: [1.]
17
+ f_min: [1.]
18
+
19
+ denoiser_config:
20
+ target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
21
+ params:
22
+ num_idx: 1000
23
+
24
+ scaling_config:
25
+ target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
26
+ discretization_config:
27
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
28
+
29
+ network_config:
30
+ target: sgm.modules.diffusionmodules.openaimodel.UNetModel
31
+ params:
32
+ use_checkpoint: True
33
+ in_channels: 4
34
+ out_channels: 4
35
+ model_channels: 320
36
+ attention_resolutions: [1, 2, 4]
37
+ num_res_blocks: 2
38
+ channel_mult: [1, 2, 4, 4]
39
+ num_head_channels: 64
40
+ num_classes: sequential
41
+ adm_in_channels: 1792
42
+ num_heads: 1
43
+ transformer_depth: 1
44
+ context_dim: 768
45
+ spatial_transformer_attn_type: softmax-xformers
46
+
47
+ conditioner_config:
48
+ target: sgm.modules.GeneralConditioner
49
+ params:
50
+ emb_models:
51
+ - is_trainable: True
52
+ input_key: txt
53
+ ucg_rate: 0.1
54
+ legacy_ucg_value: ""
55
+ target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
56
+ params:
57
+ always_return_pooled: True
58
+
59
+ - is_trainable: False
60
+ ucg_rate: 0.1
61
+ input_key: original_size_as_tuple
62
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
63
+ params:
64
+ outdim: 256
65
+
66
+ - is_trainable: False
67
+ input_key: crop_coords_top_left
68
+ ucg_rate: 0.1
69
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
70
+ params:
71
+ outdim: 256
72
+
73
+ first_stage_config:
74
+ target: sgm.models.autoencoder.AutoencoderKL
75
+ params:
76
+ ckpt_path: CKPT_PATH
77
+ embed_dim: 4
78
+ monitor: val/rec_loss
79
+ ddconfig:
80
+ attn_type: vanilla-xformers
81
+ double_z: true
82
+ z_channels: 4
83
+ resolution: 256
84
+ in_channels: 3
85
+ out_ch: 3
86
+ ch: 128
87
+ ch_mult: [1, 2, 4, 4]
88
+ num_res_blocks: 2
89
+ attn_resolutions: []
90
+ dropout: 0.0
91
+ lossconfig:
92
+ target: torch.nn.Identity
93
+
94
+ loss_fn_config:
95
+ target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
96
+ params:
97
+ loss_weighting_config:
98
+ target: sgm.modules.diffusionmodules.loss_weighting.EpsWeighting
99
+ sigma_sampler_config:
100
+ target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
101
+ params:
102
+ num_idx: 1000
103
+
104
+ discretization_config:
105
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
106
+
107
+ sampler_config:
108
+ target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
109
+ params:
110
+ num_steps: 50
111
+
112
+ discretization_config:
113
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
114
+
115
+ guider_config:
116
+ target: sgm.modules.diffusionmodules.guiders.VanillaCFG
117
+ params:
118
+ scale: 7.5
119
+
120
+ data:
121
+ target: sgm.data.dataset.StableDataModuleFromConfig
122
+ params:
123
+ train:
124
+ datapipeline:
125
+ urls:
126
+ # USER: adapt this path to the root of your custom dataset
127
+ - DATA_PATH
128
+ pipeline_config:
129
+ shardshuffle: 10000
130
+ sample_shuffle: 10000
131
+
132
+
133
+ decoders:
134
+ - pil
135
+
136
+ postprocessors:
137
+ - target: sdata.mappers.TorchVisionImageTransforms
138
+ params:
139
+ key: jpg # USER: you might wanna adapt this for your custom dataset
140
+ transforms:
141
+ - target: torchvision.transforms.Resize
142
+ params:
143
+ size: 256
144
+ interpolation: 3
145
+ - target: torchvision.transforms.ToTensor
146
+ - target: sdata.mappers.Rescaler
147
+ # USER: you might wanna use non-default parameters due to your custom dataset
148
+ - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare
149
+ # USER: you might wanna use non-default parameters due to your custom dataset
150
+
151
+ loader:
152
+ batch_size: 64
153
+ num_workers: 6
154
+
155
+ lightning:
156
+ modelcheckpoint:
157
+ params:
158
+ every_n_train_steps: 5000
159
+
160
+ callbacks:
161
+ metrics_over_trainsteps_checkpoint:
162
+ params:
163
+ every_n_train_steps: 25000
164
+
165
+ image_logger:
166
+ target: main.ImageLogger
167
+ params:
168
+ disabled: False
169
+ enable_autocast: False
170
+ batch_frequency: 1000
171
+ max_images: 8
172
+ increase_log_steps: True
173
+ log_first_step: False
174
+ log_images_kwargs:
175
+ use_ema_scope: False
176
+ N: 8
177
+ n_rows: 2
178
+
179
+ trainer:
180
+ devices: 0,
181
+ benchmark: True
182
+ num_sanity_val_steps: 0
183
+ accumulate_grad_batches: 1
184
+ max_epochs: 1000
configs/inference/sd_2_1.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ target: sgm.models.diffusion.DiffusionEngine
3
+ params:
4
+ scale_factor: 0.18215
5
+ disable_first_stage_autocast: True
6
+
7
+ denoiser_config:
8
+ target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
9
+ params:
10
+ num_idx: 1000
11
+
12
+ scaling_config:
13
+ target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
14
+ discretization_config:
15
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
16
+
17
+ network_config:
18
+ target: sgm.modules.diffusionmodules.openaimodel.UNetModel
19
+ params:
20
+ use_checkpoint: True
21
+ in_channels: 4
22
+ out_channels: 4
23
+ model_channels: 320
24
+ attention_resolutions: [4, 2, 1]
25
+ num_res_blocks: 2
26
+ channel_mult: [1, 2, 4, 4]
27
+ num_head_channels: 64
28
+ use_linear_in_transformer: True
29
+ transformer_depth: 1
30
+ context_dim: 1024
31
+
32
+ conditioner_config:
33
+ target: sgm.modules.GeneralConditioner
34
+ params:
35
+ emb_models:
36
+ - is_trainable: False
37
+ input_key: txt
38
+ target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder
39
+ params:
40
+ freeze: true
41
+ layer: penultimate
42
+
43
+ first_stage_config:
44
+ target: sgm.models.autoencoder.AutoencoderKL
45
+ params:
46
+ embed_dim: 4
47
+ monitor: val/rec_loss
48
+ ddconfig:
49
+ double_z: true
50
+ z_channels: 4
51
+ resolution: 256
52
+ in_channels: 3
53
+ out_ch: 3
54
+ ch: 128
55
+ ch_mult: [1, 2, 4, 4]
56
+ num_res_blocks: 2
57
+ attn_resolutions: []
58
+ dropout: 0.0
59
+ lossconfig:
60
+ target: torch.nn.Identity
configs/inference/sd_2_1_768.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ target: sgm.models.diffusion.DiffusionEngine
3
+ params:
4
+ scale_factor: 0.18215
5
+ disable_first_stage_autocast: True
6
+
7
+ denoiser_config:
8
+ target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
9
+ params:
10
+ num_idx: 1000
11
+
12
+ scaling_config:
13
+ target: sgm.modules.diffusionmodules.denoiser_scaling.VScaling
14
+ discretization_config:
15
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
16
+
17
+ network_config:
18
+ target: sgm.modules.diffusionmodules.openaimodel.UNetModel
19
+ params:
20
+ use_checkpoint: True
21
+ in_channels: 4
22
+ out_channels: 4
23
+ model_channels: 320
24
+ attention_resolutions: [4, 2, 1]
25
+ num_res_blocks: 2
26
+ channel_mult: [1, 2, 4, 4]
27
+ num_head_channels: 64
28
+ use_linear_in_transformer: True
29
+ transformer_depth: 1
30
+ context_dim: 1024
31
+
32
+ conditioner_config:
33
+ target: sgm.modules.GeneralConditioner
34
+ params:
35
+ emb_models:
36
+ - is_trainable: False
37
+ input_key: txt
38
+ target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder
39
+ params:
40
+ freeze: true
41
+ layer: penultimate
42
+
43
+ first_stage_config:
44
+ target: sgm.models.autoencoder.AutoencoderKL
45
+ params:
46
+ embed_dim: 4
47
+ monitor: val/rec_loss
48
+ ddconfig:
49
+ double_z: true
50
+ z_channels: 4
51
+ resolution: 256
52
+ in_channels: 3
53
+ out_ch: 3
54
+ ch: 128
55
+ ch_mult: [1, 2, 4, 4]
56
+ num_res_blocks: 2
57
+ attn_resolutions: []
58
+ dropout: 0.0
59
+ lossconfig:
60
+ target: torch.nn.Identity
configs/inference/sd_xl_base.yaml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ target: sgm.models.diffusion.DiffusionEngine
3
+ params:
4
+ scale_factor: 0.13025
5
+ disable_first_stage_autocast: True
6
+
7
+ denoiser_config:
8
+ target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
9
+ params:
10
+ num_idx: 1000
11
+
12
+ scaling_config:
13
+ target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
14
+ discretization_config:
15
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
16
+
17
+ network_config:
18
+ target: sgm.modules.diffusionmodules.openaimodel.UNetModel
19
+ params:
20
+ adm_in_channels: 2816
21
+ num_classes: sequential
22
+ use_checkpoint: True
23
+ in_channels: 4
24
+ out_channels: 4
25
+ model_channels: 320
26
+ attention_resolutions: [4, 2]
27
+ num_res_blocks: 2
28
+ channel_mult: [1, 2, 4]
29
+ num_head_channels: 64
30
+ use_linear_in_transformer: True
31
+ transformer_depth: [1, 2, 10]
32
+ context_dim: 2048
33
+ spatial_transformer_attn_type: softmax-xformers
34
+
35
+ conditioner_config:
36
+ target: sgm.modules.GeneralConditioner
37
+ params:
38
+ emb_models:
39
+ - is_trainable: False
40
+ input_key: txt
41
+ target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
42
+ params:
43
+ layer: hidden
44
+ layer_idx: 11
45
+
46
+ - is_trainable: False
47
+ input_key: txt
48
+ target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
49
+ params:
50
+ arch: ViT-bigG-14
51
+ version: laion2b_s39b_b160k
52
+ freeze: True
53
+ layer: penultimate
54
+ always_return_pooled: True
55
+ legacy: False
56
+
57
+ - is_trainable: False
58
+ input_key: original_size_as_tuple
59
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
60
+ params:
61
+ outdim: 256
62
+
63
+ - is_trainable: False
64
+ input_key: crop_coords_top_left
65
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
66
+ params:
67
+ outdim: 256
68
+
69
+ - is_trainable: False
70
+ input_key: target_size_as_tuple
71
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
72
+ params:
73
+ outdim: 256
74
+
75
+ first_stage_config:
76
+ target: sgm.models.autoencoder.AutoencoderKL
77
+ params:
78
+ embed_dim: 4
79
+ monitor: val/rec_loss
80
+ ddconfig:
81
+ attn_type: vanilla-xformers
82
+ double_z: true
83
+ z_channels: 4
84
+ resolution: 256
85
+ in_channels: 3
86
+ out_ch: 3
87
+ ch: 128
88
+ ch_mult: [1, 2, 4, 4]
89
+ num_res_blocks: 2
90
+ attn_resolutions: []
91
+ dropout: 0.0
92
+ lossconfig:
93
+ target: torch.nn.Identity
configs/inference/sd_xl_refiner.yaml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ target: sgm.models.diffusion.DiffusionEngine
3
+ params:
4
+ scale_factor: 0.13025
5
+ disable_first_stage_autocast: True
6
+
7
+ denoiser_config:
8
+ target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
9
+ params:
10
+ num_idx: 1000
11
+
12
+ scaling_config:
13
+ target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
14
+ discretization_config:
15
+ target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
16
+
17
+ network_config:
18
+ target: sgm.modules.diffusionmodules.openaimodel.UNetModel
19
+ params:
20
+ adm_in_channels: 2560
21
+ num_classes: sequential
22
+ use_checkpoint: True
23
+ in_channels: 4
24
+ out_channels: 4
25
+ model_channels: 384
26
+ attention_resolutions: [4, 2]
27
+ num_res_blocks: 2
28
+ channel_mult: [1, 2, 4, 4]
29
+ num_head_channels: 64
30
+ use_linear_in_transformer: True
31
+ transformer_depth: 4
32
+ context_dim: [1280, 1280, 1280, 1280]
33
+ spatial_transformer_attn_type: softmax-xformers
34
+
35
+ conditioner_config:
36
+ target: sgm.modules.GeneralConditioner
37
+ params:
38
+ emb_models:
39
+ - is_trainable: False
40
+ input_key: txt
41
+ target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
42
+ params:
43
+ arch: ViT-bigG-14
44
+ version: laion2b_s39b_b160k
45
+ legacy: False
46
+ freeze: True
47
+ layer: penultimate
48
+ always_return_pooled: True
49
+
50
+ - is_trainable: False
51
+ input_key: original_size_as_tuple
52
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
53
+ params:
54
+ outdim: 256
55
+
56
+ - is_trainable: False
57
+ input_key: crop_coords_top_left
58
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
59
+ params:
60
+ outdim: 256
61
+
62
+ - is_trainable: False
63
+ input_key: aesthetic_score
64
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
65
+ params:
66
+ outdim: 256
67
+
68
+ first_stage_config:
69
+ target: sgm.models.autoencoder.AutoencoderKL
70
+ params:
71
+ embed_dim: 4
72
+ monitor: val/rec_loss
73
+ ddconfig:
74
+ attn_type: vanilla-xformers
75
+ double_z: true
76
+ z_channels: 4
77
+ resolution: 256
78
+ in_channels: 3
79
+ out_ch: 3
80
+ ch: 128
81
+ ch_mult: [1, 2, 4, 4]
82
+ num_res_blocks: 2
83
+ attn_resolutions: []
84
+ dropout: 0.0
85
+ lossconfig:
86
+ target: torch.nn.Identity
configs/inference/svd.yaml ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ target: sgm.models.diffusion.DiffusionEngine
3
+ params:
4
+ scale_factor: 0.18215
5
+ disable_first_stage_autocast: True
6
+
7
+ denoiser_config:
8
+ target: sgm.modules.diffusionmodules.denoiser.Denoiser
9
+ params:
10
+ scaling_config:
11
+ target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
12
+
13
+ network_config:
14
+ target: sgm.modules.diffusionmodules.video_model.VideoUNet
15
+ params:
16
+ adm_in_channels: 768
17
+ num_classes: sequential
18
+ use_checkpoint: True
19
+ in_channels: 8
20
+ out_channels: 4
21
+ model_channels: 320
22
+ attention_resolutions: [4, 2, 1]
23
+ num_res_blocks: 2
24
+ channel_mult: [1, 2, 4, 4]
25
+ num_head_channels: 64
26
+ use_linear_in_transformer: True
27
+ transformer_depth: 1
28
+ context_dim: 1024
29
+ spatial_transformer_attn_type: softmax-xformers
30
+ extra_ff_mix_layer: True
31
+ use_spatial_context: True
32
+ merge_strategy: learned_with_images
33
+ video_kernel_size: [3, 1, 1]
34
+
35
+ conditioner_config:
36
+ target: sgm.modules.GeneralConditioner
37
+ params:
38
+ emb_models:
39
+ - is_trainable: False
40
+ input_key: cond_frames_without_noise
41
+ target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
42
+ params:
43
+ n_cond_frames: 1
44
+ n_copies: 1
45
+ open_clip_embedding_config:
46
+ target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
47
+ params:
48
+ freeze: True
49
+
50
+ - input_key: fps_id
51
+ is_trainable: False
52
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
53
+ params:
54
+ outdim: 256
55
+
56
+ - input_key: motion_bucket_id
57
+ is_trainable: False
58
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
59
+ params:
60
+ outdim: 256
61
+
62
+ - input_key: cond_frames
63
+ is_trainable: False
64
+ target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
65
+ params:
66
+ disable_encoder_autocast: True
67
+ n_cond_frames: 1
68
+ n_copies: 1
69
+ is_ae: True
70
+ encoder_config:
71
+ target: sgm.models.autoencoder.AutoencoderKLModeOnly
72
+ params:
73
+ embed_dim: 4
74
+ monitor: val/rec_loss
75
+ ddconfig:
76
+ attn_type: vanilla-xformers
77
+ double_z: True
78
+ z_channels: 4
79
+ resolution: 256
80
+ in_channels: 3
81
+ out_ch: 3
82
+ ch: 128
83
+ ch_mult: [1, 2, 4, 4]
84
+ num_res_blocks: 2
85
+ attn_resolutions: []
86
+ dropout: 0.0
87
+ lossconfig:
88
+ target: torch.nn.Identity
89
+
90
+ - input_key: cond_aug
91
+ is_trainable: False
92
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
93
+ params:
94
+ outdim: 256
95
+
96
+ first_stage_config:
97
+ target: sgm.models.autoencoder.AutoencodingEngine
98
+ params:
99
+ loss_config:
100
+ target: torch.nn.Identity
101
+ regularizer_config:
102
+ target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
103
+ encoder_config:
104
+ target: sgm.modules.diffusionmodules.model.Encoder
105
+ params:
106
+ attn_type: vanilla
107
+ double_z: True
108
+ z_channels: 4
109
+ resolution: 256
110
+ in_channels: 3
111
+ out_ch: 3
112
+ ch: 128
113
+ ch_mult: [1, 2, 4, 4]
114
+ num_res_blocks: 2
115
+ attn_resolutions: []
116
+ dropout: 0.0
117
+ decoder_config:
118
+ target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
119
+ params:
120
+ attn_type: vanilla
121
+ double_z: True
122
+ z_channels: 4
123
+ resolution: 256
124
+ in_channels: 3
125
+ out_ch: 3
126
+ ch: 128
127
+ ch_mult: [1, 2, 4, 4]
128
+ num_res_blocks: 2
129
+ attn_resolutions: []
130
+ dropout: 0.0
131
+ video_kernel_size: [3, 1, 1]
configs/inference/svd_image_decoder.yaml ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ target: sgm.models.diffusion.DiffusionEngine
3
+ params:
4
+ scale_factor: 0.18215
5
+ disable_first_stage_autocast: True
6
+
7
+ denoiser_config:
8
+ target: sgm.modules.diffusionmodules.denoiser.Denoiser
9
+ params:
10
+ scaling_config:
11
+ target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
12
+
13
+ network_config:
14
+ target: sgm.modules.diffusionmodules.video_model.VideoUNet
15
+ params:
16
+ adm_in_channels: 768
17
+ num_classes: sequential
18
+ use_checkpoint: True
19
+ in_channels: 8
20
+ out_channels: 4
21
+ model_channels: 320
22
+ attention_resolutions: [4, 2, 1]
23
+ num_res_blocks: 2
24
+ channel_mult: [1, 2, 4, 4]
25
+ num_head_channels: 64
26
+ use_linear_in_transformer: True
27
+ transformer_depth: 1
28
+ context_dim: 1024
29
+ spatial_transformer_attn_type: softmax-xformers
30
+ extra_ff_mix_layer: True
31
+ use_spatial_context: True
32
+ merge_strategy: learned_with_images
33
+ video_kernel_size: [3, 1, 1]
34
+
35
+ conditioner_config:
36
+ target: sgm.modules.GeneralConditioner
37
+ params:
38
+ emb_models:
39
+ - is_trainable: False
40
+ input_key: cond_frames_without_noise
41
+ target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
42
+ params:
43
+ n_cond_frames: 1
44
+ n_copies: 1
45
+ open_clip_embedding_config:
46
+ target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
47
+ params:
48
+ freeze: True
49
+
50
+ - input_key: fps_id
51
+ is_trainable: False
52
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
53
+ params:
54
+ outdim: 256
55
+
56
+ - input_key: motion_bucket_id
57
+ is_trainable: False
58
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
59
+ params:
60
+ outdim: 256
61
+
62
+ - input_key: cond_frames
63
+ is_trainable: False
64
+ target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
65
+ params:
66
+ disable_encoder_autocast: True
67
+ n_cond_frames: 1
68
+ n_copies: 1
69
+ is_ae: True
70
+ encoder_config:
71
+ target: sgm.models.autoencoder.AutoencoderKLModeOnly
72
+ params:
73
+ embed_dim: 4
74
+ monitor: val/rec_loss
75
+ ddconfig:
76
+ attn_type: vanilla-xformers
77
+ double_z: True
78
+ z_channels: 4
79
+ resolution: 256
80
+ in_channels: 3
81
+ out_ch: 3
82
+ ch: 128
83
+ ch_mult: [1, 2, 4, 4]
84
+ num_res_blocks: 2
85
+ attn_resolutions: []
86
+ dropout: 0.0
87
+ lossconfig:
88
+ target: torch.nn.Identity
89
+
90
+ - input_key: cond_aug
91
+ is_trainable: False
92
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
93
+ params:
94
+ outdim: 256
95
+
96
+ first_stage_config:
97
+ target: sgm.models.autoencoder.AutoencoderKL
98
+ params:
99
+ embed_dim: 4
100
+ monitor: val/rec_loss
101
+ ddconfig:
102
+ attn_type: vanilla-xformers
103
+ double_z: True
104
+ z_channels: 4
105
+ resolution: 256
106
+ in_channels: 3
107
+ out_ch: 3
108
+ ch: 128
109
+ ch_mult: [1, 2, 4, 4]
110
+ num_res_blocks: 2
111
+ attn_resolutions: []
112
+ dropout: 0.0
113
+ lossconfig:
114
+ target: torch.nn.Identity
configs/inference/svd_mv.yaml ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 1.0e-05
3
+ target: sgm.models.video_diffusion.DiffusionEngine
4
+ params:
5
+ ckpt_path: ckpts/svd_xt.safetensors
6
+ scale_factor: 0.18215
7
+ disable_first_stage_autocast: true
8
+ scheduler_config:
9
+ target: sgm.lr_scheduler.LambdaLinearScheduler
10
+ params:
11
+ warm_up_steps:
12
+ - 1
13
+ cycle_lengths:
14
+ - 10000000000000
15
+ f_start:
16
+ - 1.0e-06
17
+ f_max:
18
+ - 1.0
19
+ f_min:
20
+ - 1.0
21
+ denoiser_config:
22
+ target: sgm.modules.diffusionmodules.denoiser.Denoiser
23
+ params:
24
+ scaling_config:
25
+ target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
26
+ network_config:
27
+ target: sgm.modules.diffusionmodules.video_model.VideoUNet
28
+ params:
29
+ adm_in_channels: 768
30
+ num_classes: sequential
31
+ use_checkpoint: true
32
+ in_channels: 8
33
+ out_channels: 4
34
+ model_channels: 320
35
+ attention_resolutions:
36
+ - 4
37
+ - 2
38
+ - 1
39
+ num_res_blocks: 2
40
+ channel_mult:
41
+ - 1
42
+ - 2
43
+ - 4
44
+ - 4
45
+ num_head_channels: 64
46
+ use_linear_in_transformer: true
47
+ transformer_depth: 1
48
+ context_dim: 1024
49
+ spatial_transformer_attn_type: softmax-xformers
50
+ extra_ff_mix_layer: true
51
+ use_spatial_context: true
52
+ merge_strategy: learned_with_images
53
+ video_kernel_size:
54
+ - 3
55
+ - 1
56
+ - 1
57
+ conditioner_config:
58
+ target: sgm.modules.GeneralConditioner
59
+ params:
60
+ emb_models:
61
+ - is_trainable: false
62
+ ucg_rate: 0.2
63
+ input_key: cond_frames_without_noise
64
+ target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
65
+ params:
66
+ n_cond_frames: 1
67
+ n_copies: 1
68
+ open_clip_embedding_config:
69
+ target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
70
+ params:
71
+ freeze: true
72
+ - input_key: fps_id
73
+ is_trainable: true
74
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
75
+ params:
76
+ outdim: 256
77
+ - input_key: motion_bucket_id
78
+ is_trainable: true
79
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
80
+ params:
81
+ outdim: 256
82
+ - input_key: cond_frames
83
+ is_trainable: false
84
+ ucg_rate: 0.2
85
+ target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
86
+ params:
87
+ disable_encoder_autocast: true
88
+ n_cond_frames: 1
89
+ n_copies: 1
90
+ is_ae: true
91
+ encoder_config:
92
+ target: sgm.models.autoencoder.AutoencoderKLModeOnly
93
+ params:
94
+ embed_dim: 4
95
+ monitor: val/rec_loss
96
+ ddconfig:
97
+ attn_type: vanilla-xformers
98
+ double_z: true
99
+ z_channels: 4
100
+ resolution: 256
101
+ in_channels: 3
102
+ out_ch: 3
103
+ ch: 128
104
+ ch_mult:
105
+ - 1
106
+ - 2
107
+ - 4
108
+ - 4
109
+ num_res_blocks: 2
110
+ attn_resolutions: []
111
+ dropout: 0.0
112
+ lossconfig:
113
+ target: torch.nn.Identity
114
+ - input_key: cond_aug
115
+ is_trainable: true
116
+ target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
117
+ params:
118
+ outdim: 256
119
+ first_stage_config:
120
+ target: sgm.models.autoencoder.AutoencodingEngine
121
+ params:
122
+ loss_config:
123
+ target: torch.nn.Identity
124
+ regularizer_config:
125
+ target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
126
+ encoder_config:
127
+ target: sgm.modules.diffusionmodules.model.Encoder
128
+ params:
129
+ attn_type: vanilla
130
+ double_z: true
131
+ z_channels: 4
132
+ resolution: 256
133
+ in_channels: 3
134
+ out_ch: 3
135
+ ch: 128
136
+ ch_mult:
137
+ - 1
138
+ - 2
139
+ - 4
140
+ - 4
141
+ num_res_blocks: 2
142
+ attn_resolutions: []
143
+ dropout: 0.0
144
+ decoder_config:
145
+ target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
146
+ params:
147
+ attn_type: vanilla
148
+ double_z: true
149
+ z_channels: 4
150
+ resolution: 256
151
+ in_channels: 3
152
+ out_ch: 3
153
+ ch: 128
154
+ ch_mult:
155
+ - 1
156
+ - 2
157
+ - 4
158
+ - 4
159
+ num_res_blocks: 2
160
+ attn_resolutions: []
161
+ dropout: 0.0
162
+ video_kernel_size:
163
+ - 3
164
+ - 1
165
+ - 1
166
+ sampler_config:
167
+ target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
168
+ params:
169
+ num_steps: 30
170
+ discretization_config:
171
+ target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
172
+ params:
173
+ sigma_max: 700.0
174
+ guider_config:
175
+ target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
176
+ params:
177
+ max_scale: 2.5
178
+ min_scale: 1.0
179
+ num_frames: 24
180
+ loss_fn_config:
181
+ target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
182
+ params:
183
+ batch2model_keys:
184
+ - num_video_frames
185
+ - image_only_indicator
186
+ loss_weighting_config:
187
+ target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
188
+ params:
189
+ sigma_data: 1.0
190
+ sigma_sampler_config:
191
+ target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
192
+ params:
193
+ p_mean: 0.3
194
+ p_std: 1.2
195
+ data:
196
+ target: sgm.data.objaverse.ObjaverseSpiralDataset
197
+ params:
198
+ root_dir: /mnt/mfs/zilong.chen/Downloads/objaverse-ndd-samples
199
+ random_front: true
200
+ batch_size: 2
201
+ num_workers: 16
202
+ cond_aug_mean: -0.0