# Origin: https://github.com/ali-vilab/UniAnimate/commit/d7814fa44a0a1154524b92fce0e3133a2604d333
import os

import torch
from easydict import EasyDict
cfg = EasyDict(__name__='Config: VideoLDM Decoder')
# -------------------------------distributed training--------------------------
# WORLD_SIZE is treated here as the number of machines (e.g. as exported by a
# PMI-style launcher); it defaults to '1' for single-machine runs.
pmi_world_size = int(os.getenv('WORLD_SIZE', '1'))
gpus_per_machine = torch.cuda.device_count()
# total number of training processes: one per GPU on every machine
world_size = pmi_world_size * gpus_per_machine
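# A minimal sketch of what this computes, assuming a hypothetical two-machine
# job with 8 GPUs per machine and the scheduler exporting WORLD_SIZE=2:
#
#   pmi_world_size   -> 2
#   gpus_per_machine -> 8
#   world_size       -> 16   # one rank per GPU across all machines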
# -----------------------------------------------------------------------------
# ---------------------------Dataset Parameter---------------------------------
cfg.mean = [0.5, 0.5, 0.5]
cfg.std = [0.5, 0.5, 0.5]
cfg.max_words = 1000
cfg.num_workers = 8
cfg.prefetch_factor = 2
# Placeholder
cfg.resolution = [448, 256]
cfg.vit_out_dim = 1024
cfg.vit_resolution = 336
cfg.depth_clamp = 10.0
cfg.misc_size = 384
cfg.depth_std = 20.0
cfg.save_fps = 8
cfg.frame_lens = [32, 32, 32, 1]  # candidate clip lengths (in frames)
cfg.sample_fps = [4, ]  # candidate sampling frame rates
cfg.vid_dataset = {
    'type': 'VideoBaseDataset',
    'data_list': [],
    'max_words': cfg.max_words,
    'resolution': cfg.resolution}
cfg.img_dataset = {
    'type': 'ImageBaseDataset',
    'data_list': ['laion_400m', ],
    'max_words': cfg.max_words,
    'resolution': cfg.resolution}
# per-GPU batch size, keyed by frames per clip (keys are strings)
cfg.batch_sizes = {
    str(1): 256,
    str(4): 4,
    str(8): 4,
    str(16): 4}
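# A minimal usage sketch (hypothetical trainer code, not part of this config):
#
#   frame_len = cfg.frame_lens[step % len(cfg.frame_lens)]
#   batch_size = cfg.batch_sizes[str(frame_len)]
#
# Note that cfg.frame_lens above contains 32, which has no entry in this dict,
# so a 32-frame run would need a '32' key (or a fallback) added.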
# -----------------------------------------------------------------------------
# ---------------------------Model Parameters----------------------------------
# Diffusion
cfg.Diffusion = {
    'type': 'DiffusionDDIM',
    'schedule': 'cosine',
    'schedule_param': {
        'num_timesteps': 1000,
        'cosine_s': 0.008,
        'zero_terminal_snr': True,
    },
    'mean_type': 'v',  # one of ['v', 'eps']
    'loss_type': 'mse',
    'var_type': 'fixed_small',
    'rescale_timesteps': False,
    'noise_strength': 0.1,
    'ddim_timesteps': 50
}
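# A short note on 'mean_type': 'v' (a sketch of the standard v-prediction
# target from Salimans & Ho, "Progressive Distillation"; the symbols below are
# illustrative, not names used in this repo):
#
#   v_t = sqrt(alpha_bar_t) * eps - sqrt(1 - alpha_bar_t) * x_0
#
# 'zero_terminal_snr': True rescales the schedule so that alpha_bar is exactly
# 0 at the final timestep (pure noise), a regime that v-prediction handles
# gracefully where plain eps-prediction cannot.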
cfg.ddim_timesteps = 50 # official: 250
cfg.use_div_loss = False
# classifier-free guidance
cfg.p_zero = 0.9
cfg.guide_scale = 3.0
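# A minimal sketch of how these two are typically used (illustrative only;
# p_zero is assumed to be the probability of replacing the conditioning with a
# null embedding during training, and guide_scale the inference-time weight):
#
#   training:  cond = null_cond if random.random() < cfg.p_zero else cond
#   sampling:  out = out_uncond + cfg.guide_scale * (out_cond - out_uncond)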
# clip vision encoder
cfg.vit_mean = [0.48145466, 0.4578275, 0.40821073]
cfg.vit_std = [0.26862954, 0.26130258, 0.27577711]
# sketch
cfg.sketch_mean = [0.485, 0.456, 0.406]
cfg.sketch_std = [0.229, 0.224, 0.225]
# cfg.misc_size = 256
cfg.hist_sigma = 10.0
# Model
cfg.scale_factor = 0.18215
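# A short note on scale_factor (the standard Stable Diffusion latent scaling
# convention; the vae.encode/vae.decode names below are illustrative):
#
#   z = cfg.scale_factor * vae.encode(x)      # pixels -> diffusion latents
#   x = vae.decode(z / cfg.scale_factor)      # latents -> pixels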
cfg.use_checkpoint = True
cfg.use_sharded_ddp = False
cfg.use_fsdp = False
cfg.use_fp16 = True
cfg.temporal_attention = True
cfg.UNet = {
    'type': 'UNetSD',
    'in_dim': 4,
    'dim': 320,
    'y_dim': cfg.vit_out_dim,
    'context_dim': 1024,
    'out_dim': 8,
    'dim_mult': [1, 2, 4, 4],
    'num_heads': 8,
    'head_dim': 64,
    'num_res_blocks': 2,
    'attn_scales': [1 / 1, 1 / 2, 1 / 4],  # attention at full, half and quarter latent resolution
    'dropout': 0.1,
    'temporal_attention': cfg.temporal_attention,
    'temporal_attn_times': 1,
    'use_checkpoint': cfg.use_checkpoint,
    'use_fps_condition': False,
    'use_sim_mask': False
}
# autoencoder from Stable Diffusion
cfg.guidances = []
cfg.auto_encoder = {
    'type': 'AutoencoderKL',
    'ddconfig': {
        'double_z': True,
        'z_channels': 4,
        'resolution': 256,
        'in_channels': 3,
        'out_ch': 3,
        'ch': 128,
        'ch_mult': [1, 2, 4, 4],
        'num_res_blocks': 2,
        'attn_resolutions': [],
        'dropout': 0.0,
        'video_kernel_size': [3, 1, 1]
    },
    'embed_dim': 4,
    'pretrained': 'models/v2-1_512-ema-pruned.ckpt'
}
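# A quick sanity check on the latent geometry (standard VAE arithmetic, not
# repo-specific): len(ch_mult) = 4 levels means 3 spatial downsamplings, i.e.
# a factor-8 reduction, so cfg.resolution = [448, 256] maps to 56 x 32 latents
# with z_channels = 4 channels (matching the UNet's in_dim).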
# clip embedder
cfg.embedder = {
    'type': 'FrozenOpenCLIPEmbedder',
    'layer': 'penultimate',
    'pretrained': 'models/open_clip_pytorch_model.bin'
}
# -----------------------------------------------------------------------------
# ---------------------------Training Settings---------------------------------
# training and optimizer
cfg.ema_decay = 0.9999
cfg.num_steps = 600000
cfg.lr = 5e-5
cfg.weight_decay = 0.0
cfg.betas = (0.9, 0.999)
cfg.eps = 1.0e-8
cfg.chunk_size = 16
cfg.decoder_bs = 8
cfg.alpha = 0.7
cfg.save_ckp_interval = 1000
# scheduler
cfg.warmup_steps = 10
cfg.decay_mode = 'cosine'
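# A minimal sketch of the schedule these settings describe (illustrative only;
# the repo's actual scheduler implementation may differ):
#
#   import math
#   if step < cfg.warmup_steps:
#       lr = cfg.lr * step / cfg.warmup_steps                # linear warmup
#   else:
#       t = (step - cfg.warmup_steps) / (cfg.num_steps - cfg.warmup_steps)
#       lr = 0.5 * cfg.lr * (1 + math.cos(math.pi * t))      # cosine decay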
# acceleration
cfg.use_ema = True
if world_size < 2:
    cfg.use_ema = False
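# A minimal sketch of the standard EMA update that cfg.ema_decay controls
# (illustrative; not necessarily the repo's exact implementation):
#
#   for ema_p, p in zip(ema_model.parameters(), model.parameters()):
#       ema_p.mul_(cfg.ema_decay).add_(p, alpha=1 - cfg.ema_decay)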
cfg.load_from = None
# -----------------------------------------------------------------------------
# ----------------------------Pretrain Settings---------------------------------
cfg.Pretrain = {
    'type': 'pretrain_specific_strategies',
    'fix_weight': False,
    'grad_scale': 0.2,
    'resume_checkpoint': 'models/jiuniu_0267000.pth',
    'sd_keys_path': 'models/stable_diffusion_image_key_temporal_attention_x1.json',
}
# -----------------------------------------------------------------------------
# -----------------------------Visual-------------------------------------------
# visualize videos during training and inference
cfg.viz_interval = 1000
cfg.visual_train = {
    'type': 'VisualTrainTextImageToVideo',
}
cfg.visual_inference = {
    'type': 'VisualGeneratedVideos',
}
cfg.inference_list_path = ''
# logging
cfg.log_interval = 100
# default log_dir
cfg.log_dir = 'outputs/'
# -----------------------------------------------------------------------------
# ---------------------------Others--------------------------------------------
# seed
cfg.seed = 8888
cfg.negative_prompt = 'Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms'
# -----------------------------------------------------------------------------