# Origin: https://github.com/ali-vilab/UniAnimate/commit/d7814fa44a0a1154524b92fce0e3133a2604d333
import torch
import logging
import os.path as osp
from datetime import datetime
from easydict import EasyDict
import os

cfg = EasyDict(__name__='Config: VideoLDM Decoder')

# -------------------------------distributed training--------------------------
pmi_world_size = int(os.getenv('WORLD_SIZE', 1))
gpus_per_machine = torch.cuda.device_count()
world_size = pmi_world_size * gpus_per_machine
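# Illustration (assumption, not from the original file): WORLD_SIZE is treated
# here as the machine count rather than the total rank count, e.g.
#   WORLD_SIZE=2 with 4 visible GPUs per machine -> world_size = 2 * 4 = 8 ranks.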
# -----------------------------------------------------------------------------
# ---------------------------Dataset Parameters---------------------------------
cfg.mean = [0.5, 0.5, 0.5]
cfg.std = [0.5, 0.5, 0.5]
cfg.max_words = 1000
cfg.num_workers = 8
cfg.prefetch_factor = 2
# placeholder
cfg.resolution = [448, 256]
cfg.vit_out_dim = 1024
cfg.vit_resolution = 336
cfg.depth_clamp = 10.0
cfg.misc_size = 384
cfg.depth_std = 20.0
cfg.save_fps = 8
cfg.frame_lens = [32, 32, 32, 1]
cfg.sample_fps = [4, ]
cfg.vid_dataset = {
    'type': 'VideoBaseDataset',
    'data_list': [],
    'max_words': cfg.max_words,
    'resolution': cfg.resolution}
cfg.img_dataset = {
    'type': 'ImageBaseDataset',
    'data_list': ['laion_400m', ],
    'max_words': cfg.max_words,
    'resolution': cfg.resolution}
cfg.batch_sizes = {
    str(1): 256,
    str(4): 4,
    str(8): 4,
    str(16): 4}
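# Illustration (assumption, not from the original file): cfg.batch_sizes is
# keyed by the stringified clip length from cfg.frame_lens, so the trainer is
# assumed to resolve the per-GPU batch size roughly like this:
def _batch_size_for(frame_len):
    # e.g. _batch_size_for(1) -> 256 (image batches), _batch_size_for(16) -> 4
    return cfg.batch_sizes[str(frame_len)]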
# -----------------------------------------------------------------------------
# ---------------------------Model Parameters-----------------------------------
# Diffusion
cfg.Diffusion = {
    'type': 'DiffusionDDIM',
    'schedule': 'cosine',
    'schedule_param': {
        'num_timesteps': 1000,
        'cosine_s': 0.008,
        'zero_terminal_snr': True,
    },
    'mean_type': 'v',  # one of ['v', 'eps']
    'loss_type': 'mse',
    'var_type': 'fixed_small',
    'rescale_timesteps': False,
    'noise_strength': 0.1,
    'ddim_timesteps': 50
}
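# Illustration (assumption, not from the original file): the cosine schedule
# named above, following Nichol & Dhariwal (2021), with s = cosine_s. With
# mean_type 'v' the network is assumed to predict
#   v_t = sqrt(alpha_bar_t) * eps - sqrt(1 - alpha_bar_t) * x_0.
def _cosine_alphas_cumprod(num_timesteps=1000, s=0.008):
    t = torch.arange(num_timesteps + 1, dtype=torch.float64) / num_timesteps
    f = torch.cos((t + s) / (1 + s) * torch.pi / 2) ** 2
    return f / f[0]  # alpha_bar_t, decreasing from 1 toward 0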
cfg.ddim_timesteps = 50  # official: 250
cfg.use_div_loss = False
# classifier-free guidance
cfg.p_zero = 0.9
cfg.guide_scale = 3.0
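# Illustration (assumption, not from the original file): p_zero is presumably
# the probability of dropping (zeroing) the conditioning during training; at
# sampling time the conditional and unconditional branches are blended with
# guide_scale:
def _apply_cfg(out_cond, out_uncond, scale=cfg.guide_scale):
    return out_uncond + scale * (out_cond - out_uncond)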
# clip vision encoder
cfg.vit_mean = [0.48145466, 0.4578275, 0.40821073]
cfg.vit_std = [0.26862954, 0.26130258, 0.27577711]
# sketch
cfg.sketch_mean = [0.485, 0.456, 0.406]
cfg.sketch_std = [0.229, 0.224, 0.225]
# cfg.misc_size = 256
cfg.hist_sigma = 10.0
# Model
cfg.scale_factor = 0.18215
cfg.use_checkpoint = True
cfg.use_sharded_ddp = False
cfg.use_fsdp = False
cfg.use_fp16 = True
cfg.temporal_attention = True
cfg.UNet = {
    'type': 'UNetSD',
    'in_dim': 4,
    'dim': 320,
    'y_dim': cfg.vit_out_dim,
    'context_dim': 1024,
    'out_dim': 8,
    'dim_mult': [1, 2, 4, 4],
    'num_heads': 8,
    'head_dim': 64,
    'num_res_blocks': 2,
    'attn_scales': [1 / 1, 1 / 2, 1 / 4],
    'dropout': 0.1,
    'temporal_attention': cfg.temporal_attention,
    'temporal_attn_times': 1,
    'use_checkpoint': cfg.use_checkpoint,
    'use_fps_condition': False,
    'use_sim_mask': False
}
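# Illustration (assumption, not from the original file): with dim=320 and
# dim_mult=[1, 2, 4, 4], the four UNet levels run at 320/640/1280/1280
# channels, and attn_scales enables attention at the three highest-resolution
# scales only.
_unet_widths = [cfg.UNet['dim'] * m for m in cfg.UNet['dim_mult']]  # [320, 640, 1280, 1280]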
# autoencoder from Stable Diffusion
cfg.guidances = []
cfg.auto_encoder = {
    'type': 'AutoencoderKL',
    'ddconfig': {
        'double_z': True,
        'z_channels': 4,
        'resolution': 256,
        'in_channels': 3,
        'out_ch': 3,
        'ch': 128,
        'ch_mult': [1, 2, 4, 4],
        'num_res_blocks': 2,
        'attn_resolutions': [],
        'dropout': 0.0,
        'video_kernel_size': [3, 1, 1]
    },
    'embed_dim': 4,
    'pretrained': 'models/v2-1_512-ema-pruned.ckpt'
}
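# Illustration (assumption, not from the original file): cfg.scale_factor
# (0.18215, defined above) follows the Stable Diffusion convention of scaling
# VAE latents to roughly unit variance before diffusion.
def _encode_latents(autoencoder, x):
    # .encode(...).sample() mirrors Stable Diffusion's AutoencoderKL interface.
    return autoencoder.encode(x).sample() * cfg.scale_factor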
# clip embedder
cfg.embedder = {
    'type': 'FrozenOpenCLIPEmbedder',
    'layer': 'penultimate',
    'pretrained': 'models/open_clip_pytorch_model.bin'
}
# -----------------------------------------------------------------------------
# ---------------------------Training Settings----------------------------------
# training and optimizer
cfg.ema_decay = 0.9999
cfg.num_steps = 600000
cfg.lr = 5e-5
cfg.weight_decay = 0.0
cfg.betas = (0.9, 0.999)
cfg.eps = 1.0e-8
cfg.chunk_size = 16
cfg.decoder_bs = 8
cfg.alpha = 0.7
cfg.save_ckp_interval = 1000
# scheduler
cfg.warmup_steps = 10
cfg.decay_mode = 'cosine'
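# Illustration (assumption, not from the original file): a linear-warmup plus
# cosine-decay schedule consistent with warmup_steps / decay_mode above; the
# repo's actual scheduler class may differ in detail.
def _lr_at(step, base_lr=cfg.lr, warmup=cfg.warmup_steps, total=cfg.num_steps):
    if step < warmup:
        return base_lr * step / max(warmup, 1)
    progress = (step - warmup) / max(total - warmup, 1)
    return 0.5 * base_lr * (1 + torch.cos(torch.tensor(progress * torch.pi)).item())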
# acceleration
cfg.use_ema = True
if world_size < 2:  # disable EMA when running on a single GPU
    cfg.use_ema = False
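# Illustration (assumption, not from the original file): the exponential
# moving average update implied by cfg.ema_decay; the repo's actual EMA hook
# may differ.
@torch.no_grad()
def _ema_update(ema_params, params, decay=cfg.ema_decay):
    for e, p in zip(ema_params, params):
        e.mul_(decay).add_(p, alpha=1 - decay)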
cfg.load_from = None
# -----------------------------------------------------------------------------
# ----------------------------Pretrain Settings---------------------------------
cfg.Pretrain = {
    'type': 'pretrain_specific_strategies',
    'fix_weight': False,
    'grad_scale': 0.2,
    'resume_checkpoint': 'models/jiuniu_0267000.pth',
    'sd_keys_path': 'models/stable_diffusion_image_key_temporal_attention_x1.json',
}
# -----------------------------------------------------------------------------
# -----------------------------Visualization------------------------------------
# visualize videos during training and inference
cfg.viz_interval = 1000
cfg.visual_train = {
    'type': 'VisualTrainTextImageToVideo',
}
cfg.visual_inference = {
    'type': 'VisualGeneratedVideos',
}
cfg.inference_list_path = ''
# logging
cfg.log_interval = 100
# default log_dir
cfg.log_dir = 'outputs/'
# -----------------------------------------------------------------------------
# ---------------------------Others---------------------------------------------
# seed
cfg.seed = 8888
cfg.negative_prompt = 'Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms'
# -----------------------------------------------------------------------------