import math
import tempfile
import warnings
from pathlib import Path
import cv2
import librosa
import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm
from pydantic import BaseModel
from .diff_talking_head import DiffTalkingHead
from .utils import NullableArgs, coef_dict_to_vertices, get_coef_dict
from .utils.media import combine_video_and_audio, convert_video, reencode_audio
warnings.filterwarnings('ignore', message='PySoundFile failed. Trying audioread instead.')


class DiffPoseTalkConfig(BaseModel):
    """Paths and guidance scales used for DiffPoseTalk inference."""
    no_context_audio_feat: bool = False
    model_path: str = "pretrained_models/diffposetalk/iter_0110000.pt"  # DPT/head-SA-hubert-WM
    coef_stats: str = "pretrained_models/diffposetalk/stats_train.npz"
    style_path: str = "pretrained_models/diffposetalk/style/L4H4-T0.1-BS32/iter_0034000/normal.npy"
    dynamic_threshold_ratio: float = 0.99
    dynamic_threshold_min: float = 1.0
    dynamic_threshold_max: float = 4.0
    scale_audio: float = 1.15
    scale_style: float = 3.0
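
# Usage sketch (assumption, not part of the original file): the default paths above
# assume the bundled `pretrained_models/diffposetalk/` layout. Override individual
# fields when the checkpoints live elsewhere, e.g.
#   cfg = DiffPoseTalkConfig(model_path="checkpoints/iter_0110000.pt", scale_audio=1.2)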


class DiffPoseTalk:
    """Wrapper around DiffTalkingHead for audio-driven motion-coefficient inference."""

    def __init__(self, config: DiffPoseTalkConfig = DiffPoseTalkConfig(), device="cuda"):
        self.cfg = config
        self.device = device
        self.no_context_audio_feat = self.cfg.no_context_audio_feat
        model_data = torch.load(self.cfg.model_path, map_location=self.device)
        self.model_args = NullableArgs(model_data['args'])
        self.model = DiffTalkingHead(self.model_args, self.device)
        # Drop the saved positional-encoding buffer so the freshly built one is kept
        # (the checkpoint is therefore loaded with strict=False).
        model_data['model'].pop('denoising_net.TE.pe')
        self.model.load_state_dict(model_data['model'], strict=False)
        self.model.to(self.device)
        self.model.eval()

        self.use_indicator = self.model_args.use_indicator
        self.rot_repr = self.model_args.rot_repr
        self.predict_head_pose = not self.model_args.no_head_pose
        if self.model.use_style:
            # Keep only '<experiment>/<iteration>' from the style encoder checkpoint path,
            # e.g. 'L4H4-T0.1-BS32/iter_0034000'; style features are resolved under it.
            style_dir = Path(self.model_args.style_enc_ckpt)
            style_dir = Path(*style_dir.with_suffix('').parts[-3::2])
            self.style_dir = style_dir

        # sequence
        self.n_motions = self.model_args.n_motions
        self.n_prev_motions = self.model_args.n_prev_motions
        self.fps = self.model_args.fps
        self.audio_unit = 16000. / self.fps  # num of audio samples per frame
        self.n_audio_samples = round(self.audio_unit * self.n_motions)
        self.pad_mode = self.model_args.pad_mode

        self.coef_stats = dict(np.load(self.cfg.coef_stats))
        self.coef_stats = {k: torch.from_numpy(v).to(self.device) for k, v in self.coef_stats.items()}

        if self.cfg.dynamic_threshold_ratio > 0:
            self.dynamic_threshold = (self.cfg.dynamic_threshold_ratio, self.cfg.dynamic_threshold_min,
                                      self.cfg.dynamic_threshold_max)
        else:
            self.dynamic_threshold = None

    def infer_from_file(self, audio_path, shape_coef):
        """Run inference on an audio file using the configured style and per-condition guidance scales."""
        n_repetitions = 1
        cfg_mode = None
        cfg_cond = self.model.guiding_conditions
        cfg_scale = []
        for cond in cfg_cond:
            if cond == 'audio':
                cfg_scale.append(self.cfg.scale_audio)
            elif cond == 'style':
                cfg_scale.append(self.cfg.scale_style)

        coef_dict = self.infer_coeffs(audio_path, shape_coef, self.cfg.style_path, n_repetitions,
                                      cfg_mode, cfg_cond, cfg_scale, include_shape=True)
        return coef_dict

    @torch.no_grad()
    def infer_coeffs(self, audio, shape_coef, style_feat=None, n_repetitions=1,
                     cfg_mode=None, cfg_cond=None, cfg_scale=1.15, include_shape=False):
        # Builds a coef dict of shape dict[str, (n_repetitions, L, *)], then converts it
        # to a per-frame list via coef_to_a1_format().
        # Step 1: Preprocessing
        # Preprocess audio
        if isinstance(audio, (str, Path)):
            audio, _ = librosa.load(audio, sr=16000, mono=True)
        if isinstance(audio, np.ndarray):
            audio = torch.from_numpy(audio).to(self.device)
        assert audio.ndim == 1, 'Audio must be 1D tensor.'
        audio_mean, audio_std = torch.mean(audio), torch.std(audio)
        audio = (audio - audio_mean) / (audio_std + 1e-5)

        # Preprocess shape coefficient
        if isinstance(shape_coef, (str, Path)):
            shape_coef = np.load(shape_coef)
            if not isinstance(shape_coef, np.ndarray):
                shape_coef = shape_coef['shape']
        if isinstance(shape_coef, np.ndarray):
            shape_coef = torch.from_numpy(shape_coef).float().to(self.device)
        assert shape_coef.ndim <= 2, 'Shape coefficient must be 1D or 2D tensor.'
        if shape_coef.ndim > 1:
            # use the first frame as the shape coefficient
            shape_coef = shape_coef[0]
        original_shape_coef = shape_coef.clone()
        if self.coef_stats is not None:
            shape_coef = (shape_coef - self.coef_stats['shape_mean']) / self.coef_stats['shape_std']
        shape_coef = shape_coef.unsqueeze(0).expand(n_repetitions, -1)

        # Preprocess style feature if given
        if style_feat is not None:
            assert self.model.use_style
            if isinstance(style_feat, (str, Path)):
                style_feat = Path(style_feat)
                if not style_feat.exists() and not style_feat.is_absolute():
                    style_feat = style_feat.parent / self.style_dir / style_feat.name
                style_feat = np.load(style_feat)
                if not isinstance(style_feat, np.ndarray):
                    style_feat = style_feat['style']
            if isinstance(style_feat, np.ndarray):
                style_feat = torch.from_numpy(style_feat).float().to(self.device)
            assert style_feat.ndim == 1, 'Style feature must be 1D tensor.'
            style_feat = style_feat.unsqueeze(0).expand(n_repetitions, -1)

        # Step 2: Predict motion coef
        # divide into synthesize units and do synthesize
        clip_len = int(len(audio) / 16000 * self.fps)
        stride = self.n_motions
        if clip_len <= self.n_motions:
            n_subdivision = 1
        else:
            n_subdivision = math.ceil(clip_len / stride)

        # Prepare audio input
        n_padding_audio_samples = self.n_audio_samples * n_subdivision - len(audio)
        n_padding_frames = math.ceil(n_padding_audio_samples / self.audio_unit)
        if n_padding_audio_samples > 0:
            if self.pad_mode == 'zero':
                padding_value = 0
            elif self.pad_mode == 'replicate':
                padding_value = audio[-1]
            else:
                raise ValueError(f'Unknown pad mode: {self.pad_mode}')
            audio = F.pad(audio, (0, n_padding_audio_samples), value=padding_value)

        if not self.no_context_audio_feat:
            audio_feat = self.model.extract_audio_feature(audio.unsqueeze(0), self.n_motions * n_subdivision)

        # Generate `self.n_motions` new frames at one time, and use the last `self.n_prev_motions` frames
        # from the previous generation as the initial motion condition
        coef_list = []
        for i in range(0, n_subdivision):
            start_idx = i * stride
            end_idx = start_idx + self.n_motions
            indicator = torch.ones((n_repetitions, self.n_motions)).to(self.device) if self.use_indicator else None
            if indicator is not None and i == n_subdivision - 1 and n_padding_frames > 0:
                indicator[:, -n_padding_frames:] = 0
            if not self.no_context_audio_feat:
                audio_in = audio_feat[:, start_idx:end_idx].expand(n_repetitions, -1, -1)
            else:
                audio_in = audio[round(start_idx * self.audio_unit):round(end_idx * self.audio_unit)].unsqueeze(0)

            # generate motion coefficients
            if i == 0:
                # -> (N, L, d_motion=n_code_per_frame * code_dim)
                motion_feat, noise, prev_audio_feat = self.model.sample(audio_in, shape_coef, style_feat,
                                                                        indicator=indicator, cfg_mode=cfg_mode,
                                                                        cfg_cond=cfg_cond, cfg_scale=cfg_scale,
                                                                        dynamic_threshold=self.dynamic_threshold)
            else:
                motion_feat, noise, prev_audio_feat = self.model.sample(audio_in, shape_coef, style_feat,
                                                                        prev_motion_feat, prev_audio_feat, noise,
                                                                        indicator=indicator, cfg_mode=cfg_mode,
                                                                        cfg_cond=cfg_cond, cfg_scale=cfg_scale,
                                                                        dynamic_threshold=self.dynamic_threshold)
            prev_motion_feat = motion_feat[:, -self.n_prev_motions:].clone()
            prev_audio_feat = prev_audio_feat[:, -self.n_prev_motions:]

            motion_coef = motion_feat
            if i == n_subdivision - 1 and n_padding_frames > 0:
                motion_coef = motion_coef[:, :-n_padding_frames]  # delete padded frames
            coef_list.append(motion_coef)

        motion_coef = torch.cat(coef_list, dim=1)

        # Step 3: restore to coef dict
        coef_dict = get_coef_dict(motion_coef, None, self.coef_stats, self.predict_head_pose, self.rot_repr)
        if include_shape:
            coef_dict['shape'] = original_shape_coef[None, None].expand(n_repetitions, motion_coef.shape[1], -1)
        return self.coef_to_a1_format(coef_dict)

    def coef_to_a1_format(self, coef_dict):
        """Convert the batched coef dict into a list of per-frame parameter dicts."""
        n_frames = coef_dict['exp'].shape[1]
        new_coef_dict = []
        for i in range(n_frames):
            new_coef_dict.append({
                "expression_params": coef_dict["exp"][0, i:i+1],
                "jaw_params": coef_dict["pose"][0, i:i+1, 3:],
                "eye_pose_params": torch.zeros(1, 6).type_as(coef_dict["pose"]),
                "pose_params": coef_dict["pose"][0, i:i+1, :3],
                "eyelid_params": None
            })
        return new_coef_dict

    @staticmethod
    def _pad_coef(coef, n_frames, elem_ndim=1):
        if coef.ndim == elem_ndim:
            coef = coef[None]
        elem_shape = coef.shape[1:]
        if coef.shape[0] >= n_frames:
            new_coef = coef[:n_frames]
        else:
            # repeat the last coef frame
            new_coef = torch.cat([coef, coef[[-1]].expand(n_frames - coef.shape[0], *elem_shape)], dim=0)
        return new_coef  # (n_frames, *elem_shape)
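

# ---------------------------------------------------------------------------
# Usage sketch (assumption, not part of the original module): drives the class
# end to end. "audio.wav" (16 kHz speech) and "shape.npy" (shape coefficients)
# are placeholder inputs; the checkpoint and stats paths come from
# DiffPoseTalkConfig above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    dpt = DiffPoseTalk(DiffPoseTalkConfig(),
                       device="cuda" if torch.cuda.is_available() else "cpu")
    frames = dpt.infer_from_file("audio.wav", "shape.npy")
    print(f"Generated {len(frames)} frames; keys per frame: {sorted(frames[0].keys())}")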