Spaces:

jmanhype
/

MuseV

Runtime error

MuseV / musev /schedulers /scheduling_ddpm.py

jmanhype

Initial commit without binary files

06e9d12 3 months ago

11.2 kB

	# Copyright 2023 UC Berkeley Team and The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim

	from __future__ import annotations

	import math
	from dataclasses import dataclass
	from typing import List, Optional, Tuple, Union

	import numpy as np
	from numpy import ndarray
	import torch

	from diffusers.configuration_utils import ConfigMixin, register_to_config
	from diffusers.utils import BaseOutput
	from diffusers.utils.torch_utils import randn_tensor
	from diffusers.schedulers.scheduling_utils import (
	KarrasDiffusionSchedulers,
	SchedulerMixin,
	)
	from diffusers.schedulers.scheduling_ddpm import (
	DDPMSchedulerOutput,
	betas_for_alpha_bar,
	DDPMScheduler as DiffusersDDPMScheduler,
	)
	from ..utils.noise_util import video_fusion_noise


	class DDPMScheduler(DiffusersDDPMScheduler):
	    """
	    DDPM scheduler with a selectable source for the noise added at every reverse step.

	    This class subclasses the diffusers `DDPMScheduler`. The constructor forwards all of its arguments unchanged to
	    the parent class; `step` is overridden so that the sampling noise can be drawn either with `randn_tensor`
	    (`noise_type="random"`) or with `video_fusion_noise` (`noise_type="video_fusion"`). Check the superclass
	    documentation for the generic methods the library implements for all schedulers such as loading and saving.

	    Args:
	        num_train_timesteps (`int`, defaults to 1000):
	            The number of diffusion steps to train the model.
	        beta_start (`float`, defaults to 0.0001):
	            The starting `beta` value of inference.
	        beta_end (`float`, defaults to 0.02):
	            The final `beta` value.
	        beta_schedule (`str`, defaults to `"linear"`):
	            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
	            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
	        trained_betas (`ndarray` or `List[float]`, optional):
	            Forwarded as-is to the parent scheduler. NOTE(review): in diffusers this is an explicit array of betas
	            used instead of `beta_start`/`beta_end` -- confirm against the installed diffusers version.
	        variance_type (`str`, defaults to `"fixed_small"`):
	            Clip the variance when adding noise to the denoised sample. Choose from `fixed_small`, `fixed_small_log`,
	            `fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
	        clip_sample (`bool`, defaults to `True`):
	            Clip the predicted sample for numerical stability.
	        clip_sample_range (`float`, defaults to 1.0):
	            The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
	        prediction_type (`str`, defaults to `epsilon`, optional):
	            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
	            `sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen
	            Video](https://imagen.research.google/video/paper.pdf) paper).
	        thresholding (`bool`, defaults to `False`):
	            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
	            as Stable Diffusion.
	        dynamic_thresholding_ratio (`float`, defaults to 0.995):
	            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
	        sample_max_value (`float`, defaults to 1.0):
	            The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
	        timestep_spacing (`str`, defaults to `"leading"`):
	            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
	            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
	        steps_offset (`int`, defaults to 0):
	            An offset added to the inference steps. You can use a combination of `offset=1` and
	            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
	            Diffusion.
	    """

	    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
	    order = 1

	    @register_to_config
	    def __init__(
	        self,
	        num_train_timesteps: int = 1000,
	        beta_start: float = 0.0001,
	        beta_end: float = 0.02,
	        beta_schedule: str = "linear",
	        trained_betas: ndarray | List[float] | None = None,
	        variance_type: str = "fixed_small",
	        clip_sample: bool = True,
	        prediction_type: str = "epsilon",
	        thresholding: bool = False,
	        dynamic_thresholding_ratio: float = 0.995,
	        clip_sample_range: float = 1,
	        sample_max_value: float = 1,
	        timestep_spacing: str = "leading",
	        steps_offset: int = 0,
	    ):
	        """Forward every configuration value, in order, to the parent scheduler's constructor."""
	        super().__init__(
	            num_train_timesteps,
	            beta_start,
	            beta_end,
	            beta_schedule,
	            trained_betas,
	            variance_type,
	            clip_sample,
	            prediction_type,
	            thresholding,
	            dynamic_thresholding_ratio,
	            clip_sample_range,
	            sample_max_value,
	            timestep_spacing,
	            steps_offset,
	        )

	    def step(
	        self,
	        model_output: torch.FloatTensor,
	        timestep: int,
	        sample: torch.FloatTensor,
	        generator: Optional[torch.Generator] = None,
	        return_dict: bool = True,
	        w_ind_noise: float = 0.5,
	        noise_type: str = "random",
	    ) -> Union[DDPMSchedulerOutput, Tuple]:
	        """
	        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
	        process from the learned model outputs (most often the predicted noise).

	        Args:
	            model_output (`torch.FloatTensor`):
	                The direct output from learned diffusion model.
	            timestep (`int`):
	                The current discrete timestep in the diffusion chain.
	            sample (`torch.FloatTensor`):
	                A current instance of a sample created by the diffusion process.
	            generator (`torch.Generator`, optional):
	                A random number generator, passed to whichever noise function is selected by `noise_type`.
	            return_dict (`bool`, optional, defaults to `True`):
	                Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`.
	            w_ind_noise (`float`, optional, defaults to 0.5):
	                Forwarded to `video_fusion_noise` as `w_ind_noise`. Only used when `noise_type="video_fusion"`.
	            noise_type (`str`, optional, defaults to `"random"`):
	                How the noise added for `timestep > 0` is drawn: `"random"` uses `randn_tensor` with the shape,
	                dtype and device of `model_output`; `"video_fusion"` uses `video_fusion_noise`.

	        Returns:
	            [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`:
	                If return_dict is `True`, [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] is returned, otherwise a
	                tuple is returned where the first element is the sample tensor.

	        Raises:
	            ValueError: If `prediction_type` in the config is not one of `epsilon`, `sample` or `v_prediction`, or
	                if noise has to be drawn (`timestep > 0`) and `noise_type` is not `random` or `video_fusion`.
	        """
	        t = timestep

	        prev_t = self.previous_timestep(t)

	        # Models that also predict the variance emit twice as many channels; split them off here.
	        if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in [
	            "learned",
	            "learned_range",
	        ]:
	            model_output, predicted_variance = torch.split(
	                model_output, sample.shape[1], dim=1
	            )
	        else:
	            predicted_variance = None

	        # 1. compute alphas, betas
	        alpha_prod_t = self.alphas_cumprod[t]
	        alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
	        beta_prod_t = 1 - alpha_prod_t
	        beta_prod_t_prev = 1 - alpha_prod_t_prev
	        current_alpha_t = alpha_prod_t / alpha_prod_t_prev
	        current_beta_t = 1 - current_alpha_t

	        # 2. compute predicted original sample from predicted noise also called
	        # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
	        if self.config.prediction_type == "epsilon":
	            pred_original_sample = (
	                sample - beta_prod_t ** (0.5) * model_output
	            ) / alpha_prod_t ** (0.5)
	        elif self.config.prediction_type == "sample":
	            pred_original_sample = model_output
	        elif self.config.prediction_type == "v_prediction":
	            # x_0 = sqrt(alpha_bar_t) * x_t - sqrt(1 - alpha_bar_t) * v
	            pred_original_sample = (alpha_prod_t**0.5) * sample - (
	                beta_prod_t**0.5
	            ) * model_output
	        else:
	            raise ValueError(
	                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
	                " `v_prediction` for the DDPMScheduler."
	            )

	        # 3. Clip or threshold "predicted x_0"
	        if self.config.thresholding:
	            pred_original_sample = self._threshold_sample(pred_original_sample)
	        elif self.config.clip_sample:
	            pred_original_sample = pred_original_sample.clamp(
	                -self.config.clip_sample_range, self.config.clip_sample_range
	            )

	        # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
	        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
	        pred_original_sample_coeff = (
	            alpha_prod_t_prev ** (0.5) * current_beta_t
	        ) / beta_prod_t
	        current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t

	        # 5. Compute predicted previous sample µ_t
	        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
	        pred_prev_sample = (
	            pred_original_sample_coeff * pred_original_sample
	            + current_sample_coeff * sample
	        )

	        # 6. Add noise (skipped at t == 0, where the variance term stays 0)
	        variance = 0
	        if t > 0:
	            if noise_type == "random":
	                variance_noise = randn_tensor(
	                    model_output.shape,
	                    dtype=model_output.dtype,
	                    device=model_output.device,
	                    generator=generator,
	                )
	            elif noise_type == "video_fusion":
	                variance_noise = video_fusion_noise(
	                    model_output, w_ind_noise=w_ind_noise, generator=generator
	                )
	            else:
	                # Fail loudly instead of hitting an unbound `variance_noise` further down.
	                raise ValueError(
	                    f"noise_type given as {noise_type} must be one of `random` or `video_fusion`"
	                    " for the DDPMScheduler."
	                )

	            if self.variance_type == "fixed_small_log":
	                # `_get_variance` result is used directly as the noise scale for this variance type.
	                variance = (
	                    self._get_variance(t, predicted_variance=predicted_variance)
	                    * variance_noise
	                )
	            elif self.variance_type == "learned_range":
	                # `_get_variance` result is treated as a log-variance here: std = exp(0.5 * log_var).
	                variance = self._get_variance(t, predicted_variance=predicted_variance)
	                variance = torch.exp(0.5 * variance) * variance_noise
	            else:
	                variance = (
	                    self._get_variance(t, predicted_variance=predicted_variance) ** 0.5
	                ) * variance_noise

	        pred_prev_sample = pred_prev_sample + variance

	        if not return_dict:
	            return (pred_prev_sample,)

	        return DDPMSchedulerOutput(
	            prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample
	        )