artificial-styletts2 / audiocraft /conditioners.py

AudioGen class

54adc39 4 months ago

9.85 kB

	from collections import defaultdict
	from dataclasses import dataclass, field
	import logging
	import random
	import typing as tp
	import warnings
	import soundfile
	from transformers import T5EncoderModel, T5Tokenizer # type: ignore
	import torch
	from torch import nn
	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)
	TextCondition = tp.Optional[str] # a text condition is a string, or None if it doesn't exist
	ConditionType = tp.Tuple[torch.Tensor, torch.Tensor] # (condition, mask) pair




	class JointEmbedCondition(tp.NamedTuple):
	    """Immutable record grouping a waveform batch with its paired text and metadata.

	    Fields, in positional order: ``wav``, ``text``, ``length``, ``sample_rate``,
	    ``path`` and ``seek_time``. Only the last two have defaults.

	    NOTE(review): the ``path`` and ``seek_time`` defaults are single list objects
	    shared by every instance that relies on them; treat them as read-only.
	    """

	    # Audio tensor for the batch.
	    wav: torch.Tensor
	    # One optional text entry per item.
	    text: tp.List[tp.Optional[str]]
	    # Tensor of lengths associated with ``wav``.
	    length: torch.Tensor
	    # One sample rate per item.
	    sample_rate: tp.List[int]
	    # Optional source path per item (empty list when not provided).
	    path: tp.List[tp.Optional[str]] = []
	    # Optional seek time per item (empty list when not provided).
	    seek_time: tp.List[tp.Optional[float]] = []


	@dataclass
	class ConditioningAttributes:
	    """Per-sample conditioning attributes, grouped by kind.

	    Each of ``text``, ``wav`` and ``joint_embed`` maps an attribute name to its
	    value for one sample. The object can be indexed by kind name
	    (``attrs["text"]``) and converted to and from a flat ``"<kind>.<name>"`` dict.
	    """

	    text: tp.Dict[str, tp.Optional[str]] = field(default_factory=dict)
	    wav: tp.Dict[str, tp.Optional[str]] = field(default_factory=dict)
	    joint_embed: tp.Dict[str, "JointEmbedCondition"] = field(default_factory=dict)

	    def __getitem__(self, item):
	        """Return the attribute dictionary named ``item`` (e.g. ``"text"``)."""
	        return getattr(self, item)

	    @property
	    def text_attributes(self):
	        """Names of the text attributes."""
	        return self.text.keys()

	    @property
	    def wav_attributes(self):
	        """Names of the wav attributes."""
	        return self.wav.keys()

	    @property
	    def joint_embed_attributes(self):
	        """Names of the joint-embedding attributes."""
	        return self.joint_embed.keys()

	    @property
	    def attributes(self):
	        """Mapping from each kind name to the names of its attributes."""
	        return {
	            "text": self.text_attributes,
	            "wav": self.wav_attributes,
	            "joint_embed": self.joint_embed_attributes,
	        }

	    def to_flat_dict(self):
	        """Flatten all attributes into a single dict keyed by ``"<kind>.<name>"``."""
	        return {
	            **{f"text.{k}": v for k, v in self.text.items()},
	            **{f"wav.{k}": v for k, v in self.wav.items()},
	            **{f"joint_embed.{k}": v for k, v in self.joint_embed.items()},
	        }

	    @classmethod
	    def from_flat_dict(cls, x):
	        """Rebuild an instance from a dict produced by :meth:`to_flat_dict`.

	        Args:
	            x (dict): Mapping from ``"<kind>.<name>"`` keys to values.
	        Returns:
	            ConditioningAttributes: New instance holding the given values.
	        Raises:
	            ValueError: If a key contains no ``"."`` separator.
	            AttributeError: If ``<kind>`` is not an attribute of this class.
	        """
	        out = cls()
	        for k, v in x.items():
	            # Split on the first dot only: the attribute name itself may contain
	            # dots, and to_flat_dict() emits such keys unchanged.
	            kind, att = k.split(".", 1)
	            out[kind][att] = v
	        return out


	class Tokenizer:
	    """Abstract tokenizer interface.

	    Exists so that more advanced tokenizers can be introduced later; this base
	    class only fixes the call signature and provides no implementation.
	    """

	    def __call__(self, texts: tp.List[tp.Optional[str]]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
	        """Tokenize ``texts``; always raises here because subclasses must override it."""
	        raise NotImplementedError



	class T5Conditioner(nn.Module):
	    """Text conditioner built on a pretrained T5 encoder.

	    Text is tokenized with ``T5Tokenizer``, encoded with ``T5EncoderModel`` and
	    linearly projected from the encoder width (``MODELS_DIMS[name]``) to
	    ``output_dim``.

	    Args:
	        name (str): Pretrained model name; must be one of ``MODELS``.
	        output_dim (int): Size of the projected embeddings.
	        device (str): Device the tokenized inputs are moved to (and the encoder,
	            when ``finetune`` is False).
	        word_dropout (float): Probability of dropping each space-separated word
	            while the module is in training mode.
	        normalize_text (bool): Whether to run a text normalizer before tokenizing.
	        finetune (bool): If True the encoder is registered as a submodule;
	            otherwise it is stored outside the module registry so it is not part
	            of saved checkpoints.
	    """

	    MODELS = ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b",
	              "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large",
	              "google/flan-t5-xl", "google/flan-t5-xxl"]
	    MODELS_DIMS = {
	        "t5-small": 512,
	        "t5-base": 768,
	        "t5-large": 1024,
	        "t5-3b": 1024,
	        "t5-11b": 1024,
	        "google/flan-t5-small": 512,
	        "google/flan-t5-base": 768,
	        "google/flan-t5-large": 1024,
	        # Every name accepted by MODELS needs an entry here, otherwise __init__
	        # passes the assert and then fails with a KeyError.
	        "google/flan-t5-xl": 2048,
	        "google/flan-t5-xxl": 4096,
	        # Legacy keys, kept so existing lookups keep working.
	        "google/flan-t5-3b": 1024,
	        "google/flan-t5-11b": 1024,
	    }

	    def __init__(self,
	                 name: str,
	                 output_dim: int,
	                 device: str,
	                 word_dropout: float = 0.,
	                 normalize_text: bool = False,
	                 finetune=False):
	        logger.debug("finetune=%s", finetune)
	        assert name in self.MODELS, f"Unrecognized t5 model name (should in {self.MODELS})"
	        super().__init__()
	        self.dim = self.MODELS_DIMS[name]
	        self.output_dim = output_dim
	        self.output_proj = nn.Linear(self.dim, output_dim)
	        self.device = device
	        self.name = name
	        self.word_dropout = word_dropout

	        # Temporarily silence logging up to ERROR while loading, because T5 emits
	        # noisy messages otherwise (https://gist.github.com/simon-weber/7853144).
	        previous_level = logging.root.manager.disable
	        logging.disable(logging.ERROR)
	        with warnings.catch_warnings():
	            warnings.simplefilter("ignore")
	            try:
	                self.t5_tokenizer = T5Tokenizer.from_pretrained(name)
	                t5 = T5EncoderModel.from_pretrained(name).eval()
	            finally:
	                # Always restore the previous logging threshold, even on failure.
	                logging.disable(previous_level)
	        if finetune:
	            # NOTE(review): forward() always runs the encoder under
	            # torch.no_grad(), so no gradients reach these parameters.
	            self.t5 = t5
	        else:
	            # Writing to __dict__ bypasses nn.Module registration, which keeps
	            # the T5 weights out of the saved checkpoint.
	            self.__dict__['t5'] = t5.to(device)

	        self.normalize_text = normalize_text
	        if normalize_text:
	            # NOTE(review): WhiteSpaceTokenizer is not defined in the visible part
	            # of this file; confirm it is importable before enabling this option.
	            self.text_normalizer = WhiteSpaceTokenizer(1, lemma=True, stopwords=True)

	    def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Dict[str, torch.Tensor]:
	        """Tokenize a batch of optional strings.

	        Args:
	            x (list of str, optional): One entry per sample; ``None`` marks a
	                missing text.
	        Returns:
	            The tokenizer output moved to ``self.device``. The attention mask of
	            every sample whose text is empty is zeroed.
	        """
	        # A sample without this attribute is tokenized as the empty string.
	        entries: tp.List[str] = [xi if xi is not None else "" for xi in x]
	        if self.normalize_text:
	            _, _, entries = self.text_normalizer(entries, return_text=True)
	        if self.word_dropout > 0. and self.training:
	            new_entries = []
	            for entry in entries:
	                words = [word for word in entry.split(" ") if random.random() >= self.word_dropout]
	                new_entries.append(" ".join(words))
	            entries = new_entries

	        # Computed after dropout, so entries emptied by dropout are masked too.
	        empty_idx = torch.LongTensor([i for i, xi in enumerate(entries) if xi == ""])

	        inputs = self.t5_tokenizer(entries, return_tensors='pt', padding=True).to(self.device)
	        mask = inputs['attention_mask']
	        mask[empty_idx, :] = 0  # zero-out rows whose input text is non-existent
	        return inputs

	    def forward(self, inputs):
	        """Encode tokenized inputs and project them to ``output_dim``.

	        Args:
	            inputs: Tokenized batch as returned by :meth:`tokenize`; must contain
	                ``'attention_mask'``.
	        Returns:
	            tuple: ``(embeds, mask)`` where ``embeds`` is the projected encoder
	            output multiplied by the mask, and ``mask`` is the attention mask.
	        """
	        mask = inputs['attention_mask']
	        with torch.no_grad():
	            embeds = self.t5(**inputs).last_hidden_state
	        # Match the projection weight's dtype/device before projecting.
	        embeds = self.output_proj(embeds.to(self.output_proj.weight))
	        embeds = (embeds * mask.unsqueeze(-1))
	        logger.debug("embeds.dtype=%s", embeds.dtype)
	        return embeds, mask








	class ConditioningProvider(nn.Module):
	    """Holds the configured conditioners and runs them over batches of attributes.

	    Only text conditions (conditioners that are ``T5Conditioner`` instances) are
	    collated and tokenized by this class.

	    Args:
	        conditioners: Mapping from attribute name to conditioner module; wrapped
	            in an ``nn.ModuleDict``.
	    """

	    def __init__(self,
	                 conditioners):
	        super().__init__()
	        self.conditioners = nn.ModuleDict(conditioners)

	    @property
	    def text_conditions(self):
	        """Names of the conditioners that are ``T5Conditioner`` instances."""
	        return [k for k, v in self.conditioners.items() if isinstance(v, T5Conditioner)]

	    def tokenize(self, inputs: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.Any]:
	        """Tokenize the text attributes of a batch of samples.

	        Args:
	            inputs (list of ConditioningAttributes): One entry per sample.
	        Returns:
	            dict: Maps each text attribute name to the output of the matching
	            conditioner's ``tokenize``.
	        """
	        output = {}
	        text = self._collate_text(inputs)
	        for attribute, batch in text.items():
	            output[attribute] = self.conditioners[attribute].tokenize(batch)
	        # Debug-level only: this dumps the whole tokenized batch.
	        logger.debug("ConditioningProvider.tokenize output=%r", output)
	        return output

	    def forward(self, tokenized: tp.Dict[str, tp.Any]) -> tp.Dict[str, ConditionType]:
	        """Compute pairs of `(embedding, mask)` using the configured conditioners and the tokenized representations.
	        The output is for example:
	        {
	            "genre": (torch.Tensor([B, 1, D_genre]), torch.Tensor([B, 1])),
	            "description": (torch.Tensor([B, T_desc, D_desc]), torch.Tensor([B, T_desc])),
	            ...
	        }

	        Args:
	            tokenized (dict): Dict of tokenized representations as returned by `tokenize()`.
	        """
	        output = {}
	        for attribute, inputs in tokenized.items():
	            condition, mask = self.conditioners[attribute](inputs)
	            output[attribute] = (condition, mask)
	        return output

	    def _collate_text(self, samples):
	        """Given a list of ConditioningAttributes objects, compile a dictionary where the keys
	        are the attributes and the values are the aggregated input per attribute.
	        For example:
	        Input:
	        [
	            ConditioningAttributes(text={"genre": "Rock", "description": "A rock song with a guitar solo"}, wav=...),
	            ConditioningAttributes(text={"genre": "Hip-hop", "description": "A hip-hop verse"}, wav=...),
	        ]
	        Output:
	        {
	            "genre": ["Rock", "Hip-hop"],
	            "description": ["A rock song with a guitar solo", "A hip-hop verse"]
	        }

	        Args:
	            samples (list of ConditioningAttributes): List of ConditioningAttributes samples.
	        Returns:
	            dict[str, list[str, optional]]: A dictionary mapping an attribute name to text batch.
	        Raises:
	            KeyError: If a sample's ``text`` lacks one of the text condition names.
	        """
	        out: tp.Dict[str, tp.List[tp.Optional[str]]] = defaultdict(list)
	        texts = [x.text for x in samples]
	        for text in texts:
	            for condition in self.text_conditions:
	                out[condition].append(text[condition])
	        return out






	class ConditionFuser(nn.Module):
	    """Records how each condition should be fused and hands back the description embedding.

	    Args:
	        fuse2cond (dict): Maps a fusing method (one of ``FUSING_METHODS``) to the
	            list of condition names that use it.
	        cross_attention_pos_emb (bool): Stored on the instance as given.
	        cross_attention_pos_emb_scale (float): Stored on the instance as given.
	    """

	    FUSING_METHODS = ["sum", "prepend", "cross", "input_interpolate"]

	    def __init__(self, fuse2cond: tp.Dict[str, tp.List[str]], cross_attention_pos_emb: bool = False,
	                 cross_attention_pos_emb_scale: float = 1.0):
	        super().__init__()
	        methods_ok = all(method in self.FUSING_METHODS for method in fuse2cond)
	        assert methods_ok, f"Got invalid fuse method, allowed methods: {self.FUSING_METHODS}"
	        self.cross_attention_pos_emb = cross_attention_pos_emb
	        self.cross_attention_pos_emb_scale = cross_attention_pos_emb_scale
	        self.fuse2cond: tp.Dict[str, tp.List[str]] = fuse2cond
	        # Reverse lookup: condition name -> fusing method. If a condition is listed
	        # under several methods, the last one encountered wins.
	        self.cond2fuse: tp.Dict[str, str] = {
	            cond_name: method
	            for method, cond_names in fuse2cond.items()
	            for cond_name in cond_names
	        }

	    def forward(
	            self,
	            input,
	            conditions):
	        """Return ``input`` untouched together with the ``'description'`` embedding.

	        ``conditions['description']`` is indexed at position 0; the remaining
	        elements of that entry are ignored.
	        """
	        description_entry = conditions['description']
	        cross_attention_output = description_entry[0]
	        return input, cross_attention_output