# Voila-demo / model.py
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union, Dict, Any
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from transformers.cache_utils import Cache, DynamicCache
from transformers.utils import ModelOutput, logging
from transformers.models.llama.modeling_llama import LlamaModel, LlamaPreTrainedModel
from audio_transformer import AudioTransformer
logger = logging.get_logger(__name__)
# Copied from https://github.com/pytorch/audio/blob/main/src/torchaudio/models/wav2vec2/components.py#L43
class LayerNorm(torch.nn.LayerNorm):
"""Layer norm with transpose"""
def forward(self, input: torch.Tensor) -> torch.Tensor:
x = input.transpose(-2, -1)
x = torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
x = x.transpose(-2, -1)
return x
# Copied from https://github.com/pytorch/audio/blob/main/src/torchaudio/models/wav2vec2/components.py#L53
class ConvLayerBlock(torch.nn.Module):
"""Convolution unit of FeatureExtractor"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int,
stride: int,
bias: bool,
layer_norm: Optional[torch.nn.Module],
):
super().__init__()
self.kernel_size = kernel_size
self.stride = stride
self.layer_norm = layer_norm
self.conv = torch.nn.Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
bias=bias,
)
    def forward(
        self,
        x: torch.Tensor,
    ) -> torch.Tensor:
        """
        Args:
            x (Tensor): Shape: ``[batch, in_channels, in_frames]``.
        Returns:
            Tensor: Shape ``[batch, out_channels, out_frames]``.
        """
x = self.conv(x)
if self.layer_norm is not None:
x = self.layer_norm(x)
x = torch.nn.functional.gelu(x)
return x
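
# A minimal sketch (not part of the original model) of how each ConvLayerBlock
# shortens the time axis: the Conv1d above uses no padding and no dilation, so
# the output length follows the standard formula below. Handy when reasoning
# about the frame counts produced by FeatureExtractor.
def _conv_out_frames(in_frames: int, kernel_size: int, stride: int) -> int:
    """Output frame count of an unpadded, undilated Conv1d layer."""
    return (in_frames - kernel_size) // stride + 1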
# Copied from https://github.com/pytorch/audio/blob/main/src/torchaudio/models/wav2vec2/components.py#L146
class FeatureProjection(torch.nn.Module):
"""Layer that connects FeatureExtractor and Encoder
Projects features to encoder dimension.
Args:
in_features (int): Input feature dim.
out_features (int): Output feature dim.
dropout (float): Dropout probability.
"""
def __init__(
self,
in_features: int,
out_features: int,
dropout=0.1,
):
super().__init__()
self.layer_norm = torch.nn.LayerNorm(in_features)
self.projection = torch.nn.Linear(
in_features,
out_features,
)
self.dropout = torch.nn.Dropout(dropout)
def forward(self, x):
"""
Args:
x (Tensor):
Feature Tensor. shape: ``[batch, frame, in_feature]``
Returns:
Tensor: Projected features. ``[batch, frame, out_feature]``.
"""
x = self.layer_norm(x)
x = self.projection(x)
x = self.dropout(x)
return x
# Modified from https://github.com/pytorch/audio/blob/main/src/torchaudio/models/wav2vec2/components.py#L102
class FeatureExtractor(torch.nn.Module):
    """Extract features from raw audio
    Args:
        shapes (list of tuple): ``(out_channels, kernel_size, stride)`` for each conv layer.
        bias (bool): Whether the conv layers use a bias term.
        norm_mode (str): ``"group_norm"`` or ``"layer_norm"``.
    """
def __init__(
self,
shapes=[(512, 10, 5), (512, 3, 2), (512, 3, 2), (512, 3, 2), (512, 3, 2), (512, 2, 2), (512, 2, 2)],
bias=False,
norm_mode="group_norm",
):
super().__init__()
if norm_mode not in ["group_norm", "layer_norm"]:
raise ValueError("Invalid norm mode")
blocks = []
in_channels = 1
for i, (out_channels, kernel_size, stride) in enumerate(shapes):
normalization = None
if norm_mode == "group_norm" and i == 0:
normalization = torch.nn.GroupNorm(
num_groups=out_channels,
num_channels=out_channels,
affine=True,
)
elif norm_mode == "layer_norm":
normalization = LayerNorm(
normalized_shape=out_channels,
elementwise_affine=True,
)
blocks.append(
ConvLayerBlock(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
bias=bias,
layer_norm=normalization,
)
)
in_channels = out_channels
self.conv_layers = torch.nn.ModuleList(blocks)
    def forward(
        self,
        x: torch.Tensor,
    ) -> torch.Tensor:
        """
        Args:
            x (Tensor):
                Input Tensor representing a batch of audio,
                shape: ``[batch, time]``.
        Returns:
            Tensor:
                The resulting feature, shape: ``[batch, frame, feature]``.
        """
if x.ndim != 2:
raise ValueError(f"Expected the input Tensor to be 2D (batch, time). Found: {list(x.shape)}")
x = x.unsqueeze(1) # (batch, channel==1, frame)
for layer in self.conv_layers:
x = layer(x) # (batch, feature, frame)
x = x.transpose(1, 2) # (batch, frame, feature)
return x
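
# A minimal sketch (assumption: 16 kHz input audio, which is not stated in this
# file) of the overall downsampling of the default conv stack. The strides
# multiply to 5 * 2**6 = 320 samples per output frame, i.e. roughly 50 feature
# frames per second at 16 kHz.
def _feature_extractor_frame_rate(sample_rate: int = 16_000) -> float:
    """Approximate output frame rate of FeatureExtractor's default config."""
    total_stride = 1
    for _, _, stride in [(512, 10, 5), (512, 3, 2), (512, 3, 2), (512, 3, 2),
                         (512, 3, 2), (512, 2, 2), (512, 2, 2)]:
        total_stride *= stride
    return sample_rate / total_stride  # 50.0 at 16 kHz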
# Modified from https://github.com/pytorch/audio/blob/main/src/torchaudio/models/wav2vec2/components.py#L102
class FeatureExtractorAdapter(torch.nn.Module):
    """Adapt extracted audio features to the model hidden size
    Applies one ConvLayerBlock followed by a FeatureProjection.
    Args:
        shapes (tuple): ``(in_channels, out_channels, kernel_size, stride)`` of the conv layer.
        hidden_size (int): Output (LLM hidden) dimension of the feature projection.
        bias (bool): Whether the conv layer uses a bias term.
        norm_mode (str): ``"group_norm"`` or ``"layer_norm"`` (validated only; the adapter always uses LayerNorm).
    """
def __init__(
self,
shapes=(512, 512, 2, 2),
hidden_size=2048,
bias=False,
norm_mode="group_norm",
):
super().__init__()
if norm_mode not in ["group_norm", "layer_norm"]:
raise ValueError("Invalid norm mode")
in_channels, out_channels, kernel_size, stride = shapes
normalization = LayerNorm(
normalized_shape=out_channels,
elementwise_affine=True,
)
self.conv_layers = ConvLayerBlock(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
            bias=bias,
layer_norm=normalization,
)
self.feat_proj = FeatureProjection(out_channels, hidden_size)
    def forward(
        self,
        x: torch.Tensor,
    ) -> torch.Tensor:
        """
        Args:
            x (Tensor):
                Features from FeatureExtractor,
                shape: ``[batch, frame, feature]``.
        Returns:
            Tensor:
                The adapted feature, shape: ``[batch, frame, hidden_size]``.
        """
        x = x.transpose(1, 2)    # (batch, feature, frame)
        x = self.conv_layers(x)  # (batch, feature, frame)
        x = x.transpose(1, 2)    # (batch, frame, feature)
        x = self.feat_proj(x)
        return x
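
# A minimal usage sketch (not invoked anywhere in this file; hidden_size=2048 is
# an illustrative assumption) of the audio feature pipeline used by
# VoilaAudioAlphaModel: raw waveform -> FeatureExtractor -> FeatureExtractorAdapter.
def _example_audio_feature_pipeline() -> torch.Tensor:
    extractor = FeatureExtractor()
    adapter = FeatureExtractorAdapter(hidden_size=2048)
    waveform = torch.randn(2, 16_000)  # (batch, time), ~1 s at an assumed 16 kHz
    feats = extractor(waveform)        # (batch, frame, 512)
    hidden = adapter(feats)            # (batch, frame', 2048)
    return hidden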
@dataclass
class VoilaOutput(ModelOutput):
"""
Modified from https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_outputs.py#L678
Base class for Voila outputs.
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
The hidden state of the last attention layer.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        voila_pred (`torch.FloatTensor` of shape `(batch_size, sequence_length, 2)`, *optional*):
            Output of the Voila prediction head, used by `VoilaAutonomousModel` to decide when the
            assistant should start speaking.
    """
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
last_hidden_state: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
voila_pred: Optional[torch.FloatTensor] = None
# Modified from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L1103
class VoilaModel(LlamaPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.model = LlamaModel(config)
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.pad_vocab_size_multiple = 64
self.ref_emb_linear = nn.Linear(256, config.hidden_size, bias=True)
self.audio_transformer = AudioTransformer(config, use_sdpa=False)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
def get_decoder(self):
return self.model
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
audio_labels: Optional[torch.LongTensor] = None,
ref_embs: Optional[List[torch.Tensor]] = None,
ref_embs_mask: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
num_logits_to_keep: int = 0,
) -> Union[Tuple, VoilaOutput]:
r"""
Args:
input_ids: [bs, seq_len, num_codebooks]
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
if inputs_embeds is None:
inputs_embeds = self.model.embed_tokens(input_ids)
assert len(inputs_embeds.shape) == 4
if len(inputs_embeds.shape) == 4:
inputs_embeds = inputs_embeds.mean(dim=2)
if self.training or \
(past_key_values is None and ref_embs is not None) or \
(past_key_values is not None and past_key_values.get_seq_length() < 4 and ref_embs is not None):
ref_embs = self.ref_emb_linear(ref_embs.to(self.ref_emb_linear.weight.dtype))
ref_embs = ref_embs * ref_embs_mask.unsqueeze(-1).unsqueeze(-1)
# (padding_left,padding_right,padding_top,padding_bottom,padding_front,padding_back)
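            # ref_embs is expected here to have shape [batch, 1, hidden_size]; padding the
            # frame dim by (4, seq_len - 5) expands it to [batch, seq_len, hidden_size] and
            # places the reference (speaker) embedding at sequence position 4.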
padding = (0, 0, 4, inputs_embeds.shape[1] - 5, 0, 0)
ref_embs = torch.nn.functional.pad(ref_embs, padding, mode='constant', value=0.0)
inputs_embeds = inputs_embeds + ref_embs
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
cache_position=cache_position,
)
hidden_states = outputs[0]
if self.config.pretraining_tp > 1:
lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
logits = torch.cat(logits, dim=-1)
else:
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
loss = None
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return VoilaOutput(
loss=loss,
logits=logits,
last_hidden_state=hidden_states,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def _prepare_inputs_for_generation(
self, input_ids, ref_embs=None, ref_embs_mask=None, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
if past_key_values is not None and past_key_values.get_seq_length() > 0:
if isinstance(past_key_values, Cache):
cache_length = past_key_values.get_seq_length()
past_length = past_key_values.seen_tokens
max_cache_length = past_key_values.get_max_cache_shape()
else:
cache_length = past_length = past_key_values[0][0].shape[2]
max_cache_length = None
# Keep only the unprocessed tokens:
# 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
# some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
# input)
if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
# 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
# input_ids based on the past_length.
elif past_length < input_ids.shape[1]:
input_ids = input_ids[:, past_length:]
# 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
# If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
if (
max_cache_length is not None
and attention_mask is not None
and cache_length + input_ids.shape[1] > max_cache_length
):
attention_mask = attention_mask[:, -max_cache_length:]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is None and \
(past_key_values is None or past_key_values.get_seq_length() <= 0):
inputs_embeds = self.model.embed_tokens(input_ids)
if inputs_embeds is not None and \
(past_key_values is None or past_key_values.get_seq_length() <= 0):
model_inputs = {"inputs_embeds": inputs_embeds, "ref_embs": ref_embs, "ref_embs_mask": ref_embs_mask}
else:
model_inputs = {"input_ids": input_ids, "ref_embs": None}
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
}
)
return model_inputs
def _update_model_kwargs_for_generation(
self,
outputs,
model_kwargs: Dict[str, Any],
num_new_token: int = 1,
) -> Dict[str, Any]:
# update past_key_values
model_kwargs["past_key_values"] = outputs.past_key_values
# update attention mask
if "attention_mask" in model_kwargs:
attention_mask = model_kwargs["attention_mask"]
model_kwargs["attention_mask"] = torch.cat(
[attention_mask, attention_mask.new_ones((attention_mask.shape[0], num_new_token))], dim=-1
)
return model_kwargs
def _prepare_attention_mask_for_generation(
self,
inputs: torch.Tensor,
pad_token_id: Optional[int],
eos_token_id: Optional[Union[int, List[int]]],
) -> torch.LongTensor:
is_input_ids = len(inputs.shape) == 2 and inputs.dtype in [torch.int, torch.long]
is_pad_token_in_inputs = (pad_token_id is not None) and (pad_token_id in inputs)
if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id]
is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (pad_token_id not in eos_token_id)
# Check if input is input_ids and padded -> only then is attention_mask defined
if is_input_ids and is_pad_token_in_inputs and is_pad_token_not_equal_to_eos_token_id:
return inputs.ne(pad_token_id).long()
else:
return torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device)
@torch.inference_mode()
def run_generate(
self,
input_ids: torch.LongTensor,
ref_embs: Optional[List[torch.Tensor]] = None,
ref_embs_mask: Optional[torch.LongTensor] = None,
max_new_tokens: Optional[int] = 128,
pad_token_id: Optional[int] = None,
eos_token_id: Optional[Union[int, List[int]]] = None,
streamer: Optional["BaseStreamer"] = None,
llm_audio_token_id: Optional[int] = None,
min_audio_token_id: Optional[int] = None,
temperature=0.2,
top_k=50,
audio_temperature=0.2,
audio_top_k=50,
):
assert eos_token_id is not None and pad_token_id is not None, "eos_token_id and pad_token_id are required for inference"
assert llm_audio_token_id is not None and min_audio_token_id is not None, "llm_audio_token_id and min_audio_token_id are required for inference"
assert len(input_ids.shape) == 2 or len(input_ids.shape) == 3, f"input_ids is supposed to be [batch, seq_len] or [batch, seq_len, num_codebooks], and got {input_ids.shape}"
eos_token_id_tensor = torch.tensor([eos_token_id]).to(input_ids.device)
# keep track of which sequences are already finished
unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
# Extend input_ids with additional num_codebooks dim
if len(input_ids.shape) == 2:
            input_ids = input_ids[:, :, None].expand(-1, -1, self.config.num_codebooks)
this_peer_finished = False # used by synced_gpus only
max_length = input_ids.shape[1] + max_new_tokens
model_kwargs = {
"use_cache": True,
"past_key_values": DynamicCache(),
"attention_mask": self._prepare_attention_mask_for_generation(
input_ids, pad_token_id, eos_token_id
),
}
# auto-regressive generation
while True:
# prepare model inputs
model_inputs = self._prepare_inputs_for_generation(
input_ids,
ref_embs=ref_embs,
ref_embs_mask=ref_embs_mask,
**model_kwargs
)
# forward pass to get next token
outputs = self(
**model_inputs,
return_dict=True,
)
audio_tokens = self.audio_transformer.inference(
outputs.last_hidden_state,
temperature=audio_temperature,
top_k=audio_top_k,
)
audio_tokens = torch.stack(
[
audio_tokens[:, :, ci] + min_audio_token_id + ci*self.config.codebook_size
for ci in range(self.config.num_codebooks)
],
dim=2,
)
next_token_logits = outputs.logits[:, -1, :]
# pre-process distribution
# Apply temperature and top-k
if temperature > 0:
next_token_logits = next_token_logits / temperature
if top_k > 0:
top_k = min(top_k, next_token_logits.size(-1)) # Safety check
# Remove all tokens with a probability less than the last token of the top-k
indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
next_token_logits = next_token_logits.masked_fill(indices_to_remove, -float("Inf"))
# sample
probs = nn.functional.softmax(next_token_logits, dim=-1)
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
# finished sentences should have their next token be a padding token
if eos_token_id is not None:
if pad_token_id is None:
raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
# Append NUM_CODEBOOK text tokens or audio_tokens
if len(next_tokens.shape) == 1:
next_tokens = next_tokens[:, None, None].expand(-1, 1, self.config.num_codebooks)
next_tokens = torch.where(next_tokens==llm_audio_token_id, audio_tokens, next_tokens)
input_ids = torch.cat([input_ids, next_tokens], dim=1)
if streamer is not None:
streamer.put(next_tokens.cpu())
model_kwargs = self._update_model_kwargs_for_generation(
outputs, model_kwargs
)
# if eos_token was found in one sentence, set sentence to finished
if eos_token_id_tensor is not None:
unfinished_sequences = unfinished_sequences.mul(
next_tokens[:, :, 0].ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=1)
)
# stop when each sentence is finished
if unfinished_sequences.max() == 0:
this_peer_finished = True
# stop if we exceed the maximum length
if input_ids.shape[1] >= max_length:
this_peer_finished = True
if this_peer_finished:
break
if streamer is not None:
streamer.end()
return input_ids
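
# A minimal usage sketch for VoilaModel.run_generate. The token ids below are
# placeholders (assumptions): the real values come from the Voila tokenizer and
# config and are not defined in this file.
def _example_run_generate(model: "VoilaModel", input_ids: torch.LongTensor) -> torch.LongTensor:
    model.eval()
    return model.run_generate(
        input_ids,                 # [batch, seq_len] or [batch, seq_len, num_codebooks]
        ref_embs=None,             # optional speaker reference embeddings
        ref_embs_mask=None,
        max_new_tokens=128,
        pad_token_id=0,            # placeholder id
        eos_token_id=2,            # placeholder id
        llm_audio_token_id=32000,  # placeholder id of the audio token in the LLM vocab
        min_audio_token_id=32100,  # placeholder id of the first audio codebook token
        temperature=0.2,
        top_k=50,
    )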
# Modified from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L1103
class VoilaAudioAlphaModel(LlamaPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.model = LlamaModel(config)
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.pad_vocab_size_multiple = 64
self.ref_emb_linear = nn.Linear(256, config.hidden_size, bias=True)
self.audio_transformer = AudioTransformer(config, use_sdpa=False)
self.feature_extractor = FeatureExtractor()
self.audio_feature_extractor_adapter = FeatureExtractorAdapter(hidden_size=config.hidden_size)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
def get_decoder(self):
return self.model
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
audio_labels: Optional[torch.LongTensor] = None,
ref_embs: Optional[List[torch.Tensor]] = None,
ref_embs_mask: Optional[torch.LongTensor] = None,
audio_datas: Optional[torch.FloatTensor] = None,
audio_data_masks: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
num_logits_to_keep: int = 0,
) -> Union[Tuple, VoilaOutput]:
r"""
Args:
input_ids: [bs, seq_len, num_codebooks]
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
if inputs_embeds is None:
inputs_embeds = self.model.embed_tokens(input_ids)
assert len(inputs_embeds.shape) == 4
if len(inputs_embeds.shape) == 4:
inputs_embeds = inputs_embeds.mean(dim=2)
if self.training or \
(past_key_values is None and ref_embs is not None) or \
(past_key_values is not None and past_key_values.get_seq_length() < 4 and ref_embs is not None):
ref_embs = self.ref_emb_linear(ref_embs.to(self.ref_emb_linear.weight.dtype))
ref_embs = ref_embs * ref_embs_mask.unsqueeze(-1).unsqueeze(-1)
# (padding_left,padding_right,padding_top,padding_bottom,padding_front,padding_back)
padding = (0, 0, 4, inputs_embeds.shape[1] - 5, 0, 0)
ref_embs = torch.nn.functional.pad(ref_embs, padding, mode='constant', value=0.0)
inputs_embeds = inputs_embeds + ref_embs
if self.training or audio_datas is not None:
audio_embeds = self.feature_extractor(audio_datas)
audio_embeds = self.audio_feature_extractor_adapter(audio_embeds)
audio_embeds = audio_embeds * audio_data_masks[..., None]
inputs_embeds = inputs_embeds + audio_embeds
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
cache_position=cache_position,
)
hidden_states = outputs[0]
if self.config.pretraining_tp > 1:
lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
logits = torch.cat(logits, dim=-1)
else:
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
loss = None
if labels is not None:
# Upcast to float if we need to compute the loss to avoid potential precision issues
logits = logits.float()
# We shift tokens and labels in dataloader
shift_logits = logits.contiguous()
shift_labels = labels.contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
shift_logits = shift_logits.view(-1, self.config.vocab_size)
shift_labels = shift_labels.view(-1)
# Enable model parallelism
shift_labels = shift_labels.to(shift_logits.device)
loss = loss_fct(shift_logits, shift_labels)
if audio_labels is not None:
au_mask = (audio_labels >= 0).all(dim=-1)
au_hidden_states = hidden_states[au_mask]
au_audio_labels = audio_labels[au_mask]
if len(au_hidden_states) <= 0:
au_hidden_states = hidden_states.reshape(-1, hidden_states.shape[-1])
au_audio_labels = torch.zeros_like(audio_labels).reshape(-1, self.config.num_codebooks)
loss_weight = 0.0
else:
loss_weight = 1.0
au_logits = self.audio_transformer(au_hidden_states, au_audio_labels)
# We shift tokens and labels in dataloader
shift_au_logits = au_logits.contiguous()
shift_audio_labels = au_audio_labels.contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
shift_au_logits = shift_au_logits.view(-1, self.config.codebook_size)
shift_audio_labels = shift_audio_labels.view(-1)
# Enable model parallelism
shift_audio_labels = shift_audio_labels.to(shift_au_logits.device)
au_loss = loss_fct(shift_au_logits, shift_audio_labels)
loss += au_loss * loss_weight
else:
# au_tokens = self.audio_transformer.inference(hidden_states)
pass
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return VoilaOutput(
loss=loss,
logits=logits,
last_hidden_state=hidden_states,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def _prepare_inputs_for_generation(
self, input_ids, ref_embs=None, ref_embs_mask=None, audio_datas=None, audio_data_masks=None, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
if past_key_values is not None and past_key_values.get_seq_length() > 0:
if isinstance(past_key_values, Cache):
cache_length = past_key_values.get_seq_length()
past_length = past_key_values.seen_tokens
max_cache_length = past_key_values.get_max_cache_shape()
else:
cache_length = past_length = past_key_values[0][0].shape[2]
max_cache_length = None
# Keep only the unprocessed tokens:
# 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
# some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
# input)
if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
# 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
# input_ids based on the past_length.
elif past_length < input_ids.shape[1]:
input_ids = input_ids[:, past_length:]
# 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
# If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
if (
max_cache_length is not None
and attention_mask is not None
and cache_length + input_ids.shape[1] > max_cache_length
):
attention_mask = attention_mask[:, -max_cache_length:]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is None and \
(past_key_values is None or past_key_values.get_seq_length() <= 0):
inputs_embeds = self.model.embed_tokens(input_ids)
if inputs_embeds is not None and \
(past_key_values is None or past_key_values.get_seq_length() <= 0):
model_inputs = {"inputs_embeds": inputs_embeds, "ref_embs": ref_embs, "ref_embs_mask": ref_embs_mask, "audio_datas": audio_datas, "audio_data_masks": audio_data_masks}
else:
model_inputs = {"input_ids": input_ids, "ref_embs": None, "audio_datas": None, "audio_data_masks": None}
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
}
)
return model_inputs
def _update_model_kwargs_for_generation(
self,
outputs,
model_kwargs: Dict[str, Any],
num_new_token: int = 1,
) -> Dict[str, Any]:
# update past_key_values
model_kwargs["past_key_values"] = outputs.past_key_values
# update attention mask
if "attention_mask" in model_kwargs:
attention_mask = model_kwargs["attention_mask"]
model_kwargs["attention_mask"] = torch.cat(
[attention_mask, attention_mask.new_ones((attention_mask.shape[0], num_new_token))], dim=-1
)
return model_kwargs
def _prepare_attention_mask_for_generation(
self,
inputs: torch.Tensor,
pad_token_id: Optional[int],
eos_token_id: Optional[Union[int, List[int]]],
) -> torch.LongTensor:
is_input_ids = len(inputs.shape) == 2 and inputs.dtype in [torch.int, torch.long]
is_pad_token_in_inputs = (pad_token_id is not None) and (pad_token_id in inputs)
if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id]
is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (pad_token_id not in eos_token_id)
# Check if input is input_ids and padded -> only then is attention_mask defined
if is_input_ids and is_pad_token_in_inputs and is_pad_token_not_equal_to_eos_token_id:
return inputs.ne(pad_token_id).long()
else:
return torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device)
@torch.inference_mode()
def run_generate(
self,
input_ids: torch.LongTensor,
ref_embs: Optional[List[torch.Tensor]] = None,
ref_embs_mask: Optional[torch.LongTensor] = None,
audio_datas: Optional[torch.FloatTensor] = None,
audio_data_masks: Optional[torch.LongTensor] = None,
max_new_tokens: Optional[int] = 128,
pad_token_id: Optional[int] = None,
eos_token_id: Optional[Union[int, List[int]]] = None,
streamer: Optional["BaseStreamer"] = None,
llm_audio_token_id: Optional[int] = None,
min_audio_token_id: Optional[int] = None,
temperature=0.2,
top_k=50,
audio_temperature=0.2,
audio_top_k=50,
):
assert eos_token_id is not None and pad_token_id is not None, "eos_token_id and pad_token_id are required for inference"
assert llm_audio_token_id is not None and min_audio_token_id is not None, "llm_audio_token_id and min_audio_token_id are required for inference"
assert len(input_ids.shape) == 2 or len(input_ids.shape) == 3, f"input_ids is supposed to be [batch, seq_len] or [batch, seq_len, num_codebooks], and got {input_ids.shape}"
eos_token_id_tensor = torch.tensor([eos_token_id]).to(input_ids.device)
# keep track of which sequences are already finished
unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
# Extend input_ids with additional num_codebooks dim
if len(input_ids.shape) == 2:
            input_ids = input_ids[:, :, None].expand(-1, -1, self.config.num_codebooks)
this_peer_finished = False # used by synced_gpus only
max_length = input_ids.shape[1] + max_new_tokens
model_kwargs = {
"use_cache": True,
"past_key_values": DynamicCache(),
"attention_mask": self._prepare_attention_mask_for_generation(
input_ids, pad_token_id, eos_token_id
),
}
# auto-regressive generation
while True:
# prepare model inputs
model_inputs = self._prepare_inputs_for_generation(
input_ids,
ref_embs=ref_embs,
ref_embs_mask=ref_embs_mask,
audio_datas=audio_datas,
audio_data_masks=audio_data_masks,
**model_kwargs
)
# forward pass to get next token
outputs = self(
**model_inputs,
return_dict=True,
)
audio_tokens = self.audio_transformer.inference(
outputs.last_hidden_state,
temperature=audio_temperature,
top_k=audio_top_k,
)
audio_tokens = torch.stack(
[
audio_tokens[:, :, ci] + min_audio_token_id + ci*self.config.codebook_size
for ci in range(self.config.num_codebooks)
],
dim=2,
)
next_token_logits = outputs.logits[:, -1, :]
# pre-process distribution
# Apply temperature and top-k
if temperature > 0:
next_token_logits = next_token_logits / temperature
if top_k > 0:
top_k = min(top_k, next_token_logits.size(-1)) # Safety check
# Remove all tokens with a probability less than the last token of the top-k
indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
next_token_logits = next_token_logits.masked_fill(indices_to_remove, -float("Inf"))
# sample
probs = nn.functional.softmax(next_token_logits, dim=-1)
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
# finished sentences should have their next token be a padding token
if eos_token_id is not None:
if pad_token_id is None:
raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
# Append NUM_CODEBOOK text tokens or audio_tokens
if len(next_tokens.shape) == 1:
next_tokens = next_tokens[:, None, None].expand(-1, 1, self.config.num_codebooks)
next_tokens = torch.where(next_tokens==llm_audio_token_id, audio_tokens, next_tokens)
input_ids = torch.cat([input_ids, next_tokens], dim=1)
if streamer is not None:
streamer.put(next_tokens.cpu())
model_kwargs = self._update_model_kwargs_for_generation(
outputs, model_kwargs
)
# if eos_token was found in one sentence, set sentence to finished
if eos_token_id_tensor is not None:
unfinished_sequences = unfinished_sequences.mul(
next_tokens[:, :, 0].ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=1)
)
# stop when each sentence is finished
if unfinished_sequences.max() == 0:
this_peer_finished = True
# stop if we exceed the maximum length
if input_ids.shape[1] >= max_length:
this_peer_finished = True
if this_peer_finished:
break
if streamer is not None:
streamer.end()
return input_ids
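
# A minimal usage sketch for VoilaAudioAlphaModel.run_generate with raw audio
# conditioning. Shapes and token ids are placeholders (assumptions); note that
# forward() adds the adapted audio embeddings to the token embeddings, so the
# number of extracted audio frames must match input_ids.shape[1].
def _example_audio_alpha_generate(model: "VoilaAudioAlphaModel",
                                  input_ids: torch.LongTensor,
                                  waveform: torch.FloatTensor) -> torch.LongTensor:
    model.eval()
    # audio_data_masks: 1 where a sequence position carries audio, 0 elsewhere
    audio_data_masks = torch.ones(input_ids.shape[:2], dtype=torch.long, device=input_ids.device)
    return model.run_generate(
        input_ids,
        audio_datas=waveform,          # [batch, time] raw waveform
        audio_data_masks=audio_data_masks,
        pad_token_id=0,                # placeholder id
        eos_token_id=2,                # placeholder id
        llm_audio_token_id=32000,      # placeholder id
        min_audio_token_id=32100,      # placeholder id
    )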
# Modified from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L1103
class VoilaAutonomousModel(LlamaPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.model = LlamaModel(config)
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.pad_vocab_size_multiple = 64
self.ref_emb_linear = nn.Linear(256, config.hidden_size, bias=True)
self.audio_transformer = AudioTransformer(config, use_sdpa=False)
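        # Binary Voila head: per-position prediction of whether the assistant
        # should start speaking (consumed in run_generate as voila_pred).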
self.voila_predictor = nn.Sequential(nn.Linear(config.hidden_size, 2, bias=True),)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
def get_decoder(self):
return self.model
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
audio_labels: Optional[torch.LongTensor] = None,
voila_labels: Optional[torch.LongTensor] = None,
ref_embs: Optional[List[torch.Tensor]] = None,
ref_embs_mask: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
num_logits_to_keep: int = 0,
) -> Union[Tuple, VoilaOutput]:
r"""
Args:
input_ids: [bs, seq_len, num_codebooks]
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
if inputs_embeds is None:
inputs_embeds = self.model.embed_tokens(input_ids)
assert len(inputs_embeds.shape) == 4
if len(inputs_embeds.shape) == 4:
inputs_embeds = inputs_embeds.mean(dim=2)
if self.training or \
(past_key_values is None and ref_embs is not None) or \
(past_key_values is not None and past_key_values.get_seq_length() < 4 and ref_embs is not None):
ref_embs = self.ref_emb_linear(ref_embs.to(self.ref_emb_linear.weight.dtype))
ref_embs = ref_embs * ref_embs_mask.unsqueeze(-1).unsqueeze(-1)
# (padding_left,padding_right,padding_top,padding_bottom,padding_front,padding_back)
padding = (0, 0, 4, inputs_embeds.shape[1] - 5, 0, 0)
ref_embs = torch.nn.functional.pad(ref_embs, padding, mode='constant', value=0.0)
inputs_embeds = inputs_embeds + ref_embs
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
cache_position=cache_position,
)
hidden_states = outputs[0]
if self.config.pretraining_tp > 1:
lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
logits = torch.cat(logits, dim=-1)
else:
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
# calc voila_predict_loss
voila_pred = self.voila_predictor(hidden_states)
voila_pred = voila_pred.float()
loss = None
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return VoilaOutput(
loss=loss,
logits=logits,
last_hidden_state=hidden_states,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
voila_pred=voila_pred,
)
def _prepare_inputs_for_generation(
self, input_ids, ref_embs=None, ref_embs_mask=None, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
if past_key_values is not None and past_key_values.get_seq_length() > 0:
if isinstance(past_key_values, Cache):
cache_length = past_key_values.get_seq_length()
past_length = past_key_values.seen_tokens
max_cache_length = past_key_values.get_max_cache_shape()
else:
cache_length = past_length = past_key_values[0][0].shape[2]
max_cache_length = None
# Keep only the unprocessed tokens:
# 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
# some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
# input)
if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
# 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
# input_ids based on the past_length.
elif past_length < input_ids.shape[1]:
input_ids = input_ids[:, past_length:]
# 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
# If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
if (
max_cache_length is not None
and attention_mask is not None
and cache_length + input_ids.shape[1] > max_cache_length
):
attention_mask = attention_mask[:, -max_cache_length:]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is None and \
(past_key_values is None or past_key_values.get_seq_length() <= 0):
inputs_embeds = self.model.embed_tokens(input_ids)
if inputs_embeds is not None and \
(past_key_values is None or past_key_values.get_seq_length() <= 0):
model_inputs = {"inputs_embeds": inputs_embeds, "ref_embs": ref_embs, "ref_embs_mask": ref_embs_mask}
else:
model_inputs = {"input_ids": input_ids, "ref_embs": None}
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
}
)
return model_inputs
def _update_model_kwargs_for_generation(
self,
outputs,
model_kwargs: Dict[str, Any],
num_new_token: int = 1,
) -> Dict[str, Any]:
# update past_key_values
model_kwargs["past_key_values"] = outputs.past_key_values
# update attention mask
if "attention_mask" in model_kwargs:
attention_mask = model_kwargs["attention_mask"]
model_kwargs["attention_mask"] = torch.cat(
[attention_mask, attention_mask.new_ones((attention_mask.shape[0], num_new_token))], dim=-1
)
return model_kwargs
def _prepare_attention_mask_for_generation(
self,
inputs: torch.Tensor,
pad_token_id: Optional[int],
eos_token_id: Optional[Union[int, List[int]]],
) -> torch.LongTensor:
is_input_ids = len(inputs.shape) == 2 and inputs.dtype in [torch.int, torch.long]
is_pad_token_in_inputs = (pad_token_id is not None) and (pad_token_id in inputs)
if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id]
is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (pad_token_id not in eos_token_id)
# Check if input is input_ids and padded -> only then is attention_mask defined
if is_input_ids and is_pad_token_in_inputs and is_pad_token_not_equal_to_eos_token_id:
return inputs.ne(pad_token_id).long()
else:
return torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device)
@torch.inference_mode()
def run_generate(
self,
input_ids: torch.LongTensor,
input_generator,
ref_embs: Optional[List[torch.Tensor]] = None,
ref_embs_mask: Optional[torch.LongTensor] = None,
max_new_tokens: Optional[int] = 128,
pad_token_id: Optional[int] = None,
eos_token_id: Optional[Union[int, List[int]]] = None,
streamer: Optional["BaseStreamer"] = None,
llm_audio_token_id: Optional[int] = None,
min_audio_token_id: Optional[int] = None,
llm_assistant_token_id: Optional[int] = None,
temperature=0.2,
top_k=50,
audio_temperature=0.8,
audio_top_k=50,
):
assert eos_token_id is not None and pad_token_id is not None, "eos_token_id and pad_token_id are required for inference"
assert llm_audio_token_id is not None and min_audio_token_id is not None, "llm_audio_token_id and min_audio_token_id are required for inference"
assert len(input_ids.shape) == 2 or len(input_ids.shape) == 3, f"input_ids is supposed to be [batch, seq_len] or [batch, seq_len, num_codebooks], and got {input_ids.shape}"
eos_token_id_tensor = torch.tensor([eos_token_id]).to(input_ids.device)
# keep track of which sequences are already finished
unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
# Extend input_ids with additional num_codebooks dim
input_ids = input_ids.clone()
if len(input_ids.shape) == 2:
            input_ids = input_ids[:, :, None].expand(-1, -1, self.config.num_codebooks)
this_peer_finished = False # used by synced_gpus only
max_length = input_ids.shape[1] + max_new_tokens
model_kwargs = {
"use_cache": True,
"past_key_values": DynamicCache(),
"attention_mask": self._prepare_attention_mask_for_generation(
input_ids, pad_token_id, eos_token_id
),
}
speaking = False
# auto-regressive generation
while True:
# prepare model inputs
model_inputs = self._prepare_inputs_for_generation(
input_ids,
ref_embs=ref_embs,
ref_embs_mask=ref_embs_mask,
**model_kwargs
)
# forward pass to get next token
outputs = self(
**model_inputs,
return_dict=True,
)
audio_tokens = self.audio_transformer.inference(
outputs.last_hidden_state,
temperature=audio_temperature,
top_k=audio_top_k,
)
audio_tokens = torch.stack(
[
audio_tokens[:, :, ci] + min_audio_token_id + ci*self.config.codebook_size
for ci in range(self.config.num_codebooks)
],
dim=2,
)
next_token_logits = outputs.logits[:, -1, :]
# voila head output
voila_head_pred = outputs.voila_pred[:, -1, :]
voila_head_pred = torch.argmax(voila_head_pred, dim=-1)
voila_head_pred = voila_head_pred.cpu()[0].item()
# pre-process distribution
# Apply temperature and top-k
if temperature > 0:
next_token_logits = next_token_logits / temperature
if top_k > 0:
top_k = min(top_k, next_token_logits.size(-1)) # Safety check
# Remove all tokens with a probability less than the last token of the top-k
indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
next_token_logits = next_token_logits.masked_fill(indices_to_remove, -float("Inf"))
# sample
probs = nn.functional.softmax(next_token_logits, dim=-1)
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
# voila head pred == 1, use assistant token
if voila_head_pred == 1 and not speaking:
next_tokens[0] = llm_assistant_token_id
speaking = True
elif next_tokens[0] == eos_token_id:
speaking = False
# finished sentences should have their next token be a padding token
if eos_token_id is not None:
if pad_token_id is None:
raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
# Append NUM_CODEBOOK text tokens or audio_tokens
if len(next_tokens.shape) == 1:
next_tokens = next_tokens[:, None, None].expand(-1, 1, self.config.num_codebooks)
audio_token_mask = next_tokens == llm_audio_token_id
next_tokens = next_tokens * torch.logical_not(audio_token_mask) + audio_tokens * audio_token_mask
if audio_token_mask[0, 0, 0].item():
try:
new_input_tokens = next(input_generator)
                except StopIteration:
this_peer_finished = True
break
new_input_tokens = new_input_tokens[None,None,:]
else:
new_input_tokens = next_tokens
new_input_tokens = torch.cat([new_input_tokens, next_tokens], dim=2)
input_ids = torch.cat([input_ids, new_input_tokens], dim=1)
if streamer is not None:
streamer.put(next_tokens.cpu())
model_kwargs = self._update_model_kwargs_for_generation(
outputs, model_kwargs
)
# # if eos_token was found in one sentence, set sentence to finished
# if eos_token_id_tensor is not None:
# unfinished_sequences = unfinished_sequences.mul(
# next_tokens[:, :, 0].ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=1)
# )
# # stop when each sentence is finished
# if unfinished_sequences.max() == 0:
# this_peer_finished = True
# stop if we exceed the maximum length
if input_ids.shape[1] >= max_length:
this_peer_finished = True
if this_peer_finished:
break
if streamer is not None:
streamer.end()
return input_ids
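
# A minimal sketch of the `input_generator` expected by
# VoilaAutonomousModel.run_generate: whenever the model emits audio tokens, the
# loop pulls one frame of listening-channel tokens from this generator and
# concatenates it with the generated frame along the codebook dimension.
# The tensor shape below is an assumption for illustration.
def _example_input_generator(listening_tokens: torch.LongTensor):
    """Yield one [num_codebooks] frame of input-channel tokens per step."""
    for frame in listening_tokens:  # listening_tokens: [num_frames, num_codebooks]
        yield frame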