Steveeeeeeen (HF Staff) committed on
Commit
7e6946d
·
1 Parent(s): 3cf0e6f
Files changed (45)
  1. cosyvoice2/flow/__init__.py +0 -0
  2. cosyvoice2/flow/decoder_dit.py +585 -0
  3. cosyvoice2/flow/flow.py +225 -0
  4. cosyvoice2/flow/flow_matching.py +205 -0
  5. cosyvoice2/transformer/__init__.py +0 -0
  6. cosyvoice2/transformer/attention.py +328 -0
  7. cosyvoice2/transformer/embedding.py +119 -0
  8. cosyvoice2/transformer/encoder_layer.py +163 -0
  9. cosyvoice2/transformer/positionwise_feed_forward.py +56 -0
  10. cosyvoice2/transformer/subsampling.py +79 -0
  11. cosyvoice2/transformer/upsample_encoder_v2.py +483 -0
  12. cosyvoice2/utils/class_utils.py +41 -0
  13. cosyvoice2/utils/common.py +101 -0
  14. cosyvoice2/utils/mask.py +49 -0
  15. flashcosyvoice/__init__.py +0 -0
  16. flashcosyvoice/cli.py +424 -0
  17. flashcosyvoice/config.py +80 -0
  18. flashcosyvoice/cosyvoice2.py +160 -0
  19. flashcosyvoice/cosyvoice3.py +1 -0
  20. flashcosyvoice/engine/__init__.py +0 -0
  21. flashcosyvoice/engine/block_manager.py +114 -0
  22. flashcosyvoice/engine/llm_engine.py +125 -0
  23. flashcosyvoice/engine/model_runner.py +310 -0
  24. flashcosyvoice/engine/scheduler.py +77 -0
  25. flashcosyvoice/engine/sequence.py +90 -0
  26. flashcosyvoice/modules/__init__.py +0 -0
  27. flashcosyvoice/modules/flow.py +198 -0
  28. flashcosyvoice/modules/flow_components/__init__.py +0 -0
  29. flashcosyvoice/modules/flow_components/estimator.py +974 -0
  30. flashcosyvoice/modules/flow_components/upsample_encoder.py +998 -0
  31. flashcosyvoice/modules/hifigan.py +249 -0
  32. flashcosyvoice/modules/hifigan_components/__init__.py +0 -0
  33. flashcosyvoice/modules/hifigan_components/layers.py +433 -0
  34. flashcosyvoice/modules/qwen2.py +92 -0
  35. flashcosyvoice/modules/qwen2_components/__init__.py +0 -0
  36. flashcosyvoice/modules/qwen2_components/layers.py +616 -0
  37. flashcosyvoice/modules/sampler.py +231 -0
  38. flashcosyvoice/utils/__init__.py +0 -0
  39. flashcosyvoice/utils/audio.py +77 -0
  40. flashcosyvoice/utils/context.py +28 -0
  41. flashcosyvoice/utils/loader.py +116 -0
  42. flashcosyvoice/utils/memory.py +19 -0
  43. stepaudio2.py +204 -0
  44. token2wav.py +79 -0
  45. utils.py +91 -0
cosyvoice2/flow/__init__.py ADDED
File without changes
cosyvoice2/flow/decoder_dit.py ADDED
@@ -0,0 +1,585 @@
1
+ import math
2
+ import torch
3
+ import numpy as np
4
+ from typing import Optional
5
+ from einops import pack, rearrange, repeat
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+
10
+
11
+ """
12
+ DiT-v5
13
+ - Add convolution in DiTBlock to increase high-freq component
14
+ """
15
+
16
+
17
+ class MLP(torch.nn.Module):
18
+ def __init__(
19
+ self,
20
+ in_features:int,
21
+ hidden_features:Optional[int]=None,
22
+ out_features:Optional[int]=None,
23
+ act_layer=nn.GELU,
24
+ norm_layer=None,
25
+ bias=True,
26
+ drop=0.,
27
+ ):
28
+ super().__init__()
29
+ hidden_features = hidden_features or in_features
30
+ out_features = out_features or in_features
31
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
32
+ self.act = act_layer()
33
+ self.drop1 = nn.Dropout(drop)
34
+ self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity()
35
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
36
+ self.drop2 = nn.Dropout(drop)
37
+
38
+ def forward(self, x):
39
+ x = self.fc1(x)
40
+ x = self.act(x)
41
+ x = self.drop1(x)
42
+ x = self.norm(x)
43
+ x = self.fc2(x)
44
+ x = self.drop2(x)
45
+ return x
46
+
47
+
48
+ class Attention(torch.nn.Module):
49
+ def __init__(
50
+ self,
51
+ dim: int,
52
+ num_heads: int = 8,
53
+ head_dim: int = 64,
54
+ qkv_bias: bool = False,
55
+ qk_norm: bool = False,
56
+ attn_drop: float = 0.,
57
+ proj_drop: float = 0.,
58
+ norm_layer: nn.Module = nn.LayerNorm,
59
+ ) -> None:
60
+ super().__init__()
61
+ self.num_heads = num_heads
62
+ self.head_dim = head_dim
63
+ self.inner_dim = num_heads * head_dim
64
+ self.scale = head_dim ** -0.5
65
+
66
+ self.to_q = nn.Linear(dim, self.inner_dim, bias=qkv_bias)
67
+ self.to_k = nn.Linear(dim, self.inner_dim, bias=qkv_bias)
68
+ self.to_v = nn.Linear(dim, self.inner_dim, bias=qkv_bias)
69
+
70
+ self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
71
+ self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
72
+
73
+ self.attn_drop = nn.Dropout(attn_drop)
74
+ self.proj_drop = nn.Dropout(proj_drop)
75
+
76
+ self.proj = nn.Linear(self.inner_dim, dim)
77
+
78
+ def to_heads(self, ts:torch.Tensor):
79
+ b, t, c = ts.shape
80
+ # (b, t, nh, c)
81
+ ts = ts.reshape(b, t, self.num_heads, c // self.num_heads)
82
+ ts = ts.transpose(1, 2)
83
+ return ts
84
+
85
+ def forward(self, x: torch.Tensor, attn_mask: torch.Tensor) -> torch.Tensor:
86
+ """Args:
87
+ x(torch.Tensor): shape (b, t, c)
88
+ attn_mask(torch.Tensor): shape (b, t, t)
89
+ """
90
+ b, t, c = x.shape
91
+
92
+ q = self.to_q(x)
93
+ k = self.to_k(x)
94
+ v = self.to_v(x)
95
+
96
+ q = self.to_heads(q) # (b, nh, t, c)
97
+ k = self.to_heads(k)
98
+ v = self.to_heads(v)
99
+
100
+ q = self.q_norm(q)
101
+ k = self.k_norm(k)
102
+
103
+ attn_mask = attn_mask.unsqueeze(1)
104
+ x = F.scaled_dot_product_attention(
105
+ q, k, v,
106
+ attn_mask=attn_mask,
107
+ dropout_p=self.attn_drop.p if self.training else 0.,
108
+ ) # (b, nh, t, c)
109
+ x = x.transpose(1, 2).reshape(b, t, -1)
110
+ x = self.proj(x)
111
+ x = self.proj_drop(x)
112
+ return x
113
+
114
+ def forward_chunk(self, x: torch.Tensor, att_cache: torch.Tensor=None, attn_mask: torch.Tensor=None):
115
+ """
116
+ Args:
117
+ x: shape (b, dt, c)
118
+ att_cache: shape (b, nh, t, c*2)
119
+ """
120
+ b, t, c = x.shape
121
+
122
+ q = self.to_q(x)
123
+ k = self.to_k(x)
124
+ v = self.to_v(x)
125
+
126
+ q = self.to_heads(q) # (b, nh, t, c)
127
+ k = self.to_heads(k)
128
+ v = self.to_heads(v)
129
+
130
+ q = self.q_norm(q)
131
+ k = self.k_norm(k)
132
+
133
+ # unpack {k,v}_cache
134
+ if att_cache is not None:
135
+ if attn_mask is not None:
136
+ k_cache, v_cache = att_cache.chunk(2, dim=3)
137
+ k = torch.cat([k, k_cache], dim=2)
138
+ v = torch.cat([v, v_cache], dim=2)
139
+
140
+ else:
141
+ k_cache, v_cache = att_cache.chunk(2, dim=3)
142
+ k = torch.cat([k, k_cache], dim=2)
143
+ v = torch.cat([v, v_cache], dim=2)
144
+
145
+ # new {k,v}_cache
146
+ new_att_cache = torch.cat([k, v], dim=3)
147
+ # attn_mask = torch.ones((b, 1, t, t1), dtype=torch.bool, device=x.device)
148
+ if attn_mask is not None:
149
+ attn_mask = attn_mask.unsqueeze(1)
150
+ x = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask) # (b, nh, t, c)
151
+ x = x.transpose(1, 2).reshape(b, t, -1)
152
+ x = self.proj(x)
153
+ x = self.proj_drop(x)
154
+ return x, new_att_cache
155
+
156
+
157
+ def modulate(x, shift, scale):
158
+ return x * (1 + scale) + shift
159
+
160
+
161
+ class TimestepEmbedder(nn.Module):
162
+ """
163
+ Embeds scalar timesteps into vector representations.
164
+ """
165
+ def __init__(self, hidden_size, frequency_embedding_size=256):
166
+ super().__init__()
167
+ self.mlp = nn.Sequential(
168
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
169
+ nn.SiLU(),
170
+ nn.Linear(hidden_size, hidden_size, bias=True),
171
+ )
172
+ self.frequency_embedding_size = frequency_embedding_size
173
+ # from SinusoidalPosEmb
174
+ self.scale = 1000
175
+
176
+ @staticmethod
177
+ def timestep_embedding(t, dim, max_period=10000):
178
+ """
179
+ Create sinusoidal timestep embeddings.
180
+ :param t: a 1-D Tensor of N indices, one per batch element.
181
+ These may be fractional.
182
+ :param dim: the dimension of the output.
183
+ :param max_period: controls the minimum frequency of the embeddings.
184
+ :return: an (N, D) Tensor of positional embeddings.
185
+ """
186
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
187
+ half = dim // 2
188
+ freqs = torch.exp(
189
+ -math.log(max_period) * torch.arange(start=0, end=half) / half
190
+ ).to(t)
191
+ args = t[:, None] * freqs[None]
192
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
193
+ if dim % 2:
194
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
195
+ return embedding
196
+
197
+ def forward(self, t):
198
+ t_freq = self.timestep_embedding(t * self.scale, self.frequency_embedding_size)
199
+ t_emb = self.mlp(t_freq)
200
+ return t_emb
201
+
202
+
203
+ # Convolution related
204
+ class Transpose(torch.nn.Module):
205
+ def __init__(self, dim0: int, dim1: int):
206
+ super().__init__()
207
+ self.dim0 = dim0
208
+ self.dim1 = dim1
209
+
210
+ def forward(self, x: torch.Tensor):
211
+ x = torch.transpose(x, self.dim0, self.dim1)
212
+ return x
213
+
214
+
215
+ class CausalConv1d(torch.nn.Conv1d):
216
+ def __init__(
217
+ self,
218
+ in_channels: int,
219
+ out_channels: int,
220
+ kernel_size: int,
221
+ ) -> None:
222
+ super(CausalConv1d, self).__init__(in_channels, out_channels, kernel_size)
223
+ self.causal_padding = (kernel_size - 1, 0)
224
+
225
+ def forward(self, x: torch.Tensor):
226
+ x = F.pad(x, self.causal_padding)
227
+ x = super(CausalConv1d, self).forward(x)
228
+ return x
229
+
230
+ def forward_chunk(self, x: torch.Tensor, cnn_cache: torch.Tensor=None):
231
+ if cnn_cache is None:
232
+ cnn_cache = x.new_zeros((x.shape[0], self.in_channels, self.causal_padding[0]))
233
+ x = torch.cat([cnn_cache, x], dim=2)
234
+ new_cnn_cache = x[..., -self.causal_padding[0]:]
235
+ x = super(CausalConv1d, self).forward(x)
236
+ return x, new_cnn_cache
237
+
238
+
239
+ class CausalConvBlock(nn.Module):
240
+ def __init__(self,
241
+ in_channels: int,
242
+ out_channels: int,
243
+ kernel_size: int = 3,
244
+ ):
245
+ super().__init__()
246
+ self.in_channels = in_channels
247
+ self.out_channels = out_channels
248
+ self.kernel_size = kernel_size
249
+
250
+ self.block = torch.nn.Sequential(
251
+ # norm
252
+ # conv1
253
+ Transpose(1, 2),
254
+ CausalConv1d(in_channels, out_channels, kernel_size),
255
+ Transpose(1, 2),
256
+ # norm & act
257
+ nn.LayerNorm(out_channels),
258
+ nn.Mish(),
259
+ # conv2
260
+ Transpose(1, 2),
261
+ CausalConv1d(out_channels, out_channels, kernel_size),
262
+ Transpose(1, 2),
263
+ )
264
+
265
+ def forward(self, x: torch.Tensor, mask: torch.Tensor = None):
266
+ """
267
+ Args:
268
+ x: shape (b, t, c)
269
+ mask: shape (b, t, 1)
270
+ """
271
+ if mask is not None: x = x * mask
272
+ x = self.block(x)
273
+ if mask is not None: x = x * mask
274
+ return x
275
+
276
+ def forward_chunk(self, x: torch.Tensor, cnn_cache: torch.Tensor=None):
277
+ """
278
+ Args:
279
+ x: shape (b, dt, c)
280
+ cnn_cache: shape (b, c1+c2, 2)
281
+ """
282
+ if cnn_cache is not None:
283
+ cnn_cache1, cnn_cache2 = cnn_cache.split((self.in_channels, self.out_channels), dim=1)
284
+ else:
285
+ cnn_cache1, cnn_cache2 = None, None
286
+ x = self.block[0](x)
287
+ x, new_cnn_cache1 = self.block[1].forward_chunk(x, cnn_cache1)
288
+ x = self.block[2:6](x)
289
+ x, new_cnn_cache2 = self.block[6].forward_chunk(x, cnn_cache2)
290
+ x = self.block[7](x)
291
+ new_cnn_cache = torch.cat((new_cnn_cache1, new_cnn_cache2), dim=1)
292
+ return x, new_cnn_cache
293
+
294
+
295
+ class DiTBlock(nn.Module):
296
+ """
297
+ A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.
298
+ """
299
+ def __init__(self, hidden_size, num_heads, head_dim, mlp_ratio=4.0, **block_kwargs):
300
+ super().__init__()
301
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
302
+ self.attn = Attention(hidden_size, num_heads=num_heads, head_dim=head_dim, qkv_bias=True, qk_norm=True, **block_kwargs)
303
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
304
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
305
+ approx_gelu = lambda: nn.GELU(approximate="tanh")
306
+ self.mlp = MLP(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
307
+ self.norm3 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
308
+ self.conv = CausalConvBlock(in_channels=hidden_size, out_channels=hidden_size, kernel_size=3)
309
+ self.adaLN_modulation = nn.Sequential(
310
+ nn.SiLU(),
311
+ nn.Linear(hidden_size, 9 * hidden_size, bias=True)
312
+ )
313
+
314
+ def forward(self, x:torch.Tensor, c:torch.Tensor, attn_mask:torch.Tensor):
315
+ """Args
316
+ x: shape (b, t, c)
317
+ c: shape (b, 1, c)
318
+ attn_mask: shape (b, t, t), bool type attention mask
319
+ """
320
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp, shift_conv, scale_conv, gate_conv \
321
+ = self.adaLN_modulation(c).chunk(9, dim=-1)
322
+ # attention
323
+ x = x + gate_msa * self.attn(modulate(self.norm1(x), shift_msa, scale_msa), attn_mask)
324
+ # conv
325
+ x = x + gate_conv * self.conv(modulate(self.norm3(x), shift_conv, scale_conv))
326
+ # mlp
327
+ x = x + gate_mlp * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
328
+ return x
329
+
330
+ def forward_chunk(self, x: torch.Tensor, c: torch.Tensor, cnn_cache: torch.Tensor=None, att_cache: torch.Tensor=None, mask: torch.Tensor=None):
331
+ """
332
+ Args:
333
+ x: shape (b, dt, c)
334
+ c: shape (b, 1, c)
335
+ cnn_cache: shape (b, c1+c2, 2)
336
+ att_cache: shape (b, nh, t, c * 2)
337
+ """
338
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp, shift_conv, scale_conv, gate_conv \
339
+ = self.adaLN_modulation(c).chunk(9, dim=-1)
340
+ # attention
341
+ x_att, new_att_cache = self.attn.forward_chunk(modulate(self.norm1(x), shift_msa, scale_msa), att_cache, mask)
342
+ x = x + gate_msa * x_att
343
+ # conv
344
+ x_conv, new_cnn_cache = self.conv.forward_chunk(modulate(self.norm3(x), shift_conv, scale_conv), cnn_cache)
345
+ x = x + gate_conv * x_conv
346
+ # mlp
347
+ x = x + gate_mlp * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
348
+ return x, new_cnn_cache, new_att_cache
349
+
350
+
351
+ class FinalLayer(nn.Module):
352
+ """
353
+ The final layer of DiT.
354
+ """
355
+ def __init__(self, hidden_size, out_channels):
356
+ super().__init__()
357
+ self.adaLN_modulation = nn.Sequential(
358
+ nn.SiLU(),
359
+ nn.Linear(hidden_size, 2 * hidden_size, bias=True)
360
+ )
361
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
362
+ self.linear = nn.Linear(hidden_size, out_channels, bias=True)
363
+
364
+ def forward(self, x, c):
365
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
366
+ x = modulate(self.norm_final(x), shift, scale)
367
+ x = self.linear(x)
368
+ return x
369
+
370
+
371
+ class DiT(nn.Module):
372
+ """
373
+ Diffusion model with a Transformer backbone.
374
+ """
375
+ def __init__(
376
+ self,
377
+ in_channels: int,
378
+ out_channels: int,
379
+ mlp_ratio: float = 4.0,
380
+ depth: int = 28,
381
+ num_heads: int = 8,
382
+ head_dim: int = 64,
383
+ hidden_size: int = 256,
384
+ ):
385
+ super().__init__()
386
+ self.in_channels = in_channels
387
+ self.out_channels = out_channels
388
+ self.t_embedder = TimestepEmbedder(hidden_size)
389
+
390
+ self.in_proj = nn.Linear(in_channels, hidden_size)
391
+
392
+ self.blocks = nn.ModuleList([
393
+ DiTBlock(hidden_size, num_heads, head_dim, mlp_ratio=mlp_ratio) for _ in range(depth)
394
+ ])
395
+ self.final_layer = FinalLayer(hidden_size, self.out_channels)
396
+
397
+ self.initialize_weights()
398
+
399
+ self.enable_cuda_graph = False
400
+ self.use_cuda_graph = False
401
+
402
+ self.graph_chunk = {}
403
+ self.inference_buffers_chunk = {}
404
+ self.max_size_chunk = {}
405
+
406
+ self.register_buffer('att_cache_buffer', torch.zeros((16, 2, 8, 1000, 128)), persistent=False)
407
+ self.register_buffer('cnn_cache_buffer', torch.zeros((16, 2, 1024, 2)), persistent=False)
408
+
409
+ def initialize_weights(self):
410
+ # Initialize transformer layers:
411
+ def _basic_init(module):
412
+ if isinstance(module, nn.Linear):
413
+ torch.nn.init.xavier_uniform_(module.weight)
414
+ if module.bias is not None:
415
+ nn.init.constant_(module.bias, 0)
416
+ self.apply(_basic_init)
417
+
418
+ # Initialize timestep embedding MLP:
419
+ nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
420
+ nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
421
+
422
+ # Zero-out adaLN modulation layers in DiT blocks:
423
+ for block in self.blocks:
424
+ nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
425
+ nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
426
+
427
+ # Zero-out output layers:
428
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
429
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
430
+ nn.init.constant_(self.final_layer.linear.weight, 0)
431
+ nn.init.constant_(self.final_layer.linear.bias, 0)
432
+
433
+ def _init_cuda_graph_chunk(self):
434
+ # get dtype, device from registered buffer
435
+ dtype, device = self.cnn_cache_buffer.dtype, self.cnn_cache_buffer.device
436
+ # init cuda graph for streaming forward
437
+ with torch.no_grad():
438
+ for chunk_size in [30, 48, 96]:
439
+ if chunk_size == 30 or chunk_size == 48:
440
+ max_size = 500
441
+ self.max_size_chunk[chunk_size] = max_size
442
+ else:
443
+ max_size = 1000
444
+ self.max_size_chunk[chunk_size] = max_size
445
+ static_x1 = torch.zeros((2, 320, chunk_size), dtype=dtype, device=device)
446
+ static_t1 = torch.zeros((2, 1, 512), dtype=dtype, device=device)
447
+ static_mask1 = torch.ones((2, chunk_size, max_size+chunk_size), dtype=torch.bool, device=device)
448
+ static_att_cache = torch.zeros((16, 2, 8, max_size, 128), dtype=dtype, device=device)
449
+ static_cnn_cache = torch.zeros((16, 2, 1024, 2), dtype=dtype, device=device)
450
+ static_inputs1 = [
451
+ static_x1,
452
+ static_t1,
453
+ static_mask1,
454
+ static_cnn_cache,
455
+ static_att_cache,
456
+ ]
457
+ static_new_cnn_cache = torch.zeros((16, 2, 1024, 2), dtype=dtype, device=device)
458
+ static_new_att_cache = torch.zeros((16, 2, 8, max_size+chunk_size, 128), dtype=dtype, device=device)
459
+ self.blocks_forward_chunk(
460
+ static_inputs1[0],
461
+ static_inputs1[1],
462
+ static_inputs1[2],
463
+ static_inputs1[3],
464
+ static_inputs1[4],
465
+ static_new_cnn_cache,
466
+ static_new_att_cache)
467
+ graph_chunk = torch.cuda.CUDAGraph()
468
+ with torch.cuda.graph(graph_chunk):
469
+ static_out1 = self.blocks_forward_chunk(static_x1, static_t1, static_mask1, static_cnn_cache, static_att_cache, static_new_cnn_cache, static_new_att_cache)
470
+ static_outputs1 = [static_out1, static_new_cnn_cache, static_new_att_cache]
471
+ self.inference_buffers_chunk[chunk_size] = {
472
+ 'static_inputs': static_inputs1,
473
+ 'static_outputs': static_outputs1
474
+ }
475
+ self.graph_chunk[chunk_size] = graph_chunk
476
+
477
+ def _init_cuda_graph_all(self):
478
+ self._init_cuda_graph_chunk()
479
+ self.use_cuda_graph = True
480
+ print(f"CUDA Graph initialized successfully for chunk decoder")
481
+
482
+ def forward(self, x, mask, mu, t, spks=None, cond=None):
483
+ """Args:
484
+ x: shape (b, c, t)
485
+ mask: shape (b, 1, t)
486
+ t: shape (b,)
487
+ spks: shape (b, c)
488
+ cond: shape (b, c, t)
489
+ """
490
+ # (sfy) chunk training strategy should not be open-sourced
491
+
492
+ # time
493
+ t = self.t_embedder(t).unsqueeze(1) # (b, 1, c)
494
+ x = pack([x, mu], "b * t")[0]
495
+ if spks is not None:
496
+ spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
497
+ x = pack([x, spks], "b * t")[0]
498
+ if cond is not None:
499
+ x = pack([x, cond], "b * t")[0]
500
+
501
+ return self.blocks_forward(x, t, mask)
502
+
503
+ def blocks_forward(self, x, t, mask):
504
+ x = x.transpose(1, 2)
505
+ attn_mask = mask.bool()
506
+ x = self.in_proj(x)
507
+ for block in self.blocks:
508
+ x = block(x, t, attn_mask)
509
+ x = self.final_layer(x, t)
510
+ x = x.transpose(1, 2)
511
+ return x
512
+
513
+ def forward_chunk(self,
514
+ x: torch.Tensor,
515
+ mu: torch.Tensor,
516
+ t: torch.Tensor,
517
+ spks: torch.Tensor,
518
+ cond: torch.Tensor,
519
+ cnn_cache: torch.Tensor = None,
520
+ att_cache: torch.Tensor = None,
521
+ ):
522
+ """
523
+ Args:
524
+ x: shape (b, dt, c)
525
+ mu: shape (b, dt, c)
526
+ t: shape (b,)
527
+ spks: shape (b, c)
528
+ cond: shape (b, dt, c)
529
+ cnn_cache: shape (depth, b, c1+c2, 2)
530
+ att_cache: shape (depth, b, nh, t, c * 2)
531
+ """
532
+
533
+ # time
534
+ t = self.t_embedder(t).unsqueeze(1) # (b, 1, c)
535
+ x = pack([x, mu], "b * t")[0]
536
+ if spks is not None:
537
+ spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
538
+ x = pack([x, spks], "b * t")[0]
539
+ if cond is not None:
540
+ x = pack([x, cond], "b * t")[0]
541
+
542
+ # create fake cache
543
+ if cnn_cache is None:
544
+ cnn_cache = [None] * len(self.blocks)
545
+ if att_cache is None:
546
+ att_cache = [None] * len(self.blocks)
547
+ if att_cache[0] is not None:
548
+ last_att_len = att_cache.shape[3]
549
+ else:
550
+ last_att_len = 0
551
+ chunk_size = x.shape[2]
552
+ mask = torch.ones(x.shape[0], chunk_size, last_att_len+chunk_size, dtype=torch.bool, device=x.device)
553
+ if self.use_cuda_graph and att_cache[0] is not None and chunk_size in self.graph_chunk and last_att_len <= self.max_size_chunk[chunk_size]:
554
+ padded_mask = torch.zeros((2, chunk_size, self.max_size_chunk[chunk_size]+chunk_size), dtype=mask.dtype, device=mask.device)
555
+ padded_mask[:, :, :mask.shape[-1]] = mask
556
+ padded_att_cache = torch.zeros((16, 2, 8, self.max_size_chunk[chunk_size], 128), dtype=att_cache.dtype, device=att_cache.device)
557
+ padded_att_cache[:, :, :, :last_att_len, :] = att_cache
558
+ self.inference_buffers_chunk[chunk_size]['static_inputs'][0].copy_(x)
559
+ self.inference_buffers_chunk[chunk_size]['static_inputs'][1].copy_(t)
560
+ self.inference_buffers_chunk[chunk_size]['static_inputs'][2].copy_(padded_mask)
561
+ self.inference_buffers_chunk[chunk_size]['static_inputs'][3].copy_(cnn_cache)
562
+ self.inference_buffers_chunk[chunk_size]['static_inputs'][4].copy_(padded_att_cache)
563
+ self.graph_chunk[chunk_size].replay()
564
+ x = self.inference_buffers_chunk[chunk_size]['static_outputs'][0][:, :, :chunk_size]
565
+ new_cnn_cache = self.inference_buffers_chunk[chunk_size]['static_outputs'][1]
566
+ new_att_cache = self.inference_buffers_chunk[chunk_size]['static_outputs'][2][:, :, :, :chunk_size+last_att_len, :]
567
+ else:
568
+ mask = None
569
+ x = self.blocks_forward_chunk(x, t, mask, cnn_cache, att_cache, self.cnn_cache_buffer, self.att_cache_buffer)
570
+ new_cnn_cache = self.cnn_cache_buffer
571
+ new_att_cache = self.att_cache_buffer[:, :, :, :last_att_len+chunk_size, :]
572
+
573
+ return x, new_cnn_cache, new_att_cache
574
+
575
+ def blocks_forward_chunk(self, x, t, mask, cnn_cache=None, att_cache=None, cnn_cache_buffer=None, att_cache_buffer=None):
576
+ x = x.transpose(1, 2)
577
+ x = self.in_proj(x)
578
+ for b_idx, block in enumerate(self.blocks):
579
+ x, this_new_cnn_cache, this_new_att_cache \
580
+ = block.forward_chunk(x, t, cnn_cache[b_idx], att_cache[b_idx], mask)
581
+ cnn_cache_buffer[b_idx] = this_new_cnn_cache
582
+ att_cache_buffer[b_idx][:, :, :this_new_att_cache.shape[2], :] = this_new_att_cache
583
+ x = self.final_layer(x, t)
584
+ x = x.transpose(1, 2)
585
+ return x
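The file above defines the chunk-aware DiT estimator used by the flow-matching decoder: each DiTBlock applies adaLN-Zero-modulated attention, a causal convolution (the DiT-v5 addition meant to recover high-frequency detail), and an MLP. A minimal usage sketch of the non-streaming forward pass follows; the hyper-parameters (in_channels=320, hidden_size=512, depth=16, out_channels=80) are assumptions read off the CUDA-graph buffer shapes above, not values confirmed by any config in this commit.

import torch
from cosyvoice2.flow.decoder_dit import DiT

# Hypothetical hyper-parameters inferred from the static buffers above
# (x buffer (2, 320, T), t buffer (2, 1, 512), 16 cached blocks, d_k*2 = 128).
estimator = DiT(in_channels=320, out_channels=80, depth=16,
                num_heads=8, head_dim=64, hidden_size=512).eval()

b, t_len = 1, 120
x    = torch.randn(b, 80, t_len)    # noisy mel at the current flow step
mu   = torch.randn(b, 80, t_len)    # encoder output projected to mel dim
spks = torch.randn(b, 80)           # projected speaker embedding
cond = torch.zeros(b, 80, t_len)    # prompt-mel condition (zeros when absent)
mask = torch.ones(b, t_len, t_len, dtype=torch.bool)
t    = torch.rand(b)                # flow time in [0, 1]

with torch.no_grad():
    v = estimator(x, mask, mu, t, spks=spks, cond=cond)  # predicted velocity field
print(v.shape)  # torch.Size([1, 80, 120])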
cosyvoice2/flow/flow.py ADDED
@@ -0,0 +1,225 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+ import torch.nn as nn
16
+ from torch.nn import functional as F
17
+
18
+ from cosyvoice2.utils.mask import make_pad_mask
19
+ from cosyvoice2.flow.flow_matching import CausalConditionalCFM
20
+ from cosyvoice2.transformer.upsample_encoder_v2 import UpsampleConformerEncoderV2
21
+
22
+
23
+ class CausalMaskedDiffWithXvec(torch.nn.Module):
24
+ def __init__(self,
25
+ input_size: int = 512,
26
+ output_size: int = 80,
27
+ spk_embed_dim: int = 192,
28
+ output_type: str = "mel",
29
+ vocab_size: int = 5121,
30
+ encoder: UpsampleConformerEncoderV2 = None,
31
+ decoder: CausalConditionalCFM = None,
32
+ input_embedding: torch.nn.Module = None,
33
+ ):
34
+ super().__init__()
35
+ self.input_size = input_size
36
+ self.output_size = output_size
37
+ self.vocab_size = vocab_size
38
+ self.output_type = output_type
39
+ self.pre_lookahead_len = int(encoder.pre_lookahead_layer.pre_lookahead_len)
40
+ self.up_rate = int(encoder.up_layer.stride)
41
+ if input_embedding is None:
42
+ self.input_embedding = nn.Embedding(vocab_size, input_size)
43
+ else:
44
+ self.input_embedding = input_embedding
45
+ self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
46
+ self.encoder = encoder
47
+ self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
48
+ self.decoder = decoder
49
+
50
+ # xvec projection with CUDA Graph optimization
51
+ # Initialize CUDA Graph related state
52
+ self.enable_cuda_graph = False
53
+ self.static_embedding = None
54
+ self.static_output = None
55
+ self.graph = None
56
+ self.embedding_shape = None
57
+
58
+ def scatter_cuda_graph(self, enable_cuda_graph: bool):
59
+ self.enable_cuda_graph = enable_cuda_graph
60
+ if self.enable_cuda_graph:
61
+ # self.encoder.scatter_cuda_graph(enable_cuda_graph)
62
+ self.decoder.scatter_cuda_graph(enable_cuda_graph)
63
+
64
+ @torch.inference_mode()
65
+ def inference(self,
66
+ token,
67
+ token_len,
68
+ prompt_token,
69
+ prompt_token_len,
70
+ prompt_feat,
71
+ prompt_feat_len,
72
+ embedding,
73
+ n_timesteps: int = 10,
74
+ ):
75
+ assert token.shape[0] == 1
76
+
77
+ # xvec projection
78
+ embedding = F.normalize(embedding, dim=1)
79
+ embedding = self.spk_embed_affine_layer(embedding)
80
+
81
+ # concat text and prompt_text
82
+ token_len = prompt_token_len + token_len
83
+ token = torch.concat([prompt_token, token], dim=1)
84
+
85
+ mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
86
+ token = self.input_embedding(torch.clamp(token, min=0)) * mask
87
+
88
+ # token encode
89
+ h, _ = self.encoder.forward(token, token_len)
90
+ h = self.encoder_proj(h)
91
+
92
+ # condition
93
+ mel_len1 = prompt_feat.shape[1]
94
+ mel_len2 = h.shape[1] - prompt_feat.shape[1]
95
+
96
+ conds = torch.zeros_like(h)
97
+ conds[:, :mel_len1] = prompt_feat
98
+ conds = conds.transpose(1, 2).contiguous()
99
+
100
+ mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
101
+
102
+ feat = self.decoder.forward(
103
+ mu=h.transpose(1, 2).contiguous(),
104
+ mask=mask.unsqueeze(1),
105
+ spks=embedding,
106
+ cond=conds,
107
+ n_timesteps=n_timesteps,
108
+ )
109
+
110
+ feat = feat[:, :, mel_len1:]
111
+ assert feat.shape[2] == mel_len2
112
+ return feat
113
+
114
+ @torch.inference_mode()
115
+ def setup_cache(self,
116
+ token: torch.Tensor,
117
+ mel: torch.Tensor,
118
+ spk: torch.Tensor,
119
+ n_timesteps: int = 10,
120
+ ):
121
+ """
122
+ Args:
123
+ token: shape (b, t), with look ahead tokens
124
+ mel: shape (b, t, c), ground-truth mel
125
+ spk: shape (b, 192), speaker embedding
126
+ Returns:
127
+ cache: dict {
128
+ 'conformer': {'cnn_cache': xxx, 'att_cache': xxx},
129
+ 'estimator': {'cnn_cache': xxx, 'att_cache': xxx}
130
+ }
131
+ """
132
+ # check if look ahead token included
133
+ assert (token.shape[1] - self.pre_lookahead_len) * self.up_rate == mel.shape[1], (token.shape, mel.shape)
134
+
135
+ # xvec projection
136
+ spk = F.normalize(spk, dim=1)
137
+ spk = self.spk_embed_affine_layer(spk)
138
+
139
+ token = self.input_embedding(token)
140
+ # NOTE encoder.forward_chunk will strip the look ahead part
141
+ h, conformer_cnn_cache, conformer_att_cache = self.encoder.forward_chunk(
142
+ xs = token,
143
+ last_chunk = False,
144
+ cnn_cache = None,
145
+ att_cache = None,
146
+ )
147
+ h = self.encoder_proj(h)
148
+
149
+ feat, estimator_cnn_cache, estimator_att_cache = self.decoder.forward_chunk(
150
+ mu = h.transpose(1, 2).contiguous(),
151
+ spks = spk,
152
+ cond = mel.transpose(1, 2).contiguous(),
153
+ n_timesteps = n_timesteps,
154
+ temperature = 1.0,
155
+ cnn_cache = None,
156
+ att_cache = None,
157
+ )
158
+
159
+ cache = {
160
+ 'conformer_cnn_cache': conformer_cnn_cache,
161
+ 'conformer_att_cache': conformer_att_cache,
162
+ 'estimator_cnn_cache': estimator_cnn_cache,
163
+ 'estimator_att_cache': estimator_att_cache,
164
+ }
165
+ return cache
166
+
167
+ @torch.inference_mode()
168
+ def inference_chunk(self,
169
+ token: torch.Tensor,
170
+ spk: torch.Tensor,
171
+ cache: dict,
172
+ last_chunk: bool = False,
173
+ n_timesteps: int = 10,
174
+ ):
175
+ """
176
+ Args:
177
+ token: shape (b, t), with look ahead tokens
178
+ spk: shape (b, 192), speaker embedding
179
+ cache: dict {
180
+ 'conformer_cnn_cache': xxx,
181
+ ...
182
+ }
183
+ """
184
+ # unpack cache
185
+ conformer_cnn_cache = cache['conformer_cnn_cache']
186
+ conformer_att_cache = cache['conformer_att_cache']
187
+ estimator_cnn_cache = cache['estimator_cnn_cache']
188
+ estimator_att_cache = cache['estimator_att_cache']
189
+
190
+ # xvec projection
191
+ spk = F.normalize(spk, dim=1)
192
+ spk = self.spk_embed_affine_layer(spk)
193
+
194
+ token = self.input_embedding(token)
195
+ # if not the last chunk, h is shorter than xs for a length of lookahead_length * stride (6)
196
+ h, conformer_cnn_cache, conformer_att_cache = self.encoder.forward_chunk(
197
+ xs = token,
198
+ last_chunk = last_chunk,
199
+ cnn_cache = conformer_cnn_cache,
200
+ att_cache = conformer_att_cache,
201
+ )
202
+ h = self.encoder_proj(h)
203
+
204
+ cond = torch.zeros_like(h)
205
+ # forward estimator
206
+ feat, estimator_cnn_cache, estimator_att_cache = self.decoder.forward_chunk(
207
+ mu = h.transpose(1, 2).contiguous(),
208
+ spks = spk,
209
+ cond = cond.transpose(1, 2).contiguous(),
210
+ n_timesteps = n_timesteps,
211
+ temperature = 1.0,
212
+ cnn_cache = estimator_cnn_cache,
213
+ att_cache = estimator_att_cache,
214
+ )
215
+
216
+
217
+ new_cache = {
218
+ 'conformer_cnn_cache': conformer_cnn_cache,
219
+ 'conformer_att_cache': conformer_att_cache,
220
+ 'estimator_cnn_cache': estimator_cnn_cache,
221
+ 'estimator_att_cache': estimator_att_cache,
222
+ }
223
+
224
+ return feat, new_cache
225
+
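CausalMaskedDiffWithXvec above exposes two paths: a one-shot inference() and a streaming pair setup_cache()/inference_chunk() that threads the conformer and estimator caches between chunks. The sketch below illustrates the streaming call pattern only; it assumes flow_model is an already-constructed CausalMaskedDiffWithXvec loaded from a checkpoint and token_chunks is a list of (1, t) speech-token tensors coming from the LLM, neither of which is shown in this file.

import torch

# `flow_model` and `token_chunks` are assumed to exist (see note above).
spk = torch.randn(1, 192)                        # raw x-vector
prompt_token = torch.randint(0, 5121, (1, 30))   # prompt tokens incl. look-ahead
prompt_mel_len = (30 - flow_model.pre_lookahead_len) * flow_model.up_rate
prompt_mel = torch.randn(1, prompt_mel_len, 80)  # ground-truth prompt mel

# 1) Prime the conformer / estimator caches on the prompt.
cache = flow_model.setup_cache(prompt_token, prompt_mel, spk, n_timesteps=10)

# 2) Stream speech-token chunks and collect mel chunks for the vocoder.
for i, chunk in enumerate(token_chunks):
    last = (i == len(token_chunks) - 1)
    mel_chunk, cache = flow_model.inference_chunk(
        chunk, spk, cache, last_chunk=last, n_timesteps=10)
    # mel_chunk: (1, 80, t_chunk), fed to the HiFi-GAN vocoder downstream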
cosyvoice2/flow/flow_matching.py ADDED
@@ -0,0 +1,205 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import List
15
+ import onnxruntime
16
+ import torch
17
+ import torch.nn.functional as F
18
+
19
+ from cosyvoice2.flow.decoder_dit import DiT
20
+ from cosyvoice2.utils.mask import make_pad_mask
21
+
22
+
23
+ """
24
+ Inference wrapper
25
+ """
26
+ class CausalConditionalCFM(torch.nn.Module):
27
+ def __init__(self, estimator: DiT, inference_cfg_rate:float=0.7):
28
+ super().__init__()
29
+ self.estimator = estimator
30
+ self.inference_cfg_rate = inference_cfg_rate
31
+ self.out_channels = estimator.out_channels
32
+ # a maximum of 600s
33
+ self.register_buffer('rand_noise', torch.randn([1, self.out_channels, 50 * 600]), persistent=False)
34
+
35
+ self.register_buffer('cnn_cache_buffer', torch.zeros(16, 16, 2, 1024, 2), persistent=False)
36
+ self.register_buffer('att_cache_buffer', torch.zeros(16, 16, 2, 8, 1000, 128), persistent=False)
37
+
38
+ def scatter_cuda_graph(self, enable_cuda_graph: bool):
39
+ if enable_cuda_graph:
40
+ self.estimator._init_cuda_graph_all()
41
+
42
+ def solve_euler(self, x, t_span, mu, mask, spks, cond):
43
+ """
44
+ Fixed-step Euler solver for the flow-matching ODE.
45
+ Args:
46
+ x (torch.Tensor): random noise
47
+ t_span (torch.Tensor): n_timesteps interpolated
48
+ shape: (n_timesteps + 1,)
49
+ mu (torch.Tensor): output of encoder
50
+ shape: (batch_size, n_feats, mel_timesteps)
51
+ mask (torch.Tensor): output_mask
52
+ shape: (batch_size, 1, mel_timesteps)
53
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
54
+ shape: (batch_size, spk_emb_dim)
55
+ cond: Not used but kept for future purposes
56
+ """
57
+ t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
58
+ t = t.unsqueeze(dim=0)
59
+ assert self.inference_cfg_rate > 0, 'inference_cfg_rate must be > 0'
60
+
61
+ # constant during denoising
62
+ mask_in = torch.cat([mask, mask], dim=0)
63
+ mu_in = torch.cat([mu, torch.zeros_like(mu)], dim=0)
64
+ spks_in = torch.cat([spks, torch.zeros_like(spks)], dim=0)
65
+ cond_in = torch.cat([cond, torch.zeros_like(cond)], dim=0)
66
+
67
+ for step in range(1, len(t_span)):
68
+
69
+ x_in = torch.cat([x, x], dim=0)
70
+ t_in = torch.cat([t, t], dim=0)
71
+
72
+ dphi_dt = self.estimator.forward(
73
+ x_in,
74
+ mask_in,
75
+ mu_in,
76
+ t_in,
77
+ spks_in,
78
+ cond_in,
79
+ )
80
+ dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)
81
+ dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt)
82
+ x = x + dt * dphi_dt
83
+ t = t + dt
84
+ if step < len(t_span) - 1:
85
+ dt = t_span[step + 1] - t
86
+
87
+ return x
88
+
89
+ @torch.inference_mode()
90
+ def forward(self, mu, mask, spks, cond, n_timesteps=10, temperature=1.0):
91
+ z = self.rand_noise[:, :, :mu.size(2)] * temperature
92
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
93
+ # cosine scheduling
94
+ t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
95
+ return self.solve_euler(z, t_span, mu, mask, spks, cond)
96
+
97
+ def solve_euler_chunk(self,
98
+ x:torch.Tensor,
99
+ t_span:torch.Tensor,
100
+ mu:torch.Tensor,
101
+ spks:torch.Tensor,
102
+ cond:torch.Tensor,
103
+ cnn_cache:torch.Tensor=None,
104
+ att_cache:torch.Tensor=None,
105
+ ):
106
+ """
107
+ Fixed-step Euler solver for the flow-matching ODE (chunk/streaming variant).
108
+ Args:
109
+ x (torch.Tensor): random noise
110
+ t_span (torch.Tensor): n_timesteps interpolated
111
+ shape: (n_timesteps + 1,)
112
+ mu (torch.Tensor): output of encoder
113
+ shape: (batch_size, n_feats, mel_timesteps)
114
+ mask (torch.Tensor): output_mask
115
+ shape: (batch_size, 1, mel_timesteps)
116
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
117
+ shape: (batch_size, spk_emb_dim)
118
+ cond (torch.Tensor): condition features (prompt mel), shape (batch_size, n_feats, mel_timesteps)
119
+ cnn_cache: shape (n_time, depth, b, c1+c2, 2)
120
+ att_cache: shape (n_time, depth, b, nh, t, c * 2)
121
+ """
122
+ assert self.inference_cfg_rate > 0, 'cfg rate should be > 0'
123
+
124
+ t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
125
+ t = t.unsqueeze(dim=0) # (b,)
126
+
127
+ # setup initial cache
128
+ if cnn_cache is None:
129
+ cnn_cache = [None for _ in range(len(t_span)-1)]
130
+ if att_cache is None:
131
+ att_cache = [None for _ in range(len(t_span)-1)]
132
+ # next chunk's cache at each timestep
133
+
134
+ if att_cache[0] is not None:
135
+ last_att_len = att_cache.shape[4]
136
+ else:
137
+ last_att_len = 0
138
+
139
+ # constant during denoising
140
+ mu_in = torch.cat([mu, torch.zeros_like(mu)], dim=0)
141
+ spks_in = torch.cat([spks, torch.zeros_like(spks)], dim=0)
142
+ cond_in = torch.cat([cond, torch.zeros_like(cond)], dim=0)
143
+ for step in range(1, len(t_span)):
144
+ # torch.cuda.memory._record_memory_history(max_entries=100000)
145
+ # torch.cuda.memory._record_memory_history(max_entries=100000)
146
+ this_att_cache = att_cache[step-1]
147
+ this_cnn_cache = cnn_cache[step-1]
148
+
149
+ dphi_dt, this_new_cnn_cache, this_new_att_cache = self.estimator.forward_chunk(
150
+ x = x.repeat(2, 1, 1),
151
+ mu = mu_in,
152
+ t = t.repeat(2),
153
+ spks = spks_in,
154
+ cond = cond_in,
155
+ cnn_cache = this_cnn_cache,
156
+ att_cache = this_att_cache,
157
+ )
158
+ dphi_dt, cfg_dphi_dt = dphi_dt.chunk(2, dim=0)
159
+ dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt)
160
+ x = x + dt * dphi_dt
161
+ t = t + dt
162
+ if step < len(t_span) - 1:
163
+ dt = t_span[step + 1] - t
164
+
165
+ self.cnn_cache_buffer[step-1] = this_new_cnn_cache
166
+ self.att_cache_buffer[step-1][:, :, :, :x.shape[2]+last_att_len, :] = this_new_att_cache
167
+
168
+ cnn_cache = self.cnn_cache_buffer
169
+ att_cache = self.att_cache_buffer[:, :, :, :, :x.shape[2]+last_att_len, :]
170
+ return x, cnn_cache, att_cache
171
+
172
+ @torch.inference_mode()
173
+ def forward_chunk(self,
174
+ mu:torch.Tensor,
175
+ spks:torch.Tensor,
176
+ cond:torch.Tensor,
177
+ n_timesteps:int=10,
178
+ temperature:float=1.0,
179
+ cnn_cache:torch.Tensor=None,
180
+ att_cache:torch.Tensor=None,
181
+ ):
182
+ """
183
+ Args:
184
+ mu(torch.Tensor): shape (b, c, t)
185
+ spks(torch.Tensor): shape (b, 192)
186
+ cond(torch.Tensor): shape (b, c, t)
187
+ cnn_cache: shape (n_time, depth, b, c1+c2, 2)
188
+ att_cache: shape (n_time, depth, b, nh, t, c * 2)
189
+ """
190
+ # get offset from att_cache
191
+ offset = att_cache.shape[4] if att_cache is not None else 0
192
+ z = self.rand_noise[:, :, offset:offset+mu.size(2)] * temperature
193
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
194
+ # cosine scheduling
195
+ t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
196
+ x, new_cnn_cache, new_att_cache = self.solve_euler_chunk(
197
+ x=z,
198
+ t_span=t_span,
199
+ mu=mu,
200
+ spks=spks,
201
+ cond=cond,
202
+ att_cache=att_cache,
203
+ cnn_cache=cnn_cache,
204
+ )
205
+ return x, new_cnn_cache, new_att_cache
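To make the cosine time schedule and the classifier-free-guidance arithmetic in solve_euler concrete, here is a small self-contained numeric sketch of one update step; the conditional and unconditional estimator outputs are random stand-ins rather than real DiT calls.

import torch

n_timesteps, cfg_rate = 10, 0.7
t_span = torch.linspace(0, 1, n_timesteps + 1)
t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)     # cosine schedule: finer steps near t=0

x = torch.randn(1, 80, 120)                          # current noisy mel
dt = t_span[1] - t_span[0]

# Stand-ins for the two halves of the batched estimator output.
dphi_cond, dphi_uncond = torch.randn_like(x), torch.randn_like(x)
dphi = (1.0 + cfg_rate) * dphi_cond - cfg_rate * dphi_uncond
x = x + dt * dphi                                    # one Euler step of the flow ODE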
cosyvoice2/transformer/__init__.py ADDED
File without changes
cosyvoice2/transformer/attention.py ADDED
@@ -0,0 +1,328 @@
1
+ # Copyright (c) 2019 Shigeki Karita
2
+ # 2020 Mobvoi Inc (Binbin Zhang)
3
+ # 2022 Xingchen Song ([email protected])
4
+ # 2024 Alibaba Inc (Xiang Lyu)
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ """Multi-Head Attention layer definition."""
18
+
19
+ import math
20
+ from typing import Tuple
21
+
22
+ import torch
23
+ from torch import nn
24
+
25
+
26
+ class MultiHeadedAttention(nn.Module):
27
+ """Multi-Head Attention layer.
28
+
29
+ Args:
30
+ n_head (int): The number of heads.
31
+ n_feat (int): The number of features.
32
+ dropout_rate (float): Dropout rate.
33
+
34
+ """
35
+
36
+ def __init__(self,
37
+ n_head: int,
38
+ n_feat: int,
39
+ dropout_rate: float,
40
+ key_bias: bool = True):
41
+ """Construct an MultiHeadedAttention object."""
42
+ super().__init__()
43
+ assert n_feat % n_head == 0
44
+ # We assume d_v always equals d_k
45
+ self.d_k = n_feat // n_head
46
+ self.h = n_head
47
+ self.linear_q = nn.Linear(n_feat, n_feat)
48
+ self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias)
49
+ self.linear_v = nn.Linear(n_feat, n_feat)
50
+ self.linear_out = nn.Linear(n_feat, n_feat)
51
+ self.dropout = nn.Dropout(p=dropout_rate)
52
+
53
+ def forward_qkv(
54
+ self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
55
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
56
+ """Transform query, key and value.
57
+
58
+ Args:
59
+ query (torch.Tensor): Query tensor (#batch, time1, size).
60
+ key (torch.Tensor): Key tensor (#batch, time2, size).
61
+ value (torch.Tensor): Value tensor (#batch, time2, size).
62
+
63
+ Returns:
64
+ torch.Tensor: Transformed query tensor, size
65
+ (#batch, n_head, time1, d_k).
66
+ torch.Tensor: Transformed key tensor, size
67
+ (#batch, n_head, time2, d_k).
68
+ torch.Tensor: Transformed value tensor, size
69
+ (#batch, n_head, time2, d_k).
70
+
71
+ """
72
+ n_batch = query.size(0)
73
+ q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
74
+ k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
75
+ v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
76
+ q = q.transpose(1, 2) # (batch, head, time1, d_k)
77
+ k = k.transpose(1, 2) # (batch, head, time2, d_k)
78
+ v = v.transpose(1, 2) # (batch, head, time2, d_k)
79
+
80
+ return q, k, v
81
+
82
+ def forward_attention(
83
+ self,
84
+ value: torch.Tensor,
85
+ scores: torch.Tensor,
86
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool)
87
+ ) -> torch.Tensor:
88
+ """Compute attention context vector.
89
+
90
+ Args:
91
+ value (torch.Tensor): Transformed value, size
92
+ (#batch, n_head, time2, d_k).
93
+ scores (torch.Tensor): Attention score, size
94
+ (#batch, n_head, time1, time2).
95
+ mask (torch.Tensor): Mask, size (#batch, 1, time2) or
96
+ (#batch, time1, time2), (0, 0, 0) means fake mask.
97
+
98
+ Returns:
99
+ torch.Tensor: Transformed value (#batch, time1, d_model)
100
+ weighted by the attention score (#batch, time1, time2).
101
+
102
+ """
103
+ n_batch = value.size(0)
104
+ # NOTE(xcsong): When will `if mask.size(2) > 0` be True?
105
+ # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the
106
+ # 1st chunk to ease the onnx export.]
107
+ # 2. pytorch training
108
+ if mask.size(2) > 0: # time2 > 0
109
+ mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2)
110
+ # For last chunk, time2 might be larger than scores.size(-1)
111
+ mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2)
112
+ scores = scores.masked_fill(mask, -float('inf'))
113
+ attn = torch.softmax(scores, dim=-1).masked_fill(
114
+ mask, 0.0) # (batch, head, time1, time2)
115
+ # NOTE(xcsong): When will `if mask.size(2) > 0` be False?
116
+ # 1. onnx(16/-1, -1/-1, 16/0)
117
+ # 2. jit (16/-1, -1/-1, 16/0, 16/4)
118
+ else:
119
+ attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
120
+
121
+ p_attn = self.dropout(attn)
122
+ x = torch.matmul(p_attn, value) # (batch, head, time1, d_k)
123
+ x = (x.transpose(1, 2).contiguous().view(n_batch, -1,
124
+ self.h * self.d_k)
125
+ ) # (batch, time1, d_model)
126
+
127
+ return self.linear_out(x) # (batch, time1, d_model)
128
+
129
+ def forward(
130
+ self,
131
+ query: torch.Tensor,
132
+ key: torch.Tensor,
133
+ value: torch.Tensor,
134
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
135
+ pos_emb: torch.Tensor = torch.empty(0),
136
+ cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
137
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
138
+ """Compute scaled dot product attention.
139
+
140
+ Args:
141
+ query (torch.Tensor): Query tensor (#batch, time1, size).
142
+ key (torch.Tensor): Key tensor (#batch, time2, size).
143
+ value (torch.Tensor): Value tensor (#batch, time2, size).
144
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
145
+ (#batch, time1, time2).
146
+ 1.When applying cross attention between decoder and encoder,
147
+ the batch padding mask for input is in (#batch, 1, T) shape.
148
+ 2.When applying self attention of encoder,
149
+ the mask is in (#batch, T, T) shape.
150
+ 3.When applying self attention of decoder,
151
+ the mask is in (#batch, L, L) shape.
152
+ 4.If the different position in decoder see different block
153
+ of the encoder, such as Mocha, the passed in mask could be
154
+ in (#batch, L, T) shape. But there is no such case in current
155
+ CosyVoice.
156
+ cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
157
+ where `cache_t == chunk_size * num_decoding_left_chunks`
158
+ and `head * d_k == size`
159
+
160
+
161
+ Returns:
162
+ torch.Tensor: Output tensor (#batch, time1, d_model).
163
+ torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
164
+ where `cache_t == chunk_size * num_decoding_left_chunks`
165
+ and `head * d_k == size`
166
+
167
+ """
168
+ q, k, v = self.forward_qkv(query, key, value)
169
+
170
+ # NOTE(xcsong):
171
+ # when export onnx model, for 1st chunk, we feed
172
+ # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
173
+ # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
174
+ # In all modes, `if cache.size(0) > 0` will always be `True`
175
+ # and we will always do splitting and
176
+ # concatenation (this will simplify onnx export). Note that
177
+ # it's OK to concat & split zero-shaped tensors(see code below).
178
+ # when export jit model, for 1st chunk, we always feed
179
+ # cache(0, 0, 0, 0) since jit supports dynamic if-branch.
180
+ # >>> a = torch.ones((1, 2, 0, 4))
181
+ # >>> b = torch.ones((1, 2, 3, 4))
182
+ # >>> c = torch.cat((a, b), dim=2)
183
+ # >>> torch.equal(b, c) # True
184
+ # >>> d = torch.split(a, 2, dim=-1)
185
+ # >>> torch.equal(d[0], d[1]) # True
186
+ if cache.size(0) > 0:
187
+ key_cache, value_cache = torch.split(cache,
188
+ cache.size(-1) // 2,
189
+ dim=-1)
190
+ k = torch.cat([key_cache, k], dim=2)
191
+ v = torch.cat([value_cache, v], dim=2)
192
+ # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
193
+ # non-trivial to calculate `next_cache_start` here.
194
+ new_cache = torch.cat((k, v), dim=-1)
195
+
196
+ scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
197
+ return self.forward_attention(v, scores, mask), new_cache
198
+
199
+
200
+ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
201
+ """Multi-Head Attention layer with relative position encoding.
202
+ Paper: https://arxiv.org/abs/1901.02860
203
+ Args:
204
+ n_head (int): The number of heads.
205
+ n_feat (int): The number of features.
206
+ dropout_rate (float): Dropout rate.
207
+ """
208
+
209
+ def __init__(self,
210
+ n_head: int,
211
+ n_feat: int,
212
+ dropout_rate: float,
213
+ key_bias: bool = True):
214
+ """Construct an RelPositionMultiHeadedAttention object."""
215
+ super().__init__(n_head, n_feat, dropout_rate, key_bias)
216
+ # linear transformation for positional encoding
217
+ self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
218
+ # these two learnable bias are used in matrix c and matrix d
219
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
220
+ self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
221
+ self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
222
+ torch.nn.init.xavier_uniform_(self.pos_bias_u)
223
+ torch.nn.init.xavier_uniform_(self.pos_bias_v)
224
+
225
+ def rel_shift(self, x: torch.Tensor) -> torch.Tensor:
226
+ """Compute relative positional encoding.
227
+
228
+ Args:
229
+ x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
230
+ time1 means the length of query vector.
231
+
232
+ Returns:
233
+ torch.Tensor: Output tensor.
234
+
235
+ """
236
+ zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1),
237
+ device=x.device,
238
+ dtype=x.dtype)
239
+ x_padded = torch.cat([zero_pad, x], dim=-1)
240
+
241
+ x_padded = x_padded.view(x.size()[0],
242
+ x.size()[1],
243
+ x.size(3) + 1, x.size(2))
244
+ x = x_padded[:, :, 1:].view_as(x)[
245
+ :, :, :, : x.size(-1) // 2 + 1
246
+ ] # only keep the positions from 0 to time2
247
+ return x
248
+
249
+ def forward(
250
+ self,
251
+ query: torch.Tensor,
252
+ key: torch.Tensor,
253
+ value: torch.Tensor,
254
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
255
+ pos_emb: torch.Tensor = torch.empty(0),
256
+ cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
257
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
258
+ """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
259
+ Args:
260
+ query (torch.Tensor): Query tensor (#batch, time1, size).
261
+ key (torch.Tensor): Key tensor (#batch, time2, size).
262
+ value (torch.Tensor): Value tensor (#batch, time2, size).
263
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
264
+ (#batch, time1, time2), (0, 0, 0) means fake mask.
265
+ pos_emb (torch.Tensor): Positional embedding tensor
266
+ (#batch, time2, size).
267
+ cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
268
+ where `cache_t == chunk_size * num_decoding_left_chunks`
269
+ and `head * d_k == size`
270
+ Returns:
271
+ torch.Tensor: Output tensor (#batch, time1, d_model).
272
+ torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
273
+ where `cache_t == chunk_size * num_decoding_left_chunks`
274
+ and `head * d_k == size`
275
+ """
276
+ q, k, v = self.forward_qkv(query, key, value)
277
+ q = q.transpose(1, 2) # (batch, time1, head, d_k)
278
+
279
+ # NOTE(xcsong):
280
+ # when export onnx model, for 1st chunk, we feed
281
+ # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
282
+ # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
283
+ # In all modes, `if cache.size(0) > 0` will always be `True`
284
+ # and we will always do splitting and
285
+ # concatenation (this will simplify onnx export). Note that
286
+ # it's OK to concat & split zero-shaped tensors(see code below).
287
+ # when export jit model, for 1st chunk, we always feed
288
+ # cache(0, 0, 0, 0) since jit supports dynamic if-branch.
289
+ # >>> a = torch.ones((1, 2, 0, 4))
290
+ # >>> b = torch.ones((1, 2, 3, 4))
291
+ # >>> c = torch.cat((a, b), dim=2)
292
+ # >>> torch.equal(b, c) # True
293
+ # >>> d = torch.split(a, 2, dim=-1)
294
+ # >>> torch.equal(d[0], d[1]) # True
295
+ if cache is not None and cache.size(0) > 0:
296
+ key_cache, value_cache = torch.split(cache, cache.size(-1) // 2, dim=-1)
297
+ k = torch.cat([key_cache, k], dim=2)
298
+ v = torch.cat([value_cache, v], dim=2)
299
+ # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
300
+ # non-trivial to calculate `next_cache_start` here.
301
+ new_cache = torch.cat((k, v), dim=-1)
302
+
303
+ n_batch_pos = pos_emb.size(0)
304
+ p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
305
+ p = p.transpose(1, 2) # (batch, head, time1, d_k)
306
+
307
+ # (batch, head, time1, d_k)
308
+ q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
309
+ # (batch, head, time1, d_k)
310
+ q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)
311
+
312
+ # compute attention score
313
+ # first compute matrix a and matrix c
314
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
315
+ # (batch, head, time1, time2)
316
+ matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))
317
+
318
+ # compute matrix b and matrix d
319
+ # (batch, head, time1, time2)
320
+ matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
321
+ # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used
322
+ if matrix_ac.shape != matrix_bd.shape:
323
+ matrix_bd = self.rel_shift(matrix_bd)
324
+
325
+ scores = (matrix_ac + matrix_bd) / math.sqrt(
326
+ self.d_k) # (batch, head, time1, time2)
327
+
328
+ return self.forward_attention(v, scores, mask), new_cache
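The comments above describe feeding a zero-length cache on the first chunk and concatenating it onto the fresh keys/values. A short sketch of that incremental pattern with MultiHeadedAttention (8 heads and 512 features are illustrative sizes, not values fixed by this commit):

import torch
from cosyvoice2.transformer.attention import MultiHeadedAttention

attn = MultiHeadedAttention(n_head=8, n_feat=512, dropout_rate=0.0).eval()

x1 = torch.randn(1, 16, 512)
cache = torch.zeros(1, 8, 0, 128)          # (1, head, 0, d_k * 2): empty first-chunk cache
y1, cache = attn(x1, x1, x1, cache=cache)  # cache now covers 16 frames

x2 = torch.randn(1, 4, 512)                # next 4-frame chunk
y2, cache = attn(x2, x2, x2, cache=cache)  # keys/values now cover 20 frames
print(y2.shape, cache.shape)               # torch.Size([1, 4, 512]) torch.Size([1, 8, 20, 128])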
cosyvoice2/transformer/embedding.py ADDED
@@ -0,0 +1,119 @@
1
+ # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
2
+ # 2024 Alibaba Inc (Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Modified from ESPnet(https://github.com/espnet/espnet)
16
+ """Positonal Encoding Module."""
17
+
18
+ import math
19
+ from typing import Tuple, Union
20
+
21
+ import torch
22
+ import torch.nn.functional as F
23
+ import numpy as np
24
+
25
+
26
+ class EspnetRelPositionalEncoding(torch.nn.Module):
27
+ """Relative positional encoding module (new implementation).
28
+
29
+ Details can be found in https://github.com/espnet/espnet/pull/2816.
30
+
31
+ See : Appendix B in https://arxiv.org/abs/1901.02860
32
+
33
+ Args:
34
+ d_model (int): Embedding dimension.
35
+ dropout_rate (float): Dropout rate.
36
+ max_len (int): Maximum input length.
37
+
38
+ """
39
+
40
+ def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
41
+ """Construct an PositionalEncoding object."""
42
+ super(EspnetRelPositionalEncoding, self).__init__()
43
+ self.d_model = d_model
44
+ self.xscale = math.sqrt(self.d_model)
45
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
46
+ self.pe = None
47
+ self.extend_pe(torch.tensor(0.0).expand(1, max_len))
48
+
49
+ def extend_pe(self, x: torch.Tensor):
50
+ """Reset the positional encodings."""
51
+ if self.pe is not None:
52
+ # self.pe contains both positive and negative parts
53
+ # the length of self.pe is 2 * input_len - 1
54
+ if self.pe.size(1) >= x.size(1) * 2 - 1:
55
+ if self.pe.dtype != x.dtype or self.pe.device != x.device:
56
+ self.pe = self.pe.to(dtype=x.dtype, device=x.device)
57
+ return
58
+ # Suppose `i` is the position of the query vector and `j` the
59
+ # position of the key vector. We use positive relative positions when keys
60
+ # are to the left (i>j) and negative relative positions otherwise (i<j).
61
+ pe_positive = torch.zeros(x.size(1), self.d_model)
62
+ pe_negative = torch.zeros(x.size(1), self.d_model)
63
+ position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
64
+ div_term = torch.exp(
65
+ torch.arange(0, self.d_model, 2, dtype=torch.float32)
66
+ * -(math.log(10000.0) / self.d_model)
67
+ )
68
+ pe_positive[:, 0::2] = torch.sin(position * div_term)
69
+ pe_positive[:, 1::2] = torch.cos(position * div_term)
70
+ pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
71
+ pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
72
+
73
+ # Reverse the order of positive indices and concat both positive and
74
+ # negative indices. This is used to support the shifting trick
75
+ # as in https://arxiv.org/abs/1901.02860
76
+ pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
77
+ pe_negative = pe_negative[1:].unsqueeze(0)
78
+ pe = torch.cat([pe_positive, pe_negative], dim=1)
79
+ self.pe = pe.to(device=x.device, dtype=x.dtype)
80
+
81
+ def forward(self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0) \
82
+ -> Tuple[torch.Tensor, torch.Tensor]:
83
+ """Add positional encoding.
84
+
85
+ Args:
86
+ x (torch.Tensor): Input tensor (batch, time, `*`).
87
+
88
+ Returns:
89
+ torch.Tensor: Encoded tensor (batch, time, `*`).
90
+
91
+ """
92
+ self.extend_pe(x)
93
+ x = x * self.xscale
94
+ pos_emb = self.position_encoding(size=x.size(1), offset=offset)
95
+ return self.dropout(x), self.dropout(pos_emb)
96
+
97
+ def position_encoding(self,
98
+ offset: Union[int, torch.Tensor],
99
+ size: int) -> torch.Tensor:
100
+ """ For getting encoding in a streaming fashion
101
+
102
+ Attention:
103
+ we apply dropout only once at the whole utterance level in a non-
104
+ streaming way, but will call this function several times with
105
+ increasing input size in a streaming scenario, so the dropout will
106
+ be applied several times.
107
+
108
+ Args:
109
+ offset (int or torch.tensor): start offset
110
+ size (int): required size of position encoding
111
+
112
+ Returns:
113
+ torch.Tensor: Corresponding encoding
114
+ """
115
+ pos_emb = self.pe[
116
+ :,
117
+ self.pe.size(1) // 2 - size + 1: self.pe.size(1) // 2 + size,
118
+ ]
119
+ return pos_emb
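A minimal usage sketch of the class above (dropout set to 0 for a deterministic check); the returned pos_emb spans both positive and negative offsets, hence the 2*T - 1 length.

import torch
from cosyvoice2.transformer.embedding import EspnetRelPositionalEncoding

pos_enc = EspnetRelPositionalEncoding(d_model=512, dropout_rate=0.0)
x = torch.randn(1, 100, 512)
x_scaled, pos_emb = pos_enc(x)
assert x_scaled.shape == (1, 100, 512)           # input scaled by sqrt(d_model)
assert pos_emb.shape == (1, 2 * 100 - 1, 512)    # positive + negative relative positions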
cosyvoice2/transformer/encoder_layer.py ADDED
@@ -0,0 +1,163 @@
1
+ # Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
2
+ # 2022 Xingchen Song ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Modified from ESPnet(https://github.com/espnet/espnet)
16
+ """Encoder self-attention layer definition."""
17
+
18
+ from typing import Optional, Tuple
19
+
20
+ import torch
21
+ from torch import nn
22
+
23
+
24
+ class ConformerEncoderLayer(nn.Module):
25
+ """Encoder layer module.
26
+ Args:
27
+ size (int): Input dimension.
28
+ self_attn (torch.nn.Module): Self-attention module instance.
29
+ `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
30
+ instance can be used as the argument.
31
+ feed_forward (torch.nn.Module): Feed-forward module instance.
32
+ `PositionwiseFeedForward` instance can be used as the argument.
33
+ feed_forward_macaron (torch.nn.Module): Additional feed-forward module
34
+ instance.
35
+ `PositionwiseFeedForward` instance can be used as the argument.
36
+ conv_module (torch.nn.Module): Convolution module instance.
37
+ `ConvolutionModule` instance can be used as the argument.
38
+ dropout_rate (float): Dropout rate.
39
+ normalize_before (bool):
40
+ True: use layer_norm before each sub-block.
41
+ False: use layer_norm after each sub-block.
42
43
+ """
44
+
45
+ def __init__(
46
+ self,
47
+ size: int,
48
+ self_attn: torch.nn.Module,
49
+ feed_forward: Optional[nn.Module] = None,
50
+ feed_forward_macaron: Optional[nn.Module] = None,
51
+ conv_module: Optional[nn.Module] = None,
52
+ dropout_rate: float = 0.1,
53
+ normalize_before: bool = True,
54
+ ):
55
+ """Construct an EncoderLayer object."""
56
+ super().__init__()
57
+ self.self_attn = self_attn
58
+ self.feed_forward = feed_forward
59
+ self.feed_forward_macaron = feed_forward_macaron
60
+ self.conv_module = conv_module
61
+ self.norm_ff = nn.LayerNorm(size, eps=1e-12) # for the FNN module
62
+ self.norm_mha = nn.LayerNorm(size, eps=1e-12) # for the MHA module
63
+ if feed_forward_macaron is not None:
64
+ self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-12)
65
+ self.ff_scale = 0.5
66
+ else:
67
+ self.ff_scale = 1.0
68
+ if self.conv_module is not None:
69
+ self.norm_conv = nn.LayerNorm(size, eps=1e-12) # for the CNN module
70
+ self.norm_final = nn.LayerNorm(
71
+ size, eps=1e-12) # for the final output of the block
72
+ self.dropout = nn.Dropout(dropout_rate)
73
+ self.size = size
74
+ self.normalize_before = normalize_before
75
+
76
+ def forward(
77
+ self,
78
+ x: torch.Tensor,
79
+ mask: torch.Tensor,
80
+ pos_emb: torch.Tensor,
81
+ mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
82
+ att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
83
+ cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
84
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
85
+ """Compute encoded features.
86
+
87
+ Args:
88
+ x (torch.Tensor): (#batch, time, size)
89
+ mask (torch.Tensor): Mask tensor for the input (#batch, time, time),
90
+ (0, 0, 0) means fake mask.
91
+ pos_emb (torch.Tensor): positional encoding, must not be None
92
+ for ConformerEncoderLayer.
93
+ mask_pad (torch.Tensor): batch padding mask used for conv module.
94
+ (#batch, 1, time), (0, 0, 0) means fake mask.
95
+ att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
96
+ (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
97
+ cnn_cache (torch.Tensor): Convolution cache in conformer layer
98
+ (#batch=1, size, cache_t2)
99
+ Returns:
100
+ torch.Tensor: Output tensor (#batch, time, size).
101
+ torch.Tensor: Mask tensor (#batch, time, time).
102
+ torch.Tensor: att_cache tensor,
103
+ (#batch=1, head, cache_t1 + time, d_k * 2).
104
+ torch.Tensor: cnn_cache tensor (#batch, size, cache_t2).
105
+ """
106
+ return self._forward_impl(x, mask, pos_emb, mask_pad, att_cache, cnn_cache)
107
+
108
+ def _forward_impl(
109
+ self,
110
+ x: torch.Tensor,
111
+ mask: torch.Tensor,
112
+ pos_emb: torch.Tensor,
113
+ mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
114
+ att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
115
+ cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
116
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
117
+ """原始的前向传播实现"""
118
+ # whether to use macaron style
119
+ if self.feed_forward_macaron is not None:
120
+ residual = x
121
+ if self.normalize_before:
122
+ x = self.norm_ff_macaron(x)
123
+ x = residual + self.ff_scale * self.dropout(
124
+ self.feed_forward_macaron(x))
125
+ if not self.normalize_before:
126
+ x = self.norm_ff_macaron(x)
127
+
128
+ # multi-headed self-attention module
129
+ residual = x
130
+ if self.normalize_before:
131
+ x = self.norm_mha(x)
132
+ # att_cache: (b, head, cache_t, d_k*2)
133
+ x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb,
134
+ att_cache)
135
+ x = residual + self.dropout(x_att)
136
+ if not self.normalize_before:
137
+ x = self.norm_mha(x)
138
+
139
+ # convolution module
140
+ # Fake new cnn cache here, and then change it in conv_module
141
+ new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
142
+ if self.conv_module is not None:
143
+ residual = x
144
+ if self.normalize_before:
145
+ x = self.norm_conv(x)
146
+ x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache)
147
+ x = residual + self.dropout(x)
148
+
149
+ if not self.normalize_before:
150
+ x = self.norm_conv(x)
151
+
152
+ # feed forward module
153
+ residual = x
154
+ if self.normalize_before:
155
+ x = self.norm_ff(x)
156
+
157
+ x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
158
+ if not self.normalize_before:
159
+ x = self.norm_ff(x)
160
+
161
+ if self.conv_module is not None:
162
+ x = self.norm_final(x)
163
+ return x, mask, new_att_cache, new_cnn_cache
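A minimal sketch of driving one layer, assuming the wenet-style RelPositionMultiHeadedAttention constructor (heads, dim, dropout, key_bias), i.e. the same argument tuple the upsample encoder below passes via encoder_selfattn_layer_args.

import torch
from cosyvoice2.transformer.attention import RelPositionMultiHeadedAttention
from cosyvoice2.transformer.embedding import EspnetRelPositionalEncoding
from cosyvoice2.transformer.encoder_layer import ConformerEncoderLayer
from cosyvoice2.transformer.positionwise_feed_forward import PositionwiseFeedForward

size, heads, t = 512, 4, 50
layer = ConformerEncoderLayer(
    size,
    RelPositionMultiHeadedAttention(heads, size, 0.0, True),  # assumed (heads, dim, dropout, key_bias)
    PositionwiseFeedForward(size, 2048, 0.1),
)
x = torch.randn(1, t, size)
mask = torch.ones(1, 1, t, dtype=torch.bool)
_, pos_emb = EspnetRelPositionalEncoding(size, 0.0)(x)        # (1, 2*t - 1, size)
y, mask, att_cache, cnn_cache = layer(x, mask, pos_emb)
assert y.shape == (1, t, size)                                # conv_module is None, so cnn_cache stays empty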
cosyvoice2/transformer/positionwise_feed_forward.py ADDED
@@ -0,0 +1,56 @@
1
+ # Copyright (c) 2019 Shigeki Karita
2
+ # 2020 Mobvoi Inc (Binbin Zhang)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Positionwise feed forward layer definition."""
16
+
17
+ import torch
18
+
19
+
20
+ class PositionwiseFeedForward(torch.nn.Module):
21
+ """Positionwise feed forward layer.
22
+
23
+ FeedForward is applied on each position of the sequence.
24
+ The output dim is the same as the input dim.
25
+
26
+ Args:
27
+ idim (int): Input dimension.
28
+ hidden_units (int): The number of hidden units.
29
+ dropout_rate (float): Dropout rate.
30
+ activation (torch.nn.Module): Activation function
31
+ """
32
+
33
+ def __init__(
34
+ self,
35
+ idim: int,
36
+ hidden_units: int,
37
+ dropout_rate: float,
38
+ activation: torch.nn.Module = torch.nn.ReLU(),
39
+ ):
40
+ """Construct a PositionwiseFeedForward object."""
41
+ super(PositionwiseFeedForward, self).__init__()
42
+ self.w_1 = torch.nn.Linear(idim, hidden_units)
43
+ self.activation = activation
44
+ self.dropout = torch.nn.Dropout(dropout_rate)
45
+ self.w_2 = torch.nn.Linear(hidden_units, idim)
46
+
47
+ def forward(self, xs: torch.Tensor) -> torch.Tensor:
48
+ """Forward function.
49
+
50
+ Args:
51
+ xs: input tensor (B, L, D)
52
+ Returns:
53
+ output tensor, (B, L, D)
54
+ """
55
+ return self.w_2(self.dropout(self.activation(self.w_1(xs))))
56
+
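A quick sanity check for the layer above (hypothetical sizes): the hidden width expands to hidden_units and projects back, so the input shape is preserved.

import torch
from cosyvoice2.transformer.positionwise_feed_forward import PositionwiseFeedForward

ffn = PositionwiseFeedForward(idim=256, hidden_units=1024, dropout_rate=0.1)
xs = torch.randn(2, 50, 256)
assert ffn(xs).shape == (2, 50, 256)   # output dim matches input dim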
cosyvoice2/transformer/subsampling.py ADDED
@@ -0,0 +1,79 @@
1
+ # Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
2
+ # 2024 Alibaba Inc (Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Modified from ESPnet(https://github.com/espnet/espnet)
16
+ """Subsampling layer definition."""
17
+
18
+ from typing import Tuple, Union
19
+
20
+ import torch
21
+
22
+
23
+ class BaseSubsampling(torch.nn.Module):
24
+
25
+ def __init__(self):
26
+ super().__init__()
27
+ self.right_context = 0
28
+ self.subsampling_rate = 1
29
+
30
+ def position_encoding(self, offset: Union[int, torch.Tensor],
31
+ size: int) -> torch.Tensor:
32
+ return self.pos_enc.position_encoding(offset, size)
33
+
34
+
35
+ class LinearNoSubsampling(BaseSubsampling):
36
+ """Linear transform the input without subsampling
37
+
38
+ Args:
39
+ idim (int): Input dimension.
40
+ odim (int): Output dimension.
41
+ dropout_rate (float): Dropout rate.
42
+
43
+ """
44
+
45
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
46
+ pos_enc_class: torch.nn.Module):
47
+ """Construct an linear object."""
48
+ super().__init__()
49
+ self.out = torch.nn.Sequential(
50
+ torch.nn.Linear(idim, odim),
51
+ torch.nn.LayerNorm(odim, eps=1e-5),
52
+ torch.nn.Dropout(dropout_rate),
53
+ )
54
+ self.pos_enc = pos_enc_class
55
+ self.right_context = 0
56
+ self.subsampling_rate = 1
57
+
58
+ def forward(
59
+ self,
60
+ x: torch.Tensor,
61
+ x_mask: torch.Tensor,
62
+ offset: Union[int, torch.Tensor] = 0
63
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
64
+ """Input x.
65
+
66
+ Args:
67
+ x (torch.Tensor): Input tensor (#batch, time, idim).
68
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
69
+
70
+ Returns:
71
+ torch.Tensor: linear input tensor (#batch, time', odim),
72
+ where time' = time .
73
+ torch.Tensor: linear input mask (#batch, 1, time'),
74
+ where time' = time .
75
+
76
+ """
77
+ x = self.out(x)
78
+ x, pos_emb = self.pos_enc(x, offset)
79
+ return x, pos_emb, x_mask
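A short usage sketch of LinearNoSubsampling with the relative positional encoding defined earlier (sizes are hypothetical); the time axis is unchanged, only the feature dimension is projected.

import torch
from cosyvoice2.transformer.embedding import EspnetRelPositionalEncoding
from cosyvoice2.transformer.subsampling import LinearNoSubsampling

embed = LinearNoSubsampling(idim=80, odim=256, dropout_rate=0.1,
                            pos_enc_class=EspnetRelPositionalEncoding(256, 0.1))
x = torch.randn(2, 120, 80)
mask = torch.ones(2, 1, 120, dtype=torch.bool)
y, pos_emb, mask = embed(x, mask)
# y: (2, 120, 256); pos_emb: (1, 239, 256); mask is returned unchanged (no subsampling)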
cosyvoice2/transformer/upsample_encoder_v2.py ADDED
@@ -0,0 +1,483 @@
1
+ # Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
2
+ # 2022 Xingchen Song ([email protected])
3
+ # 2024 Alibaba Inc (Xiang Lyu)
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # Modified from ESPnet(https://github.com/espnet/espnet)
17
+ """Encoder definition."""
18
+ from typing import Tuple, List
19
+
20
+ import torch
21
+ from torch import nn
22
+ from torch.nn import functional as F
23
+
24
+ from cosyvoice2.transformer.encoder_layer import ConformerEncoderLayer
25
+ from cosyvoice2.transformer.positionwise_feed_forward import PositionwiseFeedForward
26
+ from cosyvoice2.utils.class_utils import (
27
+ COSYVOICE_EMB_CLASSES,
28
+ COSYVOICE_SUBSAMPLE_CLASSES,
29
+ COSYVOICE_ATTENTION_CLASSES,
30
+ COSYVOICE_ACTIVATION_CLASSES,
31
+ )
32
+ from cosyvoice2.utils.mask import (
33
+ make_pad_mask,
34
+ )
35
+
36
+ import torch._dynamo
37
+ torch._dynamo.config.suppress_errors = True
38
+ torch._dynamo.config.cache_size_limit = 128
39
+
40
+ class Upsample1D(nn.Module):
41
+ """A 1D upsampling layer with an optional convolution.
42
+
43
+ Parameters:
44
+ channels (`int`):
+ number of channels in the inputs.
+ out_channels (`int`):
+ number of channels in the outputs.
+ stride (`int`, default `2`):
+ upsampling factor applied to the time axis.
+ scale_factor (`float`, optional):
+ interpolation scale factor. Defaults to `stride`.
52
+ """
53
+
54
+ def __init__(self, channels: int, out_channels: int, stride: int = 2, scale_factor: float = None):
55
+ super().__init__()
56
+ self.channels = channels
57
+ self.out_channels = out_channels
58
+ self.stride = stride
59
+ # In this mode, first repeat-interpolate, then conv with stride=1
60
+ self.conv = nn.Conv1d(self.channels, self.out_channels, stride * 2 + 1, stride=1, padding=0)
61
+ self.scale_factor = float(self.stride) if scale_factor is None else float(scale_factor)
62
+
63
+ def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor):
64
+ outputs = F.interpolate(inputs, scale_factor=self.scale_factor, mode="nearest")
65
+ outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0)
66
+ outputs = self.conv(outputs)
67
+ return outputs, input_lengths * self.stride
68
+
69
+ def forward_chunk(self, inputs: torch.Tensor, input_lengths: torch.Tensor, cache: torch.Tensor = torch.zeros((0, 0, 0))):
70
+ """
71
+ Args:
72
+ inputs(torch.Tensor): shape (b, c, t)
73
+ input_length(torch.Tensor): shape (b), can be None
74
+ cache(torch.Tensor): shape (b, c, cache_t), where cache_t = stride * 2
75
+ """
76
+ outputs = F.interpolate(inputs, scale_factor=self.scale_factor, mode="nearest")
77
+
78
+ if cache is None:
79
+ cache = inputs.new_zeros(inputs.shape[0], inputs.shape[1], self.stride * 2)
80
+ outputs = torch.cat([cache, outputs], dim=2)
81
+ new_cache = outputs[..., -self.stride*2:]
82
+ outputs = self.conv(outputs)
83
+
84
+ if input_lengths is not None:
85
+ input_lengths = input_lengths * self.stride
86
+ return outputs, input_lengths, new_cache
87
+
88
+
89
+ class PreLookaheadLayer(nn.Module):
90
+ def __init__(self, channels: int, pre_lookahead_len: int = 1):
91
+ super().__init__()
92
+ self.channels = channels
93
+ self.pre_lookahead_len = pre_lookahead_len
94
+ self.conv1 = nn.Conv1d(
95
+ channels, channels,
96
+ kernel_size=pre_lookahead_len + 1,
97
+ stride=1, padding=0,
98
+ )
99
+ self.conv2 = nn.Conv1d(
100
+ channels, channels,
101
+ kernel_size=3, stride=1, padding=0,
102
+ )
103
+
104
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
105
+ """
106
+ inputs: (batch_size, seq_len, channels)
107
+ """
108
+ outputs = inputs.transpose(1, 2).contiguous()
109
+ # look ahead
110
+ outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0.0)
111
+ outputs = F.leaky_relu(self.conv1(outputs))
112
+ # outputs
113
+ outputs = F.pad(outputs, (2, 0), mode='constant', value=0.0)
114
+ outputs = self.conv2(outputs)
115
+ outputs = outputs.transpose(1, 2).contiguous()
116
+
117
+ # residual connection
118
+ outputs = outputs + inputs
119
+ return outputs
120
+
121
+ def forward_chunk(self, inputs: torch.Tensor, cache: torch.Tensor = None):
122
+ """
123
+ Args:
124
+ inputs(torch.Tensor): shape (b, t, c)
125
+ cache(torch.Tensor): shape (b, c, cache_t=2), c = channels
126
+ """
127
+ outputs = inputs.transpose(1, 2).contiguous()
128
+ outputs = F.leaky_relu(self.conv1(outputs))
129
+ # the length of outputs is input length - pre_lookahead_len
130
+ if cache is None:
131
+ cache = outputs.new_zeros(outputs.shape[0], outputs.shape[1], 2)
132
+ # NOTE
133
+ new_cache = outputs[..., -2:]
134
+ outputs = torch.cat([cache, outputs], dim=2)
135
+ outputs = self.conv2(outputs)
136
+ outputs = outputs.transpose(1, 2).contiguous()
137
+ # residual connection
138
+ outputs = outputs + inputs[:, :-self.pre_lookahead_len]
139
+ return outputs, new_cache
140
+
141
+
142
+ """Customize each sample's chunk attention mask
143
+ """
144
+ class UpsampleConformerEncoderV2(torch.nn.Module):
145
+
146
+ def __init__(
147
+ self,
148
+ # input & output
149
+ input_size: int,
150
+ output_size: int = 256,
151
+ input_layer: str = "linear",
152
+ pre_lookahead_len: int = 3,
153
+ # size
154
+ num_blocks: int = 6,
155
+ num_up_blocks: int = 4,
156
+ # upsampling
157
+ up_stride: int = 2,
158
+ up_scale_factor: float = 2,
159
+ # attention
160
+ attention_heads: int = 4,
161
+ pos_enc_layer_type: str = "rel_pos_espnet",
162
+ selfattention_layer_type: str = "rel_selfattn",
163
+ key_bias: bool = True,
164
+ # mlp
165
+ linear_units: int = 2048,
166
+ # dropouts
167
+ dropout_rate: float = 0.1,
168
+ positional_dropout_rate: float = 0.1,
169
+ attention_dropout_rate: float = 0.0,
170
+ # other
171
+ normalize_before: bool = True,
172
+ activation_type: str = "swish",
173
+ **kwargs,
174
+ ):
175
+ super().__init__()
176
+ self._output_size = output_size
177
+ self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
178
+ input_size,
179
+ output_size,
180
+ dropout_rate,
181
+ COSYVOICE_EMB_CLASSES[pos_enc_layer_type](
182
+ output_size,
183
+ positional_dropout_rate
184
+ ),
185
+ )
186
+
187
+ self.normalize_before = normalize_before
188
+ self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
189
+ activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()
190
+ # self-attention module definition
191
+ encoder_selfattn_layer_args = (
192
+ attention_heads,
193
+ output_size,
194
+ attention_dropout_rate,
195
+ key_bias,
196
+ )
197
+ # feed-forward module definition
198
+ positionwise_layer_args = (
199
+ output_size,
200
+ linear_units,
201
+ dropout_rate,
202
+ activation,
203
+ )
204
+ self.pre_lookahead_layer = PreLookaheadLayer(
205
+ channels=output_size,
206
+ pre_lookahead_len=pre_lookahead_len
207
+ )
208
+ self.encoders = torch.nn.ModuleList([
209
+ ConformerEncoderLayer(
210
+ output_size,
211
+ COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
212
+ *encoder_selfattn_layer_args
213
+ ),
214
+ PositionwiseFeedForward(*positionwise_layer_args),
215
+ None,
216
+ None,
217
+ dropout_rate,
218
+ normalize_before,
219
+ ) for _ in range(num_blocks)
220
+ ])
221
+ self.up_layer = Upsample1D(
222
+ channels=output_size,
223
+ out_channels=output_size,
224
+ stride=up_stride,
225
+ scale_factor=up_scale_factor
226
+ )
227
+ self.up_embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
228
+ input_size,
229
+ output_size,
230
+ dropout_rate,
231
+ COSYVOICE_EMB_CLASSES[pos_enc_layer_type](
232
+ output_size,
233
+ positional_dropout_rate
234
+ ),
235
+ )
236
+ self.up_encoders = torch.nn.ModuleList([
237
+ ConformerEncoderLayer(
238
+ output_size,
239
+ COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](
240
+ *encoder_selfattn_layer_args
241
+ ),
242
+ PositionwiseFeedForward(*positionwise_layer_args),
243
+ None,
244
+ None,
245
+ dropout_rate,
246
+ normalize_before,
247
+ ) for _ in range(num_up_blocks)
248
+ ])
249
+
250
+ self.enable_cuda_graph = False
251
+ self.use_cuda_graph = False
252
+ self.graph_encoder = {}
253
+ self.graph_up_encoder = {}
254
+ self.inference_buffers_encoder = {}
255
+ self.inference_buffers_up_encoder = {}
256
+ self.max_static_time = 1500
257
+
258
+ # FIXME(sfy) revert hard-coded bfloat16
259
+ # this method is skipped in CausalMaskedDiffWithXvec.scatter_cuda_graph
260
+ def scatter_cuda_graph(self, enable_cuda_graph: bool):
261
+ self.enable_cuda_graph = enable_cuda_graph
262
+ if self.enable_cuda_graph:
263
+ self._init_cuda_graph()
264
+
265
+ def _init_cuda_graph(self):
266
+ """初始化 CUDA Graph"""
267
+
268
+ for l in range(100, 1500, 10):
269
+ static_x = torch.zeros((1, l, 512),
270
+ dtype=torch.float32, device=torch.device('cuda'))
271
+ static_mask = torch.ones((1, 1, l),
272
+ dtype=torch.bool, device=torch.device('cuda'))
273
+ static_pos_emb = torch.zeros((1, 2*l-1, 512),
274
+ dtype=torch.float32, device=torch.device('cuda'))
275
+
276
+ static_inputs = [
277
+ static_x,
278
+ static_mask,
279
+ static_pos_emb,
280
+ ]
281
+
282
+ self._forward_impl_encoder(
283
+ static_inputs[0],
284
+ static_inputs[1],
285
+ static_inputs[2],
286
+ )
287
+ graph = torch.cuda.CUDAGraph()
288
+ with torch.no_grad():
289
+ with torch.cuda.graph(graph):
290
+ static_out_x = self._forward_impl_encoder(
291
+ static_inputs[0],
292
+ static_inputs[1],
293
+ static_inputs[2]
294
+ )
295
+ self.graph_encoder[l] = graph
296
+ static_outputs = [
297
+ static_out_x,
298
+ ]
299
+ self.inference_buffers_encoder[l] = {
300
+ 'static_inputs': static_inputs,
301
+ 'static_outputs': static_outputs
302
+ }
303
+
304
+ for l in range(100, 1500, 10):
305
+ static_x = torch.zeros((1, l, 512),
306
+ dtype=torch.float32, device=torch.device('cuda'))
307
+ static_mask = torch.ones((1, 1, l),
308
+ dtype=torch.bool, device=torch.device('cuda'))
309
+ static_pos_emb = torch.zeros((1, 2*l-1, 512),
310
+ dtype=torch.float32, device=torch.device('cuda'))
311
+
312
+ static_inputs = [
313
+ static_x,
314
+ static_mask,
315
+ static_pos_emb,
316
+ ]
317
+
318
+ self._forward_impl_up_encoder(
319
+ static_inputs[0],
320
+ static_inputs[1],
321
+ static_inputs[2],
322
+ )
323
+ graph = torch.cuda.CUDAGraph()
324
+ with torch.no_grad():
325
+ with torch.cuda.graph(graph):
326
+ static_out_x = self._forward_impl_up_encoder(
327
+ static_inputs[0],
328
+ static_inputs[1],
329
+ static_inputs[2]
330
+ )
331
+ self.graph_up_encoder[l] = graph
332
+ static_outputs = [
333
+ static_out_x,
334
+ ]
335
+ self.inference_buffers_up_encoder[l] = {
336
+ 'static_inputs': static_inputs,
337
+ 'static_outputs': static_outputs
338
+ }
339
+
340
+ self.use_cuda_graph = True
341
+ print("CUDA Graph initialized successfully for encoder and up_encoder")
342
+
343
+ # @torch.compile(dynamic=True,backend="eager")
344
+ def _forward_impl_encoder(self,
345
+ x: torch.Tensor,
346
+ mask: torch.Tensor,
347
+ pos_emb: torch.Tensor):
348
+ for layer in self.encoders:
349
+ x, _, _, _ = layer(x, mask, pos_emb)
350
+ return x
351
+
352
+ # @torch.compile(dynamic=True,backend="eager")
353
+ def _forward_impl_up_encoder(self,
354
+ x: torch.Tensor,
355
+ mask: torch.Tensor,
356
+ pos_emb: torch.Tensor):
357
+ for layer in self.up_encoders:
358
+ x, _, _, _ = layer(x, mask, pos_emb)
359
+ return x
360
+
361
+ def output_size(self) -> int:
362
+ return self._output_size
363
+
364
+ # @torch.compile(dynamic=True,backend="eager")
365
+ def forward(
366
+ self,
367
+ xs: torch.Tensor,
368
+ xs_lens: torch.Tensor,
369
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
370
+ # (sfy) chunk training strategy should not be open-sourced
371
+ T = xs.size(1)
372
+ masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T)
373
+ xs, pos_emb, masks = self.embed(xs, masks)
374
+
375
+ # lookahead
376
+ xs = self.pre_lookahead_layer(xs)
377
+ # conformer block
378
+ if self.enable_cuda_graph and xs.shape[1] in self.graph_encoder:
379
+ self.inference_buffers_encoder[xs.shape[1]]['static_inputs'][0].copy_(xs)
380
+ self.inference_buffers_encoder[xs.shape[1]]['static_inputs'][1].copy_(masks)
381
+ self.inference_buffers_encoder[xs.shape[1]]['static_inputs'][2].copy_(pos_emb)
382
+ self.graph_encoder[xs.shape[1]].replay()
383
+ xs = self.inference_buffers_encoder[xs.shape[1]]['static_outputs'][0]
384
+ else:
385
+ xs = self._forward_impl_encoder(xs, masks, pos_emb)
386
+ # upsample
387
+ xs = xs.transpose(1, 2).contiguous()
388
+ xs, xs_lens = self.up_layer(xs, xs_lens)
389
+ xs = xs.transpose(1, 2).contiguous()
390
+
391
+ # 2nd conformer block
392
+ T = xs.size(1)
393
+ masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T)
394
+ xs, pos_emb, masks = self.up_embed(xs, masks)
395
+ if self.enable_cuda_graph and xs.shape[1] in self.graph_up_encoder:
396
+ self.inference_buffers_up_encoder[xs.shape[1]]['static_inputs'][0].copy_(xs)
397
+ self.inference_buffers_up_encoder[xs.shape[1]]['static_inputs'][1].copy_(masks)
398
+ self.inference_buffers_up_encoder[xs.shape[1]]['static_inputs'][2].copy_(pos_emb)
399
+ self.graph_up_encoder[xs.shape[1]].replay()
400
+ xs = self.inference_buffers_up_encoder[xs.shape[1]]['static_outputs'][0]
401
+ else:
402
+ xs = self._forward_impl_up_encoder(xs, masks, pos_emb)
403
+ # post norm
404
+ if self.normalize_before:
405
+ xs = self.after_norm(xs)
406
+ return xs, masks
407
+
408
+ @torch.compile(dynamic=True, backend="eager")
409
+ def forward_chunk(self,
410
+ xs: torch.Tensor,
411
+ last_chunk: bool = False,
412
+ cnn_cache: torch.Tensor = None,
413
+ att_cache: torch.Tensor = None,
414
+ ):
415
+ """
416
+ Args:
417
+ xs: shape (b, dt, c)
418
+ last_chunk: bool. If last chunk, will pad input with lookaheads
419
+ att_cache: shape (depth1+depth2, b, nh, 2*t1, c).
420
+ cnn_cache: shape (b, c, t1+t2). Where t1=2 (pre_lookahead_layer), t2=4 (up_layer)
421
+ """
422
+ if att_cache is not None:
423
+ assert att_cache.shape[3] % 2 == 0, att_cache.shape
424
+ if cnn_cache is not None:
425
+ assert cnn_cache.shape[2] == 2+self.up_layer.stride*2, cnn_cache.shape
426
+
427
+ # unpack caches
428
+ offset1 = att_cache.shape[3] // 2 if att_cache is not None else 0
429
+ att_cache1 = att_cache[:len(self.encoders), :, :, :offset1] if att_cache is not None else [None] * len(self.encoders)
430
+ att_cache2 = att_cache[len(self.encoders):] if att_cache is not None else [None] * len(self.encoders)
431
+ cnn_cache1 = cnn_cache[:, :, :2] if cnn_cache is not None else None
432
+ cnn_cache2 = cnn_cache[:, :, 2:] if cnn_cache is not None else None
433
+ xs, _, _ = self.embed(xs, None)
434
+ if last_chunk:
435
+ xs = F.pad(xs, (0, 0, 0, self.pre_lookahead_layer.pre_lookahead_len))
436
+
437
+ # this_cnn_cache: shape (b=1, c=512, t=2)
438
+ xs, new_cnn_cache1 = self.pre_lookahead_layer.forward_chunk(xs, cache=cnn_cache1)
439
+
440
+ # remake pos_emb, offset param is ignored by position_encoding
441
+ pos_emb = self.embed.position_encoding(offset=None, size=offset1 + xs.shape[1])
442
+
443
+ # first conformer
444
+ chunk_masks = torch.zeros((0, 0, 0))
445
+ new_att_cache1 = []
446
+
447
+ for idx, layer in enumerate(self.encoders):
448
+ # this_att_cache: shape (b, nh, t, c * 2)
449
+ xs, _, this_new_att_cache1, _ = layer(xs, chunk_masks, pos_emb, att_cache=att_cache1[idx])
450
+ new_att_cache1.append(this_new_att_cache1)
451
+ new_att_cache1 = torch.stack(new_att_cache1, dim=0)
452
+
453
+ # upsample + conformer encoder, xs: (b, t, c) -> (b, c, t)
454
+ xs = xs.transpose(1, 2).contiguous()
455
+ # this_cnn_cache: shape (b=1, c=512, t=2*2)
456
+ xs, _, new_cnn_cache2 = self.up_layer.forward_chunk(xs, None, cache=cnn_cache2)
457
+ xs = xs.transpose(1, 2).contiguous()
458
+
459
+ # at this time, xs are doubled in length
460
+ xs, _, _ = self.up_embed(xs, None)
461
+
462
+ # remake pos_emb
463
+ pos_emb = self.embed.position_encoding(offset=None, size=offset1 * self.up_layer.stride + xs.shape[1])
464
+
465
+ # second conformer
466
+ chunk_masks = torch.zeros((0, 0, 0), dtype=torch.bfloat16)
467
+ new_att_cache2 = []
468
+
469
+ for idx, layer in enumerate(self.up_encoders):
470
+ xs, _, this_new_att_cache2, _ = layer(xs, chunk_masks, pos_emb, att_cache=att_cache2[idx])
471
+ new_att_cache2.append(this_new_att_cache2)
472
+ new_att_cache2 = torch.stack(new_att_cache2, dim=0)
473
+
474
+ if self.normalize_before:
475
+ xs = self.after_norm(xs)
476
+
477
+ # pack new cache
478
+ new_att_cache = torch.cat([new_att_cache1.repeat(1, 1, 1, 2, 1), new_att_cache2], dim=0)
479
+ new_cnn_cache = torch.cat([new_cnn_cache1, new_cnn_cache2], dim=2)
480
+
481
+ return xs, new_cnn_cache, new_att_cache
482
+
483
+
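A non-streaming usage sketch of the encoder above, under the assumption that input_size equals output_size (the up_embed projection reuses input_size, and the CUDA-graph buffers assume a width of 512): the up_layer doubles the time axis, so 100 input frames yield 200 output frames.

import torch
from cosyvoice2.transformer.upsample_encoder_v2 import UpsampleConformerEncoderV2

encoder = UpsampleConformerEncoderV2(input_size=512, output_size=512).eval()
xs = torch.randn(1, 100, 512)
xs_lens = torch.tensor([100])
with torch.no_grad():
    ys, masks = encoder(xs, xs_lens)
assert ys.shape == (1, 200, 512)      # time axis upsampled by up_stride=2
assert masks.shape == (1, 1, 200)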
cosyvoice2/utils/class_utils.py ADDED
@@ -0,0 +1,41 @@
1
+ # Copyright [2023-11-28] <[email protected], Xingchen Song>
2
+ # 2024 Alibaba Inc (authors: Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import torch
16
+
17
+ from cosyvoice2.transformer.subsampling import LinearNoSubsampling
18
+ from cosyvoice2.transformer.attention import RelPositionMultiHeadedAttention
19
+ from cosyvoice2.transformer.embedding import EspnetRelPositionalEncoding
20
+
21
+
22
+ COSYVOICE_ACTIVATION_CLASSES = {
23
+ "hardtanh": torch.nn.Hardtanh,
24
+ "tanh": torch.nn.Tanh,
25
+ "relu": torch.nn.ReLU,
26
+ "selu": torch.nn.SELU,
27
+ "swish": torch.nn.SiLU,
28
+ "gelu": torch.nn.GELU,
29
+ }
30
+
31
+ COSYVOICE_SUBSAMPLE_CLASSES = {
32
+ "linear": LinearNoSubsampling,
33
+ }
34
+
35
+ COSYVOICE_EMB_CLASSES = {
36
+ "rel_pos_espnet": EspnetRelPositionalEncoding,
37
+ }
38
+
39
+ COSYVOICE_ATTENTION_CLASSES = {
40
+ "rel_selfattn": RelPositionMultiHeadedAttention,
41
+ }
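These registries are how the upsample encoder resolves its components from config strings; a brief sketch follows (the attention arguments mirror the encoder_selfattn_layer_args tuple used above, which is an assumption about that constructor's signature).

from cosyvoice2.utils.class_utils import (
    COSYVOICE_ACTIVATION_CLASSES,
    COSYVOICE_ATTENTION_CLASSES,
    COSYVOICE_EMB_CLASSES,
    COSYVOICE_SUBSAMPLE_CLASSES,
)

act = COSYVOICE_ACTIVATION_CLASSES["swish"]()                    # torch.nn.SiLU
pos_enc = COSYVOICE_EMB_CLASSES["rel_pos_espnet"](512, 0.1)      # EspnetRelPositionalEncoding
attn = COSYVOICE_ATTENTION_CLASSES["rel_selfattn"](4, 512, 0.0, True)
embed = COSYVOICE_SUBSAMPLE_CLASSES["linear"](80, 512, 0.1, pos_enc)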
cosyvoice2/utils/common.py ADDED
@@ -0,0 +1,101 @@
1
+ # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
2
+ # 2024 Alibaba Inc (authors: Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Modified from ESPnet(https://github.com/espnet/espnet)
16
+ """Unility functions for Transformer."""
17
+
18
+ import random
19
+ from typing import List
20
+
21
+ import numpy as np
22
+ import torch
23
+
24
+ IGNORE_ID = -1
25
+
26
+
27
+ def pad_list(xs: List[torch.Tensor], pad_value: int):
28
+ """Perform padding for the list of tensors.
29
+
30
+ Args:
31
+ xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
32
+ pad_value (float): Value for padding.
33
+
34
+ Returns:
35
+ Tensor: Padded tensor (B, Tmax, `*`).
36
+
37
+ Examples:
38
+ >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)]
39
+ >>> x
40
+ [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
41
+ >>> pad_list(x, 0)
42
+ tensor([[1., 1., 1., 1.],
43
+ [1., 1., 0., 0.],
44
+ [1., 0., 0., 0.]])
45
+
46
+ """
47
+ max_len = max([len(item) for item in xs])
48
+ batchs = len(xs)
49
+ ndim = xs[0].ndim
50
+ if ndim == 1:
51
+ pad_res = torch.zeros(batchs,
52
+ max_len,
53
+ dtype=xs[0].dtype,
54
+ device=xs[0].device)
55
+ elif ndim == 2:
56
+ pad_res = torch.zeros(batchs,
57
+ max_len,
58
+ xs[0].shape[1],
59
+ dtype=xs[0].dtype,
60
+ device=xs[0].device)
61
+ elif ndim == 3:
62
+ pad_res = torch.zeros(batchs,
63
+ max_len,
64
+ xs[0].shape[1],
65
+ xs[0].shape[2],
66
+ dtype=xs[0].dtype,
67
+ device=xs[0].device)
68
+ else:
69
+ raise ValueError(f"Unsupported ndim: {ndim}")
70
+ pad_res.fill_(pad_value)
71
+ for i in range(batchs):
72
+ pad_res[i, :len(xs[i])] = xs[i]
73
+ return pad_res
74
+
75
+
76
+ def get_padding(kernel_size, dilation=1):
77
+ return int((kernel_size * dilation - dilation) / 2)
78
+
79
+
80
+ def init_weights(m, mean=0.0, std=0.01):
81
+ classname = m.__class__.__name__
82
+ if classname.find("Conv") != -1:
83
+ m.weight.data.normal_(mean, std)
84
+
85
+
86
+ def fade_in_out(fade_in_mel, fade_out_mel, window):
87
+ device = fade_in_mel.device
88
+ fade_in_mel, fade_out_mel = fade_in_mel.cpu(), fade_out_mel.cpu()
89
+ mel_overlap_len = int(window.shape[0] / 2)
90
+ if fade_in_mel.device == torch.device('cpu'):
91
+ fade_in_mel = fade_in_mel.clone()
92
+ fade_in_mel[..., :mel_overlap_len] = fade_in_mel[..., :mel_overlap_len] * window[:mel_overlap_len] + \
93
+ fade_out_mel[..., -mel_overlap_len:] * window[mel_overlap_len:]
94
+ return fade_in_mel.to(device)
95
+
96
+
97
+ def set_all_random_seed(seed):
98
+ random.seed(seed)
99
+ np.random.seed(seed)
100
+ torch.manual_seed(seed)
101
+ torch.cuda.manual_seed_all(seed)
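A small sketch of fade_in_out for cross-fading two mel chunks at a seam; the Hamming window here is only an illustrative choice, any symmetric window of length 2 * overlap works.

import torch
from cosyvoice2.utils.common import fade_in_out

overlap = 10                                        # mel frames to cross-fade (hypothetical)
window = torch.hamming_window(2 * overlap)
mel_prev = torch.randn(1, 80, 50)                   # tail of the previous chunk
mel_next = torch.randn(1, 80, 50)                   # head of the next chunk
mel_next = fade_in_out(mel_next, mel_prev, window)  # first `overlap` frames are blended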
cosyvoice2/utils/mask.py ADDED
@@ -0,0 +1,49 @@
1
+ # Copyright (c) 2019 Shigeki Karita
2
+ # 2020 Mobvoi Inc (Binbin Zhang)
3
+ # 2024 Alibaba Inc (authors: Xiang Lyu)
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import math
18
+ import torch
19
+ from typing import List
20
+
21
+
22
+ def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
23
+ """Make mask tensor containing indices of padded part.
24
+
25
+ See description of make_non_pad_mask.
26
+
27
+ Args:
28
+ lengths (torch.Tensor): Batch of lengths (B,).
29
+ Returns:
30
+ torch.Tensor: Mask tensor containing indices of padded part.
31
+
32
+ Examples:
33
+ >>> lengths = [5, 3, 2]
34
+ >>> make_pad_mask(lengths)
35
+ masks = [[0, 0, 0, 0 ,0],
36
+ [0, 0, 0, 1, 1],
37
+ [0, 0, 1, 1, 1]]
38
+ """
39
+ batch_size = lengths.size(0)
40
+ max_len = max_len if max_len > 0 else lengths.max().item()
41
+ seq_range = torch.arange(0,
42
+ max_len,
43
+ dtype=torch.int64,
44
+ device=lengths.device)
45
+ seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
46
+ seq_length_expand = lengths.unsqueeze(-1)
47
+ mask = seq_range_expand >= seq_length_expand
48
+ return mask
49
+
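A short usage sketch of make_pad_mask, including the inversion into the (B, 1, T) attention mask built in the encoder's forward.

import torch
from cosyvoice2.utils.mask import make_pad_mask

lengths = torch.tensor([5, 3, 2])
pad_mask = make_pad_mask(lengths)          # True marks padded positions, shape (3, 5)
att_mask = ~pad_mask.unsqueeze(1)          # (B, 1, T), as used by the upsample encoder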
flashcosyvoice/__init__.py ADDED
File without changes
flashcosyvoice/cli.py ADDED
@@ -0,0 +1,424 @@
1
+ # Copyright (c) 2025 Tsinghua Univ. (authors: Xingchen Song)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """ Example Usage: see README.md
15
+ """
16
+
17
+ import argparse
18
+ import json
19
+ import os
20
+ import random
21
+ import sys
22
+ import time
23
+ from concurrent.futures import ThreadPoolExecutor
24
+ from datetime import datetime
25
+
26
+ import numpy as np
27
+ import onnxruntime
28
+ import s3tokenizer
29
+ import torch
30
+ import torch.distributed as dist
31
+ import torchaudio
32
+ import torchaudio.compliance.kaldi as kaldi
33
+ from torch.utils.data import DataLoader, Dataset, DistributedSampler
34
+ from tqdm import tqdm
35
+
36
+ from flashcosyvoice.config import Config, CosyVoice2LLMConfig, SamplingParams
37
+ from flashcosyvoice.cosyvoice2 import CosyVoice2
38
+ from flashcosyvoice.utils.audio import mel_spectrogram
39
+
40
+
41
+ def set_all_random_seed(seed):
42
+ random.seed(seed)
43
+ np.random.seed(seed)
44
+ torch.manual_seed(seed)
45
+ torch.cuda.manual_seed_all(seed)
46
+
47
+
48
+ def save_file_async(
49
+ wav, prompt_speech_tokens, generated_speech_tokens,
50
+ info, timing_stats
51
+ ):
52
+ """Save audio asynchronously."""
53
+ try:
54
+ os.makedirs(os.path.dirname(info['wav']), exist_ok=True)
55
+ if wav is not None:
56
+ wav = wav.cpu()
57
+ torchaudio.save(info['wav'], wav, 24000)
58
+ duration = wav.shape[-1] / 24000.0
59
+ rtf = ((timing_stats['dataloader_time'] + timing_stats['model_inference_time']) / timing_stats['batch_size']) / duration
60
+ timing_stats['rtf'] = rtf
61
+ else:
62
+ duration = 0.0
63
+ info['timing_stats'] = timing_stats
64
+ info['prompt_speech_tokens'] = prompt_speech_tokens
65
+ info['generated_speech_tokens'] = generated_speech_tokens
66
+ with open(f"{info['wav'].replace('.wav', '.json')}", "w") as f:
67
+ json.dump(info, f, ensure_ascii=False, indent=4)
68
+ return duration
69
+ except Exception as e:
70
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
71
+ tqdm.write(f"[{timestamp}] - [ERROR] - Error saving audio {info.get('key', 'unknown')}: {e}")
72
+ return 0.0
73
+
74
+
75
+ class AudioDataset(Dataset):
76
+
77
+ def __init__(self, text_norm, text_tokenizer, data_list, model_config: Config):
78
+ self.datas = []
79
+ self.text_norm = text_norm
80
+ self.model_config = model_config
81
+
82
+ """Example data_list:
83
+ ```
84
+ {"key": "uttid_1", "prompt_text": "你好,我是小明。", "text": "你好,我是小红。", "prompt_wav": "/mnt/data/audio/00000000.wav", "wav": "/mnt/data/audio_synthetic/uttid_1.wav"}
85
+ {"key": "uttid_2", "prompt_text": "你好,我是小红。", "text": "你好,我是小明。", "prompt_wav": "/mnt/data/audio/00000001.wav", "wav": "/mnt/data/audio_synthetic/uttid_2.wav"}
86
+ ```
87
+ Note:
88
+ - `key` is the key of this sample.
89
+ - `prompt_text` is the text used for prompt.
90
+ - `text` is the text used for generating real audio.
91
+ - `prompt_wav` is the audio used for prompt.
92
+ - `wav` is the path to the generated audio to be saved (we highly recommend to pre-define the save path before running the script).
93
+ """
94
+ missing = 0
95
+ with open(data_list, 'r', encoding='utf-8') as f:
96
+ lines = f.readlines()
97
+ total_lines = len(lines)
98
+ if torch.distributed.get_node_local_rank() == 0:
99
+ iterator = tqdm(lines, desc='Loading data')
100
+ else:
101
+ iterator = lines
102
+ for line in iterator:
103
+ data = json.loads(line.strip())
104
+ valid = True
105
+ for k in ['key', 'prompt_text', 'text', 'prompt_wav']:
106
+ if k not in data:
107
+ valid = False
108
+ break
109
+ if data[k] is None:
110
+ valid = False
111
+ break
112
+ if not os.path.exists(data['prompt_wav']):
113
+ valid = False
114
+ if valid:
115
+ self.datas.append(data)
116
+ else:
117
+ missing += 1
118
+ if torch.distributed.get_node_local_rank() == 0:
119
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
120
+ tqdm.write(f'[{timestamp}] - [INFO] - Loaded {total_lines} lines, found {missing} missing lines, total valid lines == {len(self.datas)}.')
121
+
122
+ self.text_tokenizer = text_tokenizer
123
+
124
+ option = onnxruntime.SessionOptions()
125
+ option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
126
+ option.intra_op_num_threads = 1
127
+ self.spk_model = onnxruntime.InferenceSession(f"{self.model_config.model}/campplus.onnx", sess_options=option,
128
+ providers=["CPUExecutionProvider"])
129
+
130
+ def __len__(self):
131
+ return len(self.datas)
132
+
133
+ def __getitem__(self, idx):
134
+ data = self.datas[idx]
135
+
136
+ try:
137
+ # 1. feature for s3tokenizer
138
+ audio = s3tokenizer.load_audio(data['prompt_wav'], sr=16000) # [T]
139
+ log_mel = s3tokenizer.log_mel_spectrogram(audio) # [num_mels, T]
140
+
141
+ # 2. feature for speaker embedding
142
+ spk_feat = kaldi.fbank(audio.unsqueeze(0), num_mel_bins=80, dither=0, sample_frequency=16000)
143
+ spk_feat = spk_feat - spk_feat.mean(dim=0, keepdim=True)
144
+ spk_emb = self.spk_model.run(
145
+ None, {self.spk_model.get_inputs()[0].name: spk_feat.unsqueeze(dim=0).cpu().numpy()}
146
+ )[0].flatten().tolist()
147
+
148
+ # 3. feature for flow
149
+ audio, sample_rate = torchaudio.load(data['prompt_wav'], backend='soundfile')
150
+ audio = audio.mean(dim=0, keepdim=True) # [1, T]
151
+ if sample_rate != 24000:
152
+ audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=24000)(audio)
153
+ mel = mel_spectrogram(audio).transpose(1, 2).squeeze(0) # [T, num_mels]
154
+ mel_len = mel.shape[0]
155
+
156
+ # 4. feature for llm
157
+ if self.text_norm is not None:
158
+ prompt_texts = [i["text"] for i in json.loads(self.text_norm.do_voicegen_frd(data['prompt_text'].strip()))["sentences"]]
159
+ prompt_text = ''.join(prompt_texts)
160
+ texts = [i["text"] for i in json.loads(self.text_norm.do_voicegen_frd(data['text'].strip()))["sentences"]]
161
+ text = ''.join(texts)
162
+ else:
163
+ prompt_text = data['prompt_text']
164
+ text = data['text']
165
+ prompt_text_ids = self.text_tokenizer.encode(prompt_text)
166
+ prompt_text_ids = [i + self.model_config.hf_config.speech_vocab_size + 2 for i in prompt_text_ids]
167
+ text_ids = self.text_tokenizer.encode(text)
168
+ text_ids = [i + self.model_config.hf_config.speech_vocab_size + 2 for i in text_ids]
169
+ item = {
170
+ "prompt_text_tokens": prompt_text_ids, "text_tokens": text_ids,
171
+ "spk_emb": spk_emb, "mel": mel, "mel_len": mel_len, "log_mel": log_mel, "info": data,
172
+ "min_tokens": len(text_ids) * self.model_config.min_token_text_ratio,
173
+ "max_tokens": len(text_ids) * self.model_config.max_token_text_ratio,
174
+ }
175
+ except Exception as e:
176
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
177
+ tqdm.write(f"[{timestamp}] - [WARNING] - Error processing data item {data.get('key', idx)}: {e}")
178
+ return None
179
+ return item
180
+
181
+
182
+ def collate_fn(batch):
183
+ prompt_mels_for_llm = [item["log_mel"] for item in batch if item is not None]
184
+ prompt_mels_for_llm, prompt_mels_lens_for_llm = s3tokenizer.padding(prompt_mels_for_llm) # [B, num_mels=128, T]
185
+ prompt_text_tokens_for_llm = [item["prompt_text_tokens"] for item in batch if item is not None]
186
+ text_tokens_for_llm = [item["text_tokens"] for item in batch if item is not None]
187
+ prompt_mels_for_flow = [item["mel"] for item in batch if item is not None]
188
+ prompt_mels_for_flow = torch.nn.utils.rnn.pad_sequence(prompt_mels_for_flow, batch_first=True, padding_value=0) # [B, T', num_mels=80]
189
+ prompt_mels_lens_for_flow = [item["mel_len"] for item in batch if item is not None]
190
+ prompt_mels_lens_for_flow = torch.tensor(prompt_mels_lens_for_flow)
191
+ spk_emb_for_flow = [item["spk_emb"] for item in batch if item is not None]
192
+ spk_emb_for_flow = torch.tensor(spk_emb_for_flow)
193
+ sampling_params = [SamplingParams(min_tokens=item["min_tokens"], max_tokens=item["max_tokens"], use_ras=True) for item in batch if item is not None]
194
+ infos = [item["info"] for item in batch if item is not None]
195
+ return {
196
+ "prompt_mels_for_llm": prompt_mels_for_llm,
197
+ "prompt_mels_lens_for_llm": prompt_mels_lens_for_llm,
198
+ "prompt_text_tokens_for_llm": prompt_text_tokens_for_llm,
199
+ "text_tokens_for_llm": text_tokens_for_llm,
200
+ "prompt_mels_for_flow": prompt_mels_for_flow,
201
+ "prompt_mels_lens_for_flow": prompt_mels_lens_for_flow,
202
+ "spk_emb_for_flow": spk_emb_for_flow,
203
+ "sampling_params": sampling_params,
204
+ "infos": infos,
205
+ }
206
+
207
+
208
+ def init_distributed():
209
+ world_size = int(os.environ.get('WORLD_SIZE', 1))
210
+ local_rank = int(os.environ.get('LOCAL_RANK', 0))
211
+ rank = int(os.environ.get('RANK', 0))
212
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
213
+ tqdm.write(f'[{timestamp}] - [INFO] - Inference on multiple gpus, this gpu {local_rank}, rank {rank}, world_size {world_size}')
214
+ torch.cuda.set_device(local_rank)
215
+ dist.init_process_group("nccl")
216
+ return world_size, local_rank, rank
217
+
218
+
219
+ def get_args():
220
+ parser = argparse.ArgumentParser(description='FlashCosyVoice')
221
+ parser.add_argument('--model_path',
222
+ required=True,
223
+ type=str,
224
+ help='model path')
225
+ parser.add_argument('--data_list',
226
+ required=True,
227
+ type=str,
228
+ help='data list')
229
+ parser.add_argument('--batch_size_dataloader',
230
+ required=True,
231
+ type=int,
232
+ help='batch size (per-device) for dataloading')
233
+ parser.add_argument('--batch_size_flow',
234
+ required=True,
235
+ type=int,
236
+ help='batch size (per-device) for flow-matching')
237
+ parser.add_argument('--num_workers',
238
+ type=int,
239
+ default=4,
240
+ help='workers for dataloader')
241
+ parser.add_argument('--prefetch',
242
+ type=int,
243
+ default=5,
244
+ help='prefetch for dataloader')
245
+ parser.add_argument('--enable_tn',
246
+ action='store_true',
247
+ help='enable text normalization')
248
+ parser.add_argument('--only_llm',
249
+ action='store_true',
250
+ help='only generate speech tokens from llm')
251
+ parser.add_argument('--fp16_flow',
252
+ action='store_true',
253
+ help='enable fp16 flow')
254
+ parser.add_argument('--seed',
255
+ type=int,
256
+ default=1986,
257
+ help='random seed for generation')
258
+ args = parser.parse_args()
259
+ return args
260
+
261
+
262
+ def main():
263
+ args = get_args()
264
+
265
+ if args.enable_tn:
266
+ # Check python version, if == 3.10, use ttsfrd
267
+ if sys.version_info.major == 3 and sys.version_info.minor == 10:
268
+ # Check if ttsfrd is installed
269
+ try:
270
+ import ttsfrd
271
+ from cosyvoice_ttsfrd import get_resource_path
272
+ except ImportError as e:
273
+ raise ImportError("ttsfrd is not installed, please install it first, see `https://github.com/xingchensong/CosyVoice-ttsfrd` for installation guide.") from e
274
+ text_norm = ttsfrd.TtsFrontendEngine()
275
+ text_norm.initialize(get_resource_path())
276
+ text_norm.set_lang_type('pinyinvg')
277
+ else:
278
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
279
+ tqdm.write(f"[{timestamp}] - [WARNING] - Only python 3.10 is supported for ttsfrd, see `https://github.com/xingchensong/CosyVoice-ttsfrd` for more info. Setting enable_tn to False...")
280
+ # TODO: maybe we should use wetext if python version is not 3.10?
281
+ args.enable_tn = False
282
+ text_norm = None
283
+ else:
284
+ text_norm = None
285
+
286
+ assert (torch.cuda.is_available())
287
+ world_size, local_rank, rank = init_distributed()
288
+ config = Config(model=args.model_path, enforce_eager=True, tensor_parallel_size=1,
289
+ max_num_seqs=args.batch_size_dataloader,
290
+ hf_config=CosyVoice2LLMConfig(fp16_flow=args.fp16_flow), rank=local_rank)
291
+ model = CosyVoice2(config)
292
+
293
+ set_all_random_seed(args.seed)
294
+
295
+ dataset = AudioDataset(text_norm, model.llm.tokenizer, args.data_list, config)
296
+ sampler = DistributedSampler(dataset,
297
+ num_replicas=world_size,
298
+ rank=rank)
299
+ dataloader = DataLoader(dataset, batch_size=args.batch_size_dataloader, num_workers=args.num_workers, pin_memory=True,
300
+ sampler=sampler, shuffle=False, prefetch_factor=args.prefetch, collate_fn=collate_fn)
301
+ total_steps = len(dataset)
302
+
303
+ if local_rank == 0:
304
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
305
+ tqdm.write(f"[{timestamp}] - [INFO] - {args}")
306
+ progress_bar = tqdm(total=total_steps, desc="Processing samples", unit="wav",
307
+ position=0, leave=True, dynamic_ncols=True)
308
+
309
+ cpu_counts = os.cpu_count()
310
+ executor = ThreadPoolExecutor(max_workers=min(args.batch_size_dataloader, cpu_counts // 8))
311
+ pending_futures = []
312
+ dataloader_iter = iter(dataloader)
313
+ succeed_duration = 0.01 # avoid division by zero
314
+ start_time = time.time()
315
+ estimated_total_wavs = 0
316
+ succeed_wavs = 0
317
+ failed_wavs = 0
318
+ last_print_time = start_time
319
+
320
+ while True:
321
+ try:
322
+ dataloader_start = time.time()
323
+ batch = next(dataloader_iter)
324
+ dataloader_time = time.time() - dataloader_start
325
+
326
+ if len(batch['infos']) == 0:
327
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
328
+ tqdm.write(f"[{timestamp}] - [WARNING] - rank {rank} of {world_size}: No valid batch found, skipping this batch...")
329
+ continue
330
+
331
+ model_start = time.time()
332
+ results_dict, timing_stats = model(**batch, batch_size_flow=args.batch_size_flow,
333
+ only_llm=args.only_llm)
334
+ model_time = time.time() - model_start
335
+
336
+ estimated_total_wavs += len(results_dict['generated_wavs'])
337
+
338
+ timing_stats['dataloader_time'] = dataloader_time
339
+ timing_stats['model_inference_time'] = model_time
340
+
341
+ if args.only_llm:
342
+ results_dict['generated_wavs'] = [None] * len(results_dict['prompt_speech_tokens'])
343
+
344
+ for i in range(len(results_dict['generated_wavs'])):
345
+ future = executor.submit(
346
+ save_file_async, results_dict['generated_wavs'][i],
347
+ results_dict['prompt_speech_tokens'][i],
348
+ results_dict['generated_speech_tokens'][i],
349
+ batch['infos'][i].copy(), timing_stats.copy()
350
+ )
351
+ pending_futures.append(future)
352
+
353
+ completed_futures = []
354
+ for future in pending_futures:
355
+ if future.done():
356
+ try:
357
+ duration = future.result()
358
+ succeed_duration += duration
359
+ succeed_wavs += 1
360
+ except Exception as e:
361
+ failed_wavs += 1
362
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
363
+ tqdm.write(f"[{timestamp}] - [ERROR] - rank {rank} of {world_size}: Error in async save task: {e}")
364
+ completed_futures.append(future)
365
+
366
+ for future in completed_futures:
367
+ pending_futures.remove(future)
368
+
369
+ if local_rank == 0:
370
+ update_n = world_size * len(batch["prompt_text_tokens_for_llm"])
371
+ if progress_bar.n + update_n > progress_bar.total:
372
+ progress_bar.update(progress_bar.total - progress_bar.n)
373
+ else:
374
+ progress_bar.update(update_n)
375
+
376
+ current_time = time.time()
377
+ if current_time - last_print_time >= 120 and not args.only_llm:
378
+ elapsed_time = current_time - start_time
379
+ avg_duration = succeed_duration / succeed_wavs if succeed_wavs > 0 else 0
380
+ estimated_total_duration = avg_duration * estimated_total_wavs
381
+ current_rtf = elapsed_time / estimated_total_duration if estimated_total_duration > 0.01 else 0
382
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
383
+ tqdm.write(f"[{timestamp}] - [INFO] - rank {rank} of {world_size}: Estimated total wavs: {estimated_total_wavs} ({estimated_total_wavs - succeed_wavs} pending to save), Succeed wavs: {succeed_wavs}, Failed wavs: {failed_wavs}, Estimated total duration: {estimated_total_duration:.2f}s ({estimated_total_duration / 3600:.2f} h), Estimated RTF: {current_rtf:.5f}, Elapsed time: {elapsed_time:.2f}s") # noqa
384
+ last_print_time = current_time
385
+ except StopIteration:
386
+ break
387
+ except Exception as e:
388
+ failed_wavs += 1
389
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
390
+ tqdm.write(f"[{timestamp}] - [ERROR] - rank {rank} of {world_size}: Error in main loop: {e}")
391
+ continue
392
+
393
+ total_time = time.time() - start_time
394
+
395
+ if local_rank == 0:
396
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
397
+ tqdm.write(f"[{timestamp}] - [INFO] - Waiting for {len(pending_futures)} pending save tasks to complete...")
398
+
399
+ for future in pending_futures:
400
+ try:
401
+ duration = future.result(timeout=60)
402
+ succeed_duration += duration
403
+ succeed_wavs += 1
404
+ except Exception as e:
405
+ failed_wavs += 1
406
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
407
+ tqdm.write(f"[{timestamp}] - [ERROR] - rank {rank} of {world_size}: Error in final async save task: {e}")
408
+ executor.shutdown(wait=True)
409
+
410
+ if local_rank == 0:
411
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
412
+ tqdm.write(f"[{timestamp}] - [INFO] - All async save tasks completed.")
413
+ progress_bar.close()
414
+
415
+ if not args.only_llm:
416
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
417
+ tqdm.write(f"[{timestamp}] - [INFO] - rank {rank} of {world_size}: Final Report - Succeed wavs: {succeed_wavs}, Failed wavs: {failed_wavs}, Total duration: {succeed_duration:.2f}s ({succeed_duration / 3600:.2f} h), RTF: {total_time / succeed_duration:.5f}") # noqa
418
+
419
+ dist.barrier()
420
+ dist.destroy_process_group()
421
+
422
+
423
+ if __name__ == "__main__":
424
+ main()
flashcosyvoice/config.py ADDED
@@ -0,0 +1,80 @@
1
+ import os
2
+ from dataclasses import dataclass, field
3
+
4
+ import torch
5
+ from transformers import AutoConfig
6
+
7
+
8
+ @dataclass
9
+ class CosyVoice2LLMConfig:
10
+ architectures: list[str] = field(default_factory=lambda: ["Qwen2ForCausalLM"])
11
+ attention_dropout: float = 0.0
12
+ bos_token_id: int = 151643
13
+ eos_token_id: int = 6561 # speech eos
14
+ hidden_act: str = "silu"
15
+ hidden_size: int = 896
16
+ initializer_range: float = 0.02
17
+ intermediate_size: int = 4864
18
+ max_position_embeddings: int = 32768
19
+ max_window_layers: int = 24
20
+ model_type: str = "qwen2"
21
+ num_attention_heads: int = 14
22
+ num_hidden_layers: int = 24
23
+ num_key_value_heads: int = 2
24
+ head_dim: int = 64
25
+ rms_norm_eps: float = 1e-06
26
+ rope_scaling: dict | None = None
27
+ rope_theta: float = 1000000.0
28
+ sliding_window: int = 32768
29
+ tie_word_embeddings: bool = False
30
+ torch_dtype: torch.dtype = torch.bfloat16
31
+ transformers_version: str = "4.52.0.dev0"
32
+ use_cache: bool = True
33
+ use_sliding_window: bool = False
34
+ vocab_size: int = 158500 # text_vocab_size + speech_vocab_size + 2 (eos and task_id)
35
+ text_vocab_size: int = 151936
36
+ speech_vocab_size: int = 6562  # actually 6564; we only target non-streaming inference, so tokens 6562 and 6563 (used only for streaming TTS) are cut off
37
+ lm_head_bias: bool = True
38
+ qkv_bias: bool = True
39
+ fp16_flow: bool = True
40
+
41
+
42
+ @dataclass
43
+ class SamplingParams:
44
+ temperature: float = 1.0
45
+ min_tokens: int = 2
46
+ max_tokens: int = 64
47
+ ignore_eos: bool = False
48
+ top_k: int = 25
49
+ # RasSampler parameters
50
+ use_ras: bool = False
51
+ win_size: int = 10
52
+ tau_r: float = 0.1
53
+ top_p: float = 0.8
54
+
55
+
56
+ @dataclass
57
+ class Config:
58
+ model: str
59
+ max_num_batched_tokens: int = 1572864
60
+ max_num_seqs: int = 1024
61
+ max_model_len: int = 1536 # 15s prompt + 30s generated audio for 25hz audio tokenizer
62
+ gpu_memory_utilization: float = 0.9
63
+ tensor_parallel_size: int = 1
64
+ enforce_eager: bool = False
65
+ hf_config: CosyVoice2LLMConfig | AutoConfig = field(default_factory=CosyVoice2LLMConfig)
66
+ eos: int = -1
67
+ kvcache_block_size: int = 256
68
+ num_kvcache_blocks: int = -1
69
+ min_token_text_ratio: int = 2
70
+ max_token_text_ratio: int = 20
71
+ rank: int = 0
72
+
73
+ def __post_init__(self):
74
+ assert os.path.isdir(self.model)
75
+ assert self.kvcache_block_size % 256 == 0
76
+ assert 1 <= self.tensor_parallel_size <= 8
77
+
78
+ max_pos = getattr(self.hf_config, "max_position_embeddings", 4096)
79
+ self.max_model_len = min(self.max_model_len, max_pos)
80
+ assert self.max_num_batched_tokens >= self.max_model_len
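For orientation, a minimal usage sketch of these dataclasses (the checkpoint directory below is a placeholder; Config.__post_init__ asserts that it exists, and it is expected to contain the LLM weights plus flow.pt and hift.pt):

    from flashcosyvoice.config import Config, SamplingParams

    # Placeholder path to a CosyVoice2 checkpoint directory.
    cfg = Config(model="/path/to/CosyVoice2-0.5B", enforce_eager=True)
    # RAS sampling with the defaults defined above.
    sp = SamplingParams(temperature=1.0, top_k=25, use_ras=True, win_size=10, tau_r=0.1, top_p=0.8)
    print(cfg.max_model_len, cfg.hf_config.speech_vocab_size)  # 1536 6562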
flashcosyvoice/cosyvoice2.py ADDED
@@ -0,0 +1,160 @@
1
+ # Copyright (c) 2025 Tsinghua Univ. (authors: Xingchen Song)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import time
15
+ from datetime import datetime
16
+
17
+ import s3tokenizer
18
+ import torch
19
+ from tqdm import tqdm
20
+
21
+ from flashcosyvoice.config import Config, SamplingParams
22
+ from flashcosyvoice.engine.llm_engine import LLMEngine
23
+ from flashcosyvoice.modules.flow import CausalMaskedDiffWithXvec
24
+ from flashcosyvoice.modules.hifigan import HiFTGenerator
25
+
26
+
27
+ class CosyVoice2(torch.nn.Module):
28
+ def __init__(self, config: Config = None):
29
+ super().__init__()
30
+ self.config = Config() if config is None else config
31
+
32
+ self.audio_tokenizer = s3tokenizer.load_model("speech_tokenizer_v2_25hz").cuda().eval()
33
+
34
+ self.llm = LLMEngine(**self.config.__dict__)
35
+
36
+ self.use_tqdm = torch.distributed.get_node_local_rank() == 0
37
+
38
+ self.flow = CausalMaskedDiffWithXvec()
39
+ if self.config.hf_config.fp16_flow:
40
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
41
+ tqdm.write(f"[{timestamp}] - [INFO] - Casting flow to fp16")
42
+ self.flow.half()
43
+ self.flow.load_state_dict(torch.load(f"{self.config.model}/flow.pt", map_location="cpu", weights_only=True), strict=True)
44
+ self.flow.cuda().eval()
45
+
46
+ self.hift = HiFTGenerator()
47
+ hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(f"{self.config.model}/hift.pt", map_location="cpu", weights_only=True).items()}
48
+ self.hift.load_state_dict(hift_state_dict, strict=True)
49
+ self.hift.cuda().eval()
50
+
51
+ @torch.inference_mode()
52
+ def forward(
53
+ self, prompt_mels_for_llm: torch.Tensor, prompt_mels_lens_for_llm: torch.Tensor,
54
+ prompt_text_tokens_for_llm: list[list[int]], text_tokens_for_llm: list[list[int]],
55
+ prompt_mels_for_flow: torch.Tensor, prompt_mels_lens_for_flow: torch.Tensor,
56
+ spk_emb_for_flow: torch.Tensor,
57
+ sampling_params: SamplingParams | list[SamplingParams],
58
+ batch_size_flow: int,
59
+ only_llm: bool,
60
+ **kwargs, # for compatibility
61
+ ):
62
+ timing_stats = {}
63
+
64
+ # Audio tokenization
65
+ start_time = time.time()
66
+ prompt_speech_tokens, prompt_speech_tokens_lens = self.audio_tokenizer.quantize(
67
+ prompt_mels_for_llm.cuda(), prompt_mels_lens_for_llm.cuda()
68
+ )
69
+ timing_stats['audio_tokenization'] = time.time() - start_time
70
+
71
+ batch_size = prompt_speech_tokens.shape[0]
72
+ assert len(prompt_text_tokens_for_llm) == batch_size
73
+
74
+ # Prepare LLM inputs
75
+ start_time = time.time()
76
+ valid_prompt_speech_tokens = []
77
+ inputs = []
78
+ for i in range(batch_size):
79
+ speech_tokens_i = prompt_speech_tokens[i, :prompt_speech_tokens_lens[i].item()].tolist()
80
+ valid_prompt_speech_tokens.append(speech_tokens_i)
81
+ inputs.append([self.config.hf_config.speech_vocab_size] + prompt_text_tokens_for_llm[i] + text_tokens_for_llm[i] + [self.config.hf_config.speech_vocab_size + 1] + speech_tokens_i)
82
+ timing_stats['prepare_llm_inputs'] = time.time() - start_time
83
+
84
+ # LLM generation
85
+ start_time = time.time()
86
+ llm_outputs = self.llm.generate(inputs, sampling_params, use_tqdm=self.use_tqdm)
87
+ timing_stats['llm_generation'] = time.time() - start_time
88
+
89
+ results_dict = {
90
+ "prompt_speech_tokens": valid_prompt_speech_tokens,
91
+ "generated_speech_tokens": [o['token_ids'][:-1] for o in llm_outputs],
92
+ }
93
+ if only_llm:
94
+ return results_dict, timing_stats
95
+
96
+ # Prepare Flow inputs
97
+ start_time = time.time()
98
+ flow_inputs = []
99
+ flow_inputs_lens = []
100
+ for i, o in enumerate(llm_outputs):
101
+ generated_speech_tokens = o['token_ids'][:-1] # ignore last eos
102
+ prompt_speech_tokens = valid_prompt_speech_tokens[i]
103
+ flow_inputs.append(torch.tensor(prompt_speech_tokens + generated_speech_tokens))
104
+ flow_inputs_lens.append(len(prompt_speech_tokens) + len(generated_speech_tokens))
105
+ flow_inputs = torch.nn.utils.rnn.pad_sequence(flow_inputs, batch_first=True, padding_value=0)
106
+ flow_inputs_lens = torch.tensor(flow_inputs_lens)
107
+ timing_stats['prepare_flow_inputs'] = time.time() - start_time
108
+
109
+ # Flow generation and HiFi-GAN generation (with batching)
110
+ total_batch_size = flow_inputs.shape[0]
111
+ generated_wavs = []
112
+ flow_total_time = 0.0
113
+ hifigan_total_time = 0.0
114
+
115
+ # Process in batches according to batch_size_flow, batch_size_flow <= total_batch_size
116
+ # NOTE(xcsong): When executing both LLM and Flow on the same GPU,
117
+ # Flow can easily saturate the GPU's SMs and memory, so it is processed in smaller batches to avoid OOM.
118
+ num_batches = (total_batch_size + batch_size_flow - 1) // batch_size_flow
119
+ batch_iterator = range(0, total_batch_size, batch_size_flow)
120
+ if self.use_tqdm:
121
+ batch_iterator = tqdm(batch_iterator, desc="Generating wavs (Flow+HiFi-GAN)", leave=False, unit="batch",
122
+ total=num_batches, dynamic_ncols=True, position=self.config.rank + 1)
123
+
124
+ for start_idx in batch_iterator:
125
+ end_idx = min(start_idx + batch_size_flow, total_batch_size)
126
+ batch_flow_inputs = flow_inputs[start_idx:end_idx]
127
+ batch_flow_inputs_lens = flow_inputs_lens[start_idx:end_idx]
128
+ batch_prompt_mels = prompt_mels_for_flow[start_idx:end_idx]
129
+ batch_prompt_mels_lens = prompt_mels_lens_for_flow[start_idx:end_idx]
130
+ batch_spk_emb = spk_emb_for_flow[start_idx:end_idx]
131
+
132
+ # Flow generation for this batch
133
+ flow_start_time = time.time()
134
+ with torch.amp.autocast("cuda", dtype=torch.float16 if self.config.hf_config.fp16_flow else torch.float32):
135
+ batch_generated_mels, batch_generated_mels_lens = self.flow(
136
+ batch_flow_inputs.cuda(), batch_flow_inputs_lens.cuda(),
137
+ batch_prompt_mels.cuda(), batch_prompt_mels_lens.cuda(), batch_spk_emb.cuda(),
138
+ streaming=False, finalize=True
139
+ )
140
+ flow_total_time += time.time() - flow_start_time
141
+
142
+ # HiFi-GAN generation for this batch
143
+ hifigan_start_time = time.time()
144
+ batch_size_current = end_idx - start_idx
145
+ for i in range(batch_size_current):
146
+ mel = batch_generated_mels[i, :, batch_prompt_mels_lens[i].item():batch_generated_mels_lens[i].item()].unsqueeze(0)
147
+ wav, _ = self.hift(speech_feat=mel)
148
+ generated_wavs.append(wav)
149
+ hifigan_total_time += time.time() - hifigan_start_time
150
+
151
+ timing_stats['flow_generation'] = flow_total_time
152
+ timing_stats['hifigan_generation'] = hifigan_total_time
153
+
154
+ # Calculate total time and batch statistics
155
+ timing_stats['model.forward_total'] = sum(timing_stats.values())
156
+ timing_stats['batch_size'] = len(generated_wavs)
157
+ timing_stats['batch_size_flow'] = batch_size_flow
158
+
159
+ results_dict['generated_wavs'] = generated_wavs
160
+ return results_dict, timing_stats
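To summarize the request layout assembled in forward above, each LLM prompt is built as

    # Illustrative layout (ids are the defaults from config.py; 6562/6563 are the two extra
    # vocabulary slots the config comment labels eos and task_id):
    # [6562] + prompt_text_tokens + text_tokens + [6563] + prompt_speech_tokens

and the model then decodes 25 Hz speech tokens until the speech EOS; the trailing EOS is dropped (token_ids[:-1]) before prompt and generated speech tokens are concatenated as input to the flow model.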
flashcosyvoice/cosyvoice3.py ADDED
@@ -0,0 +1 @@
1
+ # TODO(xcsong): Implement CosyVoice3 when it is released
flashcosyvoice/engine/__init__.py ADDED
File without changes
flashcosyvoice/engine/block_manager.py ADDED
@@ -0,0 +1,114 @@
1
+ from collections import deque
2
+
3
+ import numpy as np
4
+ import xxhash
5
+
6
+ from flashcosyvoice.engine.sequence import Sequence
7
+
8
+
9
+ class Block:
10
+
11
+ def __init__(self, block_id):
12
+ self.block_id = block_id
13
+ self.ref_count = 0
14
+ self.hash = -1
15
+ self.token_ids = []
16
+
17
+ def update(self, hash: int, token_ids: list[int]):
18
+ self.hash = hash
19
+ self.token_ids = token_ids
20
+
21
+ def reset(self):
22
+ self.ref_count = 1
23
+ self.hash = -1
24
+ self.token_ids = []
25
+
26
+
27
+ class BlockManager:
28
+
29
+ def __init__(self, num_blocks: int, block_size: int):
30
+ assert num_blocks > 0
31
+ self.block_size = block_size
32
+ self.blocks: list[Block] = [Block(i) for i in range(num_blocks)]
33
+ self.hash_to_block_id: dict[int, int] = dict()
34
+ self.free_block_ids: deque[int] = deque(range(num_blocks))
35
+ self.used_block_ids: set[int] = set()
36
+
37
+ @classmethod
38
+ def compute_hash(cls, token_ids: list[int], prefix: int = -1):
39
+ h = xxhash.xxh64()
40
+ if prefix != -1:
41
+ h.update(prefix.to_bytes(8, "little"))
42
+ h.update(np.array(token_ids).tobytes())
43
+ return h.intdigest()
44
+
45
+ def _allocate_block(self, block_id: int) -> Block:
46
+ block = self.blocks[block_id]
47
+ assert block.ref_count == 0
48
+ block.reset()
49
+ self.free_block_ids.remove(block_id)
50
+ self.used_block_ids.add(block_id)
51
+ return self.blocks[block_id]
52
+
53
+ def _deallocate_block(self, block_id: int) -> Block:
54
+ assert self.blocks[block_id].ref_count == 0
55
+ self.used_block_ids.remove(block_id)
56
+ self.free_block_ids.append(block_id)
57
+
58
+ def can_allocate(self, seq: Sequence) -> bool:
59
+ return len(self.free_block_ids) >= seq.num_blocks
60
+
61
+ def allocate(self, seq: Sequence):
62
+ assert not seq.block_table
63
+ h = -1
64
+ cache_miss = False
65
+ for i in range(seq.num_blocks):
66
+ token_ids = seq.block(i)
67
+ h = self.compute_hash(token_ids, h) if len(token_ids) == self.block_size else -1
68
+ block_id = self.hash_to_block_id.get(h, -1)
69
+ if block_id == -1 or self.blocks[block_id].token_ids != token_ids:
70
+ cache_miss = True
71
+ if cache_miss:
72
+ block_id = self.free_block_ids[0]
73
+ block = self._allocate_block(block_id)
74
+ else:
75
+ seq.num_cached_tokens += self.block_size
76
+ if block_id in self.used_block_ids:
77
+ block = self.blocks[block_id]
78
+ block.ref_count += 1
79
+ else:
80
+ block = self._allocate_block(block_id)
81
+ if h != -1:
82
+ block.update(h, token_ids)
83
+ self.hash_to_block_id[h] = block_id
84
+ seq.block_table.append(block_id)
85
+
86
+ def deallocate(self, seq: Sequence):
87
+ for block_id in reversed(seq.block_table):
88
+ block = self.blocks[block_id]
89
+ block.ref_count -= 1
90
+ if block.ref_count == 0:
91
+ self._deallocate_block(block_id)
92
+ seq.num_cached_tokens = 0
93
+ seq.block_table.clear()
94
+
95
+ def can_append(self, seq: Sequence) -> bool:
96
+ return len(self.free_block_ids) >= (len(seq) % self.block_size == 1)
97
+
98
+ def may_append(self, seq: Sequence):
99
+ block_table = seq.block_table
100
+ last_block = self.blocks[block_table[-1]]
101
+ if len(seq) % self.block_size == 1:
102
+ assert last_block.hash != -1
103
+ block_id = self.free_block_ids[0]
104
+ self._allocate_block(block_id)
105
+ block_table.append(block_id)
106
+ elif len(seq) % self.block_size == 0:
107
+ assert last_block.hash == -1
108
+ token_ids = seq.block(seq.num_blocks - 1)
109
+ prefix = self.blocks[block_table[-2]].hash if len(block_table) > 1 else -1
110
+ h = self.compute_hash(token_ids, prefix)
111
+ last_block.update(h, token_ids)
112
+ self.hash_to_block_id[h] = last_block.block_id
113
+ else:
114
+ assert last_block.hash == -1
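A small sketch of the chained hashing behind the prefix cache above: every full block's hash folds in the previous block's hash, so sequences sharing the same leading blocks resolve to the same keys in hash_to_block_id and can reuse cached blocks in allocate().

    from flashcosyvoice.engine.block_manager import BlockManager

    block_a = list(range(256))            # one full block of token ids
    block_b = list(range(256, 512))
    h_a = BlockManager.compute_hash(block_a)              # first block: no prefix
    h_b = BlockManager.compute_hash(block_b, prefix=h_a)  # second block chains on h_a
    # Identical prefixes always reproduce identical keys, which is what allocate() checks.
    assert h_a == BlockManager.compute_hash(block_a)
    assert h_b == BlockManager.compute_hash(block_b, prefix=h_a)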
flashcosyvoice/engine/llm_engine.py ADDED
@@ -0,0 +1,125 @@
1
+ import atexit
2
+ from dataclasses import fields
3
+ from time import perf_counter
4
+
5
+ import torch.multiprocessing as mp
6
+ from tqdm.auto import tqdm
7
+ from transformers import AutoTokenizer
8
+
9
+ from flashcosyvoice.config import Config, SamplingParams
10
+ from flashcosyvoice.engine.model_runner import ModelRunner
11
+ from flashcosyvoice.engine.scheduler import Scheduler
12
+ from flashcosyvoice.engine.sequence import Sequence
13
+
14
+
15
+ class LLMEngine:
16
+
17
+ def __init__(self, model, **kwargs):
18
+ config_fields = {field.name for field in fields(Config)}
19
+ config_kwargs = {k: v for k, v in kwargs.items() if k in config_fields}
20
+ config = Config(model, **config_kwargs)
21
+ self.ps = []
22
+ self.events = []
23
+ ctx = mp.get_context("spawn")
24
+ assert config.tensor_parallel_size == 1, "NOTE(xcsong): Currently only support tp=1"
25
+ for i in range(1, config.tensor_parallel_size):
26
+ event = ctx.Event()
27
+ process = ctx.Process(target=ModelRunner, args=(config, i, event))
28
+ process.start()
29
+ self.ps.append(process)
30
+ self.events.append(event)
31
+ if hasattr(config.hf_config, "speech_vocab_size"):
32
+ # NOTE: this is a non-chat model; all of these special tokens remain randomly initialized.
33
+ special_tokens = {
34
+ 'eos_token': '<|endoftext|>',
35
+ 'pad_token': '<|endoftext|>',
36
+ 'additional_special_tokens': [
37
+ '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
38
+ '[breath]', '<strong>', '</strong>', '[noise]',
39
+ '[laughter]', '[cough]', '[clucking]', '[accent]',
40
+ '[quick_breath]',
41
+ "<laughter>", "</laughter>",
42
+ "[hissing]", "[sigh]", "[vocalized-noise]",
43
+ "[lipsmack]", "[mn]"
44
+ ]
45
+ }
46
+ self.tokenizer = AutoTokenizer.from_pretrained(f"{config.model}/CosyVoice-BlankEN")
47
+ self.tokenizer.add_special_tokens(special_tokens)
48
+ self.skip_special_tokens = True
49
+ else:
50
+ self.tokenizer = AutoTokenizer.from_pretrained(config.model, use_fast=True)
51
+ if hasattr(config.hf_config, "eos_token_id"):
52
+ config.eos = config.hf_config.eos_token_id
53
+ else:
54
+ config.eos = self.tokenizer.eos_token_id
55
+ self.model_runner = ModelRunner(config, config.rank, self.events)
56
+ self.scheduler = Scheduler(config)
57
+ self.config = config
58
+ atexit.register(self.exit)
59
+
60
+ def exit(self):
61
+ self.model_runner.call("exit")
62
+ del self.model_runner
63
+ for p in self.ps:
64
+ p.join()
65
+
66
+ def add_request(self, prompt: str | list[int], sampling_params: SamplingParams):
67
+ if isinstance(prompt, str):
68
+ prompt = self.tokenizer.encode(prompt)
69
+ seq = Sequence(prompt, sampling_params)
70
+ self.scheduler.add(seq)
71
+
72
+ def step(self):
73
+ seqs, is_prefill = self.scheduler.schedule()
74
+ token_ids = self.model_runner.call("run", seqs, is_prefill)
75
+ self.scheduler.postprocess(seqs, token_ids)
76
+ outputs = [(seq.seq_id, seq.completion_token_ids) for seq in seqs if seq.is_finished]
77
+ num_tokens = sum(len(seq) for seq in seqs) if is_prefill else -len(seqs)
78
+ return outputs, num_tokens
79
+
80
+ def is_finished(self):
81
+ return self.scheduler.is_finished()
82
+
83
+ def generate(
84
+ self,
85
+ prompts: list[str] | list[list[int]],
86
+ sampling_params: SamplingParams | list[SamplingParams],
87
+ use_tqdm: bool = True,
88
+ ) -> list[str]:
89
+ if use_tqdm:
90
+ pbar = tqdm(total=len(prompts), desc="Generating tokens (LLM)", leave=False,
91
+ dynamic_ncols=True, position=self.config.rank + 1)
92
+ if not isinstance(sampling_params, list):
93
+ sampling_params = [sampling_params] * len(prompts)
94
+ for prompt, sp in zip(prompts, sampling_params):
95
+ self.add_request(prompt, sp)
96
+ outputs = {}
97
+ prefill_throughput = decode_throughput = instant_decode_throughput = 0.
98
+ total_decode_tokens = 0
99
+ total_decode_time = 0.
100
+ while not self.is_finished():
101
+ t = perf_counter()
102
+ output, num_tokens = self.step()
103
+ step_time = perf_counter() - t
104
+ if use_tqdm:
105
+ if num_tokens > 0:
106
+ prefill_throughput = num_tokens / step_time
107
+ else:
108
+ instant_decode_throughput = -num_tokens / step_time
109
+ total_decode_tokens += -num_tokens
110
+ total_decode_time += step_time
111
+ decode_throughput = total_decode_tokens / total_decode_time if total_decode_time > 0 else 0
112
+ pbar.set_postfix({
113
+ "Prefill": f"{int(prefill_throughput)}tok/s",
114
+ "AvgDecode": f"{int(decode_throughput)}tok/s",
115
+ "InstDecode": f"{int(instant_decode_throughput)}tok/s",
116
+ })
117
+ for seq_id, token_ids in output:
118
+ outputs[seq_id] = token_ids
119
+ if use_tqdm:
120
+ pbar.update(1)
121
+ outputs = [outputs[seq_id] for seq_id in sorted(outputs)]
122
+ outputs = [{"text": self.tokenizer.decode(token_ids), "token_ids": token_ids} for token_ids in outputs]
123
+ if use_tqdm:
124
+ pbar.close()
125
+ return outputs
flashcosyvoice/engine/model_runner.py ADDED
@@ -0,0 +1,310 @@
1
+ import pickle
2
+ from multiprocessing.shared_memory import SharedMemory
3
+ from multiprocessing.synchronize import Event
4
+
5
+ import torch
6
+ import torch.distributed as dist
7
+
8
+ from flashcosyvoice.config import Config
9
+ from flashcosyvoice.engine.sequence import Sequence
10
+ from flashcosyvoice.modules.qwen2 import Qwen2ForCausalLM
11
+ from flashcosyvoice.modules.sampler import RasSampler, Sampler
12
+ from flashcosyvoice.utils.context import (get_context, reset_context,
13
+ set_context)
14
+ from flashcosyvoice.utils.loader import load_model
15
+
16
+
17
+ class ModelRunner:
18
+
19
+ def __init__(self, config: Config, rank: int, event: Event | list[Event]):
20
+ self.config = config
21
+ hf_config = config.hf_config
22
+ self.block_size = config.kvcache_block_size
23
+ self.enforce_eager = config.enforce_eager
24
+ self.world_size = config.tensor_parallel_size
25
+ self.rank = rank
26
+ self.event = event
27
+
28
+ # TODO(xcsong): support tp > 1
29
+ if self.world_size > 1:
30
+ dist.init_process_group("nccl", "tcp://localhost:2333", world_size=self.world_size, rank=rank)
31
+ torch.cuda.set_device(rank)
32
+ default_dtype = torch.get_default_dtype()
33
+ torch.set_default_dtype(hf_config.torch_dtype)
34
+ torch.set_default_device("cuda")
35
+ self.model = Qwen2ForCausalLM(hf_config)
36
+ load_model(self.model, config.model, hf_config)
37
+ self.sampler = Sampler()
38
+ self.ras_sampler = RasSampler()
39
+ self.warmup_model()
40
+ self.allocate_kv_cache()
41
+ if not self.enforce_eager:
42
+ self.capture_cudagraph()
43
+ torch.set_default_device("cpu")
44
+ torch.set_default_dtype(default_dtype)
45
+
46
+ if self.world_size > 1:
47
+ if rank == 0:
48
+ self.shm = SharedMemory(name="flashcosyvoice", create=True, size=2**20)
49
+ dist.barrier()
50
+ else:
51
+ dist.barrier()
52
+ self.shm = SharedMemory(name="flashcosyvoice")
53
+ self.loop()
54
+
55
+ def exit(self):
56
+ if self.world_size > 1:
57
+ self.shm.close()
58
+ dist.barrier()
59
+ if self.rank == 0:
60
+ self.shm.unlink()
61
+ if not self.enforce_eager:
62
+ del self.graphs, self.graph_pool
63
+ torch.cuda.synchronize()
64
+ if self.world_size > 1:
65
+ dist.destroy_process_group()
66
+
67
+ def loop(self):
68
+ while True:
69
+ method_name, args = self.read_shm()
70
+ self.call(method_name, *args)
71
+ if method_name == "exit":
72
+ break
73
+
74
+ def read_shm(self):
75
+ assert self.world_size > 1 and self.rank
76
+ self.event.wait()
77
+ n = int.from_bytes(self.shm.buf[0:4], "little")
78
+ method_name, *args = pickle.loads(self.shm.buf[4:n + 4])
79
+ self.event.clear()
80
+ return method_name, args
81
+
82
+ def write_shm(self, method_name, *args):
83
+ assert self.world_size > 1 and not self.rank
84
+ data = pickle.dumps([method_name, *args])
85
+ n = len(data)
86
+ self.shm.buf[0:4] = n.to_bytes(4, "little")
87
+ self.shm.buf[4:n + 4] = data
88
+ for event in self.event:
89
+ event.set()
90
+
91
+ def call(self, method_name, *args):
92
+ if self.world_size > 1 and self.rank == 0:
93
+ self.write_shm(method_name, *args)
94
+ method = getattr(self, method_name, None)
95
+ return method(*args)
96
+
97
+ def warmup_model(self):
98
+ torch.cuda.empty_cache()
99
+ torch.cuda.reset_peak_memory_stats()
100
+ max_num_batched_tokens, max_model_len = self.config.max_num_batched_tokens, self.config.max_model_len
101
+ num_seqs = min(max_num_batched_tokens // max_model_len, self.config.max_num_seqs)
102
+ seqs = [Sequence([0] * max_model_len) for _ in range(num_seqs)]
103
+ self.run(seqs, True)
104
+ torch.cuda.empty_cache()
105
+
106
+ def allocate_kv_cache(self):
107
+ config = self.config
108
+ hf_config = config.hf_config
109
+ free, total = torch.cuda.mem_get_info()
110
+ used = total - free
111
+ peak = torch.cuda.memory_stats()["allocated_bytes.all.peak"]
112
+ current = torch.cuda.memory_stats()["allocated_bytes.all.current"]
113
+ num_kv_heads = hf_config.num_key_value_heads // self.world_size
114
+ head_dim = getattr(hf_config, "head_dim", hf_config.hidden_size // hf_config.num_attention_heads)
115
+ block_bytes = 2 * hf_config.num_hidden_layers * self.block_size * num_kv_heads * head_dim * hf_config.torch_dtype.itemsize
116
+ config.num_kvcache_blocks = int(total * config.gpu_memory_utilization - used - peak + current) // block_bytes
117
+ assert config.num_kvcache_blocks > 0, "try to **increase** gpu_memory_utilization"
118
+ self.kv_cache = torch.zeros(2, hf_config.num_hidden_layers, config.num_kvcache_blocks, self.block_size, num_kv_heads, head_dim)
119
+ layer_id = 0
120
+ for module in self.model.modules():
121
+ if hasattr(module, "k_cache") and hasattr(module, "v_cache"):
122
+ module.k_cache = self.kv_cache[0, layer_id]
123
+ module.v_cache = self.kv_cache[1, layer_id]
124
+ layer_id += 1
125
+
126
+ def prepare_block_tables(self, seqs: list[Sequence]):
127
+ max_len = max(len(seq.block_table) for seq in seqs)
128
+ block_tables = [seq.block_table + [-1] * (max_len - len(seq.block_table)) for seq in seqs]
129
+ block_tables = torch.tensor(block_tables, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
130
+ return block_tables
131
+
132
+ def prepare_prefill(self, seqs: list[Sequence]):
133
+ input_ids = []
134
+ positions = []
135
+ cu_seqlens_q = [0]
136
+ cu_seqlens_k = [0]
137
+ max_seqlen_q = 0
138
+ max_seqlen_k = 0
139
+ slot_mapping = []
140
+ block_tables = None
141
+ for seq in seqs:
142
+ seqlen = len(seq)
143
+ input_ids.extend(seq[seq.num_cached_tokens:])
144
+ positions.extend(list(range(seq.num_cached_tokens, seqlen)))
145
+ seqlen_q = seqlen - seq.num_cached_tokens
146
+ seqlen_k = seqlen
147
+ cu_seqlens_q.append(cu_seqlens_q[-1] + seqlen_q)
148
+ cu_seqlens_k.append(cu_seqlens_k[-1] + seqlen_k)
149
+ max_seqlen_q = max(seqlen_q, max_seqlen_q)
150
+ max_seqlen_k = max(seqlen_k, max_seqlen_k)
151
+ if not seq.block_table:
152
+ continue
153
+ for i in range(seq.num_cached_blocks, seq.num_blocks):
154
+ start = seq.block_table[i] * self.block_size
155
+ if i != seq.num_blocks - 1:
156
+ end = start + self.block_size
157
+ else:
158
+ end = start + seq.last_block_num_tokens
159
+ slot_mapping.extend(list(range(start, end)))
160
+ if cu_seqlens_k[-1] > cu_seqlens_q[-1]: # prefix cache
161
+ block_tables = self.prepare_block_tables(seqs)
162
+ input_ids = torch.tensor(input_ids, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
163
+ positions = torch.tensor(positions, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
164
+ cu_seqlens_q = torch.tensor(cu_seqlens_q, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
165
+ cu_seqlens_k = torch.tensor(cu_seqlens_k, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
166
+ slot_mapping = torch.tensor(slot_mapping, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
167
+ set_context(True, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, slot_mapping, None, block_tables)
168
+ return input_ids, positions
169
+
170
+ def prepare_decode(self, seqs: list[Sequence]):
171
+ input_ids = []
172
+ positions = []
173
+ slot_mapping = []
174
+ context_lens = []
175
+ for seq in seqs:
176
+ input_ids.append(seq.last_token)
177
+ positions.append(len(seq))
178
+ context_lens.append(len(seq))
179
+ slot_mapping.append(seq.block_table[-1] * self.block_size + seq.last_block_num_tokens - 1)
180
+ input_ids = torch.tensor(input_ids, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
181
+ positions = torch.tensor(positions, dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
182
+ slot_mapping = torch.tensor(slot_mapping, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
183
+ context_lens = torch.tensor(context_lens, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
184
+ block_tables = self.prepare_block_tables(seqs)
185
+ set_context(False, slot_mapping=slot_mapping, context_lens=context_lens, block_tables=block_tables)
186
+ return input_ids, positions
187
+
188
+ def prepare_sample(self, seqs: list[Sequence]):
189
+ temperatures = []
190
+ top_ks = []
191
+ win_sizes = []
192
+ tau_rs = []
193
+ top_ps = []
194
+ min_tokens_list = []
195
+ use_ras_list = []
196
+
197
+ for seq in seqs:
198
+ temperatures.append(seq.temperature)
199
+ top_ks.append(seq.top_k)
200
+ win_sizes.append(seq.win_size)
201
+ tau_rs.append(seq.tau_r)
202
+ top_ps.append(seq.top_p)
203
+ min_tokens_list.append(seq.min_tokens)
204
+ use_ras_list.append(seq.use_ras)
205
+
206
+ temperatures_tensor = torch.tensor(temperatures, dtype=torch.float32, pin_memory=True).cuda(non_blocking=True)
207
+ # check all items equal
208
+ assert all(item == temperatures[0] for item in temperatures)
209
+ assert all(item == top_ks[0] for item in top_ks)
210
+ assert all(item == win_sizes[0] for item in win_sizes)
211
+ assert all(item == tau_rs[0] for item in tau_rs)
212
+ assert all(item == top_ps[0] for item in top_ps)
213
+ assert all(item == use_ras_list[0] for item in use_ras_list)
214
+
215
+ return {
216
+ 'temperatures': temperatures_tensor,
217
+ 'top_k': top_ks[0],
218
+ 'win_size': win_sizes[0],
219
+ 'tau_r': tau_rs[0],
220
+ 'top_p': top_ps[0],
221
+ 'eos_token': self.config.eos,
222
+ 'min_tokens': min_tokens_list,
223
+ 'use_ras': use_ras_list[0]
224
+ }
225
+
226
+ @torch.inference_mode()
227
+ def run_model(self, input_ids: torch.Tensor, positions: torch.Tensor, is_prefill: bool):
228
+ if is_prefill or self.enforce_eager or input_ids.size(0) > 512:
229
+ return self.model.compute_logits(self.model(input_ids, positions))
230
+ else:
231
+ bs = input_ids.size(0)
232
+ context = get_context()
233
+ graph = self.graphs[next(x for x in self.graph_bs if x >= bs)]
234
+ graph_vars = self.graph_vars
235
+ for k, v in graph_vars.items():
236
+ if k != "outputs":
237
+ v.zero_()
238
+ graph_vars["input_ids"][:bs] = input_ids
239
+ graph_vars["positions"][:bs] = positions
240
+ graph_vars["slot_mapping"][:bs] = context.slot_mapping
241
+ graph_vars["context_lens"][:bs] = context.context_lens
242
+ graph_vars["block_tables"][:bs, :context.block_tables.size(1)] = context.block_tables
243
+ graph.replay()
244
+ return self.model.compute_logits(graph_vars["outputs"][:bs])
245
+
246
+ def run(self, seqs: list[Sequence], is_prefill: bool) -> list[int]:
247
+ input_ids, positions = self.prepare_prefill(seqs) if is_prefill else self.prepare_decode(seqs)
248
+ if self.rank == 0 or self.world_size == 1:
249
+ sample_params = self.prepare_sample(seqs)
250
+ logits = self.run_model(input_ids, positions, is_prefill)
251
+
252
+ if sample_params['use_ras']:
253
+ # Prepare decoded tokens list for RasSampler
254
+ decoded_tokens_list = [seq.completion_token_ids for seq in seqs]
255
+ # Pass all parameters as lists to RasSampler
256
+ token_ids = self.ras_sampler(
257
+ logits,
258
+ decoded_tokens_list,
259
+ win_size=sample_params['win_size'],
260
+ tau_r=sample_params['tau_r'],
261
+ top_p=sample_params['top_p'],
262
+ top_k=sample_params['top_k'],
263
+ eos_token=sample_params['eos_token'],
264
+ min_tokens=sample_params['min_tokens']
265
+ ).tolist()
266
+ else:
267
+ # Use the default sampler with list form of top_ks
268
+ token_ids = self.sampler(logits, sample_params['temperatures'], sample_params['top_k']).tolist()
269
+ else:
270
+ logits = self.run_model(input_ids, positions, is_prefill)
271
+ token_ids = None
272
+ reset_context()
273
+ return token_ids
274
+
275
+ @torch.inference_mode()
276
+ def capture_cudagraph(self):
277
+ config = self.config
278
+ hf_config = config.hf_config
279
+ max_bs = min(self.config.max_num_seqs, 512)
280
+ max_num_blocks = (config.max_model_len + self.block_size - 1) // self.block_size
281
+ input_ids = torch.zeros(max_bs, dtype=torch.int64)
282
+ positions = torch.zeros(max_bs, dtype=torch.int64)
283
+ slot_mapping = torch.zeros(max_bs, dtype=torch.int32)
284
+ context_lens = torch.zeros(max_bs, dtype=torch.int32)
285
+ block_tables = torch.zeros(max_bs, max_num_blocks, dtype=torch.int32)
286
+ outputs = torch.zeros(max_bs, hf_config.hidden_size)
287
+ self.graph_bs = [1, 2, 4, 8] + list(range(16, max_bs + 1, 16))
288
+ self.graphs = {}
289
+ self.graph_pool = None
290
+
291
+ for bs in reversed(self.graph_bs):
292
+ graph = torch.cuda.CUDAGraph()
293
+ set_context(False, slot_mapping=slot_mapping[:bs], context_lens=context_lens[:bs], block_tables=block_tables[:bs])
294
+ outputs[:bs] = self.model(input_ids[:bs], positions[:bs]) # warmup
295
+ with torch.cuda.graph(graph, self.graph_pool):
296
+ outputs[:bs] = self.model(input_ids[:bs], positions[:bs]) # capture
297
+ if self.graph_pool is None:
298
+ self.graph_pool = graph.pool()
299
+ self.graphs[bs] = graph
300
+ torch.cuda.synchronize()
301
+ reset_context()
302
+
303
+ self.graph_vars = dict(
304
+ input_ids=input_ids,
305
+ positions=positions,
306
+ slot_mapping=slot_mapping,
307
+ context_lens=context_lens,
308
+ block_tables=block_tables,
309
+ outputs=outputs,
310
+ )
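As a worked example of the sizing in allocate_kv_cache above, with the CosyVoice2 LLM defaults from config.py (24 layers, 2 KV heads, head_dim 64, bf16) and kvcache_block_size = 256, one block costs 2 * 24 * 256 * 2 * 64 * 2 bytes = 3 MiB, so each GiB of memory left after weights and activation peaks holds roughly 340 blocks, i.e. about 87k cached tokens.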
flashcosyvoice/engine/scheduler.py ADDED
@@ -0,0 +1,77 @@
1
+ from collections import deque
2
+
3
+ from flashcosyvoice.config import Config
4
+ from flashcosyvoice.engine.block_manager import BlockManager
5
+ from flashcosyvoice.engine.sequence import Sequence, SequenceStatus
6
+
7
+
8
+ class Scheduler:
9
+
10
+ def __init__(self, config: Config):
11
+ self.max_num_seqs = config.max_num_seqs
12
+ self.max_num_batched_tokens = config.max_num_batched_tokens
13
+ self.eos = config.eos
14
+ self.block_manager = BlockManager(config.num_kvcache_blocks, config.kvcache_block_size)
15
+ self.waiting: deque[Sequence] = deque()
16
+ self.running: deque[Sequence] = deque()
17
+
18
+ def is_finished(self):
19
+ return not self.waiting and not self.running
20
+
21
+ def add(self, seq: Sequence):
22
+ self.waiting.append(seq)
23
+
24
+ def schedule(self) -> tuple[list[Sequence], bool]:
25
+ # prefill
26
+ scheduled_seqs = []
27
+ num_seqs = 0
28
+ num_batched_tokens = 0
29
+ while self.waiting and num_seqs < self.max_num_seqs:
30
+ seq = self.waiting[0]
31
+ if num_batched_tokens + len(seq) > self.max_num_batched_tokens or not self.block_manager.can_allocate(seq):
32
+ break
33
+ num_seqs += 1
34
+ self.block_manager.allocate(seq)
35
+ num_batched_tokens += len(seq) - seq.num_cached_tokens
36
+ seq.status = SequenceStatus.RUNNING
37
+ self.waiting.popleft()
38
+ self.running.append(seq)
39
+ scheduled_seqs.append(seq)
40
+ if scheduled_seqs:
41
+ return scheduled_seqs, True
42
+
43
+ # decode
44
+ while self.running and num_seqs < self.max_num_seqs:
45
+ seq = self.running.popleft()
46
+ while not self.block_manager.can_append(seq):
47
+ if self.running:
48
+ self.preempt(self.running.pop())
49
+ else:
50
+ self.preempt(seq)
51
+ break
52
+ else:
53
+ num_seqs += 1
54
+ self.block_manager.may_append(seq)
55
+ scheduled_seqs.append(seq)
56
+ assert scheduled_seqs
57
+ self.running.extendleft(reversed(scheduled_seqs))
58
+ return scheduled_seqs, False
59
+
60
+ def preempt(self, seq: Sequence):
61
+ seq.status = SequenceStatus.WAITING
62
+ self.block_manager.deallocate(seq)
63
+ self.waiting.appendleft(seq)
64
+
65
+ def postprocess(self, seqs: list[Sequence], token_ids: list[int]) -> list[bool]:
66
+ for seq, token_id in zip(seqs, token_ids):
67
+ seq.append_token(token_id)
68
+ # Check if the sequence has reached the maximum number of tokens
69
+ reached_max_tokens = seq.num_completion_tokens == seq.max_tokens
70
+ # Check if the sequence has reached EOS and has generated enough tokens (satisfying min_tokens requirements)
71
+ eos_with_min_tokens = (not seq.ignore_eos and token_id == self.eos and
72
+ seq.num_completion_tokens >= seq.min_tokens)
73
+
74
+ if reached_max_tokens or eos_with_min_tokens:
75
+ seq.status = SequenceStatus.FINISHED
76
+ self.block_manager.deallocate(seq)
77
+ self.running.remove(seq)
flashcosyvoice/engine/sequence.py ADDED
@@ -0,0 +1,90 @@
1
+ from copy import copy
2
+ from enum import Enum, auto
3
+ from itertools import count
4
+
5
+ from flashcosyvoice.config import SamplingParams
6
+
7
+
8
+ class SequenceStatus(Enum):
9
+ WAITING = auto()
10
+ RUNNING = auto()
11
+ FINISHED = auto()
12
+
13
+
14
+ class Sequence:
15
+ block_size = 256
16
+ counter = count()
17
+
18
+ def __init__(self, token_ids: list[int], sampling_params = SamplingParams()):
19
+ self.seq_id = next(Sequence.counter)
20
+ self.status = SequenceStatus.WAITING
21
+ self.token_ids = copy(token_ids)
22
+ self.last_token = token_ids[-1]
23
+ self.num_tokens = len(self.token_ids)
24
+ self.num_prompt_tokens = len(token_ids)
25
+ self.num_cached_tokens = 0
26
+ self.block_table = []
27
+ self.temperature = sampling_params.temperature
28
+ self.min_tokens = sampling_params.min_tokens
29
+ self.max_tokens = sampling_params.max_tokens
30
+ self.ignore_eos = sampling_params.ignore_eos
31
+ self.top_k = sampling_params.top_k
32
+ # RasSampler parameters
33
+ self.use_ras = sampling_params.use_ras
34
+ self.win_size = sampling_params.win_size
35
+ self.tau_r = sampling_params.tau_r
36
+ self.top_p = sampling_params.top_p
37
+
38
+ def __len__(self):
39
+ return self.num_tokens
40
+
41
+ def __getitem__(self, key):
42
+ return self.token_ids[key]
43
+
44
+ @property
45
+ def is_finished(self):
46
+ return self.status == SequenceStatus.FINISHED
47
+
48
+ @property
49
+ def num_completion_tokens(self):
50
+ return self.num_tokens - self.num_prompt_tokens
51
+
52
+ @property
53
+ def prompt_token_ids(self):
54
+ return self.token_ids[:self.num_prompt_tokens]
55
+
56
+ @property
57
+ def completion_token_ids(self):
58
+ return self.token_ids[self.num_prompt_tokens:]
59
+
60
+ @property
61
+ def num_cached_blocks(self):
62
+ return self.num_cached_tokens // self.block_size
63
+
64
+ @property
65
+ def num_blocks(self):
66
+ return (self.num_tokens + self.block_size - 1) // self.block_size
67
+
68
+ @property
69
+ def last_block_num_tokens(self):
70
+ return self.num_tokens - (self.num_blocks - 1) * self.block_size
71
+
72
+ def block(self, i):
73
+ assert 0 <= i < self.num_blocks
74
+ return self.token_ids[i*self.block_size: (i+1)*self.block_size]
75
+
76
+ def append_token(self, token_id: int):
77
+ self.token_ids.append(token_id)
78
+ self.last_token = token_id
79
+ self.num_tokens += 1
80
+
81
+ def __getstate__(self):
82
+ return (self.num_tokens, self.num_prompt_tokens, self.num_cached_tokens, self.block_table,
83
+ self.token_ids if self.num_completion_tokens == 0 else self.last_token)
84
+
85
+ def __setstate__(self, state):
86
+ self.num_tokens, self.num_prompt_tokens, self.num_cached_tokens, self.block_table = state[:-1]
87
+ if self.num_completion_tokens == 0:
88
+ self.token_ids = state[-1]
89
+ else:
90
+ self.last_token = state[-1]
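A quick sketch of the block bookkeeping these properties implement, using the class-level block_size of 256:

    from flashcosyvoice.config import SamplingParams
    from flashcosyvoice.engine.sequence import Sequence

    seq = Sequence(list(range(300)), SamplingParams(max_tokens=8))
    assert seq.num_blocks == 2               # ceil(300 / 256)
    assert seq.last_block_num_tokens == 44   # 300 - 256
    assert seq.block(1) == list(range(256, 300))
    seq.append_token(1234)
    assert seq.num_completion_tokens == 1    # tokens beyond the original prompt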
flashcosyvoice/modules/__init__.py ADDED
File without changes
flashcosyvoice/modules/flow.py ADDED
@@ -0,0 +1,198 @@
1
+ from dataclasses import dataclass
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ from flashcosyvoice.modules.flow_components.estimator import \
8
+ CausalConditionalDecoder
9
+ from flashcosyvoice.modules.flow_components.upsample_encoder import (
10
+ UpsampleConformerEncoder, make_pad_mask)
11
+
12
+
13
+ # TODO(xcsong): make it configurable
14
+ @dataclass
15
+ class CfmParams:
16
+ sigma_min: float = 1e-6
17
+ solver: str = "euler"
18
+ t_scheduler: str = "cosine"
19
+ training_cfg_rate: float = 0.2
20
+ inference_cfg_rate: float = 0.7
21
+
22
+
23
+ class CausalConditionalCFM(torch.nn.Module):
24
+ def __init__(self, in_channels=320, cfm_params=CfmParams(), n_spks=1, spk_emb_dim=80, estimator: torch.nn.Module = None):
25
+ super().__init__()
26
+ self.n_feats = in_channels
27
+ self.n_spks = n_spks
28
+ self.spk_emb_dim = spk_emb_dim
29
+ self.solver = cfm_params.solver
30
+ if hasattr(cfm_params, "sigma_min"):
31
+ self.sigma_min = cfm_params.sigma_min
32
+ else:
33
+ self.sigma_min = 1e-4
34
+ self.t_scheduler = cfm_params.t_scheduler
35
+ self.training_cfg_rate = cfm_params.training_cfg_rate
36
+ self.inference_cfg_rate = cfm_params.inference_cfg_rate
37
+ in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0)
38
+ # Just change the architecture of the estimator here
39
+ self.estimator = CausalConditionalDecoder() if estimator is None else estimator
40
+
41
+ @torch.inference_mode()
42
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, streaming=False):
43
+ """Forward diffusion
44
+
45
+ Args:
46
+ mu (torch.Tensor): output of encoder
47
+ shape: (batch_size, n_feats, mel_timesteps)
48
+ mask (torch.Tensor): output_mask
49
+ shape: (batch_size, 1, mel_timesteps)
50
+ n_timesteps (int): number of diffusion steps
51
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
52
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
53
+ shape: (batch_size, spk_emb_dim)
54
+ cond: Not used but kept for future purposes
55
+
56
+ Returns:
57
+ sample: generated mel-spectrogram
58
+ shape: (batch_size, n_feats, mel_timesteps)
59
+ """
60
+ z = torch.randn_like(mu).to(mu.device).to(mu.dtype) * temperature
61
+ # fix prompt and overlap part mu and z
62
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
63
+ if self.t_scheduler == 'cosine':
64
+ t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
65
+ return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond, streaming=streaming), None
66
+
67
+ def solve_euler(self, x, t_span, mu, mask, spks, cond, streaming=False):
68
+ """
69
+ Fixed euler solver for ODEs.
70
+ Args:
71
+ x (torch.Tensor): random noise
72
+ t_span (torch.Tensor): n_timesteps interpolated
73
+ shape: (n_timesteps + 1,)
74
+ mu (torch.Tensor): output of encoder
75
+ shape: (batch_size, n_feats, mel_timesteps)
76
+ mask (torch.Tensor): output_mask
77
+ shape: (batch_size, 1, mel_timesteps)
78
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
79
+ shape: (batch_size, spk_emb_dim)
80
+ cond: Not used but kept for future purposes
81
+ """
82
+ batch_size = x.size(0)
83
+ t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
84
+
85
+ # Keep the intermediate solutions so they can be inspected (e.g. dumped from a debugger and plotted).
86
+ # A return_all_steps flag could expose them in the future.
87
+ sol = []
88
+
89
+ # Do not use concat: it may change the memory format and make TensorRT inference return wrong results!
90
+ # Create tensors with double batch size for CFG (conditional + unconditional)
91
+ x_in = torch.zeros([batch_size * 2, x.size(1), x.size(2)], device=x.device, dtype=x.dtype)
92
+ mask_in = torch.zeros([batch_size * 2, mask.size(1), mask.size(2)], device=x.device, dtype=x.dtype)
93
+ mu_in = torch.zeros([batch_size * 2, mu.size(1), mu.size(2)], device=x.device, dtype=x.dtype)
94
+ t_in = torch.zeros([batch_size * 2], device=x.device, dtype=x.dtype)
95
+ spks_in = torch.zeros([batch_size * 2, spks.size(1)], device=x.device, dtype=x.dtype)
96
+ cond_in = torch.zeros([batch_size * 2, cond.size(1), cond.size(2)], device=x.device, dtype=x.dtype)
97
+
98
+ for step in range(1, len(t_span)):
99
+ # Classifier-Free Guidance inference introduced in VoiceBox
100
+ # Copy conditional and unconditional input
101
+ x_in[:batch_size] = x
102
+ x_in[batch_size:] = x
103
+ mask_in[:batch_size] = mask
104
+ mask_in[batch_size:] = mask
105
+ mu_in[:batch_size] = mu
106
+ # Unconditional part remains 0
107
+ t_in.fill_(t)
108
+ spks_in[:batch_size] = spks
109
+ cond_in[:batch_size] = cond
110
+
111
+ dphi_dt = self.estimator(
112
+ x_in, mask_in,
113
+ mu_in, t_in,
114
+ spks_in,
115
+ cond_in,
116
+ streaming
117
+ )
118
+ dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [batch_size, batch_size], dim=0)
119
+ dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt)
120
+ x = x + dt * dphi_dt
121
+ t = t + dt
122
+ sol.append(x)
123
+ if step < len(t_span) - 1:
124
+ dt = t_span[step + 1] - t
125
+
126
+ return sol[-1].float()
127
+
128
+
129
+ class CausalMaskedDiffWithXvec(torch.nn.Module):
130
+ def __init__(
131
+ self,
132
+ input_size: int = 512,
133
+ output_size: int = 80,
134
+ spk_embed_dim: int = 192,
135
+ output_type: str = "mel",
136
+ vocab_size: int = 6561,
137
+ input_frame_rate: int = 25,
138
+ token_mel_ratio: int = 2,
139
+ pre_lookahead_len: int = 3,
140
+ encoder: torch.nn.Module = None,
141
+ decoder: torch.nn.Module = None,
142
+ ):
143
+ super().__init__()
144
+ self.input_size = input_size
145
+ self.output_size = output_size
146
+ self.vocab_size = vocab_size
147
+ self.output_type = output_type
148
+ self.input_frame_rate = input_frame_rate
149
+ self.input_embedding = nn.Embedding(vocab_size, input_size)
150
+ self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
151
+ self.encoder = UpsampleConformerEncoder() if encoder is None else encoder
152
+ self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
153
+ self.decoder = CausalConditionalCFM() if decoder is None else decoder
154
+ self.token_mel_ratio = token_mel_ratio
155
+ self.pre_lookahead_len = pre_lookahead_len
156
+
157
+ @torch.inference_mode()
158
+ def forward(self,
159
+ token,
160
+ token_len,
161
+ prompt_feat,
162
+ prompt_feat_len,
163
+ embedding,
164
+ streaming,
165
+ finalize):
166
+ # xvec projection
167
+ embedding = F.normalize(embedding, dim=1)
168
+ embedding = self.spk_embed_affine_layer(embedding)
169
+
170
+ # concat text and prompt_text
171
+ mask = (~make_pad_mask(token_len, max_len=token.shape[1])).unsqueeze(-1).to(embedding)
172
+ token = self.input_embedding(torch.clamp(token, min=0)) * mask
173
+
174
+ # text encode
175
+ if finalize is True:
176
+ h, h_lengths = self.encoder(token, token_len, streaming=streaming)
177
+ else:
178
+ token, context = token[:, :-self.pre_lookahead_len], token[:, -self.pre_lookahead_len:]
179
+ h, h_lengths = self.encoder(token, token_len, context=context, streaming=streaming)
180
+ h = self.encoder_proj(h)
181
+
182
+ # get conditions
183
+ conds = torch.zeros_like(h, device=token.device)
184
+ for i, j in enumerate(prompt_feat_len):
185
+ conds[i, :j] = prompt_feat[i, :j]
186
+ conds = conds.transpose(1, 2)
187
+
188
+ h_lengths = h_lengths.sum(dim=-1).squeeze(dim=1)
189
+ mask = (~make_pad_mask(h_lengths, max_len=h.shape[1])).to(h)
190
+ feat, _ = self.decoder(
191
+ mu=h.transpose(1, 2).contiguous(),
192
+ mask=mask.unsqueeze(1),
193
+ spks=embedding,
194
+ cond=conds,
195
+ n_timesteps=10,
196
+ streaming=streaming
197
+ ) # [B, num_mels, T]
198
+ return feat.float(), h_lengths
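In equation form, each iteration of solve_euler above performs classifier-free guidance with lambda = inference_cfg_rate = 0.7 followed by an explicit Euler step on the cosine-warped time grid t_k = 1 - cos(pi * k / (2 * n_timesteps)):

    v = (1 + lambda) * v_theta(x, t; mu, spk, cond) - lambda * v_theta(x, t; 0, 0, 0)
    x <- x + (t_{k+1} - t_k) * v

with n_timesteps = 10 as called from CausalMaskedDiffWithXvec.forward above.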
flashcosyvoice/modules/flow_components/__init__.py ADDED
File without changes
flashcosyvoice/modules/flow_components/estimator.py ADDED
@@ -0,0 +1,974 @@
1
+ import math
2
+ from typing import Any, Dict, Optional, Tuple
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from diffusers.models.attention import (GEGLU, GELU, AdaLayerNorm,
8
+ AdaLayerNormZero, ApproximateGELU)
9
+ from diffusers.models.attention_processor import Attention
10
+ from diffusers.models.lora import LoRACompatibleLinear
11
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
12
+ from einops import pack, rearrange, repeat
13
+
14
+ from flashcosyvoice.modules.flow_components.upsample_encoder import \
15
+ add_optional_chunk_mask
16
+
17
+
18
+ def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
19
+ assert mask.dtype == torch.bool
20
+ assert dtype in [torch.float32, torch.bfloat16, torch.float16]
21
+ mask = mask.to(dtype)
22
+ # attention mask bias
23
+ # NOTE(Mddct): torch.finfo jit issues
24
+ # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min
25
+ mask = (1.0 - mask) * -1.0e+10
26
+ return mask
27
+
28
+
29
+ class SnakeBeta(nn.Module):
30
+ """
31
+ A modified Snake function which uses separate parameters for the magnitude of the periodic components
32
+ Shape:
33
+ - Input: (B, C, T)
34
+ - Output: (B, C, T), same shape as the input
35
+ Parameters:
36
+ - alpha - trainable parameter that controls frequency
37
+ - beta - trainable parameter that controls magnitude
38
+ References:
39
+ - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
40
+ https://arxiv.org/abs/2006.08195
41
+ Examples:
42
+ >>> a1 = snakebeta(256)
43
+ >>> x = torch.randn(256)
44
+ >>> x = a1(x)
45
+
46
+ Args:
47
+ in_features: shape of the input
48
+ out_features: shape of the output
49
+ alpha: trainable parameter that controls frequency
50
+ alpha_trainable: whether alpha is trainable
51
+ alpha_logscale: whether to use log scale for alpha
52
+ alpha is initialized to 1 by default, higher values = higher-frequency.
53
+ beta is initialized to 1 by default, higher values = higher-magnitude.
54
+ alpha will be trained along with the rest of your model.
55
+ """
56
+
57
+ def __init__(self, in_features, out_features, alpha=1.0, alpha_trainable=True, alpha_logscale=True):
58
+ super().__init__()
59
+ self.in_features = out_features if isinstance(out_features, list) else [out_features]
60
+ self.proj = LoRACompatibleLinear(in_features, out_features)
61
+
62
+ # initialize alpha
63
+ self.alpha_logscale = alpha_logscale
64
+ if self.alpha_logscale: # log scale alphas initialized to zeros
65
+ self.alpha = nn.Parameter(torch.zeros(self.in_features) * alpha)
66
+ self.beta = nn.Parameter(torch.zeros(self.in_features) * alpha)
67
+ else: # linear scale alphas initialized to ones
68
+ self.alpha = nn.Parameter(torch.ones(self.in_features) * alpha)
69
+ self.beta = nn.Parameter(torch.ones(self.in_features) * alpha)
70
+
71
+ self.alpha.requires_grad = alpha_trainable
72
+ self.beta.requires_grad = alpha_trainable
73
+
74
+ self.no_div_by_zero = 0.000000001
75
+
76
+ def forward(self, x):
77
+ """
78
+ Forward pass of the function.
79
+ Applies the function to the input elementwise.
80
+ SnakeBeta(x) := x + (1/b) * sin^2(a * x)
81
+ """
82
+ x = self.proj(x)
83
+ if self.alpha_logscale:
84
+ alpha = torch.exp(self.alpha)
85
+ beta = torch.exp(self.beta)
86
+ else:
87
+ alpha = self.alpha
88
+ beta = self.beta
89
+
90
+ x = x + (1.0 / (beta + self.no_div_by_zero)) * torch.pow(torch.sin(x * alpha), 2)
91
+
92
+ return x
93
+
94
+
95
+ class FeedForward(nn.Module):
96
+ r"""
97
+ A feed-forward layer.
98
+
99
+ Parameters:
100
+ dim (`int`): The number of channels in the input.
101
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
102
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
103
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
104
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
105
+ final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
106
+ """
107
+
108
+ def __init__(
109
+ self,
110
+ dim: int,
111
+ dim_out: Optional[int] = None,
112
+ mult: int = 4,
113
+ dropout: float = 0.0,
114
+ activation_fn: str = "geglu",
115
+ final_dropout: bool = False,
116
+ ):
117
+ super().__init__()
118
+ inner_dim = int(dim * mult)
119
+ dim_out = dim_out if dim_out is not None else dim
120
+
121
+ if activation_fn == "gelu":
122
+ act_fn = GELU(dim, inner_dim)
123
+ if activation_fn == "gelu-approximate":
124
+ act_fn = GELU(dim, inner_dim, approximate="tanh")
125
+ elif activation_fn == "geglu":
126
+ act_fn = GEGLU(dim, inner_dim)
127
+ elif activation_fn == "geglu-approximate":
128
+ act_fn = ApproximateGELU(dim, inner_dim)
129
+ elif activation_fn == "snakebeta":
130
+ act_fn = SnakeBeta(dim, inner_dim)
131
+
132
+ self.net = nn.ModuleList([])
133
+ # project in
134
+ self.net.append(act_fn)
135
+ # project dropout
136
+ self.net.append(nn.Dropout(dropout))
137
+ # project out
138
+ self.net.append(LoRACompatibleLinear(inner_dim, dim_out))
139
+ # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
140
+ if final_dropout:
141
+ self.net.append(nn.Dropout(dropout))
142
+
143
+ def forward(self, hidden_states):
144
+ for module in self.net:
145
+ hidden_states = module(hidden_states)
146
+ return hidden_states
147
+
148
+
149
+ @maybe_allow_in_graph
150
+ class BasicTransformerBlock(nn.Module):
151
+ r"""
152
+ A basic Transformer block.
153
+
154
+ Parameters:
155
+ dim (`int`): The number of channels in the input and output.
156
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
157
+ attention_head_dim (`int`): The number of channels in each head.
158
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
159
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
160
+ only_cross_attention (`bool`, *optional*):
161
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
162
+ double_self_attention (`bool`, *optional*):
163
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
164
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
165
+ num_embeds_ada_norm (:
166
+ obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
167
+ attention_bias (:
168
+ obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
169
+ """
170
+
171
+ def __init__(
172
+ self,
173
+ dim: int,
174
+ num_attention_heads: int,
175
+ attention_head_dim: int,
176
+ dropout=0.0,
177
+ cross_attention_dim: Optional[int] = None,
178
+ activation_fn: str = "geglu",
179
+ num_embeds_ada_norm: Optional[int] = None,
180
+ attention_bias: bool = False,
181
+ only_cross_attention: bool = False,
182
+ double_self_attention: bool = False,
183
+ upcast_attention: bool = False,
184
+ norm_elementwise_affine: bool = True,
185
+ norm_type: str = "layer_norm",
186
+ final_dropout: bool = False,
187
+ ):
188
+ super().__init__()
189
+ self.only_cross_attention = only_cross_attention
190
+
191
+ self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
192
+ self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
193
+
194
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
195
+ raise ValueError(
196
+ f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
197
+ f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
198
+ )
199
+
200
+ # Define 3 blocks. Each block has its own normalization layer.
201
+ # 1. Self-Attn
202
+ if self.use_ada_layer_norm:
203
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
204
+ elif self.use_ada_layer_norm_zero:
205
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
206
+ else:
207
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
208
+ self.attn1 = Attention(
209
+ query_dim=dim,
210
+ heads=num_attention_heads,
211
+ dim_head=attention_head_dim,
212
+ dropout=dropout,
213
+ bias=attention_bias,
214
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
215
+ upcast_attention=upcast_attention,
216
+ )
217
+
218
+ # 2. Cross-Attn
219
+ if cross_attention_dim is not None or double_self_attention:
220
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
221
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
222
+ # the second cross attention block.
223
+ self.norm2 = (
224
+ AdaLayerNorm(dim, num_embeds_ada_norm)
225
+ if self.use_ada_layer_norm
226
+ else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
227
+ )
228
+ self.attn2 = Attention(
229
+ query_dim=dim,
230
+ cross_attention_dim=cross_attention_dim if not double_self_attention else None,
231
+ heads=num_attention_heads,
232
+ dim_head=attention_head_dim,
233
+ dropout=dropout,
234
+ bias=attention_bias,
235
+ upcast_attention=upcast_attention,
236
+ # scale_qk=False, # uncomment this to disable flash attention
237
+ ) # is self-attn if encoder_hidden_states is none
238
+ else:
239
+ self.norm2 = None
240
+ self.attn2 = None
241
+
242
+ # 3. Feed-forward
243
+ self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
244
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
245
+
246
+ # let chunk size default to None
247
+ self._chunk_size = None
248
+ self._chunk_dim = 0
249
+
250
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
251
+ # Sets chunk feed-forward
252
+ self._chunk_size = chunk_size
253
+ self._chunk_dim = dim
254
+
255
+ def forward(
256
+ self,
257
+ hidden_states: torch.FloatTensor,
258
+ attention_mask: Optional[torch.FloatTensor] = None,
259
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
260
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
261
+ timestep: Optional[torch.LongTensor] = None,
262
+ cross_attention_kwargs: Dict[str, Any] = None,
263
+ class_labels: Optional[torch.LongTensor] = None,
264
+ ):
265
+ # Notice that normalization is always applied before the real computation in the following blocks.
266
+ # 1. Self-Attention
267
+ if self.use_ada_layer_norm:
268
+ norm_hidden_states = self.norm1(hidden_states, timestep)
269
+ elif self.use_ada_layer_norm_zero:
270
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
271
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
272
+ )
273
+ else:
274
+ norm_hidden_states = self.norm1(hidden_states)
275
+
276
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
277
+
278
+ attn_output = self.attn1(
279
+ norm_hidden_states,
280
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
281
+ attention_mask=encoder_attention_mask if self.only_cross_attention else attention_mask,
282
+ **cross_attention_kwargs,
283
+ )
284
+ if self.use_ada_layer_norm_zero:
285
+ attn_output = gate_msa.unsqueeze(1) * attn_output
286
+ hidden_states = attn_output + hidden_states
287
+
288
+ # 2. Cross-Attention
289
+ if self.attn2 is not None:
290
+ norm_hidden_states = (
291
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
292
+ )
293
+
294
+ attn_output = self.attn2(
295
+ norm_hidden_states,
296
+ encoder_hidden_states=encoder_hidden_states,
297
+ attention_mask=encoder_attention_mask,
298
+ **cross_attention_kwargs,
299
+ )
300
+ hidden_states = attn_output + hidden_states
301
+
302
+ # 3. Feed-forward
303
+ norm_hidden_states = self.norm3(hidden_states)
304
+
305
+ if self.use_ada_layer_norm_zero:
306
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
307
+
308
+ if self._chunk_size is not None:
309
+ # "feed_forward_chunk_size" can be used to save memory
310
+ if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
311
+ raise ValueError(
312
+ f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
313
+ )
314
+
315
+ num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
316
+ ff_output = torch.cat(
317
+ [self.ff(hid_slice) for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)],
318
+ dim=self._chunk_dim,
319
+ )
320
+ else:
321
+ ff_output = self.ff(norm_hidden_states)
322
+
323
+ if self.use_ada_layer_norm_zero:
324
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
325
+
326
+ hidden_states = ff_output + hidden_states
327
+
328
+ return hidden_states
329
+
330
+
331
+ class SinusoidalPosEmb(torch.nn.Module):
332
+ def __init__(self, dim):
333
+ super().__init__()
334
+ self.dim = dim
335
+ assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even"
336
+
337
+ def forward(self, x, scale=1000):
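+ # x holds diffusion timesteps of shape (batch,); returns (batch, dim) with sin in the first half and cos in the second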
338
+ if x.ndim < 1:
339
+ x = x.unsqueeze(0)
340
+ device = x.device
341
+ half_dim = self.dim // 2
342
+ emb = math.log(10000) / (half_dim - 1)
343
+ emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
344
+ emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
345
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
346
+ return emb
347
+
348
+
349
+ class Block1D(torch.nn.Module):
350
+ def __init__(self, dim, dim_out, groups=8):
351
+ super().__init__()
352
+ self.block = torch.nn.Sequential(
353
+ torch.nn.Conv1d(dim, dim_out, 3, padding=1),
354
+ torch.nn.GroupNorm(groups, dim_out),
355
+ nn.Mish(),
356
+ )
357
+
358
+ def forward(self, x, mask):
359
+ output = self.block(x * mask)
360
+ return output * mask
361
+
362
+
363
+ class ResnetBlock1D(torch.nn.Module):
364
+ def __init__(self, dim, dim_out, time_emb_dim, groups=8):
365
+ super().__init__()
366
+ self.mlp = torch.nn.Sequential(nn.Mish(), torch.nn.Linear(time_emb_dim, dim_out))
367
+
368
+ self.block1 = Block1D(dim, dim_out, groups=groups)
369
+ self.block2 = Block1D(dim_out, dim_out, groups=groups)
370
+
371
+ self.res_conv = torch.nn.Conv1d(dim, dim_out, 1)
372
+
373
+ def forward(self, x, mask, time_emb):
374
+ h = self.block1(x, mask)
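+ # project the time embedding to dim_out and add it to every frame (broadcast over the time axis)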
375
+ h += self.mlp(time_emb).unsqueeze(-1)
376
+ h = self.block2(h, mask)
377
+ output = h + self.res_conv(x * mask)
378
+ return output
379
+
380
+
381
+ class Downsample1D(nn.Module):
382
+ def __init__(self, dim):
383
+ super().__init__()
384
+ self.conv = torch.nn.Conv1d(dim, dim, 3, 2, 1)
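+ # kernel 3, stride 2, padding 1: halves the temporal resolution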
385
+
386
+ def forward(self, x):
387
+ return self.conv(x)
388
+
389
+
390
+ class TimestepEmbedding(nn.Module):
391
+ def __init__(
392
+ self,
393
+ in_channels: int,
394
+ time_embed_dim: int,
395
+ act_fn: str = "silu",
396
+ out_dim: int = None,
397
+ post_act_fn: Optional[str] = None,
398
+ cond_proj_dim=None,
399
+ ):
400
+ super().__init__()
401
+ assert act_fn == "silu", "act_fn must be silu"
402
+
403
+ self.linear_1 = nn.Linear(in_channels, time_embed_dim)
404
+
405
+ if cond_proj_dim is not None:
406
+ self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
407
+ else:
408
+ self.cond_proj = None
409
+
410
+ self.act = nn.SiLU()
411
+
412
+ if out_dim is not None:
413
+ time_embed_dim_out = out_dim
414
+ else:
415
+ time_embed_dim_out = time_embed_dim
416
+ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out)
417
+
418
+ if post_act_fn is None:
419
+ self.post_act = None
420
+ else:
421
+ self.post_act = nn.SiLU()
422
+
423
+ def forward(self, sample, condition=None):
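+ # sample: (batch, in_channels) sinusoidal embedding; returns (batch, time_embed_dim) unless out_dim overrides it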
424
+ if condition is not None:
425
+ sample = sample + self.cond_proj(condition)
426
+ sample = self.linear_1(sample)
427
+
428
+ if self.act is not None:
429
+ sample = self.act(sample)
430
+
431
+ sample = self.linear_2(sample)
432
+
433
+ if self.post_act is not None:
434
+ sample = self.post_act(sample)
435
+ return sample
436
+
437
+
438
+ class Upsample1D(nn.Module):
439
+ """A 1D upsampling layer with an optional convolution.
440
+
441
+ Parameters:
442
+ channels (`int`):
443
+ number of channels in the inputs and outputs.
444
+ use_conv (`bool`, default `False`):
445
+ option to use a convolution.
446
+ use_conv_transpose (`bool`, default `True`):
447
+ option to use a convolution transpose.
448
+ out_channels (`int`, optional):
449
+ number of output channels. Defaults to `channels`.
450
+ """
451
+
452
+ def __init__(self, channels, use_conv=False, use_conv_transpose=True, out_channels=None, name="conv"):
453
+ super().__init__()
454
+ self.channels = channels
455
+ self.out_channels = out_channels or channels
456
+ self.use_conv = use_conv
457
+ self.use_conv_transpose = use_conv_transpose
458
+ self.name = name
459
+
460
+ self.conv = None
461
+ if use_conv_transpose:
462
+ self.conv = nn.ConvTranspose1d(channels, self.out_channels, 4, 2, 1)
463
+ elif use_conv:
464
+ self.conv = nn.Conv1d(self.channels, self.out_channels, 3, padding=1)
465
+
466
+ def forward(self, inputs):
467
+ assert inputs.shape[1] == self.channels
468
+ if self.use_conv_transpose:
469
+ return self.conv(inputs)
470
+
471
+ outputs = F.interpolate(inputs, scale_factor=2.0, mode="nearest")
472
+
473
+ if self.use_conv:
474
+ outputs = self.conv(outputs)
475
+
476
+ return outputs
477
+
478
+
479
+ class Transpose(torch.nn.Module):
480
+ def __init__(self, dim0: int, dim1: int):
481
+ super().__init__()
482
+ self.dim0 = dim0
483
+ self.dim1 = dim1
484
+
485
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
486
+ x = torch.transpose(x, self.dim0, self.dim1)
487
+ return x
488
+
489
+
490
+ class CausalConv1d(torch.nn.Conv1d):
491
+ def __init__(
492
+ self,
493
+ in_channels: int,
494
+ out_channels: int,
495
+ kernel_size: int,
496
+ stride: int = 1,
497
+ dilation: int = 1,
498
+ groups: int = 1,
499
+ bias: bool = True,
500
+ padding_mode: str = 'zeros',
501
+ device=None,
502
+ dtype=None
503
+ ) -> None:
504
+ super(CausalConv1d, self).__init__(in_channels, out_channels,
505
+ kernel_size, stride,
506
+ padding=0, dilation=dilation,
507
+ groups=groups, bias=bias,
508
+ padding_mode=padding_mode,
509
+ device=device, dtype=dtype)
510
+ assert stride == 1
511
+ self.causal_padding = kernel_size - 1
512
+
513
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
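+ # pad (kernel_size - 1) frames on the left only, so each output frame depends on current and past inputs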
514
+ x = F.pad(x, (self.causal_padding, 0), value=0.0)
515
+ x = super(CausalConv1d, self).forward(x)
516
+ return x
517
+
518
+
519
+ class CausalBlock1D(Block1D):
520
+ def __init__(self, dim: int, dim_out: int):
521
+ super(CausalBlock1D, self).__init__(dim, dim_out)
522
+ self.block = torch.nn.Sequential(
523
+ CausalConv1d(dim, dim_out, 3),
524
+ Transpose(1, 2),
525
+ nn.LayerNorm(dim_out),
526
+ Transpose(1, 2),
527
+ nn.Mish(),
528
+ )
529
+
530
+ def forward(self, x: torch.Tensor, mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
531
+ output = self.block(x * mask)
532
+ return output * mask
533
+
534
+
535
+ class CausalResnetBlock1D(ResnetBlock1D):
536
+ def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8):
537
+ super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups)
538
+ self.block1 = CausalBlock1D(dim, dim_out)
539
+ self.block2 = CausalBlock1D(dim_out, dim_out)
540
+
541
+
542
+ class ConditionalDecoder(nn.Module):
543
+ """
544
+ This decoder requires an input with the same shape as the target. So, if your text content
545
+ is shorter or longer than the output, please re-sample it before feeding it to the decoder.
546
+
547
+ Args:
548
+ in_channels: number of input channels
549
+ out_channels: number of output channels
550
+ channels: tuple of channel dimensions
551
+ dropout: dropout rate
552
+ attention_head_dim: dimension of attention heads
553
+ n_blocks: number of transformer blocks
554
+ num_mid_blocks: number of middle blocks
555
+ num_heads: number of attention heads
556
+ act_fn: activation function name
557
+ """
558
+
559
+ def __init__(
560
+ self,
561
+ in_channels,
562
+ out_channels,
563
+ channels=(256, 256),
564
+ dropout=0.05,
565
+ attention_head_dim=64,
566
+ n_blocks=1,
567
+ num_mid_blocks=2,
568
+ num_heads=4,
569
+ act_fn="snake",
570
+ ):
571
+ super().__init__()
572
+ channels = tuple(channels)
573
+ self.in_channels = in_channels
574
+ self.out_channels = out_channels
575
+
576
+ self.time_embeddings = SinusoidalPosEmb(in_channels)
577
+ time_embed_dim = channels[0] * 4
578
+ self.time_mlp = TimestepEmbedding(
579
+ in_channels=in_channels,
580
+ time_embed_dim=time_embed_dim,
581
+ act_fn="silu",
582
+ )
583
+ self.down_blocks = nn.ModuleList([])
584
+ self.mid_blocks = nn.ModuleList([])
585
+ self.up_blocks = nn.ModuleList([])
586
+
587
+ output_channel = in_channels
588
+ for i in range(len(channels)): # pylint: disable=consider-using-enumerate
589
+ input_channel = output_channel
590
+ output_channel = channels[i]
591
+ is_last = i == len(channels) - 1
592
+ resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
593
+ transformer_blocks = nn.ModuleList(
594
+ [
595
+ BasicTransformerBlock(
596
+ dim=output_channel,
597
+ num_attention_heads=num_heads,
598
+ attention_head_dim=attention_head_dim,
599
+ dropout=dropout,
600
+ activation_fn=act_fn,
601
+ )
602
+ for _ in range(n_blocks)
603
+ ]
604
+ )
605
+ downsample = (
606
+ Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1)
607
+ )
608
+ self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
609
+
610
+ for _ in range(num_mid_blocks):
611
+ input_channel = channels[-1]
612
+ out_channels = channels[-1]
613
+ resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
614
+
615
+ transformer_blocks = nn.ModuleList(
616
+ [
617
+ BasicTransformerBlock(
618
+ dim=output_channel,
619
+ num_attention_heads=num_heads,
620
+ attention_head_dim=attention_head_dim,
621
+ dropout=dropout,
622
+ activation_fn=act_fn,
623
+ )
624
+ for _ in range(n_blocks)
625
+ ]
626
+ )
627
+
628
+ self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
629
+
630
+ channels = channels[::-1] + (channels[0],)
631
+ for i in range(len(channels) - 1):
632
+ input_channel = channels[i] * 2
633
+ output_channel = channels[i + 1]
634
+ is_last = i == len(channels) - 2
635
+ resnet = ResnetBlock1D(
636
+ dim=input_channel,
637
+ dim_out=output_channel,
638
+ time_emb_dim=time_embed_dim,
639
+ )
640
+ transformer_blocks = nn.ModuleList(
641
+ [
642
+ BasicTransformerBlock(
643
+ dim=output_channel,
644
+ num_attention_heads=num_heads,
645
+ attention_head_dim=attention_head_dim,
646
+ dropout=dropout,
647
+ activation_fn=act_fn,
648
+ )
649
+ for _ in range(n_blocks)
650
+ ]
651
+ )
652
+ upsample = (
653
+ Upsample1D(output_channel, use_conv_transpose=True)
654
+ if not is_last
655
+ else nn.Conv1d(output_channel, output_channel, 3, padding=1)
656
+ )
657
+ self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
658
+ self.final_block = Block1D(channels[-1], channels[-1])
659
+ self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
660
+ self.initialize_weights()
661
+
662
+ def initialize_weights(self):
663
+ for m in self.modules():
664
+ if isinstance(m, nn.Conv1d):
665
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
666
+ if m.bias is not None:
667
+ nn.init.constant_(m.bias, 0)
668
+ elif isinstance(m, nn.GroupNorm):
669
+ nn.init.constant_(m.weight, 1)
670
+ nn.init.constant_(m.bias, 0)
671
+ elif isinstance(m, nn.Linear):
672
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
673
+ if m.bias is not None:
674
+ nn.init.constant_(m.bias, 0)
675
+
676
+ def forward(self, x, mask, mu, t, spks=None, cond=None, streaming=False):
677
+ """Forward pass of the UNet1DConditional model.
678
+
679
+ Args:
680
+ x (torch.Tensor): shape (batch_size, in_channels, time)
681
+ mask (torch.Tensor): shape (batch_size, 1, time)
682
+ t (torch.Tensor): shape (batch_size,)
683
+ spks (torch.Tensor, optional): shape (batch_size, condition_channels). Defaults to None.
684
+ cond (torch.Tensor, optional): placeholder for future use. Defaults to None.
685
+
686
+ Returns:
687
+ torch.Tensor: output tensor of shape (batch_size, out_channels, time).
692
+ """
693
+
694
+ t = self.time_embeddings(t).to(t.dtype)
695
+ t = self.time_mlp(t)
696
+
697
+ x = pack([x, mu], "b * t")[0]
698
+
699
+ if spks is not None:
700
+ spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
701
+ x = pack([x, spks], "b * t")[0]
702
+ if cond is not None:
703
+ x = pack([x, cond], "b * t")[0]
704
+
705
+ hiddens = []
706
+ masks = [mask]
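+ # U-Net style downsampling path: keep per-resolution hidden states and masks for the skip connections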
707
+ for resnet, transformer_blocks, downsample in self.down_blocks:
708
+ mask_down = masks[-1]
709
+ x = resnet(x, mask_down, t)
710
+ x = rearrange(x, "b c t -> b t c").contiguous()
711
+ attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
712
+ attn_mask = mask_to_bias(attn_mask, x.dtype)
713
+ for transformer_block in transformer_blocks:
714
+ x = transformer_block(
715
+ hidden_states=x,
716
+ attention_mask=attn_mask,
717
+ timestep=t,
718
+ )
719
+ x = rearrange(x, "b t c -> b c t").contiguous()
720
+ hiddens.append(x) # Save hidden states for skip connections
721
+ x = downsample(x * mask_down)
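+ # subsample the mask by 2 to match the downsampled time axis; the last entry is discarded after the loop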
722
+ masks.append(mask_down[:, :, ::2])
723
+ masks = masks[:-1]
724
+ mask_mid = masks[-1]
725
+
726
+ for resnet, transformer_blocks in self.mid_blocks:
727
+ x = resnet(x, mask_mid, t)
728
+ x = rearrange(x, "b c t -> b t c").contiguous()
729
+ attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
730
+ attn_mask = mask_to_bias(attn_mask, x.dtype)
731
+ for transformer_block in transformer_blocks:
732
+ x = transformer_block(
733
+ hidden_states=x,
734
+ attention_mask=attn_mask,
735
+ timestep=t,
736
+ )
737
+ x = rearrange(x, "b t c -> b c t").contiguous()
738
+
739
+ for resnet, transformer_blocks, upsample in self.up_blocks:
740
+ mask_up = masks.pop()
741
+ skip = hiddens.pop()
742
+ x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
743
+ x = resnet(x, mask_up, t)
744
+ x = rearrange(x, "b c t -> b t c").contiguous()
745
+ attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
746
+ attn_mask = mask_to_bias(attn_mask, x.dtype)
747
+ for transformer_block in transformer_blocks:
748
+ x = transformer_block(
749
+ hidden_states=x,
750
+ attention_mask=attn_mask,
751
+ timestep=t,
752
+ )
753
+ x = rearrange(x, "b t c -> b c t").contiguous()
754
+ x = upsample(x * mask_up)
755
+ x = self.final_block(x, mask_up)
756
+ output = self.final_proj(x * mask_up)
757
+ return output * mask
758
+
759
+
760
+ class CausalConditionalDecoder(ConditionalDecoder):
761
+ """
762
+ This decoder requires an input with the same shape as the target. So, if your text content
763
+ is shorter or longer than the output, please re-sample it before feeding it to the decoder.
764
+
765
+ Args:
766
+ in_channels: number of input channels
767
+ out_channels: number of output channels
768
+ channels: list of channel dimensions
769
+ dropout: dropout rate
770
+ attention_head_dim: dimension of attention heads
771
+ n_blocks: number of transformer blocks
772
+ num_mid_blocks: number of middle blocks
773
+ num_heads: number of attention heads
774
+ act_fn: activation function name
775
+ static_chunk_size: size of static chunks
776
+ num_decoding_left_chunks: number of left chunks for decoding
777
+ """
778
+
779
+ def __init__(
780
+ self,
781
+ in_channels=320,
782
+ out_channels=80,
783
+ channels=[256], # noqa
784
+ dropout=0.0,
785
+ attention_head_dim=64,
786
+ n_blocks=4,
787
+ num_mid_blocks=12,
788
+ num_heads=8,
789
+ act_fn="gelu",
790
+ static_chunk_size=50,
791
+ num_decoding_left_chunks=-1,
792
+ ):
793
+ torch.nn.Module.__init__(self)
794
+ channels = tuple(channels)
795
+ self.in_channels = in_channels
796
+ self.out_channels = out_channels
797
+ self.time_embeddings = SinusoidalPosEmb(in_channels)
798
+ time_embed_dim = channels[0] * 4
799
+ self.time_mlp = TimestepEmbedding(
800
+ in_channels=in_channels,
801
+ time_embed_dim=time_embed_dim,
802
+ act_fn="silu",
803
+ )
804
+ self.static_chunk_size = static_chunk_size
805
+ self.num_decoding_left_chunks = num_decoding_left_chunks
806
+ self.down_blocks = nn.ModuleList([])
807
+ self.mid_blocks = nn.ModuleList([])
808
+ self.up_blocks = nn.ModuleList([])
809
+
810
+ output_channel = in_channels
811
+ for i in range(len(channels)): # pylint: disable=consider-using-enumerate
812
+ input_channel = output_channel
813
+ output_channel = channels[i]
814
+ is_last = i == len(channels) - 1
815
+ resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
816
+ transformer_blocks = nn.ModuleList(
817
+ [
818
+ BasicTransformerBlock(
819
+ dim=output_channel,
820
+ num_attention_heads=num_heads,
821
+ attention_head_dim=attention_head_dim,
822
+ dropout=dropout,
823
+ activation_fn=act_fn,
824
+ )
825
+ for _ in range(n_blocks)
826
+ ]
827
+ )
828
+ downsample = (
829
+ Downsample1D(output_channel) if not is_last else CausalConv1d(output_channel, output_channel, 3)
830
+ )
831
+ self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
832
+
833
+ for _ in range(num_mid_blocks):
834
+ input_channel = channels[-1]
835
+ out_channels = channels[-1]
836
+ resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
837
+
838
+ transformer_blocks = nn.ModuleList(
839
+ [
840
+ BasicTransformerBlock(
841
+ dim=output_channel,
842
+ num_attention_heads=num_heads,
843
+ attention_head_dim=attention_head_dim,
844
+ dropout=dropout,
845
+ activation_fn=act_fn,
846
+ )
847
+ for _ in range(n_blocks)
848
+ ]
849
+ )
850
+
851
+ self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
852
+
853
+ channels = channels[::-1] + (channels[0],)
854
+ for i in range(len(channels) - 1):
855
+ input_channel = channels[i] * 2
856
+ output_channel = channels[i + 1]
857
+ is_last = i == len(channels) - 2
858
+ resnet = CausalResnetBlock1D(
859
+ dim=input_channel,
860
+ dim_out=output_channel,
861
+ time_emb_dim=time_embed_dim,
862
+ )
863
+ transformer_blocks = nn.ModuleList(
864
+ [
865
+ BasicTransformerBlock(
866
+ dim=output_channel,
867
+ num_attention_heads=num_heads,
868
+ attention_head_dim=attention_head_dim,
869
+ dropout=dropout,
870
+ activation_fn=act_fn,
871
+ )
872
+ for _ in range(n_blocks)
873
+ ]
874
+ )
875
+ upsample = (
876
+ Upsample1D(output_channel, use_conv_transpose=True)
877
+ if not is_last
878
+ else CausalConv1d(output_channel, output_channel, 3)
879
+ )
880
+ self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
881
+ self.final_block = CausalBlock1D(channels[-1], channels[-1])
882
+ self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
883
+ self.initialize_weights()
884
+
885
+ def forward(self, x, mask, mu, t, spks=None, cond=None, streaming=False):
886
+ """Forward pass of the UNet1DConditional model.
887
+
888
+ Args:
889
+ x (torch.Tensor): shape (batch_size, in_channels, time)
890
+ mask (torch.Tensor): shape (batch_size, 1, time)
891
+ t (torch.Tensor): shape (batch_size,)
892
+ spks (torch.Tensor, optional): shape (batch_size, condition_channels). Defaults to None.
893
+ cond (torch.Tensor, optional): placeholder for future use. Defaults to None.
894
+
895
+ Returns:
896
+ torch.Tensor: output tensor of shape (batch_size, out_channels, time).
901
+ """
902
+ t = self.time_embeddings(t).to(t.dtype)
903
+ t = self.time_mlp(t)
904
+
905
+ x = pack([x, mu], "b * t")[0]
906
+
907
+ if spks is not None:
908
+ spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
909
+ x = pack([x, spks], "b * t")[0]
910
+ if cond is not None:
911
+ x = pack([x, cond], "b * t")[0]
912
+
913
+ hiddens = []
914
+ masks = [mask]
915
+ for resnet, transformer_blocks, downsample in self.down_blocks:
916
+ mask_down = masks[-1]
917
+ x = resnet(x, mask_down, t)
918
+ x = rearrange(x, "b c t -> b t c").contiguous()
919
+ if streaming is True:
920
+ attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size, -1)
921
+ else:
922
+ attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
923
+ attn_mask = mask_to_bias(attn_mask, x.dtype)
924
+ for transformer_block in transformer_blocks:
925
+ x = transformer_block(
926
+ hidden_states=x,
927
+ attention_mask=attn_mask,
928
+ timestep=t,
929
+ )
930
+ x = rearrange(x, "b t c -> b c t").contiguous()
931
+ hiddens.append(x) # Save hidden states for skip connections
932
+ x = downsample(x * mask_down)
933
+ masks.append(mask_down[:, :, ::2])
934
+ masks = masks[:-1]
935
+ mask_mid = masks[-1]
936
+
937
+ for resnet, transformer_blocks in self.mid_blocks:
938
+ x = resnet(x, mask_mid, t)
939
+ x = rearrange(x, "b c t -> b t c").contiguous()
940
+ if streaming is True:
941
+ attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size, -1)
942
+ else:
943
+ attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
944
+ attn_mask = mask_to_bias(attn_mask, x.dtype)
945
+ for transformer_block in transformer_blocks:
946
+ x = transformer_block(
947
+ hidden_states=x,
948
+ attention_mask=attn_mask,
949
+ timestep=t,
950
+ )
951
+ x = rearrange(x, "b t c -> b c t").contiguous()
952
+
953
+ for resnet, transformer_blocks, upsample in self.up_blocks:
954
+ mask_up = masks.pop()
955
+ skip = hiddens.pop()
956
+ x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
957
+ x = resnet(x, mask_up, t)
958
+ x = rearrange(x, "b c t -> b t c").contiguous()
959
+ if streaming is True:
960
+ attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size, -1)
961
+ else:
962
+ attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1)
963
+ attn_mask = mask_to_bias(attn_mask, x.dtype)
964
+ for transformer_block in transformer_blocks:
965
+ x = transformer_block(
966
+ hidden_states=x,
967
+ attention_mask=attn_mask,
968
+ timestep=t,
969
+ )
970
+ x = rearrange(x, "b t c -> b c t").contiguous()
971
+ x = upsample(x * mask_up)
972
+ x = self.final_block(x, mask_up)
973
+ output = self.final_proj(x * mask_up)
974
+ return output * mask
flashcosyvoice/modules/flow_components/upsample_encoder.py ADDED
@@ -0,0 +1,998 @@
1
+ import math
2
+ from typing import Optional, Tuple, Union
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+
9
+ def subsequent_chunk_mask(
10
+ size: int,
11
+ chunk_size: int,
12
+ num_left_chunks: int = -1,
13
+ device: torch.device = torch.device("cpu"),
14
+ ) -> torch.Tensor:
15
+ """Create mask for subsequent steps (size, size) with chunk size,
16
+ this is for streaming encoder
17
+
18
+ Args:
19
+ size (int): size of mask
20
+ chunk_size (int): size of chunk
21
+ num_left_chunks (int): number of left chunks
22
+ <0: use full chunk
23
+ >=0: use num_left_chunks
24
+ device (torch.device): "cpu" or "cuda" or torch.Tensor.device
25
+
26
+ Returns:
27
+ torch.Tensor: mask
28
+
29
+ Examples:
30
+ >>> subsequent_chunk_mask(4, 2)
31
+ [[1, 1, 0, 0],
32
+ [1, 1, 0, 0],
33
+ [1, 1, 1, 1],
34
+ [1, 1, 1, 1]]
35
+ """
36
+ # NOTE this modified implementation meets onnx export requirements, but it doesn't support num_left_chunks
37
+ pos_idx = torch.arange(size, device=device)
38
+ block_value = (torch.div(pos_idx, chunk_size, rounding_mode='trunc') + 1) * chunk_size
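+ # position i may attend to all positions up to the end of its own chunk (block_value[i])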
39
+ ret = pos_idx.unsqueeze(0) < block_value.unsqueeze(1)
40
+ return ret
41
+
42
+
43
+ def add_optional_chunk_mask(xs: torch.Tensor,
44
+ masks: torch.Tensor,
45
+ use_dynamic_chunk: bool,
46
+ use_dynamic_left_chunk: bool,
47
+ decoding_chunk_size: int,
48
+ static_chunk_size: int,
49
+ num_decoding_left_chunks: int,
50
+ enable_full_context: bool = True):
51
+ """ Apply optional mask for encoder.
52
+
53
+ Args:
54
+ xs (torch.Tensor): padded input, (B, L, D), L for max length
55
+ mask (torch.Tensor): mask for xs, (B, 1, L)
56
+ use_dynamic_chunk (bool): whether to use dynamic chunk or not
57
+ use_dynamic_left_chunk (bool): whether to use dynamic left chunk for
58
+ training.
59
+ decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's
60
+ 0: default for training, use random dynamic chunk.
61
+ <0: for decoding, use full chunk.
62
+ >0: for decoding, use fixed chunk size as set.
63
+ static_chunk_size (int): chunk size for static chunk training/decoding
64
+ if it's greater than 0, if use_dynamic_chunk is true,
65
+ this parameter will be ignored
66
+ num_decoding_left_chunks: number of left chunks, this is for decoding,
67
+ the chunk size is decoding_chunk_size.
68
+ >=0: use num_decoding_left_chunks
69
+ <0: use all left chunks
70
+ enable_full_context (bool):
71
+ True: chunk size is either [1, 25] or full context(max_len)
72
+ False: chunk size ~ U[1, 25]
73
+
74
+ Returns:
75
+ torch.Tensor: chunk mask of the input xs.
76
+ """
77
+ # Whether to use chunk mask or not
78
+ if use_dynamic_chunk:
79
+ max_len = xs.size(1)
80
+ if decoding_chunk_size < 0:
81
+ chunk_size = max_len
82
+ num_left_chunks = -1
83
+ elif decoding_chunk_size > 0:
84
+ chunk_size = decoding_chunk_size
85
+ num_left_chunks = num_decoding_left_chunks
86
+ else:
87
+ # chunk size is either [1, 25] or full context(max_len).
88
+ # Since we use 4 times subsampling and allow up to 1s(100 frames)
89
+ # delay, the maximum frame is 100 / 4 = 25.
90
+ chunk_size = torch.randint(1, max_len, (1, )).item()
91
+ num_left_chunks = -1
92
+ if chunk_size > max_len // 2 and enable_full_context:
93
+ chunk_size = max_len
94
+ else:
95
+ chunk_size = chunk_size % 25 + 1
96
+ if use_dynamic_left_chunk:
97
+ max_left_chunks = (max_len - 1) // chunk_size
98
+ num_left_chunks = torch.randint(0, max_left_chunks,
99
+ (1, )).item()
100
+ chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size,
101
+ num_left_chunks,
102
+ xs.device) # (L, L)
103
+ chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L)
104
+ chunk_masks = masks & chunk_masks # (B, L, L)
105
+ elif static_chunk_size > 0:
106
+ num_left_chunks = num_decoding_left_chunks
107
+ chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size,
108
+ num_left_chunks,
109
+ xs.device) # (L, L)
110
+ chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L)
111
+ chunk_masks = masks & chunk_masks # (B, L, L)
112
+ else:
113
+ chunk_masks = masks
114
+ assert chunk_masks.dtype == torch.bool
115
+ if (chunk_masks.sum(dim=-1) == 0).sum().item() != 0:
116
+ print('get chunk_masks all false at some timestep, force set to true, make sure they are masked in future computation!')
117
+ chunk_masks[chunk_masks.sum(dim=-1) == 0] = True
118
+ return chunk_masks
119
+
120
+
121
+ def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
122
+ """Make mask tensor containing indices of padded part.
123
+
124
+ See description of make_non_pad_mask.
125
+
126
+ Args:
127
+ lengths (torch.Tensor): Batch of lengths (B,).
128
+ Returns:
129
+ torch.Tensor: Mask tensor containing indices of padded part.
130
+
131
+ Examples:
132
+ >>> lengths = [5, 3, 2]
133
+ >>> make_pad_mask(lengths)
134
+ masks = [[0, 0, 0, 0 ,0],
135
+ [0, 0, 0, 1, 1],
136
+ [0, 0, 1, 1, 1]]
137
+ """
138
+ batch_size = lengths.size(0)
139
+ max_len = max_len if max_len > 0 else lengths.max().item()
140
+ seq_range = torch.arange(0,
141
+ max_len,
142
+ dtype=torch.int64,
143
+ device=lengths.device)
144
+ seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
145
+ seq_length_expand = lengths.unsqueeze(-1)
146
+ mask = seq_range_expand >= seq_length_expand
147
+ return mask
148
+
149
+
150
+ class EspnetRelPositionalEncoding(torch.nn.Module):
151
+ """Relative positional encoding module (new implementation).
152
+
153
+ Details can be found in https://github.com/espnet/espnet/pull/2816.
154
+
155
+ See : Appendix B in https://arxiv.org/abs/1901.02860
156
+
157
+ Args:
158
+ d_model (int): Embedding dimension.
159
+ max_len (int): Maximum input length.
160
+
161
+ """
162
+
163
+ def __init__(self, d_model: int, max_len: int = 5000):
164
+ super(EspnetRelPositionalEncoding, self).__init__()
165
+ self.d_model = d_model
166
+ self.xscale = math.sqrt(self.d_model)
167
+ self.pe = None
168
+ self.extend_pe(torch.tensor(0.0).expand(1, max_len))
169
+
170
+ def extend_pe(self, x: torch.Tensor):
171
+ """Reset the positional encodings."""
172
+ if self.pe is not None:
173
+ # self.pe contains both positive and negative parts
174
+ # the length of self.pe is 2 * input_len - 1
175
+ if self.pe.size(1) >= x.size(1) * 2 - 1:
176
+ if self.pe.dtype != x.dtype or self.pe.device != x.device:
177
+ self.pe = self.pe.to(dtype=x.dtype, device=x.device)
178
+ return
179
+ # Suppose `i` is the position of the query vector and `j` the
180
+ # position of the key vector. We use positive relative positions when keys
181
+ # are to the left (i>j) and negative relative positions otherwise (i<j).
182
+ pe_positive = torch.zeros(x.size(1), self.d_model)
183
+ pe_negative = torch.zeros(x.size(1), self.d_model)
184
+ position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
185
+ div_term = torch.exp(
186
+ torch.arange(0, self.d_model, 2, dtype=torch.float32)
187
+ * -(math.log(10000.0) / self.d_model)
188
+ )
189
+ pe_positive[:, 0::2] = torch.sin(position * div_term)
190
+ pe_positive[:, 1::2] = torch.cos(position * div_term)
191
+ pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
192
+ pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
193
+
194
+ # Reverse the order of positive indices and concat both positive and
195
+ # negative indices. This is used to support the shifting trick
196
+ # as in https://arxiv.org/abs/1901.02860
197
+ pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
198
+ pe_negative = pe_negative[1:].unsqueeze(0)
199
+ pe = torch.cat([pe_positive, pe_negative], dim=1)
200
+ self.pe = pe.to(device=x.device, dtype=x.dtype)
201
+
202
+ def forward(self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0) \
203
+ -> Tuple[torch.Tensor, torch.Tensor]:
204
+ """Add positional encoding.
205
+
206
+ Args:
207
+ x (torch.Tensor): Input tensor (batch, time, `*`).
208
+
209
+ Returns:
210
+ torch.Tensor: Encoded tensor (batch, time, `*`).
211
+
212
+ """
213
+ self.extend_pe(x)
214
+ x = x * self.xscale
215
+ pos_emb = self.position_encoding(size=x.size(1), offset=offset)
216
+ return x, pos_emb
217
+
218
+ def position_encoding(self,
219
+ offset: Union[int, torch.Tensor],
220
+ size: int) -> torch.Tensor:
221
+ """ For getting encoding in a streaming fashion
222
+
223
+ Attention!!!!!
224
+ we apply dropout only once at the whole-utterance level in the non-
225
+ streaming case, but this function will be called several times with
226
+ increasing input size in a streaming scenario, so the dropout would
227
+ be applied several times.
228
+
229
+ Args:
230
+ offset (int or torch.tensor): start offset
231
+ size (int): required size of position encoding
232
+
233
+ Returns:
234
+ torch.Tensor: Corresponding encoding
235
+ """
236
+ # How to subscript a Union type:
237
+ # https://github.com/pytorch/pytorch/issues/69434
238
+ if isinstance(offset, int):
239
+ pos_emb = self.pe[
240
+ :,
241
+ self.pe.size(1) // 2 - size - offset + 1: self.pe.size(1) // 2 + size + offset,
242
+ ]
243
+ elif isinstance(offset, torch.Tensor):
244
+ pos_emb = self.pe[
245
+ :,
246
+ self.pe.size(1) // 2 - size - offset + 1: self.pe.size(1) // 2 + size + offset,
247
+ ]
248
+ return pos_emb
249
+
250
+
251
+ class LinearNoSubsampling(torch.nn.Module):
252
+ """Linear transform the input without subsampling
253
+
254
+ Args:
255
+ idim (int): Input dimension.
256
+ odim (int): Output dimension.
257
+ pos_enc_class (torch.nn.Module): Positional encoding class.
258
+
259
+ """
260
+
261
+ def __init__(self, idim: int, odim: int,
262
+ pos_enc_class: torch.nn.Module):
263
+ super().__init__()
264
+ self.out = torch.nn.Sequential(
265
+ torch.nn.Linear(idim, odim),
266
+ torch.nn.LayerNorm(odim, eps=1e-5),
267
+ )
268
+ self.pos_enc = pos_enc_class
269
+ self.right_context = 0
270
+ self.subsampling_rate = 1
271
+
272
+ def forward(
273
+ self,
274
+ x: torch.Tensor,
275
+ x_mask: torch.Tensor,
276
+ offset: Union[int, torch.Tensor] = 0
277
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
278
+ """Input x.
279
+
280
+ Args:
281
+ x (torch.Tensor): Input tensor (#batch, time, idim).
282
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
283
+
284
+ Returns:
285
+ torch.Tensor: linear input tensor (#batch, time', odim),
286
+ where time' = time .
287
+ torch.Tensor: linear input mask (#batch, 1, time'),
288
+ where time' = time .
289
+
290
+ """
291
+ x = self.out(x)
292
+ x, pos_emb = self.pos_enc(x, offset)
293
+ return x, pos_emb, x_mask
294
+
295
+ def position_encoding(self, offset: Union[int, torch.Tensor],
296
+ size: int) -> torch.Tensor:
297
+ return self.pos_enc.position_encoding(offset, size)
298
+
299
+
300
+ class Upsample1D(nn.Module):
301
+ """A 1D upsampling layer with an optional convolution.
302
+
303
+ Parameters:
304
+ channels (`int`):
305
+ number of channels in the inputs.
306
+ out_channels (`int`):
307
+ number of output channels.
308
+ stride (`int`, default `2`):
309
+ upsampling factor along the time axis; the output length is `stride * input length`.
312
+ """
313
+
314
+ def __init__(self, channels: int, out_channels: int, stride: int = 2):
315
+ super().__init__()
316
+ self.channels = channels
317
+ self.out_channels = out_channels
318
+ self.stride = stride
319
+ # In this mode, first nearest-neighbour interpolate, then conv with stride=1
320
+ self.conv = nn.Conv1d(self.channels, self.out_channels, stride * 2 + 1, stride=1, padding=0)
321
+
322
+ def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
323
+ outputs = F.interpolate(inputs, scale_factor=float(self.stride), mode="nearest")
324
+ outputs = F.pad(outputs, (self.stride * 2, 0), value=0.0)
325
+ outputs = self.conv(outputs)
326
+ return outputs, input_lengths * self.stride
327
+
328
+
329
+ class PreLookaheadLayer(nn.Module):
330
+ def __init__(self, channels: int, pre_lookahead_len: int = 1):
331
+ super().__init__()
332
+ self.channels = channels
333
+ self.pre_lookahead_len = pre_lookahead_len
334
+ self.conv1 = nn.Conv1d(
335
+ channels, channels,
336
+ kernel_size=pre_lookahead_len + 1,
337
+ stride=1, padding=0,
338
+ )
339
+ self.conv2 = nn.Conv1d(
340
+ channels, channels,
341
+ kernel_size=3, stride=1, padding=0,
342
+ )
343
+
344
+ def forward(self, inputs: torch.Tensor, context: torch.Tensor = torch.zeros(0, 0, 0)) -> torch.Tensor:
345
+ """
346
+ inputs: (batch_size, seq_len, channels)
347
+ """
348
+ outputs = inputs.transpose(1, 2).contiguous()
349
+ context = context.transpose(1, 2).contiguous()
350
+ # look ahead
351
+ if context.size(2) == 0:
352
+ outputs = F.pad(outputs, (0, self.pre_lookahead_len), mode='constant', value=0.0)
353
+ else:
354
+ assert self.training is False, 'you have passed context, make sure that you are running in inference mode'
355
+ assert context.size(2) == self.pre_lookahead_len
356
+ outputs = F.pad(torch.concat([outputs, context], dim=2), (0, self.pre_lookahead_len - context.size(2)), mode='constant', value=0.0)
357
+ outputs = F.leaky_relu(self.conv1(outputs))
358
+ # outputs
359
+ outputs = F.pad(outputs, (self.conv2.kernel_size[0] - 1, 0), mode='constant', value=0.0)
360
+ outputs = self.conv2(outputs)
361
+ outputs = outputs.transpose(1, 2).contiguous()
362
+
363
+ # residual connection
364
+ outputs = outputs + inputs
365
+ return outputs
366
+
367
+
368
+ class MultiHeadedAttention(nn.Module):
369
+ """Multi-Head Attention layer.
370
+
371
+ Args:
372
+ n_head (int): The number of heads.
373
+ n_feat (int): The number of features.
374
+ dropout_rate (float): Dropout rate.
375
+ key_bias (bool): Whether to use bias in key linear layer.
376
+
377
+ """
378
+
379
+ def __init__(self,
380
+ n_head: int,
381
+ n_feat: int,
382
+ dropout_rate: float,
383
+ key_bias: bool = True):
384
+ super().__init__()
385
+ assert n_feat % n_head == 0
386
+ # We assume d_v always equals d_k
387
+ self.d_k = n_feat // n_head
388
+ self.h = n_head
389
+ self.linear_q = nn.Linear(n_feat, n_feat)
390
+ self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias)
391
+ self.linear_v = nn.Linear(n_feat, n_feat)
392
+ self.linear_out = nn.Linear(n_feat, n_feat)
393
+ self.dropout = nn.Dropout(p=dropout_rate)
394
+
395
+ def forward_qkv(
396
+ self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
397
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
398
+ """Transform query, key and value.
399
+
400
+ Args:
401
+ query (torch.Tensor): Query tensor (#batch, time1, size).
402
+ key (torch.Tensor): Key tensor (#batch, time2, size).
403
+ value (torch.Tensor): Value tensor (#batch, time2, size).
404
+
405
+ Returns:
406
+ torch.Tensor: Transformed query tensor, size
407
+ (#batch, n_head, time1, d_k).
408
+ torch.Tensor: Transformed key tensor, size
409
+ (#batch, n_head, time2, d_k).
410
+ torch.Tensor: Transformed value tensor, size
411
+ (#batch, n_head, time2, d_k).
412
+
413
+ """
414
+ n_batch = query.size(0)
415
+ q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
416
+ k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
417
+ v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
418
+ q = q.transpose(1, 2) # (batch, head, time1, d_k)
419
+ k = k.transpose(1, 2) # (batch, head, time2, d_k)
420
+ v = v.transpose(1, 2) # (batch, head, time2, d_k)
421
+
422
+ return q, k, v
423
+
424
+ def forward_attention(
425
+ self,
426
+ value: torch.Tensor,
427
+ scores: torch.Tensor,
428
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool)
429
+ ) -> torch.Tensor:
430
+ """Compute attention context vector.
431
+
432
+ Args:
433
+ value (torch.Tensor): Transformed value, size
434
+ (#batch, n_head, time2, d_k).
435
+ scores (torch.Tensor): Attention score, size
436
+ (#batch, n_head, time1, time2).
437
+ mask (torch.Tensor): Mask, size (#batch, 1, time2) or
438
+ (#batch, time1, time2), (0, 0, 0) means fake mask.
439
+
440
+ Returns:
441
+ torch.Tensor: Transformed value (#batch, time1, d_model)
442
+ weighted by the attention score (#batch, time1, time2).
443
+
444
+ """
445
+ n_batch = value.size(0)
446
+ # NOTE(xcsong): When will `if mask.size(2) > 0` be True?
447
+ # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the
448
+ # 1st chunk to ease the onnx export.]
449
+ # 2. pytorch training
450
+ if mask.size(2) > 0: # time2 > 0
451
+ mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2)
452
+ # For last chunk, time2 might be larger than scores.size(-1)
453
+ mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2)
454
+ scores = scores.masked_fill(mask, -float('inf'))
455
+ attn = torch.softmax(scores, dim=-1).masked_fill(
456
+ mask, 0.0) # (batch, head, time1, time2)
457
+ # NOTE(xcsong): When will `if mask.size(2) > 0` be False?
458
+ # 1. onnx(16/-1, -1/-1, 16/0)
459
+ # 2. jit (16/-1, -1/-1, 16/0, 16/4)
460
+ else:
461
+ attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
462
+
463
+ p_attn = self.dropout(attn)
464
+ x = torch.matmul(p_attn, value) # (batch, head, time1, d_k)
465
+ x = (x.transpose(1, 2).contiguous().view(n_batch, -1,
466
+ self.h * self.d_k)
467
+ ) # (batch, time1, d_model)
468
+
469
+ return self.linear_out(x) # (batch, time1, d_model)
470
+
471
+ def forward(
472
+ self,
473
+ query: torch.Tensor,
474
+ key: torch.Tensor,
475
+ value: torch.Tensor,
476
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
477
+ pos_emb: torch.Tensor = torch.empty(0),
478
+ cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
479
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
480
+ """Compute scaled dot product attention.
481
+
482
+ Args:
483
+ query (torch.Tensor): Query tensor (#batch, time1, size).
484
+ key (torch.Tensor): Key tensor (#batch, time2, size).
485
+ value (torch.Tensor): Value tensor (#batch, time2, size).
486
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
487
+ (#batch, time1, time2).
488
+ 1.When applying cross attention between decoder and encoder,
489
+ the batch padding mask for input is in (#batch, 1, T) shape.
490
+ 2.When applying self attention of encoder,
491
+ the mask is in (#batch, T, T) shape.
492
+ 3.When applying self attention of decoder,
493
+ the mask is in (#batch, L, L) shape.
494
+ 4.If the different position in decoder see different block
495
+ of the encoder, such as Mocha, the passed in mask could be
496
+ in (#batch, L, T) shape. But there is no such case in current
497
+ CosyVoice.
498
+ cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
499
+ where `cache_t == chunk_size * num_decoding_left_chunks`
500
+ and `head * d_k == size`
501
+
502
+
503
+ Returns:
504
+ torch.Tensor: Output tensor (#batch, time1, d_model).
505
+ torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
506
+ where `cache_t == chunk_size * num_decoding_left_chunks`
507
+ and `head * d_k == size`
508
+
509
+ """
510
+ q, k, v = self.forward_qkv(query, key, value)
511
+
512
+ # NOTE(xcsong):
513
+ # when export onnx model, for 1st chunk, we feed
514
+ # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
515
+ # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
516
+ # In all modes, `if cache.size(0) > 0` will always be `True`
517
+ # and we will always do splitting and
518
+ # concatenation (this will simplify onnx export). Note that
519
+ # it's OK to concat & split zero-shaped tensors(see code below).
520
+ # when export jit model, for 1st chunk, we always feed
521
+ # cache(0, 0, 0, 0) since jit supports dynamic if-branch.
522
+ # >>> a = torch.ones((1, 2, 0, 4))
523
+ # >>> b = torch.ones((1, 2, 3, 4))
524
+ # >>> c = torch.cat((a, b), dim=2)
525
+ # >>> torch.equal(b, c) # True
526
+ # >>> d = torch.split(a, 2, dim=-1)
527
+ # >>> torch.equal(d[0], d[1]) # True
528
+ if cache.size(0) > 0:
529
+ key_cache, value_cache = torch.split(cache,
530
+ cache.size(-1) // 2,
531
+ dim=-1)
532
+ k = torch.cat([key_cache, k], dim=2)
533
+ v = torch.cat([value_cache, v], dim=2)
534
+ # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
535
+ # non-trivial to calculate `next_cache_start` here.
536
+ new_cache = torch.cat((k, v), dim=-1)
537
+
538
+ scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
539
+ return self.forward_attention(v, scores, mask), new_cache
540
+
541
+
542
+ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
543
+ """Multi-Head Attention layer with relative position encoding.
544
+ Paper: https://arxiv.org/abs/1901.02860
545
+ Args:
546
+ n_head (int): The number of heads.
547
+ n_feat (int): The number of features.
548
+ dropout_rate (float): Dropout rate.
549
+ key_bias (bool): Whether to use bias in key linear layer.
550
+ """
551
+
552
+ def __init__(self,
553
+ n_head: int,
554
+ n_feat: int,
555
+ dropout_rate: float,
556
+ key_bias: bool = True):
557
+ super().__init__(n_head, n_feat, dropout_rate, key_bias)
558
+ # linear transformation for positional encoding
559
+ self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
560
+ # these two learnable bias are used in matrix c and matrix d
561
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
562
+ self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
563
+ self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
564
+ torch.nn.init.xavier_uniform_(self.pos_bias_u)
565
+ torch.nn.init.xavier_uniform_(self.pos_bias_v)
566
+
567
+ def rel_shift(self, x: torch.Tensor) -> torch.Tensor:
568
+ """Compute relative positional encoding.
569
+
570
+ Args:
571
+ x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
572
+ time1 means the length of query vector.
573
+
574
+ Returns:
575
+ torch.Tensor: Output tensor.
576
+
577
+ """
578
+ zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1),
579
+ device=x.device,
580
+ dtype=x.dtype)
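+ # prepend a zero column, reshape, then slice: the relative-shift trick from Transformer-XL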
581
+ x_padded = torch.cat([zero_pad, x], dim=-1)
582
+
583
+ x_padded = x_padded.view(x.size()[0],
584
+ x.size()[1],
585
+ x.size(3) + 1, x.size(2))
586
+ x = x_padded[:, :, 1:].view_as(x)[
587
+ :, :, :, : x.size(-1) // 2 + 1
588
+ ] # only keep the positions from 0 to time2
589
+ return x
590
+
591
+ def forward(
592
+ self,
593
+ query: torch.Tensor,
594
+ key: torch.Tensor,
595
+ value: torch.Tensor,
596
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
597
+ pos_emb: torch.Tensor = torch.empty(0),
598
+ cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
599
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
600
+ """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
601
+ Args:
602
+ query (torch.Tensor): Query tensor (#batch, time1, size).
603
+ key (torch.Tensor): Key tensor (#batch, time2, size).
604
+ value (torch.Tensor): Value tensor (#batch, time2, size).
605
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
606
+ (#batch, time1, time2), (0, 0, 0) means fake mask.
607
+ pos_emb (torch.Tensor): Positional embedding tensor
608
+ (#batch, time2, size).
609
+ cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
610
+ where `cache_t == chunk_size * num_decoding_left_chunks`
611
+ and `head * d_k == size`
612
+ Returns:
613
+ torch.Tensor: Output tensor (#batch, time1, d_model).
614
+ torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
615
+ where `cache_t == chunk_size * num_decoding_left_chunks`
616
+ and `head * d_k == size`
617
+ """
618
+ q, k, v = self.forward_qkv(query, key, value)
619
+ q = q.transpose(1, 2) # (batch, time1, head, d_k)
620
+
621
+ # NOTE(xcsong):
622
+ # when export onnx model, for 1st chunk, we feed
623
+ # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
624
+ # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
625
+ # In all modes, `if cache.size(0) > 0` will always be `True`
626
+ # and we will always do splitting and
627
+ # concatenation (this will simplify onnx export). Note that
628
+ # it's OK to concat & split zero-shaped tensors(see code below).
629
+ # when export jit model, for 1st chunk, we always feed
630
+ # cache(0, 0, 0, 0) since jit supports dynamic if-branch.
631
+ # >>> a = torch.ones((1, 2, 0, 4))
632
+ # >>> b = torch.ones((1, 2, 3, 4))
633
+ # >>> c = torch.cat((a, b), dim=2)
634
+ # >>> torch.equal(b, c) # True
635
+ # >>> d = torch.split(a, 2, dim=-1)
636
+ # >>> torch.equal(d[0], d[1]) # True
637
+ if cache.size(0) > 0:
638
+ key_cache, value_cache = torch.split(cache,
639
+ cache.size(-1) // 2,
640
+ dim=-1)
641
+ k = torch.cat([key_cache, k], dim=2)
642
+ v = torch.cat([value_cache, v], dim=2)
643
+ # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
644
+ # non-trivial to calculate `next_cache_start` here.
645
+ new_cache = torch.cat((k, v), dim=-1)
646
+
647
+ n_batch_pos = pos_emb.size(0)
648
+ p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
649
+ p = p.transpose(1, 2) # (batch, head, time1, d_k)
650
+
651
+ # (batch, head, time1, d_k)
652
+ q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
653
+ # (batch, head, time1, d_k)
654
+ q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)
655
+
656
+ # compute attention score
657
+ # first compute matrix a and matrix c
658
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
659
+ # (batch, head, time1, time2)
660
+ matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))
661
+
662
+ # compute matrix b and matrix d
663
+ # (batch, head, time1, time2)
664
+ matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
665
+ # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used
666
+ if matrix_ac.shape != matrix_bd.shape:
667
+ matrix_bd = self.rel_shift(matrix_bd)
668
+
669
+ scores = (matrix_ac + matrix_bd) / math.sqrt(
670
+ self.d_k) # (batch, head, time1, time2)
671
+
672
+ return self.forward_attention(v, scores, mask), new_cache
673
+
674
+
675
+ class PositionwiseFeedForward(torch.nn.Module):
676
+ """Positionwise feed forward layer.
677
+
678
+ FeedForward is applied at each position of the sequence.
679
+ The output dim is the same as the input dim.
680
+
681
+ Args:
682
+ idim (int): Input dimension.
683
+ hidden_units (int): The number of hidden units.
684
+ dropout_rate (float): Dropout rate.
685
+ activation (torch.nn.Module): Activation function
686
+ """
687
+
688
+ def __init__(
689
+ self,
690
+ idim: int,
691
+ hidden_units: int,
692
+ dropout_rate: float,
693
+ activation: torch.nn.Module = torch.nn.ReLU(),
694
+ ):
695
+ super(PositionwiseFeedForward, self).__init__()
696
+ self.w_1 = torch.nn.Linear(idim, hidden_units)
697
+ self.activation = activation
698
+ self.dropout = torch.nn.Dropout(dropout_rate)
699
+ self.w_2 = torch.nn.Linear(hidden_units, idim)
700
+
701
+ def forward(self, xs: torch.Tensor) -> torch.Tensor:
702
+ """Forward function.
703
+
704
+ Args:
705
+ xs: input tensor (B, L, D)
706
+ Returns:
707
+ output tensor, (B, L, D)
708
+ """
709
+ return self.w_2(self.dropout(self.activation(self.w_1(xs))))
710
+
711
+
712
+ class ConformerEncoderLayer(nn.Module):
713
+ """Encoder layer module.
714
+ Args:
715
+ size (int): Input dimension.
716
+ self_attn (torch.nn.Module): Self-attention module instance.
717
+ `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
718
+ instance can be used as the argument.
719
+ feed_forward (torch.nn.Module): Feed-forward module instance.
720
+ `PositionwiseFeedForward` instance can be used as the argument.
721
+ feed_forward_macaron (torch.nn.Module): Additional feed-forward module
722
+ instance.
723
+ `PositionwiseFeedForward` instance can be used as the argument.
724
+ conv_module (torch.nn.Module): Convolution module instance.
725
+ `ConvolutionModule` instance can be used as the argument.
726
+ dropout_rate (float): Dropout rate.
727
+ normalize_before (bool):
728
+ True: use layer_norm before each sub-block.
729
+ False: use layer_norm after each sub-block.
730
+ """
731
+
732
+ def __init__(
733
+ self,
734
+ size: int,
735
+ self_attn: torch.nn.Module,
736
+ feed_forward: Optional[nn.Module] = None,
737
+ feed_forward_macaron: Optional[nn.Module] = None,
738
+ conv_module: Optional[nn.Module] = None,
739
+ dropout_rate: float = 0.0,
740
+ normalize_before: bool = True,
741
+ ):
742
+ super().__init__()
743
+ self.self_attn = self_attn
744
+ self.feed_forward = feed_forward
745
+ self.feed_forward_macaron = feed_forward_macaron
746
+ self.conv_module = conv_module
747
+ self.norm_ff = nn.LayerNorm(size, eps=1e-12) # for the FNN module
748
+ self.norm_mha = nn.LayerNorm(size, eps=1e-12) # for the MHA module
749
+ if feed_forward_macaron is not None:
750
+ self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-12)
751
+ self.ff_scale = 0.5
752
+ else:
753
+ self.ff_scale = 1.0
754
+ if self.conv_module is not None:
755
+ self.norm_conv = nn.LayerNorm(size, eps=1e-12) # for the CNN module
756
+ self.norm_final = nn.LayerNorm(
757
+ size, eps=1e-12) # for the final output of the block
758
+ self.dropout = nn.Dropout(dropout_rate)
759
+ self.size = size
760
+ self.normalize_before = normalize_before
761
+
762
+ def forward(
763
+ self,
764
+ x: torch.Tensor,
765
+ mask: torch.Tensor,
766
+ pos_emb: torch.Tensor,
767
+ mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
768
+ att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
769
+ cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
770
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
771
+ """Compute encoded features.
772
+
773
+ Args:
774
+ x (torch.Tensor): (#batch, time, size)
775
+ mask (torch.Tensor): Mask tensor for the input (#batch, time, time),
776
+ (0, 0, 0) means fake mask.
777
+ pos_emb (torch.Tensor): positional encoding, must not be None
778
+ for ConformerEncoderLayer.
779
+ mask_pad (torch.Tensor): batch padding mask used for conv module.
780
+ (#batch, 1, time), (0, 0, 0) means fake mask.
781
+ att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
782
+ (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
783
+ cnn_cache (torch.Tensor): Convolution cache in conformer layer
784
+ (#batch=1, size, cache_t2)
785
+ Returns:
786
+ torch.Tensor: Output tensor (#batch, time, size).
787
+ torch.Tensor: Mask tensor (#batch, time, time).
788
+ torch.Tensor: att_cache tensor,
789
+ (#batch=1, head, cache_t1 + time, d_k * 2).
790
+ torch.Tensor: cnn_cache tensor (#batch, size, cache_t2).
791
+ """
792
+
793
+ # whether to use macaron style
794
+ if self.feed_forward_macaron is not None:
795
+ residual = x
796
+ if self.normalize_before:
797
+ x = self.norm_ff_macaron(x)
798
+ x = residual + self.ff_scale * self.dropout(
799
+ self.feed_forward_macaron(x))
800
+ if not self.normalize_before:
801
+ x = self.norm_ff_macaron(x)
802
+
803
+ # multi-headed self-attention module
804
+ residual = x
805
+ if self.normalize_before:
806
+ x = self.norm_mha(x)
807
+ x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb,
808
+ att_cache)
809
+ x = residual + self.dropout(x_att)
810
+ if not self.normalize_before:
811
+ x = self.norm_mha(x)
812
+
813
+ # convolution module
814
+ # Fake new cnn cache here, and then change it in conv_module
815
+ new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
816
+ if self.conv_module is not None:
817
+ residual = x
818
+ if self.normalize_before:
819
+ x = self.norm_conv(x)
820
+ x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache)
821
+ x = residual + self.dropout(x)
822
+
823
+ if not self.normalize_before:
824
+ x = self.norm_conv(x)
825
+
826
+ # feed forward module
827
+ residual = x
828
+ if self.normalize_before:
829
+ x = self.norm_ff(x)
830
+
831
+ x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
832
+ if not self.normalize_before:
833
+ x = self.norm_ff(x)
834
+
835
+ if self.conv_module is not None:
836
+ x = self.norm_final(x)
837
+
838
+ return x, mask, new_att_cache, new_cnn_cache
839
+
840
+
841
+ class UpsampleConformerEncoder(torch.nn.Module):
842
+ """
843
+ Args:
844
+ input_size (int): input dim
845
+ output_size (int): dimension of attention
846
+ attention_heads (int): the number of heads of multi head attention
847
+ linear_units (int): the hidden units number of position-wise feed
848
+ forward
849
+ num_blocks (int): the number of decoder blocks
850
+ static_chunk_size (int): chunk size for static chunk training and
851
+ decoding
852
+ use_dynamic_chunk (bool): whether to use dynamic chunk size for
853
+ training or not. You can only use a fixed chunk (chunk_size > 0)
854
+ or a dynamic chunk size (use_dynamic_chunk = True)
855
+ use_dynamic_left_chunk (bool): whether to use dynamic left chunk in
856
+ dynamic chunk training
857
+ key_bias: whether to use bias in attention.linear_k, False for whisper models.
858
+ """
859
+
860
+ def __init__(
861
+ self,
862
+ input_size: int = 512,
863
+ output_size: int = 512,
864
+ attention_heads: int = 8,
865
+ linear_units: int = 2048,
866
+ num_blocks: int = 6,
867
+ static_chunk_size: int = 25,
868
+ use_dynamic_chunk: bool = False,
869
+ use_dynamic_left_chunk: bool = False,
870
+ key_bias: bool = True,
871
+ ):
872
+ super().__init__()
873
+ self._output_size = output_size
874
+
875
+ self.embed = LinearNoSubsampling(
876
+ input_size, output_size,
877
+ EspnetRelPositionalEncoding(output_size),
878
+ )
879
+
880
+ self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
881
+ self.static_chunk_size = static_chunk_size
882
+ self.use_dynamic_chunk = use_dynamic_chunk
883
+ self.use_dynamic_left_chunk = use_dynamic_left_chunk
884
+ activation = torch.nn.SiLU()
885
+ # self-attention module definition
886
+ encoder_selfattn_layer_args = (
887
+ attention_heads,
888
+ output_size,
889
+ 0.0,
890
+ key_bias,
891
+ )
892
+ # feed-forward module definition
893
+ positionwise_layer_args = (
894
+ output_size,
895
+ linear_units,
896
+ 0.0,
897
+ activation,
898
+ )
899
+ # convolution module definition
900
+ self.pre_lookahead_layer = PreLookaheadLayer(channels=512, pre_lookahead_len=3)
901
+ self.encoders = torch.nn.ModuleList([
902
+ ConformerEncoderLayer(
903
+ output_size,
904
+ RelPositionMultiHeadedAttention(*encoder_selfattn_layer_args),
905
+ PositionwiseFeedForward(*positionwise_layer_args),
906
+ ) for _ in range(num_blocks)
907
+ ])
908
+ self.up_layer = Upsample1D(channels=512, out_channels=512, stride=2)
909
+ self.up_embed = LinearNoSubsampling(
910
+ input_size, output_size,
911
+ EspnetRelPositionalEncoding(output_size),
912
+ )
913
+ self.up_encoders = torch.nn.ModuleList([
914
+ ConformerEncoderLayer(
915
+ output_size,
916
+ RelPositionMultiHeadedAttention(*encoder_selfattn_layer_args),
917
+ PositionwiseFeedForward(*positionwise_layer_args),
918
+ ) for _ in range(4)
919
+ ])
920
+
921
+ def output_size(self) -> int:
922
+ return self._output_size
923
+
924
+ def forward(
925
+ self,
926
+ xs: torch.Tensor,
927
+ xs_lens: torch.Tensor,
928
+ context: torch.Tensor = torch.zeros(0, 0, 0),
929
+ decoding_chunk_size: int = 0,
930
+ num_decoding_left_chunks: int = -1,
931
+ streaming: bool = False,
932
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
933
+ """Embed positions in tensor.
934
+
935
+ Args:
936
+ xs: padded input tensor (B, T, D)
937
+ xs_lens: input length (B)
938
+ decoding_chunk_size: decoding chunk size for dynamic chunk
939
+ 0: default for training, use random dynamic chunk.
940
+ <0: for decoding, use full chunk.
941
+ >0: for decoding, use fixed chunk size as set.
942
+ num_decoding_left_chunks: number of left chunks, this is for decoding,
943
+ the chunk size is decoding_chunk_size.
944
+ >=0: use num_decoding_left_chunks
945
+ <0: use all left chunks
946
+ Returns:
947
+ encoder output tensor xs, and subsampled masks
948
+ xs: padded output tensor (B, T' ~= T/subsample_rate, D)
949
+ masks: torch.Tensor batch padding mask after subsample
950
+ (B, 1, T' ~= T/subsample_rate)
951
+ NOTE(xcsong):
952
+ We pass the `__call__` method of the modules instead of `forward` to the
953
+ checkpointing API because `__call__` attaches all the hooks of the module.
954
+ https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
955
+ """
956
+ T = xs.size(1)
957
+ masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T)
958
+ xs, pos_emb, masks = self.embed(xs, masks)
959
+ if context.size(1) != 0:
960
+ assert self.training is False, 'you have passed context, make sure that you are running inference mode'
961
+ context_masks = torch.ones(1, 1, context.size(1)).to(masks)
962
+ context, _, _ = self.embed(context, context_masks, offset=xs.size(1))
963
+ mask_pad = masks # (B, 1, T/subsample_rate)
964
+ chunk_masks = add_optional_chunk_mask(xs, masks, False, False, 0, self.static_chunk_size if streaming is True else 0, -1)
965
+ # lookahead + conformer encoder
966
+ xs = self.pre_lookahead_layer(xs, context=context)
967
+ xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
968
+
969
+ # upsample + conformer encoder
970
+ xs = xs.transpose(1, 2).contiguous()
971
+ xs, xs_lens = self.up_layer(xs, xs_lens)
972
+ xs = xs.transpose(1, 2).contiguous()
973
+ T = xs.size(1)
974
+ masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T)
975
+ xs, pos_emb, masks = self.up_embed(xs, masks)
976
+ mask_pad = masks # (B, 1, T/subsample_rate)
977
+ chunk_masks = add_optional_chunk_mask(xs, masks, False, False, 0, self.static_chunk_size * self.up_layer.stride if streaming is True else 0, -1)
978
+ xs = self.forward_up_layers(xs, chunk_masks, pos_emb, mask_pad)
979
+
980
+ xs = self.after_norm(xs)
981
+ # Here we assume the mask is not changed in encoder layers, so just
982
+ # return the masks before encoder layers, and the masks will be used
983
+ # for cross attention with decoder later
984
+ return xs, masks
985
+
986
+ def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
987
+ pos_emb: torch.Tensor,
988
+ mask_pad: torch.Tensor) -> torch.Tensor:
989
+ for layer in self.encoders:
990
+ xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
991
+ return xs
992
+
993
+ def forward_up_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
994
+ pos_emb: torch.Tensor,
995
+ mask_pad: torch.Tensor) -> torch.Tensor:
996
+ for layer in self.up_encoders:
997
+ xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
998
+ return xs
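For orientation (not part of the commit): a minimal shape-check sketch of `UpsampleConformerEncoder`, assuming the package is importable as `flashcosyvoice.modules.flow_components.upsample_encoder` and using randomly initialized weights; the exact output length is an assumption based on the stride-2 `Upsample1D` layer.

```python
import torch
from flashcosyvoice.modules.flow_components.upsample_encoder import UpsampleConformerEncoder

encoder = UpsampleConformerEncoder().eval()   # defaults: 512-dim, 6 conformer blocks + 4 post-upsample blocks
xs = torch.randn(2, 100, 512)                 # (B, T, input_size): token-rate features
xs_lens = torch.tensor([100, 80])             # valid lengths per utterance
with torch.no_grad():
    ys, masks = encoder(xs, xs_lens)
print(ys.shape, masks.shape)                  # time axis is roughly doubled by the stride-2 up_layer
```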
flashcosyvoice/modules/hifigan.py ADDED
@@ -0,0 +1,249 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """HIFI-GAN"""
16
+
17
+ from typing import Dict, List
18
+
19
+ import numpy as np
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+ from scipy.signal import get_window
24
+ from torch.nn import Conv1d, ConvTranspose1d
25
+ from torch.nn.utils import remove_weight_norm
26
+
27
+ try:
28
+ from torch.nn.utils.parametrizations import weight_norm
29
+ except ImportError:
30
+ from torch.nn.utils import weight_norm # noqa
31
+
32
+ from flashcosyvoice.modules.hifigan_components.layers import (
33
+ ResBlock, SourceModuleHnNSF, SourceModuleHnNSF2, init_weights)
34
+
35
+
36
+ class ConvRNNF0Predictor(nn.Module):
37
+ def __init__(self,
38
+ num_class: int = 1,
39
+ in_channels: int = 80,
40
+ cond_channels: int = 512
41
+ ):
42
+ super().__init__()
43
+
44
+ self.num_class = num_class
45
+ self.condnet = nn.Sequential(
46
+ weight_norm( # noqa
47
+ nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
48
+ ),
49
+ nn.ELU(),
50
+ weight_norm( # noqa
51
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
52
+ ),
53
+ nn.ELU(),
54
+ weight_norm( # noqa
55
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
56
+ ),
57
+ nn.ELU(),
58
+ weight_norm( # noqa
59
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
60
+ ),
61
+ nn.ELU(),
62
+ weight_norm( # noqa
63
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
64
+ ),
65
+ nn.ELU(),
66
+ )
67
+ self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
68
+
69
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
70
+ x = self.condnet(x)
71
+ x = x.transpose(1, 2)
72
+ return torch.abs(self.classifier(x).squeeze(-1))
73
+
74
+
75
+ class HiFTGenerator(nn.Module):
76
+ """
77
+ HiFTNet Generator: Neural Source Filter + ISTFTNet
78
+ https://arxiv.org/abs/2309.09493
79
+ """
80
+ def __init__(
81
+ self,
82
+ in_channels: int = 80,
83
+ base_channels: int = 512,
84
+ nb_harmonics: int = 8,
85
+ sampling_rate: int = 24000,
86
+ nsf_alpha: float = 0.1,
87
+ nsf_sigma: float = 0.003,
88
+ nsf_voiced_threshold: float = 10,
89
+ upsample_rates: List[int] = [8, 5, 3], # noqa
90
+ upsample_kernel_sizes: List[int] = [16, 11, 7], # noqa
91
+ istft_params: Dict[str, int] = {"n_fft": 16, "hop_len": 4}, # noqa
92
+ resblock_kernel_sizes: List[int] = [3, 7, 11], # noqa
93
+ resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]], # noqa
94
+ source_resblock_kernel_sizes: List[int] = [7, 7, 11], # noqa
95
+ source_resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]], # noqa
96
+ lrelu_slope: float = 0.1,
97
+ audio_limit: float = 0.99,
98
+ f0_predictor: torch.nn.Module = None,
99
+ ):
100
+ super(HiFTGenerator, self).__init__()
101
+
102
+ self.out_channels = 1
103
+ self.nb_harmonics = nb_harmonics
104
+ self.sampling_rate = sampling_rate
105
+ self.istft_params = istft_params
106
+ self.lrelu_slope = lrelu_slope
107
+ self.audio_limit = audio_limit
108
+
109
+ self.num_kernels = len(resblock_kernel_sizes)
110
+ self.num_upsamples = len(upsample_rates)
111
+ # NOTE in CosyVoice2, we use the original SourceModuleHnNSF implementation
112
+ this_SourceModuleHnNSF = SourceModuleHnNSF if self.sampling_rate == 22050 else SourceModuleHnNSF2
113
+ self.m_source = this_SourceModuleHnNSF(
114
+ sampling_rate=sampling_rate,
115
+ upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
116
+ harmonic_num=nb_harmonics,
117
+ sine_amp=nsf_alpha,
118
+ add_noise_std=nsf_sigma,
119
+ voiced_threshod=nsf_voiced_threshold)
120
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"])
121
+
122
+ self.conv_pre = weight_norm( # noqa
123
+ Conv1d(in_channels, base_channels, 7, 1, padding=3)
124
+ )
125
+
126
+ # Up
127
+ self.ups = nn.ModuleList()
128
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
129
+ self.ups.append(
130
+ weight_norm( # noqa
131
+ ConvTranspose1d(
132
+ base_channels // (2**i),
133
+ base_channels // (2**(i + 1)),
134
+ k,
135
+ u,
136
+ padding=(k - u) // 2,
137
+ )
138
+ )
139
+ )
140
+
141
+ # Down
142
+ self.source_downs = nn.ModuleList()
143
+ self.source_resblocks = nn.ModuleList()
144
+ downsample_rates = [1] + upsample_rates[::-1][:-1]
145
+ downsample_cum_rates = np.cumprod(downsample_rates)
146
+ for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes, source_resblock_dilation_sizes)):
147
+ if u == 1:
148
+ self.source_downs.append(
149
+ Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1)
150
+ )
151
+ else:
152
+ self.source_downs.append(
153
+ Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
154
+ )
155
+
156
+ self.source_resblocks.append(
157
+ ResBlock(base_channels // (2 ** (i + 1)), k, d)
158
+ )
159
+
160
+ self.resblocks = nn.ModuleList()
161
+ for i in range(len(self.ups)):
162
+ ch = base_channels // (2**(i + 1))
163
+ for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
164
+ self.resblocks.append(ResBlock(ch, k, d))
165
+
166
+ self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3)) # noqa
167
+ self.ups.apply(init_weights)
168
+ self.conv_post.apply(init_weights)
169
+ self.reflection_pad = nn.ReflectionPad1d((1, 0))
170
+ self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
171
+ self.f0_predictor = ConvRNNF0Predictor() if f0_predictor is None else f0_predictor
172
+
173
+ def remove_weight_norm(self):
174
+ print('Removing weight norm...')
175
+ for up in self.ups:
176
+ remove_weight_norm(up)
177
+ for resblock in self.resblocks:
178
+ resblock.remove_weight_norm()
179
+ remove_weight_norm(self.conv_pre)
180
+ remove_weight_norm(self.conv_post)
181
+ self.m_source.remove_weight_norm()
182
+ for source_down in self.source_downs:
183
+ remove_weight_norm(source_down)
184
+ for source_resblock in self.source_resblocks:
185
+ source_resblock.remove_weight_norm()
186
+
187
+ def _stft(self, x):
188
+ spec = torch.stft(
189
+ x,
190
+ self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device),
191
+ return_complex=True)
192
+ spec = torch.view_as_real(spec) # [B, F, TT, 2]
193
+ return spec[..., 0], spec[..., 1]
194
+
195
+ def _istft(self, magnitude, phase):
196
+ magnitude = torch.clip(magnitude, max=1e2)
197
+ real = magnitude * torch.cos(phase)
198
+ img = magnitude * torch.sin(phase)
199
+ inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"],
200
+ self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
201
+ return inverse_transform
202
+
203
+ def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
204
+ s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
205
+ s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
206
+
207
+ x = self.conv_pre(x)
208
+ for i in range(self.num_upsamples):
209
+ x = F.leaky_relu(x, self.lrelu_slope)
210
+ x = self.ups[i](x)
211
+
212
+ if i == self.num_upsamples - 1:
213
+ x = self.reflection_pad(x)
214
+
215
+ # fusion
216
+ si = self.source_downs[i](s_stft)
217
+ si = self.source_resblocks[i](si)
218
+ x = x + si
219
+
220
+ xs = None
221
+ for j in range(self.num_kernels):
222
+ if xs is None:
223
+ xs = self.resblocks[i * self.num_kernels + j](x)
224
+ else:
225
+ xs += self.resblocks[i * self.num_kernels + j](x)
226
+ x = xs / self.num_kernels
227
+
228
+ x = F.leaky_relu(x)
229
+ x = self.conv_post(x)
230
+ magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
231
+ phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :])  # actually, sin is redundant
232
+
233
+ x = self._istft(magnitude, phase)
234
+ x = torch.clamp(x, -self.audio_limit, self.audio_limit)
235
+ return x
236
+
237
+ @torch.inference_mode()
238
+ def forward(self, speech_feat: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
239
+ # mel->f0
240
+ f0 = self.f0_predictor(speech_feat)
241
+ # f0->source
242
+ s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
243
+ s, _, _ = self.m_source(s)
244
+ s = s.transpose(1, 2)
245
+ # use cache_source to avoid glitch
246
+ if cache_source.shape[2] != 0:
247
+ s[:, :, :cache_source.shape[2]] = cache_source
248
+ generated_speech = self.decode(x=speech_feat, s=s)
249
+ return generated_speech, s
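For orientation (not part of the commit): a smoke-test sketch of `HiFTGenerator` with random weights; the sample count per mel frame is inferred from the default `upsample_rates` and `hop_len` and should be treated as an assumption.

```python
import torch
from flashcosyvoice.modules.hifigan import HiFTGenerator

vocoder = HiFTGenerator().eval()        # defaults: 24 kHz, upsample_rates [8, 5, 3], istft hop_len 4
mel = torch.randn(1, 80, 200)           # (B, in_channels, mel frames)
wav, source = vocoder(speech_feat=mel)  # forward is wrapped in torch.inference_mode()
# each mel frame expands by prod(upsample_rates) * hop_len = 8 * 5 * 3 * 4 = 480 samples
print(wav.shape, source.shape)
```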
flashcosyvoice/modules/hifigan_components/__init__.py ADDED
File without changes
flashcosyvoice/modules/hifigan_components/layers.py ADDED
@@ -0,0 +1,433 @@
1
+ from typing import List
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ from torch.distributions.uniform import Uniform
7
+ from torch.nn import Conv1d
8
+ from torch.nn.utils import remove_weight_norm
9
+
10
+ try:
11
+ from torch.nn.utils.parametrizations import weight_norm
12
+ except ImportError:
13
+ from torch.nn.utils import weight_norm # noqa
14
+
15
+
16
+ def get_padding(kernel_size, dilation=1):
17
+ return int((kernel_size * dilation - dilation) / 2)
18
+
19
+
20
+ def init_weights(m, mean=0.0, std=0.01):
21
+ classname = m.__class__.__name__
22
+ if classname.find("Conv") != -1:
23
+ m.weight.data.normal_(mean, std)
24
+
25
+
26
+ """hifigan based generator implementation.
27
+
28
+ This code is modified from https://github.com/jik876/hifi-gan
29
+ ,https://github.com/kan-bayashi/ParallelWaveGAN and
30
+ https://github.com/NVIDIA/BigVGAN
31
+
32
+ """
33
+
34
+
35
+ # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
36
+ # LICENSE is in incl_licenses directory.
37
+ class Snake(nn.Module):
38
+ '''
39
+ Implementation of a sine-based periodic activation function
40
+ Shape:
41
+ - Input: (B, C, T)
42
+ - Output: (B, C, T), same shape as the input
43
+ Parameters:
44
+ - alpha - trainable parameter
45
+ References:
46
+ - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
47
+ https://arxiv.org/abs/2006.08195
48
+ Examples:
49
+ >>> a1 = Snake(256)
50
+ >>> x = torch.randn(256)
51
+ >>> x = a1(x)
52
+
53
+ Args:
54
+ in_features: shape of the input
55
+ alpha: trainable parameter
56
+ alpha_trainable: whether alpha is trainable
57
+ alpha_logscale: whether to use log scale for alpha
58
+ alpha is initialized to 1 by default, higher values = higher-frequency.
59
+ alpha will be trained along with the rest of your model.
60
+ '''
61
+ def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
62
+ super(Snake, self).__init__()
63
+ self.in_features = in_features
64
+
65
+ # initialize alpha
66
+ self.alpha_logscale = alpha_logscale
67
+ if self.alpha_logscale: # log scale alphas initialized to zeros
68
+ self.alpha = nn.Parameter(torch.zeros(in_features) * alpha)
69
+ else: # linear scale alphas initialized to ones
70
+ self.alpha = nn.Parameter(torch.ones(in_features) * alpha)
71
+
72
+ self.alpha.requires_grad = alpha_trainable
73
+
74
+ self.no_div_by_zero = 0.000000001
75
+
76
+ def forward(self, x):
77
+ '''
78
+ Forward pass of the function.
79
+ Applies the function to the input elementwise.
80
+ Snake ∶= x + 1/a * sin^2 (xa)
81
+ '''
82
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
83
+ if self.alpha_logscale:
84
+ alpha = torch.exp(alpha)
85
+ x = x + (1.0 / (alpha + self.no_div_by_zero)) * torch.pow(torch.sin(x * alpha), 2)
86
+
87
+ return x
88
+
89
+
90
+ class ResBlock(torch.nn.Module):
91
+ """Residual block module in HiFiGAN/BigVGAN."""
92
+ def __init__(
93
+ self,
94
+ channels: int = 512,
95
+ kernel_size: int = 3,
96
+ dilations: List[int] = [1, 3, 5], # noqa
97
+ ):
98
+ super(ResBlock, self).__init__()
99
+ self.convs1 = nn.ModuleList()
100
+ self.convs2 = nn.ModuleList()
101
+
102
+ for dilation in dilations:
103
+ self.convs1.append(
104
+ weight_norm( # noqa
105
+ Conv1d(
106
+ channels,
107
+ channels,
108
+ kernel_size,
109
+ 1,
110
+ dilation=dilation,
111
+ padding=get_padding(kernel_size, dilation)
112
+ )
113
+ )
114
+ )
115
+ self.convs2.append(
116
+ weight_norm( # noqa
117
+ Conv1d(
118
+ channels,
119
+ channels,
120
+ kernel_size,
121
+ 1,
122
+ dilation=1,
123
+ padding=get_padding(kernel_size, 1)
124
+ )
125
+ )
126
+ )
127
+ self.convs1.apply(init_weights)
128
+ self.convs2.apply(init_weights)
129
+ self.activations1 = nn.ModuleList([
130
+ Snake(channels, alpha_logscale=False)
131
+ for _ in range(len(self.convs1))
132
+ ])
133
+ self.activations2 = nn.ModuleList([
134
+ Snake(channels, alpha_logscale=False)
135
+ for _ in range(len(self.convs2))
136
+ ])
137
+
138
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
139
+ for idx in range(len(self.convs1)):
140
+ xt = self.activations1[idx](x)
141
+ xt = self.convs1[idx](xt)
142
+ xt = self.activations2[idx](xt)
143
+ xt = self.convs2[idx](xt)
144
+ x = xt + x
145
+ return x
146
+
147
+ def remove_weight_norm(self):
148
+ for idx in range(len(self.convs1)):
149
+ remove_weight_norm(self.convs1[idx])
150
+ remove_weight_norm(self.convs2[idx])
151
+
152
+
153
+ class SineGen(torch.nn.Module):
154
+ """ Definition of sine generator
155
+ SineGen(samp_rate, harmonic_num = 0,
156
+ sine_amp = 0.1, noise_std = 0.003,
157
+ voiced_threshold = 0,
158
+ flag_for_pulse=False)
159
+ samp_rate: sampling rate in Hz
160
+ harmonic_num: number of harmonic overtones (default 0)
161
+ sine_amp: amplitude of sine waveform (default 0.1)
162
+ noise_std: std of Gaussian noise (default 0.003)
163
+ voiced_threshold: F0 threshold for U/V classification (default 0)
164
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
165
+ Note: when flag_for_pulse is True, the first time step of a voiced
166
+ segment is always sin(np.pi) or cos(0)
167
+ """
168
+
169
+ def __init__(self, samp_rate, harmonic_num=0,
170
+ sine_amp=0.1, noise_std=0.003,
171
+ voiced_threshold=0):
172
+ super(SineGen, self).__init__()
173
+ self.sine_amp = sine_amp
174
+ self.noise_std = noise_std
175
+ self.harmonic_num = harmonic_num
176
+ self.sampling_rate = samp_rate
177
+ self.voiced_threshold = voiced_threshold
178
+
179
+ def _f02uv(self, f0):
180
+ # generate uv signal
181
+ uv = (f0 > self.voiced_threshold).type(torch.float32)
182
+ return uv
183
+
184
+ @torch.no_grad()
185
+ def forward(self, f0):
186
+ """
187
+ :param f0: [B, 1, sample_len], Hz
188
+ :return: [B, 1, sample_len]
189
+ """
190
+
191
+ F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
192
+ for i in range(self.harmonic_num + 1):
193
+ F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate
194
+
195
+ theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
196
+ u_dist = Uniform(low=-np.pi, high=np.pi)
197
+ phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device)
198
+ phase_vec[:, 0, :] = 0
199
+
200
+ # generate sine waveforms
201
+ sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)
202
+
203
+ # generate uv signal
204
+ uv = self._f02uv(f0)
205
+
206
+ # noise: for unvoiced should be similar to sine_amp
207
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
208
+ # . for voiced regions is self.noise_std
209
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
210
+ noise = noise_amp * torch.randn_like(sine_waves)
211
+
212
+ # first: set the unvoiced part to 0 by uv
213
+ # then: additive noise
214
+ sine_waves = sine_waves * uv + noise
215
+ return sine_waves, uv, noise
216
+
217
+
218
+ class SourceModuleHnNSF(torch.nn.Module):
219
+ """ SourceModule for hn-nsf
220
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
221
+ add_noise_std=0.003, voiced_threshod=0)
222
+ sampling_rate: sampling_rate in Hz
223
+ harmonic_num: number of harmonic above F0 (default: 0)
224
+ sine_amp: amplitude of sine source signal (default: 0.1)
225
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
226
+ note that amplitude of noise in unvoiced is decided
227
+ by sine_amp
228
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
229
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
230
+ F0_sampled (batchsize, length, 1)
231
+ Sine_source (batchsize, length, 1)
232
+ noise_source (batchsize, length 1)
233
+ uv (batchsize, length, 1)
234
+ """
235
+
236
+ def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
237
+ add_noise_std=0.003, voiced_threshod=0):
238
+ super(SourceModuleHnNSF, self).__init__()
239
+
240
+ self.sine_amp = sine_amp
241
+ self.noise_std = add_noise_std
242
+
243
+ # to produce sine waveforms
244
+ self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
245
+ sine_amp, add_noise_std, voiced_threshod)
246
+
247
+ # to merge source harmonics into a single excitation
248
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
249
+ self.l_tanh = torch.nn.Tanh()
250
+
251
+ def forward(self, x):
252
+ """
253
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
254
+ F0_sampled (batchsize, length, 1)
255
+ Sine_source (batchsize, length, 1)
256
+ noise_source (batchsize, length 1)
257
+ """
258
+ # source for harmonic branch
259
+ with torch.no_grad():
260
+ sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2))
261
+ sine_wavs = sine_wavs.transpose(1, 2)
262
+ uv = uv.transpose(1, 2)
263
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
264
+
265
+ # source for noise branch, in the same shape as uv
266
+ noise = torch.randn_like(uv) * self.sine_amp / 3
267
+ return sine_merge, noise, uv
268
+
269
+
270
+ class SineGen2(torch.nn.Module):
271
+ """ Definition of sine generator
272
+ SineGen(samp_rate, harmonic_num = 0,
273
+ sine_amp = 0.1, noise_std = 0.003,
274
+ voiced_threshold = 0,
275
+ flag_for_pulse=False)
276
+ samp_rate: sampling rate in Hz
277
+ harmonic_num: number of harmonic overtones (default 0)
278
+ sine_amp: amplitude of sine waveform (default 0.1)
279
+ noise_std: std of Gaussian noise (default 0.003)
280
+ voiced_threshold: F0 threshold for U/V classification (default 0)
281
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
282
+ Note: when flag_for_pulse is True, the first time step of a voiced
283
+ segment is always sin(np.pi) or cos(0)
284
+ """
285
+
286
+ def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
287
+ sine_amp=0.1, noise_std=0.003,
288
+ voiced_threshold=0,
289
+ flag_for_pulse=False):
290
+ super(SineGen2, self).__init__()
291
+ self.sine_amp = sine_amp
292
+ self.noise_std = noise_std
293
+ self.harmonic_num = harmonic_num
294
+ self.dim = self.harmonic_num + 1
295
+ self.sampling_rate = samp_rate
296
+ self.voiced_threshold = voiced_threshold
297
+ self.flag_for_pulse = flag_for_pulse
298
+ self.upsample_scale = upsample_scale
299
+
300
+ def _f02uv(self, f0):
301
+ # generate uv signal
302
+ uv = (f0 > self.voiced_threshold).type(torch.float32)
303
+ return uv
304
+
305
+ def _f02sine(self, f0_values):
306
+ """ f0_values: (batchsize, length, dim)
307
+ where dim indicates fundamental tone and overtones
308
+ """
309
+ # convert to F0 in rad. The integer part n can be ignored
310
+ # because 2 * np.pi * n doesn't affect phase
311
+ rad_values = (f0_values / self.sampling_rate) % 1
312
+
313
+ # initial phase noise (no noise for fundamental component)
314
+ rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device)
315
+ rand_ini[:, 0] = 0
316
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
317
+
318
+ # instantaneous phase sine[t] = sin(2*pi * \sum_{i=1}^{t} rad)
319
+ if not self.flag_for_pulse:
320
+ rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
321
+ scale_factor=1 / self.upsample_scale,
322
+ mode="linear").transpose(1, 2)
323
+
324
+ phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
325
+ phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
326
+ scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
327
+ sines = torch.sin(phase)
328
+ else:
329
+ # If necessary, make sure that the first time step of every
330
+ # voiced segments is sin(pi) or cos(0)
331
+ # This is used for pulse-train generation
332
+
333
+ # identify the last time step in unvoiced segments
334
+ uv = self._f02uv(f0_values)
335
+ uv_1 = torch.roll(uv, shifts=-1, dims=1)
336
+ uv_1[:, -1, :] = 1
337
+ u_loc = (uv < 1) * (uv_1 > 0)
338
+
339
+ # get the instantaneous phase
340
+ tmp_cumsum = torch.cumsum(rad_values, dim=1)
341
+ # different batch needs to be processed differently
342
+ for idx in range(f0_values.shape[0]):
343
+ temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
344
+ temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
345
+ # stores the accumulation of i.phase within
346
+ # each voiced segments
347
+ tmp_cumsum[idx, :, :] = 0
348
+ tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
349
+
350
+ # rad_values - tmp_cumsum: remove the accumulation of i.phase
351
+ # within the previous voiced segment.
352
+ i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
353
+
354
+ # get the sines
355
+ sines = torch.cos(i_phase * 2 * np.pi)
356
+ return sines
357
+
358
+ def forward(self, f0):
359
+ """ sine_tensor, uv = forward(f0)
360
+ input F0: tensor(batchsize=1, length, dim=1)
361
+ f0 for unvoiced steps should be 0
362
+ output sine_tensor: tensor(batchsize=1, length, dim)
363
+ output uv: tensor(batchsize=1, length, 1)
364
+ """
365
+ # fundamental component
366
+ fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
367
+
368
+ # generate sine waveforms
369
+ sine_waves = self._f02sine(fn) * self.sine_amp
370
+
371
+ # generate uv signal
372
+ uv = self._f02uv(f0)
373
+
374
+ # noise: for unvoiced should be similar to sine_amp
375
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
376
+ # . for voiced regions is self.noise_std
377
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
378
+ noise = noise_amp * torch.randn_like(sine_waves)
379
+
380
+ # first: set the unvoiced part to 0 by uv
381
+ # then: additive noise
382
+ sine_waves = sine_waves * uv + noise
383
+ return sine_waves, uv, noise
384
+
385
+
386
+ class SourceModuleHnNSF2(torch.nn.Module):
387
+ """ SourceModule for hn-nsf
388
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
389
+ add_noise_std=0.003, voiced_threshod=0)
390
+ sampling_rate: sampling_rate in Hz
391
+ harmonic_num: number of harmonic above F0 (default: 0)
392
+ sine_amp: amplitude of sine source signal (default: 0.1)
393
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
394
+ note that amplitude of noise in unvoiced is decided
395
+ by sine_amp
396
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
397
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
398
+ F0_sampled (batchsize, length, 1)
399
+ Sine_source (batchsize, length, 1)
400
+ noise_source (batchsize, length 1)
401
+ uv (batchsize, length, 1)
402
+ """
403
+
404
+ def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
405
+ add_noise_std=0.003, voiced_threshod=0):
406
+ super(SourceModuleHnNSF2, self).__init__()
407
+
408
+ self.sine_amp = sine_amp
409
+ self.noise_std = add_noise_std
410
+
411
+ # to produce sine waveforms
412
+ self.l_sin_gen = SineGen2(sampling_rate, upsample_scale, harmonic_num,
413
+ sine_amp, add_noise_std, voiced_threshod)
414
+
415
+ # to merge source harmonics into a single excitation
416
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
417
+ self.l_tanh = torch.nn.Tanh()
418
+
419
+ def forward(self, x):
420
+ """
421
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
422
+ F0_sampled (batchsize, length, 1)
423
+ Sine_source (batchsize, length, 1)
424
+ noise_source (batchsize, length 1)
425
+ """
426
+ # source for harmonic branch
427
+ with torch.no_grad():
428
+ sine_wavs, uv, _ = self.l_sin_gen(x)
429
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
430
+
431
+ # source for noise branch, in the same shape as uv
432
+ noise = torch.randn_like(uv) * self.sine_amp / 3
433
+ return sine_merge, noise, uv
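For orientation (not part of the commit): a small sketch of the NSF source module used on the 22.05 kHz branch, driven by a synthetic F0 contour already at sample rate; `upsample_scale` is accepted for signature parity with `SourceModuleHnNSF2` but appears unused here, so the value below is arbitrary.

```python
import torch
from flashcosyvoice.modules.hifigan_components.layers import SourceModuleHnNSF

source = SourceModuleHnNSF(sampling_rate=22050, upsample_scale=480, harmonic_num=8)
f0 = torch.full((1, 22050, 1), 220.0)   # (B, sample_len, 1): constant 220 Hz, voiced
f0[:, 11025:, :] = 0.0                  # second half unvoiced (F0 == 0)
sine, noise, uv = source(f0)
print(sine.shape, uv.shape)             # (1, 22050, 1): merged harmonic excitation and U/V flags
```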
flashcosyvoice/modules/qwen2.py ADDED
@@ -0,0 +1,92 @@
1
+ # Copyright (c) 2025 Tsinghua Univ. (authors: Xingchen Song)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+ from torch import nn
16
+ from transformers import AutoConfig
17
+
18
+ from flashcosyvoice.config import CosyVoice2LLMConfig
19
+ from flashcosyvoice.modules.qwen2_components.layers import (
20
+ ParallelLMHead, Qwen2DecoderLayer, RMSNorm, VocabParallelEmbedding)
21
+
22
+
23
+ class Qwen2Model(nn.Module):
24
+
25
+ def __init__(
26
+ self,
27
+ config: CosyVoice2LLMConfig,
28
+ ):
29
+ super().__init__()
30
+ self.vocab_size = config.vocab_size
31
+ self.embed_tokens = VocabParallelEmbedding(config.vocab_size, config.hidden_size)
32
+ self.layers = nn.ModuleList([Qwen2DecoderLayer(config) for _ in range(config.num_hidden_layers)])
33
+ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
34
+
35
+ def forward(
36
+ self,
37
+ input_ids: torch.Tensor,
38
+ positions: torch.Tensor,
39
+ ) -> torch.Tensor:
40
+ hidden_states = self.embed_tokens(input_ids)
41
+ residual = None
42
+ for layer in self.layers:
43
+ hidden_states, residual = layer(
44
+ positions,
45
+ hidden_states,
46
+ residual,
47
+ )
48
+ hidden_states, _ = self.norm(hidden_states, residual)
49
+ return hidden_states
50
+
51
+
52
+ class Qwen2ForCausalLM(nn.Module):
53
+ packed_modules_mapping = {
54
+ "q_proj": ("qkv_proj", "q"),
55
+ "k_proj": ("qkv_proj", "k"),
56
+ "v_proj": ("qkv_proj", "v"),
57
+ "gate_proj": ("gate_up_proj", 0),
58
+ "up_proj": ("gate_up_proj", 1),
59
+ }
60
+
61
+ def __init__(
62
+ self,
63
+ config: CosyVoice2LLMConfig | AutoConfig
64
+ ):
65
+ super().__init__()
66
+ self.model = Qwen2Model(config)
67
+ if hasattr(config, "speech_vocab_size"):
68
+ self.lm_head = ParallelLMHead(config.speech_vocab_size, config.hidden_size, bias=getattr(config, "lm_head_bias", True))
69
+ self.model_type = "speech_llm"
70
+ else:
71
+ self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, bias=False)
72
+ self.model_type = "text_llm"
73
+ self.tie_word_embeddings = config.tie_word_embeddings
74
+ if self.tie_word_embeddings:
75
+ if self.model_type == "speech_llm":
76
+ assert config.vocab_size == config.speech_vocab_size, "vocab_size and speech_vocab_size must be the same when tie_word_embeddings is True"
77
+ self.lm_head.weight.data = self.model.embed_tokens.weight.data
78
+
79
+ def forward(
80
+ self,
81
+ input_ids: torch.Tensor,
82
+ positions: torch.Tensor,
83
+ ) -> torch.Tensor:
84
+ hidden_states = self.model(input_ids, positions)
85
+ return hidden_states
86
+
87
+ def compute_logits(
88
+ self,
89
+ hidden_states: torch.Tensor,
90
+ ) -> torch.Tensor:
91
+ logits = self.lm_head(hidden_states)
92
+ return logits
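For orientation (not part of the commit): a construction-only sketch of the LLM wrapper. It assumes `CosyVoice2LLMConfig` can be instantiated with its defaults and that the CUDA-side dependencies (`flash_attn`, `triton`) are installed; running an actual forward pass additionally requires the engine to set up the KV-cache context via `flashcosyvoice.utils.context`.

```python
from flashcosyvoice.config import CosyVoice2LLMConfig
from flashcosyvoice.modules.qwen2 import Qwen2ForCausalLM

config = CosyVoice2LLMConfig()   # assumed: Qwen2 dims plus speech_vocab_size for the speech head
llm = Qwen2ForCausalLM(config)
print(llm.model_type)            # "speech_llm" when the config defines speech_vocab_size
print(llm.lm_head.weight.shape)  # (speech_vocab_size, hidden_size); forward() returns hidden states,
                                 # compute_logits() projects them through this head
```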
flashcosyvoice/modules/qwen2_components/__init__.py ADDED
File without changes
flashcosyvoice/modules/qwen2_components/layers.py ADDED
@@ -0,0 +1,616 @@
1
+ from functools import lru_cache
2
+
3
+ import torch
4
+ import torch.distributed as dist
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ import triton
8
+ import triton.language as tl
9
+ from flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
10
+
11
+ from flashcosyvoice.config import CosyVoice2LLMConfig
12
+ from flashcosyvoice.utils.context import get_context
13
+
14
+
15
+ class SiluAndMul(nn.Module):
16
+
17
+ def __init__(self):
18
+ super().__init__()
19
+
20
+ @torch.compile
21
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
22
+ x, y = x.chunk(2, -1)
23
+ return F.silu(x) * y
24
+
25
+
26
+ class RMSNorm(nn.Module):
27
+
28
+ def __init__(
29
+ self,
30
+ hidden_size: int,
31
+ eps: float = 1e-6,
32
+ ) -> None:
33
+ super().__init__()
34
+ self.hidden_size = hidden_size
35
+ self.eps = eps
36
+ self.weight = nn.Parameter(torch.ones(hidden_size))
37
+
38
+ @torch.compile
39
+ def rms_forward(
40
+ self,
41
+ x: torch.Tensor,
42
+ ) -> torch.Tensor:
43
+ orig_dtype = x.dtype
44
+ x = x.to(torch.float32)
45
+ var = x.pow(2).mean(dim=-1, keepdim=True)
46
+ x.mul_(torch.rsqrt(var + self.eps))
47
+ x = x.to(orig_dtype).mul_(self.weight)
48
+ return x
49
+
50
+ @torch.compile
51
+ def add_rms_forward(
52
+ self,
53
+ x: torch.Tensor,
54
+ residual: torch.Tensor,
55
+ ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
56
+ orig_dtype = x.dtype
57
+ x = x.to(torch.float32).add_(residual.to(torch.float32))
58
+ residual = x.to(orig_dtype)
59
+ var = x.pow(2).mean(dim=-1, keepdim=True)
60
+ x.mul_(torch.rsqrt(var + self.eps))
61
+ x = x.to(orig_dtype).mul_(self.weight)
62
+ return x, residual
63
+
64
+ def forward(
65
+ self,
66
+ x: torch.Tensor,
67
+ residual: torch.Tensor | None = None,
68
+ ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
69
+ if residual is None:
70
+ return self.rms_forward(x)
71
+ else:
72
+ return self.add_rms_forward(x, residual)
73
+
74
+
75
+ @triton.jit
76
+ def store_kvcache_kernel(
77
+ key_ptr,
78
+ key_stride,
79
+ value_ptr,
80
+ value_stride,
81
+ k_cache_ptr,
82
+ v_cache_ptr,
83
+ slot_mapping_ptr,
84
+ D: tl.constexpr,
85
+ ):
86
+ idx = tl.program_id(0)
87
+ key_offsets = idx * key_stride + tl.arange(0, D)
88
+ value_offsets = idx * value_stride + tl.arange(0, D)
89
+ key = tl.load(key_ptr + key_offsets)
90
+ value = tl.load(value_ptr + value_offsets)
91
+ slot = tl.load(slot_mapping_ptr + idx)
92
+ cache_offsets = slot * D + tl.arange(0, D)
93
+ tl.store(k_cache_ptr + cache_offsets, key)
94
+ tl.store(v_cache_ptr + cache_offsets, value)
95
+
96
+
97
+ def store_kvcache(key: torch.Tensor, value: torch.Tensor, k_cache: torch.Tensor, v_cache: torch.Tensor, slot_mapping: torch.Tensor):
98
+ N, num_heads, head_dim = key.shape
99
+ D = num_heads * head_dim
100
+ assert key.stride(-1) == 1 and value.stride(-1) == 1
101
+ assert key.stride(1) == head_dim and value.stride(1) == head_dim
102
+ assert k_cache.stride(1) == D and v_cache.stride(1) == D
103
+ assert slot_mapping.numel() == N
104
+ store_kvcache_kernel[(N,)](key, key.stride(0), value, value.stride(0), k_cache, v_cache, slot_mapping, D)
105
+
106
+
107
+ class Attention(nn.Module):
108
+
109
+ def __init__(
110
+ self,
111
+ num_heads,
112
+ head_dim,
113
+ scale,
114
+ num_kv_heads,
115
+ ):
116
+ super().__init__()
117
+ self.num_heads = num_heads
118
+ self.head_dim = head_dim
119
+ self.scale = scale
120
+ self.num_kv_heads = num_kv_heads
121
+ self.k_cache = self.v_cache = torch.tensor([])
122
+
123
+ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
124
+ o: torch.Tensor
125
+ q = q.view(-1, self.num_heads, self.head_dim)
126
+ k = k.view(-1, self.num_kv_heads, self.head_dim)
127
+ v = v.view(-1, self.num_kv_heads, self.head_dim)
128
+ context = get_context()
129
+ k_cache, v_cache = self.k_cache, self.v_cache
130
+ if k_cache.numel() and v_cache.numel():
131
+ store_kvcache(k, v, k_cache, v_cache, context.slot_mapping)
132
+ if context.is_prefill:
133
+ if context.block_tables is not None: # prefix cache
134
+ k, v = k_cache, v_cache
135
+ o = flash_attn_varlen_func(q, k, v,
136
+ max_seqlen_q=context.max_seqlen_q, cu_seqlens_q=context.cu_seqlens_q,
137
+ max_seqlen_k=context.max_seqlen_k, cu_seqlens_k=context.cu_seqlens_k,
138
+ softmax_scale=self.scale, causal=True, block_table=context.block_tables)
139
+ else: # decode
140
+ o = flash_attn_with_kvcache(q.unsqueeze(1), k_cache, v_cache,
141
+ cache_seqlens=context.context_lens, block_table=context.block_tables,
142
+ softmax_scale=self.scale, causal=True)
143
+ o = o.view(-1, self.num_heads * self.head_dim)
144
+ return o
145
+
146
+
147
+ class VocabParallelEmbedding(nn.Module):
148
+
149
+ def __init__(
150
+ self,
151
+ num_embeddings: int,
152
+ embedding_dim: int,
153
+ ):
154
+ super().__init__()
155
+ # TODO(xcsong): support tp > 1
156
+ self.tp_rank = 0 # dist.get_rank()
157
+ self.tp_size = 1 # dist.get_world_size()
158
+ assert num_embeddings % self.tp_size == 0
159
+ self.num_embeddings = num_embeddings
160
+ self.num_embeddings_per_partition = self.num_embeddings // self.tp_size
161
+ self.vocab_start_idx = self.num_embeddings_per_partition * self.tp_rank
162
+ self.vocab_end_idx = self.vocab_start_idx + self.num_embeddings_per_partition
163
+ self.embedding_dim = embedding_dim
164
+ self.weight = nn.Parameter(torch.empty(self.num_embeddings_per_partition, embedding_dim))
165
+ self.weight.weight_loader = self.weight_loader
166
+
167
+ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
168
+ param_data = param.data
169
+ shard_size = param_data.size(0)
170
+ start_idx = self.tp_rank * shard_size
171
+ loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
172
+ assert param_data.size() == loaded_weight.size()
173
+ param_data.copy_(loaded_weight)
174
+
175
+ def forward(self, x: torch.Tensor):
176
+ if self.tp_size > 1:
177
+ mask = (x >= self.vocab_start_idx) & (x < self.vocab_end_idx)
178
+ x = mask * (x - self.vocab_start_idx)
179
+ y = F.embedding(x, self.weight)
180
+ if self.tp_size > 1:
181
+ y = mask.unsqueeze(1) * y
182
+ dist.all_reduce(y)
183
+ return y
184
+
185
+
186
+ class ParallelLMHead(VocabParallelEmbedding):
187
+
188
+ def __init__(
189
+ self,
190
+ num_embeddings: int,
191
+ embedding_dim: int,
192
+ bias: bool = False,
193
+ ):
194
+ super().__init__(num_embeddings, embedding_dim)
195
+ if bias:
196
+ self.bias = nn.Parameter(torch.empty(self.num_embeddings_per_partition))
197
+ self.bias.weight_loader = self.weight_loader
198
+ else:
199
+ self.register_parameter("bias", None)
200
+
201
+ def forward(self, x: torch.Tensor):
202
+ context = get_context()
203
+ if context.is_prefill:
204
+ last_indices = context.cu_seqlens_q[1:] - 1
205
+ x = x[last_indices].contiguous()
206
+ logits = F.linear(x, self.weight, self.bias)
207
+ if self.tp_size > 1:
208
+ all_logits = [torch.empty_like(logits) for _ in range(self.tp_size)] if self.tp_rank == 0 else None
209
+ dist.gather(logits, all_logits, 0)
210
+ logits = torch.cat(all_logits, -1) if self.tp_rank == 0 else None
211
+ return logits
212
+
213
+
214
+ def divide(numerator, denominator):
215
+ assert numerator % denominator == 0
216
+ return numerator // denominator
217
+
218
+
219
+ class LinearBase(nn.Module):
220
+
221
+ def __init__(
222
+ self,
223
+ input_size: int,
224
+ output_size: int,
225
+ tp_dim: int | None = None,
226
+ ):
227
+ super().__init__()
228
+ self.input_size = input_size
229
+ self.output_size = output_size
230
+ self.tp_dim = tp_dim
231
+ # TODO(xcsong): support tp > 1
232
+ self.tp_rank = 0 # dist.get_rank()
233
+ self.tp_size = 1 # dist.get_world_size()
234
+
235
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
236
+ raise NotImplementedError
237
+
238
+
239
+ class ReplicatedLinear(LinearBase):
240
+
241
+ def __init__(
242
+ self,
243
+ input_size: int,
244
+ output_size: int,
245
+ bias: bool = False,
246
+ ):
247
+ super().__init__(input_size, output_size)
248
+ self.weight = nn.Parameter(torch.empty(self.output_size, self.input_size))
249
+ self.weight.weight_loader = self.weight_loader
250
+ if bias:
251
+ self.bias = nn.Parameter(torch.empty(self.output_size))
252
+ self.bias.weight_loader = self.weight_loader
253
+ else:
254
+ self.register_parameter("bias", None)
255
+
256
+ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
257
+ assert param.size() == loaded_weight.size()
258
+ param.data.copy_(loaded_weight)
259
+
260
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
261
+ return F.linear(x, self.weight, self.bias)
262
+
263
+
264
+ class ColumnParallelLinear(LinearBase):
265
+
266
+ def __init__(
267
+ self,
268
+ input_size: int,
269
+ output_size: int,
270
+ bias: bool = False,
271
+ ):
272
+ super().__init__(input_size, output_size, 0)
273
+ self.input_size_per_partition = input_size
274
+ self.output_size_per_partition = divide(output_size, self.tp_size)
275
+ self.output_partition_sizes = [self.output_size_per_partition]
276
+ if hasattr(self, "output_sizes"):
277
+ self.output_partition_sizes = [
278
+ divide(output_size, self.tp_size)
279
+ for output_size in self.output_sizes
280
+ ]
281
+
282
+ self.weight = nn.Parameter(torch.empty(self.output_size_per_partition, self.input_size))
283
+ self.weight.weight_loader = self.weight_loader
284
+ if bias:
285
+ self.bias = nn.Parameter(torch.empty(self.output_size_per_partition))
286
+ self.bias.weight_loader = self.weight_loader
287
+ else:
288
+ self.register_parameter("bias", None)
289
+
290
+ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
291
+ param_data = param.data
292
+ shard_size = param_data.size(self.tp_dim)
293
+ start_idx = self.tp_rank * shard_size
294
+ loaded_weight = loaded_weight.narrow(self.tp_dim, start_idx, shard_size)
295
+ assert param_data.size() == loaded_weight.size()
296
+ param_data.copy_(loaded_weight)
297
+
298
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
299
+ return F.linear(x, self.weight, self.bias)
300
+
301
+
302
+ class MergedColumnParallelLinear(ColumnParallelLinear):
303
+
304
+ def __init__(
305
+ self,
306
+ input_size: int,
307
+ output_sizes: list[int],
308
+ bias: bool = False,
309
+ ):
310
+ self.output_sizes = output_sizes
311
+ super().__init__(input_size, sum(output_sizes), bias=bias)
312
+
313
+ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, loaded_shard_id: int):
314
+ param_data = param.data
315
+ shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size
316
+ shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
317
+ param_data = param_data.narrow(self.tp_dim, shard_offset, shard_size)
318
+ loaded_weight = loaded_weight.chunk(self.tp_size, self.tp_dim)[self.tp_rank]
319
+ assert param_data.size() == loaded_weight.size()
320
+ param_data.copy_(loaded_weight)
321
+
322
+
323
+ class QKVParallelLinear(ColumnParallelLinear):
324
+
325
+ def __init__(
326
+ self,
327
+ hidden_size: int,
328
+ head_size: int,
329
+ total_num_heads: int,
330
+ total_num_kv_heads: int | None = None,
331
+ bias: bool = False,
332
+ ):
333
+ self.hidden_size = hidden_size
334
+ self.head_size = head_size
335
+ self.total_num_heads = total_num_heads
336
+ if total_num_kv_heads is None:
337
+ total_num_kv_heads = total_num_heads
338
+ self.total_num_kv_heads = total_num_kv_heads
339
+ # TODO(xcsong): support tp > 1
340
+ tp_size = 1 # dist.get_world_size()
341
+ self.num_heads = divide(self.total_num_heads, tp_size)
342
+ self.num_kv_heads = divide(self.total_num_kv_heads, tp_size)
343
+ input_size = self.hidden_size
344
+ output_size = (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size
345
+ self.output_sizes = [
346
+ self.num_heads * self.head_size * tp_size, # q_proj
347
+ self.num_kv_heads * self.head_size * tp_size, # k_proj
348
+ self.num_kv_heads * self.head_size * tp_size, # v_proj
349
+ ]
350
+
351
+ super().__init__(input_size, output_size, bias)
352
+
353
+ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, loaded_shard_id: str):
354
+ param_data = param.data
355
+ assert loaded_shard_id in ["q", "k", "v"]
356
+ if loaded_shard_id == "q":
357
+ shard_size = self.num_heads * self.head_size
358
+ shard_offset = 0
359
+ elif loaded_shard_id == "k":
360
+ shard_size = self.num_kv_heads * self.head_size
361
+ shard_offset = self.num_heads * self.head_size
362
+ else:
363
+ shard_size = self.num_kv_heads * self.head_size
364
+ shard_offset = self.num_heads * self.head_size + self.num_kv_heads * self.head_size
365
+ param_data = param_data.narrow(self.tp_dim, shard_offset, shard_size)
366
+ loaded_weight = loaded_weight.chunk(self.tp_size, self.tp_dim)[self.tp_rank]
367
+ assert param_data.size() == loaded_weight.size()
368
+ param_data.copy_(loaded_weight)
369
+
370
+
371
+ class RowParallelLinear(LinearBase):
372
+
373
+ def __init__(
374
+ self,
375
+ input_size: int,
376
+ output_size: int,
377
+ bias: bool = False,
378
+ ):
379
+ super().__init__(input_size, output_size, 1)
380
+ self.input_size_per_partition = divide(input_size, self.tp_size)
381
+ self.output_size_per_partition = output_size
382
+ self.output_partition_sizes = [output_size]
383
+
384
+ self.weight = nn.Parameter(torch.empty(self.output_size, self.input_size_per_partition))
385
+ self.weight.weight_loader = self.weight_loader
386
+ if bias:
387
+ self.bias = nn.Parameter(torch.empty(self.output_size))
388
+ self.bias.weight_loader = self.weight_loader
389
+ else:
390
+ self.register_parameter("bias", None)
391
+
392
+ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
393
+ param_data = param.data
394
+ shard_size = param_data.size(self.tp_dim)
395
+ start_idx = self.tp_rank * shard_size
396
+ loaded_weight = loaded_weight.narrow(self.tp_dim, start_idx, shard_size)
397
+ assert param_data.size() == loaded_weight.size()
398
+ param_data.copy_(loaded_weight)
399
+
400
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
401
+ y = F.linear(x, self.weight, self.bias if self.tp_rank == 0 else None)
402
+ if self.tp_size > 1:
403
+ dist.all_reduce(y)
404
+ return y
405
+
406
+
407
+ def apply_rotary_emb(
408
+ x: torch.Tensor,
409
+ cos: torch.Tensor,
410
+ sin: torch.Tensor,
411
+ ) -> torch.Tensor:
412
+ cos = cos.unsqueeze(-2)
413
+ sin = sin.unsqueeze(-2)
414
+ x1, x2 = torch.chunk(x.to(torch.float32), 2, dim=-1)
415
+ y1 = x1 * cos - x2 * sin
416
+ y2 = x2 * cos + x1 * sin
417
+ return torch.cat((y1, y2), dim=-1).to(x.dtype)
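apply_rotary_emb above uses the half-split RoPE formulation: the head dimension is cut into two halves and each (x1[i], x2[i]) pair is rotated by a position-dependent angle, which leaves vector norms unchanged. A minimal standalone sketch of that property on toy shapes (no head dimension, purely illustrative):

    import torch

    def rotate_half_split(x, cos, sin):
        # Same math as apply_rotary_emb, restated for a 2-D toy tensor.
        x1, x2 = torch.chunk(x.float(), 2, dim=-1)
        return torch.cat((x1 * cos - x2 * sin, x2 * cos + x1 * sin), dim=-1)

    head_dim = 8
    pos = torch.arange(4, dtype=torch.float)                              # four toy positions
    inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
    freqs = torch.einsum("i,j->ij", pos, inv_freq)                        # [4, head_dim // 2]
    cos, sin = freqs.cos(), freqs.sin()

    q = torch.randn(4, head_dim)
    q_rot = rotate_half_split(q, cos, sin)
    assert torch.allclose(q.norm(dim=-1), q_rot.norm(dim=-1), atol=1e-5)  # rotation preserves norm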
418
+
419
+
420
+ class RotaryEmbedding(nn.Module):
421
+
422
+ def __init__(
423
+ self,
424
+ head_size: int,
425
+ rotary_dim: int,
426
+ max_position_embeddings: int,
427
+ base: float,
428
+ ) -> None:
429
+ super().__init__()
430
+ self.head_size = head_size
431
+ assert rotary_dim == head_size
432
+ inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2, dtype=torch.float) / rotary_dim))
433
+ t = torch.arange(max_position_embeddings, dtype=torch.float)
434
+ freqs = torch.einsum("i,j -> ij", t, inv_freq)
435
+ cos = freqs.cos()
436
+ sin = freqs.sin()
437
+ cache = torch.cat((cos, sin), dim=-1)
438
+ self.register_buffer("cos_sin_cache", cache, persistent=False)
439
+
440
+ @torch.compile
441
+ def forward(
442
+ self,
443
+ positions: torch.Tensor,
444
+ query: torch.Tensor,
445
+ key: torch.Tensor,
446
+ ) -> tuple[torch.Tensor, torch.Tensor]:
447
+ positions = positions.flatten()
448
+ num_tokens = positions.shape[0]
449
+ cos_sin = self.cos_sin_cache[positions]
450
+ cos, sin = cos_sin.chunk(2, dim=-1)
451
+ query_shape = query.shape
452
+ query = query.view(num_tokens, -1, self.head_size)
453
+ query = apply_rotary_emb(query, cos, sin).view(query_shape)
454
+ key_shape = key.shape
455
+ key = key.view(num_tokens, -1, self.head_size)
456
+ key = apply_rotary_emb(key, cos, sin).view(key_shape)
457
+ return query, key
458
+
459
+
460
+ @lru_cache(1)
461
+ def get_rope(
462
+ head_size: int,
463
+ rotary_dim: int,
464
+ max_position: int,
465
+ base: float,
466
+ rope_scaling: dict | None = None,
467
+ ):
468
+ assert rope_scaling is None
469
+ rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base)
470
+ return rotary_emb
471
+
472
+
473
+ class Qwen2Attention(nn.Module):
474
+
475
+ def __init__(
476
+ self,
477
+ hidden_size: int,
478
+ num_heads: int,
479
+ num_kv_heads: int,
480
+ max_position: int = 4096 * 32,
481
+ head_dim: int | None = None,
482
+ rms_norm_eps: float = 1e-06,
483
+ qkv_bias: bool = True,
484
+ rope_theta: float = 1000000.0,
485
+ rope_scaling: tuple | None = None,
486
+ ) -> None:
487
+ super().__init__()
488
+ self.hidden_size = hidden_size
489
+ # TODO(xcsong): support tp > 1
490
+ tp_size = 1 # dist.get_world_size()
491
+ self.total_num_heads = num_heads
492
+ assert self.total_num_heads % tp_size == 0
493
+ self.num_heads = self.total_num_heads // tp_size
494
+ self.total_num_kv_heads = num_kv_heads
495
+ assert self.total_num_kv_heads % tp_size == 0
496
+ self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
497
+ self.head_dim = head_dim or hidden_size // self.total_num_heads
498
+ self.q_size = self.num_heads * self.head_dim
499
+ self.kv_size = self.num_kv_heads * self.head_dim
500
+ self.scaling = self.head_dim**-0.5
501
+ self.rope_theta = rope_theta
502
+
503
+ self.qkv_proj = QKVParallelLinear(
504
+ hidden_size,
505
+ self.head_dim,
506
+ self.total_num_heads,
507
+ self.total_num_kv_heads,
508
+ bias=qkv_bias,
509
+ )
510
+ self.o_proj = RowParallelLinear(
511
+ self.total_num_heads * self.head_dim,
512
+ hidden_size,
513
+ bias=False,
514
+ )
515
+
516
+ self.rotary_emb = get_rope(
517
+ self.head_dim,
518
+ rotary_dim=self.head_dim,
519
+ max_position=max_position,
520
+ base=self.rope_theta,
521
+ rope_scaling=rope_scaling,
522
+ )
523
+ self.attn = Attention(self.num_heads,
524
+ self.head_dim,
525
+ self.scaling,
526
+ num_kv_heads=self.num_kv_heads)
527
+
528
+ def forward(
529
+ self,
530
+ positions: torch.Tensor,
531
+ hidden_states: torch.Tensor,
532
+ ) -> torch.Tensor:
533
+ qkv = self.qkv_proj(hidden_states)
534
+ q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
535
+ q, k = self.rotary_emb(positions, q, k)
536
+ o = self.attn(q, k, v)
537
+ output = self.o_proj(o)
538
+ return output
539
+
540
+
541
+ class Qwen2MLP(nn.Module):
542
+
543
+ def __init__(
544
+ self,
545
+ hidden_size: int,
546
+ intermediate_size: int,
547
+ hidden_act: str,
548
+ ) -> None:
549
+ super().__init__()
550
+ self.gate_up_proj = MergedColumnParallelLinear(
551
+ hidden_size,
552
+ [intermediate_size] * 2,
553
+ bias=False,
554
+ )
555
+ self.down_proj = RowParallelLinear(
556
+ intermediate_size,
557
+ hidden_size,
558
+ bias=False,
559
+ )
560
+ assert hidden_act == "silu"
561
+ self.act_fn = SiluAndMul()
562
+
563
+ def forward(self, x):
564
+ gate_up = self.gate_up_proj(x)
565
+ x = self.act_fn(gate_up)
566
+ x = self.down_proj(x)
567
+ return x
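Qwen2MLP is the usual SwiGLU block: gate_up_proj emits the concatenation [gate, up], the activation computes silu(gate) * up, and down_proj maps back to hidden_size. SiluAndMul itself is defined elsewhere in this repo; the toy restatement below (with an arbitrary intermediate size) only shows the split-and-gate step:

    import torch
    import torch.nn.functional as F

    def silu_and_mul(gate_up: torch.Tensor) -> torch.Tensor:
        # Split the merged projection back into its halves and apply the SwiGLU gate.
        gate, up = gate_up.chunk(2, dim=-1)
        return F.silu(gate) * up

    x = torch.randn(3, 2 * 4864)   # toy [num_tokens, 2 * intermediate_size]
    print(silu_and_mul(x).shape)   # torch.Size([3, 4864])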
568
+
569
+
570
+ class Qwen2DecoderLayer(nn.Module):
571
+
572
+ def __init__(
573
+ self,
574
+ config: CosyVoice2LLMConfig,
575
+ ) -> None:
576
+ super().__init__()
577
+ self.hidden_size = config.hidden_size
578
+ self.self_attn = Qwen2Attention(
579
+ hidden_size=self.hidden_size,
580
+ num_heads=config.num_attention_heads,
581
+ num_kv_heads=config.num_key_value_heads,
582
+ max_position=config.max_position_embeddings,
583
+ rms_norm_eps=config.rms_norm_eps,
584
+ qkv_bias=getattr(config, "qkv_bias", True),
585
+ head_dim=getattr(config, "head_dim", None),
586
+ rope_theta=getattr(config, "rope_theta", 1000000.0),
587
+ rope_scaling=getattr(config, "rope_scaling", None),
588
+ )
589
+ self.mlp = Qwen2MLP(
590
+ hidden_size=config.hidden_size,
591
+ intermediate_size=config.intermediate_size,
592
+ hidden_act=config.hidden_act,
593
+ )
594
+ self.input_layernorm = RMSNorm(config.hidden_size,
595
+ eps=config.rms_norm_eps)
596
+ self.post_attention_layernorm = RMSNorm(config.hidden_size,
597
+ eps=config.rms_norm_eps)
598
+
599
+ def forward(
600
+ self,
601
+ positions: torch.Tensor,
602
+ hidden_states: torch.Tensor,
603
+ residual: torch.Tensor | None,
604
+ ) -> tuple[torch.Tensor, torch.Tensor]:
605
+ if residual is None:
606
+ residual = hidden_states
607
+ hidden_states = self.input_layernorm(hidden_states)
608
+ else:
609
+ hidden_states, residual = self.input_layernorm(hidden_states, residual)
610
+ hidden_states = self.self_attn(
611
+ positions=positions,
612
+ hidden_states=hidden_states,
613
+ )
614
+ hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
615
+ hidden_states = self.mlp(hidden_states)
616
+ return hidden_states, residual
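The (hidden_states, residual) pair threaded through forward is the fused add-and-norm pre-norm pattern: when RMSNorm is called with a residual, it is assumed to return (norm(x + residual), x + residual), so the residual addition never appears as a separate op between sublayers. The real RMSNorm lives elsewhere in this repo; a reference sketch of the assumed two-argument contract:

    import torch
    from torch import nn

    class ToyRMSNorm(nn.Module):
        # Reference behaviour assumed for the two-argument RMSNorm used by Qwen2DecoderLayer.
        def __init__(self, dim: int, eps: float = 1e-6):
            super().__init__()
            self.eps = eps
            self.weight = nn.Parameter(torch.ones(dim))

        def forward(self, x: torch.Tensor, residual: torch.Tensor | None = None):
            if residual is not None:
                x = x + residual   # fused residual add
                residual = x       # carried to the next sublayer
            norm = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) * self.weight
            return norm if residual is None else (norm, residual)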
flashcosyvoice/modules/sampler.py ADDED
@@ -0,0 +1,231 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+
5
+ class Sampler(nn.Module):
6
+ """
7
+ Optimized sampler that replaces per-sequence Python loops with vectorized tensor operations, significantly improving throughput
8
+
9
+ Performance optimizations:
10
+ 1. Using batch processing instead of sequence loops, reducing Python loop overhead
11
+ 2. Using PyTorch's vectorized operations (like torch.sort, torch.gather) for parallel computation
12
+ 3. Using mask operations to apply top-k filtering at once, avoiding per-sequence processing
13
+ """
14
+ def __init__(self):
15
+ super().__init__()
16
+
17
+ def forward(self, logits: torch.Tensor, temperatures: torch.Tensor, top_k: int | None = None):
18
+ """
19
+ Perform sampling with vectorized top-k filtering
20
+
21
+ Args:
22
+ logits: Logits tensor with shape [batch_size, vocab_size]
23
+ temperatures: Temperature parameters with shape [batch_size]
24
+ top_k: Top-k value for filtering (uniform across all sequences)
25
+
26
+ Returns:
27
+ Sampled token IDs
28
+ """
29
+ logits = logits.to(torch.float)
30
+ greedy_tokens = logits.argmax(dim=-1) # Greedy decoding result, used when temperature=0
31
+ logits.div_(temperatures.unsqueeze(dim=1)) # Apply temperature scaling
32
+
33
+ # Apply uniform top-k filtering if top_k is provided
34
+ if top_k is not None and top_k > 0:
35
+ vocab_size = logits.size(-1)
36
+
37
+ # A boolean keep-mask is derived below from each sequence's top-k threshold
39
+
40
+ # Batch sorting for all sequences at once
41
+ sorted_logits, sorted_indices = torch.sort(logits, dim=-1, descending=True)
42
+
43
+ # Get threshold for each sequence (the k-th largest value)
44
+ k_value = min(top_k, vocab_size) # Ensure k doesn't exceed vocab size
45
+ thresholds = sorted_logits[:, k_value-1:k_value] # Shape [batch_size, 1]
46
+ thresholds = thresholds.expand(-1, vocab_size) # Expand to match logits shape
47
+
48
+ # Create mask: only keep logits greater than or equal to threshold
49
+ mask = logits >= thresholds
50
+
51
+ # Apply mask: set logits not in top-k to negative infinity
52
+ logits = torch.where(mask, logits, torch.tensor(float('-inf'), device=logits.device))
53
+
54
+ probs = torch.softmax(logits, dim=-1, dtype=torch.float)
55
+ # logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)
56
+ sample_tokens = probs.div_(torch.empty_like(probs).exponential_(1)).argmax(dim=-1)
57
+ return torch.where(temperatures == 0, greedy_tokens, sample_tokens)
58
+
59
+
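The line probs.div_(torch.empty_like(probs).exponential_(1)).argmax(dim=-1) above is the exponential-race form of the Gumbel-max trick: dividing each probability by an i.i.d. Exponential(1) draw and taking the argmax samples exactly from the categorical distribution, with no explicit multinomial call. A quick empirical check on a toy distribution (illustration only):

    import torch

    torch.manual_seed(0)
    probs = torch.tensor([0.1, 0.2, 0.7])
    n = 200_000

    # Exponential-race sampling, as in Sampler.forward.
    noise = torch.empty(n, 3).exponential_(1)
    samples = (probs / noise).argmax(dim=-1)

    freq = torch.bincount(samples, minlength=3).float() / n
    print(freq)  # approximately tensor([0.10, 0.20, 0.70]), up to Monte Carlo error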
60
+ class RasSampler(nn.Module):
61
+ """
62
+ Optimized Repetition Aware Sampling implementation
63
+
64
+ Performance optimizations:
65
+ 1. Using vectorized nucleus sampling instead of loop implementation, improving sampling efficiency
66
+ 2. Using tensor operations to calculate repetition rate, reducing Python loop overhead
67
+ 3. Optimizing EOS handling logic, reducing unnecessary resampling
68
+ 4. Using PyTorch's vectorized operations for parallel computation
69
+ 5. Batch processing for all sequences, dramatically improving throughput
70
+ 6. Robust handling for sequences of any length, including empty sequences
71
+ """
72
+ def __init__(self):
73
+ super().__init__()
74
+
75
+ def forward(self, logits: torch.Tensor, decoded_tokens_list: list,
76
+ win_size: int = 10, tau_r: float = 0.1,
77
+ top_p: float = 0.8, top_k: int = 25,
78
+ eos_token: int = 6561, min_tokens: list[int] | None = None):
79
+ """
80
+ Execute repetition-aware sampling with batched, vectorized operations
81
+
82
+ Args:
83
+ logits: Input logits with shape [batch_size, vocab_size]
84
+ decoded_tokens_list: List of decoded tokens, one token list per batch item
85
+ win_size: Window size for repetition detection (uniform across all batch items)
86
+ tau_r: Repetition threshold (uniform across all batch items)
87
+ top_p: Nucleus sampling probability threshold (uniform across all batch items)
88
+ top_k: Nucleus sampling top-k threshold (uniform across all batch items)
89
+ eos_token: End of sequence token ID (uniform across all batch items)
90
+ min_tokens: List of minimum tokens to generate before allowing EOS, one per batch item
91
+ Returns:
92
+ Selected token IDs
93
+ """
94
+ batch_size = logits.size(0)
95
+ device = logits.device
96
+ result = torch.zeros(batch_size, dtype=torch.long, device=device)
97
+
98
+ # Set default values if not provided
99
+ if min_tokens is None:
100
+ min_tokens = [2] * batch_size
101
+
102
+ # Ensure min_tokens list has the correct length
103
+ assert len(min_tokens) == batch_size, f"min_tokens length {len(min_tokens)} != batch_size {batch_size}"
104
+
105
+ # Force continue decode first token
106
+ for i in range(batch_size):
107
+ if i < len(decoded_tokens_list) and len(decoded_tokens_list[i]) == 0:
108
+ logits[i, eos_token] = -float('inf')
109
+
110
+ # 1. First, perform nucleus sampling for all sequences
111
+ probs = torch.softmax(logits, dim=-1)
112
+
113
+ # Use vectorized nucleus sampling for all sequences
114
+ # This can be done in batch since top_p and top_k are uniform
115
+ sorted_probs, sorted_indices = probs.sort(dim=-1, descending=True)
116
+ cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
117
+
118
+ # Create masks for top-p and top-k filtering
119
+ top_p_mask = cumulative_probs <= top_p
120
+
121
+ # Create top-k mask (first top_k positions are True)
122
+ top_k_mask = torch.zeros_like(top_p_mask)
123
+ top_k_mask[:, :top_k] = True
124
+
125
+ # Combine masks
126
+ mask = top_p_mask & top_k_mask
127
+
128
+ # Ensure at least one token is selected per sequence
129
+ first_token_mask = torch.zeros_like(mask)
130
+ first_token_mask[:, 0] = True
131
+ mask = mask | first_token_mask
132
+
133
+ # Sample from the filtered distribution
134
+ sample_probs = torch.where(mask, sorted_probs, torch.zeros_like(sorted_probs))
135
+ sample_probs = sample_probs / sample_probs.sum(dim=-1, keepdim=True)
136
+
137
+ # Sample indices from the filtered distribution
138
+ sampled_indices = torch.multinomial(sample_probs, 1).squeeze(-1)
139
+ top_ids = torch.gather(sorted_indices, -1, sampled_indices.unsqueeze(-1)).squeeze(-1)
140
+
141
+ # 2. Check for repetitions and apply random sampling if needed
142
+ # Extract recent tokens for each sequence, handling empty or short sequences
143
+ recent_tokens_list = []
144
+ for i in range(batch_size):
145
+ # Handle index out of range or empty tokens
146
+ if i < len(decoded_tokens_list):
147
+ tokens = decoded_tokens_list[i]
148
+ if len(tokens) > 0:
149
+ start_idx = max(0, len(tokens) - win_size)
150
+ recent_tokens_list.append(tokens[start_idx:])
151
+ else:
152
+ recent_tokens_list.append([]) # Empty list for empty tokens
153
+ else:
154
+ recent_tokens_list.append([]) # Empty list for missing batch items
155
+
156
+ # Check if we have any tokens to process for repetition detection
157
+ if any(len(tokens) > 0 for tokens in recent_tokens_list):
158
+ # Convert to padded tensor for batch processing
159
+ max_recent_len = max(len(tokens) for tokens in recent_tokens_list)
160
+ if max_recent_len > 0: # Only proceed if we have tokens
161
+ recent_tokens_tensor = torch.zeros((batch_size, max_recent_len), dtype=torch.long, device=device) - 1
162
+ for i, tokens in enumerate(recent_tokens_list):
163
+ if len(tokens) > 0:
164
+ recent_tokens_tensor[i, -len(tokens):] = torch.tensor(tokens, device=device)
165
+
166
+ # Create a mask for valid positions and to avoid division by zero
167
+ valid_positions_mask = torch.zeros_like(recent_tokens_tensor, dtype=torch.bool)
168
+ for i, tokens in enumerate(recent_tokens_list):
169
+ if len(tokens) > 0:
170
+ valid_positions_mask[i, -len(tokens):] = True
171
+
172
+ # Check repetition rates
173
+ repetition_counts = torch.zeros(batch_size, device=device)
174
+ for i in range(batch_size):
175
+ if len(recent_tokens_list[i]) > 0:
176
+ repetition_counts[i] = (recent_tokens_tensor[i] == top_ids[i]).sum()
177
+
178
+ # Calculate repetition rates, avoiding division by zero
179
+ recent_lengths = torch.tensor([max(1, len(tokens)) for tokens in recent_tokens_list], device=device)
180
+ repetition_rates = repetition_counts / recent_lengths
181
+
182
+ # Identify sequences needing random sampling
183
+ need_random = repetition_rates >= tau_r
184
+
185
+ # Apply random sampling where needed
186
+ if need_random.any():
187
+ random_indices = torch.multinomial(probs[need_random], 1).squeeze(-1)
188
+ top_ids[need_random] = random_indices
189
+
190
+ # 3. Handle EOS tokens
191
+ # Create mask for sequences that should ignore EOS tokens
192
+ ignore_eos_mask = torch.zeros(batch_size, dtype=torch.bool, device=device)
193
+ for i in range(batch_size):
194
+ if i < len(decoded_tokens_list):
195
+ ignore_eos_mask[i] = len(decoded_tokens_list[i]) < min_tokens[i]
196
+ else:
197
+ ignore_eos_mask[i] = True # Default to ignoring EOS for missing sequences
198
+
199
+ is_eos_mask = top_ids == eos_token
200
+ need_resample = ignore_eos_mask & is_eos_mask
201
+
202
+ # Resample for sequences that need it
203
+ if need_resample.any():
204
+ max_trials = 100
205
+ for attempt in range(max_trials):
206
+ # Break if no more resampling needed
207
+ if not need_resample.any():
208
+ break
209
+
210
+ # Sample new tokens for sequences that need resampling
211
+ new_samples = torch.multinomial(probs[need_resample], 1).squeeze(-1)
212
+
213
+ # Update top_ids with new samples
214
+ top_ids[need_resample] = new_samples
215
+
216
+ # Update which sequences still need resampling
217
+ is_eos_mask = top_ids == eos_token
218
+ need_resample = ignore_eos_mask & is_eos_mask
219
+
220
+ # If still have EOS tokens that should be ignored, force them to be non-EOS
221
+ if need_resample.any():
222
+ # Force to a non-EOS token (e.g., the second most likely token)
223
+ for i in range(batch_size):
224
+ if need_resample[i]:
225
+ # Get second most likely token (or first if only one token)
226
+ second_best_idx = 1 if sorted_indices.size(1) > 1 else 0
227
+ top_ids[i] = sorted_indices[i, second_best_idx]
228
+
229
+ result = top_ids
230
+
231
+ return result
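A sketch of how RasSampler might be driven during decoding, assuming the package in this commit is importable; the shapes and token histories below are made up for illustration (6561 is simply the default eos_token above, and 6562 the speech vocabulary size mentioned in the loader comments):

    import torch
    from flashcosyvoice.modules.sampler import RasSampler

    sampler = RasSampler()
    logits = torch.randn(2, 6562)                 # one decode step, batch of 2

    # Per-sequence histories; the empty list forces a non-EOS first token.
    decoded = [[], [1493, 1493, 1493, 1493, 1493]]

    next_ids = sampler(logits, decoded, win_size=10, tau_r=0.1,
                       top_p=0.8, top_k=25, eos_token=6561, min_tokens=[2, 2])
    print(next_ids.shape)                         # torch.Size([2])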
flashcosyvoice/utils/__init__.py ADDED
File without changes
flashcosyvoice/utils/audio.py ADDED
@@ -0,0 +1,77 @@
1
+ import numpy as np
2
+ import torch
3
+ from librosa.filters import mel as librosa_mel_fn
4
+ from scipy.io.wavfile import read
5
+
6
+ MAX_WAV_VALUE = 32768.0
7
+
8
+
9
+ def load_wav(full_path):
10
+ sampling_rate, data = read(full_path)
11
+ return data, sampling_rate
12
+
13
+
14
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
15
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
16
+
17
+
18
+ def dynamic_range_decompression(x, C=1):
19
+ return np.exp(x) / C
20
+
21
+
22
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
23
+ return torch.log(torch.clamp(x, min=clip_val) * C)
24
+
25
+
26
+ def dynamic_range_decompression_torch(x, C=1):
27
+ return torch.exp(x) / C
28
+
29
+
30
+ def spectral_normalize_torch(magnitudes):
31
+ output = dynamic_range_compression_torch(magnitudes)
32
+ return output
33
+
34
+
35
+ def spectral_de_normalize_torch(magnitudes):
36
+ output = dynamic_range_decompression_torch(magnitudes)
37
+ return output
38
+
39
+
40
+ mel_basis = {}
41
+ hann_window = {}
42
+
43
+
44
+ def mel_spectrogram(y, n_fft=1920, num_mels=80, sampling_rate=24000, hop_size=480,
45
+ win_size=1920, fmin=0, fmax=8000, center=False):
46
+ global mel_basis, hann_window # pylint: disable=global-statement
47
+ if f"{str(fmax)}_{str(y.device)}" not in mel_basis:
48
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
49
+ mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device)
50
+ hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
51
+
52
+ y = torch.nn.functional.pad(
53
+ y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
54
+ )
55
+ y = y.squeeze(1)
56
+
57
+ spec = torch.view_as_real(
58
+ torch.stft(
59
+ y,
60
+ n_fft,
61
+ hop_length=hop_size,
62
+ win_length=win_size,
63
+ window=hann_window[str(y.device)],
64
+ center=center,
65
+ pad_mode="reflect",
66
+ normalized=False,
67
+ onesided=True,
68
+ return_complex=True,
69
+ )
70
+ )
71
+
72
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
73
+
74
+ spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec)
75
+ spec = spectral_normalize_torch(spec)
76
+
77
+ return spec
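With the defaults above (24 kHz input, n_fft = win_size = 1920, hop 480, center=False plus reflect padding of (1920 - 480) / 2 samples on each side), one second of audio maps to exactly 50 mel frames. A quick shape check on a dummy waveform, assuming this module is importable:

    import torch
    from flashcosyvoice.utils.audio import mel_spectrogram

    wav = torch.randn(1, 24000)    # [batch, samples]: 1 s of fake 24 kHz audio
    mel = mel_spectrogram(wav)     # defaults: 80 mel bins, 50 frames per second
    print(mel.shape)               # torch.Size([1, 80, 50])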
flashcosyvoice/utils/context.py ADDED
@@ -0,0 +1,28 @@
1
+ from dataclasses import dataclass
2
+
3
+ import torch
4
+
5
+
6
+ @dataclass
7
+ class Context:
8
+ is_prefill: bool = False
9
+ cu_seqlens_q: torch.Tensor | None = None
10
+ cu_seqlens_k: torch.Tensor | None = None
11
+ max_seqlen_q: int = 0
12
+ max_seqlen_k: int = 0
13
+ slot_mapping: torch.Tensor | None = None
14
+ context_lens: torch.Tensor | None = None
15
+ block_tables: torch.Tensor | None = None
16
+
17
+ _CONTEXT = Context()
18
+
19
+ def get_context():
20
+ return _CONTEXT
21
+
22
+ def set_context(is_prefill, cu_seqlens_q=None, cu_seqlens_k=None, max_seqlen_q=0, max_seqlen_k=0, slot_mapping=None, context_lens=None, block_tables=None):
23
+ global _CONTEXT
24
+ _CONTEXT = Context(is_prefill, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, slot_mapping, context_lens, block_tables)
25
+
26
+ def reset_context():
27
+ global _CONTEXT
28
+ _CONTEXT = Context()
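The module keeps a single process-global Context so attention kernels can read prefill/decode metadata without it being passed through every forward call. A minimal round-trip sketch with placeholder values:

    import torch
    from flashcosyvoice.utils.context import get_context, reset_context, set_context

    # Publish metadata before running the model...
    set_context(is_prefill=True,
                cu_seqlens_q=torch.tensor([0, 7, 12], dtype=torch.int32),
                cu_seqlens_k=torch.tensor([0, 7, 12], dtype=torch.int32),
                max_seqlen_q=7, max_seqlen_k=7)

    ctx = get_context()            # ...and read it back inside the layers.
    print(ctx.is_prefill, ctx.max_seqlen_q)

    reset_context()                # clear after the forward pass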
flashcosyvoice/utils/loader.py ADDED
@@ -0,0 +1,116 @@
1
+ import os
2
+ from glob import glob
3
+
4
+ import torch
5
+ from safetensors import safe_open
6
+ from torch import nn
7
+
8
+ from flashcosyvoice.config import CosyVoice2LLMConfig
9
+
10
+
11
+ def default_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor):
12
+ param.data.copy_(loaded_weight)
13
+
14
+
15
+ def load_text_llm(model: nn.Module, path: str):
16
+ packed_modules_mapping = getattr(model, "packed_modules_mapping", {})
17
+ for file in glob(os.path.join(path, "*.safetensors")):
18
+ with safe_open(file, "pt", "cpu") as f:
19
+ for weight_name in f.keys():
20
+ for k in packed_modules_mapping:
21
+ if k in weight_name:
22
+ v, shard_id = packed_modules_mapping[k]
23
+ param_name = weight_name.replace(k, v)
24
+ param = model.get_parameter(param_name)
25
+ weight_loader = param.weight_loader
26
+ weight_loader(param, f.get_tensor(weight_name), shard_id)
27
+ break
28
+ else:
29
+ param = model.get_parameter(weight_name)
30
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
31
+ weight_loader(param, f.get_tensor(weight_name))
32
+
33
+
34
+ def load_speech_llm(model: nn.Module, path: str, hf_config: CosyVoice2LLMConfig):
35
+ packed_modules_mapping = getattr(model, "packed_modules_mapping", {})
36
+
37
+ # NOTE(xcsong): 1. load speech embedding + sos/taskid embedding + lm head
38
+ embedding_weights = {}
39
+ tmp_weights = torch.load(f"{path}/llm.pt", map_location="cpu", weights_only=True)
40
+ missed, missed_names = 0, []
41
+ for k, v in tmp_weights.items():
42
+ if k == "speech_embedding.weight": # torch.Size([6564, 896])
43
+ speech_embedding_size = hf_config.speech_vocab_size # 6562
44
+ # NOTE(xcsong): padding to 6592 for vllm tensor parallel
45
+ if speech_embedding_size != v.shape[0]: # [6564, 896] -> [6562, 896]
46
+ assert speech_embedding_size <= v.shape[0], f"speech_embedding_size should be less than or equal to {v.shape[0]}, but got {speech_embedding_size}"
47
+ v = v[:speech_embedding_size, :]
48
+ embedding_weights["speech_embedding.weight"] = v
49
+ elif k == "llm_embedding.weight": # torch.Size([2, 896]), eos and task_id
50
+ assert v.shape[0] == 2, f"llm_embedding.weight should be of shape [2, 896], but got {v.shape}"
51
+ embedding_weights["llm_embedding.weight"] = v
52
+ elif k == "llm.model.model.embed_tokens.weight": # torch.Size([151936, 896])
53
+ embedding_weights["model.embed_tokens.weight"] = v
54
+ elif k == "llm_decoder.weight": # torch.Size([6564, 896])
55
+ lm_head_size = hf_config.speech_vocab_size # 6562
56
+ if lm_head_size != v.shape[0]: # [6564, 896] -> [6562, 896]
57
+ assert lm_head_size <= v.shape[0], f"lm_head_size should be less than or equal to {v.shape[0]}, but got {lm_head_size}"
58
+ v = v[:lm_head_size, :]
59
+ param = model.get_parameter("lm_head.weight")
60
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
61
+ weight_loader(param, v)
62
+ elif k == "llm_decoder.bias": # torch.Size([6564])
63
+ lm_head_size = hf_config.speech_vocab_size # 6562
64
+ if lm_head_size != v.shape[0]: # [6564] -> [6562]
65
+ assert lm_head_size <= v.shape[0], f"lm_head_size should be less than or equal to {v.shape[0]}, but got {lm_head_size}"
66
+ v = v[:lm_head_size]
67
+ param = model.get_parameter("lm_head.bias")
68
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
69
+ weight_loader(param, v)
70
+ elif "llm.model." in k:
71
+ weight_name = k.replace("llm.model.", "")
72
+ for kk in packed_modules_mapping:
73
+ if kk in weight_name:
74
+ vv, shard_id = packed_modules_mapping[kk]
75
+ param_name = weight_name.replace(kk, vv)
76
+ try:
77
+ param = model.get_parameter(param_name)
78
+ weight_loader = param.weight_loader
79
+ weight_loader(param, v, shard_id)
80
+ break
81
+ except Exception as e:
82
+ print(e)
83
+ print(f"skip parameter (1): {weight_name}")
84
+ continue
85
+ else:
86
+ try:
87
+ param = model.get_parameter(weight_name)
88
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
89
+ weight_loader(param, v)
90
+ except Exception as e:
91
+ print(e)
92
+ print(f"skip parameter (2): {weight_name}")
93
+ continue
94
+ else:
95
+ missed += 1
96
+ missed_names.append(weight_name)
97
+ continue
98
+ print(f"missed {missed} parameters: {missed_names}")
99
+
100
+ # NOTE(xcsong): 2. merge text embedding, sos/taskid embedding, and speech embedding
101
+ text_embedding_weight = embedding_weights["model.embed_tokens.weight"].cpu() # [151936, 896]
102
+ sos_taskid_embedding_weight = embedding_weights["llm_embedding.weight"].cpu() # [2, 896]
103
+ speech_embedding_weight = embedding_weights["speech_embedding.weight"].cpu() # [6562, 896]
104
+ final_embedding_weight = torch.cat([speech_embedding_weight, sos_taskid_embedding_weight, text_embedding_weight], dim=0) # [158500, 896]
105
+ param = model.get_parameter("model.embed_tokens.weight")
106
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
107
+ weight_loader(param, final_embedding_weight)
108
+
109
+
110
+ def load_model(model: nn.Module, path: str, hf_config: CosyVoice2LLMConfig | None = None):
111
+ if model.model_type == "speech_llm":
112
+ load_speech_llm(model, path, hf_config)
113
+ elif model.model_type == "text_llm":
114
+ load_text_llm(model, path)
115
+ else:
116
+ raise ValueError(f"Unsupported model type: {model.model_type}")
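The torch.cat in load_speech_llm fixes the merged vocabulary layout: speech tokens occupy the first speech_vocab_size rows, the two llm_embedding rows (sos/eos and task_id) come next, and text tokens follow. A small sketch of the resulting id offsets, assuming speech_vocab_size = 6562 as in the comments above (the runtime's actual id mapping is defined elsewhere):

    SPEECH_VOCAB_SIZE = 6562                # rows 0..6561: speech tokens
    SOS_EOS_ID = SPEECH_VOCAB_SIZE          # 6562: llm_embedding row 0
    TASK_ID = SPEECH_VOCAB_SIZE + 1         # 6563: llm_embedding row 1
    TEXT_OFFSET = SPEECH_VOCAB_SIZE + 2     # 6564: first text-token row

    def text_token_to_merged_id(text_id: int) -> int:
        # A Qwen2 text token indexes the merged embedding at text_id + TEXT_OFFSET.
        return TEXT_OFFSET + text_id

    print(text_token_to_merged_id(0))       # 6564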
flashcosyvoice/utils/memory.py ADDED
@@ -0,0 +1,19 @@
1
+ import os
2
+
3
+ import torch
4
+ from pynvml import * # noqa
5
+
6
+
7
+ def get_gpu_memory():
8
+ torch.cuda.synchronize()
9
+ nvmlInit()
10
+ visible_device = list(map(int, os.getenv("CUDA_VISIBLE_DEVICES", "0,1,2,3,4,5,6,7").split(',')))
11
+ cuda_device_idx = torch.cuda.current_device()
12
+ cuda_device_idx = visible_device[cuda_device_idx]
13
+ handle = nvmlDeviceGetHandleByIndex(cuda_device_idx)
14
+ mem_info = nvmlDeviceGetMemoryInfo(handle)
15
+ total_memory = mem_info.total
16
+ used_memory = mem_info.used
17
+ free_memory = mem_info.free
18
+ nvmlShutdown()
19
+ return total_memory, used_memory, free_memory
stepaudio2.py ADDED
@@ -0,0 +1,204 @@
1
+ import torch
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
3
+
4
+ from utils import compute_token_num, load_audio, log_mel_spectrogram, padding_mels
5
+
6
+
7
+ class StepAudio2Base:
8
+
9
+ def __init__(self, model_path: str):
10
+ self.llm_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, padding_side="right")
11
+ self.llm = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()
12
+ self.eos_token_id = self.llm_tokenizer.eos_token_id
13
+
14
+ def __call__(self, messages: list, **kwargs):
15
+ messages, mels = self.apply_chat_template(messages)
16
+
17
+ # Tokenize prompts
18
+ prompt_ids = []
19
+ for msg in messages:
20
+ if isinstance(msg, str):
21
+ prompt_ids.append(self.llm_tokenizer(text=msg, return_tensors="pt", padding=True)["input_ids"])
22
+ elif isinstance(msg, list):
23
+ prompt_ids.append(torch.tensor([msg], dtype=torch.int32))
24
+ else:
25
+ raise ValueError(f"Unsupported content type: {type(msg)}")
26
+ prompt_ids = torch.cat(prompt_ids, dim=-1).cuda()
27
+ attention_mask = torch.ones_like(prompt_ids)
28
+
29
+ #mels = None if len(mels) == 0 else torch.stack(mels).cuda()
30
+ #mel_lengths = None if mels is None else torch.tensor([mel.shape[1] - 2 for mel in mels], dtype=torch.int32, device='cuda')
31
+ if len(mels) == 0:
32
+ mels = None
33
+ mel_lengths = None
34
+ else:
35
+ mels, mel_lengths = padding_mels(mels)
36
+ mels = mels.cuda()
37
+ mel_lengths = mel_lengths.cuda()
38
+
39
+ generate_inputs = {
40
+ "input_ids": prompt_ids,
41
+ "wavs": mels,
42
+ "wav_lens": mel_lengths,
43
+ "attention_mask": attention_mask
44
+ }
45
+
46
+ generation_config = dict(max_new_tokens=2048,
47
+ pad_token_id=self.llm_tokenizer.pad_token_id,
48
+ eos_token_id=self.eos_token_id,
49
+ )
50
+ generation_config.update(kwargs)
51
+ generation_config = GenerationConfig(**generation_config)
52
+
53
+ outputs = self.llm.generate(**generate_inputs, generation_config=generation_config)
54
+ output_token_ids = outputs[0, prompt_ids.shape[-1] : -1].tolist()
55
+ output_text_tokens = [i for i in output_token_ids if i < 151688]
56
+ output_audio_tokens = [i - 151696 for i in output_token_ids if i > 151695]
57
+ output_text = self.llm_tokenizer.decode(output_text_tokens)
58
+ return output_token_ids, output_text, output_audio_tokens
59
+
60
+ def apply_chat_template(self, messages: list):
61
+ results = []
62
+ mels = []
63
+ for msg in messages:
64
+ content = msg
65
+ if isinstance(content, str):
66
+ text_with_audio = content
67
+ results.append(text_with_audio)
68
+ elif isinstance(content, dict):
69
+ if content["type"] == "text":
70
+ results.append(f"{content['text']}")
71
+ elif content["type"] == "audio":
72
+ audio = load_audio(content['audio'])
73
+ for i in range(0, audio.shape[0], 16000 * 25):
74
+ mel = log_mel_spectrogram(audio[i:i+16000*25], n_mels=128, padding=479)
75
+ mels.append(mel)
76
+ audio_tokens = "<audio_patch>" * compute_token_num(mel.shape[1])
77
+ results.append(f"<audio_start>{audio_tokens}<audio_end>")
78
+ elif content["type"] == "token":
79
+ results.append(content["token"])
80
+ else:
81
+ raise ValueError(f"Unsupported content type: {type(content)}")
82
+ # print(results)
83
+ return results, mels
84
+
85
+
86
+ class StepAudio2(StepAudio2Base):
87
+
88
+ def __init__(self, model_path: str):
89
+ super().__init__(model_path)
90
+ self.llm_tokenizer.eos_token = "<|EOT|>"
91
+ self.llm.config.eos_token_id = self.llm_tokenizer.convert_tokens_to_ids("<|EOT|>")
92
+ self.eos_token_id = self.llm_tokenizer.convert_tokens_to_ids("<|EOT|>")
93
+
94
+ def apply_chat_template(self, messages: list):
95
+ results = []
96
+ mels = []
97
+ for msg in messages:
98
+ role = msg["role"]
99
+ content = msg["content"]
100
+ if role == "user":
101
+ role = "human"
102
+ if isinstance(content, str):
103
+ text_with_audio = f"<|BOT|>{role}\n{content}"
104
+ text_with_audio += '<|EOT|>' if msg.get('eot', True) else ''
105
+ results.append(text_with_audio)
106
+ elif isinstance(content, list):
107
+ results.append(f"<|BOT|>{role}\n")
108
+ for item in content:
109
+ if item["type"] == "text":
110
+ results.append(f"{item['text']}")
111
+ elif item["type"] == "audio":
112
+ audio = load_audio(item['audio'])
113
+ for i in range(0, audio.shape[0], 16000 * 25):
114
+ mel = log_mel_spectrogram(audio[i:i+16000*25], n_mels=128, padding=479)
115
+ mels.append(mel)
116
+ audio_tokens = "<audio_patch>" * compute_token_num(mel.shape[1])
117
+ results.append(f"<audio_start>{audio_tokens}<audio_end>")
118
+ elif item["type"] == "token":
119
+ results.append(item["token"])
120
+ if msg.get('eot', True):
121
+ results.append('<|EOT|>')
122
+ elif content is None:
123
+ results.append(f"<|BOT|>{role}\n")
124
+ else:
125
+ raise ValueError(f"Unsupported content type: {type(content)}")
126
+ # print(results)
127
+ return results, mels
128
+
129
+ if __name__ == '__main__':
130
+ from token2wav import Token2wav
131
+
132
+ model = StepAudio2('/mnt/gpfs/lijingbei/Step-Audio-2-mini')
133
+ token2wav = Token2wav('/mnt/gpfs/lijingbei/Step-Audio-2-mini/token2wav')
134
+
135
+ # Text-to-text conversation
136
+ print()
137
+ messages = [
138
+ {"role": "system", "content": "You are a helpful assistant."},
139
+ {"role": "human", "content": "Give me a brief introduction to the Great Wall."},
140
+ {"role": "assistant", "content": None}
141
+ ]
142
+ tokens, text, _ = model(messages, max_new_tokens=256, temperature=0.7, repetition_penalty=1.05, top_p=0.9, do_sample=True)
143
+ print(text)
144
+
145
+ # Text-to-speech conversation
146
+ print()
147
+ messages = [
148
+ {"role": "system", "content": "You are a helpful assistant."},
149
+ {"role": "human", "content": "Give me a brief introduction to the Great Wall."},
150
+ {"role": "assistant", "content": "<tts_start>", "eot": False}, # Insert <tts_start> for speech response
151
+ ]
152
+ tokens, text, audio = model(messages, max_new_tokens=4096, temperature=0.7, repetition_penalty=1.05, top_p=0.9, do_sample=True)
153
+ print(text)
154
+ print(tokens)
155
+ audio = token2wav(audio, prompt_wav='assets/default_male.wav')
156
+ with open('output-male.wav', 'wb') as f:
157
+ f.write(audio)
158
+
159
+ # Speech-to-text conversation
160
+ print()
161
+ messages = [
162
+ {"role": "system", "content": "You are a helpful assistant."},
163
+ {"role": "human", "content": [{"type": "audio", "audio": "assets/give_me_a_brief_introduction_to_the_great_wall.wav"}]},
164
+ {"role": "assistant", "content": None}
165
+ ]
166
+ tokens, text, _ = model(messages, max_new_tokens=256, temperature=0.7, repetition_penalty=1.05, top_p=0.9, do_sample=True)
167
+ print(text)
168
+
169
+ # Speech-to-speech conversation
170
+ print()
171
+ messages = [
172
+ {"role": "system", "content": "You are a helpful assistant."},
173
+ {"role": "human", "content": [{"type": "audio", "audio": "assets/give_me_a_brief_introduction_to_the_great_wall.wav"}]},
174
+ {"role": "assistant", "content": "<tts_start>", "eot": False}, # Insert <tts_start> for speech response
175
+ ]
176
+ tokens, text, audio = model(messages, max_new_tokens=4096, temperature=0.7, repetition_penalty=1.05, top_p=0.9, do_sample=True)
177
+ print(text)
178
+ print(tokens)
179
+ audio = token2wav(audio, prompt_wav='assets/default_female.wav')
180
+ with open('output-female.wav', 'wb') as f:
181
+ f.write(audio)
182
+
183
+ # Multi-turn conversation
184
+ print()
185
+ messages.pop(-1)
186
+ messages += [
187
+ {"role": "assistant", "content": [{"type": "text", "text": "<tts_start>"},
188
+ {"type": "token", "token": tokens}]},
189
+ {"role": "human", "content": "Now write a 4-line poem about it."},
190
+ {"role": "assistant", "content": None}
191
+ ]
192
+ tokens, text, audio = model(messages, max_new_tokens=256, temperature=0.7, repetition_penalty=1.05, top_p=0.9, do_sample=True)
193
+ print(text)
194
+
195
+ # Multi-modal inputs
196
+ print()
197
+ messages = [
198
+ {"role": "system", "content": "You are a helpful assistant."},
199
+ {"role": "human", "content": [{"type": "text", "text": "Translate the speech into Chinese."},
200
+ {"type": "audio", "audio": "assets/give_me_a_brief_introduction_to_the_great_wall.wav"}]},
201
+ {"role": "assistant", "content": None}
202
+ ]
203
+ tokens, text, audio = model(messages, max_new_tokens=256, temperature=0.7, repetition_penalty=1.05, top_p=0.9, do_sample=True)
204
+ print(text)
token2wav.py ADDED
@@ -0,0 +1,79 @@
1
+ import io
2
+
3
+ import torch
4
+ import torchaudio
5
+ import s3tokenizer
6
+ import onnxruntime
7
+
8
+ import torchaudio.compliance.kaldi as kaldi
9
+ from flashcosyvoice.modules.hifigan import HiFTGenerator
10
+ from flashcosyvoice.utils.audio import mel_spectrogram
11
+ from hyperpyyaml import load_hyperpyyaml
12
+
13
+
14
+ class Token2wav():
15
+
16
+ def __init__(self, model_path, float16=False):
17
+ self.float16 = float16
18
+
19
+ self.audio_tokenizer = s3tokenizer.load_model(f"{model_path}/speech_tokenizer_v2_25hz.onnx").cuda().eval()
20
+
21
+ option = onnxruntime.SessionOptions()
22
+ option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
23
+ option.intra_op_num_threads = 1
24
+ self.spk_model = onnxruntime.InferenceSession(f"{model_path}/campplus.onnx", sess_options=option, providers=["CPUExecutionProvider"])
25
+
26
+ with open(f"{model_path}/flow.yaml", "r") as f:
27
+ configs = load_hyperpyyaml(f)
28
+ self.flow = configs['flow']
29
+ if float16:
30
+ self.flow.half()
31
+ self.flow.load_state_dict(torch.load(f"{model_path}/flow.pt", map_location="cpu", weights_only=True), strict=True)
32
+ self.flow.cuda().eval()
33
+
34
+ self.hift = HiFTGenerator()
35
+ hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(f"{model_path}/hift.pt", map_location="cpu", weights_only=True).items()}
36
+ self.hift.load_state_dict(hift_state_dict, strict=True)
37
+ self.hift.cuda().eval()
38
+
39
+ def __call__(self, generated_speech_tokens, prompt_wav):
40
+ audio = s3tokenizer.load_audio(prompt_wav, sr=16000) # [T]
41
+ mels = s3tokenizer.log_mel_spectrogram(audio)
42
+ mels, mels_lens = s3tokenizer.padding([mels])
43
+ prompt_speech_tokens, prompt_speech_tokens_lens = self.audio_tokenizer.quantize(mels.cuda(), mels_lens.cuda())
44
+
45
+ spk_feat = kaldi.fbank(audio.unsqueeze(0), num_mel_bins=80, dither=0, sample_frequency=16000)
46
+ spk_feat = spk_feat - spk_feat.mean(dim=0, keepdim=True)
47
+ spk_emb = torch.tensor(self.spk_model.run(
48
+ None, {self.spk_model.get_inputs()[0].name: spk_feat.unsqueeze(dim=0).cpu().numpy()}
49
+ )[0], device='cuda')
50
+
51
+ audio, sample_rate = torchaudio.load(prompt_wav, backend='soundfile')
52
+ audio = audio.mean(dim=0, keepdim=True) # [1, T]
53
+ if sample_rate != 24000:
54
+ audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=24000)(audio)
55
+ prompt_mel = mel_spectrogram(audio).transpose(1, 2).squeeze(0) # [T, num_mels]
56
+ prompt_mels = prompt_mel.unsqueeze(0).cuda()
57
+ prompt_mels_lens = torch.tensor([prompt_mels.shape[1]], dtype=torch.int32, device='cuda')
58
+
59
+ generated_speech_tokens = torch.tensor([generated_speech_tokens], dtype=torch.int32, device='cuda')
60
+ generated_speech_tokens_lens = torch.tensor([generated_speech_tokens.shape[1]], dtype=torch.int32, device='cuda')
61
+
62
+ with torch.amp.autocast("cuda", dtype=torch.float16 if self.float16 else torch.float32):
63
+ mel = self.flow.inference(generated_speech_tokens, generated_speech_tokens_lens,
64
+ prompt_speech_tokens, prompt_speech_tokens_lens,
65
+ prompt_mels, prompt_mels_lens, spk_emb, 10)
66
+
67
+ wav, _ = self.hift(speech_feat=mel)
68
+ output = io.BytesIO()
69
+ torchaudio.save(output, wav.cpu(), sample_rate=24000, format='wav')
70
+
71
+ return output.getvalue()
72
+
73
+ if __name__ == '__main__':
74
+ token2wav = Token2wav('/mnt/gpfs/lijingbei/Step-Audio-2-mini/token2wav')
75
+
76
+ tokens = [1493, 4299, 4218, 2049, 528, 2752, 4850, 4569, 4575, 6372, 2127, 4068, 2312, 4993, 4769, 2300, 226, 2175, 2160, 2152, 6311, 6065, 4859, 5102, 4615, 6534, 6426, 1763, 2249, 2209, 5938, 1725, 6048, 3816, 6058, 958, 63, 4460, 5914, 2379, 735, 5319, 4593, 2328, 890, 35, 751, 1483, 1484, 1483, 2112, 303, 4753, 2301, 5507, 5588, 5261, 5744, 5501, 2341, 2001, 2252, 2344, 1860, 2031, 414, 4366, 4366, 6059, 5300, 4814, 5092, 5100, 1923, 3054, 4320, 4296, 2148, 4371, 5831, 5084, 5027, 4946, 4946, 2678, 575, 575, 521, 518, 638, 1367, 2804, 3402, 4299]
77
+ audio = token2wav(tokens, 'assets/default_male.wav')
78
+ with open('assets/give_me_a_brief_introduction_to_the_great_wall.wav', 'wb') as f:
79
+ f.write(audio)
utils.py ADDED
@@ -0,0 +1,91 @@
1
+ import librosa
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from torch.nn.utils.rnn import pad_sequence
5
+ import torchaudio
6
+ from typing import List
7
+
8
+
9
+ def _mel_filters(n_mels: int) -> torch.Tensor:
10
+ """Load the mel filterbank matrix for projecting STFT into a Mel spectrogram."""
11
+ assert n_mels in {80, 128}, f"Unsupported n_mels: {n_mels}"
12
+ if n_mels == 128:
13
+ return torch.from_numpy(librosa.filters.mel(sr=16000, n_fft=400, n_mels=128))
14
+ else:
15
+ return torch.from_numpy(librosa.filters.mel(sr=16000, n_fft=400, n_mels=80))
16
+
17
+ def load_audio(file_path, target_rate=16000, max_length=None):
18
+ """
19
+ Open an audio file and read as mono waveform, resampling as necessary
20
+ If max_length is provided, truncate the audio to that length
21
+ """
22
+ waveform, sample_rate = torchaudio.load(file_path)
23
+ if sample_rate != target_rate:
24
+ waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)(waveform)
25
+ audio = waveform[0] # get the first channel
26
+
27
+ # Truncate audio if it exceeds max_length
28
+ if max_length is not None and audio.shape[0] > max_length:
29
+ audio = audio[:max_length]
30
+
31
+ return audio
32
+
33
+ def log_mel_spectrogram(audio, n_mels=128, padding=479, device=None):
34
+ """
35
+ Compute the log-Mel spectrogram with specific padding for StepAudio
36
+ """
37
+ if not torch.is_tensor(audio):
38
+ if isinstance(audio, str):
39
+ audio = load_audio(audio)
+ else:
+ audio = torch.from_numpy(audio)
41
+ if device is not None:
42
+ audio = audio.to(device)
43
+ if padding > 0:
44
+ audio = F.pad(audio, (0, padding))
45
+ window = torch.hann_window(400).to(audio.device)
46
+ stft = torch.stft(audio, 400, 160, window=window, return_complex=True)
47
+ magnitudes = stft[..., :-1].abs() ** 2
48
+ filters = _mel_filters(n_mels)
49
+ mel_spec = filters @ magnitudes
50
+
51
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
52
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
53
+ log_spec = (log_spec + 4.0) / 4.0
54
+ return log_spec
55
+
56
+ def compute_token_num(max_feature_len):
57
+ # First, audio goes through encoder:
58
+ # 1. conv1: kernel=3, stride=1, padding=1 -> size unchanged
59
+ # 2. conv2: kernel=3, stride=2, padding=1 -> size/2
60
+ # 3. avg_pooler: kernel=2, stride=2 -> size/2
61
+ max_feature_len = max_feature_len - 2 # remove padding
62
+ encoder_output_dim = (max_feature_len + 1) // 2 // 2 # after conv2 and avg_pooler
63
+
64
+ # Then through adaptor (parameters from config file):
65
+ padding = 1
66
+ kernel_size = 3 # from config: audio_encoder_config.kernel_size
67
+ stride = 2 # from config: audio_encoder_config.adapter_stride
68
+ adapter_output_dim = (encoder_output_dim + 2 * padding - kernel_size) // stride + 1
69
+ return adapter_output_dim
70
+
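A worked instance of the arithmetic in compute_token_num, using the 25-second chunking from stepaudio2.py: a full 16 kHz chunk padded by 479 samples yields 2502 mel frames, which the encoder and adaptor reduce to 313 audio-patch tokens (about 12.5 tokens per second). This only illustrates the formula above; it is not a value read from any config:

    # 25 s chunk at 16 kHz, padding=479, STFT hop 160 (see log_mel_spectrogram above).
    num_samples = 25 * 16000 + 479
    mel_frames = num_samples // 160              # 2502 (the last STFT frame is dropped)

    feat = mel_frames - 2                        # 2500, padding removed
    encoder_out = (feat + 1) // 2 // 2           # 625 after conv2 (stride 2) and avg_pooler
    adapter_out = (encoder_out + 2 * 1 - 3) // 2 + 1
    print(mel_frames, adapter_out)               # 2502 313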
71
+ def padding_mels(data: List[torch.Tensor]):
72
+ """ Pad a list of mel features into a single batched tensor
73
+
74
+ Parameters
75
+ ----------
76
+ data: List[Tensor], shape of Tensor (128, T)
77
+
78
+ Returns:
79
+ -------
80
+ feats, feats lengths
81
+ """
82
+ sample = data
83
+ assert isinstance(sample, list)
84
+ feats_lengths = torch.tensor([s.size(1)-2 for s in sample],
85
+ dtype=torch.int32)
86
+ feats = [s.t() for s in sample]
87
+ padded_feats = pad_sequence(feats,
88
+ batch_first=True,
89
+ padding_value=0)
90
+
91
+ return padded_feats.transpose(1, 2), feats_lengths