Phil Sobrepena committed
Commit e87de0e · 1 Parent(s): d07a8ac

match embeddings and network to rc repo

mmaudio/model/embeddings.py CHANGED
@@ -4,31 +4,45 @@ import math
 
 # https://github.com/facebookresearch/DiT
 
-
 class TimestepEmbedder(nn.Module):
     """
     Embeds scalar timesteps into vector representations.
     """
 
-    def __init__(self, hidden_dim: int, frequency_embedding_size: int = 256):
+    def __init__(self, dim, frequency_embedding_size, max_period):
         super().__init__()
-
         self.mlp = nn.Sequential(
-            nn.Linear(frequency_embedding_size, hidden_dim, bias=True),
+            nn.Linear(frequency_embedding_size, dim),
             nn.SiLU(),
-            nn.Linear(hidden_dim, hidden_dim, bias=True),
+            nn.Linear(dim, dim),
         )
+        self.dim = dim
+        self.max_period = max_period
+        assert dim % 2 == 0, 'dim must be even.'
 
-        self.frequency_embedding_size = frequency_embedding_size
-        half_dim = self.frequency_embedding_size // 2
-        freqs = torch.exp(
-            -math.log(10000) * torch.arange(start=0, end=half_dim, dtype=torch.float32) /
-            half_dim
-        )
-        self.register_buffer('freqs', freqs)
+        with torch.autocast('cuda', enabled=False):
+            self.freqs = (
+                1.0 / (10000**(torch.arange(0, frequency_embedding_size, 2, dtype=torch.float32) /
+                               frequency_embedding_size)))
+            freq_scale = 10000 / max_period
+            self.freqs = nn.Parameter(freq_scale * self.freqs)
+
+    def timestep_embedding(self, t):
+        """
+        Create sinusoidal timestep embeddings.
+        :param t: a 1-D Tensor of N indices, one per batch element.
+                  These may be fractional.
+        :param dim: the dimension of the output.
+        :param max_period: controls the minimum frequency of the embeddings.
+        :return: an (N, D) Tensor of positional embeddings.
+        """
+        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+
+        args = t[:, None].float() * self.freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        return embedding
 
     def forward(self, t):
-        t_freq = t.unsqueeze(-1) * self.freqs.unsqueeze(0)
-        t_embed = torch.cat([t_freq.sin(), t_freq.cos()], dim=-1)
-        t_embed = self.mlp(t_embed.to(t.dtype))
-        return t_embed
+        t_freq = self.timestep_embedding(t).to(t.dtype)
+        t_emb = self.mlp(t_freq)
+        return t_emb
 
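For reference, a minimal usage sketch of the rewritten embedder (not part of the commit; the argument values dim=256, frequency_embedding_size=256, max_period=10000 are illustrative assumptions, not values taken from the repo):

import torch

from mmaudio.model.embeddings import TimestepEmbedder

# Illustrative values only: dim must be even, and the MLP input width equals
# frequency_embedding_size (cos and sin halves concatenated on the last dim).
embedder = TimestepEmbedder(dim=256, frequency_embedding_size=256, max_period=10000)

t = torch.rand(8)    # one (possibly fractional) timestep per batch element
emb = embedder(t)    # sinusoidal features -> MLP
print(emb.shape)     # torch.Size([8, 256])

Note that self.freqs is now an nn.Parameter, so the sinusoid frequencies are learnable and serialized with the checkpoint, unlike the old non-trainable register_buffer('freqs', ...) version.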
mmaudio/model/networks.py CHANGED
@@ -166,8 +166,10 @@ class MMAudio(nn.Module):
                                            self._clip_seq_len,
                                            device=self.device)
 
-        self.latent_rot = nn.Buffer(latent_rot, persistent=False)
-        self.clip_rot = nn.Buffer(clip_rot, persistent=False)
+        # self.latent_rot = latent_rot.to(self.device)
+        # self.clip_rot = clip_rot.to(self.device)
+        self.register_buffer('latent_rot', latent_rot)
+        self.register_buffer('clip_rot', clip_rot)
 
     def update_seq_lengths(self, latent_seq_len: int, clip_seq_len: int, sync_seq_len: int) -> None:
         self._latent_seq_len = latent_seq_len
@@ -346,7 +348,7 @@ class MMAudio(nn.Module):
         if 'clip_rot' in src_dict:
             del src_dict['clip_rot']
 
-        self.load_state_dict(src_dict, strict=True)
+        self.load_state_dict(src_dict, strict=False)
 
     @property
     def device(self) -> torch.device:
@@ -466,4 +468,4 @@ if __name__ == '__main__':
 
     # print the number of parameters in terms of millions
     num_params = sum(p.numel() for p in network.parameters()) / 1e6
-    print(f'Number of parameters: {num_params:.2f}M')
+    print(f'Number of parameters: {num_params:.2f}M')
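A small self-contained sketch of why the register_buffer switch pairs with strict=False (a toy module with made-up shapes, not the repo's code): persistent buffers enter state_dict by default, and since 'latent_rot'/'clip_rot' are deleted from the source dict before loading, a strict load would fail on the now-missing keys.

import torch
import torch.nn as nn

class RotDemo(nn.Module):
    # Toy stand-in for MMAudio's rotary tables.
    def __init__(self):
        super().__init__()
        # register_buffer: moves with .to(device) and, by default, is
        # included in state_dict (unlike nn.Buffer(..., persistent=False)).
        self.register_buffer('latent_rot', torch.randn(4, 8))

demo = RotDemo()
print('latent_rot' in demo.state_dict())    # True

# Checkpoints have the rotation keys stripped before loading, so a strict
# load would raise on the missing buffer; strict=False tolerates it.
missing, unexpected = demo.load_state_dict({}, strict=False)
print(missing)                              # ['latent_rot']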