Phil Sobrepena committed
Commit 31bd90e · 1 Parent(s): c19a618
Files changed (3):
  1. Dockerfile +3 -3
  2. mmaudio/model/embeddings.py +16 -31
  3. requirements.txt +3 -3
Dockerfile CHANGED
@@ -27,9 +27,9 @@ RUN pip3 install --no-cache-dir numpy==1.24.3
 
 # Install PyTorch and related packages first (as recommended in README)
 RUN pip3 install --no-cache-dir \
-    torch==2.1.2 \
-    torchvision==0.16.2 \
-    torchaudio==2.1.2 \
+    torch==2.6.0 \
+    torchvision==0.21.0 \
+    torchaudio==2.6.0 \
     --index-url https://download.pytorch.org/whl/cu118
 
 # Install core dependencies
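
A quick way to confirm a rebuilt image actually picked up the new pins is a version probe at container start. This is a minimal sketch, not part of the commit; the script name is hypothetical, and CUDA availability additionally depends on the host driver, not just the cu118 wheels:

    # check_torch.py -- hypothetical sanity check, not part of this commit
    import torch
    import torchvision
    import torchaudio

    # Wheels from the cu118 index report versions like "2.6.0+cu118".
    assert torch.__version__.startswith("2.6.0"), torch.__version__
    assert torchvision.__version__.startswith("0.21.0"), torchvision.__version__
    assert torchaudio.__version__.startswith("2.6.0"), torchaudio.__version__

    # True only when a compatible NVIDIA driver is visible inside the container.
    print("CUDA available:", torch.cuda.is_available())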
mmaudio/model/embeddings.py CHANGED
@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+import math
 
 # https://github.com/facebookresearch/DiT
 
@@ -9,41 +10,25 @@ class TimestepEmbedder(nn.Module):
     Embeds scalar timesteps into vector representations.
     """
 
-    def __init__(self, dim, frequency_embedding_size, max_period):
+    def __init__(self, hidden_dim: int, frequency_embedding_size: int = 256):
         super().__init__()
+
         self.mlp = nn.Sequential(
-            nn.Linear(frequency_embedding_size, dim),
+            nn.Linear(frequency_embedding_size, hidden_dim, bias=True),
             nn.SiLU(),
-            nn.Linear(dim, dim),
+            nn.Linear(hidden_dim, hidden_dim, bias=True),
         )
-        self.dim = dim
-        self.max_period = max_period
-        assert dim % 2 == 0, 'dim must be even.'
 
-        with torch.autocast('cuda', enabled=False):
-            self.freqs = nn.Buffer(
-                1.0 / (10000**(torch.arange(0, frequency_embedding_size, 2, dtype=torch.float32) /
-                               frequency_embedding_size)),
-                persistent=False)
-        freq_scale = 10000 / max_period
-        self.freqs = freq_scale * self.freqs
-
-    def timestep_embedding(self, t):
-        """
-        Create sinusoidal timestep embeddings.
-        :param t: a 1-D Tensor of N indices, one per batch element.
-                  These may be fractional.
-        :param dim: the dimension of the output.
-        :param max_period: controls the minimum frequency of the embeddings.
-        :return: an (N, D) Tensor of positional embeddings.
-        """
-        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
-
-        args = t[:, None].float() * self.freqs[None]
-        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
-        return embedding
+        self.frequency_embedding_size = frequency_embedding_size
+        half_dim = self.frequency_embedding_size // 2
+        freqs = torch.exp(
+            -math.log(10000) * torch.arange(start=0, end=half_dim, dtype=torch.float32) /
+            half_dim
+        )
+        self.register_buffer('freqs', freqs)
 
     def forward(self, t):
-        t_freq = self.timestep_embedding(t).to(t.dtype)
-        t_emb = self.mlp(t_freq)
-        return t_emb
+        t_freq = t.unsqueeze(-1) * self.freqs.unsqueeze(0)
+        t_embed = torch.cat([t_freq.sin(), t_freq.cos()], dim=-1)
+        t_embed = self.mlp(t_embed.to(t.dtype))
+        return t_embed
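
The rewrite folds the separate timestep_embedding helper and the autocast-guarded nn.Buffer into a single register_buffer call, fixing the frequency base at 10000 and dropping the configurable max_period. For a sense of the shapes involved, a minimal usage sketch; hidden_dim=512 and the random timesteps are illustrative, not values from the repo:

    import torch
    from mmaudio.model.embeddings import TimestepEmbedder

    embedder = TimestepEmbedder(hidden_dim=512)  # frequency_embedding_size defaults to 256

    t = torch.rand(4)   # one (possibly fractional) timestep per batch element
    emb = embedder(t)   # (4, 256) sinusoidal features -> MLP -> (4, 512)
    print(emb.shape)    # torch.Size([4, 512])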
requirements.txt CHANGED
@@ -1,7 +1,7 @@
 numpy==1.24.3
-torch==2.1.2
-torchvision==0.16.2
-torchaudio==2.1.2
+torch==2.6.0
+torchvision==0.21.0
+torchaudio==2.6.0
 torchdiffeq==0.2.3
 omegaconf>=2.3.0
 huggingface_hub>=0.26.1
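
The same three pins now live in both the Dockerfile and requirements.txt, so drift between an installed environment and this file is easy to check. A standard-library sketch, not part of the commit; the script name is hypothetical, and it assumes exact pins use "==":

    # verify_pins.py -- hypothetical helper, not part of this commit
    from importlib.metadata import version

    with open("requirements.txt") as f:
        for line in f:
            line = line.strip()
            if "==" not in line:
                continue  # skip range constraints such as omegaconf>=2.3.0
            name, pinned = line.split("==")
            installed = version(name)
            ok = installed.startswith(pinned)  # tolerate local tags like "+cu118"
            print(f"{name}: pinned {pinned}, installed {installed}"
                  + ("" if ok else "  <-- MISMATCH"))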