Commit 31bd90e · Phil Sobrepena committed
Parent(s): c19a618

deps

Files changed:
- Dockerfile (+3 -3)
- mmaudio/model/embeddings.py (+16 -31)
- requirements.txt (+3 -3)
Dockerfile
CHANGED
@@ -27,9 +27,9 @@ RUN pip3 install --no-cache-dir numpy==1.24.3
 
 # Install PyTorch and related packages first (as recommended in README)
 RUN pip3 install --no-cache-dir \
-    torch==2.
-    torchvision==0.
-    torchaudio==2.
+    torch==2.6.0 \
+    torchvision==0.21.0 \
+    torchaudio==2.6.0 \
     --index-url https://download.pytorch.org/whl/cu118
 
 # Install core dependencies
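Since the cu118 wheel index only hosts specific builds, it is worth confirming the pins resolved inside the built image. A minimal sanity-check sketch, not part of this commit (the `+cu118` local version suffixes are an assumption about how these wheels report themselves):

```python
# Minimal sanity check, not part of this commit: run inside the built
# container to confirm the pinned cu118 wheels were installed.
import torch
import torchaudio
import torchvision

print(torch.__version__)          # expected: 2.6.0+cu118 (suffix assumed)
print(torchvision.__version__)    # expected: 0.21.0+cu118
print(torchaudio.__version__)     # expected: 2.6.0+cu118
print(torch.version.cuda)         # expected: 11.8
print(torch.cuda.is_available())  # True when a GPU is visible
```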
mmaudio/model/embeddings.py
CHANGED
@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+import math
 
 # https://github.com/facebookresearch/DiT
 
@@ -9,41 +10,25 @@ class TimestepEmbedder(nn.Module):
     Embeds scalar timesteps into vector representations.
     """
 
-    def __init__(self, dim, frequency_embedding_size, max_period):
+    def __init__(self, hidden_dim: int, frequency_embedding_size: int = 256):
         super().__init__()
+
         self.mlp = nn.Sequential(
-            nn.Linear(frequency_embedding_size, dim),
+            nn.Linear(frequency_embedding_size, hidden_dim, bias=True),
             nn.SiLU(),
-            nn.Linear(dim, dim),
+            nn.Linear(hidden_dim, hidden_dim, bias=True),
         )
-        self.dim = dim
-        self.max_period = max_period
-        assert dim % 2 == 0, 'dim must be even.'
-
-        with torch.autocast('cuda', enabled=False):
-            self.freqs = nn.Buffer(
-                1.0 / (10000**(torch.arange(0, frequency_embedding_size, 2, dtype=torch.float32) /
-                               frequency_embedding_size)),
-                persistent=False)
-        freq_scale = 10000 / max_period
-        self.freqs = freq_scale * self.freqs
+        self.frequency_embedding_size = frequency_embedding_size
+        half_dim = self.frequency_embedding_size // 2
+        freqs = torch.exp(
+            -math.log(10000) * torch.arange(start=0, end=half_dim, dtype=torch.float32) /
+            half_dim
+        )
+        self.register_buffer('freqs', freqs)
 
-    def timestep_embedding(self, t):
-        """
-        Create sinusoidal timestep embeddings.
-        :return: an (N, D) Tensor of positional embeddings.
-        """
-        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
-
-        args = t[:, None].float() * self.freqs[None]
-        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
-        return embedding
 
     def forward(self, t):
-        t_freq = self.timestep_embedding(t).to(t.dtype)
-        t_emb = self.mlp(t_freq)
-        return t_emb
+        t_freq = t.unsqueeze(-1) * self.freqs.unsqueeze(0)
+        t_embed = torch.cat([t_freq.sin(), t_freq.cos()], dim=-1)
+        t_embed = self.mlp(t_embed.to(t.dtype))
+        return t_embed
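The rewrite keeps the same sinusoidal scheme: with `half_dim = frequency_embedding_size // 2`, the new `exp(-log(10000) * i / half_dim)` equals the old `10000**(-2i / frequency_embedding_size)` term for term, so only the fixed `max_period` of 10000, the swapped sin/cos concatenation order, and the buffer registration change (note that `register_buffer` is persistent by default, so `freqs` will now appear in `state_dict`). A short usage sketch of the new module; `hidden_dim=512` and the batch of random timesteps are illustrative values, not from the commit:

```python
# Illustrative usage of the rewritten TimestepEmbedder; hidden_dim=512 and
# the batch size are assumed values, not taken from this commit.
import torch
from mmaudio.model.embeddings import TimestepEmbedder

embedder = TimestepEmbedder(hidden_dim=512, frequency_embedding_size=256)
t = torch.rand(8)  # 8 scalar timesteps, one per batch element
emb = embedder(t)  # sin/cos features (8, 256) -> MLP -> (8, 512)
print(emb.shape)   # torch.Size([8, 512])
```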
requirements.txt
CHANGED
@@ -1,7 +1,7 @@
 numpy==1.24.3
-torch==2.
-torchvision==0.
-torchaudio==2.
+torch==2.6.0
+torchvision==0.21.0
+torchaudio==2.6.0
 torchdiffeq==0.2.3
 omegaconf>=2.3.0
 huggingface_hub>=0.26.1
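Because the same pins now live in both the Dockerfile and requirements.txt, drift between the two files is the main maintenance risk. A small sketch that compares the installed distributions against the new pins (the pin list is copied from this commit; the script itself is illustrative, not part of it):

```python
# Illustrative drift check, not part of this commit: compare installed
# package versions against the pins introduced here.
from importlib.metadata import PackageNotFoundError, version

PINS = {"torch": "2.6.0", "torchvision": "0.21.0", "torchaudio": "2.6.0"}

for pkg, pinned in PINS.items():
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        installed = "missing"
    # cu118 wheels report a local suffix such as 2.6.0+cu118, so compare
    # only the public part of the version string.
    ok = installed.split("+")[0] == pinned
    print(f"{pkg}: pinned {pinned}, installed {installed} ({'OK' if ok else 'MISMATCH'})")
```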