Phil Sobrepena committed
Commit 31bd90e · 1 Parent(s): c19a618
Files changed (3):
  1. Dockerfile +3 -3
  2. mmaudio/model/embeddings.py +16 -31
  3. requirements.txt +3 -3
Dockerfile CHANGED
@@ -27,9 +27,9 @@ RUN pip3 install --no-cache-dir numpy==1.24.3
 
 # Install PyTorch and related packages first (as recommended in README)
 RUN pip3 install --no-cache-dir \
-    torch==2.1.2 \
-    torchvision==0.16.2 \
-    torchaudio==2.1.2 \
+    torch==2.6.0 \
+    torchvision==0.21.0 \
+    torchaudio==2.6.0 \
     --index-url https://download.pytorch.org/whl/cu118
 
 # Install core dependencies
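
A quick way to confirm a rebuilt image actually picked up the new pins is a version probe at container start. This is a minimal sketch, not part of the commit; the script name is hypothetical, and CUDA availability additionally depends on the host driver, not just the cu118 wheels:

    # check_torch.py -- hypothetical sanity check, not part of this commit
    import torch
    import torchvision
    import torchaudio

    # Wheels from the cu118 index report versions like "2.6.0+cu118".
    assert torch.__version__.startswith("2.6.0"), torch.__version__
    assert torchvision.__version__.startswith("0.21.0"), torchvision.__version__
    assert torchaudio.__version__.startswith("2.6.0"), torchaudio.__version__

    # True only when a compatible NVIDIA driver is visible inside the container.
    print("CUDA available:", torch.cuda.is_available())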
mmaudio/model/embeddings.py CHANGED
@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+import math
 
 # https://github.com/facebookresearch/DiT
 
@@ -9,41 +10,25 @@ class TimestepEmbedder(nn.Module):
     Embeds scalar timesteps into vector representations.
     """
 
-    def __init__(self, dim, frequency_embedding_size, max_period):
+    def __init__(self, hidden_dim: int, frequency_embedding_size: int = 256):
         super().__init__()
+
         self.mlp = nn.Sequential(
-            nn.Linear(frequency_embedding_size, dim),
+            nn.Linear(frequency_embedding_size, hidden_dim, bias=True),
             nn.SiLU(),
-            nn.Linear(dim, dim),
+            nn.Linear(hidden_dim, hidden_dim, bias=True),
         )
-        self.dim = dim
-        self.max_period = max_period
-        assert dim % 2 == 0, 'dim must be even.'
 
-        with torch.autocast('cuda', enabled=False):
-            self.freqs = nn.Buffer(
-                1.0 / (10000**(torch.arange(0, frequency_embedding_size, 2, dtype=torch.float32) /
-                               frequency_embedding_size)),
-                persistent=False)
-        freq_scale = 10000 / max_period
-        self.freqs = freq_scale * self.freqs
-
-    def timestep_embedding(self, t):
-        """
-        Create sinusoidal timestep embeddings.
-        :param t: a 1-D Tensor of N indices, one per batch element.
-                  These may be fractional.
-        :param dim: the dimension of the output.
-        :param max_period: controls the minimum frequency of the embeddings.
-        :return: an (N, D) Tensor of positional embeddings.
-        """
-        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
-
-        args = t[:, None].float() * self.freqs[None]
-        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
-        return embedding
+        self.frequency_embedding_size = frequency_embedding_size
+        half_dim = self.frequency_embedding_size // 2
+        freqs = torch.exp(
+            -math.log(10000) * torch.arange(start=0, end=half_dim, dtype=torch.float32) /
+            half_dim
+        )
+        self.register_buffer('freqs', freqs)
 
     def forward(self, t):
-        t_freq = self.timestep_embedding(t).to(t.dtype)
-        t_emb = self.mlp(t_freq)
-        return t_emb
+        t_freq = t.unsqueeze(-1) * self.freqs.unsqueeze(0)
+        t_embed = torch.cat([t_freq.sin(), t_freq.cos()], dim=-1)
+        t_embed = self.mlp(t_embed.to(t.dtype))
+        return t_embed
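
The rewrite folds the separate timestep_embedding helper and the autocast-guarded nn.Buffer into a single register_buffer call, fixing the frequency base at 10000 and dropping the configurable max_period. For a sense of the shapes involved, a minimal usage sketch; hidden_dim=512 and the random timesteps are illustrative, not values from the repo:

    import torch
    from mmaudio.model.embeddings import TimestepEmbedder

    embedder = TimestepEmbedder(hidden_dim=512)  # frequency_embedding_size defaults to 256

    t = torch.rand(4)   # one (possibly fractional) timestep per batch element
    emb = embedder(t)   # (4, 256) sinusoidal features -> MLP -> (4, 512)
    print(emb.shape)    # torch.Size([4, 512])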
requirements.txt CHANGED
@@ -1,7 +1,7 @@
 numpy==1.24.3
-torch==2.1.2
-torchvision==0.16.2
-torchaudio==2.1.2
+torch==2.6.0
+torchvision==0.21.0
+torchaudio==2.6.0
 torchdiffeq==0.2.3
 omegaconf>=2.3.0
 huggingface_hub>=0.26.1
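
The same three pins now live in both the Dockerfile and requirements.txt, so drift between an installed environment and this file is easy to check. A standard-library sketch, not part of the commit; the script name is hypothetical, and it assumes exact pins use "==":

    # verify_pins.py -- hypothetical helper, not part of this commit
    from importlib.metadata import version

    with open("requirements.txt") as f:
        for line in f:
            line = line.strip()
            if "==" not in line:
                continue  # skip range constraints such as omegaconf>=2.3.0
            name, pinned = line.split("==")
            installed = version(name)
            ok = installed.startswith(pinned)  # tolerate local tags like "+cu118"
            print(f"{name}: pinned {pinned}, installed {installed}"
                  + ("" if ok else "  <-- MISMATCH"))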