Phil Sobrepena committed
Commit d07a8ac · 1 parent: 2fd3693

updates to app and reqs

Files changed (3)
  1. Dockerfile +0 -58
  2. app.py +75 -76
  3. requirements.txt +27 -25
Dockerfile DELETED
@@ -1,58 +0,0 @@
-FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
-
-# # Clone MMAudio
-# RUN git clone https://huggingface.co/autophil/MMAudio_SS
-
-WORKDIR /code/MMAudio_SS
-
-# Install system dependencies
-RUN apt-get update && apt-get install -y \
-    python3.10 \
-    python3.10-distutils \
-    python3-pip \
-    git \
-    ffmpeg \
-    libsm6 \
-    libxext6 \
-    curl \
-    libsndfile1 \
-    && rm -rf /var/lib/apt/lists/*
-
-# # Ensure we're using Python 3.10
-# RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
-
-# # Install pip for Python 3.10 and upgrade it
-# RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \
-#     pip3 install --no-cache-dir --upgrade pip setuptools wheel
-
-# Install Requirements
-RUN pip3 install --no-cache-dir -r requirements.txt
-
-# Install PyTorch and related packages first (as recommended in README)
-RUN pip3 install --no-cache-dir \
-    torch \
-    torchvision \
-    torchaudio \
-    --index-url https://download.pytorch.org/whl/cu118 --upgrade
-
-# Install MMAudio last (as recommended in README)
-RUN pip3 --no-cache-dir install -e .
-
-
-# Create output directory
-RUN mkdir -p output/gradio && chmod 777 output/gradio
-
-# # Copy app.py (we'll use our own version instead of the one from the repo)
-# COPY app.py .
-
-# Set environment variables for Hugging Face Spaces
-ENV PYTHONUNBUFFERED=1
-ENV GRADIO_SERVER_NAME=0.0.0.0
-ENV GRADIO_SERVER_PORT=7860
-ENV PYTHONPATH=/code/MMAudio
-
-# Expose Gradio port
-EXPOSE 7860
-
-# Run the app
-CMD ["python3", "app.py"]
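With the Dockerfile deleted, the Space presumably falls back to the stock Gradio runtime, so the editable install of MMAudio can no longer happen at image-build time. The new app.py compensates with an import guard at startup; a minimal sketch of that pattern, assuming the working directory is the repo root containing the package's setup.py/pyproject.toml:

```python
import os

# Fall back to installing the local package on first boot, then retry the import.
try:
    import mmaudio
except ImportError:
    os.system("pip install -e .")  # assumes CWD is the repo root
    import mmaudio
```

`subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])` would be a sturdier spelling, since `os.system` silently ignores a non-zero exit status and may invoke a different interpreter's pip.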
app.py CHANGED
@@ -1,13 +1,18 @@
-import gc
+import spaces
 import logging
-from argparse import ArgumentParser
 from datetime import datetime
-from fractions import Fraction
 from pathlib import Path
 
 import gradio as gr
 import torch
 import torchaudio
+import os
+
+try:
+    import mmaudio
+except ImportError:
+    os.system("pip install -e .")
+    import mmaudio
 
 from mmaudio.eval_utils import (ModelConfig, VideoInfo, all_model_cfg, generate, load_image,
                                 load_video, make_video, setup_eval_logging)
@@ -21,13 +26,7 @@ torch.backends.cudnn.allow_tf32 = True
 
 log = logging.getLogger()
 
-device = 'cpu'
-if torch.cuda.is_available():
-    device = 'cuda'
-elif torch.backends.mps.is_available():
-    device = 'mps'
-else:
-    log.warning('CUDA/MPS are not available, running on CPU')
+device = 'cuda'
 dtype = torch.bfloat16
 
 model: ModelConfig = all_model_cfg['large_44k_v2']
@@ -57,7 +56,7 @@ def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
 
 net, feature_utils, seq_cfg = get_model()
 
-
+@spaces.GPU(duration=120)
 @torch.inference_mode()
 def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                    cfg_strength: float, duration: float):
@@ -88,14 +87,15 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
                       cfg_strength=cfg_strength)
     audio = audios.float().cpu()[0]
 
-    current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
-    output_dir.mkdir(exist_ok=True, parents=True)
-    video_save_path = output_dir / f'{current_time_string}.mp4'
+    # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
+    # output_dir.mkdir(exist_ok=True, parents=True)
+    # video_save_path = output_dir / f'{current_time_string}.mp4'
+    video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
     make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
-    gc.collect()
+    log.info(f'Saved video to {video_save_path}')
     return video_save_path
 
-
+@spaces.GPU(duration=120)
 @torch.inference_mode()
 def image_to_audio(image: gr.Image, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                    cfg_strength: float, duration: float):
@@ -126,46 +126,47 @@ def image_to_audio(image: gr.Image, prompt: str, negative_prompt: str, seed: int
                       image_input=True)
     audio = audios.float().cpu()[0]
 
-    current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
-    output_dir.mkdir(exist_ok=True, parents=True)
-    video_save_path = output_dir / f'{current_time_string}.mp4'
+    # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
+    # output_dir.mkdir(exist_ok=True, parents=True)
+    # video_save_path = output_dir / f'{current_time_string}.mp4'
     video_info = VideoInfo.from_image_info(image_info, duration, fps=Fraction(1))
+    video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
     make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
-    gc.collect()
+    log.info(f'Saved video to {video_save_path}')
    return video_save_path
 
-
-@torch.inference_mode()
-def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
-                  duration: float):
-
-    rng = torch.Generator(device=device)
-    if seed >= 0:
-        rng.manual_seed(seed)
-    else:
-        rng.seed()
-    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
-
-    clip_frames = sync_frames = None
-    seq_cfg.duration = duration
-    net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
-
-    audios = generate(clip_frames,
-                      sync_frames, [prompt],
-                      negative_text=[negative_prompt],
-                      feature_utils=feature_utils,
-                      net=net,
-                      fm=fm,
-                      rng=rng,
-                      cfg_strength=cfg_strength)
-    audio = audios.float().cpu()[0]
-
-    current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
-    output_dir.mkdir(exist_ok=True, parents=True)
-    audio_save_path = output_dir / f'{current_time_string}.flac'
-    torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
-    gc.collect()
-    return audio_save_path
+# @spaces.GPU(duration=120)
+# @torch.inference_mode()
+# def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
+#                   duration: float):
+
+#     rng = torch.Generator(device=device)
+#     if seed >= 0:
+#         rng.manual_seed(seed)
+#     else:
+#         rng.seed()
+#     fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
+
+#     clip_frames = sync_frames = None
+#     seq_cfg.duration = duration
+#     net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
+
+#     audios = generate(clip_frames,
+#                       sync_frames, [prompt],
+#                       negative_text=[negative_prompt],
+#                       feature_utils=feature_utils,
+#                       net=net,
+#                       fm=fm,
+#                       rng=rng,
+#                       cfg_strength=cfg_strength)
+#     audio = audios.float().cpu()[0]
+
+#     current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
+#     output_dir.mkdir(exist_ok=True, parents=True)
+#     audio_save_path = output_dir / f'{current_time_string}.flac'
+#     torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
+#     gc.collect()
+#     return audio_save_path
 
 
 video_to_audio_tab = gr.Interface(
@@ -188,22 +189,22 @@ video_to_audio_tab = gr.Interface(
     title='Sonisphere - Sonic Branding Tool',
 )
 
-text_to_audio_tab = gr.Interface(
-    fn=text_to_audio,
-    description=""" Text-to-Audio
-    """,
-    inputs=[
-        gr.Text(label='Prompt'),
-        gr.Text(label='Negative prompt'),
-        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
-        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
-        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
-        gr.Number(label='Duration (sec)', value=8, minimum=1),
-    ],
-    outputs='audio',
-    cache_examples=False,
-    title='Sonisphere - Sonic Branding Tool',
-)
+# text_to_audio_tab = gr.Interface(
+#     fn=text_to_audio,
+#     description=""" Text-to-Audio
+#     """,
+#     inputs=[
+#         gr.Text(label='Prompt'),
+#         gr.Text(label='Negative prompt'),
+#         gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
+#         gr.Number(label='Num steps', value=25, precision=0, minimum=1),
+#         gr.Number(label='Guidance Strength', value=4.5, minimum=1),
+#         gr.Number(label='Duration (sec)', value=8, minimum=1),
+#     ],
+#     outputs='audio',
+#     cache_examples=False,
+#     title='Sonisphere - Sonic Branding Tool',
+# )
 
 image_to_audio_tab = gr.Interface(
     fn=image_to_audio,
@@ -227,13 +228,11 @@ image_to_audio_tab = gr.Interface(
 )
 
 if __name__ == "__main__":
-    parser = ArgumentParser()
-    parser.add_argument('--port', type=int, default=7860)
-    args = parser.parse_args()
-
-    gr.TabbedInterface([video_to_audio_tab, text_to_audio_tab, image_to_audio_tab],
-                       ['Video-to-Audio', 'Text-to-Audio', 'Image-to-Audio (experimental)']).launch(
-        server_name="0.0.0.0",
-        server_port=7860,
+    # parser = ArgumentParser()
+    # parser.add_argument('--port', type=int, default=7860)
+    # args = parser.parse_args()
+
+    gr.TabbedInterface([video_to_audio_tab, image_to_audio_tab],
+                       ['Video-to-Audio', 'Image-to-Audio']).launch(
         auth=("admin", "sonisphere"),
         allowed_paths=[output_dir])
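Two notes on the new app.py. First, `@spaces.GPU(duration=120)` is the ZeroGPU hook on Hugging Face Spaces: a GPU is attached only while the decorated function runs (here for at most 120 seconds per call), which is why the CPU/MPS fallback logic could be collapsed to a hard-coded `device = 'cuda'`. A minimal sketch of the pattern:

```python
import spaces
import torch

@spaces.GPU(duration=120)  # a GPU is allocated only for the duration of this call
def infer(x: torch.Tensor) -> torch.Tensor:
    # CUDA is available inside the decorated function on ZeroGPU hardware
    return (x.to('cuda') * 2).cpu()
```

Second, as committed the file references two names whose imports are gone: both handlers call `tempfile.NamedTemporaryFile(...)` but never `import tempfile`, and `image_to_audio` still builds `fps=Fraction(1)` although `from fractions import Fraction` was removed in this same commit. Both paths would raise `NameError` at runtime; a follow-up would need something like:

```python
import tempfile                 # used by video_to_audio and image_to_audio
from fractions import Fraction  # used by VideoInfo.from_image_info(..., fps=Fraction(1))

# Writing each render to a unique temp file (as the commit does) avoids
# filename collisions between concurrent requests.
video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
```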
requirements.txt CHANGED
@@ -1,25 +1,27 @@
-numpy==1.24.3
-torch==2.6.0
-torchvision==0.21.0
-torchaudio==2.6.0
-torchdiffeq==0.2.3
-omegaconf>=2.3.0
-huggingface_hub>=0.26.1
-Pillow>=9.5
-opencv-python-headless>=4.8
-gradio>=4.19.2
-einops>=0.6
-open_clip_torch>=2.29.0
-av>=14.0.1
-transformers>=4.36.2
-ffmpeg-python>=0.2.0
-moviepy>=1.0.3
-python-multipart>=0.0.9
-colorlog==6.8.2
-pandas>=2.0.0
-tensordict>=0.2.0
-hydra-core>=1.3.2
-tqdm>=4.65.0
-librosa>=0.10.1
-timm>=0.9.12
-requests>=2.31.0
+torch == 2.4.0
+torchvision
+torchaudio
+python-dotenv
+cython
+gitpython >= 3.1
+tensorboard >= 2.11
+numpy >= 1.21, <2.1
+Pillow >= 9.5
+opencv-python >= 4.8
+scipy >= 1.7
+tqdm >= 4.66.1
+gradio >= 3.34
+einops >= 0.6
+hydra-core >= 1.3.2
+requests
+torchdiffeq
+librosa >= 0.8.1
+nitrous-ema
+safetensors
+auraloss
+hydra_colorlog
+tensordict
+colorlog
+open_clip_torch
+soundfile
+av
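The rewritten requirements drop the cu118-indexed pins from the Docker era: only `torch == 2.4.0` stays pinned (presumably because ZeroGPU supports a specific set of torch builds), while `torchvision` and `torchaudio` are left for pip to resolve against it. The `spaces` package itself does not appear, presumably because the Spaces runtime provides it. A quick post-install sanity check, as a sketch:

```python
# Verify the resolved stack after `pip install -r requirements.txt`.
import torch
import torchvision
import torchaudio

print(torch.__version__)        # expected: 2.4.0 (the only hard pin)
print(torchvision.__version__)  # resolved by pip to a torch-2.4-compatible build
print(torchaudio.__version__)
```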