Phil Sobrepena committed
Commit d07a8ac · 1 parent: 2fd3693

updates to app and reqs

Files changed (3)
  1. Dockerfile +0 -58
  2. app.py +75 -76
  3. requirements.txt +27 -25
Dockerfile DELETED
@@ -1,58 +0,0 @@
-FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
-
-# # Clone MMAudio
-# RUN git clone https://huggingface.co/autophil/MMAudio_SS
-
-WORKDIR /code/MMAudio_SS
-
-# Install system dependencies
-RUN apt-get update && apt-get install -y \
-    python3.10 \
-    python3.10-distutils \
-    python3-pip \
-    git \
-    ffmpeg \
-    libsm6 \
-    libxext6 \
-    curl \
-    libsndfile1 \
-    && rm -rf /var/lib/apt/lists/*
-
-# # Ensure we're using Python 3.10
-# RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
-
-# # Install pip for Python 3.10 and upgrade it
-# RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \
-#     pip3 install --no-cache-dir --upgrade pip setuptools wheel
-
-# Install Requirements
-RUN pip3 install --no-cache-dir -r requirements.txt
-
-# Install PyTorch and related packages first (as recommended in README)
-RUN pip3 install --no-cache-dir \
-    torch \
-    torchvision \
-    torchaudio \
-    --index-url https://download.pytorch.org/whl/cu118 --upgrade
-
-# Install MMAudio last (as recommended in README)
-RUN pip3 --no-cache-dir install -e .
-
-
-# Create output directory
-RUN mkdir -p output/gradio && chmod 777 output/gradio
-
-# # Copy app.py (we'll use our own version instead of the one from the repo)
-# COPY app.py .
-
-# Set environment variables for Hugging Face Spaces
-ENV PYTHONUNBUFFERED=1
-ENV GRADIO_SERVER_NAME=0.0.0.0
-ENV GRADIO_SERVER_PORT=7860
-ENV PYTHONPATH=/code/MMAudio
-
-# Expose Gradio port
-EXPOSE 7860
-
-# Run the app
-CMD ["python3", "app.py"]
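With the Dockerfile deleted, the Space presumably falls back to the stock Gradio runtime, so the editable install of MMAudio can no longer happen at image-build time. The new app.py compensates with an import guard at startup; a minimal sketch of that pattern, assuming the working directory is the repo root containing the package's setup.py/pyproject.toml:

```python
import os

# Fall back to installing the local package on first boot, then retry the import.
try:
    import mmaudio
except ImportError:
    os.system("pip install -e .")  # assumes CWD is the repo root
    import mmaudio
```

`subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])` would be a sturdier spelling, since `os.system` silently ignores a non-zero exit status and may invoke a different interpreter's pip.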
app.py CHANGED
@@ -1,13 +1,18 @@
-import gc
+import spaces
 import logging
-from argparse import ArgumentParser
 from datetime import datetime
-from fractions import Fraction
 from pathlib import Path
 
 import gradio as gr
 import torch
 import torchaudio
+import os
+
+try:
+    import mmaudio
+except ImportError:
+    os.system("pip install -e .")
+    import mmaudio
 
 from mmaudio.eval_utils import (ModelConfig, VideoInfo, all_model_cfg, generate, load_image,
                                 load_video, make_video, setup_eval_logging)
@@ -21,13 +26,7 @@ torch.backends.cudnn.allow_tf32 = True
 
 log = logging.getLogger()
 
-device = 'cpu'
-if torch.cuda.is_available():
-    device = 'cuda'
-elif torch.backends.mps.is_available():
-    device = 'mps'
-else:
-    log.warning('CUDA/MPS are not available, running on CPU')
+device = 'cuda'
 dtype = torch.bfloat16
 
 model: ModelConfig = all_model_cfg['large_44k_v2']
@@ -57,7 +56,7 @@ def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
 
 net, feature_utils, seq_cfg = get_model()
 
-
+@spaces.GPU(duration=120)
 @torch.inference_mode()
 def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                    cfg_strength: float, duration: float):
@@ -88,14 +87,15 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
                       cfg_strength=cfg_strength)
     audio = audios.float().cpu()[0]
 
-    current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
-    output_dir.mkdir(exist_ok=True, parents=True)
-    video_save_path = output_dir / f'{current_time_string}.mp4'
+    # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
+    # output_dir.mkdir(exist_ok=True, parents=True)
+    # video_save_path = output_dir / f'{current_time_string}.mp4'
+    video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
     make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
-    gc.collect()
+    log.info(f'Saved video to {video_save_path}')
     return video_save_path
 
-
+@spaces.GPU(duration=120)
 @torch.inference_mode()
 def image_to_audio(image: gr.Image, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                    cfg_strength: float, duration: float):
@@ -126,46 +126,47 @@ def image_to_audio(image: gr.Image, prompt: str, negative_prompt: str, seed: int
                       image_input=True)
     audio = audios.float().cpu()[0]
 
-    current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
-    output_dir.mkdir(exist_ok=True, parents=True)
-    video_save_path = output_dir / f'{current_time_string}.mp4'
+    # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
+    # output_dir.mkdir(exist_ok=True, parents=True)
+    # video_save_path = output_dir / f'{current_time_string}.mp4'
     video_info = VideoInfo.from_image_info(image_info, duration, fps=Fraction(1))
+    video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
     make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
-    gc.collect()
+    log.info(f'Saved video to {video_save_path}')
    return video_save_path
 
-
-@torch.inference_mode()
-def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
-                  duration: float):
-
-    rng = torch.Generator(device=device)
-    if seed >= 0:
-        rng.manual_seed(seed)
-    else:
-        rng.seed()
-    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
-
-    clip_frames = sync_frames = None
-    seq_cfg.duration = duration
-    net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
-
-    audios = generate(clip_frames,
-                      sync_frames, [prompt],
-                      negative_text=[negative_prompt],
-                      feature_utils=feature_utils,
-                      net=net,
-                      fm=fm,
-                      rng=rng,
-                      cfg_strength=cfg_strength)
-    audio = audios.float().cpu()[0]
-
-    current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
-    output_dir.mkdir(exist_ok=True, parents=True)
-    audio_save_path = output_dir / f'{current_time_string}.flac'
-    torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
-    gc.collect()
-    return audio_save_path
+# @spaces.GPU(duration=120)
+# @torch.inference_mode()
+# def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
+#                   duration: float):
+
+#     rng = torch.Generator(device=device)
+#     if seed >= 0:
+#         rng.manual_seed(seed)
+#     else:
+#         rng.seed()
+#     fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
+
+#     clip_frames = sync_frames = None
+#     seq_cfg.duration = duration
+#     net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
+
+#     audios = generate(clip_frames,
+#                       sync_frames, [prompt],
+#                       negative_text=[negative_prompt],
+#                       feature_utils=feature_utils,
+#                       net=net,
+#                       fm=fm,
+#                       rng=rng,
+#                       cfg_strength=cfg_strength)
+#     audio = audios.float().cpu()[0]
+
+#     current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
+#     output_dir.mkdir(exist_ok=True, parents=True)
+#     audio_save_path = output_dir / f'{current_time_string}.flac'
+#     torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
+#     gc.collect()
+#     return audio_save_path
 
 
 video_to_audio_tab = gr.Interface(
@@ -188,22 +189,22 @@ video_to_audio_tab = gr.Interface(
     title='Sonisphere - Sonic Branding Tool',
 )
 
-text_to_audio_tab = gr.Interface(
-    fn=text_to_audio,
-    description=""" Text-to-Audio
-    """,
-    inputs=[
-        gr.Text(label='Prompt'),
-        gr.Text(label='Negative prompt'),
-        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
-        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
-        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
-        gr.Number(label='Duration (sec)', value=8, minimum=1),
-    ],
-    outputs='audio',
-    cache_examples=False,
-    title='Sonisphere - Sonic Branding Tool',
-)
+# text_to_audio_tab = gr.Interface(
+#     fn=text_to_audio,
+#     description=""" Text-to-Audio
+#     """,
+#     inputs=[
+#         gr.Text(label='Prompt'),
+#         gr.Text(label='Negative prompt'),
+#         gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
+#         gr.Number(label='Num steps', value=25, precision=0, minimum=1),
+#         gr.Number(label='Guidance Strength', value=4.5, minimum=1),
+#         gr.Number(label='Duration (sec)', value=8, minimum=1),
+#     ],
+#     outputs='audio',
+#     cache_examples=False,
+#     title='Sonisphere - Sonic Branding Tool',
+# )
 
 image_to_audio_tab = gr.Interface(
     fn=image_to_audio,
@@ -227,13 +228,11 @@ image_to_audio_tab = gr.Interface(
 )
 
 if __name__ == "__main__":
-    parser = ArgumentParser()
-    parser.add_argument('--port', type=int, default=7860)
-    args = parser.parse_args()
-
-    gr.TabbedInterface([video_to_audio_tab, text_to_audio_tab, image_to_audio_tab],
-                       ['Video-to-Audio', 'Text-to-Audio', 'Image-to-Audio (experimental)']).launch(
-        server_name="0.0.0.0",
-        server_port=7860,
+    # parser = ArgumentParser()
+    # parser.add_argument('--port', type=int, default=7860)
+    # args = parser.parse_args()
+
+    gr.TabbedInterface([video_to_audio_tab, image_to_audio_tab],
+                       ['Video-to-Audio', 'Image-to-Audio']).launch(
         auth=("admin", "sonisphere"),
         allowed_paths=[output_dir])
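Two notes on the new app.py. First, `@spaces.GPU(duration=120)` is the ZeroGPU hook on Hugging Face Spaces: a GPU is attached only while the decorated function runs (here for at most 120 seconds per call), which is why the CPU/MPS fallback logic could be collapsed to a hard-coded `device = 'cuda'`. A minimal sketch of the pattern:

```python
import spaces
import torch

@spaces.GPU(duration=120)  # a GPU is allocated only for the duration of this call
def infer(x: torch.Tensor) -> torch.Tensor:
    # CUDA is available inside the decorated function on ZeroGPU hardware
    return (x.to('cuda') * 2).cpu()
```

Second, as committed the file references two names whose imports are gone: both handlers call `tempfile.NamedTemporaryFile(...)` but never `import tempfile`, and `image_to_audio` still builds `fps=Fraction(1)` although `from fractions import Fraction` was removed in this same commit. Both paths would raise `NameError` at runtime; a follow-up would need something like:

```python
import tempfile                 # used by video_to_audio and image_to_audio
from fractions import Fraction  # used by VideoInfo.from_image_info(..., fps=Fraction(1))

# Writing each render to a unique temp file (as the commit does) avoids
# filename collisions between concurrent requests.
video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
```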
requirements.txt CHANGED
@@ -1,25 +1,27 @@
-numpy==1.24.3
-torch==2.6.0
-torchvision==0.21.0
-torchaudio==2.6.0
-torchdiffeq==0.2.3
-omegaconf>=2.3.0
-huggingface_hub>=0.26.1
-Pillow>=9.5
-opencv-python-headless>=4.8
-gradio>=4.19.2
-einops>=0.6
-open_clip_torch>=2.29.0
-av>=14.0.1
-transformers>=4.36.2
-ffmpeg-python>=0.2.0
-moviepy>=1.0.3
-python-multipart>=0.0.9
-colorlog==6.8.2
-pandas>=2.0.0
-tensordict>=0.2.0
-hydra-core>=1.3.2
-tqdm>=4.65.0
-librosa>=0.10.1
-timm>=0.9.12
-requests>=2.31.0
+torch == 2.4.0
+torchvision
+torchaudio
+python-dotenv
+cython
+gitpython >= 3.1
+tensorboard >= 2.11
+numpy >= 1.21, <2.1
+Pillow >= 9.5
+opencv-python >= 4.8
+scipy >= 1.7
+tqdm >= 4.66.1
+gradio >= 3.34
+einops >= 0.6
+hydra-core >= 1.3.2
+requests
+torchdiffeq
+librosa >= 0.8.1
+nitrous-ema
+safetensors
+auraloss
+hydra_colorlog
+tensordict
+colorlog
+open_clip_torch
+soundfile
+av
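The rewritten requirements drop the cu118-indexed pins from the Docker era: only `torch == 2.4.0` stays pinned (presumably because ZeroGPU supports a specific set of torch builds), while `torchvision` and `torchaudio` are left for pip to resolve against it. The `spaces` package itself does not appear, presumably because the Spaces runtime provides it. A quick post-install sanity check, as a sketch:

```python
# Verify the resolved stack after `pip install -r requirements.txt`.
import torch
import torchvision
import torchaudio

print(torch.__version__)        # expected: 2.4.0 (the only hard pin)
print(torchvision.__version__)  # resolved by pip to a torch-2.4-compatible build
print(torchaudio.__version__)
```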