Phil Sobrepena committed on
Commit 2c4e2b0 · 1 Parent(s): 977df40
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitignore +4 -6
  2. app.py +24 -103
  3. batch_eval.py +0 -110
  4. config/__init__.py +0 -0
  5. config/base_config.yaml +0 -62
  6. config/data/base.yaml +0 -70
  7. config/eval_config.yaml +0 -17
  8. config/eval_data/base.yaml +0 -22
  9. config/hydra/job_logging/custom-eval.yaml +0 -32
  10. config/hydra/job_logging/custom-no-rank.yaml +0 -32
  11. config/hydra/job_logging/custom-simplest.yaml +0 -26
  12. config/hydra/job_logging/custom.yaml +0 -33
  13. config/train_config.yaml +0 -41
  14. demo.py +1 -7
  15. docs/EVAL.md +0 -22
  16. docs/MODELS.md +0 -50
  17. docs/TRAINING.md +0 -160
  18. docs/index.html +10 -12
  19. gitattributes +0 -35
  20. mmaudio/data/av_utils.py +0 -26
  21. mmaudio/data/data_setup.py +0 -174
  22. mmaudio/data/eval/__init__.py +0 -0
  23. mmaudio/data/eval/audiocaps.py +0 -39
  24. mmaudio/data/eval/moviegen.py +0 -131
  25. mmaudio/data/eval/video_dataset.py +0 -197
  26. mmaudio/data/extracted_audio.py +0 -88
  27. mmaudio/data/extracted_vgg.py +0 -101
  28. mmaudio/data/extraction/__init__.py +0 -0
  29. mmaudio/data/extraction/vgg_sound.py +0 -193
  30. mmaudio/data/extraction/wav_dataset.py +0 -132
  31. mmaudio/data/mm_dataset.py +0 -45
  32. mmaudio/data/utils.py +0 -148
  33. mmaudio/eval_utils.py +9 -47
  34. mmaudio/ext/autoencoder/autoencoder.py +1 -1
  35. mmaudio/ext/autoencoder/vae.py +4 -0
  36. mmaudio/ext/mel_converter.py +9 -33
  37. mmaudio/model/embeddings.py +1 -1
  38. mmaudio/model/flow_matching.py +18 -1
  39. mmaudio/model/networks.py +1 -1
  40. mmaudio/model/transformer_layers.py +1 -0
  41. mmaudio/model/utils/features_utils.py +2 -2
  42. mmaudio/runner.py +0 -609
  43. mmaudio/sample.py +0 -90
  44. mmaudio/utils/email_utils.py +0 -50
  45. mmaudio/utils/log_integrator.py +0 -112
  46. mmaudio/utils/logger.py +0 -231
  47. mmaudio/utils/synthesize_ema.py +0 -19
  48. mmaudio/utils/tensor_utils.py +0 -14
  49. mmaudio/utils/time_estimator.py +0 -72
  50. mmaudio/utils/timezone.py +0 -1
.gitignore CHANGED
@@ -2,18 +2,16 @@ run_*.sh
 log/
 saves
 saves/
-weights/
-weights
 output/
 output
 pretrained/
 workspace
 workspace/
-ext_weights/
-ext_weights
 .checkpoints/
-.vscode/
-training/example_output/
+weights/
+ext_weights/
+*.pth
+*.pt
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
app.py CHANGED
@@ -14,12 +14,13 @@ except ImportError:
     os.system("pip install -e .")
     import mmaudio
 
-from mmaudio.eval_utils import (ModelConfig, VideoInfo, all_model_cfg, generate, load_image,
-                                load_video, make_video, setup_eval_logging)
+from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
+                                setup_eval_logging)
 from mmaudio.model.flow_matching import FlowMatching
 from mmaudio.model.networks import MMAudio, get_my_mmaudio
 from mmaudio.model.sequence_config import SequenceConfig
 from mmaudio.model.utils.features_utils import FeaturesUtils
+import tempfile
 
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True
@@ -56,6 +57,7 @@ def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
 
 net, feature_utils, seq_cfg = get_model()
 
+
 @spaces.GPU(duration=120)
 @torch.inference_mode()
 def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
@@ -88,17 +90,18 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
     audio = audios.float().cpu()[0]
 
     # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
+    video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
     # output_dir.mkdir(exist_ok=True, parents=True)
     # video_save_path = output_dir / f'{current_time_string}.mp4'
-    video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
     make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
     log.info(f'Saved video to {video_save_path}')
     return video_save_path
 
+
 @spaces.GPU(duration=120)
 @torch.inference_mode()
-def image_to_audio(image: gr.Image, prompt: str, negative_prompt: str, seed: int, num_steps: int,
-                   cfg_strength: float, duration: float):
+def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
+                  duration: float):
 
     rng = torch.Generator(device=device)
     if seed >= 0:
@@ -107,11 +110,7 @@ def image_to_audio(image: gr.Image, prompt: str, negative_prompt: str, seed: int
         rng.seed()
     fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
 
-    image_info = load_image(image)
-    clip_frames = image_info.clip_frames
-    sync_frames = image_info.sync_frames
-    clip_frames = clip_frames.unsqueeze(0)
-    sync_frames = sync_frames.unsqueeze(0)
+    clip_frames = sync_frames = None
    seq_cfg.duration = duration
     net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
 
@@ -122,61 +121,24 @@ def image_to_audio(image: gr.Image, prompt: str, negative_prompt: str, seed: int
                       net=net,
                       fm=fm,
                       rng=rng,
-                      cfg_strength=cfg_strength,
-                      image_input=True)
+                      cfg_strength=cfg_strength)
     audio = audios.float().cpu()[0]
 
-    # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
-    # output_dir.mkdir(exist_ok=True, parents=True)
-    # video_save_path = output_dir / f'{current_time_string}.mp4'
-    video_info = VideoInfo.from_image_info(image_info, duration, fps=Fraction(1))
-    video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
-    make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
-    log.info(f'Saved video to {video_save_path}')
-    return video_save_path
-
-# @spaces.GPU(duration=120)
-# @torch.inference_mode()
-# def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
-#                   duration: float):
-
-#     rng = torch.Generator(device=device)
-#     if seed >= 0:
-#         rng.manual_seed(seed)
-#     else:
-#         rng.seed()
-#     fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
-
-#     clip_frames = sync_frames = None
-#     seq_cfg.duration = duration
-#     net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
-
-#     audios = generate(clip_frames,
-#                       sync_frames, [prompt],
-#                       negative_text=[negative_prompt],
-#                       feature_utils=feature_utils,
-#                       net=net,
-#                       fm=fm,
-#                       rng=rng,
-#                       cfg_strength=cfg_strength)
-#     audio = audios.float().cpu()[0]
-
-#     current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
-#     output_dir.mkdir(exist_ok=True, parents=True)
-#     audio_save_path = output_dir / f'{current_time_string}.flac'
-#     torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
-#     gc.collect()
-#     return audio_save_path
+    audio_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.flac').name
+    torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
+    log.info(f'Saved audio to {audio_save_path}')
+    return audio_save_path
 
 
 video_to_audio_tab = gr.Interface(
     fn=video_to_audio,
-    description=""" Video-to-Audio
+    description="""
+    Sonisphere
+    Video-to-Audio
     NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side).
     Doing so does not improve results.
 
-    The model has been trained on 8-second videos.
-    Using much longer or shorter videos will degrade performance. Around 5s~12s should be fine.
+    The model has been trained on 8-second videos. Using much longer or shorter videos will degrade performance. Around 5s~12s should be fine.
     """,
     inputs=[
         gr.Video(),
@@ -189,52 +151,11 @@ video_to_audio_tab = gr.Interface(
     ],
     outputs='playable_video',
     cache_examples=False,
-    title='Sonisphere - Sonic Branding Tool',
-)
-
-# text_to_audio_tab = gr.Interface(
-#     fn=text_to_audio,
-#     description=""" Text-to-Audio
-#     """,
-#     inputs=[
-#         gr.Text(label='Prompt'),
-#         gr.Text(label='Negative prompt'),
-#         gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
-#         gr.Number(label='Num steps', value=25, precision=0, minimum=1),
-#         gr.Number(label='Guidance Strength', value=4.5, minimum=1),
-#         gr.Number(label='Duration (sec)', value=8, minimum=1),
-#     ],
-#     outputs='audio',
-#     cache_examples=False,
-#     title='Sonisphere - Sonic Branding Tool',
-# )
-
-image_to_audio_tab = gr.Interface(
-    fn=image_to_audio,
-    description="""
-    Image-to-Audio
-    NOTE: It takes longer to process high-resolution images (>384 px on the shorter side).
-    Doing so does not improve results.
-    """,
-    inputs=[
-        gr.Image(type='filepath'),
-        gr.Text(label='Prompt'),
-        gr.Text(label='Negative prompt'),
-        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
-        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
-        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
-        gr.Number(label='Duration (sec)', value=8, minimum=1),
-    ],
-    outputs='playable_video',
-    cache_examples=False,
-    title='Image-to-Audio Synthesis (experimental)',
-)
+    title='MMAudio — Video-to-Audio Synthesis',
+    examples=[
+    ])
 
-if __name__ == "__main__":
-    # parser = ArgumentParser()
-    # parser.add_argument('--port', type=int, default=7860)
-    # args = parser.parse_args()
-
-    gr.TabbedInterface([video_to_audio_tab, image_to_audio_tab],
-                       ['Video-to-Audio', 'Image-to-Audio']).launch(
-                           allowed_paths=[output_dir])
+
+if __name__ == "__main__":
+    gr.TabbedInterface([video_to_audio_tab],
+                       ['Video-to-Audio']).launch(allowed_paths=[output_dir])
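The updated handlers above return a path to a `tempfile.NamedTemporaryFile` instead of writing into `output_dir`. A minimal, self-contained sketch of that output pattern (the sine-wave tensor and the 16 kHz rate are stand-ins for the model output, and it assumes a torchaudio backend with FLAC support):

```python
import tempfile

import torch
import torchaudio

# Stand-in for the generated waveform: 1 second of a 440 Hz tone at 16 kHz,
# shaped (channels, samples) as torchaudio.save expects.
sampling_rate = 16000
t = torch.arange(sampling_rate) / sampling_rate
audio = torch.sin(2 * torch.pi * 440 * t).unsqueeze(0)

# delete=False keeps the file on disk after the handle is closed, so Gradio can
# still serve the path that the handler returns.
audio_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.flac').name
torchaudio.save(audio_save_path, audio, sampling_rate)
print(f'Saved audio to {audio_save_path}')
```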
batch_eval.py DELETED
@@ -1,110 +0,0 @@
1
- import logging
2
- import os
3
- from pathlib import Path
4
-
5
- import hydra
6
- import torch
7
- import torch.distributed as distributed
8
- import torchaudio
9
- from hydra.core.hydra_config import HydraConfig
10
- from omegaconf import DictConfig
11
- from tqdm import tqdm
12
-
13
- from mmaudio.data.data_setup import setup_eval_dataset
14
- from mmaudio.eval_utils import ModelConfig, all_model_cfg, generate
15
- from mmaudio.model.flow_matching import FlowMatching
16
- from mmaudio.model.networks import MMAudio, get_my_mmaudio
17
- from mmaudio.model.utils.features_utils import FeaturesUtils
18
-
19
- torch.backends.cuda.matmul.allow_tf32 = True
20
- torch.backends.cudnn.allow_tf32 = True
21
-
22
- local_rank = int(os.environ['LOCAL_RANK'])
23
- world_size = int(os.environ['WORLD_SIZE'])
24
- log = logging.getLogger()
25
-
26
-
27
- @torch.inference_mode()
28
- @hydra.main(version_base='1.3.2', config_path='config', config_name='eval_config.yaml')
29
- def main(cfg: DictConfig):
30
- device = 'cuda'
31
- torch.cuda.set_device(local_rank)
32
-
33
- if cfg.model not in all_model_cfg:
34
- raise ValueError(f'Unknown model variant: {cfg.model}')
35
- model: ModelConfig = all_model_cfg[cfg.model]
36
- model.download_if_needed()
37
- seq_cfg = model.seq_cfg
38
-
39
- run_dir = Path(HydraConfig.get().run.dir)
40
- if cfg.output_name is None:
41
- output_dir = run_dir / cfg.dataset
42
- else:
43
- output_dir = run_dir / f'{cfg.dataset}-{cfg.output_name}'
44
- output_dir.mkdir(parents=True, exist_ok=True)
45
-
46
- # load a pretrained model
47
- seq_cfg.duration = cfg.duration_s
48
- net: MMAudio = get_my_mmaudio(cfg.model).to(device).eval()
49
- net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
50
- log.info(f'Loaded weights from {model.model_path}')
51
- net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
52
- log.info(f'Latent seq len: {seq_cfg.latent_seq_len}')
53
- log.info(f'Clip seq len: {seq_cfg.clip_seq_len}')
54
- log.info(f'Sync seq len: {seq_cfg.sync_seq_len}')
55
-
56
- # misc setup
57
- rng = torch.Generator(device=device)
58
- rng.manual_seed(cfg.seed)
59
- fm = FlowMatching(cfg.sampling.min_sigma,
60
- inference_mode=cfg.sampling.method,
61
- num_steps=cfg.sampling.num_steps)
62
-
63
- feature_utils = FeaturesUtils(tod_vae_ckpt=model.vae_path,
64
- synchformer_ckpt=model.synchformer_ckpt,
65
- enable_conditions=True,
66
- mode=model.mode,
67
- bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
68
- need_vae_encoder=False)
69
- feature_utils = feature_utils.to(device).eval()
70
-
71
- if cfg.compile:
72
- net.preprocess_conditions = torch.compile(net.preprocess_conditions)
73
- net.predict_flow = torch.compile(net.predict_flow)
74
- feature_utils.compile()
75
-
76
- dataset, loader = setup_eval_dataset(cfg.dataset, cfg)
77
-
78
- with torch.amp.autocast(enabled=cfg.amp, dtype=torch.bfloat16, device_type=device):
79
- for batch in tqdm(loader):
80
- audios = generate(batch.get('clip_video', None),
81
- batch.get('sync_video', None),
82
- batch.get('caption', None),
83
- feature_utils=feature_utils,
84
- net=net,
85
- fm=fm,
86
- rng=rng,
87
- cfg_strength=cfg.cfg_strength,
88
- clip_batch_size_multiplier=64,
89
- sync_batch_size_multiplier=64)
90
- audios = audios.float().cpu()
91
- names = batch['name']
92
- for audio, name in zip(audios, names):
93
- torchaudio.save(output_dir / f'{name}.flac', audio, seq_cfg.sampling_rate)
94
-
95
-
96
- def distributed_setup():
97
- distributed.init_process_group(backend="nccl")
98
- local_rank = distributed.get_rank()
99
- world_size = distributed.get_world_size()
100
- log.info(f'Initialized: local_rank={local_rank}, world_size={world_size}')
101
- return local_rank, world_size
102
-
103
-
104
- if __name__ == '__main__':
105
- distributed_setup()
106
-
107
- main()
108
-
109
- # clean-up
110
- distributed.destroy_process_group()
config/__init__.py DELETED
File without changes
config/base_config.yaml DELETED
@@ -1,62 +0,0 @@
1
- defaults:
2
- - data: base
3
- - eval_data: base
4
- - override hydra/job_logging: custom-simplest
5
- - _self_
6
-
7
- hydra:
8
- run:
9
- dir: ./output/${exp_id}
10
- output_subdir: ${now:%Y-%m-%d_%H-%M-%S}-hydra
11
-
12
- enable_email: False
13
-
14
- model: small_16k
15
-
16
- exp_id: default
17
- debug: False
18
- cudnn_benchmark: True
19
- compile: True
20
- amp: True
21
- weights: null
22
- checkpoint: null
23
- seed: 14159265
24
- num_workers: 10 # per-GPU
25
- pin_memory: False # set to True if your system can handle it, i.e., have enough memory
26
-
27
- # NOTE: This DOSE NOT affect the model during inference in any way
28
- # they are just for the dataloader to fill in the missing data in multi-modal loading
29
- # to change the sequence length for the model, see networks.py
30
- data_dim:
31
- text_seq_len: 77
32
- clip_dim: 1024
33
- sync_dim: 768
34
- text_dim: 1024
35
-
36
- # ema configuration
37
- ema:
38
- enable: True
39
- sigma_rels: [0.05, 0.1]
40
- update_every: 1
41
- checkpoint_every: 5_000
42
- checkpoint_folder: ${hydra:run.dir}/ema_ckpts
43
- default_output_sigma: 0.05
44
-
45
-
46
- # sampling
47
- sampling:
48
- mean: 0.0
49
- scale: 1.0
50
- min_sigma: 0.0
51
- method: euler
52
- num_steps: 25
53
-
54
- # classifier-free guidance
55
- null_condition_probability: 0.1
56
- cfg_strength: 4.5
57
-
58
- # checkpoint paths to external modules
59
- vae_16k_ckpt: ./ext_weights/v1-16.pth
60
- vae_44k_ckpt: ./ext_weights/v1-44.pth
61
- bigvgan_vocoder_ckpt: ./ext_weights/best_netG.pt
62
- synchformer_ckpt: ./ext_weights/synchformer_state_dict.pth
config/data/base.yaml DELETED
@@ -1,70 +0,0 @@
1
- VGGSound:
2
- root: ../data/video
3
- subset_name: sets/vgg3-train.tsv
4
- fps: 8
5
- height: 384
6
- width: 384
7
- sample_duration_sec: 8.0
8
-
9
- VGGSound_test:
10
- root: ../data/video
11
- subset_name: sets/vgg3-test.tsv
12
- fps: 8
13
- height: 384
14
- width: 384
15
- sample_duration_sec: 8.0
16
-
17
- VGGSound_val:
18
- root: ../data/video
19
- subset_name: sets/vgg3-val.tsv
20
- fps: 8
21
- height: 384
22
- width: 384
23
- sample_duration_sec: 8.0
24
-
25
- ExtractedVGG:
26
- tsv: ../data/v1-16-memmap/vgg-train.tsv
27
- memmap_dir: ../data/v1-16-memmap/vgg-train
28
-
29
- ExtractedVGG_test:
30
- tag: test
31
- gt_cache: ../data/eval-cache/vggsound-test
32
- output_subdir: null
33
- tsv: ../data/v1-16-memmap/vgg-test.tsv
34
- memmap_dir: ../data/v1-16-memmap/vgg-test
35
-
36
- ExtractedVGG_val:
37
- tag: val
38
- gt_cache: ../data/eval-cache/vggsound-val
39
- output_subdir: val
40
- tsv: ../data/v1-16-memmap/vgg-val.tsv
41
- memmap_dir: ../data/v1-16-memmap/vgg-val
42
-
43
- AudioCaps:
44
- tsv: ../data/v1-16-memmap/audiocaps.tsv
45
- memmap_dir: ../data/v1-16-memmap/audiocaps
46
-
47
- AudioSetSL:
48
- tsv: ../data/v1-16-memmap/audioset_sl.tsv
49
- memmap_dir: ../data/v1-16-memmap/audioset_sl
50
-
51
- BBCSound:
52
- tsv: ../data/v1-16-memmap/bbcsound.tsv
53
- memmap_dir: ../data/v1-16-memmap/bbcsound
54
-
55
- FreeSound:
56
- tsv: ../data/v1-16-memmap/freesound.tsv
57
- memmap_dir: ../data/v1-16-memmap/freesound
58
-
59
- Clotho:
60
- tsv: ../data/v1-16-memmap/clotho.tsv
61
- memmap_dir: ../data/v1-16-memmap/clotho
62
-
63
- Example_video:
64
- tsv: ./training/example_output/memmap/vgg-example.tsv
65
- memmap_dir: ./training/example_output/memmap/vgg-example
66
-
67
- Example_audio:
68
- tsv: ./training/example_output/memmap/audio-example.tsv
69
- memmap_dir: ./training/example_output/memmap/audio-example
-
config/eval_config.yaml DELETED
@@ -1,17 +0,0 @@
1
- defaults:
2
- - base_config
3
- - override hydra/job_logging: custom-simplest
4
- - _self_
5
-
6
- hydra:
7
- run:
8
- dir: ./output/${exp_id}
9
- output_subdir: eval-${now:%Y-%m-%d_%H-%M-%S}-hydra
10
-
11
- exp_id: ${model}
12
- dataset: audiocaps
13
- duration_s: 8.0
14
-
15
- # for inference, this is the per-GPU batch size
16
- batch_size: 16
17
- output_name: null
config/eval_data/base.yaml DELETED
@@ -1,22 +0,0 @@
1
- AudioCaps:
2
- audio_path: ../data/AudioCaps-test-audioldm-ver
3
- # a csv file, with a header row of 'name' and 'caption'
4
- # name should match the audio file name without extension
5
- # Can be downloaded here: https://github.com/hkchengrex/MMAudio/releases/download/v0.1/AudioCaps_audioldm_data.csv
6
- csv_path: ../data/AudioCaps-test-audioldm-ver/data.csv
7
-
8
- AudioCaps_full:
9
- audio_path: ../data/AudioCaps-test-full-ver
10
- # a csv file, with a header row of 'name' and 'caption'
11
- # name should match the audio file name without extension
12
- # Can be downloaded here: https://github.com/hkchengrex/MMAudio/releases/download/v0.1/AudioCaps_full_data.csv
13
- csv_path: ../data/AudioCaps-test-full-ver/data.csv
14
-
15
- MovieGen:
16
- video_path: ../data/MovieGen/MovieGenAudioBenchSfx/video_with_audio
17
- jsonl_path: ../data/MovieGen/MovieGenAudioBenchSfx/metadata
18
-
19
- VGGSound:
20
- video_path: ../data/test-videos
21
- # from the officially released csv file
22
- csv_path: ../data/vggsound.csv
config/hydra/job_logging/custom-eval.yaml DELETED
@@ -1,32 +0,0 @@
1
- # python logging configuration for tasks
2
- version: 1
3
- formatters:
4
- simple:
5
- format: '[%(asctime)s][%(levelname)s][r${oc.env:LOCAL_RANK}] - %(message)s'
6
- datefmt: '%Y-%m-%d %H:%M:%S'
7
- colorlog:
8
- '()': 'colorlog.ColoredFormatter'
9
- format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
10
- datefmt: '%Y-%m-%d %H:%M:%S'
11
- log_colors:
12
- DEBUG: purple
13
- INFO: green
14
- WARNING: yellow
15
- ERROR: red
16
- CRITICAL: red
17
- handlers:
18
- console:
19
- class: logging.StreamHandler
20
- formatter: colorlog
21
- stream: ext://sys.stdout
22
- file:
23
- class: logging.FileHandler
24
- formatter: simple
25
- # absolute file path
26
- filename: ${hydra.runtime.output_dir}/eval-${now:%Y-%m-%d_%H-%M-%S}-rank${oc.env:LOCAL_RANK}.log
27
- mode: w
28
- root:
29
- level: INFO
30
- handlers: [console, file]
31
-
32
- disable_existing_loggers: false
config/hydra/job_logging/custom-no-rank.yaml DELETED
@@ -1,32 +0,0 @@
1
- # python logging configuration for tasks
2
- version: 1
3
- formatters:
4
- simple:
5
- format: '[%(asctime)s][%(levelname)s] - %(message)s'
6
- datefmt: '%Y-%m-%d %H:%M:%S'
7
- colorlog:
8
- '()': 'colorlog.ColoredFormatter'
9
- format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
10
- datefmt: '%Y-%m-%d %H:%M:%S'
11
- log_colors:
12
- DEBUG: purple
13
- INFO: green
14
- WARNING: yellow
15
- ERROR: red
16
- CRITICAL: red
17
- handlers:
18
- console:
19
- class: logging.StreamHandler
20
- formatter: colorlog
21
- stream: ext://sys.stdout
22
- file:
23
- class: logging.FileHandler
24
- formatter: simple
25
- # absolute file path
26
- filename: ${hydra.runtime.output_dir}/${now:%Y-%m-%d_%H-%M-%S}-eval.log
27
- mode: w
28
- root:
29
- level: INFO
30
- handlers: [console, file]
31
-
32
- disable_existing_loggers: false
config/hydra/job_logging/custom-simplest.yaml DELETED
@@ -1,26 +0,0 @@
1
- # python logging configuration for tasks
2
- version: 1
3
- formatters:
4
- simple:
5
- format: '[%(asctime)s][%(levelname)s] - %(message)s'
6
- datefmt: '%Y-%m-%d %H:%M:%S'
7
- colorlog:
8
- '()': 'colorlog.ColoredFormatter'
9
- format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
10
- datefmt: '%Y-%m-%d %H:%M:%S'
11
- log_colors:
12
- DEBUG: purple
13
- INFO: green
14
- WARNING: yellow
15
- ERROR: red
16
- CRITICAL: red
17
- handlers:
18
- console:
19
- class: logging.StreamHandler
20
- formatter: colorlog
21
- stream: ext://sys.stdout
22
- root:
23
- level: INFO
24
- handlers: [console]
25
-
26
- disable_existing_loggers: false
config/hydra/job_logging/custom.yaml DELETED
@@ -1,33 +0,0 @@
1
- # @package hydra.job_logging
2
- # python logging configuration for tasks
3
- version: 1
4
- formatters:
5
- simple:
6
- format: '[%(asctime)s][%(levelname)s][r${oc.env:LOCAL_RANK}] - %(message)s'
7
- datefmt: '%Y-%m-%d %H:%M:%S'
8
- colorlog:
9
- '()': 'colorlog.ColoredFormatter'
10
- format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)sr${oc.env:LOCAL_RANK}%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
11
- datefmt: '%Y-%m-%d %H:%M:%S'
12
- log_colors:
13
- DEBUG: purple
14
- INFO: green
15
- WARNING: yellow
16
- ERROR: red
17
- CRITICAL: red
18
- handlers:
19
- console:
20
- class: logging.StreamHandler
21
- formatter: colorlog
22
- stream: ext://sys.stdout
23
- file:
24
- class: logging.FileHandler
25
- formatter: simple
26
- # absolute file path
27
- filename: ${hydra.runtime.output_dir}/train-${now:%Y-%m-%d_%H-%M-%S}-rank${oc.env:LOCAL_RANK}.log
28
- mode: w
29
- root:
30
- level: INFO
31
- handlers: [console, file]
32
-
33
- disable_existing_loggers: false
config/train_config.yaml DELETED
@@ -1,41 +0,0 @@
1
- defaults:
2
- - base_config
3
- - override data: base
4
- - override hydra/job_logging: custom
5
- - _self_
6
-
7
- hydra:
8
- run:
9
- dir: ./output/${exp_id}
10
- output_subdir: train-${now:%Y-%m-%d_%H-%M-%S}-hydra
11
-
12
- ema:
13
- start: 0
14
-
15
- mini_train: False
16
- example_train: False
17
- enable_grad_scaler: False
18
- vgg_oversample_rate: 5
19
-
20
- log_text_interval: 200
21
- log_extra_interval: 20_000
22
- val_interval: 5_000
23
- eval_interval: 20_000
24
- save_eval_interval: 40_000
25
- save_weights_interval: 10_000
26
- save_checkpoint_interval: 10_000
27
- save_copy_iterations: []
28
-
29
- batch_size: 512
30
- eval_batch_size: 256 # per-GPU
31
-
32
- num_iterations: 300_000
33
- learning_rate: 1.0e-4
34
- linear_warmup_steps: 1_000
35
-
36
- lr_schedule: step
37
- lr_schedule_steps: [240_000, 270_000]
38
- lr_schedule_gamma: 0.1
39
-
40
- clip_grad_norm: 1.0
41
- weight_decay: 1.0e-6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
demo.py CHANGED
@@ -62,13 +62,7 @@ def main():
     skip_video_composite: bool = args.skip_video_composite
     mask_away_clip: bool = args.mask_away_clip
 
-    device = 'cpu'
-    if torch.cuda.is_available():
-        device = 'cuda'
-    elif torch.backends.mps.is_available():
-        device = 'mps'
-    else:
-        log.warning('CUDA/MPS are not available, running on CPU')
+    device = 'cuda'
     dtype = torch.float32 if args.full_precision else torch.bfloat16
 
     output_dir.mkdir(parents=True, exist_ok=True)
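With `device = 'cuda'` hard-coded, demo.py now assumes a CUDA-capable GPU. A guarded device selection in the style of the removed lines (a sketch for reference, not part of this commit) would be:

```python
import logging

import torch

log = logging.getLogger()

# Fall back gracefully when CUDA is unavailable (mirrors the removed block).
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    log.warning('CUDA/MPS are not available, running on CPU')
```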
docs/EVAL.md DELETED
@@ -1,22 +0,0 @@
- # Evaluation
-
- ## Batch Evaluation
-
- To evaluate the model on a dataset, use the `batch_eval.py` script. It is significantly more efficient in large-scale evaluation compared to `demo.py`, supporting batched inference, multi-GPU inference, torch compilation, and skipping video compositions.
-
- An example of running this script with four GPUs is as follows:
-
- ```bash
- OMP_NUM_THREADS=4 torchrun --standalone --nproc_per_node=4 batch_eval.py duration_s=8 dataset=vggsound model=small_16k num_workers=8
- ```
-
- You may need to update the data paths in `config/eval_data/base.yaml`.
- More configuration options can be found in `config/base_config.yaml` and `config/eval_config.yaml`.
-
- ## Precomputed Results
-
- Precomputed results for VGGSound, AudioCaps, and MovieGen are available here: https://huggingface.co/datasets/hkchengrex/MMAudio-precomputed-results
-
- ## Obtaining Quantitative Metrics
-
- Our evaluation code is available here: https://github.com/hkchengrex/av-benchmark
docs/MODELS.md DELETED
@@ -1,50 +0,0 @@
1
- # Pretrained models
2
-
3
- The models will be downloaded automatically when you run the demo script. MD5 checksums are provided in `mmaudio/utils/download_utils.py`.
4
- The models are also available at https://huggingface.co/hkchengrex/MMAudio/tree/main
5
-
6
- | Model | Download link | File size |
7
- | -------- | ------- | ------- |
8
- | Flow prediction network, small 16kHz | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_small_16k.pth" download="mmaudio_small_16k.pth">mmaudio_small_16k.pth</a> | 601M |
9
- | Flow prediction network, small 44.1kHz | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_small_44k.pth" download="mmaudio_small_44k.pth">mmaudio_small_44k.pth</a> | 601M |
10
- | Flow prediction network, medium 44.1kHz | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_medium_44k.pth" download="mmaudio_medium_44k.pth">mmaudio_medium_44k.pth</a> | 2.4G |
11
- | Flow prediction network, large 44.1kHz | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_large_44k.pth" download="mmaudio_large_44k.pth">mmaudio_large_44k.pth</a> | 3.9G |
12
- | Flow prediction network, large 44.1kHz, v2 **(recommended)** | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_large_44k_v2.pth" download="mmaudio_large_44k_v2.pth">mmaudio_large_44k_v2.pth</a> | 3.9G |
13
- | 16kHz VAE | <a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/v1-16.pth">v1-16.pth</a> | 655M |
14
- | 16kHz BigVGAN vocoder (from Make-An-Audio 2) |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/best_netG.pt">best_netG.pt</a> | 429M |
15
- | 44.1kHz VAE |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/v1-44.pth">v1-44.pth</a> | 1.2G |
16
- | Synchformer visual encoder |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/synchformer_state_dict.pth">synchformer_state_dict.pth</a> | 907M |
17
-
18
- To run the model, you need four components: a flow prediction network, visual feature extractors (Synchformer and CLIP, CLIP will be downloaded automatically), a VAE, and a vocoder. VAEs and vocoders are specific to the sampling rate (16kHz or 44.1kHz) and not model sizes.
19
- The 44.1kHz vocoder will be downloaded automatically.
20
- The `_v2` model performs worse in benchmarking (e.g., in Fréchet distance), but, in my experience, generalizes better to new data.
21
-
22
- The expected directory structure (full):
23
-
24
- ```bash
25
- MMAudio
26
- ├── ext_weights
27
- │ ├── best_netG.pt
28
- │ ├── synchformer_state_dict.pth
29
- │ ├── v1-16.pth
30
- │ └── v1-44.pth
31
- ├── weights
32
- │ ├── mmaudio_small_16k.pth
33
- │ ├── mmaudio_small_44k.pth
34
- │ ├── mmaudio_medium_44k.pth
35
- │ ├── mmaudio_large_44k.pth
36
- │ └── mmaudio_large_44k_v2.pth
37
- └── ...
38
- ```
39
-
40
- The expected directory structure (minimal, for the recommended model only):
41
-
42
- ```bash
43
- MMAudio
44
- ├── ext_weights
45
- │ ├── synchformer_state_dict.pth
46
- │ └── v1-44.pth
47
- ├── weights
48
- │ └── mmaudio_large_44k_v2.pth
49
- └── ...
50
- ```
docs/TRAINING.md DELETED
@@ -1,160 +0,0 @@
1
- # Training
2
-
3
- ## Overview
4
-
5
- We have put a large emphasis on making training as fast as possible.
6
- Consequently, some pre-processing steps are required.
7
-
8
- Namely, before starting any training, we
9
-
10
- 1. Obtain training data as videos, audios, and captions.
11
- 2. Encode training audios into spectrograms and then with VAE into mean/std
12
- 3. Extract CLIP and synchronization features from videos
13
- 4. Extract CLIP features from text (captions)
14
- 5. Encode all extracted features into [MemoryMappedTensors](https://pytorch.org/tensordict/main/reference/generated/tensordict.MemoryMappedTensor.html) with [TensorDict](https://pytorch.org/tensordict/main/reference/tensordict.html)
15
-
16
- **NOTE:** for maximum training speed (e.g., when training the base model with 2*H100s), you would need around 3~5 GB/s of random read speed. Spinning disks would not be able to catch up and most consumer-grade SSDs would struggle. In my experience, the best bet is to have a large enough system memory such that the OS can cache the data. This way, the data is read from RAM instead of disk.
17
-
18
- The current training script does not support `_v2` training.
19
-
20
- ## Recommended Hardware Configuration
21
-
22
- These are what I recommend for a smooth and efficient training experience. These are not minimum requirements.
23
-
24
- - Single-node machine. We did not implement multi-node training
25
- - GPUs: for the small model, two 80G-H100s or above; for the large model, eight 80G-H100s or above
26
- - System memory: for 16kHz training, 600GB+; for 44kHz training, 700GB+
27
- - Storage: >2TB of fast NVMe storage. If you have enough system memory, OS caching will help and the storage does not need to be as fast.
28
-
29
- ## Prerequisites
30
-
31
- 1. Install [av-benchmark](https://github.com/hkchengrex/av-benchmark). We use this library to automatically evaluate on the validation set during training, and on the test set after training.
32
- 2. Extract features for evaluation using [av-benchmark](https://github.com/hkchengrex/av-benchmark) for the validation and test set as a [validation cache](https://github.com/hkchengrex/MMAudio/blob/34bf089fdd2e457cd5ef33be96c0e1c8a0412476/config/data/base.yaml#L38) and a [test cache](https://github.com/hkchengrex/MMAudio/blob/34bf089fdd2e457cd5ef33be96c0e1c8a0412476/config/data/base.yaml#L31). You can also download the precomputed evaluation cache [here](https://huggingface.co/datasets/hkchengrex/MMAudio-precomputed-results/tree/main).
33
-
34
- 3. You will need ffmpeg to extract frames from videos. Note that `torchaudio` imposes a maximum version limit (`ffmpeg<7`). You can install it as follows:
35
-
36
- ```bash
37
- conda install -c conda-forge 'ffmpeg<7'
38
- ```
39
-
40
- 4. Download the training datasets. We used [VGGSound](https://arxiv.org/abs/2004.14368), [AudioCaps](https://audiocaps.github.io/), and [WavCaps](https://arxiv.org/abs/2303.17395). Note that the audio files in the huggingface release of WavCaps have been downsampled to 32kHz. To the best of our ability, we located the original (high-sampling rate) audio files and used them instead to prevent artifacts during 44.1kHz training. We did not use the "SoundBible" portion of WavCaps, since it is a small set with many short audio unsuitable for our training.
41
-
42
- 5. Download the corresponding VAE (`v1-16.pth` for 16kHz training, and `v1-44.pth` for 44.1kHz training), vocoder models (`best_netG.pt` for 16kHz training; the vocoder for 44.1kHz training will be downloaded automatically), the [empty string encoding](https://github.com/hkchengrex/MMAudio/releases/download/v0.1/empty_string.pth), and Synchformer weights from [MODELS.md](https://github.com/hkchengrex/MMAudio/blob/main/docs/MODELS.md) place them in `ext_weights/`.
43
-
44
- ## Preparing Audio-Video-Text Features
45
-
46
- We have prepared some example data in `training/example_videos`.
47
- `training/extract_video_training_latents.py` extracts audio, video, and text features and save them as a `TensorDict` with a `.tsv` file containing metadata to `output_dir`.
48
-
49
- To run this script, use the `torchrun` utility:
50
-
51
- ```bash
52
- torchrun --standalone training/extract_video_training_latents.py
53
- ```
54
-
55
- You can run this script with multiple GPUs (with `--nproc_per_node=<n>` after `--standalone` and before the script name) to speed up extraction.
56
- Modify the definitions near the top of the script to switch between 16kHz/44.1kHz extraction.
57
- Change the data path definitions in `data_cfg` if necessary.
58
-
59
- Arguments:
60
-
61
- - `latent_dir` -- where intermediate latent outputs are saved. It is safe to delete this directory afterwards.
62
- - `output_dir` -- where TensorDict and the metadata file are saved.
63
-
64
- Outputs produced in `output_dir`:
65
-
66
- 1. A directory named `vgg-{split}` (i.e., in the TensorDict format), containing
67
- a. `mean.memmap` mean values predicted by the VAE encoder (number of videos X sequence length X channel size)
68
- b. `std.memmap` standard deviation values predicted by the VAE encoder (number of videos X sequence length X channel size)
69
- c. `text_features.memmap` text features extracted from CLIP (number of videos X 77 (sequence length) X 1024)
70
- d. `clip_features.memmap` clip features extracted from CLIP (number of videos X 64 (8 fps) X 1024)
71
- e. `sync_features.memmap` synchronization features extracted from Synchformer (number of videos X 192 (24 fps) X 768)
72
- f. `meta.json` that contains the metadata for the above memory mappings
73
- 2. A tab-separated values file named `vgg-{split}.tsv` that contains two columns: `id` containing video file names without extension, and `label` containing corresponding text labels (i.e., captions)
74
-
75
- ## Preparing Audio-Text Features
76
-
77
- We have prepared some example data in `training/example_audios`.
78
-
79
- 1. Run `training/partition_clips` to partition each audio file into clips (by finding start and end points; we do not save the partitioned audio onto the disk to save disk space)
80
- 2. Run `training/extract_audio_training_latents.py` to extract each clip's audio and text features and save them as a `TensorDict` with a `.tsv` file containing metadata to `output_dir`.
81
-
82
- ### Partitioning the audio files
83
-
84
- Run
85
-
86
- ```bash
87
- python training/partition_clips.py
88
- ```
89
-
90
- Arguments:
91
-
92
- - `data_dir` -- path to a directory containing the audio files (`.flac` or `.wav`)
93
- - `output_dir` -- path to the output `.csv` file
94
- - `start` -- optional; useful when you need to run multiple processes to speed up processing -- this defines the beginning of the chunk to be processed
95
- - `end` -- optional; useful when you need to run multiple processes to speed up processing -- this defines the end of the chunk to be processed
96
-
97
- ### Extracting audio and text features
98
-
99
- Run
100
-
101
- ```bash
102
- torchrun --standalone training/extract_audio_training_latents.py
103
- ```
104
-
105
- You can run this with multiple GPUs (with `--nproc_per_node=<n>`) to speed up extraction.
106
- Modify the definitions near the top of the script to switch between 16kHz/44.1kHz extraction.
107
-
108
- Arguments:
109
-
110
- - `data_dir` -- path to a directory containing the audio files (`.flac` or `.wav`), same as the previous step
111
- - `captions_tsv` -- path to the captions file, a tab-separated values (tsv) file at least with columns `id` and `caption`
112
- - `clips_tsv` -- path to the clips file, generated in the last step
113
- - `latent_dir` -- where intermediate latent outputs are saved. It is safe to delete this directory afterwards.
114
- - `output_dir` -- where TensorDict and the metadata file are saved.
115
-
116
- **Reference tsv files (with overlaps removed as mentioned in the paper) can be found [here](https://github.com/hkchengrex/MMAudio/releases/tag/v0.1).**
117
- Note that these reference tsv files are the **outputs** of `extract_audio_training_latents.py`, which means the `id` column might contain duplicate entries (one per clip). You can still use it as the `captions_tsv` input though -- the script will handle duplicates gracefully.
118
- Among these reference tsv files, `audioset_sl.tsv`, `bbcsound.tsv`, and `freesound.tsv` are subsets that are parts of WavCaps. These subsets might be smaller than the original datasets.
119
- The Clotho data contains both the development set and the validation set.
120
-
121
- Outputs produced in `output_dir`:
122
-
123
- 1. A directory named `{basename(output_dir)}` (i.e., in the TensorDict format), containing
124
- a. `mean.memmap` mean values predicted by the VAE encoder (number of audios X sequence length X channel size)
125
- b. `std.memmap` standard deviation values predicted by the VAE encoder (number of audios X sequence length X channel size)
126
- c. `text_features.memmap` text features extracted from CLIP (number of audios X 77 (sequence length) X 1024)
127
- f. `meta.json` that contains the metadata for the above memory mappings
128
- 2. A tab-separated values file named `{basename(output_dir)}.tsv` that contains two columns: `id` containing audio file names without extension, and `label` containing corresponding text labels (i.e., captions)
129
-
130
- ## Training on Extracted Features
131
-
132
- We use Distributed Data Parallel (DDP) for training.
133
- First, specify the data path in `config/data/base.yaml`. If you used the default parameters in the scripts above to extract features for the example data, the `Example_video` and `Example_audio` items should already be correct.
134
-
135
- To run training on the example data, use the following command:
136
-
137
- ```bash
138
- OMP_NUM_THREADS=4 torchrun --standalone --nproc_per_node=1 train.py exp_id=debug compile=False debug=True example_train=True batch_size=1
139
- ```
140
-
141
- This will not train a useful model, but it will check if everything is set up correctly.
142
-
143
- For full training on the base model with two GPUs, use the following command:
144
-
145
- ```bash
146
- OMP_NUM_THREADS=4 torchrun --standalone --nproc_per_node=2 train.py exp_id=exp_1 model=small_16k
147
- ```
148
-
149
- Any outputs from training will be stored in `output/<exp_id>`.
150
-
151
- More configuration options can be found in `config/base_config.yaml` and `config/train_config.yaml`.
152
- For the medium and large models, specify `vgg_oversample_rate` to be `3` to reduce overfitting.
153
-
154
- ## Checkpoints
155
-
156
- Model checkpoints, including optimizer states and the latest EMA weights, are available here: https://huggingface.co/hkchengrex/MMAudio
157
-
158
- ---
159
-
160
- Godspeed!
docs/index.html CHANGED
@@ -40,7 +40,7 @@
       <br>
       <div class="row text-center" style="font-size:28px">
         <div class="col">
-          CVPR 2025
+          arXiv 2024
         </div>
       </div>
       <br>
@@ -83,21 +83,19 @@
       <br>
 
       <div class="h-100 row text-center justify-content-md-center" style="font-size:20px;">
-        <div class="col-sm-2">
-          <a href="https://arxiv.org/abs/2412.15322">[Paper]</a>
-        </div>
-        <div class="col-sm-2">
-          <a href="https://github.com/hkchengrex/MMAudio">[Code]</a>
-        </div>
+        <!-- <div class="col-sm-2">
+          <a href="https://arxiv.org/abs/2310.12982">[arXiv]</a>
+        </div> -->
         <div class="col-sm-3">
-          <a href="https://huggingface.co/spaces/hkchengrex/MMAudio">[Huggingface Demo]</a>
-        </div>
-        <div class="col-sm-2">
-          <a href="https://colab.research.google.com/drive/1TAaXCY2-kPk4xE4PwKB3EqFbSnkUuzZ8?usp=sharing">[Colab Demo]</a>
+          <a href="">[Paper (being prepared)]</a>
         </div>
         <div class="col-sm-3">
-          <a href="https://replicate.com/zsxkib/mmaudio">[Replicate Demo]</a>
+          <a href="https://github.com/hkchengrex/MMAudio">[Code]</a>
         </div>
+        <!-- <div class="col-sm-2">
+          <a
+            href="https://colab.research.google.com/drive/1yo43XTbjxuWA7XgCUO9qxAi7wBI6HzvP?usp=sharing">[Colab]</a>
+        </div> -->
       </div>
 
       <br>
gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
mmaudio/data/av_utils.py CHANGED
@@ -25,32 +25,6 @@ class VideoInfo:
     def width(self):
         return self.all_frames[0].shape[1]
 
-    @classmethod
-    def from_image_info(cls, image_info: 'ImageInfo', duration_sec: float,
-                        fps: Fraction) -> 'VideoInfo':
-        num_frames = int(duration_sec * fps)
-        all_frames = [image_info.original_frame] * num_frames
-        return cls(duration_sec=duration_sec,
-                   fps=fps,
-                   clip_frames=image_info.clip_frames,
-                   sync_frames=image_info.sync_frames,
-                   all_frames=all_frames)
-
-
-@dataclass
-class ImageInfo:
-    clip_frames: torch.Tensor
-    sync_frames: torch.Tensor
-    original_frame: Optional[np.ndarray]
-
-    @property
-    def height(self):
-        return self.original_frame.shape[0]
-
-    @property
-    def width(self):
-        return self.original_frame.shape[1]
-
 
 def read_frames(video_path: Path, list_of_fps: list[float], start_sec: float, end_sec: float,
                 need_all_frames: bool) -> tuple[list[np.ndarray], list[np.ndarray], Fraction]:
mmaudio/data/data_setup.py DELETED
@@ -1,174 +0,0 @@
1
- import logging
2
- import random
3
-
4
- import numpy as np
5
- import torch
6
- from omegaconf import DictConfig
7
- from torch.utils.data import DataLoader, Dataset
8
- from torch.utils.data.dataloader import default_collate
9
- from torch.utils.data.distributed import DistributedSampler
10
-
11
- from mmaudio.data.eval.audiocaps import AudioCapsData
12
- from mmaudio.data.eval.video_dataset import MovieGen, VGGSound
13
- from mmaudio.data.extracted_audio import ExtractedAudio
14
- from mmaudio.data.extracted_vgg import ExtractedVGG
15
- from mmaudio.data.mm_dataset import MultiModalDataset
16
- from mmaudio.utils.dist_utils import local_rank
17
-
18
- log = logging.getLogger()
19
-
20
-
21
- # Re-seed randomness every time we start a worker
22
- def worker_init_fn(worker_id: int):
23
- worker_seed = torch.initial_seed() % (2**31) + worker_id + local_rank * 1000
24
- np.random.seed(worker_seed)
25
- random.seed(worker_seed)
26
- log.debug(f'Worker {worker_id} re-seeded with seed {worker_seed} in rank {local_rank}')
27
-
28
-
29
- def load_vgg_data(cfg: DictConfig, data_cfg: DictConfig) -> Dataset:
30
- dataset = ExtractedVGG(tsv_path=data_cfg.tsv,
31
- data_dim=cfg.data_dim,
32
- premade_mmap_dir=data_cfg.memmap_dir)
33
-
34
- return dataset
35
-
36
-
37
- def load_audio_data(cfg: DictConfig, data_cfg: DictConfig) -> Dataset:
38
- dataset = ExtractedAudio(tsv_path=data_cfg.tsv,
39
- data_dim=cfg.data_dim,
40
- premade_mmap_dir=data_cfg.memmap_dir)
41
-
42
- return dataset
43
-
44
-
45
- def setup_training_datasets(cfg: DictConfig) -> tuple[Dataset, DistributedSampler, DataLoader]:
46
- if cfg.mini_train:
47
- vgg = load_vgg_data(cfg, cfg.data.ExtractedVGG_val)
48
- audiocaps = load_audio_data(cfg, cfg.data.AudioCaps)
49
- dataset = MultiModalDataset([vgg], [audiocaps])
50
- if cfg.example_train:
51
- video = load_vgg_data(cfg, cfg.data.Example_video)
52
- audio = load_audio_data(cfg, cfg.data.Example_audio)
53
- dataset = MultiModalDataset([video], [audio])
54
- else:
55
- # load the largest one first
56
- freesound = load_audio_data(cfg, cfg.data.FreeSound)
57
- vgg = load_vgg_data(cfg, cfg.data.ExtractedVGG)
58
- audiocaps = load_audio_data(cfg, cfg.data.AudioCaps)
59
- audioset_sl = load_audio_data(cfg, cfg.data.AudioSetSL)
60
- bbcsound = load_audio_data(cfg, cfg.data.BBCSound)
61
- clotho = load_audio_data(cfg, cfg.data.Clotho)
62
- dataset = MultiModalDataset([vgg] * cfg.vgg_oversample_rate,
63
- [audiocaps, audioset_sl, bbcsound, freesound, clotho])
64
-
65
- batch_size = cfg.batch_size
66
- num_workers = cfg.num_workers
67
- pin_memory = cfg.pin_memory
68
- sampler, loader = construct_loader(dataset,
69
- batch_size,
70
- num_workers,
71
- shuffle=True,
72
- drop_last=True,
73
- pin_memory=pin_memory)
74
-
75
- return dataset, sampler, loader
76
-
77
-
78
- def setup_test_datasets(cfg):
79
- dataset = load_vgg_data(cfg, cfg.data.ExtractedVGG_test)
80
-
81
- batch_size = cfg.batch_size
82
- num_workers = cfg.num_workers
83
- pin_memory = cfg.pin_memory
84
- sampler, loader = construct_loader(dataset,
85
- batch_size,
86
- num_workers,
87
- shuffle=False,
88
- drop_last=False,
89
- pin_memory=pin_memory)
90
-
91
- return dataset, sampler, loader
92
-
93
-
94
- def setup_val_datasets(cfg: DictConfig) -> tuple[Dataset, DataLoader, DataLoader]:
95
- if cfg.example_train:
96
- dataset = load_vgg_data(cfg, cfg.data.Example_video)
97
- else:
98
- dataset = load_vgg_data(cfg, cfg.data.ExtractedVGG_val)
99
-
100
- val_batch_size = cfg.batch_size
101
- val_eval_batch_size = cfg.eval_batch_size
102
- num_workers = cfg.num_workers
103
- pin_memory = cfg.pin_memory
104
- _, val_loader = construct_loader(dataset,
105
- val_batch_size,
106
- num_workers,
107
- shuffle=False,
108
- drop_last=False,
109
- pin_memory=pin_memory)
110
- _, eval_loader = construct_loader(dataset,
111
- val_eval_batch_size,
112
- num_workers,
113
- shuffle=False,
114
- drop_last=False,
115
- pin_memory=pin_memory)
116
-
117
- return dataset, val_loader, eval_loader
118
-
119
-
120
- def setup_eval_dataset(dataset_name: str, cfg: DictConfig) -> tuple[Dataset, DataLoader]:
121
- if dataset_name.startswith('audiocaps_full'):
122
- dataset = AudioCapsData(cfg.eval_data.AudioCaps_full.audio_path,
123
- cfg.eval_data.AudioCaps_full.csv_path)
124
- elif dataset_name.startswith('audiocaps'):
125
- dataset = AudioCapsData(cfg.eval_data.AudioCaps.audio_path,
126
- cfg.eval_data.AudioCaps.csv_path)
127
- elif dataset_name.startswith('moviegen'):
128
- dataset = MovieGen(cfg.eval_data.MovieGen.video_path,
129
- cfg.eval_data.MovieGen.jsonl_path,
130
- duration_sec=cfg.duration_s)
131
- elif dataset_name.startswith('vggsound'):
132
- dataset = VGGSound(cfg.eval_data.VGGSound.video_path,
133
- cfg.eval_data.VGGSound.csv_path,
134
- duration_sec=cfg.duration_s)
135
- else:
136
- raise ValueError(f'Invalid dataset name: {dataset_name}')
137
-
138
- batch_size = cfg.batch_size
139
- num_workers = cfg.num_workers
140
- pin_memory = cfg.pin_memory
141
- _, loader = construct_loader(dataset,
142
- batch_size,
143
- num_workers,
144
- shuffle=False,
145
- drop_last=False,
146
- pin_memory=pin_memory,
147
- error_avoidance=True)
148
- return dataset, loader
149
-
150
-
151
- def error_avoidance_collate(batch):
152
- batch = list(filter(lambda x: x is not None, batch))
153
- return default_collate(batch)
154
-
155
-
156
- def construct_loader(dataset: Dataset,
157
- batch_size: int,
158
- num_workers: int,
159
- *,
160
- shuffle: bool = True,
161
- drop_last: bool = True,
162
- pin_memory: bool = False,
163
- error_avoidance: bool = False) -> tuple[DistributedSampler, DataLoader]:
164
- train_sampler = DistributedSampler(dataset, rank=local_rank, shuffle=shuffle)
165
- train_loader = DataLoader(dataset,
166
- batch_size,
167
- sampler=train_sampler,
168
- num_workers=num_workers,
169
- worker_init_fn=worker_init_fn,
170
- drop_last=drop_last,
171
- persistent_workers=num_workers > 0,
172
- pin_memory=pin_memory,
173
- collate_fn=error_avoidance_collate if error_avoidance else None)
174
- return train_sampler, train_loader
mmaudio/data/eval/__init__.py DELETED
File without changes
mmaudio/data/eval/audiocaps.py DELETED
@@ -1,39 +0,0 @@
1
- import logging
2
- import os
3
- from collections import defaultdict
4
- from pathlib import Path
5
- from typing import Union
6
-
7
- import pandas as pd
8
- import torch
9
- from torch.utils.data.dataset import Dataset
10
-
11
- log = logging.getLogger()
12
-
13
-
14
- class AudioCapsData(Dataset):
15
-
16
- def __init__(self, audio_path: Union[str, Path], csv_path: Union[str, Path]):
17
- df = pd.read_csv(csv_path).to_dict(orient='records')
18
-
19
- audio_files = sorted(os.listdir(audio_path))
20
- audio_files = set(
21
- [Path(f).stem for f in audio_files if f.endswith('.wav') or f.endswith('.flac')])
22
-
23
- self.data = []
24
- for row in df:
25
- self.data.append({
26
- 'name': row['name'],
27
- 'caption': row['caption'],
28
- })
29
-
30
- self.audio_path = Path(audio_path)
31
- self.csv_path = Path(csv_path)
32
-
33
- log.info(f'Found {len(self.data)} matching audio files in {self.audio_path}')
34
-
35
- def __getitem__(self, idx: int) -> torch.Tensor:
36
- return self.data[idx]
37
-
38
- def __len__(self):
39
- return len(self.data)
mmaudio/data/eval/moviegen.py DELETED
@@ -1,131 +0,0 @@
1
- import json
2
- import logging
3
- import os
4
- from pathlib import Path
5
- from typing import Union
6
-
7
- import torch
8
- from torch.utils.data.dataset import Dataset
9
- from torchvision.transforms import v2
10
- from torio.io import StreamingMediaDecoder
11
-
12
- from mmaudio.utils.dist_utils import local_rank
13
-
14
- log = logging.getLogger()
15
-
16
- _CLIP_SIZE = 384
17
- _CLIP_FPS = 8.0
18
-
19
- _SYNC_SIZE = 224
20
- _SYNC_FPS = 25.0
21
-
22
-
23
- class MovieGenData(Dataset):
24
-
25
- def __init__(
26
- self,
27
- video_root: Union[str, Path],
28
- sync_root: Union[str, Path],
29
- jsonl_root: Union[str, Path],
30
- *,
31
- duration_sec: float = 10.0,
32
- read_clip: bool = True,
33
- ):
34
- self.video_root = Path(video_root)
35
- self.sync_root = Path(sync_root)
36
- self.jsonl_root = Path(jsonl_root)
37
- self.read_clip = read_clip
38
-
39
- videos = sorted(os.listdir(self.video_root))
40
- videos = [v[:-4] for v in videos] # remove extensions
41
- self.captions = {}
42
-
43
- for v in videos:
44
- with open(self.jsonl_root / (v + '.jsonl')) as f:
45
- data = json.load(f)
46
- self.captions[v] = data['audio_prompt']
47
-
48
- if local_rank == 0:
49
- log.info(f'{len(videos)} videos found in {video_root}')
50
-
51
- self.duration_sec = duration_sec
52
-
53
- self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
54
- self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
55
-
56
- self.clip_augment = v2.Compose([
57
- v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
58
- v2.ToImage(),
59
- v2.ToDtype(torch.float32, scale=True),
60
- ])
61
-
62
- self.sync_augment = v2.Compose([
63
- v2.Resize((_SYNC_SIZE, _SYNC_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
64
- v2.CenterCrop(_SYNC_SIZE),
65
- v2.ToImage(),
66
- v2.ToDtype(torch.float32, scale=True),
67
- v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
68
- ])
69
-
70
- self.videos = videos
71
-
72
- def sample(self, idx: int) -> dict[str, torch.Tensor]:
73
- video_id = self.videos[idx]
74
- caption = self.captions[video_id]
75
-
76
- reader = StreamingMediaDecoder(self.video_root / (video_id + '.mp4'))
77
- reader.add_basic_video_stream(
78
- frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
79
- frame_rate=_CLIP_FPS,
80
- format='rgb24',
81
- )
82
- reader.add_basic_video_stream(
83
- frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
84
- frame_rate=_SYNC_FPS,
85
- format='rgb24',
86
- )
87
-
88
- reader.fill_buffer()
89
- data_chunk = reader.pop_chunks()
90
-
91
- clip_chunk = data_chunk[0]
92
- sync_chunk = data_chunk[1]
93
- if clip_chunk is None:
94
- raise RuntimeError(f'CLIP video returned None {video_id}')
95
- if clip_chunk.shape[0] < self.clip_expected_length:
96
- raise RuntimeError(f'CLIP video too short {video_id}')
97
-
98
- if sync_chunk is None:
99
- raise RuntimeError(f'Sync video returned None {video_id}')
100
- if sync_chunk.shape[0] < self.sync_expected_length:
101
- raise RuntimeError(f'Sync video too short {video_id}')
102
-
103
- # truncate the video
104
- clip_chunk = clip_chunk[:self.clip_expected_length]
105
- if clip_chunk.shape[0] != self.clip_expected_length:
106
- raise RuntimeError(f'CLIP video wrong length {video_id}, '
107
- f'expected {self.clip_expected_length}, '
108
- f'got {clip_chunk.shape[0]}')
109
- clip_chunk = self.clip_augment(clip_chunk)
110
-
111
- sync_chunk = sync_chunk[:self.sync_expected_length]
112
- if sync_chunk.shape[0] != self.sync_expected_length:
113
- raise RuntimeError(f'Sync video wrong length {video_id}, '
114
- f'expected {self.sync_expected_length}, '
115
- f'got {sync_chunk.shape[0]}')
116
- sync_chunk = self.sync_augment(sync_chunk)
117
-
118
- data = {
119
- 'name': video_id,
120
- 'caption': caption,
121
- 'clip_video': clip_chunk,
122
- 'sync_video': sync_chunk,
123
- }
124
-
125
- return data
126
-
127
- def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
128
- return self.sample(idx)
129
-
130
- def __len__(self):
131
- return len(self.captions)
 
mmaudio/data/eval/video_dataset.py DELETED
@@ -1,197 +0,0 @@
1
- import json
2
- import logging
3
- import os
4
- from pathlib import Path
5
- from typing import Union
6
-
7
- import pandas as pd
8
- import torch
9
- from torch.utils.data.dataset import Dataset
10
- from torchvision.transforms import v2
11
- from torio.io import StreamingMediaDecoder
12
-
13
- from mmaudio.utils.dist_utils import local_rank
14
-
15
- log = logging.getLogger()
16
-
17
- _CLIP_SIZE = 384
18
- _CLIP_FPS = 8.0
19
-
20
- _SYNC_SIZE = 224
21
- _SYNC_FPS = 25.0
22
-
23
-
24
- class VideoDataset(Dataset):
25
-
26
- def __init__(
27
- self,
28
- video_root: Union[str, Path],
29
- *,
30
- duration_sec: float = 8.0,
31
- ):
32
- self.video_root = Path(video_root)
33
-
34
- self.duration_sec = duration_sec
35
-
36
- self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
37
- self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
38
-
39
- self.clip_transform = v2.Compose([
40
- v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
41
- v2.ToImage(),
42
- v2.ToDtype(torch.float32, scale=True),
43
- ])
44
-
45
- self.sync_transform = v2.Compose([
46
- v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
47
- v2.CenterCrop(_SYNC_SIZE),
48
- v2.ToImage(),
49
- v2.ToDtype(torch.float32, scale=True),
50
- v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
51
- ])
52
-
53
- # to be implemented by subclasses
54
- self.captions = {}
55
- self.videos = sorted(list(self.captions.keys()))
56
-
57
- def sample(self, idx: int) -> dict[str, torch.Tensor]:
58
- video_id = self.videos[idx]
59
- caption = self.captions[video_id]
60
-
61
- reader = StreamingMediaDecoder(self.video_root / (video_id + '.mp4'))
62
- reader.add_basic_video_stream(
63
- frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
64
- frame_rate=_CLIP_FPS,
65
- format='rgb24',
66
- )
67
- reader.add_basic_video_stream(
68
- frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
69
- frame_rate=_SYNC_FPS,
70
- format='rgb24',
71
- )
72
-
73
- reader.fill_buffer()
74
- data_chunk = reader.pop_chunks()
75
-
76
- clip_chunk = data_chunk[0]
77
- sync_chunk = data_chunk[1]
78
- if clip_chunk is None:
79
- raise RuntimeError(f'CLIP video returned None {video_id}')
80
- if clip_chunk.shape[0] < self.clip_expected_length:
81
- raise RuntimeError(
82
- f'CLIP video too short {video_id}, expected {self.clip_expected_length}, got {clip_chunk.shape[0]}'
83
- )
84
-
85
- if sync_chunk is None:
86
- raise RuntimeError(f'Sync video returned None {video_id}')
87
- if sync_chunk.shape[0] < self.sync_expected_length:
88
- raise RuntimeError(
89
- f'Sync video too short {video_id}, expected {self.sync_expected_length}, got {sync_chunk.shape[0]}'
90
- )
91
-
92
- # truncate the video
93
- clip_chunk = clip_chunk[:self.clip_expected_length]
94
- if clip_chunk.shape[0] != self.clip_expected_length:
95
- raise RuntimeError(f'CLIP video wrong length {video_id}, '
96
- f'expected {self.clip_expected_length}, '
97
- f'got {clip_chunk.shape[0]}')
98
- clip_chunk = self.clip_transform(clip_chunk)
99
-
100
- sync_chunk = sync_chunk[:self.sync_expected_length]
101
- if sync_chunk.shape[0] != self.sync_expected_length:
102
- raise RuntimeError(f'Sync video wrong length {video_id}, '
103
- f'expected {self.sync_expected_length}, '
104
- f'got {sync_chunk.shape[0]}')
105
- sync_chunk = self.sync_transform(sync_chunk)
106
-
107
- data = {
108
- 'name': video_id,
109
- 'caption': caption,
110
- 'clip_video': clip_chunk,
111
- 'sync_video': sync_chunk,
112
- }
113
-
114
- return data
115
-
116
- def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
117
- try:
118
- return self.sample(idx)
119
- except Exception as e:
120
- log.error(f'Error loading video {self.videos[idx]}: {e}')
121
- return None
122
-
123
- def __len__(self):
124
- return len(self.captions)
125
-
126
-
127
- class VGGSound(VideoDataset):
128
-
129
- def __init__(
130
- self,
131
- video_root: Union[str, Path],
132
- csv_path: Union[str, Path],
133
- *,
134
- duration_sec: float = 8.0,
135
- ):
136
- super().__init__(video_root, duration_sec=duration_sec)
137
- self.video_root = Path(video_root)
138
- self.csv_path = Path(csv_path)
139
-
140
- videos = sorted(os.listdir(self.video_root))
141
- if local_rank == 0:
142
- log.info(f'{len(videos)} videos found in {video_root}')
143
- self.captions = {}
144
-
145
- df = pd.read_csv(csv_path, header=None, names=['id', 'sec', 'caption',
146
- 'split']).to_dict(orient='records')
147
-
148
- videos_no_found = []
149
- for row in df:
150
- if row['split'] == 'test':
151
- start_sec = int(row['sec'])
152
- video_id = str(row['id'])
153
- # this is how our videos are named
154
- video_name = f'{video_id}_{start_sec:06d}'
155
- if video_name + '.mp4' not in videos:
156
- videos_no_found.append(video_name)
157
- continue
158
-
159
- self.captions[video_name] = row['caption']
160
-
161
- if local_rank == 0:
162
- log.info(f'{len(videos)} videos found in {video_root}')
163
- log.info(f'{len(self.captions)} useable videos found')
164
- if videos_no_found:
165
- log.info(f'{len(videos_no_found)} found in {csv_path} but not in {video_root}')
166
- log.info(
167
- 'A small amount is expected, as not all videos are still available on YouTube')
168
-
169
- self.videos = sorted(list(self.captions.keys()))
170
-
171
-
172
- class MovieGen(VideoDataset):
173
-
174
- def __init__(
175
- self,
176
- video_root: Union[str, Path],
177
- jsonl_root: Union[str, Path],
178
- *,
179
- duration_sec: float = 10.0,
180
- ):
181
- super().__init__(video_root, duration_sec=duration_sec)
182
- self.video_root = Path(video_root)
183
- self.jsonl_root = Path(jsonl_root)
184
-
185
- videos = sorted(os.listdir(self.video_root))
186
- videos = [v[:-4] for v in videos] # remove extensions
187
- self.captions = {}
188
-
189
- for v in videos:
190
- with open(self.jsonl_root / (v + '.jsonl')) as f:
191
- data = json.load(f)
192
- self.captions[v] = data['audio_prompt']
193
-
194
- if local_rank == 0:
195
- log.info(f'{len(videos)} videos found in {video_root}')
196
-
197
- self.videos = videos
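
MovieGenData and VideoDataset above both decode each clip twice from the same file: once at 8 fps / 384 px for the CLIP branch and once at 25 fps / 224 px for the synchronisation branch. A condensed sketch of that decoding step, assuming torio is available and that 'example.mp4' (a placeholder path) is at least duration_sec long:

    from torio.io import StreamingMediaDecoder

    _CLIP_FPS, _SYNC_FPS = 8.0, 25.0
    duration_sec = 8.0

    reader = StreamingMediaDecoder('example.mp4')  # placeholder path
    reader.add_basic_video_stream(frames_per_chunk=int(_CLIP_FPS * duration_sec),
                                  frame_rate=_CLIP_FPS, format='rgb24')
    reader.add_basic_video_stream(frames_per_chunk=int(_SYNC_FPS * duration_sec),
                                  frame_rate=_SYNC_FPS, format='rgb24')
    reader.fill_buffer()
    clip_chunk, sync_chunk = reader.pop_chunks()

    # frame-major tensors; truncate to the expected frame counts before the transforms
    clip_chunk = clip_chunk[:int(_CLIP_FPS * duration_sec)]
    sync_chunk = sync_chunk[:int(_SYNC_FPS * duration_sec)]
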
 
mmaudio/data/extracted_audio.py DELETED
@@ -1,88 +0,0 @@
1
- import logging
2
- from pathlib import Path
3
- from typing import Union
4
-
5
- import pandas as pd
6
- import torch
7
- from tensordict import TensorDict
8
- from torch.utils.data.dataset import Dataset
9
-
10
- from mmaudio.utils.dist_utils import local_rank
11
-
12
- log = logging.getLogger()
13
-
14
-
15
- class ExtractedAudio(Dataset):
16
-
17
- def __init__(
18
- self,
19
- tsv_path: Union[str, Path],
20
- *,
21
- premade_mmap_dir: Union[str, Path],
22
- data_dim: dict[str, int],
23
- ):
24
- super().__init__()
25
-
26
- self.data_dim = data_dim
27
- self.df_list = pd.read_csv(tsv_path, sep='\t').to_dict('records')
28
- self.ids = [str(d['id']) for d in self.df_list]
29
-
30
- log.info(f'Loading precomputed mmap from {premade_mmap_dir}')
31
- # load precomputed memory mapped tensors
32
- premade_mmap_dir = Path(premade_mmap_dir)
33
- td = TensorDict.load_memmap(premade_mmap_dir)
34
- log.info(f'Loaded precomputed mmap from {premade_mmap_dir}')
35
- self.mean = td['mean']
36
- self.std = td['std']
37
- self.text_features = td['text_features']
38
-
39
- log.info(f'Loaded {len(self)} samples from {premade_mmap_dir}.')
40
- log.info(f'Loaded mean: {self.mean.shape}.')
41
- log.info(f'Loaded std: {self.std.shape}.')
42
- log.info(f'Loaded text features: {self.text_features.shape}.')
43
-
44
- assert self.mean.shape[1] == self.data_dim['latent_seq_len'], \
45
- f'{self.mean.shape[1]} != {self.data_dim["latent_seq_len"]}'
46
- assert self.std.shape[1] == self.data_dim['latent_seq_len'], \
47
- f'{self.std.shape[1]} != {self.data_dim["latent_seq_len"]}'
48
-
49
- assert self.text_features.shape[1] == self.data_dim['text_seq_len'], \
50
- f'{self.text_features.shape[1]} != {self.data_dim["text_seq_len"]}'
51
- assert self.text_features.shape[-1] == self.data_dim['text_dim'], \
52
- f'{self.text_features.shape[-1]} != {self.data_dim["text_dim"]}'
53
-
54
- self.fake_clip_features = torch.zeros(self.data_dim['clip_seq_len'],
55
- self.data_dim['clip_dim'])
56
- self.fake_sync_features = torch.zeros(self.data_dim['sync_seq_len'],
57
- self.data_dim['sync_dim'])
58
- self.video_exist = torch.tensor(0, dtype=torch.bool)
59
- self.text_exist = torch.tensor(1, dtype=torch.bool)
60
-
61
- def compute_latent_stats(self) -> tuple[torch.Tensor, torch.Tensor]:
62
- latents = self.mean
63
- return latents.mean(dim=(0, 1)), latents.std(dim=(0, 1))
64
-
65
- def get_memory_mapped_tensor(self) -> TensorDict:
66
- td = TensorDict({
67
- 'mean': self.mean,
68
- 'std': self.std,
69
- 'text_features': self.text_features,
70
- })
71
- return td
72
-
73
- def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
74
- data = {
75
- 'id': str(self.df_list[idx]['id']),
76
- 'a_mean': self.mean[idx],
77
- 'a_std': self.std[idx],
78
- 'clip_features': self.fake_clip_features,
79
- 'sync_features': self.fake_sync_features,
80
- 'text_features': self.text_features[idx],
81
- 'caption': self.df_list[idx]['caption'],
82
- 'video_exist': self.video_exist,
83
- 'text_exist': self.text_exist,
84
- }
85
- return data
86
-
87
- def __len__(self):
88
- return len(self.ids)
 
mmaudio/data/extracted_vgg.py DELETED
@@ -1,101 +0,0 @@
1
- import logging
2
- from pathlib import Path
3
- from typing import Union
4
-
5
- import pandas as pd
6
- import torch
7
- from tensordict import TensorDict
8
- from torch.utils.data.dataset import Dataset
9
-
10
- from mmaudio.utils.dist_utils import local_rank
11
-
12
- log = logging.getLogger()
13
-
14
-
15
- class ExtractedVGG(Dataset):
16
-
17
- def __init__(
18
- self,
19
- tsv_path: Union[str, Path],
20
- *,
21
- premade_mmap_dir: Union[str, Path],
22
- data_dim: dict[str, int],
23
- ):
24
- super().__init__()
25
-
26
- self.data_dim = data_dim
27
- self.df_list = pd.read_csv(tsv_path, sep='\t').to_dict('records')
28
- self.ids = [d['id'] for d in self.df_list]
29
-
30
- log.info(f'Loading precomputed mmap from {premade_mmap_dir}')
31
- # load precomputed memory mapped tensors
32
- premade_mmap_dir = Path(premade_mmap_dir)
33
- td = TensorDict.load_memmap(premade_mmap_dir)
34
- log.info(f'Loaded precomputed mmap from {premade_mmap_dir}')
35
- self.mean = td['mean']
36
- self.std = td['std']
37
- self.clip_features = td['clip_features']
38
- self.sync_features = td['sync_features']
39
- self.text_features = td['text_features']
40
-
41
- if local_rank == 0:
42
- log.info(f'Loaded {len(self)} samples.')
43
- log.info(f'Loaded mean: {self.mean.shape}.')
44
- log.info(f'Loaded std: {self.std.shape}.')
45
- log.info(f'Loaded clip_features: {self.clip_features.shape}.')
46
- log.info(f'Loaded sync_features: {self.sync_features.shape}.')
47
- log.info(f'Loaded text_features: {self.text_features.shape}.')
48
-
49
- assert self.mean.shape[1] == self.data_dim['latent_seq_len'], \
50
- f'{self.mean.shape[1]} != {self.data_dim["latent_seq_len"]}'
51
- assert self.std.shape[1] == self.data_dim['latent_seq_len'], \
52
- f'{self.std.shape[1]} != {self.data_dim["latent_seq_len"]}'
53
-
54
- assert self.clip_features.shape[1] == self.data_dim['clip_seq_len'], \
55
- f'{self.clip_features.shape[1]} != {self.data_dim["clip_seq_len"]}'
56
- assert self.sync_features.shape[1] == self.data_dim['sync_seq_len'], \
57
- f'{self.sync_features.shape[1]} != {self.data_dim["sync_seq_len"]}'
58
- assert self.text_features.shape[1] == self.data_dim['text_seq_len'], \
59
- f'{self.text_features.shape[1]} != {self.data_dim["text_seq_len"]}'
60
-
61
- assert self.clip_features.shape[-1] == self.data_dim['clip_dim'], \
62
- f'{self.clip_features.shape[-1]} != {self.data_dim["clip_dim"]}'
63
- assert self.sync_features.shape[-1] == self.data_dim['sync_dim'], \
64
- f'{self.sync_features.shape[-1]} != {self.data_dim["sync_dim"]}'
65
- assert self.text_features.shape[-1] == self.data_dim['text_dim'], \
66
- f'{self.text_features.shape[-1]} != {self.data_dim["text_dim"]}'
67
-
68
- self.video_exist = torch.tensor(1, dtype=torch.bool)
69
- self.text_exist = torch.tensor(1, dtype=torch.bool)
70
-
71
- def compute_latent_stats(self) -> tuple[torch.Tensor, torch.Tensor]:
72
- latents = self.mean
73
- return latents.mean(dim=(0, 1)), latents.std(dim=(0, 1))
74
-
75
- def get_memory_mapped_tensor(self) -> TensorDict:
76
- td = TensorDict({
77
- 'mean': self.mean,
78
- 'std': self.std,
79
- 'clip_features': self.clip_features,
80
- 'sync_features': self.sync_features,
81
- 'text_features': self.text_features,
82
- })
83
- return td
84
-
85
- def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
86
- data = {
87
- 'id': self.df_list[idx]['id'],
88
- 'a_mean': self.mean[idx],
89
- 'a_std': self.std[idx],
90
- 'clip_features': self.clip_features[idx],
91
- 'sync_features': self.sync_features[idx],
92
- 'text_features': self.text_features[idx],
93
- 'caption': self.df_list[idx]['label'],
94
- 'video_exist': self.video_exist,
95
- 'text_exist': self.text_exist,
96
- }
97
-
98
- return data
99
-
100
- def __len__(self):
101
- return len(self.ids)
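
ExtractedAudio and ExtractedVGG keep the VAE posterior (mean and std per latent frame) and the pre-extracted conditioning features in one memory-mapped TensorDict; a latent is re-sampled from that posterior at training time rather than stored directly. A minimal sketch of the re-parameterisation, with purely illustrative shapes (batch 2, latent_seq_len 250, 40 channels):

    import torch

    # illustrative shapes: (batch, latent_seq_len, channels)
    a_mean = torch.zeros(2, 250, 40)
    a_std = torch.ones(2, 250, 40)

    rng = torch.Generator().manual_seed(0)
    a_randn = torch.empty_like(a_mean).normal_(generator=rng)
    x1 = a_mean + a_std * a_randn  # one draw from the stored Gaussian posterior

    # per-channel statistics over the whole set, as in compute_latent_stats()
    latent_mean = a_mean.mean(dim=(0, 1))
    latent_std = a_mean.std(dim=(0, 1))
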
 
mmaudio/data/extraction/__init__.py DELETED
File without changes
mmaudio/data/extraction/vgg_sound.py DELETED
@@ -1,193 +0,0 @@
1
- import logging
2
- import os
3
- from pathlib import Path
4
- from typing import Optional, Union
5
-
6
- import pandas as pd
7
- import torch
8
- import torchaudio
9
- from torch.utils.data.dataset import Dataset
10
- from torchvision.transforms import v2
11
- from torio.io import StreamingMediaDecoder
12
-
13
- from mmaudio.utils.dist_utils import local_rank
14
-
15
- log = logging.getLogger()
16
-
17
- _CLIP_SIZE = 384
18
- _CLIP_FPS = 8.0
19
-
20
- _SYNC_SIZE = 224
21
- _SYNC_FPS = 25.0
22
-
23
-
24
- class VGGSound(Dataset):
25
-
26
- def __init__(
27
- self,
28
- root: Union[str, Path],
29
- *,
30
- tsv_path: Union[str, Path] = 'sets/vgg3-train.tsv',
31
- sample_rate: int = 16_000,
32
- duration_sec: float = 8.0,
33
- audio_samples: Optional[int] = None,
34
- normalize_audio: bool = False,
35
- ):
36
- self.root = Path(root)
37
- self.normalize_audio = normalize_audio
38
- if audio_samples is None:
39
- self.audio_samples = int(sample_rate * duration_sec)
40
- else:
41
- self.audio_samples = audio_samples
42
- effective_duration = audio_samples / sample_rate
43
- # make sure the duration is close enough, within 15ms
44
- assert abs(effective_duration - duration_sec) < 0.015, \
45
- f'audio_samples {audio_samples} does not match duration_sec {duration_sec}'
46
-
47
- videos = sorted(os.listdir(self.root))
48
- videos = set([Path(v).stem for v in videos]) # remove extensions
49
- self.labels = {}
50
- self.videos = []
51
- missing_videos = []
52
-
53
- # read the tsv for subset information
54
- df_list = pd.read_csv(tsv_path, sep='\t', dtype={'id': str}).to_dict('records')
55
- for record in df_list:
56
- id = record['id']
57
- label = record['label']
58
- if id in videos:
59
- self.labels[id] = label
60
- self.videos.append(id)
61
- else:
62
- missing_videos.append(id)
63
-
64
- if local_rank == 0:
65
- log.info(f'{len(videos)} videos found in {root}')
66
- log.info(f'{len(self.videos)} videos found in {tsv_path}')
67
- log.info(f'{len(missing_videos)} videos missing in {root}')
68
-
69
- self.sample_rate = sample_rate
70
- self.duration_sec = duration_sec
71
-
72
- self.expected_audio_length = audio_samples
73
- self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
74
- self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
75
-
76
- self.clip_transform = v2.Compose([
77
- v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
78
- v2.ToImage(),
79
- v2.ToDtype(torch.float32, scale=True),
80
- ])
81
-
82
- self.sync_transform = v2.Compose([
83
- v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
84
- v2.CenterCrop(_SYNC_SIZE),
85
- v2.ToImage(),
86
- v2.ToDtype(torch.float32, scale=True),
87
- v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
88
- ])
89
-
90
- self.resampler = {}
91
-
92
- def sample(self, idx: int) -> dict[str, torch.Tensor]:
93
- video_id = self.videos[idx]
94
- label = self.labels[video_id]
95
-
96
- reader = StreamingMediaDecoder(self.root / (video_id + '.mp4'))
97
- reader.add_basic_video_stream(
98
- frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
99
- frame_rate=_CLIP_FPS,
100
- format='rgb24',
101
- )
102
- reader.add_basic_video_stream(
103
- frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
104
- frame_rate=_SYNC_FPS,
105
- format='rgb24',
106
- )
107
- reader.add_basic_audio_stream(frames_per_chunk=2**30, )
108
-
109
- reader.fill_buffer()
110
- data_chunk = reader.pop_chunks()
111
-
112
- clip_chunk = data_chunk[0]
113
- sync_chunk = data_chunk[1]
114
- audio_chunk = data_chunk[2]
115
-
116
- if clip_chunk is None:
117
- raise RuntimeError(f'CLIP video returned None {video_id}')
118
- if clip_chunk.shape[0] < self.clip_expected_length:
119
- raise RuntimeError(
120
- f'CLIP video too short {video_id}, expected {self.clip_expected_length}, got {clip_chunk.shape[0]}'
121
- )
122
-
123
- if sync_chunk is None:
124
- raise RuntimeError(f'Sync video returned None {video_id}')
125
- if sync_chunk.shape[0] < self.sync_expected_length:
126
- raise RuntimeError(
127
- f'Sync video too short {video_id}, expected {self.sync_expected_length}, got {sync_chunk.shape[0]}'
128
- )
129
-
130
- # process audio
131
- sample_rate = int(reader.get_out_stream_info(2).sample_rate)
132
- audio_chunk = audio_chunk.transpose(0, 1)
133
- audio_chunk = audio_chunk.mean(dim=0) # mono
134
- if self.normalize_audio:
135
- abs_max = audio_chunk.abs().max()
136
- audio_chunk = audio_chunk / abs_max * 0.95
137
- if abs_max <= 1e-6:
138
- raise RuntimeError(f'Audio is silent {video_id}')
139
-
140
- # resample
141
- if sample_rate == self.sample_rate:
142
- audio_chunk = audio_chunk
143
- else:
144
- if sample_rate not in self.resampler:
145
- # https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html#kaiser-best
146
- self.resampler[sample_rate] = torchaudio.transforms.Resample(
147
- sample_rate,
148
- self.sample_rate,
149
- lowpass_filter_width=64,
150
- rolloff=0.9475937167399596,
151
- resampling_method='sinc_interp_kaiser',
152
- beta=14.769656459379492,
153
- )
154
- audio_chunk = self.resampler[sample_rate](audio_chunk)
155
-
156
- if audio_chunk.shape[0] < self.expected_audio_length:
157
- raise RuntimeError(f'Audio too short {video_id}')
158
- audio_chunk = audio_chunk[:self.expected_audio_length]
159
-
160
- # truncate the video
161
- clip_chunk = clip_chunk[:self.clip_expected_length]
162
- if clip_chunk.shape[0] != self.clip_expected_length:
163
- raise RuntimeError(f'CLIP video wrong length {video_id}, '
164
- f'expected {self.clip_expected_length}, '
165
- f'got {clip_chunk.shape[0]}')
166
- clip_chunk = self.clip_transform(clip_chunk)
167
-
168
- sync_chunk = sync_chunk[:self.sync_expected_length]
169
- if sync_chunk.shape[0] != self.sync_expected_length:
170
- raise RuntimeError(f'Sync video wrong length {video_id}, '
171
- f'expected {self.sync_expected_length}, '
172
- f'got {sync_chunk.shape[0]}')
173
- sync_chunk = self.sync_transform(sync_chunk)
174
-
175
- data = {
176
- 'id': video_id,
177
- 'caption': label,
178
- 'audio': audio_chunk,
179
- 'clip_video': clip_chunk,
180
- 'sync_video': sync_chunk,
181
- }
182
-
183
- return data
184
-
185
- def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
186
- try:
187
- return self.sample(idx)
188
- except Exception as e:
189
- log.error(f'Error loading video {self.videos[idx]}: {e}')
190
- return None
191
-
192
- def __len__(self):
193
- return len(self.labels)
 
mmaudio/data/extraction/wav_dataset.py DELETED
@@ -1,132 +0,0 @@
1
- import logging
2
- import os
3
- from pathlib import Path
4
- from typing import Union
5
-
6
- import open_clip
7
- import pandas as pd
8
- import torch
9
- import torchaudio
10
- from torch.utils.data.dataset import Dataset
11
-
12
- log = logging.getLogger()
13
-
14
-
15
- class WavTextClipsDataset(Dataset):
16
-
17
- def __init__(
18
- self,
19
- root: Union[str, Path],
20
- *,
21
- captions_tsv: Union[str, Path],
22
- clips_tsv: Union[str, Path],
23
- sample_rate: int,
24
- num_samples: int,
25
- normalize_audio: bool = False,
26
- reject_silent: bool = False,
27
- tokenizer_id: str = 'ViT-H-14-378-quickgelu',
28
- ):
29
- self.root = Path(root)
30
- self.sample_rate = sample_rate
31
- self.num_samples = num_samples
32
- self.normalize_audio = normalize_audio
33
- self.reject_silent = reject_silent
34
- self.tokenizer = open_clip.get_tokenizer(tokenizer_id)
35
-
36
- audios = sorted(os.listdir(self.root))
37
- audios = set([
38
- Path(audio).stem for audio in audios
39
- if audio.endswith('.wav') or audio.endswith('.flac')
40
- ])
41
- self.captions = {}
42
-
43
- # read the caption tsv
44
- df_list = pd.read_csv(captions_tsv, sep='\t', dtype={'id': str}).to_dict('records')
45
- for record in df_list:
46
- id = record['id']
47
- caption = record['caption']
48
- self.captions[id] = caption
49
-
50
- # read the clip tsv
51
- df_list = pd.read_csv(clips_tsv, sep='\t', dtype={
52
- 'id': str,
53
- 'name': str
54
- }).to_dict('records')
55
- self.clips = []
56
- for record in df_list:
57
- record['id'] = record['id']
58
- record['name'] = record['name']
59
- id = record['id']
60
- name = record['name']
61
- if name not in self.captions:
62
- log.warning(f'Audio {name} not found in {captions_tsv}')
63
- continue
64
- record['caption'] = self.captions[name]
65
- self.clips.append(record)
66
-
67
- log.info(f'Found {len(self.clips)} audio files in {self.root}')
68
-
69
- self.resampler = {}
70
-
71
- def __getitem__(self, idx: int) -> torch.Tensor:
72
- try:
73
- clip = self.clips[idx]
74
- audio_name = clip['name']
75
- audio_id = clip['id']
76
- caption = clip['caption']
77
- start_sample = clip['start_sample']
78
- end_sample = clip['end_sample']
79
-
80
- audio_path = self.root / f'{audio_name}.flac'
81
- if not audio_path.exists():
82
- audio_path = self.root / f'{audio_name}.wav'
83
- assert audio_path.exists()
84
-
85
- audio_chunk, sample_rate = torchaudio.load(audio_path)
86
- audio_chunk = audio_chunk.mean(dim=0) # mono
87
- abs_max = audio_chunk.abs().max()
88
- if self.normalize_audio:
89
- audio_chunk = audio_chunk / abs_max * 0.95
90
-
91
- if self.reject_silent and abs_max < 1e-6:
92
- log.warning(f'Rejecting silent audio')
93
- return None
94
-
95
- audio_chunk = audio_chunk[start_sample:end_sample]
96
-
97
- # resample
98
- if sample_rate == self.sample_rate:
99
- audio_chunk = audio_chunk
100
- else:
101
- if sample_rate not in self.resampler:
102
- # https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html#kaiser-best
103
- self.resampler[sample_rate] = torchaudio.transforms.Resample(
104
- sample_rate,
105
- self.sample_rate,
106
- lowpass_filter_width=64,
107
- rolloff=0.9475937167399596,
108
- resampling_method='sinc_interp_kaiser',
109
- beta=14.769656459379492,
110
- )
111
- audio_chunk = self.resampler[sample_rate](audio_chunk)
112
-
113
- if audio_chunk.shape[0] < self.num_samples:
114
- raise ValueError('Audio is too short')
115
- audio_chunk = audio_chunk[:self.num_samples]
116
-
117
- tokens = self.tokenizer([caption])[0]
118
-
119
- output = {
120
- 'waveform': audio_chunk,
121
- 'id': audio_id,
122
- 'caption': caption,
123
- 'tokens': tokens,
124
- }
125
-
126
- return output
127
- except Exception as e:
128
- log.error(f'Error reading {audio_path}: {e}')
129
- return None
130
-
131
- def __len__(self):
132
- return len(self.clips)
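
Both extraction datasets lazily build one torchaudio Resample transform per encountered source rate, using the "kaiser best" parameters referenced in the torchaudio resampling tutorial, instead of recreating the kernel for every clip. A self-contained sketch of that caching pattern for a mono waveform:

    import torch
    import torchaudio

    TARGET_SR = 16_000
    _resamplers: dict[int, torchaudio.transforms.Resample] = {}

    def resample_to_target(waveform: torch.Tensor, sample_rate: int) -> torch.Tensor:
        # reuse one Resample module per source rate
        if sample_rate == TARGET_SR:
            return waveform
        if sample_rate not in _resamplers:
            _resamplers[sample_rate] = torchaudio.transforms.Resample(
                sample_rate,
                TARGET_SR,
                lowpass_filter_width=64,
                rolloff=0.9475937167399596,
                resampling_method='sinc_interp_kaiser',
                beta=14.769656459379492,
            )
        return _resamplers[sample_rate](waveform)

    print(resample_to_target(torch.randn(44_100), 44_100).shape)  # torch.Size([16000])
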
 
mmaudio/data/mm_dataset.py DELETED
@@ -1,45 +0,0 @@
1
- import bisect
2
-
3
- import torch
4
- from torch.utils.data.dataset import Dataset
5
-
6
-
7
- # modified from https://pytorch.org/docs/stable/_modules/torch/utils/data/dataset.html#ConcatDataset
8
- class MultiModalDataset(Dataset):
9
- datasets: list[Dataset]
10
- cumulative_sizes: list[int]
11
-
12
- @staticmethod
13
- def cumsum(sequence):
14
- r, s = [], 0
15
- for e in sequence:
16
- l = len(e)
17
- r.append(l + s)
18
- s += l
19
- return r
20
-
21
- def __init__(self, video_datasets: list[Dataset], audio_datasets: list[Dataset]):
22
- super().__init__()
23
- self.video_datasets = list(video_datasets)
24
- self.audio_datasets = list(audio_datasets)
25
- self.datasets = self.video_datasets + self.audio_datasets
26
-
27
- self.cumulative_sizes = self.cumsum(self.datasets)
28
-
29
- def __len__(self):
30
- return self.cumulative_sizes[-1]
31
-
32
- def __getitem__(self, idx):
33
- if idx < 0:
34
- if -idx > len(self):
35
- raise ValueError("absolute value of index should not exceed dataset length")
36
- idx = len(self) + idx
37
- dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
38
- if dataset_idx == 0:
39
- sample_idx = idx
40
- else:
41
- sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
42
- return self.datasets[dataset_idx][sample_idx]
43
-
44
- def compute_latent_stats(self) -> tuple[torch.Tensor, torch.Tensor]:
45
- return self.video_datasets[0].compute_latent_stats()
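
MultiModalDataset concatenates the video and audio datasets by keeping cumulative sizes and binary-searching them on every lookup, exactly like torch's ConcatDataset. A small worked example of the indexing rule, with two plain Python lists standing in for datasets:

    import bisect

    datasets = [list('abc'), list('defgh')]   # sizes 3 and 5
    cumulative_sizes = [3, 8]                 # running sum of the sizes

    def lookup(idx: int):
        dataset_idx = bisect.bisect_right(cumulative_sizes, idx)
        sample_idx = idx if dataset_idx == 0 else idx - cumulative_sizes[dataset_idx - 1]
        return datasets[dataset_idx][sample_idx]

    assert lookup(2) == 'c'   # last item of the first dataset
    assert lookup(3) == 'd'   # first item of the second dataset
    assert lookup(7) == 'h'   # global index 7 -> local index 4 in the second dataset
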
 
mmaudio/data/utils.py DELETED
@@ -1,148 +0,0 @@
1
- import logging
2
- import os
3
- import random
4
- import tempfile
5
- from pathlib import Path
6
- from typing import Any, Optional, Union
7
-
8
- import torch
9
- import torch.distributed as dist
10
- from tensordict import MemoryMappedTensor
11
- from torch.utils.data import DataLoader
12
- from torch.utils.data.dataset import Dataset
13
- from tqdm import tqdm
14
-
15
- from mmaudio.utils.dist_utils import local_rank, world_size
16
-
17
- scratch_path = Path(os.environ['SLURM_SCRATCH'] if 'SLURM_SCRATCH' in os.environ else '/dev/shm')
18
- shm_path = Path('/dev/shm')
19
-
20
- log = logging.getLogger()
21
-
22
-
23
- def reseed(seed):
24
- random.seed(seed)
25
- torch.manual_seed(seed)
26
-
27
-
28
- def local_scatter_torch(obj: Optional[Any]):
29
- if world_size == 1:
30
- # Just one worker. Do nothing.
31
- return obj
32
-
33
- array = [obj] * world_size
34
- target_array = [None]
35
- if local_rank == 0:
36
- dist.scatter_object_list(target_array, scatter_object_input_list=array, src=0)
37
- else:
38
- dist.scatter_object_list(target_array, scatter_object_input_list=None, src=0)
39
- return target_array[0]
40
-
41
-
42
- class ShardDataset(Dataset):
43
-
44
- def __init__(self, root):
45
- self.root = root
46
- self.shards = sorted(os.listdir(root))
47
-
48
- def __len__(self):
49
- return len(self.shards)
50
-
51
- def __getitem__(self, idx):
52
- return torch.load(os.path.join(self.root, self.shards[idx]), weights_only=True)
53
-
54
-
55
- def get_tmp_dir(in_memory: bool) -> Path:
56
- return shm_path if in_memory else scratch_path
57
-
58
-
59
- def load_shards_and_share(data_path: Union[str, Path], ids: list[int],
60
- in_memory: bool) -> MemoryMappedTensor:
61
- if local_rank == 0:
62
- with tempfile.NamedTemporaryFile(prefix='shared-tensor-', dir=get_tmp_dir(in_memory)) as f:
63
- log.info(f'Loading shards from {data_path} into {f.name}...')
64
- data = load_shards(data_path, ids=ids, tmp_file_path=f.name)
65
- data = share_tensor_to_all(data)
66
- torch.distributed.barrier()
67
- f.close() # why does the context manager not close the file for me?
68
- else:
69
- log.info('Waiting for the data to be shared with me...')
70
- data = share_tensor_to_all(None)
71
- torch.distributed.barrier()
72
-
73
- return data
74
-
75
-
76
- def load_shards(
77
- data_path: Union[str, Path],
78
- ids: list[int],
79
- *,
80
- tmp_file_path: str,
81
- ) -> Union[torch.Tensor, dict[str, torch.Tensor]]:
82
-
83
- id_set = set(ids)
84
- shards = sorted(os.listdir(data_path))
85
- log.info(f'Found {len(shards)} shards in {data_path}.')
86
- first_shard = torch.load(os.path.join(data_path, shards[0]), weights_only=True)
87
-
88
- log.info(f'Rank {local_rank} created file {tmp_file_path}')
89
- first_item = next(iter(first_shard.values()))
90
- log.info(f'First item shape: {first_item.shape}')
91
- mm_tensor = MemoryMappedTensor.empty(shape=(len(ids), *first_item.shape),
92
- dtype=torch.float32,
93
- filename=tmp_file_path,
94
- existsok=True)
95
- total_count = 0
96
- used_index = set()
97
- id_indexing = {i: idx for idx, i in enumerate(ids)}
98
- # faster with no workers; otherwise we need to set_sharing_strategy('file_system')
99
- loader = DataLoader(ShardDataset(data_path), batch_size=1, num_workers=0)
100
- for data in tqdm(loader, desc='Loading shards'):
101
- for i, v in data.items():
102
- if i not in id_set:
103
- continue
104
-
105
- # tensor_index = ids.index(i)
106
- tensor_index = id_indexing[i]
107
- if tensor_index in used_index:
108
- raise ValueError(f'Duplicate id {i} found in {data_path}.')
109
- used_index.add(tensor_index)
110
- mm_tensor[tensor_index] = v
111
- total_count += 1
112
-
113
- assert total_count == len(ids), f'Expected {len(ids)} tensors, got {total_count}.'
114
- log.info(f'Loaded {total_count} tensors from {data_path}.')
115
-
116
- return mm_tensor
117
-
118
-
119
- def share_tensor_to_all(x: Optional[MemoryMappedTensor]) -> MemoryMappedTensor:
120
- """
121
- x: the tensor to be shared; None if local_rank != 0
122
- return: the shared tensor
123
- """
124
-
125
- # there is no need to share your stuff with anyone if you are alone; must be in memory
126
- if world_size == 1:
127
- return x
128
-
129
- if local_rank == 0:
130
- assert x is not None, 'x must not be None if local_rank == 0'
131
- else:
132
- assert x is None, 'x must be None if local_rank != 0'
133
-
134
- if local_rank == 0:
135
- filename = x.filename
136
- meta_information = (filename, x.shape, x.dtype)
137
- else:
138
- meta_information = None
139
-
140
- filename, data_shape, data_type = local_scatter_torch(meta_information)
141
- if local_rank == 0:
142
- data = x
143
- else:
144
- data = MemoryMappedTensor.from_filename(filename=filename,
145
- dtype=data_type,
146
- shape=data_shape)
147
-
148
- return data
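
share_tensor_to_all above never sends the tensor itself across ranks: rank 0 scatters only (filename, shape, dtype), and every other rank re-opens the same memory-mapped file. A single-process sketch of that re-open step with tensordict's MemoryMappedTensor (the temporary file name is just an example):

    import tempfile

    import torch
    from tensordict import MemoryMappedTensor

    # "rank 0": create and fill a file-backed tensor
    tmp = tempfile.NamedTemporaryFile(prefix='shared-tensor-', delete=False)
    mm = MemoryMappedTensor.empty(shape=(4, 8), dtype=torch.float32,
                                  filename=tmp.name, existsok=True)
    mm.copy_(torch.arange(32, dtype=torch.float32).reshape(4, 8))

    # "other ranks": given (filename, shape, dtype), map the same file; no data is copied
    same = MemoryMappedTensor.from_filename(filename=tmp.name, dtype=torch.float32, shape=(4, 8))
    assert torch.equal(mm, same)
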
 
mmaudio/eval_utils.py CHANGED
@@ -3,16 +3,14 @@ import logging
3
  from pathlib import Path
4
  from typing import Optional
5
 
6
- import numpy as np
7
  import torch
8
  from colorlog import ColoredFormatter
9
- from PIL import Image
10
  from torchvision.transforms import v2
11
 
12
- from mmaudio.data.av_utils import ImageInfo, VideoInfo, read_frames, reencode_with_audio
13
  from mmaudio.model.flow_matching import FlowMatching
14
  from mmaudio.model.networks import MMAudio
15
- from mmaudio.model.sequence_config import CONFIG_16K, CONFIG_44K, SequenceConfig
16
  from mmaudio.model.utils.features_utils import FeaturesUtils
17
  from mmaudio.utils.download_utils import download_model_if_needed
18
 
@@ -90,7 +88,6 @@ def generate(
90
  cfg_strength: float,
91
  clip_batch_size_multiplier: int = 40,
92
  sync_batch_size_multiplier: int = 40,
93
- image_input: bool = False,
94
  ) -> torch.Tensor:
95
  device = feature_utils.device
96
  dtype = feature_utils.dtype
@@ -101,12 +98,10 @@ def generate(
101
  clip_features = feature_utils.encode_video_with_clip(clip_video,
102
  batch_size=bs *
103
  clip_batch_size_multiplier)
104
- if image_input:
105
- clip_features = clip_features.expand(-1, net.clip_seq_len, -1)
106
  else:
107
  clip_features = net.get_empty_clip_sequence(bs)
108
 
109
- if sync_video is not None and not image_input:
110
  sync_video = sync_video.to(device, dtype, non_blocking=True)
111
  sync_features = feature_utils.encode_video_with_sync(sync_video,
112
  batch_size=bs *
@@ -144,7 +139,7 @@ def generate(
144
  return audio
145
 
146
 
147
- LOGFORMAT = "[%(log_color)s%(levelname)-8s%(reset)s]: %(log_color)s%(message)s%(reset)s"
148
 
149
 
150
  def setup_eval_logging(log_level: int = logging.INFO):
@@ -158,14 +153,12 @@ def setup_eval_logging(log_level: int = logging.INFO):
158
  log.addHandler(stream)
159
 
160
 
161
- _CLIP_SIZE = 384
162
- _CLIP_FPS = 8.0
163
-
164
- _SYNC_SIZE = 224
165
- _SYNC_FPS = 25.0
166
-
167
-
168
  def load_video(video_path: Path, duration_sec: float, load_all_frames: bool = True) -> VideoInfo:
 
 
 
 
 
169
 
170
  clip_transform = v2.Compose([
171
  v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
@@ -220,36 +213,5 @@ def load_video(video_path: Path, duration_sec: float, load_all_frames: bool = Tr
220
  return video_info
221
 
222
 
223
- def load_image(image_path: Path) -> VideoInfo:
224
- clip_transform = v2.Compose([
225
- v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
226
- v2.ToImage(),
227
- v2.ToDtype(torch.float32, scale=True),
228
- ])
229
-
230
- sync_transform = v2.Compose([
231
- v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
232
- v2.CenterCrop(_SYNC_SIZE),
233
- v2.ToImage(),
234
- v2.ToDtype(torch.float32, scale=True),
235
- v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
236
- ])
237
-
238
- frame = np.array(Image.open(image_path))
239
-
240
- clip_chunk = torch.from_numpy(frame).unsqueeze(0).permute(0, 3, 1, 2)
241
- sync_chunk = torch.from_numpy(frame).unsqueeze(0).permute(0, 3, 1, 2)
242
-
243
- clip_frames = clip_transform(clip_chunk)
244
- sync_frames = sync_transform(sync_chunk)
245
-
246
- video_info = ImageInfo(
247
- clip_frames=clip_frames,
248
- sync_frames=sync_frames,
249
- original_frame=frame,
250
- )
251
- return video_info
252
-
253
-
254
  def make_video(video_info: VideoInfo, output_path: Path, audio: torch.Tensor, sampling_rate: int):
255
  reencode_with_audio(video_info, output_path, audio, sampling_rate)
 
3
  from pathlib import Path
4
  from typing import Optional
5
 
 
6
  import torch
7
  from colorlog import ColoredFormatter
 
8
  from torchvision.transforms import v2
9
 
10
+ from mmaudio.data.av_utils import VideoInfo, read_frames, reencode_with_audio
11
  from mmaudio.model.flow_matching import FlowMatching
12
  from mmaudio.model.networks import MMAudio
13
+ from mmaudio.model.sequence_config import (CONFIG_16K, CONFIG_44K, SequenceConfig)
14
  from mmaudio.model.utils.features_utils import FeaturesUtils
15
  from mmaudio.utils.download_utils import download_model_if_needed
16
 
 
88
  cfg_strength: float,
89
  clip_batch_size_multiplier: int = 40,
90
  sync_batch_size_multiplier: int = 40,
 
91
  ) -> torch.Tensor:
92
  device = feature_utils.device
93
  dtype = feature_utils.dtype
 
98
  clip_features = feature_utils.encode_video_with_clip(clip_video,
99
  batch_size=bs *
100
  clip_batch_size_multiplier)
 
 
101
  else:
102
  clip_features = net.get_empty_clip_sequence(bs)
103
 
104
+ if sync_video is not None:
105
  sync_video = sync_video.to(device, dtype, non_blocking=True)
106
  sync_features = feature_utils.encode_video_with_sync(sync_video,
107
  batch_size=bs *
 
139
  return audio
140
 
141
 
142
+ LOGFORMAT = " %(log_color)s%(levelname)-8s%(reset)s | %(log_color)s%(message)s%(reset)s"
143
 
144
 
145
  def setup_eval_logging(log_level: int = logging.INFO):
 
153
  log.addHandler(stream)
154
 
155
 
 
 
 
 
 
 
 
156
  def load_video(video_path: Path, duration_sec: float, load_all_frames: bool = True) -> VideoInfo:
157
+ _CLIP_SIZE = 384
158
+ _CLIP_FPS = 8.0
159
+
160
+ _SYNC_SIZE = 224
161
+ _SYNC_FPS = 25.0
162
 
163
  clip_transform = v2.Compose([
164
  v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
 
213
  return video_info
214
 
215
 
216
  def make_video(video_info: VideoInfo, output_path: Path, audio: torch.Tensor, sampling_rate: int):
217
  reencode_with_audio(video_info, output_path, audio, sampling_rate)
mmaudio/ext/autoencoder/autoencoder.py CHANGED
@@ -20,7 +20,7 @@ class AutoEncoderModule(nn.Module):
20
  super().__init__()
21
  self.vae: VAE = get_my_vae(mode).eval()
22
  vae_state_dict = torch.load(vae_ckpt_path, weights_only=True, map_location='cpu')
23
- self.vae.load_state_dict(vae_state_dict)
24
  self.vae.remove_weight_norm()
25
 
26
  if mode == '16k':
 
20
  super().__init__()
21
  self.vae: VAE = get_my_vae(mode).eval()
22
  vae_state_dict = torch.load(vae_ckpt_path, weights_only=True, map_location='cpu')
23
+ self.vae.load_state_dict(vae_state_dict, strict=False)
24
  self.vae.remove_weight_norm()
25
 
26
  if mode == '16k':
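
With strict=False the call silently ignores mismatched keys, so a load that drops weights will not raise. Where that tolerance is meant only for a known set of keys, the usual check looks like the hedged sketch below (standard torch API, not code from this repository):

    import torch
    import torch.nn as nn

    def load_loosely(module: nn.Module, ckpt_path: str) -> None:
        state_dict = torch.load(ckpt_path, weights_only=True, map_location='cpu')
        # with strict=False, load_state_dict returns the keys it could not match
        result = module.load_state_dict(state_dict, strict=False)
        if result.missing_keys:
            print(f'missing keys: {result.missing_keys}')
        if result.unexpected_keys:
            print(f'unexpected keys: {result.unexpected_keys}')
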
mmaudio/ext/autoencoder/vae.py CHANGED
@@ -75,9 +75,13 @@ class VAE(nn.Module):
75
  super().__init__()
76
 
77
  if data_dim == 80:
 
 
78
  self.register_buffer('data_mean', torch.tensor(DATA_MEAN_80D, dtype=torch.float32))
79
  self.register_buffer('data_std', torch.tensor(DATA_STD_80D, dtype=torch.float32))
80
  elif data_dim == 128:
 
 
81
  self.register_buffer('data_mean', torch.tensor(DATA_MEAN_128D, dtype=torch.float32))
82
  self.register_buffer('data_std', torch.tensor(DATA_STD_128D, dtype=torch.float32))
83
 
 
75
  super().__init__()
76
 
77
  if data_dim == 80:
78
+ # self.data_mean = torch.tensor(DATA_MEAN_80D, dtype=torch.float32).cuda()
79
+ # self.data_std = torch.tensor(DATA_STD_80D, dtype=torch.float32).cuda()
80
  self.register_buffer('data_mean', torch.tensor(DATA_MEAN_80D, dtype=torch.float32))
81
  self.register_buffer('data_std', torch.tensor(DATA_STD_80D, dtype=torch.float32))
82
  elif data_dim == 128:
83
+ # torch.tensor(DATA_MEAN_128D, dtype=torch.float32).cuda()
84
+ # self.data_std = torch.tensor(DATA_STD_128D, dtype=torch.float32).cuda()
85
  self.register_buffer('data_mean', torch.tensor(DATA_MEAN_128D, dtype=torch.float32))
86
  self.register_buffer('data_std', torch.tensor(DATA_STD_128D, dtype=torch.float32))
87
 
mmaudio/ext/mel_converter.py CHANGED
@@ -1,12 +1,11 @@
1
  # Reference: # https://github.com/bytedance/Make-An-Audio-2
2
- from typing import Literal
3
 
4
  import torch
5
  import torch.nn as nn
6
  from librosa.filters import mel as librosa_mel_fn
7
 
8
 
9
- def dynamic_range_compression_torch(x, C=1, clip_val=1e-5, *, norm_fn):
10
  return norm_fn(torch.clamp(x, min=clip_val) * C)
11
 
12
 
@@ -20,14 +19,14 @@ class MelConverter(nn.Module):
20
  def __init__(
21
  self,
22
  *,
23
- sampling_rate: float,
24
- n_fft: int,
25
- num_mels: int,
26
- hop_size: int,
27
- win_size: int,
28
- fmin: float,
29
- fmax: float,
30
- norm_fn,
31
  ):
32
  super().__init__()
33
  self.sampling_rate = sampling_rate
@@ -81,26 +80,3 @@ class MelConverter(nn.Module):
81
  spec = spectral_normalize_torch(spec, self.norm_fn)
82
 
83
  return spec
84
-
85
-
86
- def get_mel_converter(mode: Literal['16k', '44k']) -> MelConverter:
87
- if mode == '16k':
88
- return MelConverter(sampling_rate=16_000,
89
- n_fft=1024,
90
- num_mels=80,
91
- hop_size=256,
92
- win_size=1024,
93
- fmin=0,
94
- fmax=8_000,
95
- norm_fn=torch.log10)
96
- elif mode == '44k':
97
- return MelConverter(sampling_rate=44_100,
98
- n_fft=2048,
99
- num_mels=128,
100
- hop_size=512,
101
- win_size=2048,
102
- fmin=0,
103
- fmax=44100 / 2,
104
- norm_fn=torch.log)
105
- else:
106
- raise ValueError(f'Unknown mode: {mode}')
 
1
  # Reference: # https://github.com/bytedance/Make-An-Audio-2
 
2
 
3
  import torch
4
  import torch.nn as nn
5
  from librosa.filters import mel as librosa_mel_fn
6
 
7
 
8
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5, norm_fn=torch.log10):
9
  return norm_fn(torch.clamp(x, min=clip_val) * C)
10
 
11
 
 
19
  def __init__(
20
  self,
21
  *,
22
+ sampling_rate: float = 16_000,
23
+ n_fft: int = 1024,
24
+ num_mels: int = 80,
25
+ hop_size: int = 256,
26
+ win_size: int = 1024,
27
+ fmin: float = 0,
28
+ fmax: float = 8_000,
29
+ norm_fn=torch.log10,
30
  ):
31
  super().__init__()
32
  self.sampling_rate = sampling_rate
 
80
  spec = spectral_normalize_torch(spec, self.norm_fn)
81
 
82
  return spec
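
The deleted get_mel_converter helper carried two presets; the new MelConverter defaults reproduce only the 16k preset (16 kHz, 1024-point FFT, 80 mels, hop 256, log10). Callers that need the 44.1 kHz configuration now have to pass the parameters explicitly; a sketch using the values from the deleted 44k branch:

    import torch

    from mmaudio.ext.mel_converter import MelConverter

    mel_16k = MelConverter()  # defaults match the old 16k preset

    # parameters copied from the deleted 44k branch of get_mel_converter
    mel_44k = MelConverter(sampling_rate=44_100,
                           n_fft=2048,
                           num_mels=128,
                           hop_size=512,
                           win_size=2048,
                           fmin=0,
                           fmax=44_100 / 2,
                           norm_fn=torch.log)
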
 
mmaudio/model/embeddings.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
  import torch.nn as nn
3
- import math
4
 
5
  # https://github.com/facebookresearch/DiT
6
 
 
7
  class TimestepEmbedder(nn.Module):
8
  """
9
  Embeds scalar timesteps into vector representations.
 
1
  import torch
2
  import torch.nn as nn
 
3
 
4
  # https://github.com/facebookresearch/DiT
5
 
6
+
7
  class TimestepEmbedder(nn.Module):
8
  """
9
  Embeds scalar timesteps into vector representations.
mmaudio/model/flow_matching.py CHANGED
@@ -1,9 +1,11 @@
1
  import logging
2
- from typing import Callable, Optional
3
 
4
  import torch
5
  from torchdiffeq import odeint
6
 
 
 
7
  log = logging.getLogger()
8
 
9
 
@@ -43,8 +45,12 @@ class FlowMatching:
43
  Cs: list[torch.Tensor],
44
  generator: Optional[torch.Generator] = None
45
  ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
 
46
  x0 = torch.empty_like(x1).normal_(generator=generator)
47
 
 
 
 
48
  xt = self.get_conditional_flow(x0, x1, t)
49
  return x0, x1, xt, Cs
50
 
@@ -68,4 +74,15 @@ class FlowMatching:
68
  dt = next_t - t
69
  x = x + dt * flow
70
 
 
 
 
 
 
 
 
 
 
 
 
71
  return x
 
1
  import logging
2
+ from typing import Callable, Iterable, Optional
3
 
4
  import torch
5
  from torchdiffeq import odeint
6
 
7
+ # from torchcfm.conditional_flow_matching import ExactOptimalTransportConditionalFlowMatcher
8
+
9
  log = logging.getLogger()
10
 
11
 
 
45
  Cs: list[torch.Tensor],
46
  generator: Optional[torch.Generator] = None
47
  ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
48
+ # x0 = torch.randn_like(x1, generator=generator)
49
  x0 = torch.empty_like(x1).normal_(generator=generator)
50
 
51
+ # find mini-batch optimal transport
52
+ # x0, x1, _, Cs = self.fm.ot_sampler.sample_plan_with_labels(x0, x1, None, Cs, replace=True)
53
+
54
  xt = self.get_conditional_flow(x0, x1, t)
55
  return x0, x1, xt, Cs
56
 
 
74
  dt = next_t - t
75
  x = x + dt * flow
76
 
77
+ # return odeint(fn,
78
+ # x0,
79
+ # torch.tensor([t0, t1], device=x0.device, dtype=x0.dtype),
80
+ # method='rk4',
81
+ # options=dict(step_size=(t1 - t0) / self.num_steps))[-1]
82
+ # return odeint(fn,
83
+ # x0,
84
+ # torch.tensor([t0, t1], device=x0.device, dtype=x0.dtype),
85
+ # method='euler',
86
+ # options=dict(step_size=(t1 - t0) / self.num_steps))[-1]
87
+
88
  return x
mmaudio/model/networks.py CHANGED
@@ -468,4 +468,4 @@ if __name__ == '__main__':
468
 
469
  # print the number of parameters in terms of millions
470
  num_params = sum(p.numel() for p in network.parameters()) / 1e6
471
- print(f'Number of parameters: {num_params:.2f}M')
 
468
 
469
  # print the number of parameters in terms of millions
470
  num_params = sum(p.numel() for p in network.parameters()) / 1e6
471
+ print(f'Number of parameters: {num_params:.2f}M')
mmaudio/model/transformer_layers.py CHANGED
@@ -5,6 +5,7 @@ import torch.nn as nn
5
  import torch.nn.functional as F
6
  from einops import rearrange
7
  from einops.layers.torch import Rearrange
 
8
 
9
  from mmaudio.ext.rotary_embeddings import apply_rope
10
  from mmaudio.model.low_level import MLP, ChannelLastConv1d, ConvMLP
 
5
  import torch.nn.functional as F
6
  from einops import rearrange
7
  from einops.layers.torch import Rearrange
8
+ from torch.nn.attention import SDPBackend, sdpa_kernel
9
 
10
  from mmaudio.ext.rotary_embeddings import apply_rope
11
  from mmaudio.model.low_level import MLP, ChannelLastConv1d, ConvMLP
mmaudio/model/utils/features_utils.py CHANGED
@@ -9,7 +9,7 @@ from open_clip import create_model_from_pretrained
9
  from torchvision.transforms import Normalize
10
 
11
  from mmaudio.ext.autoencoder import AutoEncoderModule
12
- from mmaudio.ext.mel_converter import get_mel_converter
13
  from mmaudio.ext.synchformer import Synchformer
14
  from mmaudio.model.utils.distributions import DiagonalGaussianDistribution
15
 
@@ -63,13 +63,13 @@ class FeaturesUtils(nn.Module):
63
  self.tokenizer = None
64
 
65
  if tod_vae_ckpt is not None:
66
- self.mel_converter = get_mel_converter(mode)
67
  self.tod = AutoEncoderModule(vae_ckpt_path=tod_vae_ckpt,
68
  vocoder_ckpt_path=bigvgan_vocoder_ckpt,
69
  mode=mode,
70
  need_vae_encoder=need_vae_encoder)
71
  else:
72
  self.tod = None
 
73
 
74
  def compile(self):
75
  if self.clip_model is not None:
 
9
  from torchvision.transforms import Normalize
10
 
11
  from mmaudio.ext.autoencoder import AutoEncoderModule
12
+ from mmaudio.ext.mel_converter import MelConverter
13
  from mmaudio.ext.synchformer import Synchformer
14
  from mmaudio.model.utils.distributions import DiagonalGaussianDistribution
15
 
 
63
  self.tokenizer = None
64
 
65
  if tod_vae_ckpt is not None:
 
66
  self.tod = AutoEncoderModule(vae_ckpt_path=tod_vae_ckpt,
67
  vocoder_ckpt_path=bigvgan_vocoder_ckpt,
68
  mode=mode,
69
  need_vae_encoder=need_vae_encoder)
70
  else:
71
  self.tod = None
72
+ self.mel_converter = MelConverter()
73
 
74
  def compile(self):
75
  if self.clip_model is not None:
mmaudio/runner.py DELETED
@@ -1,609 +0,0 @@
1
- """
2
- trainer.py - wrapper and utility functions for network training
3
- Compute loss, back-prop, update parameters, logging, etc.
4
- """
5
- import os
6
- from pathlib import Path
7
- from typing import Optional, Union
8
-
9
- import torch
10
- import torch.distributed
11
- import torch.optim as optim
12
- from av_bench.evaluate import evaluate
13
- from av_bench.extract import extract
14
- from nitrous_ema import PostHocEMA
15
- from omegaconf import DictConfig
16
- from torch.nn.parallel import DistributedDataParallel as DDP
17
-
18
- from mmaudio.model.flow_matching import FlowMatching
19
- from mmaudio.model.networks import get_my_mmaudio
20
- from mmaudio.model.sequence_config import CONFIG_16K, CONFIG_44K
21
- from mmaudio.model.utils.features_utils import FeaturesUtils
22
- from mmaudio.model.utils.parameter_groups import get_parameter_groups
23
- from mmaudio.model.utils.sample_utils import log_normal_sample
24
- from mmaudio.utils.dist_utils import (info_if_rank_zero, local_rank, string_if_rank_zero)
25
- from mmaudio.utils.log_integrator import Integrator
26
- from mmaudio.utils.logger import TensorboardLogger
27
- from mmaudio.utils.time_estimator import PartialTimeEstimator, TimeEstimator
28
- from mmaudio.utils.video_joiner import VideoJoiner
29
-
30
-
31
- class Runner:
32
-
33
- def __init__(self,
34
- cfg: DictConfig,
35
- log: TensorboardLogger,
36
- run_path: Union[str, Path],
37
- for_training: bool = True,
38
- latent_mean: Optional[torch.Tensor] = None,
39
- latent_std: Optional[torch.Tensor] = None):
40
- self.exp_id = cfg.exp_id
41
- self.use_amp = cfg.amp
42
- self.enable_grad_scaler = cfg.enable_grad_scaler
43
- self.for_training = for_training
44
- self.cfg = cfg
45
-
46
- if cfg.model.endswith('16k'):
47
- self.seq_cfg = CONFIG_16K
48
- mode = '16k'
49
- elif cfg.model.endswith('44k'):
50
- self.seq_cfg = CONFIG_44K
51
- mode = '44k'
52
- else:
53
- raise ValueError(f'Unknown model: {cfg.model}')
54
-
55
- self.sample_rate = self.seq_cfg.sampling_rate
56
- self.duration_sec = self.seq_cfg.duration
57
-
58
- # setting up the model
59
- empty_string_feat = torch.load('./ext_weights/empty_string.pth', weights_only=True)[0]
60
- self.network = DDP(get_my_mmaudio(cfg.model,
61
- latent_mean=latent_mean,
62
- latent_std=latent_std,
63
- empty_string_feat=empty_string_feat).cuda(),
64
- device_ids=[local_rank],
65
- broadcast_buffers=False)
66
- if cfg.compile:
67
- # NOTE: though train_fn and val_fn are very similar
68
- # (early on they are implemented as a single function)
69
- # keeping them separate and compiling them separately are CRUCIAL for high performance
70
- self.train_fn = torch.compile(self.train_fn)
71
- self.val_fn = torch.compile(self.val_fn)
72
-
73
- self.fm = FlowMatching(cfg.sampling.min_sigma,
74
- inference_mode=cfg.sampling.method,
75
- num_steps=cfg.sampling.num_steps)
76
-
77
- # ema profile
78
- if for_training and cfg.ema.enable and local_rank == 0:
79
- self.ema = PostHocEMA(self.network.module,
80
- sigma_rels=cfg.ema.sigma_rels,
81
- update_every=cfg.ema.update_every,
82
- checkpoint_every_num_steps=cfg.ema.checkpoint_every,
83
- checkpoint_folder=cfg.ema.checkpoint_folder,
84
- step_size_correction=True).cuda()
85
- self.ema_start = cfg.ema.start
86
- else:
87
- self.ema = None
88
-
89
- self.rng = torch.Generator(device='cuda')
90
- self.rng.manual_seed(cfg['seed'] + local_rank)
91
-
92
- # setting up feature extractors and VAEs
93
- if mode == '16k':
94
- self.features = FeaturesUtils(
95
- tod_vae_ckpt=cfg['vae_16k_ckpt'],
96
- bigvgan_vocoder_ckpt=cfg['bigvgan_vocoder_ckpt'],
97
- synchformer_ckpt=cfg['synchformer_ckpt'],
98
- enable_conditions=True,
99
- mode=mode,
100
- need_vae_encoder=False,
101
- )
102
- elif mode == '44k':
103
- self.features = FeaturesUtils(
104
- tod_vae_ckpt=cfg['vae_44k_ckpt'],
105
- synchformer_ckpt=cfg['synchformer_ckpt'],
106
- enable_conditions=True,
107
- mode=mode,
108
- need_vae_encoder=False,
109
- )
110
- self.features = self.features.cuda().eval()
111
-
112
- if cfg.compile:
113
- self.features.compile()
114
-
115
- # hyperparameters
116
- self.log_normal_sampling_mean = cfg.sampling.mean
117
- self.log_normal_sampling_scale = cfg.sampling.scale
118
- self.null_condition_probability = cfg.null_condition_probability
119
- self.cfg_strength = cfg.cfg_strength
120
-
121
- # setting up logging
122
- self.log = log
123
- self.run_path = Path(run_path)
124
- vgg_cfg = cfg.data.VGGSound
125
- if for_training:
126
- self.val_video_joiner = VideoJoiner(vgg_cfg.root, self.run_path / 'val-sampled-videos',
127
- self.sample_rate, self.duration_sec)
128
- else:
129
- self.test_video_joiner = VideoJoiner(vgg_cfg.root,
130
- self.run_path / 'test-sampled-videos',
131
- self.sample_rate, self.duration_sec)
132
- string_if_rank_zero(self.log, 'model_size',
133
- f'{sum([param.nelement() for param in self.network.parameters()])}')
134
- string_if_rank_zero(
135
- self.log, 'number_of_parameters_that_require_gradient: ',
136
- str(
137
- sum([
138
- param.nelement()
139
- for param in filter(lambda p: p.requires_grad, self.network.parameters())
140
- ])))
141
- info_if_rank_zero(self.log, 'torch version: ' + torch.__version__)
142
- self.train_integrator = Integrator(self.log, distributed=True)
143
- self.val_integrator = Integrator(self.log, distributed=True)
144
-
145
- # setting up optimizer and loss
146
- if for_training:
147
- self.enter_train()
148
- parameter_groups = get_parameter_groups(self.network, cfg, print_log=(local_rank == 0))
149
- self.optimizer = optim.AdamW(parameter_groups,
150
- lr=cfg['learning_rate'],
151
- weight_decay=cfg['weight_decay'],
152
- betas=[0.9, 0.95],
153
- eps=1e-6 if self.use_amp else 1e-8,
154
- fused=True)
155
- if self.enable_grad_scaler:
156
- self.scaler = torch.amp.GradScaler(init_scale=2048)
157
- self.clip_grad_norm = cfg['clip_grad_norm']
158
-
159
- # linearly warmup learning rate
160
- linear_warmup_steps = cfg['linear_warmup_steps']
161
-
162
- def warmup(currrent_step: int):
163
- return (currrent_step + 1) / (linear_warmup_steps + 1)
164
-
165
- warmup_scheduler = optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda=warmup)
166
-
167
- # setting up learning rate scheduler
168
- if cfg['lr_schedule'] == 'constant':
169
- next_scheduler = optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda=lambda _: 1)
170
- elif cfg['lr_schedule'] == 'poly':
171
- total_num_iter = cfg['iterations']
172
- next_scheduler = optim.lr_scheduler.LambdaLR(self.optimizer,
173
- lr_lambda=lambda x:
174
- (1 - (x / total_num_iter))**0.9)
175
- elif cfg['lr_schedule'] == 'step':
176
- next_scheduler = optim.lr_scheduler.MultiStepLR(self.optimizer,
177
- cfg['lr_schedule_steps'],
178
- cfg['lr_schedule_gamma'])
179
- else:
180
- raise NotImplementedError
181
-
182
- self.scheduler = optim.lr_scheduler.SequentialLR(self.optimizer,
183
- [warmup_scheduler, next_scheduler],
184
- [linear_warmup_steps])
185
-
186
- # Logging info
187
- self.log_text_interval = cfg['log_text_interval']
188
- self.log_extra_interval = cfg['log_extra_interval']
189
- self.save_weights_interval = cfg['save_weights_interval']
190
- self.save_checkpoint_interval = cfg['save_checkpoint_interval']
191
- self.save_copy_iterations = cfg['save_copy_iterations']
192
- self.num_iterations = cfg['num_iterations']
193
- if cfg['debug']:
194
- self.log_text_interval = self.log_extra_interval = 1
195
-
196
- # update() is called when we log metrics, within the logger
197
- self.log.batch_timer = TimeEstimator(self.num_iterations, self.log_text_interval)
198
- # update() is called every iteration, in this script
199
- self.log.data_timer = PartialTimeEstimator(self.num_iterations, 1, ema_alpha=0.9)
200
- else:
201
- self.enter_val()
202
-
203
- def train_fn(
204
- self,
205
- clip_f: torch.Tensor,
206
- sync_f: torch.Tensor,
207
- text_f: torch.Tensor,
208
- a_mean: torch.Tensor,
209
- a_std: torch.Tensor,
210
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
211
- # sample
212
- a_randn = torch.empty_like(a_mean).normal_(generator=self.rng)
213
- x1 = a_mean + a_std * a_randn
214
- bs = x1.shape[0] # batch_size * seq_len * num_channels
215
-
216
- # normalize the latents
217
- x1 = self.network.module.normalize(x1)
218
-
219
- t = log_normal_sample(x1,
220
- generator=self.rng,
221
- m=self.log_normal_sampling_mean,
222
- s=self.log_normal_sampling_scale)
223
- x0, x1, xt, (clip_f, sync_f, text_f) = self.fm.get_x0_xt_c(x1,
224
- t,
225
- Cs=[clip_f, sync_f, text_f],
226
- generator=self.rng)
227
-
228
- # classifier-free training
229
- samples = torch.rand(bs, device=x1.device, generator=self.rng)
230
- null_video = (samples < self.null_condition_probability)
231
- clip_f[null_video] = self.network.module.empty_clip_feat
232
- sync_f[null_video] = self.network.module.empty_sync_feat
233
-
234
- samples = torch.rand(bs, device=x1.device, generator=self.rng)
235
- null_text = (samples < self.null_condition_probability)
236
- text_f[null_text] = self.network.module.empty_string_feat
237
-
238
- pred_v = self.network(xt, clip_f, sync_f, text_f, t)
239
- loss = self.fm.loss(pred_v, x0, x1)
240
- mean_loss = loss.mean()
241
- return x1, loss, mean_loss, t
242
-
243
- def val_fn(
244
- self,
245
- clip_f: torch.Tensor,
246
- sync_f: torch.Tensor,
247
- text_f: torch.Tensor,
248
- x1: torch.Tensor,
249
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
250
- bs = x1.shape[0] # batch_size * seq_len * num_channels
251
- # normalize the latents
252
- x1 = self.network.module.normalize(x1)
253
- t = log_normal_sample(x1,
254
- generator=self.rng,
255
- m=self.log_normal_sampling_mean,
256
- s=self.log_normal_sampling_scale)
257
- x0, x1, xt, (clip_f, sync_f, text_f) = self.fm.get_x0_xt_c(x1,
258
- t,
259
- Cs=[clip_f, sync_f, text_f],
260
- generator=self.rng)
261
-
262
- # classifier-free training
263
- samples = torch.rand(bs, device=x1.device, generator=self.rng)
264
- # null mask is for when a video is provided but we decided to ignore it
265
- null_video = (samples < self.null_condition_probability)
266
- # complete mask is for when a video is not provided or we decided to ignore it
267
- clip_f[null_video] = self.network.module.empty_clip_feat
268
- sync_f[null_video] = self.network.module.empty_sync_feat
269
-
270
- samples = torch.rand(bs, device=x1.device, generator=self.rng)
271
- null_text = (samples < self.null_condition_probability)
272
- text_f[null_text] = self.network.module.empty_string_feat
273
-
274
- pred_v = self.network(xt, clip_f, sync_f, text_f, t)
275
-
276
- loss = self.fm.loss(pred_v, x0, x1)
277
- mean_loss = loss.mean()
278
- return loss, mean_loss, t
279
-
280
- def train_pass(self, data, it: int = 0):
281
-
282
- if not self.for_training:
283
- raise ValueError('train_pass() should not be called when not training.')
284
-
285
- self.enter_train()
286
- with torch.amp.autocast('cuda', enabled=self.use_amp, dtype=torch.bfloat16):
287
- clip_f = data['clip_features'].cuda(non_blocking=True)
288
- sync_f = data['sync_features'].cuda(non_blocking=True)
289
- text_f = data['text_features'].cuda(non_blocking=True)
290
- video_exist = data['video_exist'].cuda(non_blocking=True)
291
- text_exist = data['text_exist'].cuda(non_blocking=True)
292
- a_mean = data['a_mean'].cuda(non_blocking=True)
293
- a_std = data['a_std'].cuda(non_blocking=True)
294
-
295
- # these masks are for non-existent data; masking for CFG training is in train_fn
296
- clip_f[~video_exist] = self.network.module.empty_clip_feat
297
- sync_f[~video_exist] = self.network.module.empty_sync_feat
298
- text_f[~text_exist] = self.network.module.empty_string_feat
299
-
300
- self.log.data_timer.end()
301
- if it % self.log_extra_interval == 0:
302
- unmasked_clip_f = clip_f.clone()
303
- unmasked_sync_f = sync_f.clone()
304
- unmasked_text_f = text_f.clone()
305
- x1, loss, mean_loss, t = self.train_fn(clip_f, sync_f, text_f, a_mean, a_std)
306
-
307
- self.train_integrator.add_dict({'loss': mean_loss})
308
-
309
- if it % self.log_text_interval == 0 and it != 0:
310
- self.train_integrator.add_scalar('lr', self.scheduler.get_last_lr()[0])
311
- self.train_integrator.add_binned_tensor('binned_loss', loss, t)
312
- self.train_integrator.finalize('train', it)
313
- self.train_integrator.reset_except_hooks()
314
-
315
- # Backward pass
316
- self.optimizer.zero_grad(set_to_none=True)
317
- if self.enable_grad_scaler:
318
- self.scaler.scale(mean_loss).backward()
319
- self.scaler.unscale_(self.optimizer)
320
- grad_norm = torch.nn.utils.clip_grad_norm_(self.network.parameters(),
321
- self.clip_grad_norm)
322
- self.scaler.step(self.optimizer)
323
- self.scaler.update()
324
- else:
325
- mean_loss.backward()
326
- grad_norm = torch.nn.utils.clip_grad_norm_(self.network.parameters(),
327
- self.clip_grad_norm)
328
- self.optimizer.step()
329
-
330
- if self.ema is not None and it >= self.ema_start:
331
- self.ema.update()
332
- self.scheduler.step()
333
- self.integrator.add_scalar('grad_norm', grad_norm)
334
-
335
- self.enter_val()
336
- with torch.amp.autocast('cuda', enabled=self.use_amp,
337
- dtype=torch.bfloat16), torch.inference_mode():
338
- try:
339
- if it % self.log_extra_interval == 0:
340
- # save GT audio
341
- # unnormalize the latents
342
- x1 = self.network.module.unnormalize(x1[0:1])
343
- mel = self.features.decode(x1)
344
- audio = self.features.vocode(mel).cpu()[0] # 1 * num_samples
345
- self.log.log_spectrogram('train', f'spec-gt-r{local_rank}', mel.cpu()[0], it)
346
- self.log.log_audio('train',
347
- f'audio-gt-r{local_rank}',
348
- audio,
349
- it,
350
- sample_rate=self.sample_rate)
351
-
352
- # save audio from sampling
353
- x0 = torch.empty_like(x1[0:1]).normal_(generator=self.rng)
354
- clip_f = unmasked_clip_f[0:1]
355
- sync_f = unmasked_sync_f[0:1]
356
- text_f = unmasked_text_f[0:1]
357
- conditions = self.network.module.preprocess_conditions(clip_f, sync_f, text_f)
358
- empty_conditions = self.network.module.get_empty_conditions(x0.shape[0])
359
- cfg_ode_wrapper = lambda t, x: self.network.module.ode_wrapper(
360
- t, x, conditions, empty_conditions, self.cfg_strength)
361
- x1_hat = self.fm.to_data(cfg_ode_wrapper, x0)
362
- x1_hat = self.network.module.unnormalize(x1_hat)
363
- mel = self.features.decode(x1_hat)
364
- audio = self.features.vocode(mel).cpu()[0]
365
- self.log.log_spectrogram('train', f'spec-r{local_rank}', mel.cpu()[0], it)
366
- self.log.log_audio('train',
367
- f'audio-r{local_rank}',
368
- audio,
369
- it,
370
- sample_rate=self.sample_rate)
371
- except Exception as e:
372
- self.log.warning(f'Error in extra logging: {e}')
373
- if self.cfg.debug:
374
- raise
375
-
376
- # Save network weights and checkpoint if needed
377
- save_copy = it in self.save_copy_iterations
378
-
379
- if (it % self.save_weights_interval == 0 and it != 0) or save_copy:
380
- self.save_weights(it)
381
-
382
- if it % self.save_checkpoint_interval == 0 and it != 0:
383
- self.save_checkpoint(it, save_copy=save_copy)
384
-
385
- self.log.data_timer.start()
386
-
387
- @torch.inference_mode()
388
- def validation_pass(self, data, it: int = 0):
389
- self.enter_val()
390
- with torch.amp.autocast('cuda', enabled=self.use_amp, dtype=torch.bfloat16):
391
- clip_f = data['clip_features'].cuda(non_blocking=True)
392
- sync_f = data['sync_features'].cuda(non_blocking=True)
393
- text_f = data['text_features'].cuda(non_blocking=True)
394
- video_exist = data['video_exist'].cuda(non_blocking=True)
395
- text_exist = data['text_exist'].cuda(non_blocking=True)
396
- a_mean = data['a_mean'].cuda(non_blocking=True)
397
- a_std = data['a_std'].cuda(non_blocking=True)
398
-
399
- clip_f[~video_exist] = self.network.module.empty_clip_feat
400
- sync_f[~video_exist] = self.network.module.empty_sync_feat
401
- text_f[~text_exist] = self.network.module.empty_string_feat
402
- a_randn = torch.empty_like(a_mean).normal_(generator=self.rng)
403
- x1 = a_mean + a_std * a_randn
404
-
405
- self.log.data_timer.end()
406
- loss, mean_loss, t = self.val_fn(clip_f.clone(), sync_f.clone(), text_f.clone(), x1)
407
-
408
- self.val_integrator.add_binned_tensor('binned_loss', loss, t)
409
- self.val_integrator.add_dict({'loss': mean_loss})
410
-
411
- self.log.data_timer.start()
412
-
413
- @torch.inference_mode()
414
- def inference_pass(self,
415
- data,
416
- it: int,
417
- data_cfg: DictConfig,
418
- *,
419
- save_eval: bool = True) -> Path:
420
- self.enter_val()
421
- with torch.amp.autocast('cuda', enabled=self.use_amp, dtype=torch.bfloat16):
422
- clip_f = data['clip_features'].cuda(non_blocking=True)
423
- sync_f = data['sync_features'].cuda(non_blocking=True)
424
- text_f = data['text_features'].cuda(non_blocking=True)
425
- video_exist = data['video_exist'].cuda(non_blocking=True)
426
- text_exist = data['text_exist'].cuda(non_blocking=True)
427
- a_mean = data['a_mean'].cuda(non_blocking=True) # for the shape only
428
-
429
- clip_f[~video_exist] = self.network.module.empty_clip_feat
430
- sync_f[~video_exist] = self.network.module.empty_sync_feat
431
- text_f[~text_exist] = self.network.module.empty_string_feat
432
-
433
- # sample
434
- x0 = torch.empty_like(a_mean).normal_(generator=self.rng)
435
- conditions = self.network.module.preprocess_conditions(clip_f, sync_f, text_f)
436
- empty_conditions = self.network.module.get_empty_conditions(x0.shape[0])
437
- cfg_ode_wrapper = lambda t, x: self.network.module.ode_wrapper(
438
- t, x, conditions, empty_conditions, self.cfg_strength)
439
- x1_hat = self.fm.to_data(cfg_ode_wrapper, x0)
440
- x1_hat = self.network.module.unnormalize(x1_hat)
441
- mel = self.features.decode(x1_hat)
442
- audio = self.features.vocode(mel).cpu()
443
- for i in range(audio.shape[0]):
444
- video_id = data['id'][i]
445
- if (not self.for_training) and i == 0:
446
- # save very few videos
447
- self.test_video_joiner.join(video_id, f'{video_id}', audio[i].transpose(0, 1))
448
-
449
- if data_cfg.output_subdir is not None:
450
- # validation
451
- if save_eval:
452
- iter_naming = f'{it:09d}'
453
- else:
454
- iter_naming = 'val-cache'
455
- audio_dir = self.log.log_audio(iter_naming,
456
- f'{video_id}',
457
- audio[i],
458
- it=None,
459
- sample_rate=self.sample_rate,
460
- subdir=Path(data_cfg.output_subdir))
461
- if save_eval and i == 0:
462
- self.val_video_joiner.join(video_id, f'{iter_naming}-{video_id}',
463
- audio[i].transpose(0, 1))
464
- else:
465
- # full test set, usually
466
- audio_dir = self.log.log_audio(f'{data_cfg.tag}-sampled',
467
- f'{video_id}',
468
- audio[i],
469
- it=None,
470
- sample_rate=self.sample_rate)
471
-
472
- return Path(audio_dir)
473
-
474
- @torch.inference_mode()
475
- def eval(self, audio_dir: Path, it: int, data_cfg: DictConfig) -> dict[str, float]:
476
- with torch.amp.autocast('cuda', enabled=False):
477
- if local_rank == 0:
478
- extract(audio_path=audio_dir,
479
- output_path=audio_dir / 'cache',
480
- device='cuda',
481
- batch_size=32,
482
- audio_length=8)
483
- output_metrics = evaluate(gt_audio_cache=Path(data_cfg.gt_cache),
484
- pred_audio_cache=audio_dir / 'cache')
485
- for k, v in output_metrics.items():
486
- # pad k to 10 characters
487
- # pad v to 10 decimal places
488
- self.log.log_scalar(f'{data_cfg.tag}/{k}', v, it)
489
- self.log.info(f'{data_cfg.tag}/{k:<10}: {v:.10f}')
490
- else:
491
- output_metrics = None
492
-
493
- return output_metrics
494
-
495
- def save_weights(self, it, save_copy=False):
496
- if local_rank != 0:
497
- return
498
-
499
- os.makedirs(self.run_path, exist_ok=True)
500
- if save_copy:
501
- model_path = self.run_path / f'{self.exp_id}_{it}.pth'
502
- torch.save(self.network.module.state_dict(), model_path)
503
- self.log.info(f'Network weights saved to {model_path}.')
504
-
505
- # if last exists, move it to a shadow copy
506
- model_path = self.run_path / f'{self.exp_id}_last.pth'
507
- if model_path.exists():
508
- shadow_path = model_path.with_name(model_path.name.replace('last', 'shadow'))
509
- model_path.replace(shadow_path)
510
- self.log.info(f'Network weights shadowed to {shadow_path}.')
511
-
512
- torch.save(self.network.module.state_dict(), model_path)
513
- self.log.info(f'Network weights saved to {model_path}.')
514
-
515
- def save_checkpoint(self, it, save_copy=False):
516
- if local_rank != 0:
517
- return
518
-
519
- checkpoint = {
520
- 'it': it,
521
- 'weights': self.network.module.state_dict(),
522
- 'optimizer': self.optimizer.state_dict(),
523
- 'scheduler': self.scheduler.state_dict(),
524
- 'ema': self.ema.state_dict() if self.ema is not None else None,
525
- }
526
-
527
- os.makedirs(self.run_path, exist_ok=True)
528
- if save_copy:
529
- model_path = self.run_path / f'{self.exp_id}_ckpt_{it}.pth'
530
- torch.save(checkpoint, model_path)
531
- self.log.info(f'Checkpoint saved to {model_path}.')
532
-
533
- # if ckpt_last exists, move it to a shadow copy
534
- model_path = self.run_path / f'{self.exp_id}_ckpt_last.pth'
535
- if model_path.exists():
536
- shadow_path = model_path.with_name(model_path.name.replace('last', 'shadow'))
537
- model_path.replace(shadow_path) # moves the file
538
- self.log.info(f'Checkpoint shadowed to {shadow_path}.')
539
-
540
- torch.save(checkpoint, model_path)
541
- self.log.info(f'Checkpoint saved to {model_path}.')
542
-
543
- def get_latest_checkpoint_path(self):
544
- ckpt_path = self.run_path / f'{self.exp_id}_ckpt_last.pth'
545
- if not ckpt_path.exists():
546
- info_if_rank_zero(self.log, f'No checkpoint found at {ckpt_path}.')
547
- return None
548
- return ckpt_path
549
-
550
- def get_latest_weight_path(self):
551
- weight_path = self.run_path / f'{self.exp_id}_last.pth'
552
- if not weight_path.exists():
553
- self.log.info(f'No weight found at {weight_path}.')
554
- return None
555
- return weight_path
556
-
557
- def get_final_ema_weight_path(self):
558
- weight_path = self.run_path / f'{self.exp_id}_ema_final.pth'
559
- if not weight_path.exists():
560
- self.log.info(f'No weight found at {weight_path}.')
561
- return None
562
- return weight_path
563
-
564
- def load_checkpoint(self, path):
565
- # This method loads everything and should be used to resume training
566
- map_location = 'cuda:%d' % local_rank
567
- checkpoint = torch.load(path, map_location={'cuda:0': map_location}, weights_only=True)
568
-
569
- it = checkpoint['it']
570
- weights = checkpoint['weights']
571
- optimizer = checkpoint['optimizer']
572
- scheduler = checkpoint['scheduler']
573
- if self.ema is not None:
574
- self.ema.load_state_dict(checkpoint['ema'])
575
- self.log.info(f'EMA states loaded from step {self.ema.step}')
576
-
577
- map_location = 'cuda:%d' % local_rank
578
- self.network.module.load_state_dict(weights)
579
- self.optimizer.load_state_dict(optimizer)
580
- self.scheduler.load_state_dict(scheduler)
581
-
582
- self.log.info(f'Global iteration {it} loaded.')
583
- self.log.info('Network weights, optimizer states, and scheduler states loaded.')
584
-
585
- return it
586
-
587
- def load_weights_in_memory(self, src_dict):
588
- self.network.module.load_weights(src_dict)
589
- self.log.info('Network weights loaded from memory.')
590
-
591
- def load_weights(self, path):
592
- # This method loads only the network weight and should be used to load a pretrained model
593
- map_location = 'cuda:%d' % local_rank
594
- src_dict = torch.load(path, map_location={'cuda:0': map_location}, weights_only=True)
595
-
596
- self.log.info(f'Importing network weights from {path}...')
597
- self.load_weights_in_memory(src_dict)
598
-
599
- def weights(self):
600
- return self.network.module.state_dict()
601
-
602
- def enter_train(self):
603
- self.integrator = self.train_integrator
604
- self.network.train()
605
- return self
606
-
607
- def enter_val(self):
608
- self.network.eval()
609
- return self
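For reference, the removed Runner already contains the pieces needed to resume training (get_latest_checkpoint_path and load_checkpoint above). A minimal sketch of how that could be wired up, assuming the same Hydra-provided cfg, log and run_dir used in sample.py below and a distributed launcher that sets LOCAL_RANK; resume_or_start is a hypothetical helper, not part of the repository:

    from mmaudio.runner import Runner

    def resume_or_start(cfg, log, run_dir):
        # Build a training Runner and resume from the last checkpoint if one exists.
        runner = Runner(cfg, log=log, run_path=run_dir, for_training=True)
        ckpt = runner.get_latest_checkpoint_path()
        start_it = runner.load_checkpoint(ckpt) if ckpt is not None else 0
        return runner, start_it
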
mmaudio/sample.py DELETED
@@ -1,90 +0,0 @@
1
- import json
2
- import logging
3
- import os
4
- import random
5
-
6
- import numpy as np
7
- import torch
8
- from hydra.core.hydra_config import HydraConfig
9
- from omegaconf import DictConfig, open_dict
10
- from tqdm import tqdm
11
-
12
- from mmaudio.data.data_setup import setup_test_datasets
13
- from mmaudio.runner import Runner
14
- from mmaudio.utils.dist_utils import info_if_rank_zero
15
- from mmaudio.utils.logger import TensorboardLogger
16
-
17
- local_rank = int(os.environ['LOCAL_RANK'])
18
- world_size = int(os.environ['WORLD_SIZE'])
19
-
20
-
21
- def sample(cfg: DictConfig):
22
- # initial setup
23
- num_gpus = world_size
24
- run_dir = HydraConfig.get().run.dir
25
-
26
- # wrap python logger with a tensorboard logger
27
- log = TensorboardLogger(cfg.exp_id,
28
- run_dir,
29
- logging.getLogger(),
30
- is_rank0=(local_rank == 0),
31
- enable_email=cfg.enable_email and not cfg.debug)
32
-
33
- info_if_rank_zero(log, f'All configuration: {cfg}')
34
- info_if_rank_zero(log, f'Number of GPUs detected: {num_gpus}')
35
-
36
- # cuda setup
37
- torch.cuda.set_device(local_rank)
38
- torch.backends.cudnn.benchmark = cfg.cudnn_benchmark
39
-
40
- # number of dataloader workers
41
- info_if_rank_zero(log, f'Number of dataloader workers (per GPU): {cfg.num_workers}')
42
-
43
- # Set seeds to ensure the same initialization
44
- torch.manual_seed(cfg.seed)
45
- np.random.seed(cfg.seed)
46
- random.seed(cfg.seed)
47
-
48
- # setting up configurations
49
- info_if_rank_zero(log, f'Configuration: {cfg}')
50
- info_if_rank_zero(log, f'Batch size (per GPU): {cfg.batch_size}')
51
-
52
- # construct the trainer
53
- runner = Runner(cfg, log=log, run_path=run_dir, for_training=False).enter_val()
54
-
55
- # load the last weights if needed
56
- if cfg['weights'] is not None:
57
- info_if_rank_zero(log, f'Loading weights from the disk: {cfg["weights"]}')
58
- runner.load_weights(cfg['weights'])
59
- cfg['weights'] = None
60
- else:
61
- weights = runner.get_final_ema_weight_path()
62
- if weights is not None:
63
- info_if_rank_zero(log, f'Automatically finding weight: {weights}')
64
- runner.load_weights(weights)
65
-
66
- # setup datasets
67
- dataset, sampler, loader = setup_test_datasets(cfg)
68
- data_cfg = cfg.data.ExtractedVGG_test
69
- with open_dict(data_cfg):
70
- if cfg.output_name is not None:
71
- # append to the tag
72
- data_cfg.tag = f'{data_cfg.tag}-{cfg.output_name}'
73
-
74
- # loop
75
- audio_path = None
76
- for curr_iter, data in enumerate(tqdm(loader)):
77
- new_audio_path = runner.inference_pass(data, curr_iter, data_cfg)
78
- if audio_path is None:
79
- audio_path = new_audio_path
80
- else:
81
- assert audio_path == new_audio_path, 'Different audio path detected'
82
-
83
- info_if_rank_zero(log, f'Inference completed. Audio path: {audio_path}')
84
- output_metrics = runner.eval(audio_path, curr_iter, data_cfg)
85
-
86
- if local_rank == 0:
87
- # write the output metrics to run_dir
88
- output_metrics_path = os.path.join(run_dir, f'{data_cfg.tag}-output_metrics.json')
89
- with open(output_metrics_path, 'w') as f:
90
- json.dump(output_metrics, f, indent=4)
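Note that the removed sample.py reads LOCAL_RANK and WORLD_SIZE from the environment at import time, so it assumes a distributed launcher such as torchrun. A minimal single-process fallback (an assumption for illustration, not something the diff provides) would be to set those variables before importing the module:

    import os

    # torchrun normally sets these; define defaults for a single-process run.
    os.environ.setdefault('LOCAL_RANK', '0')
    os.environ.setdefault('WORLD_SIZE', '1')
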
mmaudio/utils/email_utils.py DELETED
@@ -1,50 +0,0 @@
1
- import logging
2
- import os
3
- from datetime import datetime
4
-
5
- import requests
6
- from dotenv import load_dotenv
7
- from pytz import timezone
8
-
9
- from mmaudio.utils.timezone import my_timezone
10
-
11
- _source = 'USE YOURS'
12
- _target = 'USE YOURS'
13
-
14
- log = logging.getLogger()
15
-
16
- _fmt = "%Y-%m-%d %H:%M:%S %Z%z"
17
-
18
-
19
- class EmailSender:
20
-
21
- def __init__(self, exp_id: str, enable: bool):
22
- self.exp_id = exp_id
23
- self.enable = enable
24
- if enable:
25
- load_dotenv()
26
- self.MAILGUN_API_KEY = os.getenv('MAILGUN_API_KEY')
27
- if self.MAILGUN_API_KEY is None:
28
- log.warning('MAILGUN_API_KEY is not set')
29
- self.enable = False
30
-
31
- def send(self, subject, content):
32
- if self.enable:
33
- subject = str(subject)
34
- content = str(content)
35
- try:
36
- return requests.post(f'https://api.mailgun.net/v3/{_source}/messages',
37
- auth=('api', self.MAILGUN_API_KEY),
38
- data={
39
- 'from':
40
- f'<agent name>🤖 <mailgun@{_source}>',
41
- 'to': [f'{_target}'],
42
- 'subject':
43
- f'[{self.exp_id}] {subject}',
44
- 'text':
45
- ('\n\n' + content + '\n\n<your sign off>\n' +
46
- datetime.now(timezone(my_timezone)).strftime(_fmt)),
47
- },
48
- timeout=20)
49
- except Exception as e:
50
- log.error(f'Failed to send email: {e}')
mmaudio/utils/log_integrator.py DELETED
@@ -1,112 +0,0 @@
1
- """
2
- Integrate numerical values for some iterations
3
- Typically used for loss computation / logging to tensorboard
4
- Call finalize and create a new Integrator when you want to display/log
5
- """
6
- from typing import Callable, Union
7
-
8
- import torch
9
-
10
- from mmaudio.utils.logger import TensorboardLogger
11
- from mmaudio.utils.tensor_utils import distribute_into_histogram
12
-
13
-
14
- class Integrator:
15
-
16
- def __init__(self, logger: TensorboardLogger, distributed: bool = True):
17
- self.values = {}
18
- self.counts = {}
19
- self.hooks = [] # List is used here to maintain insertion order
20
-
21
- # for binned tensors
22
- self.binned_tensors = {}
23
- self.binned_tensor_indices = {}
24
-
25
- self.logger = logger
26
-
27
- self.distributed = distributed
28
- self.local_rank = torch.distributed.get_rank()
29
- self.world_size = torch.distributed.get_world_size()
30
-
31
- def add_scalar(self, key: str, x: Union[torch.Tensor, int, float]):
32
- if isinstance(x, torch.Tensor):
33
- x = x.detach()
34
- if x.dtype in [torch.long, torch.int, torch.bool]:
35
- x = x.float()
36
-
37
- if key not in self.values:
38
- self.counts[key] = 1
39
- self.values[key] = x
40
- else:
41
- self.counts[key] += 1
42
- self.values[key] += x
43
-
44
- def add_dict(self, tensor_dict: dict[str, torch.Tensor]):
45
- for k, v in tensor_dict.items():
46
- self.add_scalar(k, v)
47
-
48
- def add_binned_tensor(self, key: str, x: torch.Tensor, indices: torch.Tensor):
49
- if key not in self.binned_tensors:
50
- self.binned_tensors[key] = [x.detach().flatten()]
51
- self.binned_tensor_indices[key] = [indices.detach().flatten()]
52
- else:
53
- self.binned_tensors[key].append(x.detach().flatten())
54
- self.binned_tensor_indices[key].append(indices.detach().flatten())
55
-
56
- def add_hook(self, hook: Callable[[torch.Tensor], tuple[str, torch.Tensor]]):
57
- """
58
- Adds a custom hook, i.e. compute new metrics using values in the dict
59
- The hook takes the dict as argument, and returns a (k, v) tuple
60
- e.g. for computing IoU
61
- """
62
- self.hooks.append(hook)
63
-
64
- def reset_except_hooks(self):
65
- self.values = {}
66
- self.counts = {}
67
-
68
- # Average and output the metrics
69
- def finalize(self, prefix: str, it: int, ignore_timer: bool = False) -> None:
70
-
71
- for hook in self.hooks:
72
- k, v = hook(self.values)
73
- self.add_scalar(k, v)
74
-
75
- # for the metrics
76
- outputs = {}
77
- for k, v in self.values.items():
78
- avg = v / self.counts[k]
79
- if self.distributed:
80
- # Inplace operation
81
- if isinstance(avg, torch.Tensor):
82
- avg = avg.cuda()
83
- else:
84
- avg = torch.tensor(avg).cuda()
85
- torch.distributed.reduce(avg, dst=0)
86
-
87
- if self.local_rank == 0:
88
- avg = (avg / self.world_size).cpu().item()
89
- outputs[k] = avg
90
- else:
91
- # Simple does it
92
- outputs[k] = avg
93
-
94
- if (not self.distributed) or (self.local_rank == 0):
95
- self.logger.log_metrics(prefix, outputs, it, ignore_timer=ignore_timer)
96
-
97
- # for the binned tensors
98
- for k, v in self.binned_tensors.items():
99
- x = torch.cat(v, dim=0)
100
- indices = torch.cat(self.binned_tensor_indices[k], dim=0)
101
- hist, count = distribute_into_histogram(x, indices)
102
-
103
- if self.distributed:
104
- torch.distributed.reduce(hist, dst=0)
105
- torch.distributed.reduce(count, dst=0)
106
- if self.local_rank == 0:
107
- hist = hist / count
108
- else:
109
- hist = hist / count
110
-
111
- if (not self.distributed) or (self.local_rank == 0):
112
- self.logger.log_histogram(f'{prefix}/{k}', hist, it)
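The removed Integrator follows the accumulate-then-flush pattern visible in runner.py above. A minimal usage sketch, assuming an initialised torch.distributed process group, a TensorboardLogger named log, and placeholder names loader, train_step and log_text_interval:

    from mmaudio.utils.log_integrator import Integrator

    # distributed=True requires torch.distributed to be initialised already.
    integrator = Integrator(log, distributed=True)
    for it, batch in enumerate(loader):
        loss = train_step(batch)                 # hypothetical step returning a scalar tensor
        integrator.add_dict({'loss': loss})      # accumulate every iteration
        if it % log_text_interval == 0 and it != 0:
            integrator.finalize('train', it)     # reduce across ranks, then log the averages
            integrator.reset_except_hooks()      # start a fresh accumulation window
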
mmaudio/utils/logger.py DELETED
@@ -1,231 +0,0 @@
1
- """
2
- Dumps things to tensorboard and console
3
- """
4
-
5
- import datetime
6
- import logging
7
- import math
8
- import os
9
- from collections import defaultdict
10
- from pathlib import Path
11
- from typing import Optional, Union
12
-
13
- import matplotlib.pyplot as plt
14
- import numpy as np
15
- import torch
16
- import torchaudio
17
- from PIL import Image
18
- from pytz import timezone
19
- from torch.utils.tensorboard import SummaryWriter
20
-
21
- from mmaudio.utils.email_utils import EmailSender
22
- from mmaudio.utils.time_estimator import PartialTimeEstimator, TimeEstimator
23
- from mmaudio.utils.timezone import my_timezone
24
-
25
-
26
- def tensor_to_numpy(image: torch.Tensor):
27
- image_np = (image.numpy() * 255).astype('uint8')
28
- return image_np
29
-
30
-
31
- def detach_to_cpu(x: torch.Tensor):
32
- return x.detach().cpu()
33
-
34
-
35
- def fix_width_trunc(x: float):
36
- return ('{:.9s}'.format('{:0.9f}'.format(x)))
37
-
38
-
39
- def plot_spectrogram(spectrogram: np.ndarray, title=None, ylabel="freq_bin", ax=None):
40
- if ax is None:
41
- _, ax = plt.subplots(1, 1)
42
- if title is not None:
43
- ax.set_title(title)
44
- ax.set_ylabel(ylabel)
45
- ax.imshow(spectrogram, origin="lower", aspect="auto", interpolation="nearest")
46
-
47
-
48
- class TensorboardLogger:
49
-
50
- def __init__(self,
51
- exp_id: str,
52
- run_dir: Union[Path, str],
53
- py_logger: logging.Logger,
54
- *,
55
- is_rank0: bool = False,
56
- enable_email: bool = False):
57
- self.exp_id = exp_id
58
- self.run_dir = Path(run_dir)
59
- self.py_log = py_logger
60
- self.email_sender = EmailSender(exp_id, enable=(is_rank0 and enable_email))
61
- if is_rank0:
62
- self.tb_log = SummaryWriter(run_dir)
63
- else:
64
- self.tb_log = None
65
-
66
- # Get current git info for logging
67
- try:
68
- import git
69
- repo = git.Repo(".")
70
- git_info = str(repo.active_branch) + ' ' + str(repo.head.commit.hexsha)
71
- except (ImportError, RuntimeError, TypeError):
72
- print('Failed to fetch git info. Defaulting to None')
73
- git_info = 'None'
74
-
75
- self.log_string('git', git_info)
76
-
77
- # log the SLURM job id if available
78
- job_id = os.environ.get('SLURM_JOB_ID', None)
79
- if job_id is not None:
80
- self.log_string('slurm_job_id', job_id)
81
- self.email_sender.send(f'Job {job_id} started', f'Job started {run_dir}')
82
-
83
- # used when logging metrics
84
- self.batch_timer: TimeEstimator = None
85
- self.data_timer: PartialTimeEstimator = None
86
-
87
- self.nan_count = defaultdict(int)
88
-
89
- def log_scalar(self, tag: str, x: float, it: int):
90
- if self.tb_log is None:
91
- return
92
- if math.isnan(x) and 'grad_norm' not in tag:
93
- self.nan_count[tag] += 1
94
- if self.nan_count[tag] == 10:
95
- self.email_sender.send(
96
- f'Nan detected in {tag} @ {self.run_dir}',
97
- f'Nan detected in {tag} at iteration {it}; run_dir: {self.run_dir}')
98
- else:
99
- self.nan_count[tag] = 0
100
- self.tb_log.add_scalar(tag, x, it)
101
-
102
- def log_metrics(self,
103
- prefix: str,
104
- metrics: dict[str, float],
105
- it: int,
106
- ignore_timer: bool = False):
107
- msg = f'{self.exp_id}-{prefix} - it {it:6d}: '
108
- metrics_msg = ''
109
- for k, v in sorted(metrics.items()):
110
- self.log_scalar(f'{prefix}/{k}', v, it)
111
- metrics_msg += f'{k: >10}:{v:.7f},\t'
112
-
113
- if self.batch_timer is not None and not ignore_timer:
114
- self.batch_timer.update()
115
- avg_time = self.batch_timer.get_and_reset_avg_time()
116
- data_time = self.data_timer.get_and_reset_avg_time()
117
-
118
- # add time to tensorboard
119
- self.log_scalar(f'{prefix}/avg_time', avg_time, it)
120
- self.log_scalar(f'{prefix}/data_time', data_time, it)
121
-
122
- est = self.batch_timer.get_est_remaining(it)
123
- est = datetime.timedelta(seconds=est)
124
- if est.days > 0:
125
- remaining_str = f'{est.days}d {est.seconds // 3600}h'
126
- else:
127
- remaining_str = f'{est.seconds // 3600}h {(est.seconds%3600) // 60}m'
128
- eta = datetime.datetime.now(timezone(my_timezone)) + est
129
- eta_str = eta.strftime('%Y-%m-%d %H:%M:%S %Z%z')
130
- time_msg = f'avg_time:{avg_time:.3f},data:{data_time:.3f},remaining:{remaining_str},eta:{eta_str},\t'
131
- msg = f'{msg} {time_msg}'
132
-
133
- msg = f'{msg} {metrics_msg}'
134
- self.py_log.info(msg)
135
-
136
- def log_histogram(self, tag: str, hist: torch.Tensor, it: int):
137
- if self.tb_log is None:
138
- return
139
- # hist should be a 1D tensor
140
- hist = hist.cpu().numpy()
141
- fig, ax = plt.subplots()
142
- x_range = np.linspace(0, 1, len(hist))
143
- ax.bar(x_range, hist, width=1 / (len(hist) - 1))
144
- ax.set_xticks(x_range)
145
- ax.set_xticklabels(x_range)
146
- plt.tight_layout()
147
- self.tb_log.add_figure(tag, fig, it)
148
- plt.close()
149
-
150
- def log_image(self, prefix: str, tag: str, image: np.ndarray, it: int):
151
- image_dir = self.run_dir / f'{prefix}_images'
152
- image_dir.mkdir(exist_ok=True, parents=True)
153
-
154
- image = Image.fromarray(image)
155
- image.save(image_dir / f'{it:09d}_{tag}.png')
156
-
157
- def log_audio(self,
158
- prefix: str,
159
- tag: str,
160
- waveform: torch.Tensor,
161
- it: Optional[int] = None,
162
- *,
163
- subdir: Optional[Path] = None,
164
- sample_rate: int = 16000) -> Path:
165
- if subdir is None:
166
- audio_dir = self.run_dir / prefix
167
- else:
168
- audio_dir = self.run_dir / subdir / prefix
169
- audio_dir.mkdir(exist_ok=True, parents=True)
170
-
171
- if it is None:
172
- name = f'{tag}.flac'
173
- else:
174
- name = f'{it:09d}_{tag}.flac'
175
-
176
- torchaudio.save(audio_dir / name,
177
- waveform.cpu().float(),
178
- sample_rate=sample_rate,
179
- channels_first=True)
180
- return Path(audio_dir)
181
-
182
- def log_spectrogram(
183
- self,
184
- prefix: str,
185
- tag: str,
186
- spec: torch.Tensor,
187
- it: Optional[int],
188
- *,
189
- subdir: Optional[Path] = None,
190
- ):
191
- if subdir is None:
192
- spec_dir = self.run_dir / prefix
193
- else:
194
- spec_dir = self.run_dir / subdir / prefix
195
- spec_dir.mkdir(exist_ok=True, parents=True)
196
-
197
- if it is None:
198
- name = f'{tag}.png'
199
- else:
200
- name = f'{it:09d}_{tag}.png'
201
-
202
- plot_spectrogram(spec.cpu().float())
203
- plt.tight_layout()
204
- plt.savefig(spec_dir / name)
205
- plt.close()
206
-
207
- def log_string(self, tag: str, x: str):
208
- self.py_log.info(f'{tag} - {x}')
209
- if self.tb_log is None:
210
- return
211
- self.tb_log.add_text(tag, x)
212
-
213
- def debug(self, x):
214
- self.py_log.debug(x)
215
-
216
- def info(self, x):
217
- self.py_log.info(x)
218
-
219
- def warning(self, x):
220
- self.py_log.warning(x)
221
-
222
- def error(self, x):
223
- self.py_log.error(x)
224
-
225
- def critical(self, x):
226
- self.py_log.critical(x)
227
-
228
- self.email_sender.send(f'Error occurred in {self.run_dir}', x)
229
-
230
- def complete(self):
231
- self.email_sender.send(f'Job completed in {self.run_dir}', 'Job completed')
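The removed TensorboardLogger is constructed the same way sample.py does above. A small sketch of the scalar and audio logging calls it exposes; the experiment id, output directory and tag names are placeholders:

    import logging

    import torch
    from mmaudio.utils.logger import TensorboardLogger

    log = TensorboardLogger('exp-001', 'output/exp-001', logging.getLogger(),
                            is_rank0=True, enable_email=False)
    log.log_scalar('train/loss', 0.123, 100)
    waveform = torch.zeros(1, 16000)             # (channels, samples), channels-first
    log.log_audio('train', 'audio-demo', waveform, 100, sample_rate=16000)
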
mmaudio/utils/synthesize_ema.py DELETED
@@ -1,19 +0,0 @@
1
- from typing import Optional
2
-
3
- from nitrous_ema import PostHocEMA
4
- from omegaconf import DictConfig
5
-
6
- from mmaudio.model.networks import get_my_mmaudio
7
-
8
-
9
- def synthesize_ema(cfg: DictConfig, sigma: float, step: Optional[int]):
10
- vae = get_my_mmaudio(cfg.model)
11
- emas = PostHocEMA(vae,
12
- sigma_rels=cfg.ema.sigma_rels,
13
- update_every=cfg.ema.update_every,
14
- checkpoint_every_num_steps=cfg.ema.checkpoint_every,
15
- checkpoint_folder=cfg.ema.checkpoint_folder)
16
-
17
- synthesized_ema = emas.synthesize_ema_model(sigma_rel=sigma, step=step, device='cpu')
18
- state_dict = synthesized_ema.ema_model.state_dict()
19
- return state_dict
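The removed synthesize_ema helper rebuilds a post-hoc EMA model from the checkpoints written to cfg.ema.checkpoint_folder during training. A sketch of turning its output into the file that Runner.get_final_ema_weight_path() looks for; the sigma value and run_dir path are placeholders, not values from the diff:

    import torch
    from mmaudio.utils.synthesize_ema import synthesize_ema

    # cfg is the Hydra training config; run_dir is assumed to be a pathlib.Path.
    state_dict = synthesize_ema(cfg, sigma=0.05, step=None)
    torch.save(state_dict, run_dir / f'{cfg.exp_id}_ema_final.pth')
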
mmaudio/utils/tensor_utils.py DELETED
@@ -1,14 +0,0 @@
1
- import torch
2
-
3
-
4
- def distribute_into_histogram(loss: torch.Tensor,
5
- t: torch.Tensor,
6
- num_bins: int = 25) -> tuple[torch.Tensor, torch.Tensor]:
7
- loss = loss.detach().flatten()
8
- t = t.detach().flatten()
9
- t = (t * num_bins).long()
10
- hist = torch.zeros(num_bins, device=loss.device)
11
- count = torch.zeros(num_bins, device=loss.device)
12
- hist.scatter_add_(0, t, loss)
13
- count.scatter_add_(0, t, torch.ones_like(loss))
14
- return hist, count
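distribute_into_histogram bins per-element losses by their normalised timestep, which must lie in [0, 1) so that the computed bin index stays below num_bins. A small self-contained sketch with made-up shapes:

    import torch
    from mmaudio.utils.tensor_utils import distribute_into_histogram

    loss = torch.rand(8, 128)                      # per-element losses
    t = torch.rand(8, 1).expand(8, 128)            # timesteps in [0, 1), one per sample
    hist, count = distribute_into_histogram(loss, t, num_bins=25)
    mean_loss_per_bin = hist / count.clamp(min=1)  # guard against empty bins
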
mmaudio/utils/time_estimator.py DELETED
@@ -1,72 +0,0 @@
1
- import time
2
-
3
-
4
- class TimeEstimator:
5
-
6
- def __init__(self, total_iter: int, step_size: int, ema_alpha: float = 0.7):
7
- self.avg_time_window = [] # window-based average
8
- self.exp_avg_time = None # exponential moving average
9
- self.alpha = ema_alpha # for exponential moving average
10
-
11
- self.last_time = time.time() # would not be accurate for the first iteration but well
12
- self.total_iter = total_iter
13
- self.step_size = step_size
14
-
15
- self._buffering_exp = True
16
-
17
- # call this at a fixed interval
18
- # does not have to be every step
19
- def update(self):
20
- curr_time = time.time()
21
- time_per_iter = curr_time - self.last_time
22
- self.last_time = curr_time
23
-
24
- self.avg_time_window.append(time_per_iter)
25
-
26
- if self._buffering_exp:
27
- if self.exp_avg_time is not None:
28
- # discard the first iteration call to not pollute the ema
29
- self._buffering_exp = False
30
- self.exp_avg_time = time_per_iter
31
- else:
32
- self.exp_avg_time = self.alpha * self.exp_avg_time + (1 - self.alpha) * time_per_iter
33
-
34
- def get_est_remaining(self, it: int):
35
- if self.exp_avg_time is None:
36
- return 0
37
-
38
- remaining_iter = self.total_iter - it
39
- return remaining_iter * self.exp_avg_time / self.step_size
40
-
41
- def get_and_reset_avg_time(self):
42
- avg = sum(self.avg_time_window) / len(self.avg_time_window) / self.step_size
43
- self.avg_time_window = []
44
- return avg
45
-
46
-
47
- class PartialTimeEstimator(TimeEstimator):
48
- """
49
- Used where the start_time and the end_time do not align
50
- """
51
-
52
- def update(self):
53
- raise RuntimeError('Please use start() and end() for PartialTimeEstimator')
54
-
55
- def start(self):
56
- self.last_time = time.time()
57
-
58
- def end(self):
59
- assert self.last_time is not None, 'Please call start() before calling end()'
60
- curr_time = time.time()
61
- time_per_iter = curr_time - self.last_time
62
- self.last_time = None
63
-
64
- self.avg_time_window.append(time_per_iter)
65
-
66
- if self._buffering_exp:
67
- if self.exp_avg_time is not None:
68
- # discard the first iteration call to not pollute the ema
69
- self._buffering_exp = False
70
- self.exp_avg_time = time_per_iter
71
- else:
72
- self.exp_avg_time = self.alpha * self.exp_avg_time + (1 - self.alpha) * time_per_iter
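The two removed timers were driven from runner.py above: the batch timer is updated once per logging interval, while the partial timer brackets only the dataloading wait. A sketch of that pattern with placeholder values for the iteration count, logging interval and loader:

    from mmaudio.utils.time_estimator import PartialTimeEstimator, TimeEstimator

    num_iterations, log_interval = 10_000, 100
    batch_timer = TimeEstimator(num_iterations, log_interval)
    data_timer = PartialTimeEstimator(num_iterations, 1, ema_alpha=0.9)

    data_timer.start()
    for it, batch in enumerate(loader):            # loader is assumed to exist
        data_timer.end()                           # time spent waiting on data
        ...                                        # forward/backward step goes here
        if it % log_interval == 0 and it != 0:
            batch_timer.update()                   # once per logging interval
            print(batch_timer.get_and_reset_avg_time(),
                  data_timer.get_and_reset_avg_time())
        data_timer.start()
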
mmaudio/utils/timezone.py DELETED
@@ -1 +0,0 @@
1
- my_timezone = 'US/Central'