Rex Cheng
		
	committed on
		
		
					Commit 
							
							·
						
						dbac20f
	
1
								Parent(s):
							
							f2786fb
								
initial commit
Browse files. This view is limited to 50 files because it contains too many changes.
							See raw diff
- LICENSE +21 -0
- README.md +152 -14
- app.py +149 -0
- demo.py +135 -0
- docs/images/icon.png +0 -0
- docs/index.html +147 -0
- docs/style.css +78 -0
- docs/style_videos.css +52 -0
- docs/video_gen.html +254 -0
- docs/video_main.html +98 -0
- docs/video_vgg.html +452 -0
- mmaudio/__init__.py +0 -0
- mmaudio/eval_utils.py +245 -0
- mmaudio/ext/__init__.py +1 -0
- mmaudio/ext/autoencoder/__init__.py +1 -0
- mmaudio/ext/autoencoder/autoencoder.py +48 -0
- mmaudio/ext/autoencoder/edm2_utils.py +168 -0
- mmaudio/ext/autoencoder/vae.py +369 -0
- mmaudio/ext/autoencoder/vae_modules.py +117 -0
- mmaudio/ext/bigvgan/LICENSE +21 -0
- mmaudio/ext/bigvgan/__init__.py +1 -0
- mmaudio/ext/bigvgan/activations.py +120 -0
- mmaudio/ext/bigvgan/alias_free_torch/__init__.py +6 -0
- mmaudio/ext/bigvgan/alias_free_torch/act.py +28 -0
- mmaudio/ext/bigvgan/alias_free_torch/filter.py +95 -0
- mmaudio/ext/bigvgan/alias_free_torch/resample.py +49 -0
- mmaudio/ext/bigvgan/bigvgan.py +32 -0
- mmaudio/ext/bigvgan/bigvgan_vocoder.yml +63 -0
- mmaudio/ext/bigvgan/env.py +18 -0
- mmaudio/ext/bigvgan/incl_licenses/LICENSE_1 +21 -0
- mmaudio/ext/bigvgan/incl_licenses/LICENSE_2 +21 -0
- mmaudio/ext/bigvgan/incl_licenses/LICENSE_3 +201 -0
- mmaudio/ext/bigvgan/incl_licenses/LICENSE_4 +29 -0
- mmaudio/ext/bigvgan/incl_licenses/LICENSE_5 +16 -0
- mmaudio/ext/bigvgan/models.py +255 -0
- mmaudio/ext/bigvgan/utils.py +31 -0
- mmaudio/ext/bigvgan_v2/LICENSE +21 -0
- mmaudio/ext/bigvgan_v2/__init__.py +0 -0
- mmaudio/ext/bigvgan_v2/activations.py +126 -0
- mmaudio/ext/bigvgan_v2/alias_free_activation/cuda/__init__.py +0 -0
- mmaudio/ext/bigvgan_v2/alias_free_activation/cuda/activation1d.py +77 -0
- mmaudio/ext/bigvgan_v2/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- mmaudio/ext/bigvgan_v2/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
- mmaudio/ext/bigvgan_v2/alias_free_activation/cuda/compat.h +29 -0
- mmaudio/ext/bigvgan_v2/alias_free_activation/cuda/load.py +86 -0
- mmaudio/ext/bigvgan_v2/alias_free_activation/cuda/type_shim.h +92 -0
- mmaudio/ext/bigvgan_v2/alias_free_activation/torch/__init__.py +6 -0
- mmaudio/ext/bigvgan_v2/alias_free_activation/torch/act.py +32 -0
- mmaudio/ext/bigvgan_v2/alias_free_activation/torch/filter.py +101 -0
- mmaudio/ext/bigvgan_v2/alias_free_activation/torch/resample.py +54 -0
    	
        LICENSE
    ADDED
    
    | @@ -0,0 +1,21 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            MIT License
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            Copyright (c) 2024 Ho Kei Cheng
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            Permission is hereby granted, free of charge, to any person obtaining a copy
         | 
| 6 | 
            +
            of this software and associated documentation files (the "Software"), to deal
         | 
| 7 | 
            +
            in the Software without restriction, including without limitation the rights
         | 
| 8 | 
            +
            to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
         | 
| 9 | 
            +
            copies of the Software, and to permit persons to whom the Software is
         | 
| 10 | 
            +
            furnished to do so, subject to the following conditions:
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            The above copyright notice and this permission notice shall be included in all
         | 
| 13 | 
            +
            copies or substantial portions of the Software.
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
         | 
| 16 | 
            +
            IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
         | 
| 17 | 
            +
            FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
         | 
| 18 | 
            +
            AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
         | 
| 19 | 
            +
            LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
         | 
| 20 | 
            +
            OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
         | 
| 21 | 
            +
            SOFTWARE.
         | 
    	
        README.md
    CHANGED
    
    | @@ -1,14 +1,152 @@ | |
| 1 | 
            -
             | 
| 2 | 
            -
             | 
| 3 | 
            -
             | 
| 4 | 
            -
             | 
| 5 | 
            -
             | 
| 6 | 
            -
             | 
| 7 | 
            -
             | 
| 8 | 
            -
             | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # [Taming Multimodal Joint Training for High-Quality Video-to-Audio Synthesis](https://hkchengrex.github.io/MMAudio)
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            [Ho Kei Cheng](https://hkchengrex.github.io/), [Masato Ishii](https://scholar.google.co.jp/citations?user=RRIO1CcAAAAJ), [Akio Hayakawa](https://scholar.google.com/citations?user=sXAjHFIAAAAJ), [Takashi Shibuya](https://scholar.google.com/citations?user=XCRO260AAAAJ), [Alexander Schwing](https://www.alexander-schwing.de/), [Yuki Mitsufuji](https://www.yukimitsufuji.com/)
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            University of Illinois Urbana-Champaign, Sony AI, and Sony Group Corporation
         | 
| 6 | 
            +
             | 
| 7 | 
            +
             | 
| 8 | 
            +
            [[Paper (being prepared)]](https://hkchengrex.github.io/MMAudio) [[Project Page]](https://hkchengrex.github.io/MMAudio)
         | 
| 9 | 
            +
             | 
| 10 | 
            +
             | 
| 11 | 
            +
            **Note: This repository is still under construction. Single-example inference should work as expected. The training code will be added. Code is subject to non-backward-compatible changes.**
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            ## Highlight
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            MMAudio generates synchronized audio given video and/or text inputs.
         | 
| 16 | 
            +
            Our key innovation is multimodal joint training which allows training on a wide range of audio-visual and audio-text datasets.
         | 
| 17 | 
            +
            Moreover, a synchronization module aligns the generated audio with the video frames.
         | 
| 18 | 
            +
             | 
| 19 | 
            +
             | 
| 20 | 
            +
            ## Results
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            (All audio from our algorithm MMAudio)
         | 
| 23 | 
            +
             | 
| 24 | 
            +
            Videos from Sora: 
         | 
| 25 | 
            +
             | 
| 26 | 
            +
            https://github.com/user-attachments/assets/82afd192-0cee-48a1-86ca-bd39b8c8f330
         | 
| 27 | 
            +
             | 
| 28 | 
            +
             | 
| 29 | 
            +
            Videos from MovieGen/Hunyuan Video/VGGSound: 
         | 
| 30 | 
            +
             | 
| 31 | 
            +
            https://github.com/user-attachments/assets/29230d4e-21c1-4cf8-a221-c28f2af6d0ca
         | 
| 32 | 
            +
             | 
| 33 | 
            +
            For more results, visit https://hkchengrex.com/MMAudio/video_main.html.
         | 
| 34 | 
            +
             | 
| 35 | 
            +
            ## Installation
         | 
| 36 | 
            +
             | 
| 37 | 
            +
            We have only tested this on Ubuntu.
         | 
| 38 | 
            +
             | 
| 39 | 
            +
            ### Prerequisites
         | 
| 40 | 
            +
             | 
| 41 | 
            +
            We recommend using a [miniforge](https://github.com/conda-forge/miniforge) environment.
         | 
| 42 | 
            +
             | 
| 43 | 
            +
            - Python 3.8+
         | 
| 44 | 
            +
            - PyTorch **2.5.1+** and corresponding torchvision/torchaudio (pick your CUDA version https://pytorch.org/)
         | 
| 45 | 
            +
            - ffmpeg<7 ([this is required by torchaudio](https://pytorch.org/audio/master/installation.html#optional-dependencies), you can install it in a miniforge environment with `conda install -c conda-forge 'ffmpeg<7'`)
         | 
| 46 | 
            +
             | 
| 47 | 
            +
            **Clone our repository:**
         | 
| 48 | 
            +
             | 
| 49 | 
            +
            ```bash
         | 
| 50 | 
            +
            git clone https://github.com/hkchengrex/MMAudio.git
         | 
| 51 | 
            +
            ```
         | 
| 52 | 
            +
             | 
| 53 | 
            +
            **Install with pip:**
         | 
| 54 | 
            +
             | 
| 55 | 
            +
            ```bash
         | 
| 56 | 
            +
            cd MMAudio
         | 
| 57 | 
            +
            pip install -e .
         | 
| 58 | 
            +
            ```
         | 
| 59 | 
            +
             | 
| 60 | 
            +
            (If you encounter the `File "setup.py" not found` error, upgrade your pip with `pip install --upgrade pip`.)
         | 
| 61 | 
            +
             | 
| 62 | 
            +
            **Pretrained models:**
         | 
| 63 | 
            +
             | 
| 64 | 
            +
            The models will be downloaded automatically when you run the demo script. MD5 checksums are provided in `mmaudio/utils/download_utils.py`
         | 
| 65 | 
            +
             | 
| 66 | 
            +
            | Model    | Download link | File size |
         | 
| 67 | 
            +
            | -------- | ------- | ------- |
         | 
| 68 | 
            +
            | Flow prediction network, small 16kHz | <a href="https://databank.illinois.edu/datafiles/k6jve/download" download="mmaudio_small_16k.pth">mmaudio_small_16k.pth</a> | 601M |
         | 
| 69 | 
            +
            | Flow prediction network, small 44.1kHz | <a href="https://databank.illinois.edu/datafiles/864ya/download" download="mmaudio_small_44k.pth">mmaudio_small_44k.pth</a> | 601M |
         | 
| 70 | 
            +
            | Flow prediction network, medium 44.1kHz | <a href="https://databank.illinois.edu/datafiles/pa94t/download" download="mmaudio_medium_44k.pth">mmaudio_medium_44k.pth</a> | 2.4G |
         | 
| 71 | 
            +
            | Flow prediction network, large 44.1kHz **(recommended)** | <a href="https://databank.illinois.edu/datafiles/4jx76/download" download="mmaudio_large_44k.pth">mmaudio_large_44k.pth</a> | 3.9G |
         | 
| 72 | 
            +
            | 16kHz VAE | <a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/v1-16.pth">v1-16.pth</a> | 655M |
         | 
| 73 | 
            +
            | 16kHz BigVGAN vocoder |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/best_netG.pt">best_netG.pt</a> | 429M |
         | 
| 74 | 
            +
            | 44.1kHz VAE |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/v1-44.pth">v1-44.pth</a> | 1.2G | 
         | 
| 75 | 
            +
            | Synchformer visual encoder |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/synchformer_state_dict.pth">synchformer_state_dict.pth</a> | 907M |
         | 
| 76 | 
            +
             | 
| 77 | 
            +
            The 44.1kHz vocoder will be downloaded automatically.
         | 
| 78 | 
            +
             | 
| 79 | 
            +
            The expected directory structure (full):
         | 
| 80 | 
            +
             | 
| 81 | 
            +
            ```bash
         | 
| 82 | 
            +
            MMAudio
         | 
| 83 | 
            +
            ├── ext_weights
         | 
| 84 | 
            +
            │   ├── best_netG.pt
         | 
| 85 | 
            +
            │   ├── synchformer_state_dict.pth
         | 
| 86 | 
            +
            │   ├── v1-16.pth
         | 
| 87 | 
            +
            │   └── v1-44.pth
         | 
| 88 | 
            +
            ├── weights
         | 
| 89 | 
            +
            │   ├── mmaudio_small_16k.pth
         | 
| 90 | 
            +
            │   ├── mmaudio_small_44k.pth
         | 
| 91 | 
            +
            │   ├── mmaudio_medium_44k.pth
         | 
| 92 | 
            +
            │   └── mmaudio_large_44k.pth
         | 
| 93 | 
            +
            └── ...
         | 
| 94 | 
            +
            ```
         | 
| 95 | 
            +
             | 
| 96 | 
            +
            The expected directory structure (minimal, for the recommended model only):
         | 
| 97 | 
            +
             | 
| 98 | 
            +
            ```bash
         | 
| 99 | 
            +
            MMAudio
         | 
| 100 | 
            +
            ├── ext_weights
         | 
| 101 | 
            +
            │   ├── synchformer_state_dict.pth
         | 
| 102 | 
            +
            │   └── v1-44.pth
         | 
| 103 | 
            +
            ├── weights
         | 
| 104 | 
            +
            │   └── mmaudio_large_44k.pth
         | 
| 105 | 
            +
            └── ...
         | 
| 106 | 
            +
            ```
         | 
| 107 | 
            +
             | 
| 108 | 
            +
            ## Demo
         | 
| 109 | 
            +
             | 
| 110 | 
            +
            By default, these scripts use the `large_44k` model. 
         | 
| 111 | 
            +
            In our experiments, inference only takes around 6GB of GPU memory (in 16-bit mode) which should fit in most modern GPUs.
         | 
| 112 | 
            +
             | 
| 113 | 
            +
            ### Command-line interface
         | 
| 114 | 
            +
             | 
| 115 | 
            +
            With `demo.py`
         | 
| 116 | 
            +
            ```bash
         | 
| 117 | 
            +
            python demo.py --duration=8 --video=<path to video> --prompt "your prompt" 
         | 
| 118 | 
            +
            ```
         | 
| 119 | 
            +
            The output (audio in `.flac` format, and video in `.mp4` format) will be saved in `./output`.
         | 
| 120 | 
            +
            See the file for more options.
         | 
| 121 | 
            +
            Simply omit the `--video` option for text-to-audio synthesis.
         | 
| 122 | 
            +
            The default output (and training) duration is 8 seconds. Longer/shorter durations could also work, but a large deviation from the training duration may result in lower quality.
         | 
| 123 | 
            +
             | 
| 124 | 
            +
             | 
| 125 | 
            +
            ### Gradio interface
         | 
| 126 | 
            +
             | 
| 127 | 
            +
            Supports video-to-audio and text-to-audio synthesis.
         | 
| 128 | 
            +
             | 
| 129 | 
            +
            ```
         | 
| 130 | 
            +
            python app.py
         | 
| 131 | 
            +
            ```
         | 
| 132 | 
            +
             | 
| 133 | 
            +
            ### Known limitations
         | 
| 134 | 
            +
             | 
| 135 | 
            +
            1. The model sometimes generates undesired unintelligible human speech-like sounds
         | 
| 136 | 
            +
            2. The model sometimes generates undesired background music
         | 
| 137 | 
            +
            3. The model struggles with unfamiliar concepts, e.g., it can generate "gunfires" but not "RPG firing".
         | 
| 138 | 
            +
             | 
| 139 | 
            +
            We believe all three of these limitations can be addressed with more high-quality training data.
         | 
| 140 | 
            +
             | 
| 141 | 
            +
            ## Training
         | 
| 142 | 
            +
            Work in progress.
         | 
| 143 | 
            +
             | 
| 144 | 
            +
            ## Evaluation
         | 
| 145 | 
            +
            Work in progress.
         | 
| 146 | 
            +
             | 
| 147 | 
            +
            ## Acknowledgement
         | 
| 148 | 
            +
            Many thanks to:
         | 
| 149 | 
            +
            - [Make-An-Audio 2](https://github.com/bytedance/Make-An-Audio-2) for the 16kHz BigVGAN pretrained model
         | 
| 150 | 
            +
            - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
         | 
| 151 | 
            +
            - [Synchformer](https://github.com/v-iashin/Synchformer) 
         | 
| 152 | 
            +
             | 
    	
        app.py
    ADDED
    
    | @@ -0,0 +1,149 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import logging
         | 
| 2 | 
            +
            from datetime import datetime
         | 
| 3 | 
            +
            from pathlib import Path
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            import gradio as gr
         | 
| 6 | 
            +
            import torch
         | 
| 7 | 
            +
            import torchaudio
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
         | 
| 10 | 
            +
                                            setup_eval_logging)
         | 
| 11 | 
            +
            from mmaudio.model.flow_matching import FlowMatching
         | 
| 12 | 
            +
            from mmaudio.model.networks import MMAudio, get_my_mmaudio
         | 
| 13 | 
            +
            from mmaudio.model.sequence_config import SequenceConfig
         | 
| 14 | 
            +
            from mmaudio.model.utils.features_utils import FeaturesUtils
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            torch.backends.cuda.matmul.allow_tf32 = True
         | 
| 17 | 
            +
            torch.backends.cudnn.allow_tf32 = True
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            log = logging.getLogger()
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            device = 'cuda'
         | 
| 22 | 
            +
            dtype = torch.bfloat16
         | 
| 23 | 
            +
             | 
| 24 | 
            +
            model: ModelConfig = all_model_cfg['large_44k_v2']
         | 
| 25 | 
            +
            model.download_if_needed()
         | 
| 26 | 
            +
            output_dir = Path('./output/gradio')
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            setup_eval_logging()
         | 
| 29 | 
            +
             | 
| 30 | 
            +
             | 
| 31 | 
            +
            def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
         | 
| 32 | 
            +
                seq_cfg = model.seq_cfg
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                net: MMAudio = get_my_mmaudio(model.model_name).to(device, dtype).eval()
         | 
| 35 | 
            +
                net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
         | 
| 36 | 
            +
                log.info(f'Loaded weights from {model.model_path}')
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                feature_utils = FeaturesUtils(tod_vae_ckpt=model.vae_path,
         | 
| 39 | 
            +
                                              synchformer_ckpt=model.synchformer_ckpt,
         | 
| 40 | 
            +
                                              enable_conditions=True,
         | 
| 41 | 
            +
                                              mode=model.mode,
         | 
| 42 | 
            +
                                              bigvgan_vocoder_ckpt=model.bigvgan_16k_path)
         | 
| 43 | 
            +
                feature_utils = feature_utils.to(device, dtype).eval()
         | 
| 44 | 
            +
             | 
| 45 | 
            +
                return net, feature_utils, seq_cfg
         | 
| 46 | 
            +
             | 
| 47 | 
            +
             | 
| 48 | 
            +
            net, feature_utils, seq_cfg = get_model()
         | 
| 49 | 
            +
             | 
| 50 | 
            +
             | 
| 51 | 
            +
            @torch.inference_mode()
         | 
| 52 | 
            +
            def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
         | 
| 53 | 
            +
                               cfg_strength: float, duration: float):
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                rng = torch.Generator(device=device)
         | 
| 56 | 
            +
                rng.manual_seed(seed)
         | 
| 57 | 
            +
                fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                clip_frames, sync_frames, duration = load_video(video, duration)
         | 
| 60 | 
            +
                clip_frames = clip_frames.unsqueeze(0)
         | 
| 61 | 
            +
                sync_frames = sync_frames.unsqueeze(0)
         | 
| 62 | 
            +
                seq_cfg.duration = duration
         | 
| 63 | 
            +
                net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
         | 
| 64 | 
            +
             | 
| 65 | 
            +
                audios = generate(clip_frames,
         | 
| 66 | 
            +
                                  sync_frames, [prompt],
         | 
| 67 | 
            +
                                  negative_text=[negative_prompt],
         | 
| 68 | 
            +
                                  feature_utils=feature_utils,
         | 
| 69 | 
            +
                                  net=net,
         | 
| 70 | 
            +
                                  fm=fm,
         | 
| 71 | 
            +
                                  rng=rng,
         | 
| 72 | 
            +
                                  cfg_strength=cfg_strength)
         | 
| 73 | 
            +
                audio = audios.float().cpu()[0]
         | 
| 74 | 
            +
             | 
| 75 | 
            +
                current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
         | 
| 76 | 
            +
                output_dir.mkdir(exist_ok=True, parents=True)
         | 
| 77 | 
            +
                video_save_path = output_dir / f'{current_time_string}.mp4'
         | 
| 78 | 
            +
                make_video(video,
         | 
| 79 | 
            +
                           video_save_path,
         | 
| 80 | 
            +
                           audio,
         | 
| 81 | 
            +
                           sampling_rate=seq_cfg.sampling_rate,
         | 
| 82 | 
            +
                           duration_sec=seq_cfg.duration)
         | 
| 83 | 
            +
                return video_save_path
         | 
| 84 | 
            +
             | 
| 85 | 
            +
             | 
| 86 | 
            +
            @torch.inference_mode()
         | 
| 87 | 
            +
            def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
         | 
| 88 | 
            +
                              duration: float):
         | 
| 89 | 
            +
             | 
| 90 | 
            +
                rng = torch.Generator(device=device)
         | 
| 91 | 
            +
                rng.manual_seed(seed)
         | 
| 92 | 
            +
                fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
         | 
| 93 | 
            +
             | 
| 94 | 
            +
                clip_frames = sync_frames = None
         | 
| 95 | 
            +
                seq_cfg.duration = duration
         | 
| 96 | 
            +
                net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
         | 
| 97 | 
            +
             | 
| 98 | 
            +
                audios = generate(clip_frames,
         | 
| 99 | 
            +
                                  sync_frames, [prompt],
         | 
| 100 | 
            +
                                  negative_text=[negative_prompt],
         | 
| 101 | 
            +
                                  feature_utils=feature_utils,
         | 
| 102 | 
            +
                                  net=net,
         | 
| 103 | 
            +
                                  fm=fm,
         | 
| 104 | 
            +
                                  rng=rng,
         | 
| 105 | 
            +
                                  cfg_strength=cfg_strength)
         | 
| 106 | 
            +
                audio = audios.float().cpu()[0]
         | 
| 107 | 
            +
             | 
| 108 | 
            +
                current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
         | 
| 109 | 
            +
                output_dir.mkdir(exist_ok=True, parents=True)
         | 
| 110 | 
            +
                audio_save_path = output_dir / f'{current_time_string}.flac'
         | 
| 111 | 
            +
                torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
         | 
| 112 | 
            +
                return audio_save_path
         | 
| 113 | 
            +
             | 
| 114 | 
            +
             | 
| 115 | 
            +
            video_to_audio_tab = gr.Interface(
         | 
| 116 | 
            +
                fn=video_to_audio,
         | 
| 117 | 
            +
                inputs=[
         | 
| 118 | 
            +
                    gr.Video(),
         | 
| 119 | 
            +
                    gr.Text(label='Prompt'),
         | 
| 120 | 
            +
                    gr.Text(label='Negative prompt', value='music'),
         | 
| 121 | 
            +
                    gr.Number(label='Seed', value=0, precision=0, minimum=0),
         | 
| 122 | 
            +
                    gr.Number(label='Num steps', value=25, precision=0, minimum=1),
         | 
| 123 | 
            +
                    gr.Number(label='Guidance Strength', value=4.5, minimum=1),
         | 
| 124 | 
            +
                    gr.Number(label='Duration (sec)', value=8, minimum=1),
         | 
| 125 | 
            +
                ],
         | 
| 126 | 
            +
                outputs='playable_video',
         | 
| 127 | 
            +
                cache_examples=False,
         | 
| 128 | 
            +
                title='MMAudio — Video-to-Audio Synthesis',
         | 
| 129 | 
            +
            )
         | 
| 130 | 
            +
             | 
| 131 | 
            +
            text_to_audio_tab = gr.Interface(
         | 
| 132 | 
            +
                fn=text_to_audio,
         | 
| 133 | 
            +
                inputs=[
         | 
| 134 | 
            +
                    gr.Text(label='Prompt'),
         | 
| 135 | 
            +
                    gr.Text(label='Negative prompt'),
         | 
| 136 | 
            +
                    gr.Number(label='Seed', value=0, precision=0, minimum=0),
         | 
| 137 | 
            +
                    gr.Number(label='Num steps', value=25, precision=0, minimum=1),
         | 
| 138 | 
            +
                    gr.Number(label='Guidance Strength', value=4.5, minimum=1),
         | 
| 139 | 
            +
                    gr.Number(label='Duration (sec)', value=8, minimum=1),
         | 
| 140 | 
            +
                ],
         | 
| 141 | 
            +
                outputs='audio',
         | 
| 142 | 
            +
                cache_examples=False,
         | 
| 143 | 
            +
                title='MMAudio — Text-to-Audio Synthesis',
         | 
| 144 | 
            +
            )
         | 
| 145 | 
            +
             | 
| 146 | 
            +
            if __name__ == "__main__":
         | 
| 147 | 
            +
                gr.TabbedInterface([video_to_audio_tab, text_to_audio_tab],
         | 
| 148 | 
            +
                                   ['Video-to-Audio', 'Text-to-Audio']).launch(server_port=17888,
         | 
| 149 | 
            +
                                                                               allowed_paths=[output_dir])
         | 
    	
        demo.py
    ADDED
    
    | @@ -0,0 +1,135 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import logging
         | 
| 2 | 
            +
            from argparse import ArgumentParser
         | 
| 3 | 
            +
            from pathlib import Path
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            import torch
         | 
| 6 | 
            +
            import torchaudio
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate,
         | 
| 9 | 
            +
                                            load_video, make_video, setup_eval_logging)
         | 
| 10 | 
            +
            from mmaudio.model.flow_matching import FlowMatching
         | 
| 11 | 
            +
            from mmaudio.model.networks import MMAudio, get_my_mmaudio
         | 
| 12 | 
            +
            from mmaudio.model.utils.features_utils import FeaturesUtils
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            torch.backends.cuda.matmul.allow_tf32 = True
         | 
| 15 | 
            +
            torch.backends.cudnn.allow_tf32 = True
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            log = logging.getLogger()
         | 
| 18 | 
            +
             | 
| 19 | 
            +
             | 
| 20 | 
            +
            @torch.inference_mode()
            def main():
                """Command-line demo: synthesize audio from a video and/or a text prompt.

                Parses the CLI arguments, looks the requested variant up in
                ``all_model_cfg`` (downloading its files if needed), loads the network
                and the feature utilities on CUDA, runs ``generate`` once, and writes
                the result as a ``.flac`` file into the output directory. When a video
                was given and ``--skip_video_composite`` is not set, ``make_video`` is
                also called to write an ``.mp4`` next to the audio file.

                Raises:
                    ValueError: if ``--variant`` is not a key of ``all_model_cfg``.
                """
                setup_eval_logging()

                parser = ArgumentParser()
                parser.add_argument('--variant',
                                    type=str,
                                    default='large_44k_v2',
                                    help='small_16k, small_44k, medium_44k, large_44k, large_44k_v2')
                parser.add_argument('--video', type=Path, help='Path to the video file')
                parser.add_argument('--prompt', type=str, help='Input prompt', default='')
                parser.add_argument('--negative_prompt', type=str, help='Negative prompt', default='')
                parser.add_argument('--duration', type=float, default=8.0)
                parser.add_argument('--cfg_strength', type=float, default=4.5)
                parser.add_argument('--num_steps', type=int, default=25)

                parser.add_argument('--mask_away_clip', action='store_true')

                parser.add_argument('--output', type=Path, help='Output directory', default='./output')
                parser.add_argument('--seed', type=int, help='Random seed', default=42)
                parser.add_argument('--skip_video_composite', action='store_true')
                parser.add_argument('--full_precision', action='store_true')

                args = parser.parse_args()

                if args.variant not in all_model_cfg:
                    raise ValueError(f'Unknown model variant: {args.variant}')
                model: ModelConfig = all_model_cfg[args.variant]
                model.download_if_needed()
                seq_cfg = model.seq_cfg

                if args.video:
                    video_path: Path = Path(args.video).expanduser()
                else:
                    video_path = None
                prompt: str = args.prompt
                negative_prompt: str = args.negative_prompt
                # argparse applies type=Path to the string default as well, so this is a Path.
                output_dir: Path = args.output.expanduser()
                seed: int = args.seed
                num_steps: int = args.num_steps
                duration: float = args.duration
                cfg_strength: float = args.cfg_strength
                skip_video_composite: bool = args.skip_video_composite
                mask_away_clip: bool = args.mask_away_clip

                # NOTE(review): the device is hard-coded; this script requires a CUDA device.
                device = 'cuda'
                dtype = torch.float32 if args.full_precision else torch.bfloat16

                output_dir.mkdir(parents=True, exist_ok=True)

                # load a pretrained model
                net: MMAudio = get_my_mmaudio(model.model_name).to(device, dtype).eval()
                net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
                log.info(f'Loaded weights from {model.model_path}')

                # misc setup
                rng = torch.Generator(device=device)
                rng.manual_seed(seed)
                fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)

                feature_utils = FeaturesUtils(tod_vae_ckpt=model.vae_path,
                                              synchformer_ckpt=model.synchformer_ckpt,
                                              enable_conditions=True,
                                              mode=model.mode,
                                              bigvgan_vocoder_ckpt=model.bigvgan_16k_path)
                feature_utils = feature_utils.to(device, dtype).eval()

                if video_path is not None:
                    log.info(f'Using video {video_path}')
                    # load_video hands back a duration as well; it replaces the requested one
                    # (presumably clamped to the actual video length -- confirm in eval_utils).
                    clip_frames, sync_frames, duration = load_video(video_path, duration)
                    if mask_away_clip:
                        clip_frames = None
                    else:
                        clip_frames = clip_frames.unsqueeze(0)
                    sync_frames = sync_frames.unsqueeze(0)
                else:
                    log.info('No video provided -- text-to-audio mode')
                    clip_frames = sync_frames = None

                # The sequence lengths passed to the network are read from seq_cfg after the
                # duration is set.
                seq_cfg.duration = duration
                net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)

                log.info(f'Prompt: {prompt}')
                log.info(f'Negative prompt: {negative_prompt}')

                audios = generate(clip_frames,
                                  sync_frames, [prompt],
                                  negative_text=[negative_prompt],
                                  feature_utils=feature_utils,
                                  net=net,
                                  fm=fm,
                                  rng=rng,
                                  cfg_strength=cfg_strength)
                audio = audios.float().cpu()[0]
                if video_path is not None:
                    save_path = output_dir / f'{video_path.stem}.flac'
                else:
                    # Fall back to a fixed stem when the prompt is empty (the default) or
                    # sanitizes to nothing, so the file is not named just '.flac'.
                    safe_filename = (prompt.replace(' ', '_').replace('/', '_').replace('.', '')
                                     or 'audio')
                    save_path = output_dir / f'{safe_filename}.flac'
                torchaudio.save(save_path, audio, seq_cfg.sampling_rate)

                log.info(f'Audio saved to {save_path}')
                if video_path is not None and not skip_video_composite:
                    video_save_path = output_dir / f'{video_path.stem}.mp4'
                    make_video(video_path,
                               video_save_path,
                               audio,
                               sampling_rate=seq_cfg.sampling_rate,
                               duration_sec=seq_cfg.duration)
                    # video_save_path already includes output_dir; joining it again would log
                    # a doubled path for a relative output directory.
                    log.info(f'Video saved to {video_save_path}')

                log.info('Memory usage: %.2f GB', torch.cuda.max_memory_allocated() / (2**30))
         | 
| 132 | 
            +
             | 
| 133 | 
            +
             | 
| 134 | 
            +
            if __name__ == '__main__':
         | 
| 135 | 
            +
                main()
         | 
    	
        docs/images/icon.png
    ADDED
    
    |   | 
    	
        docs/index.html
    ADDED
    
    | @@ -0,0 +1,147 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            <!DOCTYPE html>
         | 
| 2 | 
            +
            <html lang="en">
         | 
| 3 | 
            +
            <head>
         | 
| 4 | 
            +
                <!-- Google tag (gtag.js) -->
         | 
| 5 | 
            +
                <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
         | 
| 6 | 
            +
                <script>
         | 
| 7 | 
            +
                window.dataLayer = window.dataLayer || [];
         | 
| 8 | 
            +
                function gtag(){dataLayer.push(arguments);}
         | 
| 9 | 
            +
                gtag('js', new Date());
         | 
| 10 | 
            +
                gtag('config', 'G-0JKBJ3WRJZ');
         | 
| 11 | 
            +
                </script>
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                <link rel="preconnect" href="https://fonts.googleapis.com">
         | 
| 14 | 
            +
                <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
         | 
| 15 | 
            +
                <link href="https://fonts.googleapis.com/css2?family=Source+Sans+3&display=swap" rel="stylesheet">
         | 
| 16 | 
            +
                <meta charset="UTF-8">
         | 
| 17 | 
            +
                <title>MMAudio</title>
         | 
| 18 | 
            +
             | 
| 19 | 
            +
                <link rel="icon" type="image/png" href="images/icon.png">
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                <meta name="viewport" content="width=device-width, initial-scale=1">
         | 
| 22 | 
            +
                <!-- CSS only -->
         | 
| 23 | 
            +
                <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
         | 
| 24 | 
            +
                    integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
         | 
| 25 | 
            +
                <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                <link rel="stylesheet" href="style.css">
         | 
| 28 | 
            +
            </head>
         | 
| 29 | 
            +
            <body>
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                <body>
         | 
| 32 | 
            +
                    <br><br><br><br>
         | 
| 33 | 
            +
                    <div class="container">
         | 
| 34 | 
            +
                        <div class="row text-center" style="font-size:38px">
         | 
| 35 | 
            +
                            <div class="col strong">
         | 
| 36 | 
            +
                                Taming Multimodal Joint Training for High-Quality <br>Video-to-Audio Synthesis
         | 
| 37 | 
            +
                            </div>
         | 
| 38 | 
            +
                        </div>
         | 
| 39 | 
            +
                
         | 
| 40 | 
            +
                        <br>
         | 
| 41 | 
            +
                        <div class="row text-center" style="font-size:28px">
         | 
| 42 | 
            +
                            <div class="col">
         | 
| 43 | 
            +
                                arXiv 2024
         | 
| 44 | 
            +
                            </div>
         | 
| 45 | 
            +
                        </div>
         | 
| 46 | 
            +
                        <br>
         | 
| 47 | 
            +
                
         | 
| 48 | 
            +
                        <div class="h-100 row text-center heavy justify-content-md-center" style="font-size:22px;">
         | 
| 49 | 
            +
                            <div class="col-sm-auto px-lg-2">
         | 
| 50 | 
            +
                                <a href="https://hkchengrex.github.io/">Ho Kei Cheng<sup>1</sup></a>
         | 
| 51 | 
            +
                            </div>
         | 
| 52 | 
            +
                            <div class="col-sm-auto px-lg-2">
         | 
| 53 | 
            +
                                <nobr><a href="https://scholar.google.co.jp/citations?user=RRIO1CcAAAAJ">Masato Ishii<sup>2</sup></a></nobr>
         | 
| 54 | 
            +
                            </div>
         | 
| 55 | 
            +
                            <div class="col-sm-auto px-lg-2">
         | 
| 56 | 
            +
                                <nobr><a href="https://scholar.google.com/citations?user=sXAjHFIAAAAJ">Akio Hayakawa<sup>2</sup></a></nobr>
         | 
| 57 | 
            +
                            </div>
         | 
| 58 | 
            +
                            <div class="col-sm-auto px-lg-2">
         | 
| 59 | 
            +
                                <nobr><a href="https://scholar.google.com/citations?user=XCRO260AAAAJ">Takashi Shibuya<sup>2</sup></a></nobr>
         | 
| 60 | 
            +
                            </div>
         | 
| 61 | 
            +
                            <div class="col-sm-auto px-lg-2">
         | 
| 62 | 
            +
                                <nobr><a href="https://www.alexander-schwing.de/">Alexander Schwing<sup>1</sup></a></nobr>
         | 
| 63 | 
            +
                            </div>
         | 
| 64 | 
            +
                            <div class="col-sm-auto px-lg-2" >
         | 
| 65 | 
            +
                                <nobr><a href="https://www.yukimitsufuji.com/">Yuki Mitsufuji<sup>2,3</sup></a></nobr>
         | 
| 66 | 
            +
                            </div>
         | 
| 67 | 
            +
                        </div>
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                        <div class="h-100 row text-center heavy justify-content-md-center" style="font-size:22px;">
         | 
| 70 | 
            +
                            <div class="col-sm-auto px-lg-2">
         | 
| 71 | 
            +
                                <sup>1</sup>University of Illinois Urbana-Champaign
         | 
| 72 | 
            +
                            </div>
         | 
| 73 | 
            +
                            <div class="col-sm-auto px-lg-2">
         | 
| 74 | 
            +
                                <sup>2</sup>Sony AI
         | 
| 75 | 
            +
                            </div>
         | 
| 76 | 
            +
                            <div class="col-sm-auto px-lg-2">
         | 
| 77 | 
            +
                                <sup>3</sup>Sony Group Corporation
         | 
| 78 | 
            +
                            </div>
         | 
| 79 | 
            +
                        </div>
         | 
| 80 | 
            +
                
         | 
| 81 | 
            +
                        <br>
         | 
| 82 | 
            +
                
         | 
| 83 | 
            +
                        <br>
         | 
| 84 | 
            +
                
         | 
| 85 | 
            +
                        <div class="h-100 row text-center justify-content-md-center" style="font-size:20px;">
         | 
| 86 | 
            +
                            <!-- <div class="col-sm-2">
         | 
| 87 | 
            +
                                <a href="https://arxiv.org/abs/2310.12982">[arXiv]</a>
         | 
| 88 | 
            +
                            </div> -->
         | 
| 89 | 
            +
                            <div class="col-sm-3">
         | 
| 90 | 
            +
                                <a href="">[Paper (being prepared)]</a>
         | 
| 91 | 
            +
                            </div>
         | 
| 92 | 
            +
                            <div class="col-sm-3">
         | 
| 93 | 
            +
                                <a href="https://github.com/hkchengrex/MMAudio">[Code]</a>
         | 
| 94 | 
            +
                            </div>
         | 
| 95 | 
            +
                            <!-- <div class="col-sm-2">
         | 
| 96 | 
            +
                                <a
         | 
| 97 | 
            +
                                    href="https://colab.research.google.com/drive/1yo43XTbjxuWA7XgCUO9qxAi7wBI6HzvP?usp=sharing">[Colab]</a>
         | 
| 98 | 
            +
                            </div> -->
         | 
| 99 | 
            +
                        </div>
         | 
| 100 | 
            +
                
         | 
| 101 | 
            +
                        <br>
         | 
| 102 | 
            +
                
         | 
| 103 | 
            +
                        <hr>
         | 
| 104 | 
            +
                
         | 
| 105 | 
            +
                        <div class="row" style="font-size:32px">
         | 
| 106 | 
            +
                            <div class="col strong">
         | 
| 107 | 
            +
                                TL;DR
         | 
| 108 | 
            +
                            </div>
         | 
| 109 | 
            +
                        </div>
         | 
| 110 | 
            +
                        <br>
         | 
| 111 | 
            +
                        <div class="row">
         | 
| 112 | 
            +
                            <div class="col">
         | 
| 113 | 
            +
                                <p class="light" style="text-align: left;">
         | 
| 114 | 
            +
                                    MMAudio generates synchronized audio given video and/or text inputs.
         | 
| 115 | 
            +
                                </p>
         | 
| 116 | 
            +
                            </div>
         | 
| 117 | 
            +
                        </div>
         | 
| 118 | 
            +
                
         | 
| 119 | 
            +
                        <br>
         | 
| 120 | 
            +
                        <hr>
         | 
| 121 | 
            +
                        <br>
         | 
| 122 | 
            +
                
         | 
| 123 | 
            +
                        <div class="row" style="font-size:32px">
         | 
| 124 | 
            +
                            <div class="col strong">
         | 
| 125 | 
            +
                                Demo
         | 
| 126 | 
            +
                            </div>
         | 
| 127 | 
            +
                        </div>
         | 
| 128 | 
            +
                        <br>
         | 
| 129 | 
            +
                        <div class="row" style="font-size:48px">
         | 
| 130 | 
            +
                            <div class="col strong text-center">
         | 
| 131 | 
            +
                    <a href="video_main.html" style="text-decoration: underline;">&lt;More results&gt;</a>
         | 
| 132 | 
            +
                            </div>
         | 
| 133 | 
            +
                        </div>
         | 
| 134 | 
            +
                        <br>
         | 
| 135 | 
            +
                        <div class="video-container" style="text-align: center;">
         | 
| 136 | 
            +
                            <iframe src="https://youtube.com/embed/YElewUT2M4M"></iframe>
         | 
| 137 | 
            +
                            </div>
         | 
| 138 | 
            +
             | 
| 139 | 
            +
                        <br>
         | 
| 140 | 
            +
                
         | 
| 141 | 
            +
                        <br><br>
         | 
| 142 | 
            +
                        <br><br>
         | 
| 143 | 
            +
                
         | 
| 144 | 
            +
                    </div>
         | 
| 145 | 
            +
             | 
| 146 | 
            +
            </body>
         | 
| 147 | 
            +
            </html>
         | 
    	
        docs/style.css
    ADDED
    
    | @@ -0,0 +1,78 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            body {
         | 
| 2 | 
            +
                font-family: 'Source Sans 3', sans-serif;
         | 
| 3 | 
            +
                font-size: 18px;
         | 
| 4 | 
            +
                margin-left: auto;
         | 
| 5 | 
            +
                margin-right: auto;
         | 
| 6 | 
            +
                font-weight: 400;
         | 
| 7 | 
            +
                height: 100%;
         | 
| 8 | 
            +
                max-width: 1000px;
         | 
| 9 | 
            +
            }
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            table {
         | 
| 12 | 
            +
                width: 100%;
         | 
| 13 | 
            +
                border-collapse: collapse;
         | 
| 14 | 
            +
            }
         | 
| 15 | 
            +
            th, td {
         | 
| 16 | 
            +
                border: 1px solid #ddd;
         | 
| 17 | 
            +
                padding: 8px;
         | 
| 18 | 
            +
                text-align: center;
         | 
| 19 | 
            +
            }
         | 
| 20 | 
            +
            th {
         | 
| 21 | 
            +
                background-color: #f2f2f2;
         | 
| 22 | 
            +
            }
         | 
| 23 | 
            +
            video {
         | 
| 24 | 
            +
                width: 100%;
         | 
| 25 | 
            +
                height: auto;
         | 
| 26 | 
            +
            }
         | 
| 27 | 
            +
            p {
         | 
| 28 | 
            +
                font-size: 28px;
         | 
| 29 | 
            +
            }
         | 
| 30 | 
            +
            h2 {
         | 
| 31 | 
            +
                font-size: 36px;
         | 
| 32 | 
            +
            }
         | 
| 33 | 
            +
             | 
| 34 | 
            +
            .strong {
         | 
| 35 | 
            +
                font-weight: 700;
         | 
| 36 | 
            +
            }
         | 
| 37 | 
            +
             | 
| 38 | 
            +
            .light {
         | 
| 39 | 
            +
                font-weight: 100;
         | 
| 40 | 
            +
            }
         | 
| 41 | 
            +
             | 
| 42 | 
            +
            .heavy {
         | 
| 43 | 
            +
                font-weight: 900;
         | 
| 44 | 
            +
            }
         | 
| 45 | 
            +
             | 
| 46 | 
            +
            .column {
         | 
| 47 | 
            +
                float: left;
         | 
| 48 | 
            +
            }
         | 
| 49 | 
            +
             | 
| 50 | 
            +
            a:link,
         | 
| 51 | 
            +
            a:visited {
         | 
| 52 | 
            +
                color: #05538f;
         | 
| 53 | 
            +
                text-decoration: none;
         | 
| 54 | 
            +
            }
         | 
| 55 | 
            +
             | 
| 56 | 
            +
            a:hover {
         | 
| 57 | 
            +
                color: #63cbdd;
         | 
| 58 | 
            +
            }
         | 
| 59 | 
            +
             | 
| 60 | 
            +
            hr {
         | 
| 61 | 
            +
                border: 0;
         | 
| 62 | 
            +
                height: 1px;
         | 
| 63 | 
            +
                background-image: linear-gradient(to right, rgba(0, 0, 0, 0), rgba(0, 0, 0, 0.75), rgba(0, 0, 0, 0));
         | 
| 64 | 
            +
            }
         | 
| 65 | 
            +
             | 
| 66 | 
            +
            .video-container {
         | 
| 67 | 
            +
                position: relative;
         | 
| 68 | 
            +
                padding-bottom: 56.25%; /* 16:9 */
         | 
| 69 | 
            +
                height: 0;
         | 
| 70 | 
            +
              }
         | 
| 71 | 
            +
              
         | 
| 72 | 
            +
            .video-container iframe {
         | 
| 73 | 
            +
                position: absolute;
         | 
| 74 | 
            +
                top: 0;
         | 
| 75 | 
            +
                left: 0;
         | 
| 76 | 
            +
                width: 100%;
         | 
| 77 | 
            +
                height: 100%;
         | 
| 78 | 
            +
            }
         | 
    	
        docs/style_videos.css
    ADDED
    
    | @@ -0,0 +1,52 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            body {
         | 
| 2 | 
            +
                font-family: 'Source Sans 3', sans-serif;
         | 
| 3 | 
            +
                font-size: 1.5vh;
         | 
| 4 | 
            +
                font-weight: 400;
         | 
| 5 | 
            +
            }
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            table {
         | 
| 8 | 
            +
                width: 100%;
         | 
| 9 | 
            +
                border-collapse: collapse;
         | 
| 10 | 
            +
            }
         | 
| 11 | 
            +
            th, td {
         | 
| 12 | 
            +
                border: 1px solid #ddd;
         | 
| 13 | 
            +
                padding: 8px;
         | 
| 14 | 
            +
                text-align: center;
         | 
| 15 | 
            +
            }
         | 
| 16 | 
            +
            th {
         | 
| 17 | 
            +
                background-color: #f2f2f2;
         | 
| 18 | 
            +
            }
         | 
| 19 | 
            +
            video {
         | 
| 20 | 
            +
                width: 100%;
         | 
| 21 | 
            +
                height: auto;
         | 
| 22 | 
            +
            }
         | 
| 23 | 
            +
            p {
         | 
| 24 | 
            +
                font-size: 1.5vh;
         | 
| 25 | 
            +
                font-weight: bold;
         | 
| 26 | 
            +
            }
         | 
| 27 | 
            +
            h2 {
         | 
| 28 | 
            +
                font-size: 2vh;
         | 
| 29 | 
            +
                font-weight: bold;
         | 
| 30 | 
            +
            }
         | 
| 31 | 
            +
             | 
| 32 | 
            +
            .video-container {
         | 
| 33 | 
            +
                position: relative;
         | 
| 34 | 
            +
                padding-bottom: 56.25%; /* 16:9 */
         | 
| 35 | 
            +
                height: 0;
         | 
| 36 | 
            +
              }
         | 
| 37 | 
            +
              
         | 
| 38 | 
            +
            .video-container iframe {
         | 
| 39 | 
            +
                position: absolute;
         | 
| 40 | 
            +
                top: 0;
         | 
| 41 | 
            +
                left: 0;
         | 
| 42 | 
            +
                width: 100%;
         | 
| 43 | 
            +
                height: 100%;
         | 
| 44 | 
            +
            }
         | 
| 45 | 
            +
             | 
| 46 | 
            +
            .video-header {
         | 
| 47 | 
            +
                background-color: #f2f2f2;
         | 
| 48 | 
            +
                text-align: center;
         | 
| 49 | 
            +
                font-size: 1.5vh;
         | 
| 50 | 
            +
                font-weight: bold;
         | 
| 51 | 
            +
                padding: 8px;
         | 
| 52 | 
            +
            }
         | 
    	
        docs/video_gen.html
    ADDED
    
    | @@ -0,0 +1,254 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            <!DOCTYPE html>
         | 
| 2 | 
            +
            <html lang="en">
         | 
| 3 | 
            +
            <head>
         | 
| 4 | 
            +
                <!-- Google tag (gtag.js) -->
         | 
| 5 | 
            +
                <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
         | 
| 6 | 
            +
                <script>
         | 
| 7 | 
            +
                window.dataLayer = window.dataLayer || [];
         | 
| 8 | 
            +
                function gtag(){dataLayer.push(arguments);}
         | 
| 9 | 
            +
                gtag('js', new Date());
         | 
| 10 | 
            +
                gtag('config', 'G-0JKBJ3WRJZ');
         | 
| 11 | 
            +
                </script>
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
         | 
| 14 | 
            +
                <meta charset="UTF-8">
         | 
| 15 | 
            +
                <title>MMAudio</title>
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                <link rel="icon" type="image/png" href="images/icon.png">
         | 
| 18 | 
            +
             | 
| 19 | 
            +
                <meta name="viewport" content="width=device-width, initial-scale=1">
         | 
| 20 | 
            +
                <!-- CSS only -->
         | 
| 21 | 
            +
                <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
         | 
| 22 | 
            +
                    integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
         | 
| 23 | 
            +
                <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"></script>
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                <link rel="stylesheet" href="style_videos.css">
         | 
| 26 | 
            +
            </head>
         | 
| 27 | 
            +
            <body>
         | 
| 28 | 
            +
             | 
| 29 | 
            +
                <div id="moviegen_all">
         | 
| 30 | 
            +
                <h2 id="moviegen" style="text-align: center;">Comparisons with Movie Gen Audio on Videos Generated by MovieGen</h2>
         | 
| 31 | 
            +
                <p id="moviegen1" style="overflow: hidden;">
         | 
| 32 | 
            +
                    Example 1: Ice cracking with sharp snapping sound, and metal tool scraping against the ice surface. 
         | 
| 33 | 
            +
                    <span style="float: right;"><a href="#index">Back to index</a></span>
         | 
| 34 | 
            +
                </p> 
         | 
| 35 | 
            +
             | 
| 36 | 
            +
                <div class="row g-1">
         | 
| 37 | 
            +
                    <div class="col-sm-6">
         | 
| 38 | 
            +
                        <div class="video-header">Movie Gen Audio</div>
         | 
| 39 | 
            +
                        <div class="video-container">
         | 
| 40 | 
            +
                            <iframe src="https://youtube.com/embed/d7Lb0ihtGcE"></iframe>
         | 
| 41 | 
            +
                        </div> 
         | 
| 42 | 
            +
                    </div>
         | 
| 43 | 
            +
                    <div class="col-sm-6">
         | 
| 44 | 
            +
                        <div class="video-header">Ours</div>
         | 
| 45 | 
            +
                        <div class="video-container">
         | 
| 46 | 
            +
                            <iframe src="https://youtube.com/embed/F4JoJ2r2m8U"></iframe>
         | 
| 47 | 
            +
                            </div> 
         | 
| 48 | 
            +
                    </div>
         | 
| 49 | 
            +
                </div>
         | 
| 50 | 
            +
                <br>
         | 
| 51 | 
            +
             | 
| 52 | 
            +
                <!-- <p id="moviegen2">Example 2: Rhythmic splashing and lapping of water. <span style="float:right;"><a href="#index">Back to index</a></span> </p> 
         | 
| 53 | 
            +
             | 
| 54 | 
            +
                <table>
         | 
| 55 | 
            +
                    <thead>
         | 
| 56 | 
            +
                        <tr>
         | 
| 57 | 
            +
                            <th>Movie Gen Audio</th>
         | 
| 58 | 
            +
                            <th>Ours</th>
         | 
| 59 | 
            +
                        </tr>
         | 
| 60 | 
            +
                    </thead>
         | 
| 61 | 
            +
                    <tbody>
         | 
| 62 | 
            +
                        <tr>
         | 
| 63 | 
            +
                            <td width="50%">
         | 
| 64 | 
            +
                                <div class="video-container">
         | 
| 65 | 
            +
                                <iframe src="https://youtube.com/embed/5gQNPK99CIk"></iframe>
         | 
| 66 | 
            +
                                </div>
         | 
| 67 | 
            +
                            </td>
         | 
| 68 | 
            +
                            <td width="50%">
         | 
| 69 | 
            +
                                <div class="video-container">
         | 
| 70 | 
            +
                                <iframe src="https://youtube.com/embed/AbwnTzG-BpA"></iframe>
         | 
| 71 | 
            +
                                </div>
         | 
| 72 | 
            +
                            </td>
         | 
| 73 | 
            +
                        </tr>
         | 
| 74 | 
            +
                    </tbody>
         | 
| 75 | 
            +
                </table> -->
         | 
| 76 | 
            +
             | 
| 77 | 
            +
                <p id="moviegen2" style="overflow: hidden;">
         | 
| 78 | 
            +
                    Example 2: Rhythmic splashing and lapping of water. 
         | 
| 79 | 
            +
                    <span style="float:right;"><a href="#index">Back to index</a></span> 
         | 
| 80 | 
            +
                </p> 
         | 
| 81 | 
            +
                <div class="row g-1">
         | 
| 82 | 
            +
                    <div class="col-sm-6">
         | 
| 83 | 
            +
                        <div class="video-header">Movie Gen Audio</div>
         | 
| 84 | 
            +
                        <div class="video-container">
         | 
| 85 | 
            +
                            <iframe src="https://youtube.com/embed/5gQNPK99CIk"></iframe>
         | 
| 86 | 
            +
                        </div> 
         | 
| 87 | 
            +
                    </div>
         | 
| 88 | 
            +
                    <div class="col-sm-6">
         | 
| 89 | 
            +
                        <div class="video-header">Ours</div>
         | 
| 90 | 
            +
                        <div class="video-container">
         | 
| 91 | 
            +
                            <iframe src="https://youtube.com/embed/AbwnTzG-BpA"></iframe>
         | 
| 92 | 
            +
                            </div> 
         | 
| 93 | 
            +
                    </div>
         | 
| 94 | 
            +
                </div>
         | 
| 95 | 
            +
                <br>
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                <p id="moviegen3" style="overflow: hidden;">
         | 
| 98 | 
            +
                    Example 3: Shovel scrapes against dry earth. 
         | 
| 99 | 
            +
                    <span style="float:right;"><a href="#index">Back to index</a></span> 
         | 
| 100 | 
            +
                </p> 
         | 
| 101 | 
            +
                <div class="row g-1">
         | 
| 102 | 
            +
                    <div class="col-sm-6">
         | 
| 103 | 
            +
                        <div class="video-header">Movie Gen Audio</div>
         | 
| 104 | 
            +
                        <div class="video-container">
         | 
| 105 | 
            +
                            <iframe src="https://youtube.com/embed/PUKGyEve7XQ"></iframe>
         | 
| 106 | 
            +
                        </div> 
         | 
| 107 | 
            +
                    </div>
         | 
| 108 | 
            +
                    <div class="col-sm-6">
         | 
| 109 | 
            +
                        <div class="video-header">Ours</div>
         | 
| 110 | 
            +
                        <div class="video-container">
         | 
| 111 | 
            +
                            <iframe src="https://youtube.com/embed/CNn7i8VNkdc"></iframe>
         | 
| 112 | 
            +
                        </div> 
         | 
| 113 | 
            +
                    </div>
         | 
| 114 | 
            +
                </div>
         | 
| 115 | 
            +
                <br>
         | 
| 116 | 
            +
                
         | 
| 117 | 
            +
             | 
| 118 | 
            +
                <p id="moviegen4" style="overflow: hidden;">
         | 
| 119 | 
            +
                    (Failure case) Example 4: Creamy sound of mashed potatoes being scooped. 
         | 
| 120 | 
            +
                    <span style="float:right;"><a href="#index">Back to index</a></span> 
         | 
| 121 | 
            +
                </p> 
         | 
| 122 | 
            +
                <div class="row g-1">
         | 
| 123 | 
            +
                    <div class="col-sm-6">
         | 
| 124 | 
            +
                        <div class="video-header">Movie Gen Audio</div>
         | 
| 125 | 
            +
                        <div class="video-container">
         | 
| 126 | 
            +
                            <iframe src="https://youtube.com/embed/PJv1zxR9JjQ"></iframe>
         | 
| 127 | 
            +
                        </div> 
         | 
| 128 | 
            +
                    </div>
         | 
| 129 | 
            +
                    <div class="col-sm-6">
         | 
| 130 | 
            +
                        <div class="video-header">Ours</div>
         | 
| 131 | 
            +
                        <div class="video-container">
         | 
| 132 | 
            +
                            <iframe src="https://youtube.com/embed/c3-LJ1lNsPQ"></iframe>
         | 
| 133 | 
            +
                        </div> 
         | 
| 134 | 
            +
                    </div>
         | 
| 135 | 
            +
                </div>
         | 
| 136 | 
            +
                <br>
         | 
| 137 | 
            +
             | 
| 138 | 
            +
                </div>
         | 
| 139 | 
            +
             | 
| 140 | 
            +
                <div id="hunyuan_sora_all">
         | 
| 141 | 
            +
             | 
| 142 | 
            +
                <h2 id="hunyuan" style="text-align: center;">Results on Videos Generated by Hunyuan</h2>
         | 
| 143 | 
            +
                <p style="overflow: hidden;">
         | 
| 144 | 
            +
                    <span style="float:right;"><a href="#index">Back to index</a></span> 
         | 
| 145 | 
            +
                </p> 
         | 
| 146 | 
            +
                <div class="row g-1">
         | 
| 147 | 
            +
                    <div class="col-sm-6">
         | 
| 148 | 
            +
                        <div class="video-header">Typing</div>
         | 
| 149 | 
            +
                        <div class="video-container">
         | 
| 150 | 
            +
                            <iframe src="https://youtube.com/embed/8ln_9hhH_nk"></iframe>
         | 
| 151 | 
            +
                        </div> 
         | 
| 152 | 
            +
                    </div>
         | 
| 153 | 
            +
                    <div class="col-sm-6">
         | 
| 154 | 
            +
                        <div class="video-header">Water is rushing down a stream and pouring</div>
         | 
| 155 | 
            +
                        <div class="video-container">
         | 
| 156 | 
            +
                            <iframe src="https://youtube.com/embed/5df1FZFQj30"></iframe>
         | 
| 157 | 
            +
                        </div> 
         | 
| 158 | 
            +
                    </div>
         | 
| 159 | 
            +
                </div>
         | 
| 160 | 
            +
                <div class="row g-1">
         | 
| 161 | 
            +
                    <div class="col-sm-6">
         | 
| 162 | 
            +
                        <div class="video-header">Waves on beach</div>
         | 
| 163 | 
            +
                        <div class="video-container">
         | 
| 164 | 
            +
                            <iframe src="https://youtube.com/embed/7wQ9D5WgpFc"></iframe>
         | 
| 165 | 
            +
                        </div> 
         | 
| 166 | 
            +
                    </div>
         | 
| 167 | 
            +
                    <div class="col-sm-6">
         | 
| 168 | 
            +
                        <div class="video-header">Water droplet</div>
         | 
| 169 | 
            +
                        <div class="video-container">
         | 
| 170 | 
            +
                            <iframe src="https://youtube.com/embed/q7M2nsalGjM"></iframe>
         | 
| 171 | 
            +
                        </div> 
         | 
| 172 | 
            +
                    </div>
         | 
| 173 | 
            +
                </div>
         | 
| 174 | 
            +
                <br>
         | 
| 175 | 
            +
             | 
| 176 | 
            +
                <h2 id="sora" style="text-align: center;">Results on Videos Generated by Sora</h2>
         | 
| 177 | 
            +
                <p style="overflow: hidden;">
         | 
| 178 | 
            +
                    <span style="float:right;"><a href="#index">Back to index</a></span> 
         | 
| 179 | 
            +
                </p> 
         | 
| 180 | 
            +
                <div class="row g-1">
         | 
| 181 | 
            +
                    <div class="col-sm-6">
         | 
| 182 | 
            +
                        <div class="video-header">Ships riding waves</div>
         | 
| 183 | 
            +
                        <div class="video-container">
         | 
| 184 | 
            +
                            <iframe src="https://youtube.com/embed/JbgQzHHytk8"></iframe>
         | 
| 185 | 
            +
                        </div> 
         | 
| 186 | 
            +
                    </div>
         | 
| 187 | 
            +
                    <div class="col-sm-6">
         | 
| 188 | 
            +
                        <div class="video-header">Train (no text prompt given)</div>
         | 
| 189 | 
            +
                        <div class="video-container">
         | 
| 190 | 
            +
                            <iframe src="https://youtube.com/embed/xOW7zrjpWC8"></iframe>
         | 
| 191 | 
            +
                        </div> 
         | 
| 192 | 
            +
                    </div>
         | 
| 193 | 
            +
                </div>
         | 
| 194 | 
            +
                <div class="row g-1">
         | 
| 195 | 
            +
                    <div class="col-sm-6">
         | 
| 196 | 
            +
                        <div class="video-header">Seashore (no text prompt given)</div>
         | 
| 197 | 
            +
                        <div class="video-container">
         | 
| 198 | 
            +
                            <iframe src="https://youtube.com/embed/fIuw5Y8ZZ9E"></iframe>
         | 
| 199 | 
            +
                        </div> 
         | 
| 200 | 
            +
                    </div>
         | 
| 201 | 
            +
                    <div class="col-sm-6">
         | 
| 202 | 
            +
                        <div class="video-header">Surfing (failure: unprompted music)</div>
         | 
| 203 | 
            +
                        <div class="video-container">
         | 
| 204 | 
            +
                            <iframe src="https://youtube.com/embed/UcSTk-v0M_s"></iframe>
         | 
| 205 | 
            +
                        </div> 
         | 
| 206 | 
            +
                    </div>
         | 
| 207 | 
            +
                </div>
         | 
| 208 | 
            +
                <br>
         | 
| 209 | 
            +
             | 
| 210 | 
            +
                <div id="mochi_ltx_all">
         | 
| 211 | 
            +
                <h2 id="mochi" style="text-align: center;">Results on Videos Generated by Mochi 1</h2>
         | 
| 212 | 
            +
                <p style="overflow: hidden;">
         | 
| 213 | 
            +
                    <span style="float:right;"><a href="#index">Back to index</a></span> 
         | 
| 214 | 
            +
                </p> 
         | 
| 215 | 
            +
                <div class="row g-1">
         | 
| 216 | 
            +
                    <div class="col-sm-6">
         | 
| 217 | 
            +
                        <div class="video-header">Magical fire and lightning (no text prompt given)</div>
         | 
| 218 | 
            +
                        <div class="video-container">
         | 
| 219 | 
            +
                            <iframe src="https://youtube.com/embed/tTlRZaSMNwY"></iframe>
         | 
| 220 | 
            +
                        </div> 
         | 
| 221 | 
            +
                    </div>
         | 
| 222 | 
            +
                    <div class="col-sm-6">
         | 
| 223 | 
            +
                        <div class="video-header">Storm (no text prompt given)</div>
         | 
| 224 | 
            +
                        <div class="video-container">
         | 
| 225 | 
            +
                            <iframe src="https://youtube.com/embed/4hrZTMJUy3w"></iframe>
         | 
| 226 | 
            +
                        </div> 
         | 
| 227 | 
            +
                    </div>
         | 
| 228 | 
            +
                </div>
         | 
| 229 | 
            +
                <br>
         | 
| 230 | 
            +
             | 
| 231 | 
            +
                <h2 id="ltx" style="text-align: center;">Results on Videos Generated by LTX-Video</h2>
         | 
| 232 | 
            +
                <p style="overflow: hidden;">
         | 
| 233 | 
            +
                    <span style="float:right;"><a href="#index">Back to index</a></span> 
         | 
| 234 | 
            +
                </p> 
         | 
| 235 | 
            +
                <div class="row g-1">
         | 
| 236 | 
            +
                    <div class="col-sm-6">
         | 
| 237 | 
            +
                        <div class="video-header">Firewood burning and cracking</div>
         | 
| 238 | 
            +
                        <div class="video-container">
         | 
| 239 | 
            +
                            <iframe src="https://youtube.com/embed/P7_DDpgev0g"></iframe>
         | 
| 240 | 
            +
                        </div> 
         | 
| 241 | 
            +
                    </div>
         | 
| 242 | 
            +
                    <div class="col-sm-6">
         | 
| 243 | 
            +
                        <div class="video-header">Waterfall, water splashing</div>
         | 
| 244 | 
            +
                        <div class="video-container">
         | 
| 245 | 
            +
                            <iframe src="https://youtube.com/embed/4MvjceYnIO0"></iframe>
         | 
| 246 | 
            +
                        </div> 
         | 
| 247 | 
            +
                    </div>
         | 
| 248 | 
            +
                </div>
         | 
| 249 | 
            +
                <br>
         | 
| 250 | 
            +
             | 
| 251 | 
            +
                </div>
         | 
| 252 | 
            +
             | 
| 253 | 
            +
            </body>
         | 
| 254 | 
            +
            </html>
         | 
    	
        docs/video_main.html
    ADDED
    
    | @@ -0,0 +1,98 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            <!DOCTYPE html>
         | 
| 2 | 
            +
            <html lang="en">
         | 
| 3 | 
            +
            <head>
         | 
| 4 | 
            +
                <!-- Google tag (gtag.js) -->
         | 
| 5 | 
            +
                <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
         | 
| 6 | 
            +
                <script>
         | 
| 7 | 
            +
                window.dataLayer = window.dataLayer || [];
         | 
| 8 | 
            +
                function gtag(){dataLayer.push(arguments);}
         | 
| 9 | 
            +
                gtag('js', new Date());
         | 
| 10 | 
            +
                gtag('config', 'G-0JKBJ3WRJZ');
         | 
| 11 | 
            +
                </script>
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
         | 
| 14 | 
            +
                <meta charset="UTF-8">
         | 
| 15 | 
            +
                <title>MMAudio</title>
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                <link rel="icon" type="image/png" href="images/icon.png">
         | 
| 18 | 
            +
             | 
| 19 | 
            +
                <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">
         | 
| 20 | 
            +
                <!-- CSS only -->
         | 
| 21 | 
            +
                <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
         | 
| 22 | 
            +
                    integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
         | 
| 23 | 
            +
                <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"></script>
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                <link rel="stylesheet" href="style_videos.css">
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                <script type="text/javascript">
         | 
| 28 | 
            +
                    $(document).ready(function(){
         | 
| 29 | 
            +
                        $("#content").load("video_gen.html #moviegen_all");
         | 
| 30 | 
            +
                        $("#load_moveigen").click(function(){
         | 
| 31 | 
            +
                            $("#content").load("video_gen.html #moviegen_all");
         | 
| 32 | 
            +
                        });
         | 
| 33 | 
            +
                        $("#load_hunyuan_sora").click(function(){
         | 
| 34 | 
            +
                            $("#content").load("video_gen.html #hunyuan_sora_all");
         | 
| 35 | 
            +
                        });
         | 
| 36 | 
            +
                        $("#load_mochi_ltx").click(function(){
         | 
| 37 | 
            +
                            $("#content").load("video_gen.html #mochi_ltx_all");
         | 
| 38 | 
            +
                        });
         | 
| 39 | 
            +
                        $("#load_vgg1").click(function(){
         | 
| 40 | 
            +
                            $("#content").load("video_vgg.html #vgg1");
         | 
| 41 | 
            +
                        });
         | 
| 42 | 
            +
                        $("#load_vgg2").click(function(){
         | 
| 43 | 
            +
                            $("#content").load("video_vgg.html #vgg2");
         | 
| 44 | 
            +
                        });
         | 
| 45 | 
            +
                        $("#load_vgg3").click(function(){
         | 
| 46 | 
            +
                            $("#content").load("video_vgg.html #vgg3");
         | 
| 47 | 
            +
                        });
         | 
| 48 | 
            +
                        $("#load_vgg4").click(function(){
         | 
| 49 | 
            +
                            $("#content").load("video_vgg.html #vgg4");
         | 
| 50 | 
            +
                        });
         | 
| 51 | 
            +
                        $("#load_vgg5").click(function(){
         | 
| 52 | 
            +
                            $("#content").load("video_vgg.html #vgg5");
         | 
| 53 | 
            +
                        });
         | 
| 54 | 
            +
                        $("#load_vgg6").click(function(){
         | 
| 55 | 
            +
                            $("#content").load("video_vgg.html #vgg6");
         | 
| 56 | 
            +
                        });
         | 
| 57 | 
            +
                        $("#load_vgg_extra").click(function(){
         | 
| 58 | 
            +
                            $("#content").load("video_vgg.html #vgg_extra");
         | 
| 59 | 
            +
                        });
         | 
| 60 | 
            +
                    });
         | 
| 61 | 
            +
                </script>
         | 
| 62 | 
            +
            </head>
         | 
| 63 | 
            +
            <body>
         | 
| 64 | 
            +
                <h1 id="index" style="text-align: center;">Index</h1>
         | 
| 65 | 
            +
                <p><b>(Click on the links to load the corresponding videos)</b> <span style="float:right;"><a href="index.html">Back to project page</a></span></p>
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                <ol>
         | 
| 68 | 
            +
                    <li>
         | 
| 69 | 
            +
                        <a href="#" id="load_moveigen">Comparisons with Movie Gen Audio on Videos Generated by MovieGen</a>
         | 
| 70 | 
            +
                    </li>
         | 
| 71 | 
            +
                    <li>
         | 
| 72 | 
            +
                        <a href="#" id="load_hunyuan_sora">Results on Videos Generated by Hunyuan and Sora</a>
         | 
| 73 | 
            +
                    </li>
         | 
| 74 | 
            +
                    <li>
         | 
| 75 | 
            +
                        <a href="#" id="load_mochi_ltx">Results on Videos Generated by Mochi 1 and LTX-Video</a>
         | 
| 76 | 
            +
                    </li>
         | 
| 77 | 
            +
                    <li>
         | 
| 78 | 
            +
                        On VGGSound
         | 
| 79 | 
            +
                        <ol>
         | 
| 80 | 
            +
                            <li><a id='load_vgg1' href="#">Example 1: Wolf howling</a></li>
         | 
| 81 | 
            +
                            <li><a id='load_vgg2' href="#">Example 2: Striking a golf ball</a></li>
         | 
| 82 | 
            +
                            <li><a id='load_vgg3' href="#">Example 3: Hitting a drum</a></li>
         | 
| 83 | 
            +
                            <li><a id='load_vgg4' href="#">Example 4: Dog barking</a></li>
         | 
| 84 | 
            +
                            <li><a id='load_vgg5' href="#">Example 5: Playing a string instrument</a></li>
         | 
| 85 | 
            +
                            <li><a id='load_vgg6' href="#">Example 6: A group of people playing tambourines</a></li>
         | 
| 86 | 
            +
                            <li><a id='load_vgg_extra' href="#">Extra results &amp; failure cases</a></li>
         | 
| 87 | 
            +
                        </ol>
         | 
| 88 | 
            +
                    </li>
         | 
| 89 | 
            +
                </ol>
         | 
| 90 | 
            +
             | 
| 91 | 
            +
                <div id="content" class="container-fluid">
         | 
| 92 | 
            +
             | 
| 93 | 
            +
                </div>
         | 
| 94 | 
            +
                <br>
         | 
| 95 | 
            +
                <br>
         | 
| 96 | 
            +
             | 
| 97 | 
            +
            </body>
         | 
| 98 | 
            +
            </html>
         | 
    	
        docs/video_vgg.html
    ADDED
    
    | @@ -0,0 +1,452 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            <!DOCTYPE html>
         | 
| 2 | 
            +
            <html lang="en">
         | 
| 3 | 
            +
            <head>
         | 
| 4 | 
            +
                <!-- Google tag (gtag.js) -->
         | 
| 5 | 
            +
                <script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
         | 
| 6 | 
            +
                <script>
         | 
| 7 | 
            +
                window.dataLayer = window.dataLayer || [];
         | 
| 8 | 
            +
                function gtag(){dataLayer.push(arguments);}
         | 
| 9 | 
            +
                gtag('js', new Date());
         | 
| 10 | 
            +
                gtag('config', 'G-0JKBJ3WRJZ');
         | 
| 11 | 
            +
                </script>
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
         | 
| 14 | 
            +
                <meta charset="UTF-8">
         | 
| 15 | 
            +
                <title>MMAudio</title>
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                <meta name="viewport" content="width=device-width, initial-scale=1">
         | 
| 18 | 
            +
                <!-- CSS only -->
         | 
| 19 | 
            +
                <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
         | 
| 20 | 
            +
                    integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
         | 
| 21 | 
            +
                <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                <link rel="stylesheet" href="style_videos.css">
         | 
| 24 | 
            +
            </head>
         | 
| 25 | 
            +
            <body>
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                <div id="vgg1">
         | 
| 28 | 
            +
                <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
         | 
| 29 | 
            +
                <p style="overflow: hidden;">
         | 
| 30 | 
            +
                    Example 1: Wolf howling. 
         | 
| 31 | 
            +
                    <span style="float:right;"><a href="#index">Back to index</a></span> 
         | 
| 32 | 
            +
                </p> 
         | 
| 33 | 
            +
                    <div class="row g-1">
         | 
| 34 | 
            +
                        <div class="col-sm-3">
         | 
| 35 | 
            +
                            <div class="video-header">Ground-truth</div>
         | 
| 36 | 
            +
                            <div class="video-container">
         | 
| 37 | 
            +
                                <iframe src="https://youtube.com/embed/9J_V74gqMUA"></iframe>
         | 
| 38 | 
            +
                            </div> 
         | 
| 39 | 
            +
                        </div>
         | 
| 40 | 
            +
                        <div class="col-sm-3">
         | 
| 41 | 
            +
                            <div class="video-header">Ours</div>
         | 
| 42 | 
            +
                            <div class="video-container">
         | 
| 43 | 
            +
                                <iframe src="https://youtube.com/embed/P6O8IpjErPc"></iframe>
         | 
| 44 | 
            +
                                </div> 
         | 
| 45 | 
            +
                        </div>
         | 
| 46 | 
            +
                        <div class="col-sm-3">
         | 
| 47 | 
            +
                            <div class="video-header">V2A-Mapper</div>
         | 
| 48 | 
            +
                            <div class="video-container">
         | 
| 49 | 
            +
                                <iframe src="https://youtube.com/embed/w-5eyqepvTk"></iframe>
         | 
| 50 | 
            +
                                </div> 
         | 
| 51 | 
            +
                        </div>
         | 
| 52 | 
            +
                        <div class="col-sm-3">
         | 
| 53 | 
            +
                            <div class="video-header">FoleyCrafter</div>
         | 
| 54 | 
            +
                            <div class="video-container">
         | 
| 55 | 
            +
                                <iframe src="https://youtube.com/embed/VOLfoZlRkzo"></iframe>
         | 
| 56 | 
            +
                                </div> 
         | 
| 57 | 
            +
                        </div>
         | 
| 58 | 
            +
                    </div>
         | 
| 59 | 
            +
                    <div class="row g-1">
         | 
| 60 | 
            +
                        <div class="col-sm-3">
         | 
| 61 | 
            +
                            <div class="video-header">Frieren</div>
         | 
| 62 | 
            +
                            <div class="video-container">
         | 
| 63 | 
            +
                                <iframe src="https://youtube.com/embed/49owKyA5Pa8"></iframe>
         | 
| 64 | 
            +
                            </div> 
         | 
| 65 | 
            +
                        </div>
         | 
| 66 | 
            +
                        <div class="col-sm-3">
         | 
| 67 | 
            +
                            <div class="video-header">VATT</div>
         | 
| 68 | 
            +
                            <div class="video-container">
         | 
| 69 | 
            +
                                <iframe src="https://youtube.com/embed/QVtrFgbeGDM"></iframe>
         | 
| 70 | 
            +
                                </div> 
         | 
| 71 | 
            +
                        </div>
         | 
| 72 | 
            +
                        <div class="col-sm-3">
         | 
| 73 | 
            +
                            <div class="video-header">V-AURA</div>
         | 
| 74 | 
            +
                            <div class="video-container">
         | 
| 75 | 
            +
                                <iframe src="https://youtube.com/embed/8r0uEfSNjvI"></iframe>
         | 
| 76 | 
            +
                                </div> 
         | 
| 77 | 
            +
                        </div>
         | 
| 78 | 
            +
                        <div class="col-sm-3">
         | 
| 79 | 
            +
                            <div class="video-header">Seeing and Hearing</div>
         | 
| 80 | 
            +
                            <div class="video-container">
         | 
| 81 | 
            +
                                <iframe src="https://youtube.com/embed/bn-sLg2qulk"></iframe>
         | 
| 82 | 
            +
                                </div> 
         | 
| 83 | 
            +
                        </div>
         | 
| 84 | 
            +
                    </div>
         | 
| 85 | 
            +
                </div>
         | 
| 86 | 
            +
             | 
| 87 | 
            +
                <div id="vgg2">
         | 
| 88 | 
            +
                    <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
         | 
| 89 | 
            +
                    <p style="overflow: hidden;">
         | 
| 90 | 
            +
                        Example 2: Striking a golf ball.
         | 
| 91 | 
            +
                        <span style="float:right;"><a href="#index">Back to index</a></span>
         | 
| 92 | 
            +
                    </p>
         | 
| 93 | 
            +
             | 
| 94 | 
            +
                    <div class="row g-1">
         | 
| 95 | 
            +
                        <div class="col-sm-3">
         | 
| 96 | 
            +
                            <div class="video-header">Ground-truth</div>
         | 
| 97 | 
            +
                            <div class="video-container">
         | 
| 98 | 
            +
                                <iframe src="https://youtube.com/embed/1hwSu42kkho"></iframe>
         | 
| 99 | 
            +
                            </div>
         | 
| 100 | 
            +
                        </div>
         | 
| 101 | 
            +
                        <div class="col-sm-3">
         | 
| 102 | 
            +
                            <div class="video-header">Ours</div>
         | 
| 103 | 
            +
                            <div class="video-container">
         | 
| 104 | 
            +
                                <iframe src="https://youtube.com/embed/kZibDoDCNxI"></iframe>
         | 
| 105 | 
            +
                            </div>
         | 
| 106 | 
            +
                        </div>
         | 
| 107 | 
            +
                        <div class="col-sm-3">
         | 
| 108 | 
            +
                            <div class="video-header">V2A-Mapper</div>
         | 
| 109 | 
            +
                            <div class="video-container">
         | 
| 110 | 
            +
                                <iframe src="https://youtube.com/embed/jgKfLBLhh7Y"></iframe>
         | 
| 111 | 
            +
                            </div>
         | 
| 112 | 
            +
                        </div>
         | 
| 113 | 
            +
                        <div class="col-sm-3">
         | 
| 114 | 
            +
                            <div class="video-header">FoleyCrafter</div>
         | 
| 115 | 
            +
                            <div class="video-container">
         | 
| 116 | 
            +
                                <iframe src="https://youtube.com/embed/Lfsx8mOPcJo"></iframe>
         | 
| 117 | 
            +
                            </div>
         | 
| 118 | 
            +
                        </div>
         | 
| 119 | 
            +
                    </div>
         | 
| 120 | 
            +
                    <div class="row g-1">
         | 
| 121 | 
            +
                        <div class="col-sm-3">
         | 
| 122 | 
            +
                            <div class="video-header">Frieren</div>
         | 
| 123 | 
            +
                            <div class="video-container">
         | 
| 124 | 
            +
                                <iframe src="https://youtube.com/embed/tz-LpbB0MBc"></iframe>
         | 
| 125 | 
            +
                            </div>
         | 
| 126 | 
            +
                        </div>
         | 
| 127 | 
            +
                        <div class="col-sm-3">
         | 
| 128 | 
            +
                            <div class="video-header">VATT</div>
         | 
| 129 | 
            +
                            <div class="video-container">
         | 
| 130 | 
            +
                                <iframe src="https://youtube.com/embed/RTDUHMi08n4"></iframe>
         | 
| 131 | 
            +
                            </div>
         | 
| 132 | 
            +
                        </div>
         | 
| 133 | 
            +
                        <div class="col-sm-3">
         | 
| 134 | 
            +
                            <div class="video-header">V-AURA</div>
         | 
| 135 | 
            +
                            <div class="video-container">
         | 
| 136 | 
            +
                                <iframe src="https://youtube.com/embed/N-3TDOsPnZQ"></iframe>
         | 
| 137 | 
            +
                            </div>
         | 
| 138 | 
            +
                        </div>
         | 
| 139 | 
            +
                        <div class="col-sm-3">
         | 
| 140 | 
            +
                            <div class="video-header">Seeing and Hearing</div>
         | 
| 141 | 
            +
                            <div class="video-container">
         | 
| 142 | 
            +
                                <iframe src="https://youtube.com/embed/QnsHnLn4gB0"></iframe>
         | 
| 143 | 
            +
                            </div>
         | 
| 144 | 
            +
                        </div>
         | 
| 145 | 
            +
                    </div>
         | 
| 146 | 
            +
                </div>
         | 
| 147 | 
            +
             | 
| 148 | 
            +
                <div id="vgg3">
         | 
| 149 | 
            +
                    <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
         | 
| 150 | 
            +
                    <p style="overflow: hidden;">
         | 
| 151 | 
            +
                        Example 3: Hitting a drum. 
         | 
| 152 | 
            +
                        <span style="float:right;"><a href="#index">Back to index</a></span> 
         | 
| 153 | 
            +
                    </p> 
         | 
| 154 | 
            +
             | 
| 155 | 
            +
                    <div class="row g-1">
         | 
| 156 | 
            +
                        <div class="col-sm-3">
         | 
| 157 | 
            +
                            <div class="video-header">Ground-truth</div>
         | 
| 158 | 
            +
                            <div class="video-container">
         | 
| 159 | 
            +
                                <iframe src="https://youtube.com/embed/0oeIwq77w0Q"></iframe>
         | 
| 160 | 
            +
                            </div> 
         | 
| 161 | 
            +
                        </div>
         | 
| 162 | 
            +
                        <div class="col-sm-3">
         | 
| 163 | 
            +
                            <div class="video-header">Ours</div>
         | 
| 164 | 
            +
                            <div class="video-container">
         | 
| 165 | 
            +
                                <iframe src="https://youtube.com/embed/-UtPV9ohuIM"></iframe>
         | 
| 166 | 
            +
                            </div> 
         | 
| 167 | 
            +
                        </div>
         | 
| 168 | 
            +
                        <div class="col-sm-3">
         | 
| 169 | 
            +
                            <div class="video-header">V2A-Mapper</div>
         | 
| 170 | 
            +
                            <div class="video-container">
         | 
| 171 | 
            +
                                <iframe src="https://youtube.com/embed/9yivkgN-zwc"></iframe>
         | 
| 172 | 
            +
                            </div> 
         | 
| 173 | 
            +
                        </div>
         | 
| 174 | 
            +
                        <div class="col-sm-3">
         | 
| 175 | 
            +
                            <div class="video-header">FoleyCrafter</div>
         | 
| 176 | 
            +
                            <div class="video-container">
         | 
| 177 | 
            +
                                <iframe src="https://youtube.com/embed/kkCsXPOlBvY"></iframe>
         | 
| 178 | 
            +
                            </div> 
         | 
| 179 | 
            +
                        </div>
         | 
| 180 | 
            +
                    </div>
         | 
| 181 | 
            +
                    <div class="row g-1">
         | 
| 182 | 
            +
                        <div class="col-sm-3">
         | 
| 183 | 
            +
                            <div class="video-header">Frieren</div>
         | 
| 184 | 
            +
                            <div class="video-container">
         | 
| 185 | 
            +
                                <iframe src="https://youtube.com/embed/MbNKsVsuvig"></iframe>
         | 
| 186 | 
            +
                            </div> 
         | 
| 187 | 
            +
                        </div>
         | 
| 188 | 
            +
                        <div class="col-sm-3">
         | 
| 189 | 
            +
                            <div class="video-header">VATT</div>
         | 
| 190 | 
            +
                            <div class="video-container">
         | 
| 191 | 
            +
                                <iframe src="https://youtube.com/embed/2yYviBjrpBw"></iframe>
         | 
| 192 | 
            +
                            </div> 
         | 
| 193 | 
            +
                        </div>
         | 
| 194 | 
            +
                        <div class="col-sm-3">
         | 
| 195 | 
            +
                            <div class="video-header">V-AURA</div>
         | 
| 196 | 
            +
                            <div class="video-container">
         | 
| 197 | 
            +
                                <iframe src="https://youtube.com/embed/9yivkgN-zwc"></iframe>
         | 
| 198 | 
            +
                            </div> 
         | 
| 199 | 
            +
                        </div>
         | 
| 200 | 
            +
                        <div class="col-sm-3">
         | 
| 201 | 
            +
                            <div class="video-header">Seeing and Hearing</div>
         | 
| 202 | 
            +
                            <div class="video-container">
         | 
| 203 | 
            +
                                <iframe src="https://youtube.com/embed/6dnyQt4Fuhs"></iframe>
         | 
| 204 | 
            +
                            </div> 
         | 
| 205 | 
            +
                        </div>
         | 
| 206 | 
            +
                    </div>
         | 
| 207 | 
            +
                </div>
         | 
| 208 | 
            +
                <!-- stray closing div removed: #vgg3 is already closed on the previous line -->
         | 
| 209 | 
            +
             | 
| 210 | 
            +
                <div id="vgg4">
         | 
| 211 | 
            +
                    <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
         | 
| 212 | 
            +
                    <p style="overflow: hidden;">
         | 
| 213 | 
            +
                        Example 4: Dog barking. 
         | 
| 214 | 
            +
                        <span style="float:right;"><a href="#index">Back to index</a></span> 
         | 
| 215 | 
            +
                    </p> 
         | 
| 216 | 
            +
             | 
| 217 | 
            +
                    <div class="row g-1">
         | 
| 218 | 
            +
                        <div class="col-sm-3">
         | 
| 219 | 
            +
                            <div class="video-header">Ground-truth</div>
         | 
| 220 | 
            +
                            <div class="video-container">
         | 
| 221 | 
            +
                                <iframe src="https://youtube.com/embed/ckaqvTyMYAw"></iframe>
         | 
| 222 | 
            +
                            </div> 
         | 
| 223 | 
            +
                        </div>
         | 
| 224 | 
            +
                        <div class="col-sm-3">
         | 
| 225 | 
            +
                            <div class="video-header">Ours</div>
         | 
| 226 | 
            +
                            <div class="video-container">
         | 
| 227 | 
            +
                                <iframe src="https://youtube.com/embed/_aRndFZzZ-I"></iframe>
         | 
| 228 | 
            +
                            </div> 
         | 
| 229 | 
            +
                        </div>
         | 
| 230 | 
            +
                        <div class="col-sm-3">
         | 
| 231 | 
            +
                            <div class="video-header">V2A-Mapper</div>
         | 
| 232 | 
            +
                            <div class="video-container">
         | 
| 233 | 
            +
                                <iframe src="https://youtube.com/embed/mNCISP3LBl0"></iframe>
         | 
| 234 | 
            +
                            </div> 
         | 
| 235 | 
            +
                        </div>
         | 
| 236 | 
            +
                        <div class="col-sm-3">
         | 
| 237 | 
            +
                            <div class="video-header">FoleyCrafter</div>
         | 
| 238 | 
            +
                            <div class="video-container">
         | 
| 239 | 
            +
                                <iframe src="https://youtube.com/embed/phZBQ3L7foE"></iframe>
         | 
| 240 | 
            +
                            </div> 
         | 
| 241 | 
            +
                        </div>
         | 
| 242 | 
            +
                    </div>
         | 
| 243 | 
            +
                    <div class="row g-1">
         | 
| 244 | 
            +
                        <div class="col-sm-3">
         | 
| 245 | 
            +
                            <div class="video-header">Frieren</div>
         | 
| 246 | 
            +
                            <div class="video-container">
         | 
| 247 | 
            +
                                <iframe src="https://youtube.com/embed/Sb5Mg1-ORao"></iframe>
         | 
| 248 | 
            +
                            </div> 
         | 
| 249 | 
            +
                        </div>
         | 
| 250 | 
            +
                        <div class="col-sm-3">
         | 
| 251 | 
            +
                            <div class="video-header">VATT</div>
         | 
| 252 | 
            +
                            <div class="video-container">
         | 
| 253 | 
            +
                                <iframe src="https://youtube.com/embed/eHmAGOmtDDg"></iframe>
         | 
| 254 | 
            +
                            </div> 
         | 
| 255 | 
            +
                        </div>
         | 
| 256 | 
            +
                        <div class="col-sm-3">
         | 
| 257 | 
            +
                            <div class="video-header">V-AURA</div>
         | 
| 258 | 
            +
                            <div class="video-container">
         | 
| 259 | 
            +
                                <iframe src="https://youtube.com/embed/NEGa3krBrm0"></iframe>
         | 
| 260 | 
            +
                            </div> 
         | 
| 261 | 
            +
                        </div>
         | 
| 262 | 
            +
                        <div class="col-sm-3">
         | 
| 263 | 
            +
                            <div class="video-header">Seeing and Hearing</div>
         | 
| 264 | 
            +
                            <div class="video-container">
         | 
| 265 | 
            +
                                <iframe src="https://youtube.com/embed/aO0EAXlwE7A"></iframe>
         | 
| 266 | 
            +
                            </div> 
         | 
| 267 | 
            +
                        </div>
         | 
| 268 | 
            +
                    </div>
         | 
| 269 | 
            +
                </div>
         | 
| 270 | 
            +
                
         | 
| 271 | 
            +
                <div id="vgg5">
         | 
| 272 | 
            +
                    <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
         | 
| 273 | 
            +
                    <p style="overflow: hidden;">
         | 
| 274 | 
            +
                        Example 5: Playing a string instrument.
         | 
| 275 | 
            +
                        <span style="float:right;"><a href="#index">Back to index</a></span>
         | 
| 276 | 
            +
                    </p>
         | 
| 277 | 
            +
             | 
| 278 | 
            +
                    <div class="row g-1">
         | 
| 279 | 
            +
                        <div class="col-sm-3">
         | 
| 280 | 
            +
                            <div class="video-header">Ground-truth</div>
         | 
| 281 | 
            +
                            <div class="video-container">
         | 
| 282 | 
            +
                                <iframe src="https://youtube.com/embed/KP1QhWauIOc"></iframe>
         | 
| 283 | 
            +
                            </div>
         | 
| 284 | 
            +
                        </div>
         | 
| 285 | 
            +
                        <div class="col-sm-3">
         | 
| 286 | 
            +
                            <div class="video-header">Ours</div>
         | 
| 287 | 
            +
                            <div class="video-container">
         | 
| 288 | 
            +
                                <iframe src="https://youtube.com/embed/ovaJhWSquYE"></iframe>
         | 
| 289 | 
            +
                            </div>
         | 
| 290 | 
            +
                        </div>
         | 
| 291 | 
            +
                        <div class="col-sm-3">
         | 
| 292 | 
            +
                            <div class="video-header">V2A-Mapper</div>
         | 
| 293 | 
            +
                            <div class="video-container">
         | 
| 294 | 
            +
                                <iframe src="https://youtube.com/embed/N723FS9lcy8"></iframe>
         | 
| 295 | 
            +
                            </div>
         | 
| 296 | 
            +
                        </div>
         | 
| 297 | 
            +
                        <div class="col-sm-3">
         | 
| 298 | 
            +
                            <div class="video-header">FoleyCrafter</div>
         | 
| 299 | 
            +
                            <div class="video-container">
         | 
| 300 | 
            +
                                <iframe src="https://youtube.com/embed/t0N4ZAAXo58"></iframe>
         | 
| 301 | 
            +
                            </div>
         | 
| 302 | 
            +
                        </div>
         | 
| 303 | 
            +
                    </div>
         | 
| 304 | 
            +
                    <div class="row g-1">
         | 
| 305 | 
            +
                        <div class="col-sm-3">
         | 
| 306 | 
            +
                            <div class="video-header">Frieren</div>
         | 
| 307 | 
            +
                            <div class="video-container">
         | 
| 308 | 
            +
                                <iframe src="https://youtube.com/embed/8YSRs03QNNA"></iframe>
         | 
| 309 | 
            +
                            </div>
         | 
| 310 | 
            +
                        </div>
         | 
| 311 | 
            +
                        <div class="col-sm-3">
         | 
| 312 | 
            +
                            <div class="video-header">VATT</div>
         | 
| 313 | 
            +
                            <div class="video-container">
         | 
| 314 | 
            +
                                <iframe src="https://youtube.com/embed/vOpMz55J1kY"></iframe>
         | 
| 315 | 
            +
                            </div>
         | 
| 316 | 
            +
                        </div>
         | 
| 317 | 
            +
                        <div class="col-sm-3">
         | 
| 318 | 
            +
                            <div class="video-header">V-AURA</div>
         | 
| 319 | 
            +
                            <div class="video-container">
         | 
| 320 | 
            +
                                <iframe src="https://youtube.com/embed/9JHC75vr9h0"></iframe>
         | 
| 321 | 
            +
                            </div>
         | 
| 322 | 
            +
                        </div>
         | 
| 323 | 
            +
                        <div class="col-sm-3">
         | 
| 324 | 
            +
                            <div class="video-header">Seeing and Hearing</div>
         | 
| 325 | 
            +
                            <div class="video-container">
         | 
| 326 | 
            +
                                <iframe src="https://youtube.com/embed/9w0JckNzXmY"></iframe>
         | 
| 327 | 
            +
                            </div>
         | 
| 328 | 
            +
                        </div>
         | 
| 329 | 
            +
                    </div>
         | 
| 330 | 
            +
                </div>
         | 
| 331 | 
            +
                
         | 
| 332 | 
            +
                <div id="vgg6">
         | 
| 333 | 
            +
                    <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
         | 
| 334 | 
            +
                    <p style="overflow: hidden;">
         | 
| 335 | 
            +
                        Example 6: A group of people playing tambourines.
         | 
| 336 | 
            +
                        <span style="float:right;"><a href="#index">Back to index</a></span>
         | 
| 337 | 
            +
                    </p>
         | 
| 338 | 
            +
             | 
| 339 | 
            +
                    <div class="row g-1">
         | 
| 340 | 
            +
                        <div class="col-sm-3">
         | 
| 341 | 
            +
                            <div class="video-header">Ground-truth</div>
         | 
| 342 | 
            +
                            <div class="video-container">
         | 
| 343 | 
            +
                                <iframe src="https://youtube.com/embed/mx6JLxzUkRc"></iframe>
         | 
| 344 | 
            +
                            </div>
         | 
| 345 | 
            +
                        </div>
         | 
| 346 | 
            +
                        <div class="col-sm-3">
         | 
| 347 | 
            +
                            <div class="video-header">Ours</div>
         | 
| 348 | 
            +
                            <div class="video-container">
         | 
| 349 | 
            +
                                <iframe src="https://youtube.com/embed/oLirHhP9Su8"></iframe>
         | 
| 350 | 
            +
                            </div>
         | 
| 351 | 
            +
                        </div>
         | 
| 352 | 
            +
                        <div class="col-sm-3">
         | 
| 353 | 
            +
                            <div class="video-header">V2A-Mapper</div>
         | 
| 354 | 
            +
                            <div class="video-container">
         | 
| 355 | 
            +
                                <iframe src="https://youtube.com/embed/HkLkHMqptv0"></iframe>
         | 
| 356 | 
            +
                            </div>
         | 
| 357 | 
            +
                        </div>
         | 
| 358 | 
            +
                        <div class="col-sm-3">
         | 
| 359 | 
            +
                            <div class="video-header">FoleyCrafter</div>
         | 
| 360 | 
            +
                            <div class="video-container">
         | 
| 361 | 
            +
                                <iframe src="https://youtube.com/embed/rpHiiODjmNU"></iframe>
         | 
| 362 | 
            +
                            </div>
         | 
| 363 | 
            +
                        </div>
         | 
| 364 | 
            +
                    </div>
         | 
| 365 | 
            +
                    <div class="row g-1">
         | 
| 366 | 
            +
                        <div class="col-sm-3">
         | 
| 367 | 
            +
                            <div class="video-header">Frieren</div>
         | 
| 368 | 
            +
                            <div class="video-container">
         | 
| 369 | 
            +
                                <iframe src="https://youtube.com/embed/1mVD3fJ0LpM"></iframe>
         | 
| 370 | 
            +
                            </div>
         | 
| 371 | 
            +
                        </div>
         | 
| 372 | 
            +
                        <div class="col-sm-3">
         | 
| 373 | 
            +
                            <div class="video-header">VATT</div>
         | 
| 374 | 
            +
                            <div class="video-container">
         | 
| 375 | 
            +
                                <iframe src="https://youtube.com/embed/yjVFnJiEJlw"></iframe>
         | 
| 376 | 
            +
                            </div>
         | 
| 377 | 
            +
                        </div>
         | 
| 378 | 
            +
                        <div class="col-sm-3">
         | 
| 379 | 
            +
                            <div class="video-header">V-AURA</div>
         | 
| 380 | 
            +
                            <div class="video-container">
         | 
| 381 | 
            +
                                <iframe src="https://youtube.com/embed/neVeMSWtRkU"></iframe>
         | 
| 382 | 
            +
                            </div>
         | 
| 383 | 
            +
                        </div>
         | 
| 384 | 
            +
                        <div class="col-sm-3">
         | 
| 385 | 
            +
                            <div class="video-header">Seeing and Hearing</div>
         | 
| 386 | 
            +
                            <div class="video-container">
         | 
| 387 | 
            +
                                <iframe src="https://youtube.com/embed/EUE7YwyVWz8"></iframe>
         | 
| 388 | 
            +
                            </div>
         | 
| 389 | 
            +
                        </div>
         | 
| 390 | 
            +
                    </div>
         | 
| 391 | 
            +
                </div>
         | 
| 392 | 
            +
                
         | 
| 393 | 
            +
                <div id="vgg_extra">
         | 
| 394 | 
            +
                    <h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
         | 
| 395 | 
            +
                    <p style="overflow: hidden;">
         | 
| 396 | 
            +
                        <span style="float:right;"><a href="#index">Back to index</a></span>
         | 
| 397 | 
            +
                    </p>
         | 
| 398 | 
            +
             | 
| 399 | 
            +
                    <div class="row g-1">
         | 
| 400 | 
            +
                        <div class="col-sm-3">
         | 
| 401 | 
            +
                        <div class="video-header">Moving train</div>
         | 
| 402 | 
            +
                        <div class="video-container">
         | 
| 403 | 
            +
                            <iframe src="https://youtube.com/embed/Ta6H45rBzJc"></iframe>
         | 
| 404 | 
            +
                        </div>
         | 
| 405 | 
            +
                        </div>
         | 
| 406 | 
            +
                        <div class="col-sm-3">
         | 
| 407 | 
            +
                        <div class="video-header">Water splashing</div>
         | 
| 408 | 
            +
                        <div class="video-container">
         | 
| 409 | 
            +
                            <iframe src="https://youtube.com/embed/hl6AtgHXpb4"></iframe>
         | 
| 410 | 
            +
                        </div>
         | 
| 411 | 
            +
                        </div>
         | 
| 412 | 
            +
                        <div class="col-sm-3">
         | 
| 413 | 
            +
                        <div class="video-header">Skateboarding</div>
         | 
| 414 | 
            +
                        <div class="video-container">
         | 
| 415 | 
            +
                            <iframe src="https://youtube.com/embed/n4sCNi_9buI"></iframe>
         | 
| 416 | 
            +
                        </div>
         | 
| 417 | 
            +
                        </div>
         | 
| 418 | 
            +
                        <div class="col-sm-3">
         | 
| 419 | 
            +
                        <div class="video-header">Synchronized clapping</div>
         | 
| 420 | 
            +
                        <div class="video-container">
         | 
| 421 | 
            +
                            <iframe src="https://youtube.com/embed/oxexfpLn7FE"></iframe>
         | 
| 422 | 
            +
                        </div>
         | 
| 423 | 
            +
                        </div>
         | 
| 424 | 
            +
                    </div>
         | 
| 425 | 
            +
             | 
| 426 | 
            +
                    <br><br>
         | 
| 427 | 
            +
                
         | 
| 428 | 
            +
                    <div id="extra-failure">
         | 
| 429 | 
            +
                        <h2 style="text-align: center;">Failure cases</h2>
         | 
| 430 | 
            +
                        <p style="overflow: hidden;">
         | 
| 431 | 
            +
                        <span style="float:right;"><a href="#index">Back to index</a></span>
         | 
| 432 | 
            +
                        </p>
         | 
| 433 | 
            +
             | 
| 434 | 
            +
                        <div class="row g-1">
         | 
| 435 | 
            +
                        <div class="col-sm-6">
         | 
| 436 | 
            +
                            <div class="video-header">Human speech</div>
         | 
| 437 | 
            +
                            <div class="video-container">
         | 
| 438 | 
            +
                            <iframe src="https://youtube.com/embed/nx0CyrDu70Y"></iframe>
         | 
| 439 | 
            +
                            </div>
         | 
| 440 | 
            +
                        </div>
         | 
| 441 | 
            +
                        <div class="col-sm-6">
         | 
| 442 | 
            +
                            <div class="video-header">Unfamiliar vision input</div>
         | 
| 443 | 
            +
                            <div class="video-container">
         | 
| 444 | 
            +
                            <iframe src="https://youtube.com/embed/hfnAqmK3X7w"></iframe>
         | 
| 445 | 
            +
                            </div>
         | 
| 446 | 
            +
                        </div>
         | 
| 447 | 
            +
                        </div>
         | 
| 448 | 
            +
                    </div>
         | 
| 449 | 
            +
                    </div>
         | 
| 450 | 
            +
             | 
| 451 | 
            +
            </body>
         | 
| 452 | 
            +
            </html>
         | 
    	
        mmaudio/__init__.py
    ADDED
    
    | 
            File without changes
         | 
    	
        mmaudio/eval_utils.py
    ADDED
    
    | @@ -0,0 +1,245 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import dataclasses
         | 
| 2 | 
            +
            import logging
         | 
| 3 | 
            +
            from pathlib import Path
         | 
| 4 | 
            +
            from typing import Optional
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            import torch
         | 
| 7 | 
            +
            from colorlog import ColoredFormatter
         | 
| 8 | 
            +
            from torchvision.transforms import v2
         | 
| 9 | 
            +
            from torio.io import StreamingMediaDecoder, StreamingMediaEncoder
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            from mmaudio.model.flow_matching import FlowMatching
         | 
| 12 | 
            +
            from mmaudio.model.networks import MMAudio
         | 
| 13 | 
            +
            from mmaudio.model.sequence_config import (CONFIG_16K, CONFIG_44K, SequenceConfig)
         | 
| 14 | 
            +
            from mmaudio.model.utils.features_utils import FeaturesUtils
         | 
| 15 | 
            +
            from mmaudio.utils.download_utils import download_model_if_needed
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            log = logging.getLogger()
         | 
| 18 | 
            +
             | 
| 19 | 
            +
             | 
| 20 | 
            +
            @dataclasses.dataclass
            class ModelConfig:
                """Checkpoint paths and sampling-rate mode for one model variant.

                ``seq_cfg`` maps ``mode`` to the matching sequence configuration and
                ``download_if_needed`` hands every configured checkpoint path to
                ``download_model_if_needed``.
                """
                model_name: str
                model_path: Path
                vae_path: Path
                # May be None; download_if_needed() skips it in that case.
                bigvgan_16k_path: Optional[Path]
                # Selects the sequence configuration: '16k' or '44k'.
                mode: str
                synchformer_ckpt: Path = Path('./ext_weights/synchformer_state_dict.pth')

                @property
                def seq_cfg(self) -> SequenceConfig:
                    """Return the sequence configuration that corresponds to ``mode``.

                    Raises:
                        ValueError: if ``mode`` is neither '16k' nor '44k'. Previously this
                            fell through and returned None, which only surfaced later as
                            an attribute error far away from the misconfiguration.
                    """
                    if self.mode == '16k':
                        return CONFIG_16K
                    if self.mode == '44k':
                        return CONFIG_44K
                    raise ValueError(f"Unknown mode {self.mode!r}; expected '16k' or '44k'")

                def download_if_needed(self):
                    """Call ``download_model_if_needed`` on each configured checkpoint path."""
                    download_model_if_needed(self.model_path)
                    download_model_if_needed(self.vae_path)
                    if self.bigvgan_16k_path is not None:
                        download_model_if_needed(self.bigvgan_16k_path)
                    download_model_if_needed(self.synchformer_ckpt)
         | 
| 42 | 
            +
             | 
| 43 | 
            +
             | 
| 44 | 
            +
            # One ModelConfig per released variant. Only the 16k variant sets a
            # bigvgan_16k_path; the 44k variants pass None for it.
            small_16k = ModelConfig(
                model_name='small_16k',
                model_path=Path('./weights/mmaudio_small_16k.pth'),
                vae_path=Path('./ext_weights/v1-16.pth'),
                bigvgan_16k_path=Path('./ext_weights/best_netG.pt'),
                mode='16k',
            )
            small_44k = ModelConfig(
                model_name='small_44k',
                model_path=Path('./weights/mmaudio_small_44k.pth'),
                vae_path=Path('./ext_weights/v1-44.pth'),
                bigvgan_16k_path=None,
                mode='44k',
            )
            medium_44k = ModelConfig(
                model_name='medium_44k',
                model_path=Path('./weights/mmaudio_medium_44k.pth'),
                vae_path=Path('./ext_weights/v1-44.pth'),
                bigvgan_16k_path=None,
                mode='44k',
            )
            large_44k = ModelConfig(
                model_name='large_44k',
                model_path=Path('./weights/mmaudio_large_44k.pth'),
                vae_path=Path('./ext_weights/v1-44.pth'),
                bigvgan_16k_path=None,
                mode='44k',
            )
            large_44k_v2 = ModelConfig(
                model_name='large_44k_v2',
                model_path=Path('./weights/mmaudio_large_44k_v2.pth'),
                vae_path=Path('./ext_weights/v1-44.pth'),
                bigvgan_16k_path=None,
                mode='44k',
            )

            # Lookup table keyed by each config's model_name, in declaration order.
            all_model_cfg: dict[str, ModelConfig] = {
                cfg.model_name: cfg
                for cfg in (small_16k, small_44k, medium_44k, large_44k, large_44k_v2)
            }
         | 
| 76 | 
            +
             | 
| 77 | 
            +
             | 
| 78 | 
            +
            def generate(clip_video: Optional[torch.Tensor],
                         sync_video: Optional[torch.Tensor],
                         text: Optional[list[str]],
                         *,
                         negative_text: Optional[list[str]] = None,
                         feature_utils: FeaturesUtils,
                         net: MMAudio,
                         fm: FlowMatching,
                         rng: torch.Generator,
                         cfg_strength: float):
                """Generate audio from optional video and text conditions.

                Each missing condition is replaced by the network's corresponding empty
                sequence. Noise drawn from ``rng`` is passed through ``fm.to_data`` using
                ``net.ode_wrapper`` with ``cfg_strength``; the result is unnormalized,
                decoded and vocoded by ``feature_utils``.

                Args:
                    clip_video: frames for ``encode_video_with_clip``, or None.
                    sync_video: frames for ``encode_video_with_sync``, or None.
                    text: one prompt per batch item, or None.
                    negative_text: optional negative prompts; must have one entry per
                        batch item when given.
                    feature_utils: provides device/dtype, the encoders, ``decode`` and
                        ``vocode``.
                    net: the network supplying empty sequences, condition preprocessing,
                        the ODE wrapper and ``unnormalize``.
                    fm: flow-matching helper whose ``to_data`` turns noise into latents.
                    rng: generator used to draw the initial noise.
                    cfg_strength: forwarded to ``net.ode_wrapper``.

                Returns:
                    Whatever ``feature_utils.vocode`` returns for the decoded latents.

                Raises:
                    ValueError: if no input is given from which to infer the batch size.
                """
                device = feature_utils.device
                dtype = feature_utils.dtype

                # `text` is Optional, so the batch size cannot be taken from len(text)
                # unconditionally; fall back to the other inputs.
                if text is not None:
                    bs = len(text)
                elif clip_video is not None:
                    # assumes the leading dimension is the batch — TODO confirm against callers
                    bs = clip_video.shape[0]
                elif sync_video is not None:
                    # assumes the leading dimension is the batch — TODO confirm against callers
                    bs = sync_video.shape[0]
                elif negative_text is not None:
                    bs = len(negative_text)
                else:
                    raise ValueError('Cannot infer the batch size: clip_video, sync_video, text and '
                                     'negative_text are all None')

                if clip_video is not None:
                    clip_video = clip_video.to(device, dtype, non_blocking=True)
                    clip_features = feature_utils.encode_video_with_clip(clip_video, batch_size=bs)
                else:
                    clip_features = net.get_empty_clip_sequence(bs)

                if sync_video is not None:
                    sync_video = sync_video.to(device, dtype, non_blocking=True)
                    sync_features = feature_utils.encode_video_with_sync(sync_video, batch_size=bs)
                else:
                    sync_features = net.get_empty_sync_sequence(bs)

                if text is not None:
                    text_features = feature_utils.encode_text(text)
                else:
                    text_features = net.get_empty_string_sequence(bs)

                # Without negative prompts, get_empty_conditions receives None, exactly
                # as before; the empty features that used to be built here were discarded.
                negative_text_features = None
                if negative_text is not None:
                    assert len(negative_text) == bs
                    negative_text_features = feature_utils.encode_text(negative_text)

                x0 = torch.randn(bs,
                                 net.latent_seq_len,
                                 net.latent_dim,
                                 device=device,
                                 dtype=dtype,
                                 generator=rng)
                preprocessed_conditions = net.preprocess_conditions(clip_features, sync_features, text_features)
                empty_conditions = net.get_empty_conditions(
                    bs, negative_text_features=negative_text_features)

                def cfg_ode_wrapper(t, x):
                    # Binds the conditions and guidance strength for fm.to_data.
                    return net.ode_wrapper(t, x, preprocessed_conditions, empty_conditions,
                                           cfg_strength)

                x1 = fm.to_data(cfg_ode_wrapper, x0)
                x1 = net.unnormalize(x1)
                spec = feature_utils.decode(x1)
                audio = feature_utils.vocode(spec)
                return audio
         | 
| 132 | 
            +
             | 
| 133 | 
            +
             | 
| 134 | 
            +
            LOGFORMAT = "  %(log_color)s%(levelname)-8s%(reset)s | %(log_color)s%(message)s%(reset)s"


            def setup_eval_logging(log_level: int = logging.INFO):
                """Set the root logger to ``log_level`` and attach a colored stream handler.

                A new ``StreamHandler`` formatted with ``ColoredFormatter(LOGFORMAT)`` is
                added on every call.
                """
                root_logger = logging.getLogger()
                root_logger.setLevel(log_level)

                console = logging.StreamHandler()
                console.setLevel(log_level)
                console.setFormatter(ColoredFormatter(LOGFORMAT))
                root_logger.addHandler(console)
         | 
| 146 | 
            +
             | 
| 147 | 
            +
             | 
| 148 | 
            +
            def load_video(video_path: Path, duration_sec: float) -> tuple[torch.Tensor, torch.Tensor, float]:
                """Decode one video into two transformed frame stacks at different rates.

                Two streams are registered on a ``StreamingMediaDecoder``: one at 8 fps
                and one at 25 fps, each sized to hold ``duration_sec`` of frames. A single
                chunk per stream is popped and passed through its transform pipeline.
                If either stream yields fewer frames than requested, a warning is logged
                and ``duration_sec`` is shortened to what is available.

                Returns:
                    ``(clip_frames, sync_frames, duration_sec)`` where both frame stacks
                    are cut to the (possibly shortened) duration.
                """
                clip_size, clip_fps = 384, 8.0
                sync_size, sync_fps = 224, 25.0
                bicubic = v2.InterpolationMode.BICUBIC

                to_clip_frames = v2.Compose([
                    v2.Resize((clip_size, clip_size), interpolation=bicubic),
                    v2.ToImage(),
                    v2.ToDtype(torch.float32, scale=True),
                ])
                to_sync_frames = v2.Compose([
                    v2.Resize(sync_size, interpolation=bicubic),
                    v2.CenterCrop(sync_size),
                    v2.ToImage(),
                    v2.ToDtype(torch.float32, scale=True),
                    v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
                ])

                # Stream 0 is the 8 fps stream, stream 1 the 25 fps stream.
                decoder = StreamingMediaDecoder(video_path)
                for rate in (clip_fps, sync_fps):
                    decoder.add_basic_video_stream(
                        frames_per_chunk=int(rate * duration_sec),
                        frame_rate=rate,
                        format='rgb24',
                    )

                decoder.fill_buffer()
                chunks = decoder.pop_chunks()
                clip_chunk = chunks[0]
                sync_chunk = chunks[1]
                assert clip_chunk is not None
                assert sync_chunk is not None

                clip_frames = to_clip_frames(clip_chunk)
                sync_frames = to_sync_frames(sync_chunk)

                clip_length_sec = clip_frames.shape[0] / clip_fps
                sync_length_sec = sync_frames.shape[0] / sync_fps

                # The second check compares against the already shortened duration.
                if clip_length_sec < duration_sec:
                    log.warning(f'Clip video is too short: {clip_length_sec:.2f} < {duration_sec:.2f}')
                    log.warning(f'Truncating to {clip_length_sec:.2f} sec')
                    duration_sec = clip_length_sec

                if sync_length_sec < duration_sec:
                    log.warning(f'Sync video is too short: {sync_length_sec:.2f} < {duration_sec:.2f}')
                    log.warning(f'Truncating to {sync_length_sec:.2f} sec')
                    duration_sec = sync_length_sec

                clip_keep = int(clip_fps * duration_sec)
                sync_keep = int(sync_fps * duration_sec)
                return clip_frames[:clip_keep], sync_frames[:sync_keep], duration_sec
         | 
| 208 | 
            +
             | 
| 209 | 
            +
             | 
| 210 | 
            +
            def make_video(video_path: Path, output_path: Path, audio: torch.Tensor, sampling_rate: int,
                           duration_sec: float, max_fps: int = 60):
                """Write the first `duration_sec` seconds of a video to a new file with `audio` as its track.

                The video is decoded to RGB frames, cut to `duration_sec`, and re-encoded with
                libx264 (yuv420p). The audio is encoded as AAC.

                Args:
                    video_path: Source video to read frames from.
                    output_path: Destination file for the muxed result.
                    audio: Waveform whose first dimension is the channel count (it is passed as
                        `num_channels` and the tensor is transposed before writing).
                    sampling_rate: Sample rate of `audio`.
                    duration_sec: Number of seconds of video to keep.
                    max_fps: Highest frame rate the decode buffer is sized for. Videos with a higher
                        frame rate are decoded only partially and end up shorter than `duration_sec`.

                Raises:
                    RuntimeError: If the decoder returns no video chunk.
                """
                # The chunk size has to be given before the real frame rate is read back from the
                # decoder, so the buffer is sized for the worst case the caller allows.
                approx_max_length = int(duration_sec * max_fps)
                reader = StreamingMediaDecoder(video_path)
                reader.add_basic_video_stream(
                    frames_per_chunk=approx_max_length,
                    format='rgb24',
                )
                reader.fill_buffer()
                video_chunk = reader.pop_chunks()[0]
                if video_chunk is None:
                    # Explicit check instead of `assert`, which disappears under `python -O`.
                    raise RuntimeError(f'No video frames could be decoded from {video_path}')

                # NOTE(review): the frame rate is truncated to an int here (e.g. 29.97 -> 29);
                # confirm that this is acceptable for audio/video alignment.
                fps = int(reader.get_out_stream_info(0).frame_rate)
                if fps > max_fps:
                    log.warning(f'This code supports only up to {max_fps} fps, but the video has {fps} fps')
                    log.warning('Pass a larger max_fps to make_video to keep the full duration')

                # The last two dimensions of the decoded chunk are used as height and width.
                h, w = video_chunk.shape[-2:]
                video_chunk = video_chunk[:int(fps * duration_sec)]

                writer = StreamingMediaEncoder(output_path)
                # Stream 0 is audio and stream 1 is video; the write calls below rely on this order.
                writer.add_audio_stream(
                    sample_rate=sampling_rate,
                    num_channels=audio.shape[0],
                    encoder='aac',  # 'flac' does not work for some reason?
                )
                writer.add_video_stream(frame_rate=fps,
                                        width=w,
                                        height=h,
                                        format='rgb24',
                                        encoder='libx264',
                                        encoder_format='yuv420p')
                with writer.open():
                    # The channel dimension is moved last before the audio is handed to the encoder.
                    writer.write_audio_chunk(0, audio.float().transpose(0, 1))
                    writer.write_video_chunk(1, video_chunk)
         | 
    	
        mmaudio/ext/__init__.py
    ADDED
    
    | @@ -0,0 +1 @@ | |
|  | 
|  | |
| 1 | 
            +
             | 
    	
        mmaudio/ext/autoencoder/__init__.py
    ADDED
    
    | @@ -0,0 +1 @@ | |
|  | 
|  | |
| 1 | 
            +
            from .autoencoder import AutoEncoderModule
         | 
    	
        mmaudio/ext/autoencoder/autoencoder.py
    ADDED
    
    | @@ -0,0 +1,48 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            from typing import Literal, Optional
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            import torch
         | 
| 4 | 
            +
            import torch.nn as nn
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            from mmaudio.ext.autoencoder.vae import VAE, get_my_vae
         | 
| 7 | 
            +
            from mmaudio.ext.bigvgan import BigVGAN
         | 
| 8 | 
            +
            from mmaudio.ext.bigvgan_v2.bigvgan import BigVGAN as BigVGANv2
         | 
| 9 | 
            +
            from mmaudio.model.utils.distributions import DiagonalGaussianDistribution
         | 
| 10 | 
            +
             | 
| 11 | 
            +
             | 
| 12 | 
            +
            class AutoEncoderModule(nn.Module):
                """Inference-only bundle of a VAE and a vocoder with all parameters frozen.

                The VAE weights are loaded from `vae_ckpt_path`. The vocoder depends on `mode`:
                '16k' builds `BigVGAN` from `vocoder_ckpt_path` (which must then be given), while
                '44k' obtains `BigVGANv2` through `from_pretrained`.
                """

                def __init__(self,
                             *,
                             vae_ckpt_path,
                             vocoder_ckpt_path: Optional[str] = None,
                             mode: Literal['16k', '44k']):
                    super().__init__()

                    # VAE: build in eval mode, load the checkpoint on CPU, then remove weight norm.
                    vae = get_my_vae(mode).eval()
                    vae.load_state_dict(torch.load(vae_ckpt_path, weights_only=True, map_location='cpu'))
                    vae.remove_weight_norm()
                    self.vae: VAE = vae

                    # Vocoder: the implementation is selected by the sampling-rate mode.
                    if mode == '44k':
                        vocoder = BigVGANv2.from_pretrained('nvidia/bigvgan_v2_44khz_128band_512x',
                                                            use_cuda_kernel=False)
                        vocoder.remove_weight_norm()
                    elif mode == '16k':
                        assert vocoder_ckpt_path is not None
                        vocoder = BigVGAN(vocoder_ckpt_path).eval()
                    else:
                        raise ValueError(f'Unknown mode: {mode}')
                    self.vocoder = vocoder

                    # Freeze every parameter of both sub-modules.
                    for frozen in self.parameters():
                        frozen.requires_grad_(False)

                @torch.inference_mode()
                def encode(self, x: torch.Tensor) -> DiagonalGaussianDistribution:
                    """Run the VAE encoder on `x` and return its output distribution."""
                    posterior = self.vae.encode(x)
                    return posterior

                @torch.inference_mode()
                def decode(self, z: torch.Tensor) -> torch.Tensor:
                    """Run the VAE decoder on the latent `z`."""
                    decoded = self.vae.decode(z)
                    return decoded

                @torch.inference_mode()
                def vocode(self, spec: torch.Tensor) -> torch.Tensor:
                    """Pass `spec` through the vocoder and return its output."""
                    waveform = self.vocoder(spec)
                    return waveform
         | 
    	
        mmaudio/ext/autoencoder/edm2_utils.py
    ADDED
    
    | @@ -0,0 +1,168 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
         | 
| 2 | 
            +
            #
         | 
| 3 | 
            +
            # This work is licensed under a Creative Commons
         | 
| 4 | 
            +
            # Attribution-NonCommercial-ShareAlike 4.0 International License.
         | 
| 5 | 
            +
            # You should have received a copy of the license along with this
         | 
| 6 | 
            +
            # work. If not, see http://creativecommons.org/licenses/by-nc-sa/4.0/
         | 
| 7 | 
            +
            """Improved diffusion model architecture proposed in the paper
         | 
| 8 | 
            +
            "Analyzing and Improving the Training Dynamics of Diffusion Models"."""
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            import numpy as np
         | 
| 11 | 
            +
            import torch
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            #----------------------------------------------------------------------------
         | 
| 14 | 
            +
            # Variant of constant() that inherits dtype and device from the given
         | 
| 15 | 
            +
            # reference tensor by default.
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            _constant_cache = {}


            def constant(value, shape=None, dtype=None, device=None, memory_format=None):
                """Return a cached tensor holding `value`, optionally broadcast against `shape`.

                Defaults: the current default dtype, the CPU device and contiguous memory format.
                Results are memoized on the value's contents plus all options, so repeated calls
                return the very same tensor object; callers share it and should not modify it
                in place.
                """
                array = np.asarray(value)
                shape = None if shape is None else tuple(shape)
                dtype = torch.get_default_dtype() if dtype is None else dtype
                device = torch.device('cpu') if device is None else device
                memory_format = torch.contiguous_format if memory_format is None else memory_format

                # The raw bytes make the key depend on the actual numbers, not on object identity.
                cache_key = (array.shape, array.dtype, array.tobytes(), shape, dtype, device,
                             memory_format)
                cached = _constant_cache.get(cache_key)
                if cached is not None:
                    return cached

                result = torch.as_tensor(array.copy(), dtype=dtype, device=device)
                if shape is not None:
                    # The empty tensor only supplies the shape to broadcast against.
                    result, _ = torch.broadcast_tensors(result, torch.empty(shape))
                result = result.contiguous(memory_format=memory_format)
                _constant_cache[cache_key] = result
                return result
         | 
| 40 | 
            +
             | 
| 41 | 
            +
             | 
| 42 | 
            +
            def const_like(ref, value, shape=None, dtype=None, device=None, memory_format=None):
                """Build a cached constant that takes dtype and device from `ref` unless overridden."""
                target_dtype = ref.dtype if dtype is None else dtype
                target_device = ref.device if device is None else device
                return constant(value,
                                shape=shape,
                                dtype=target_dtype,
                                device=target_device,
                                memory_format=memory_format)
         | 
| 48 | 
            +
             | 
| 49 | 
            +
             | 
| 50 | 
            +
            #----------------------------------------------------------------------------
         | 
| 51 | 
            +
            # Normalize given tensor to unit magnitude with respect to the given
         | 
| 52 | 
            +
            # dimensions. Default = all dimensions except the first.
         | 
| 53 | 
            +
             | 
| 54 | 
            +
             | 
| 55 | 
            +
            def normalize(x, dim=None, eps=1e-4):
                """Divide `x` by its rescaled vector norm taken over `dim`.

                When `dim` is None the norm runs over every dimension except the first. The norm is
                computed in float32, multiplied by sqrt(norm.numel() / x.numel()) and offset by
                `eps` before the division; the result keeps the dtype of `x`.
                """
                reduce_dims = list(range(1, x.ndim)) if dim is None else dim
                magnitude = torch.linalg.vector_norm(x,
                                                     dim=reduce_dims,
                                                     keepdim=True,
                                                     dtype=torch.float32)
                # Rescaling the L2 norm this way turns it into a per-element (RMS-like) magnitude.
                rescale = np.sqrt(magnitude.numel() / x.numel())
                denominator = torch.add(eps, magnitude, alpha=rescale)
                return x / denominator.to(x.dtype)
         | 
| 61 | 
            +
             | 
| 62 | 
            +
             | 
| 63 | 
            +
            class Normalize(torch.nn.Module):
                """Module wrapper that applies `normalize` with a stored `dim` and `eps`."""

                def __init__(self, dim=None, eps=1e-4):
                    super().__init__()
                    # Both values are forwarded unchanged to `normalize` on every call.
                    self.dim, self.eps = dim, eps

                def forward(self, x):
                    """Return `x` normalized with the configured dimensions and epsilon."""
                    normalized = normalize(x, dim=self.dim, eps=self.eps)
                    return normalized
         | 
| 72 | 
            +
             | 
| 73 | 
            +
             | 
| 74 | 
            +
            #----------------------------------------------------------------------------
         | 
| 75 | 
            +
            # Upsample or downsample the given tensor with the given filter,
         | 
| 76 | 
            +
            # or keep it as is.
         | 
| 77 | 
            +
             | 
| 78 | 
            +
             | 
| 79 | 
            +
            def resample(x, f=(1, 1), mode='keep'):
                """Upsample or downsample `x` by a factor of 2 with a separable filter, or return it as is.

                Args:
                    x: Input tensor. `x.shape[1]` is used as the channel count and the tensor is
                        passed to 2-D convolutions, so an NCHW layout is assumed (TODO confirm
                        against callers).
                    f: 1-D filter taps with an even number of entries. They are normalized to sum
                        to one and expanded to a 2-D kernel via an outer product.
                    mode: 'keep' returns `x` unchanged, 'down' applies a stride-2 grouped
                        convolution, 'up' applies a stride-2 grouped transposed convolution.

                Returns:
                    The resampled tensor, or `x` itself for mode 'keep'.

                Raises:
                    ValueError: If `mode` is unknown or `f` is not 1-D with an even length.
                """
                if mode == 'keep':
                    return x
                # Validate with real exceptions: `assert` is stripped under `python -O`, which
                # previously let an unknown mode fall through to the upsampling branch.
                if mode not in ('down', 'up'):
                    raise ValueError(f"Unknown resample mode: {mode!r} (expected 'keep', 'down' or 'up')")

                taps = np.float32(f)
                if taps.ndim != 1 or len(taps) % 2 != 0:
                    raise ValueError('resample filter must be 1-D with an even number of taps')

                pad = (len(taps) - 1) // 2
                taps = taps / taps.sum()
                # Separable 2-D kernel with leading singleton (out, in) dimensions.
                kernel = np.outer(taps, taps)[np.newaxis, np.newaxis, :, :]
                kernel = const_like(x, kernel)
                c = x.shape[1]
                if mode == 'down':
                    # groups=c filters every channel independently with the same kernel.
                    return torch.nn.functional.conv2d(x,
                                                      kernel.tile([c, 1, 1, 1]),
                                                      groups=c,
                                                      stride=2,
                                                      padding=(pad, ))
                # mode == 'up': the kernel is scaled by 4 for the transposed convolution.
                return torch.nn.functional.conv_transpose2d(x, (kernel * 4).tile([c, 1, 1, 1]),
                                                            groups=c,
                                                            stride=2,
                                                            padding=(pad, ))
         | 
| 100 | 
            +
             | 
| 101 | 
            +
             | 
| 102 | 
            +
            #----------------------------------------------------------------------------
         | 
| 103 | 
            +
            # Magnitude-preserving SiLU (Equation 81).
         | 
| 104 | 
            +
             | 
| 105 | 
            +
             | 
| 106 | 
            +
            def mp_silu(x):
                """Apply SiLU to `x` and divide the result by the constant 0.596."""
                activated = torch.nn.functional.silu(x)
                return activated / 0.596
         | 
| 108 | 
            +
             | 
| 109 | 
            +
             | 
| 110 | 
            +
            class MPSiLU(torch.nn.Module):
                """Module form of `mp_silu`, usable wherever an activation layer is expected."""

                def forward(self, x):
                    """Return `mp_silu(x)`."""
                    activated = mp_silu(x)
                    return activated
         | 
| 114 | 
            +
             | 
| 115 | 
            +
             | 
| 116 | 
            +
            #----------------------------------------------------------------------------
         | 
| 117 | 
            +
            # Magnitude-preserving sum (Equation 88).
         | 
| 118 | 
            +
             | 
| 119 | 
            +
             | 
| 120 | 
            +
            def mp_sum(a, b, t=0.5):
                """Blend `a` and `b` with weight `t` and rescale by 1 / sqrt((1 - t)^2 + t^2)."""
                blended = a.lerp(b, t)
                gain = np.sqrt((1 - t)**2 + t**2)
                return blended / gain
         | 
| 122 | 
            +
             | 
| 123 | 
            +
             | 
| 124 | 
            +
            #----------------------------------------------------------------------------
         | 
| 125 | 
            +
            # Magnitude-preserving concatenation (Equation 103).
         | 
| 126 | 
            +
             | 
| 127 | 
            +
             | 
| 128 | 
            +
            def mp_cat(a, b, dim=1, t=0.5):
         | 
| 129 | 
            +
                Na = a.shape[dim]
         | 
| 130 | 
            +
                Nb = b.shape[dim]
         | 
| 131 | 
            +
                C = np.sqrt((Na + Nb) / ((1 - t)**2 + t**2))
         | 
| 132 | 
            +
                wa = C / np.sqrt(Na) * (1 - t)
         | 
| 133 | 
            +
                wb = C / np.sqrt(Nb) * t
         | 
| 134 | 
            +
                return torch.cat([wa * a, wb * b], dim=dim)
         | 
| 135 | 
            +
             | 
| 136 | 
            +
             | 
| 137 | 
            +
            #----------------------------------------------------------------------------
         | 
| 138 | 
            +
            # Magnitude-preserving convolution or fully-connected layer (Equation 47)
         | 
| 139 | 
            +
            # with forced weight normalization (Equation 66).
         | 
| 140 | 
            +
             | 
| 141 | 
            +
             | 
| 142 | 
            +
            class MPConv1D(torch.nn.Module):
                """1-D convolution (or linear map) whose weight must be normalized once before use.

                The weight starts as random normal values of shape
                (out_channels, in_channels, kernel_size). `remove_weight_norm()` overwrites it with
                its normalized form and must be called before `forward`.
                """

                def __init__(self, in_channels, out_channels, kernel_size):
                    super().__init__()
                    self.out_channels = out_channels
                    initial_weight = torch.randn(out_channels, in_channels, kernel_size)
                    self.weight = torch.nn.Parameter(initial_weight)

                    # Set to True by remove_weight_norm(); forward() refuses to run before that.
                    self.weight_norm_removed = False

                def forward(self, x, gain=1):
                    """Apply the layer to `x`, with the weight multiplied by `gain`."""
                    assert self.weight_norm_removed, 'call remove_weight_norm() before inference'

                    kernel = self.weight * gain
                    if kernel.ndim == 2:
                        # A 2-D weight is applied as a plain matrix product.
                        return x @ kernel.t()
                    assert kernel.ndim == 3
                    # Padding by half the kernel width on both sides.
                    half_width = kernel.shape[-1] // 2
                    return torch.nn.functional.conv1d(x, kernel, padding=(half_width, ))

                def remove_weight_norm(self):
                    """Store the normalized weight in place and mark the layer ready; returns self."""
                    stored_dtype = self.weight.dtype
                    w32 = normalize(self.weight.to(torch.float32))  # traditional weight normalization
                    # w32[0] holds the weights feeding one output channel, so this is the fan-in.
                    fan_in = w32[0].numel()
                    baked = (w32 / np.sqrt(fan_in)).to(stored_dtype)
                    self.weight.data.copy_(baked)

                    self.weight_norm_removed = True
                    return self
         | 
    	
        mmaudio/ext/autoencoder/vae.py
    ADDED
    
    | @@ -0,0 +1,369 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import logging
         | 
| 2 | 
            +
            from typing import Optional
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import torch
         | 
| 5 | 
            +
            import torch.nn as nn
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            from mmaudio.ext.autoencoder.edm2_utils import MPConv1D
         | 
| 8 | 
            +
            from mmaudio.ext.autoencoder.vae_modules import (AttnBlock1D, Downsample1D, ResnetBlock1D,
         | 
| 9 | 
            +
                                                             Upsample1D, nonlinearity)
         | 
| 10 | 
            +
            from mmaudio.model.utils.distributions import DiagonalGaussianDistribution
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            # Module-level logger; getLogger() without a name returns the root logger.
            log = logging.getLogger()

            # Mean values (80 entries) used by VAE as `data_mean` when data_dim == 80.
            # Presumably one value per input feature channel -- TODO confirm against the data pipeline.
            DATA_MEAN_80D = [
                -1.6058, -1.3676, -1.2520, -1.2453, -1.2078, -1.2224, -1.2419, -1.2439, -1.2922, -1.2927,
                -1.3170, -1.3543, -1.3401, -1.3836, -1.3907, -1.3912, -1.4313, -1.4152, -1.4527, -1.4728,
                -1.4568, -1.5101, -1.5051, -1.5172, -1.5623, -1.5373, -1.5746, -1.5687, -1.6032, -1.6131,
                -1.6081, -1.6331, -1.6489, -1.6489, -1.6700, -1.6738, -1.6953, -1.6969, -1.7048, -1.7280,
                -1.7361, -1.7495, -1.7658, -1.7814, -1.7889, -1.8064, -1.8221, -1.8377, -1.8417, -1.8643,
                -1.8857, -1.8929, -1.9173, -1.9379, -1.9531, -1.9673, -1.9824, -2.0042, -2.0215, -2.0436,
                -2.0766, -2.1064, -2.1418, -2.1855, -2.2319, -2.2767, -2.3161, -2.3572, -2.3954, -2.4282,
                -2.4659, -2.5072, -2.5552, -2.6074, -2.6584, -2.7107, -2.7634, -2.8266, -2.8981, -2.9673
            ]

            # Standard deviations (80 entries) used by VAE as `data_std` when data_dim == 80.
            DATA_STD_80D = [
                1.0291, 1.0411, 1.0043, 0.9820, 0.9677, 0.9543, 0.9450, 0.9392, 0.9343, 0.9297, 0.9276, 0.9263,
                0.9242, 0.9254, 0.9232, 0.9281, 0.9263, 0.9315, 0.9274, 0.9247, 0.9277, 0.9199, 0.9188, 0.9194,
                0.9160, 0.9161, 0.9146, 0.9161, 0.9100, 0.9095, 0.9145, 0.9076, 0.9066, 0.9095, 0.9032, 0.9043,
                0.9038, 0.9011, 0.9019, 0.9010, 0.8984, 0.8983, 0.8986, 0.8961, 0.8962, 0.8978, 0.8962, 0.8973,
                0.8993, 0.8976, 0.8995, 0.9016, 0.8982, 0.8972, 0.8974, 0.8949, 0.8940, 0.8947, 0.8936, 0.8939,
                0.8951, 0.8956, 0.9017, 0.9167, 0.9436, 0.9690, 1.0003, 1.0225, 1.0381, 1.0491, 1.0545, 1.0604,
                1.0761, 1.0929, 1.1089, 1.1196, 1.1176, 1.1156, 1.1117, 1.1070
            ]

            # Mean values (128 entries) used by VAE as `data_mean` when data_dim == 128.
            DATA_MEAN_128D = [
                -3.3462, -2.6723, -2.4893, -2.3143, -2.2664, -2.3317, -2.1802, -2.4006, -2.2357, -2.4597,
                -2.3717, -2.4690, -2.5142, -2.4919, -2.6610, -2.5047, -2.7483, -2.5926, -2.7462, -2.7033,
                -2.7386, -2.8112, -2.7502, -2.9594, -2.7473, -3.0035, -2.8891, -2.9922, -2.9856, -3.0157,
                -3.1191, -2.9893, -3.1718, -3.0745, -3.1879, -3.2310, -3.1424, -3.2296, -3.2791, -3.2782,
                -3.2756, -3.3134, -3.3509, -3.3750, -3.3951, -3.3698, -3.4505, -3.4509, -3.5089, -3.4647,
                -3.5536, -3.5788, -3.5867, -3.6036, -3.6400, -3.6747, -3.7072, -3.7279, -3.7283, -3.7795,
                -3.8259, -3.8447, -3.8663, -3.9182, -3.9605, -3.9861, -4.0105, -4.0373, -4.0762, -4.1121,
                -4.1488, -4.1874, -4.2461, -4.3170, -4.3639, -4.4452, -4.5282, -4.6297, -4.7019, -4.7960,
                -4.8700, -4.9507, -5.0303, -5.0866, -5.1634, -5.2342, -5.3242, -5.4053, -5.4927, -5.5712,
                -5.6464, -5.7052, -5.7619, -5.8410, -5.9188, -6.0103, -6.0955, -6.1673, -6.2362, -6.3120,
                -6.3926, -6.4797, -6.5565, -6.6511, -6.8130, -6.9961, -7.1275, -7.2457, -7.3576, -7.4663,
                -7.6136, -7.7469, -7.8815, -8.0132, -8.1515, -8.3071, -8.4722, -8.7418, -9.3975, -9.6628,
                -9.7671, -9.8863, -9.9992, -10.0860, -10.1709, -10.5418, -11.2795, -11.3861
            ]

            # Standard deviations (128 entries) used by VAE as `data_std` when data_dim == 128.
            DATA_STD_128D = [
                2.3804, 2.4368, 2.3772, 2.3145, 2.2803, 2.2510, 2.2316, 2.2083, 2.1996, 2.1835, 2.1769, 2.1659,
                2.1631, 2.1618, 2.1540, 2.1606, 2.1571, 2.1567, 2.1612, 2.1579, 2.1679, 2.1683, 2.1634, 2.1557,
                2.1668, 2.1518, 2.1415, 2.1449, 2.1406, 2.1350, 2.1313, 2.1415, 2.1281, 2.1352, 2.1219, 2.1182,
                2.1327, 2.1195, 2.1137, 2.1080, 2.1179, 2.1036, 2.1087, 2.1036, 2.1015, 2.1068, 2.0975, 2.0991,
                2.0902, 2.1015, 2.0857, 2.0920, 2.0893, 2.0897, 2.0910, 2.0881, 2.0925, 2.0873, 2.0960, 2.0900,
                2.0957, 2.0958, 2.0978, 2.0936, 2.0886, 2.0905, 2.0845, 2.0855, 2.0796, 2.0840, 2.0813, 2.0817,
                2.0838, 2.0840, 2.0917, 2.1061, 2.1431, 2.1976, 2.2482, 2.3055, 2.3700, 2.4088, 2.4372, 2.4609,
                2.4731, 2.4847, 2.5072, 2.5451, 2.5772, 2.6147, 2.6529, 2.6596, 2.6645, 2.6726, 2.6803, 2.6812,
                2.6899, 2.6916, 2.6931, 2.6998, 2.7062, 2.7262, 2.7222, 2.7158, 2.7041, 2.7485, 2.7491, 2.7451,
                2.7485, 2.7233, 2.7297, 2.7233, 2.7145, 2.6958, 2.6788, 2.6439, 2.6007, 2.4786, 2.2469, 2.1877,
                2.1392, 2.0717, 2.0107, 1.9676, 1.9140, 1.7102, 0.9101, 0.7164
            ]
         | 
| 64 | 
            +
             | 
| 65 | 
            +
             | 
| 66 | 
            +
            class VAE(nn.Module):
                """Variational autoencoder built from `Encoder1D` and `Decoder1D`.

                Inputs are optionally standardized with fixed per-channel statistics
                (selected by `data_dim`), encoded into the moments of a
                `DiagonalGaussianDistribution`, and decoded back to `data_dim` channels.

                Args:
                    data_dim: Number of input/output channels. Only 80 and 128 are
                        supported because normalization statistics exist only for those.
                    embed_dim: Number of latent channels.
                    hidden_dim: Base channel width passed to the encoder and decoder.

                Raises:
                    ValueError: If `data_dim` is neither 80 nor 128.
                """

                def __init__(
                    self,
                    *,
                    data_dim: int,
                    embed_dim: int,
                    hidden_dim: int,
                ):
                    super().__init__()

                    # Fail early with a clear message; previously an unsupported value fell
                    # through and surfaced later as an AttributeError on `self.data_mean`.
                    if data_dim == 80:
                        mean, std = DATA_MEAN_80D, DATA_STD_80D
                    elif data_dim == 128:
                        mean, std = DATA_MEAN_128D, DATA_STD_128D
                    else:
                        raise ValueError(f'Unsupported data_dim: {data_dim}; expected 80 or 128')

                    # The statistics used to be moved to CUDA unconditionally, which made the
                    # module impossible to construct on CPU-only hosts. Keep the CUDA
                    # placement when a GPU is present (unchanged behavior) and fall back to
                    # CPU otherwise; `.to(device)` on the module moves the buffers as usual.
                    stats_device = 'cuda' if torch.cuda.is_available() else 'cpu'
                    self.data_mean = nn.Buffer(torch.tensor(mean, dtype=torch.float32).to(stats_device))
                    self.data_std = nn.Buffer(torch.tensor(std, dtype=torch.float32).to(stats_device))

                    # Reshape to (1, C, 1) so the statistics broadcast over the first and
                    # last axes of the input (presumably batch and time — confirm with callers).
                    self.data_mean = self.data_mean.view(1, -1, 1)
                    self.data_std = self.data_std.view(1, -1, 1)

                    self.encoder = Encoder1D(
                        dim=hidden_dim,
                        ch_mult=(1, 2, 4),
                        num_res_blocks=2,
                        attn_layers=[3],
                        down_layers=[0],
                        in_dim=data_dim,
                        embed_dim=embed_dim,
                    )
                    self.decoder = Decoder1D(
                        dim=hidden_dim,
                        ch_mult=(1, 2, 4),
                        num_res_blocks=2,
                        attn_layers=[3],
                        down_layers=[0],
                        in_dim=data_dim,
                        out_dim=data_dim,
                        embed_dim=embed_dim,
                    )

                    self.embed_dim = embed_dim

                    self.initialize_weights()

                def initialize_weights(self):
                    """Hook for custom weight initialization; currently a no-op."""
                    pass

                def encode(self, x: torch.Tensor, normalize: bool = True) -> DiagonalGaussianDistribution:
                    """Encode `x` into a posterior distribution.

                    Args:
                        x: Input tensor with `data_dim` channels on axis 1.
                        normalize: If True, standardize `x` with the stored statistics first.

                    Returns:
                        The posterior built from the encoder's output moments.
                    """
                    if normalize:
                        x = self.normalize(x)
                    moments = self.encoder(x)
                    posterior = DiagonalGaussianDistribution(moments)
                    return posterior

                def decode(self, z: torch.Tensor, unnormalize: bool = True) -> torch.Tensor:
                    """Decode latent `z`; optionally map the output back to the data scale."""
                    dec = self.decoder(z)
                    if unnormalize:
                        dec = self.unnormalize(dec)
                    return dec

                def normalize(self, x: torch.Tensor) -> torch.Tensor:
                    """Standardize `x` per channel: `(x - mean) / std`."""
                    return (x - self.data_mean) / self.data_std

                def unnormalize(self, x: torch.Tensor) -> torch.Tensor:
                    """Invert `normalize`: `x * std + mean`."""
                    return x * self.data_std + self.data_mean

                def forward(
                    self,
                    x: torch.Tensor,
                    sample_posterior: bool = True,
                    rng: Optional[torch.Generator] = None,
                    normalize: bool = True,
                    unnormalize: bool = True,
                ) -> tuple[torch.Tensor, DiagonalGaussianDistribution]:
                    """Run a full encode/decode pass.

                    Args:
                        x: Input tensor with `data_dim` channels on axis 1.
                        sample_posterior: If True, decode a sample drawn from the posterior
                            (using `rng`); otherwise decode the posterior mode.
                        rng: Optional generator forwarded to `posterior.sample`.
                        normalize: Standardize the input before encoding.
                        unnormalize: Map the reconstruction back to the data scale.

                    Returns:
                        A `(reconstruction, posterior)` tuple.
                    """
                    posterior = self.encode(x, normalize=normalize)
                    if sample_posterior:
                        z = posterior.sample(rng)
                    else:
                        z = posterior.mode()
                    dec = self.decode(z, unnormalize=unnormalize)
                    return dec, posterior

                def load_weights(self, src_dict) -> None:
                    """Load `src_dict` strictly (missing or unexpected keys raise)."""
                    self.load_state_dict(src_dict, strict=True)

                @property
                def device(self) -> torch.device:
                    """Device of the module's first parameter."""
                    return next(self.parameters()).device

                def get_last_layer(self):
                    """Return the weight of the decoder's output convolution."""
                    return self.decoder.conv_out.weight

                def remove_weight_norm(self):
                    """Call `remove_weight_norm()` on every `MPConv1D` submodule; returns `self`."""
                    for name, m in self.named_modules():
                        if isinstance(m, MPConv1D):
                            m.remove_weight_norm()
                            log.debug(f"Removed weight norm from {name}")
                    return self
         | 
| 168 | 
            +
             | 
| 169 | 
            +
             | 
| 170 | 
            +
            class Encoder1D(nn.Module):
                """1D convolutional encoder: residual stages, optional attention and downsampling.

                Args:
                    dim: Base channel width; level `i` uses `dim * ch_mult[i]` channels.
                    ch_mult: Per-level channel multipliers; its length is the number of levels.
                    num_res_blocks: Residual blocks per level.
                    attn_layers: Level indices that get an attention block after each residual block.
                    down_layers: Level indices followed by a `Downsample1D`.
                    resamp_with_conv: Forwarded to `Downsample1D` as `with_conv`.
                    in_dim: Number of input channels.
                    embed_dim: Latent channel count.
                    double_z: If True the output has `2 * embed_dim` channels, else `embed_dim`.
                    kernel_size: Kernel size for the input/output convs and residual blocks.
                    clip_act: Activations are clamped to `[-clip_act, clip_act]` after each block.
                """

                def __init__(self,
                             *,
                             dim: int,
                             ch_mult: tuple[int, ...] = (1, 2, 4, 8),
                             num_res_blocks: int,
                             attn_layers: list[int] = [],
                             down_layers: list[int] = [],
                             resamp_with_conv: bool = True,
                             in_dim: int,
                             embed_dim: int,
                             double_z: bool = True,
                             kernel_size: int = 3,
                             clip_act: float = 256.0):
                    super().__init__()
                    self.dim = dim
                    self.num_layers = len(ch_mult)
                    self.num_res_blocks = num_res_blocks
                    self.in_channels = in_dim
                    self.clip_act = clip_act
                    # Store private copies: the defaults are shared list objects, so keeping
                    # a reference would let a mutation on one instance leak into all others.
                    self.down_layers = list(down_layers)
                    self.attn_layers = list(attn_layers)
                    self.conv_in = MPConv1D(in_dim, self.dim, kernel_size=kernel_size)

                    in_ch_mult = (1, ) + tuple(ch_mult)
                    self.in_ch_mult = in_ch_mult
                    # downsampling
                    self.down = nn.ModuleList()
                    for i_level in range(self.num_layers):
                        block = nn.ModuleList()
                        attn = nn.ModuleList()
                        block_in = dim * in_ch_mult[i_level]
                        block_out = dim * ch_mult[i_level]
                        for i_block in range(self.num_res_blocks):
                            block.append(
                                ResnetBlock1D(in_dim=block_in,
                                              out_dim=block_out,
                                              kernel_size=kernel_size,
                                              use_norm=True))
                            block_in = block_out
                            if i_level in attn_layers:
                                attn.append(AttnBlock1D(block_in))
                        down = nn.Module()
                        down.block = block
                        down.attn = attn
                        if i_level in down_layers:
                            down.downsample = Downsample1D(block_in, resamp_with_conv)
                        self.down.append(down)

                    # middle
                    self.mid = nn.Module()
                    self.mid.block_1 = ResnetBlock1D(in_dim=block_in,
                                                     out_dim=block_in,
                                                     kernel_size=kernel_size,
                                                     use_norm=True)
                    self.mid.attn_1 = AttnBlock1D(block_in)
                    self.mid.block_2 = ResnetBlock1D(in_dim=block_in,
                                                     out_dim=block_in,
                                                     kernel_size=kernel_size,
                                                     use_norm=True)

                    # end
                    self.conv_out = MPConv1D(block_in,
                                             2 * embed_dim if double_z else embed_dim,
                                             kernel_size=kernel_size)

                    # Initialized to zero so the effective output gain (`learnable_gain + 1`) starts at 1.
                    self.learnable_gain = nn.Parameter(torch.zeros([]))

                def forward(self, x):
                    """Encode `x` and return the output of the final convolution."""

                    # downsampling
                    # Only the most recent activation is ever consumed, so thread a single
                    # tensor through instead of keeping every intermediate alive in a list.
                    h = self.conv_in(x)
                    for i_level in range(self.num_layers):
                        level = self.down[i_level]
                        for i_block in range(self.num_res_blocks):
                            h = level.block[i_block](h)
                            # Attention modules exist only for levels in `attn_layers`,
                            # one per residual block.
                            if len(level.attn) > 0:
                                h = level.attn[i_block](h)
                            h = h.clamp(-self.clip_act, self.clip_act)
                        if i_level in self.down_layers:
                            h = level.downsample(h)

                    # middle
                    h = self.mid.block_1(h)
                    h = self.mid.attn_1(h)
                    h = self.mid.block_2(h)
                    h = h.clamp(-self.clip_act, self.clip_act)

                    # end
                    h = nonlinearity(h)
                    h = self.conv_out(h, gain=(self.learnable_gain + 1))
                    return h
         | 
| 264 | 
            +
             | 
| 265 | 
            +
             | 
| 266 | 
            +
            class Decoder1D(nn.Module):
                """1D convolutional decoder mirroring `Encoder1D`.

                Args:
                    dim: Base channel width; level `i` uses `dim * ch_mult[i]` channels.
                    out_dim: Number of output channels.
                    ch_mult: Per-level channel multipliers; its length is the number of levels.
                    num_res_blocks: Each level holds `num_res_blocks + 1` residual blocks.
                    attn_layers: Level indices that get an attention block after each residual block.
                    down_layers: The encoder's downsampling levels; an `Upsample1D` is
                        attached to level `i + 1` for every `i` listed here.
                    kernel_size: Kernel size for the input/output convs and residual blocks.
                    resamp_with_conv: Forwarded to `Upsample1D` as `with_conv`.
                    in_dim: Only recorded as `self.in_channels`; not used to build layers.
                    embed_dim: Latent channel count accepted by `conv_in`.
                    clip_act: Activations are clamped to `[-clip_act, clip_act]` after each block.
                """

                def __init__(self,
                             *,
                             dim: int,
                             out_dim: int,
                             ch_mult: tuple[int, ...] = (1, 2, 4, 8),
                             num_res_blocks: int,
                             attn_layers: list[int] = [],
                             down_layers: list[int] = [],
                             kernel_size: int = 3,
                             resamp_with_conv: bool = True,
                             in_dim: int,
                             embed_dim: int,
                             clip_act: float = 256.0):
                    super().__init__()
                    self.ch = dim
                    self.num_layers = len(ch_mult)
                    self.num_res_blocks = num_res_blocks
                    self.in_channels = in_dim
                    self.clip_act = clip_act
                    self.down_layers = [i + 1 for i in down_layers]  # each downlayer add one

                    # channel width at the lowest resolution
                    block_in = dim * ch_mult[self.num_layers - 1]

                    # z to block_in
                    self.conv_in = MPConv1D(embed_dim, block_in, kernel_size=kernel_size)

                    # middle
                    # `kernel_size` is forwarded to every residual block so the decoder honors
                    # it the same way Encoder1D does. Previously it only reached conv_in and
                    # conv_out; with the default of 3 the resulting layers are unchanged.
                    self.mid = nn.Module()
                    self.mid.block_1 = ResnetBlock1D(in_dim=block_in,
                                                     out_dim=block_in,
                                                     kernel_size=kernel_size,
                                                     use_norm=True)
                    self.mid.attn_1 = AttnBlock1D(block_in)
                    self.mid.block_2 = ResnetBlock1D(in_dim=block_in,
                                                     out_dim=block_in,
                                                     kernel_size=kernel_size,
                                                     use_norm=True)

                    # upsampling
                    self.up = nn.ModuleList()
                    for i_level in reversed(range(self.num_layers)):
                        block = nn.ModuleList()
                        attn = nn.ModuleList()
                        block_out = dim * ch_mult[i_level]
                        for i_block in range(self.num_res_blocks + 1):
                            block.append(
                                ResnetBlock1D(in_dim=block_in,
                                              out_dim=block_out,
                                              kernel_size=kernel_size,
                                              use_norm=True))
                            block_in = block_out
                            if i_level in attn_layers:
                                attn.append(AttnBlock1D(block_in))
                        up = nn.Module()
                        up.block = block
                        up.attn = attn
                        if i_level in self.down_layers:
                            up.upsample = Upsample1D(block_in, resamp_with_conv)
                        self.up.insert(0, up)  # prepend to get consistent order

                    # end
                    self.conv_out = MPConv1D(block_in, out_dim, kernel_size=kernel_size)
                    # Initialized to zero so the effective output gain (`learnable_gain + 1`) starts at 1.
                    self.learnable_gain = nn.Parameter(torch.zeros([]))

                def forward(self, z):
                    """Decode latent `z` and return the output of the final convolution."""
                    # z to block_in
                    h = self.conv_in(z)

                    # middle
                    h = self.mid.block_1(h)
                    h = self.mid.attn_1(h)
                    h = self.mid.block_2(h)
                    h = h.clamp(-self.clip_act, self.clip_act)

                    # upsampling (levels are stored low-to-high index, traversed high-to-low)
                    for i_level in reversed(range(self.num_layers)):
                        level = self.up[i_level]
                        for i_block in range(self.num_res_blocks + 1):
                            h = level.block[i_block](h)
                            # Attention modules exist only for levels in `attn_layers`.
                            if len(level.attn) > 0:
                                h = level.attn[i_block](h)
                            h = h.clamp(-self.clip_act, self.clip_act)
                        if i_level in self.down_layers:
                            h = level.upsample(h)

                    h = nonlinearity(h)
                    h = self.conv_out(h, gain=(self.learnable_gain + 1))
                    return h
         | 
| 346 | 
            +
             | 
| 347 | 
            +
             | 
| 348 | 
            +
            def VAE_16k(**kwargs) -> VAE:
                """Build the `VAE` preset with 80 data channels, 20 latent channels and width 384.

                Extra keyword arguments are forwarded to `VAE` unchanged.
                """
                preset = dict(data_dim=80, embed_dim=20, hidden_dim=384)
                return VAE(**preset, **kwargs)
         | 
| 350 | 
            +
             | 
| 351 | 
            +
             | 
| 352 | 
            +
            def VAE_44k(**kwargs) -> VAE:
                """Build the `VAE` preset with 128 data channels, 40 latent channels and width 512.

                Extra keyword arguments are forwarded to `VAE` unchanged.
                """
                preset = dict(data_dim=128, embed_dim=40, hidden_dim=512)
                return VAE(**preset, **kwargs)
         | 
| 354 | 
            +
             | 
| 355 | 
            +
             | 
| 356 | 
            +
            def get_my_vae(name: str, **kwargs) -> VAE:
                """Construct a VAE preset by name.

                Args:
                    name: Either '16k' or '44k'.
                    **kwargs: Forwarded to the selected preset factory.

                Raises:
                    ValueError: If `name` is not a known preset.
                """
                # Reject unknown names up front, then pick the matching factory.
                if name not in ('16k', '44k'):
                    raise ValueError(f'Unknown model: {name}')
                factory = VAE_16k if name == '16k' else VAE_44k
                return factory(**kwargs)
         | 
| 362 | 
            +
             | 
| 363 | 
            +
             | 
| 364 | 
            +
            if __name__ == '__main__':
                # Smoke test: build a preset and report its size.
                # get_my_vae only accepts '16k' and '44k'; the previous 'standard' name
                # always raised ValueError before anything was printed.
                network = get_my_vae('16k')

                # print the number of parameters in terms of millions
                num_params = sum(p.numel() for p in network.parameters()) / 1e6
                print(f'Number of parameters: {num_params:.2f}M')
         | 
    	
        mmaudio/ext/autoencoder/vae_modules.py
    ADDED
    
    | @@ -0,0 +1,117 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import torch
         | 
| 2 | 
            +
            import torch.nn as nn
         | 
| 3 | 
            +
            import torch.nn.functional as F
         | 
| 4 | 
            +
            from einops import rearrange
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            from mmaudio.ext.autoencoder.edm2_utils import (MPConv1D, mp_silu, mp_sum, normalize)
         | 
| 7 | 
            +
             | 
| 8 | 
            +
             | 
| 9 | 
            +
            def nonlinearity(x):
                """Apply the shared swish activation by delegating to `mp_silu`."""
                activated = mp_silu(x)
                return activated
         | 
| 12 | 
            +
             | 
| 13 | 
            +
             | 
| 14 | 
            +
            class ResnetBlock1D(nn.Module):
                """Two-convolution residual block with an optional projection on the skip path.

                Args:
                    in_dim: Input channel count.
                    out_dim: Output channel count; defaults to `in_dim`.
                    conv_shortcut: When channel counts differ, project the skip path with a
                        `kernel_size` conv instead of a 1x1 conv.
                    kernel_size: Kernel size of the two main convolutions.
                    use_norm: Normalize the input along the channel axis before use.
                """

                def __init__(self, *, in_dim, out_dim=None, conv_shortcut=False, kernel_size=3, use_norm=True):
                    super().__init__()
                    if out_dim is None:
                        out_dim = in_dim
                    self.in_dim = in_dim
                    self.out_dim = out_dim
                    self.use_conv_shortcut = conv_shortcut
                    self.use_norm = use_norm

                    self.conv1 = MPConv1D(in_dim, out_dim, kernel_size=kernel_size)
                    self.conv2 = MPConv1D(out_dim, out_dim, kernel_size=kernel_size)

                    # A projection is only needed when the skip path must change width.
                    if self.in_dim == self.out_dim:
                        return
                    if self.use_conv_shortcut:
                        self.conv_shortcut = MPConv1D(in_dim, out_dim, kernel_size=kernel_size)
                    else:
                        self.nin_shortcut = MPConv1D(in_dim, out_dim, kernel_size=1)

                def forward(self, x: torch.Tensor) -> torch.Tensor:
                    """Return the `mp_sum` blend (t=0.3) of the skip path and the residual branch."""
                    # pixel norm; the normalized tensor also feeds the skip path
                    skip = normalize(x, dim=1) if self.use_norm else x

                    residual = self.conv1(nonlinearity(skip))
                    residual = self.conv2(nonlinearity(residual))

                    if self.in_dim != self.out_dim:
                        project = self.conv_shortcut if self.use_conv_shortcut else self.nin_shortcut
                        skip = project(skip)

                    return mp_sum(skip, residual, t=0.3)
         | 
| 52 | 
            +
             | 
| 53 | 
            +
             | 
| 54 | 
            +
            class AttnBlock1D(nn.Module):
                """Self-attention over the last axis with a residual `mp_sum` merge.

                Args:
                    in_channels: Channel count of the input and output.
                    num_heads: Number of attention heads the channels are split into.
                """

                def __init__(self, in_channels, num_heads=1):
                    super().__init__()
                    self.in_channels = in_channels
                    self.num_heads = num_heads

                    # One 1x1 conv produces q, k and v together; another projects the result.
                    self.qkv = MPConv1D(in_channels, in_channels * 3, kernel_size=1)
                    self.proj_out = MPConv1D(in_channels, in_channels, kernel_size=1)

                def forward(self, x):
                    """Attend over positions of `x` and blend the result back into `x`."""
                    qkv = self.qkv(x)
                    batch, length = qkv.shape[0], qkv.shape[-1]
                    # (B, heads, C_per_head, 3, L): axis 3 separates q / k / v.
                    qkv = qkv.reshape(batch, self.num_heads, -1, 3, length)

                    # Normalize along the per-head channel axis, then move channels last
                    # as expected by scaled_dot_product_attention.
                    q, k, v = (rearrange(t, 'b h c l -> b h l c') for t in normalize(qkv, dim=2).unbind(3))

                    attended = F.scaled_dot_product_attention(q, k, v)
                    merged = rearrange(attended, 'b h l c -> b (h c) l')

                    return mp_sum(x, self.proj_out(merged), t=0.3)
         | 
| 80 | 
            +
             | 
| 81 | 
            +
             | 
| 82 | 
            +
            class Upsample1D(nn.Module):
                """Double the last axis by nearest-exact interpolation, optionally followed by a conv.

                Args:
                    in_channels: Channel count (unchanged by this layer).
                    with_conv: If truthy, apply a kernel-size-3 `MPConv1D` after interpolation.
                """

                def __init__(self, in_channels, with_conv):
                    super().__init__()
                    self.with_conv = with_conv
                    if with_conv:
                        self.conv = MPConv1D(in_channels, in_channels, kernel_size=3)

                def forward(self, x):
                    """Return `x` upsampled by a factor of 2 along the last axis."""
                    # interpolate handles 3D (B, C, T) tensors directly
                    upsampled = F.interpolate(x, scale_factor=2.0, mode='nearest-exact')
                    if not self.with_conv:
                        return upsampled
                    return self.conv(upsampled)
         | 
| 95 | 
            +
             | 
| 96 | 
            +
             | 
| 97 | 
            +
            class Downsample1D(nn.Module):
                """Halve the last axis with stride-2 average pooling, optionally wrapped in 1x1 convs.

                Args:
                    in_channels: Channel count (unchanged by this layer).
                    with_conv: If truthy, apply a 1x1 `MPConv1D` before and after pooling.
                """

                def __init__(self, in_channels, with_conv):
                    super().__init__()
                    self.with_conv = with_conv
                    if with_conv:
                        # Pointwise convs on either side of the pooling step.
                        self.conv1 = MPConv1D(in_channels, in_channels, kernel_size=1)
                        self.conv2 = MPConv1D(in_channels, in_channels, kernel_size=1)

                def forward(self, x):
                    """Return `x` average-pooled by a factor of 2 along the last axis."""
                    if not self.with_conv:
                        return F.avg_pool1d(x, kernel_size=2, stride=2)

                    pooled = F.avg_pool1d(self.conv1(x), kernel_size=2, stride=2)
                    return self.conv2(pooled)
         | 
    	
        mmaudio/ext/bigvgan/LICENSE
    ADDED
    
    | @@ -0,0 +1,21 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            MIT License
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            Copyright (c) 2022 NVIDIA CORPORATION.
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            Permission is hereby granted, free of charge, to any person obtaining a copy
         | 
| 6 | 
            +
            of this software and associated documentation files (the "Software"), to deal
         | 
| 7 | 
            +
            in the Software without restriction, including without limitation the rights
         | 
| 8 | 
            +
            to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
         | 
| 9 | 
            +
            copies of the Software, and to permit persons to whom the Software is
         | 
| 10 | 
            +
            furnished to do so, subject to the following conditions:
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            The above copyright notice and this permission notice shall be included in all
         | 
| 13 | 
            +
            copies or substantial portions of the Software. 
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
         | 
| 16 | 
            +
            IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
         | 
| 17 | 
            +
            FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
         | 
| 18 | 
            +
            AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
         | 
| 19 | 
            +
            LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
         | 
| 20 | 
            +
            OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
         | 
| 21 | 
            +
            SOFTWARE.
         | 
    	
        mmaudio/ext/bigvgan/__init__.py
    ADDED
    
    | @@ -0,0 +1 @@ | |
|  | 
|  | |
| 1 | 
            +
            from .bigvgan import BigVGAN
         | 
    	
        mmaudio/ext/bigvgan/activations.py
    ADDED
    
    | @@ -0,0 +1,120 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
         | 
| 2 | 
            +
            #   LICENSE is in incl_licenses directory.
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import torch
         | 
| 5 | 
            +
            from torch import nn, sin, pow
         | 
| 6 | 
            +
            from torch.nn import Parameter
         | 
| 7 | 
            +
             | 
| 8 | 
            +
             | 
| 9 | 
            +
            class Snake(nn.Module):
                '''
                Sine-based periodic activation: Snake(x) := x + 1/alpha * sin^2(alpha * x)
                Shape:
                    - Input: (B, C, T)
                    - Output: (B, C, T), same shape as the input
                Parameters:
                    - alpha - per-channel parameter (trainable unless alpha_trainable is False)
                References:
                    - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
                    https://arxiv.org/abs/2006.08195
                Examples:
                    >>> a1 = Snake(256)
                    >>> x = torch.randn(1, 256, 100)
                    >>> x = a1(x)
                '''
                def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
                    '''
                    Initialization.
                    INPUT:
                        - in_features: number of channels (size of the alpha vector)
                        - alpha: initial value multiplied into alpha; higher values = higher-frequency
                        - alpha_trainable: sets requires_grad on alpha
                        - alpha_logscale: store alpha in log space; forward() then applies exp()
                    '''
                    super().__init__()
                    self.in_features = in_features
                    self.alpha_logscale = alpha_logscale

                    # Log-scale storage starts from zeros (times alpha, hence still zeros, i.e. exp -> 1);
                    # linear storage starts from ones scaled by alpha.
                    fill = torch.zeros if alpha_logscale else torch.ones
                    self.alpha = Parameter(fill(in_features) * alpha)
                    self.alpha.requires_grad = alpha_trainable

                    # Small epsilon added to the denominator in forward().
                    self.no_div_by_zero = 0.000000001

                def forward(self, x):
                    '''
                    Apply the activation elementwise and return a tensor shaped like x.
                    '''
                    # Add leading and trailing singleton axes so alpha broadcasts over [B, C, T].
                    freq = self.alpha[None, ..., None]
                    if self.alpha_logscale:
                        freq = torch.exp(freq)
                    periodic = pow(sin(x * freq), 2)
                    return x + (1.0 / (freq + self.no_div_by_zero)) * periodic
         | 
| 60 | 
            +
             | 
| 61 | 
            +
             | 
| 62 | 
            +
            class SnakeBeta(nn.Module):
                '''
                Snake variant with separate frequency and magnitude parameters:
                SnakeBeta(x) := x + 1/beta * sin^2(alpha * x)
                Shape:
                    - Input: (B, C, T)
                    - Output: (B, C, T), same shape as the input
                Parameters:
                    - alpha - per-channel parameter that controls frequency
                    - beta - per-channel parameter that controls magnitude
                References:
                    - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
                    https://arxiv.org/abs/2006.08195
                Examples:
                    >>> a1 = SnakeBeta(256)
                    >>> x = torch.randn(1, 256, 100)
                    >>> x = a1(x)
                '''
                def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
                    '''
                    Initialization.
                    INPUT:
                        - in_features: number of channels (size of the alpha and beta vectors)
                        - alpha: initial value multiplied into BOTH alpha and beta
                        - alpha_trainable: sets requires_grad on both alpha and beta
                        - alpha_logscale: store both parameters in log space; forward() applies exp()
                    '''
                    super().__init__()
                    self.in_features = in_features
                    self.alpha_logscale = alpha_logscale

                    # Log-scale storage starts from zeros, linear storage from ones; each parameter
                    # gets its own freshly allocated tensor.
                    fill = torch.zeros if alpha_logscale else torch.ones
                    self.alpha = Parameter(fill(in_features) * alpha)
                    self.beta = Parameter(fill(in_features) * alpha)

                    for param in (self.alpha, self.beta):
                        param.requires_grad = alpha_trainable

                    # Small epsilon added to the denominator in forward().
                    self.no_div_by_zero = 0.000000001

                def forward(self, x):
                    '''
                    Apply the activation elementwise and return a tensor shaped like x.
                    '''
                    # Add leading and trailing singleton axes so both broadcast over [B, C, T].
                    freq = self.alpha[None, ..., None]
                    magnitude = self.beta[None, ..., None]
                    if self.alpha_logscale:
                        freq = torch.exp(freq)
                        magnitude = torch.exp(magnitude)
                    periodic = pow(sin(x * freq), 2)
                    return x + (1.0 / (magnitude + self.no_div_by_zero)) * periodic
         | 
    	
        mmaudio/ext/bigvgan/alias_free_torch/__init__.py
    ADDED
    
    | @@ -0,0 +1,6 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
         | 
| 2 | 
            +
            #   LICENSE is in incl_licenses directory.
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            from .filter import *
         | 
| 5 | 
            +
            from .resample import *
         | 
| 6 | 
            +
            from .act import *
         | 
    	
        mmaudio/ext/bigvgan/alias_free_torch/act.py
    ADDED
    
    | @@ -0,0 +1,28 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
         | 
| 2 | 
            +
            #   LICENSE is in incl_licenses directory.
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import torch.nn as nn
         | 
| 5 | 
            +
            from .resample import UpSample1d, DownSample1d
         | 
| 6 | 
            +
             | 
| 7 | 
            +
             | 
| 8 | 
            +
            class Activation1d(nn.Module):
                '''
                Wrap an activation between an UpSample1d and a DownSample1d stage.

                Args:
                    activation: callable applied to the upsampled signal.
                    up_ratio / down_ratio: ratios passed to UpSample1d / DownSample1d.
                    up_kernel_size / down_kernel_size: filter lengths passed to the two resamplers.
                '''
                def __init__(self,
                             activation,
                             up_ratio: int = 2,
                             down_ratio: int = 2,
                             up_kernel_size: int = 12,
                             down_kernel_size: int = 12):
                    super().__init__()
                    self.up_ratio, self.down_ratio = up_ratio, down_ratio
                    self.act = activation
                    self.upsample = UpSample1d(up_ratio, up_kernel_size)
                    self.downsample = DownSample1d(down_ratio, down_kernel_size)

                # x: [B,C,T]
                def forward(self, x):
                    '''Upsample, apply the activation, then downsample.'''
                    return self.downsample(self.act(self.upsample(x)))
         | 
    	
        mmaudio/ext/bigvgan/alias_free_torch/filter.py
    ADDED
    
    | @@ -0,0 +1,95 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
         | 
| 2 | 
            +
            #   LICENSE is in incl_licenses directory.
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import torch
         | 
| 5 | 
            +
            import torch.nn as nn
         | 
| 6 | 
            +
            import torch.nn.functional as F
         | 
| 7 | 
            +
            import math
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            # This code is adopted from adefossez's julius.core.sinc under the MIT License
            # https://adefossez.github.io/julius/julius/core.html
            #   LICENSE is in incl_licenses directory.
            def _sinc_fallback(x: torch.Tensor):
                """
                Implementation of sinc, i.e. sin(pi * x) / (pi * x)
                __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
                """
                one = torch.tensor(1., device=x.device, dtype=x.dtype)
                ratio = torch.sin(math.pi * x) / math.pi / x
                # The ratio is undefined at exactly zero; substitute the limit value 1 there.
                return torch.where(x == 0, one, ratio)


            # Use the native implementation when this torch build exposes one.
            sinc = torch.sinc if 'sinc' in dir(torch) else _sinc_fallback
         | 
| 23 | 
            +
             | 
| 24 | 
            +
             | 
| 25 | 
            +
            # This code is adapted from adefossez's julius.lowpass.LowPassFilters under the MIT License
         | 
| 26 | 
            +
            # https://adefossez.github.io/julius/julius/lowpass.html
         | 
| 27 | 
            +
            #   LICENSE is in incl_licenses directory.
         | 
| 28 | 
            +
            def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size]
                """Build a Kaiser-windowed sinc low-pass kernel.

                Args:
                    cutoff: cutoff frequency; the sinc argument is ``2 * cutoff * time``.
                        ``0`` yields an all-zero kernel.
                    half_width: transition half-width; the window shape is derived from ``4 * half_width``.
                    kernel_size: number of taps (even or odd).

                Returns:
                    A float tensor of shape ``[1, 1, kernel_size]``. For a non-zero cutoff the taps
                    are normalized to sum to 1.
                """
                even = (kernel_size % 2 == 0)
                half_size = kernel_size // 2

                # Kaiser's empirical design formulas: derive A from the tap count and the
                # transition width, then map A to the window's beta parameter.
                delta_f = 4 * half_width
                A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
                if A > 50.:
                    beta = 0.1102 * (A - 8.7)
                elif A >= 21.:
                    beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.)
                else:
                    beta = 0.
                window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)

                # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
                # Sample positions are centred on zero; even sizes sit on half-integers.
                if even:
                    time = (torch.arange(-half_size, half_size) + 0.5)
                else:
                    time = torch.arange(kernel_size) - half_size

                if cutoff == 0:
                    # Take dtype and shape from the window so the result is floating point even for
                    # odd kernel sizes, where `time` is an integer tensor.
                    kernel = torch.zeros_like(window)
                else:
                    kernel = 2 * cutoff * window * sinc(2 * cutoff * time)
                    # Normalize filter to have sum = 1, otherwise we will have a small leakage
                    # of the constant component in the input signal.
                    kernel /= kernel.sum()

                # Reshape on every path: previously the zero-cutoff branch never assigned the
                # returned variable and raised UnboundLocalError.
                return kernel.view(1, 1, kernel_size)
         | 
| 58 | 
            +
             | 
| 59 | 
            +
             | 
| 60 | 
            +
            class LowPassFilter1d(nn.Module):
                '''
                Depthwise 1D low-pass filter built from kaiser_sinc_filter1d.

                Args:
                    cutoff: cutoff passed to the kernel builder; rejected if negative or above 0.5.
                    half_width: transition half-width passed to the kernel builder.
                    stride: stride of the convolution in forward().
                    padding: whether forward() pads the input before filtering.
                    padding_mode: mode string passed to F.pad.
                    kernel_size: number of filter taps.

                Raises:
                    ValueError: if cutoff is negative or greater than 0.5.
                '''
                def __init__(self,
                             cutoff=0.5,
                             half_width=0.6,
                             stride: int = 1,
                             padding: bool = True,
                             padding_mode: str = 'replicate',
                             kernel_size: int = 12):
                    # kernel_size should be even number for stylegan3 setup,
                    # in this implementation, odd number is also possible.
                    super().__init__()
                    if cutoff < 0.:
                        raise ValueError("Minimum cutoff must be larger than zero.")
                    if cutoff > 0.5:
                        raise ValueError("A cutoff above 0.5 does not make sense.")

                    is_even = (kernel_size % 2 == 0)
                    half = kernel_size // 2

                    self.kernel_size = kernel_size
                    self.even = is_even
                    # Even kernels pad one sample less on the left than on the right.
                    self.pad_left = half - int(is_even)
                    self.pad_right = half
                    self.stride = stride
                    self.padding = padding
                    self.padding_mode = padding_mode
                    self.register_buffer("filter", kaiser_sinc_filter1d(cutoff, half_width, kernel_size))

                #input [B, C, T]
                def forward(self, x):
                    '''Filter every channel of x independently with the stored kernel.'''
                    _batch, channels, _length = x.shape

                    if self.padding:
                        x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)

                    # One copy of the kernel per channel; groups=channels keeps channels separate.
                    per_channel = self.filter.expand(channels, -1, -1)
                    return F.conv1d(x, per_channel, stride=self.stride, groups=channels)
         | 
    	
        mmaudio/ext/bigvgan/alias_free_torch/resample.py
    ADDED
    
    | @@ -0,0 +1,49 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
         | 
| 2 | 
            +
            #   LICENSE is in incl_licenses directory.
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import torch.nn as nn
         | 
| 5 | 
            +
            from torch.nn import functional as F
         | 
| 6 | 
            +
            from .filter import LowPassFilter1d
         | 
| 7 | 
            +
            from .filter import kaiser_sinc_filter1d
         | 
| 8 | 
            +
             | 
| 9 | 
            +
             | 
| 10 | 
            +
            class UpSample1d(nn.Module):
                '''
                Upsample a [B, C, T] signal by an integer ratio using a Kaiser-windowed sinc kernel.

                Args:
                    ratio: upsampling factor; also the stride of the transposed convolution.
                    kernel_size: number of filter taps; defaults to int(6 * ratio // 2) * 2.
                '''
                def __init__(self, ratio=2, kernel_size=None):
                    super().__init__()
                    self.ratio = ratio
                    self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
                    self.stride = ratio
                    # Input padding, and the number of output samples trimmed from each edge.
                    self.pad = self.kernel_size // ratio - 1
                    self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
                    self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
                    kernel = kaiser_sinc_filter1d(cutoff=0.5 / ratio,
                                                  half_width=0.6 / ratio,
                                                  kernel_size=self.kernel_size)
                    self.register_buffer("filter", kernel)

                # x: [B, C, T]
                def forward(self, x):
                    '''Return x upsampled along the last axis.'''
                    _, C, _ = x.shape

                    x = F.pad(x, (self.pad, self.pad), mode='replicate')
                    # The output is multiplied by ratio after the depthwise transposed convolution.
                    x = self.ratio * F.conv_transpose1d(
                        x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
                    # Use an explicit end index: `-self.pad_right` would select nothing when
                    # pad_right == 0 (kernel_size == ratio).
                    x = x[..., self.pad_left:x.shape[-1] - self.pad_right]

                    return x
         | 
| 34 | 
            +
             | 
| 35 | 
            +
             | 
| 36 | 
            +
            class DownSample1d(nn.Module):
                '''
                Downsample a signal by an integer ratio with a strided LowPassFilter1d.

                Args:
                    ratio: downsampling factor, used as the low-pass filter's stride.
                    kernel_size: number of filter taps; defaults to int(6 * ratio // 2) * 2.
                '''
                def __init__(self, ratio=2, kernel_size=None):
                    super().__init__()
                    if kernel_size is None:
                        kernel_size = int(6 * ratio // 2) * 2
                    self.ratio = ratio
                    self.kernel_size = kernel_size
                    self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio,
                                                   half_width=0.6 / ratio,
                                                   stride=ratio,
                                                   kernel_size=kernel_size)

                def forward(self, x):
                    '''Return the low-pass filtered, strided version of x.'''
                    return self.lowpass(x)
         | 
    	
        mmaudio/ext/bigvgan/bigvgan.py
    ADDED
    
    | @@ -0,0 +1,32 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            from pathlib import Path
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            import torch
         | 
| 4 | 
            +
            import torch.nn as nn
         | 
| 5 | 
            +
            from omegaconf import OmegaConf
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            from mmaudio.ext.bigvgan.models import BigVGANVocoder
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            # Default vocoder config: the bigvgan_vocoder.yml file located in this module's directory.
            _bigvgan_vocoder_path = Path(__file__).parent / 'bigvgan_vocoder.yml'
         | 
| 10 | 
            +
             | 
| 11 | 
            +
             | 
| 12 | 
            +
            class BigVGAN(nn.Module):
                '''
                Inference wrapper around BigVGANVocoder that loads weights from a checkpoint.

                Args:
                    ckpt_path: checkpoint readable by torch.load; its 'generator' entry is used
                        as the vocoder state dict.
                    config_path: OmegaConf-loadable config used to build the vocoder.
                '''

                def __init__(self, ckpt_path, config_path=_bigvgan_vocoder_path):
                    super().__init__()
                    self.vocoder = BigVGANVocoder(OmegaConf.load(config_path)).eval()

                    # Deserialize on CPU with weights_only=True, then take the generator weights.
                    checkpoint = torch.load(ckpt_path, map_location='cpu', weights_only=True)
                    self.vocoder.load_state_dict(checkpoint['generator'])

                    # Weight norm is removed right after loading; forward() asserts on this flag.
                    self.weight_norm_removed = False
                    self.remove_weight_norm()

                @torch.inference_mode()
                def forward(self, x):
                    '''Run the vocoder on x under inference mode.'''
                    assert self.weight_norm_removed, 'call remove_weight_norm() before inference'
                    return self.vocoder(x)

                def remove_weight_norm(self):
                    '''Remove weight norm from the vocoder, record that it was done, and return self.'''
                    self.vocoder.remove_weight_norm()
                    self.weight_norm_removed = True
                    return self
         | 
    	
        mmaudio/ext/bigvgan/bigvgan_vocoder.yml
    ADDED
    
    | @@ -0,0 +1,63 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            resblock: '1'
         | 
| 2 | 
            +
            num_gpus: 0
         | 
| 3 | 
            +
            batch_size: 64
         | 
| 4 | 
            +
            num_mels: 80
         | 
| 5 | 
            +
            learning_rate: 0.0001
         | 
| 6 | 
            +
            adam_b1: 0.8
         | 
| 7 | 
            +
            adam_b2: 0.99
         | 
| 8 | 
            +
            lr_decay: 0.999
         | 
| 9 | 
            +
            seed: 1234
         | 
| 10 | 
            +
            upsample_rates:
         | 
| 11 | 
            +
            - 4
         | 
| 12 | 
            +
            - 4
         | 
| 13 | 
            +
            - 2
         | 
| 14 | 
            +
            - 2
         | 
| 15 | 
            +
            - 2
         | 
| 16 | 
            +
            - 2
         | 
| 17 | 
            +
            upsample_kernel_sizes:
         | 
| 18 | 
            +
            - 8
         | 
| 19 | 
            +
            - 8
         | 
| 20 | 
            +
            - 4
         | 
| 21 | 
            +
            - 4
         | 
| 22 | 
            +
            - 4
         | 
| 23 | 
            +
            - 4
         | 
| 24 | 
            +
            upsample_initial_channel: 1536
         | 
| 25 | 
            +
            resblock_kernel_sizes:
         | 
| 26 | 
            +
            - 3
         | 
| 27 | 
            +
            - 7
         | 
| 28 | 
            +
            - 11
         | 
| 29 | 
            +
            resblock_dilation_sizes:
         | 
| 30 | 
            +
            - - 1
         | 
| 31 | 
            +
              - 3
         | 
| 32 | 
            +
              - 5
         | 
| 33 | 
            +
            - - 1
         | 
| 34 | 
            +
              - 3
         | 
| 35 | 
            +
              - 5
         | 
| 36 | 
            +
            - - 1
         | 
| 37 | 
            +
              - 3
         | 
| 38 | 
            +
              - 5
         | 
| 39 | 
            +
            activation: snakebeta
         | 
| 40 | 
            +
            snake_logscale: true
         | 
| 41 | 
            +
            resolutions:
         | 
| 42 | 
            +
            - - 1024
         | 
| 43 | 
            +
              - 120
         | 
| 44 | 
            +
              - 600
         | 
| 45 | 
            +
            - - 2048
         | 
| 46 | 
            +
              - 240
         | 
| 47 | 
            +
              - 1200
         | 
| 48 | 
            +
            - - 512
         | 
| 49 | 
            +
              - 50
         | 
| 50 | 
            +
              - 240
         | 
| 51 | 
            +
            mpd_reshapes:
         | 
| 52 | 
            +
            - 2
         | 
| 53 | 
            +
            - 3
         | 
| 54 | 
            +
            - 5
         | 
| 55 | 
            +
            - 7
         | 
| 56 | 
            +
            - 11
         | 
| 57 | 
            +
            use_spectral_norm: false
         | 
| 58 | 
            +
            discriminator_channel_mult: 1
         | 
| 59 | 
            +
            num_workers: 4
         | 
| 60 | 
            +
            dist_config:
         | 
| 61 | 
            +
              dist_backend: nccl
         | 
| 62 | 
            +
              dist_url: tcp://localhost:54341
         | 
| 63 | 
            +
              world_size: 1
         | 
    	
        mmaudio/ext/bigvgan/env.py
    ADDED
    
    | @@ -0,0 +1,18 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Adapted from https://github.com/jik876/hifi-gan under the MIT license.
         | 
| 2 | 
            +
            #   LICENSE is in incl_licenses directory.
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import os
         | 
| 5 | 
            +
            import shutil
         | 
| 6 | 
            +
             | 
| 7 | 
            +
             | 
| 8 | 
            +
            class AttrDict(dict):
                """Dictionary whose items are also reachable as attributes."""

                def __init__(self, *args, **kwargs):
                    """Accept the same arguments as ``dict``."""
                    super().__init__(*args, **kwargs)
                    # Use the mapping itself as the instance namespace, so that
                    # ``obj.key`` and ``obj['key']`` read and write the same storage.
                    self.__dict__ = self
         | 
| 12 | 
            +
             | 
| 13 | 
            +
             | 
| 14 | 
            +
            def build_env(config, config_name, path):
                """Copy the config file ``config`` to ``path/config_name``.

                Nothing happens when ``config`` already is the destination file.
                Otherwise ``path`` is created if missing and the file is copied.

                Args:
                    config: Path of the source config file.
                    config_name: File name to give the copy inside ``path``.
                    path: Destination directory.
                """
                t_path = os.path.join(path, config_name)
                # Compare normalized absolute paths: differently spelled paths to the
                # same file (e.g. 'out/./cfg' vs 'out/cfg') must not reach copyfile,
                # which raises SameFileError when source and destination coincide.
                if os.path.abspath(config) == os.path.abspath(t_path):
                    return
                os.makedirs(path, exist_ok=True)
                try:
                    shutil.copyfile(config, t_path)
                except shutil.SameFileError:
                    # Same file reached through a link: the config is already in place.
                    pass
         | 
    	
        mmaudio/ext/bigvgan/incl_licenses/LICENSE_1
    ADDED
    
    | @@ -0,0 +1,21 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            MIT License
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            Copyright (c) 2020 Jungil Kong
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            Permission is hereby granted, free of charge, to any person obtaining a copy
         | 
| 6 | 
            +
            of this software and associated documentation files (the "Software"), to deal
         | 
| 7 | 
            +
            in the Software without restriction, including without limitation the rights
         | 
| 8 | 
            +
            to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
         | 
| 9 | 
            +
            copies of the Software, and to permit persons to whom the Software is
         | 
| 10 | 
            +
            furnished to do so, subject to the following conditions:
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            The above copyright notice and this permission notice shall be included in all
         | 
| 13 | 
            +
            copies or substantial portions of the Software.
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
         | 
| 16 | 
            +
            IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
         | 
| 17 | 
            +
            FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
         | 
| 18 | 
            +
            AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
         | 
| 19 | 
            +
            LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
         | 
| 20 | 
            +
            OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
         | 
| 21 | 
            +
            SOFTWARE.
         | 
    	
        mmaudio/ext/bigvgan/incl_licenses/LICENSE_2
    ADDED
    
    | @@ -0,0 +1,21 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            MIT License
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            Copyright (c) 2020 Edward Dixon
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            Permission is hereby granted, free of charge, to any person obtaining a copy
         | 
| 6 | 
            +
            of this software and associated documentation files (the "Software"), to deal
         | 
| 7 | 
            +
            in the Software without restriction, including without limitation the rights
         | 
| 8 | 
            +
            to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
         | 
| 9 | 
            +
            copies of the Software, and to permit persons to whom the Software is
         | 
| 10 | 
            +
            furnished to do so, subject to the following conditions:
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            The above copyright notice and this permission notice shall be included in all
         | 
| 13 | 
            +
            copies or substantial portions of the Software.
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
         | 
| 16 | 
            +
            IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
         | 
| 17 | 
            +
            FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
         | 
| 18 | 
            +
            AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
         | 
| 19 | 
            +
            LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
         | 
| 20 | 
            +
            OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
         | 
| 21 | 
            +
            SOFTWARE.
         | 
    	
        mmaudio/ext/bigvgan/incl_licenses/LICENSE_3
    ADDED
    
    | @@ -0,0 +1,201 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
                                            Apache License
         | 
| 2 | 
            +
                                       Version 2.0, January 2004
         | 
| 3 | 
            +
                                    http://www.apache.org/licenses/
         | 
| 4 | 
            +
             | 
| 5 | 
            +
               TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
         | 
| 6 | 
            +
             | 
| 7 | 
            +
               1. Definitions.
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                  "License" shall mean the terms and conditions for use, reproduction,
         | 
| 10 | 
            +
                  and distribution as defined by Sections 1 through 9 of this document.
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                  "Licensor" shall mean the copyright owner or entity authorized by
         | 
| 13 | 
            +
                  the copyright owner that is granting the License.
         | 
| 14 | 
            +
             | 
| 15 | 
            +
                  "Legal Entity" shall mean the union of the acting entity and all
         | 
| 16 | 
            +
                  other entities that control, are controlled by, or are under common
         | 
| 17 | 
            +
                  control with that entity. For the purposes of this definition,
         | 
| 18 | 
            +
                  "control" means (i) the power, direct or indirect, to cause the
         | 
| 19 | 
            +
                  direction or management of such entity, whether by contract or
         | 
| 20 | 
            +
                  otherwise, or (ii) ownership of fifty percent (50%) or more of the
         | 
| 21 | 
            +
                  outstanding shares, or (iii) beneficial ownership of such entity.
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                  "You" (or "Your") shall mean an individual or Legal Entity
         | 
| 24 | 
            +
                  exercising permissions granted by this License.
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                  "Source" form shall mean the preferred form for making modifications,
         | 
| 27 | 
            +
                  including but not limited to software source code, documentation
         | 
| 28 | 
            +
                  source, and configuration files.
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                  "Object" form shall mean any form resulting from mechanical
         | 
| 31 | 
            +
                  transformation or translation of a Source form, including but
         | 
| 32 | 
            +
                  not limited to compiled object code, generated documentation,
         | 
| 33 | 
            +
                  and conversions to other media types.
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                  "Work" shall mean the work of authorship, whether in Source or
         | 
| 36 | 
            +
                  Object form, made available under the License, as indicated by a
         | 
| 37 | 
            +
                  copyright notice that is included in or attached to the work
         | 
| 38 | 
            +
                  (an example is provided in the Appendix below).
         | 
| 39 | 
            +
             | 
| 40 | 
            +
                  "Derivative Works" shall mean any work, whether in Source or Object
         | 
| 41 | 
            +
                  form, that is based on (or derived from) the Work and for which the
         | 
| 42 | 
            +
                  editorial revisions, annotations, elaborations, or other modifications
         | 
| 43 | 
            +
                  represent, as a whole, an original work of authorship. For the purposes
         | 
| 44 | 
            +
                  of this License, Derivative Works shall not include works that remain
         | 
| 45 | 
            +
                  separable from, or merely link (or bind by name) to the interfaces of,
         | 
| 46 | 
            +
                  the Work and Derivative Works thereof.
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                  "Contribution" shall mean any work of authorship, including
         | 
| 49 | 
            +
                  the original version of the Work and any modifications or additions
         | 
| 50 | 
            +
                  to that Work or Derivative Works thereof, that is intentionally
         | 
| 51 | 
            +
                  submitted to Licensor for inclusion in the Work by the copyright owner
         | 
| 52 | 
            +
                  or by an individual or Legal Entity authorized to submit on behalf of
         | 
| 53 | 
            +
                  the copyright owner. For the purposes of this definition, "submitted"
         | 
| 54 | 
            +
                  means any form of electronic, verbal, or written communication sent
         | 
| 55 | 
            +
                  to the Licensor or its representatives, including but not limited to
         | 
| 56 | 
            +
                  communication on electronic mailing lists, source code control systems,
         | 
| 57 | 
            +
                  and issue tracking systems that are managed by, or on behalf of, the
         | 
| 58 | 
            +
                  Licensor for the purpose of discussing and improving the Work, but
         | 
| 59 | 
            +
                  excluding communication that is conspicuously marked or otherwise
         | 
| 60 | 
            +
                  designated in writing by the copyright owner as "Not a Contribution."
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                  "Contributor" shall mean Licensor and any individual or Legal Entity
         | 
| 63 | 
            +
                  on behalf of whom a Contribution has been received by Licensor and
         | 
| 64 | 
            +
                  subsequently incorporated within the Work.
         | 
| 65 | 
            +
             | 
| 66 | 
            +
               2. Grant of Copyright License. Subject to the terms and conditions of
         | 
| 67 | 
            +
                  this License, each Contributor hereby grants to You a perpetual,
         | 
| 68 | 
            +
                  worldwide, non-exclusive, no-charge, royalty-free, irrevocable
         | 
| 69 | 
            +
                  copyright license to reproduce, prepare Derivative Works of,
         | 
| 70 | 
            +
                  publicly display, publicly perform, sublicense, and distribute the
         | 
| 71 | 
            +
                  Work and such Derivative Works in Source or Object form.
         | 
| 72 | 
            +
             | 
| 73 | 
            +
               3. Grant of Patent License. Subject to the terms and conditions of
         | 
| 74 | 
            +
                  this License, each Contributor hereby grants to You a perpetual,
         | 
| 75 | 
            +
                  worldwide, non-exclusive, no-charge, royalty-free, irrevocable
         | 
| 76 | 
            +
                  (except as stated in this section) patent license to make, have made,
         | 
| 77 | 
            +
                  use, offer to sell, sell, import, and otherwise transfer the Work,
         | 
| 78 | 
            +
                  where such license applies only to those patent claims licensable
         | 
| 79 | 
            +
                  by such Contributor that are necessarily infringed by their
         | 
| 80 | 
            +
                  Contribution(s) alone or by combination of their Contribution(s)
         | 
| 81 | 
            +
                  with the Work to which such Contribution(s) was submitted. If You
         | 
| 82 | 
            +
                  institute patent litigation against any entity (including a
         | 
| 83 | 
            +
                  cross-claim or counterclaim in a lawsuit) alleging that the Work
         | 
| 84 | 
            +
                  or a Contribution incorporated within the Work constitutes direct
         | 
| 85 | 
            +
                  or contributory patent infringement, then any patent licenses
         | 
| 86 | 
            +
                  granted to You under this License for that Work shall terminate
         | 
| 87 | 
            +
                  as of the date such litigation is filed.
         | 
| 88 | 
            +
             | 
| 89 | 
            +
               4. Redistribution. You may reproduce and distribute copies of the
         | 
| 90 | 
            +
                  Work or Derivative Works thereof in any medium, with or without
         | 
| 91 | 
            +
                  modifications, and in Source or Object form, provided that You
         | 
| 92 | 
            +
                  meet the following conditions:
         | 
| 93 | 
            +
             | 
| 94 | 
            +
                  (a) You must give any other recipients of the Work or
         | 
| 95 | 
            +
                      Derivative Works a copy of this License; and
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                  (b) You must cause any modified files to carry prominent notices
         | 
| 98 | 
            +
                      stating that You changed the files; and
         | 
| 99 | 
            +
             | 
| 100 | 
            +
                  (c) You must retain, in the Source form of any Derivative Works
         | 
| 101 | 
            +
                      that You distribute, all copyright, patent, trademark, and
         | 
| 102 | 
            +
                      attribution notices from the Source form of the Work,
         | 
| 103 | 
            +
                      excluding those notices that do not pertain to any part of
         | 
| 104 | 
            +
                      the Derivative Works; and
         | 
| 105 | 
            +
             | 
| 106 | 
            +
                  (d) If the Work includes a "NOTICE" text file as part of its
         | 
| 107 | 
            +
                      distribution, then any Derivative Works that You distribute must
         | 
| 108 | 
            +
                      include a readable copy of the attribution notices contained
         | 
| 109 | 
            +
                      within such NOTICE file, excluding those notices that do not
         | 
| 110 | 
            +
                      pertain to any part of the Derivative Works, in at least one
         | 
| 111 | 
            +
                      of the following places: within a NOTICE text file distributed
         | 
| 112 | 
            +
                      as part of the Derivative Works; within the Source form or
         | 
| 113 | 
            +
                      documentation, if provided along with the Derivative Works; or,
         | 
| 114 | 
            +
                      within a display generated by the Derivative Works, if and
         | 
| 115 | 
            +
                      wherever such third-party notices normally appear. The contents
         | 
| 116 | 
            +
                      of the NOTICE file are for informational purposes only and
         | 
| 117 | 
            +
                      do not modify the License. You may add Your own attribution
         | 
| 118 | 
            +
                      notices within Derivative Works that You distribute, alongside
         | 
| 119 | 
            +
                      or as an addendum to the NOTICE text from the Work, provided
         | 
| 120 | 
            +
                      that such additional attribution notices cannot be construed
         | 
| 121 | 
            +
                      as modifying the License.
         | 
| 122 | 
            +
             | 
| 123 | 
            +
                  You may add Your own copyright statement to Your modifications and
         | 
| 124 | 
            +
                  may provide additional or different license terms and conditions
         | 
| 125 | 
            +
                  for use, reproduction, or distribution of Your modifications, or
         | 
| 126 | 
            +
                  for any such Derivative Works as a whole, provided Your use,
         | 
| 127 | 
            +
                  reproduction, and distribution of the Work otherwise complies with
         | 
| 128 | 
            +
                  the conditions stated in this License.
         | 
| 129 | 
            +
             | 
| 130 | 
            +
               5. Submission of Contributions. Unless You explicitly state otherwise,
         | 
| 131 | 
            +
                  any Contribution intentionally submitted for inclusion in the Work
         | 
| 132 | 
            +
                  by You to the Licensor shall be under the terms and conditions of
         | 
| 133 | 
            +
                  this License, without any additional terms or conditions.
         | 
| 134 | 
            +
                  Notwithstanding the above, nothing herein shall supersede or modify
         | 
| 135 | 
            +
                  the terms of any separate license agreement you may have executed
         | 
| 136 | 
            +
                  with Licensor regarding such Contributions.
         | 
| 137 | 
            +
             | 
| 138 | 
            +
               6. Trademarks. This License does not grant permission to use the trade
         | 
| 139 | 
            +
                  names, trademarks, service marks, or product names of the Licensor,
         | 
| 140 | 
            +
                  except as required for reasonable and customary use in describing the
         | 
| 141 | 
            +
                  origin of the Work and reproducing the content of the NOTICE file.
         | 
| 142 | 
            +
             | 
| 143 | 
            +
               7. Disclaimer of Warranty. Unless required by applicable law or
         | 
| 144 | 
            +
                  agreed to in writing, Licensor provides the Work (and each
         | 
| 145 | 
            +
                  Contributor provides its Contributions) on an "AS IS" BASIS,
         | 
| 146 | 
            +
                  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
         | 
| 147 | 
            +
                  implied, including, without limitation, any warranties or conditions
         | 
| 148 | 
            +
                  of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
         | 
| 149 | 
            +
                  PARTICULAR PURPOSE. You are solely responsible for determining the
         | 
| 150 | 
            +
                  appropriateness of using or redistributing the Work and assume any
         | 
| 151 | 
            +
                  risks associated with Your exercise of permissions under this License.
         | 
| 152 | 
            +
             | 
| 153 | 
            +
               8. Limitation of Liability. In no event and under no legal theory,
         | 
| 154 | 
            +
                  whether in tort (including negligence), contract, or otherwise,
         | 
| 155 | 
            +
                  unless required by applicable law (such as deliberate and grossly
         | 
| 156 | 
            +
                  negligent acts) or agreed to in writing, shall any Contributor be
         | 
| 157 | 
            +
                  liable to You for damages, including any direct, indirect, special,
         | 
| 158 | 
            +
                  incidental, or consequential damages of any character arising as a
         | 
| 159 | 
            +
                  result of this License or out of the use or inability to use the
         | 
| 160 | 
            +
                  Work (including but not limited to damages for loss of goodwill,
         | 
| 161 | 
            +
                  work stoppage, computer failure or malfunction, or any and all
         | 
| 162 | 
            +
                  other commercial damages or losses), even if such Contributor
         | 
| 163 | 
            +
                  has been advised of the possibility of such damages.
         | 
| 164 | 
            +
             | 
| 165 | 
            +
               9. Accepting Warranty or Additional Liability. While redistributing
         | 
| 166 | 
            +
                  the Work or Derivative Works thereof, You may choose to offer,
         | 
| 167 | 
            +
                  and charge a fee for, acceptance of support, warranty, indemnity,
         | 
| 168 | 
            +
                  or other liability obligations and/or rights consistent with this
         | 
| 169 | 
            +
                  License. However, in accepting such obligations, You may act only
         | 
| 170 | 
            +
                  on Your own behalf and on Your sole responsibility, not on behalf
         | 
| 171 | 
            +
                  of any other Contributor, and only if You agree to indemnify,
         | 
| 172 | 
            +
                  defend, and hold each Contributor harmless for any liability
         | 
| 173 | 
            +
                  incurred by, or claims asserted against, such Contributor by reason
         | 
| 174 | 
            +
                  of your accepting any such warranty or additional liability.
         | 
| 175 | 
            +
             | 
| 176 | 
            +
               END OF TERMS AND CONDITIONS
         | 
| 177 | 
            +
             | 
| 178 | 
            +
               APPENDIX: How to apply the Apache License to your work.
         | 
| 179 | 
            +
             | 
| 180 | 
            +
                  To apply the Apache License to your work, attach the following
         | 
| 181 | 
            +
                  boilerplate notice, with the fields enclosed by brackets "[]"
         | 
| 182 | 
            +
                  replaced with your own identifying information. (Don't include
         | 
| 183 | 
            +
                  the brackets!)  The text should be enclosed in the appropriate
         | 
| 184 | 
            +
                  comment syntax for the file format. We also recommend that a
         | 
| 185 | 
            +
                  file or class name and description of purpose be included on the
         | 
| 186 | 
            +
                  same "printed page" as the copyright notice for easier
         | 
| 187 | 
            +
                  identification within third-party archives.
         | 
| 188 | 
            +
             | 
| 189 | 
            +
               Copyright [yyyy] [name of copyright owner]
         | 
| 190 | 
            +
             | 
| 191 | 
            +
               Licensed under the Apache License, Version 2.0 (the "License");
         | 
| 192 | 
            +
               you may not use this file except in compliance with the License.
         | 
| 193 | 
            +
               You may obtain a copy of the License at
         | 
| 194 | 
            +
             | 
| 195 | 
            +
                   http://www.apache.org/licenses/LICENSE-2.0
         | 
| 196 | 
            +
             | 
| 197 | 
            +
               Unless required by applicable law or agreed to in writing, software
         | 
| 198 | 
            +
               distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 199 | 
            +
               WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 200 | 
            +
               See the License for the specific language governing permissions and
         | 
| 201 | 
            +
               limitations under the License.
         | 
    	
        mmaudio/ext/bigvgan/incl_licenses/LICENSE_4
    ADDED
    
    | @@ -0,0 +1,29 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            BSD 3-Clause License
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            Copyright (c) 2019, Seungwon Park 박승원
         | 
| 4 | 
            +
            All rights reserved.
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            Redistribution and use in source and binary forms, with or without
         | 
| 7 | 
            +
            modification, are permitted provided that the following conditions are met:
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            1. Redistributions of source code must retain the above copyright notice, this
         | 
| 10 | 
            +
               list of conditions and the following disclaimer.
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            2. Redistributions in binary form must reproduce the above copyright notice,
         | 
| 13 | 
            +
               this list of conditions and the following disclaimer in the documentation
         | 
| 14 | 
            +
               and/or other materials provided with the distribution.
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            3. Neither the name of the copyright holder nor the names of its
         | 
| 17 | 
            +
               contributors may be used to endorse or promote products derived from
         | 
| 18 | 
            +
               this software without specific prior written permission.
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
         | 
| 21 | 
            +
            AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
         | 
| 22 | 
            +
            IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
         | 
| 23 | 
            +
            DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
         | 
| 24 | 
            +
            FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
         | 
| 25 | 
            +
            DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
         | 
| 26 | 
            +
            SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
         | 
| 27 | 
            +
            CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
         | 
| 28 | 
            +
            OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
         | 
| 29 | 
            +
            OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         | 
    	
        mmaudio/ext/bigvgan/incl_licenses/LICENSE_5
    ADDED
    
    | @@ -0,0 +1,16 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            Copyright 2020 Alexandre Défossez
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
         | 
| 4 | 
            +
            associated documentation files (the "Software"), to deal in the Software without restriction,
         | 
| 5 | 
            +
            including without limitation the rights to use, copy, modify, merge, publish, distribute,
         | 
| 6 | 
            +
            sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
         | 
| 7 | 
            +
            furnished to do so, subject to the following conditions:
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            The above copyright notice and this permission notice shall be included in all copies or
         | 
| 10 | 
            +
            substantial portions of the Software.
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
         | 
| 13 | 
            +
            NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
         | 
| 14 | 
            +
            NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
         | 
| 15 | 
            +
            DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
         | 
| 16 | 
            +
            OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
         | 
    	
        mmaudio/ext/bigvgan/models.py
    ADDED
    
    | @@ -0,0 +1,255 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Copyright (c) 2022 NVIDIA CORPORATION.
         | 
| 2 | 
            +
            #   Licensed under the MIT license.
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            # Adapted from https://github.com/jik876/hifi-gan under the MIT license.
         | 
| 5 | 
            +
            #   LICENSE is in incl_licenses directory.
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            import torch
         | 
| 8 | 
            +
            import torch.nn as nn
         | 
| 9 | 
            +
            from torch.nn import Conv1d, ConvTranspose1d
         | 
| 10 | 
            +
            from torch.nn.utils.parametrizations import weight_norm
         | 
| 11 | 
            +
            from torch.nn.utils.parametrize import remove_parametrizations
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            from mmaudio.ext.bigvgan import activations
         | 
| 14 | 
            +
            from mmaudio.ext.bigvgan.alias_free_torch import *
         | 
| 15 | 
            +
            from mmaudio.ext.bigvgan.utils import get_padding, init_weights
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            LRELU_SLOPE = 0.1
         | 
| 18 | 
            +
             | 
| 19 | 
            +
             | 
| 20 | 
            +
            class AMPBlock1(torch.nn.Module):
         | 
| 21 | 
            +
             | 
| 22 | 
            +
                def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5), activation=None):
         | 
| 23 | 
            +
                    super(AMPBlock1, self).__init__()
         | 
| 24 | 
            +
                    self.h = h
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                    self.convs1 = nn.ModuleList([
         | 
| 27 | 
            +
                        weight_norm(
         | 
| 28 | 
            +
                            Conv1d(channels,
         | 
| 29 | 
            +
                                   channels,
         | 
| 30 | 
            +
                                   kernel_size,
         | 
| 31 | 
            +
                                   1,
         | 
| 32 | 
            +
                                   dilation=dilation[0],
         | 
| 33 | 
            +
                                   padding=get_padding(kernel_size, dilation[0]))),
         | 
| 34 | 
            +
                        weight_norm(
         | 
| 35 | 
            +
                            Conv1d(channels,
         | 
| 36 | 
            +
                                   channels,
         | 
| 37 | 
            +
                                   kernel_size,
         | 
| 38 | 
            +
                                   1,
         | 
| 39 | 
            +
                                   dilation=dilation[1],
         | 
| 40 | 
            +
                                   padding=get_padding(kernel_size, dilation[1]))),
         | 
| 41 | 
            +
                        weight_norm(
         | 
| 42 | 
            +
                            Conv1d(channels,
         | 
| 43 | 
            +
                                   channels,
         | 
| 44 | 
            +
                                   kernel_size,
         | 
| 45 | 
            +
                                   1,
         | 
| 46 | 
            +
                                   dilation=dilation[2],
         | 
| 47 | 
            +
                                   padding=get_padding(kernel_size, dilation[2])))
         | 
| 48 | 
            +
                    ])
         | 
| 49 | 
            +
                    self.convs1.apply(init_weights)
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                    self.convs2 = nn.ModuleList([
         | 
| 52 | 
            +
                        weight_norm(
         | 
| 53 | 
            +
                            Conv1d(channels,
         | 
| 54 | 
            +
                                   channels,
         | 
| 55 | 
            +
                                   kernel_size,
         | 
| 56 | 
            +
                                   1,
         | 
| 57 | 
            +
                                   dilation=1,
         | 
| 58 | 
            +
                                   padding=get_padding(kernel_size, 1))),
         | 
| 59 | 
            +
                        weight_norm(
         | 
| 60 | 
            +
                            Conv1d(channels,
         | 
| 61 | 
            +
                                   channels,
         | 
| 62 | 
            +
                                   kernel_size,
         | 
| 63 | 
            +
                                   1,
         | 
| 64 | 
            +
                                   dilation=1,
         | 
| 65 | 
            +
                                   padding=get_padding(kernel_size, 1))),
         | 
| 66 | 
            +
                        weight_norm(
         | 
| 67 | 
            +
                            Conv1d(channels,
         | 
| 68 | 
            +
                                   channels,
         | 
| 69 | 
            +
                                   kernel_size,
         | 
| 70 | 
            +
                                   1,
         | 
| 71 | 
            +
                                   dilation=1,
         | 
| 72 | 
            +
                                   padding=get_padding(kernel_size, 1)))
         | 
| 73 | 
            +
                    ])
         | 
| 74 | 
            +
                    self.convs2.apply(init_weights)
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                    self.num_layers = len(self.convs1) + len(self.convs2)  # total number of conv layers
         | 
| 77 | 
            +
             | 
| 78 | 
            +
                    if activation == 'snake':  # periodic nonlinearity with snake function and anti-aliasing
         | 
| 79 | 
            +
                        self.activations = nn.ModuleList([
         | 
| 80 | 
            +
                            Activation1d(
         | 
| 81 | 
            +
                                activation=activations.Snake(channels, alpha_logscale=h.snake_logscale))
         | 
| 82 | 
            +
                            for _ in range(self.num_layers)
         | 
| 83 | 
            +
                        ])
         | 
| 84 | 
            +
                    elif activation == 'snakebeta':  # periodic nonlinearity with snakebeta function and anti-aliasing
         | 
| 85 | 
            +
                        self.activations = nn.ModuleList([
         | 
| 86 | 
            +
                            Activation1d(
         | 
| 87 | 
            +
                                activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale))
         | 
| 88 | 
            +
                            for _ in range(self.num_layers)
         | 
| 89 | 
            +
                        ])
         | 
| 90 | 
            +
                    else:
         | 
| 91 | 
            +
                        raise NotImplementedError(
         | 
| 92 | 
            +
                            "activation incorrectly specified. check the config file and look for 'activation'."
         | 
| 93 | 
            +
                        )
         | 
| 94 | 
            +
             | 
| 95 | 
            +
                def forward(self, x):
         | 
| 96 | 
            +
                    acts1, acts2 = self.activations[::2], self.activations[1::2]
         | 
| 97 | 
            +
                    for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2):
         | 
| 98 | 
            +
                        xt = a1(x)
         | 
| 99 | 
            +
                        xt = c1(xt)
         | 
| 100 | 
            +
                        xt = a2(xt)
         | 
| 101 | 
            +
                        xt = c2(xt)
         | 
| 102 | 
            +
                        x = xt + x
         | 
| 103 | 
            +
             | 
| 104 | 
            +
                    return x
         | 
| 105 | 
            +
             | 
| 106 | 
            +
                def remove_weight_norm(self):
         | 
| 107 | 
            +
                    for l in self.convs1:
         | 
| 108 | 
            +
                        remove_parametrizations(l, 'weight')
         | 
| 109 | 
            +
                    for l in self.convs2:
         | 
| 110 | 
            +
                        remove_parametrizations(l, 'weight')
         | 
| 111 | 
            +
             | 
| 112 | 
            +
             | 
| 113 | 
            +
            class AMPBlock2(torch.nn.Module):
         | 
| 114 | 
            +
             | 
| 115 | 
            +
                def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), activation=None):
         | 
| 116 | 
            +
                    super(AMPBlock2, self).__init__()
         | 
| 117 | 
            +
                    self.h = h
         | 
| 118 | 
            +
             | 
| 119 | 
            +
                    self.convs = nn.ModuleList([
         | 
| 120 | 
            +
                        weight_norm(
         | 
| 121 | 
            +
                            Conv1d(channels,
         | 
| 122 | 
            +
                                   channels,
         | 
| 123 | 
            +
                                   kernel_size,
         | 
| 124 | 
            +
                                   1,
         | 
| 125 | 
            +
                                   dilation=dilation[0],
         | 
| 126 | 
            +
                                   padding=get_padding(kernel_size, dilation[0]))),
         | 
| 127 | 
            +
                        weight_norm(
         | 
| 128 | 
            +
                            Conv1d(channels,
         | 
| 129 | 
            +
                                   channels,
         | 
| 130 | 
            +
                                   kernel_size,
         | 
| 131 | 
            +
                                   1,
         | 
| 132 | 
            +
                                   dilation=dilation[1],
         | 
| 133 | 
            +
                                   padding=get_padding(kernel_size, dilation[1])))
         | 
| 134 | 
            +
                    ])
         | 
| 135 | 
            +
                    self.convs.apply(init_weights)
         | 
| 136 | 
            +
             | 
| 137 | 
            +
                    self.num_layers = len(self.convs)  # total number of conv layers
         | 
| 138 | 
            +
             | 
| 139 | 
            +
                    if activation == 'snake':  # periodic nonlinearity with snake function and anti-aliasing
         | 
| 140 | 
            +
                        self.activations = nn.ModuleList([
         | 
| 141 | 
            +
                            Activation1d(
         | 
| 142 | 
            +
                                activation=activations.Snake(channels, alpha_logscale=h.snake_logscale))
         | 
| 143 | 
            +
                            for _ in range(self.num_layers)
         | 
| 144 | 
            +
                        ])
         | 
| 145 | 
            +
                    elif activation == 'snakebeta':  # periodic nonlinearity with snakebeta function and anti-aliasing
         | 
| 146 | 
            +
                        self.activations = nn.ModuleList([
         | 
| 147 | 
            +
                            Activation1d(
         | 
| 148 | 
            +
                                activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale))
         | 
| 149 | 
            +
                            for _ in range(self.num_layers)
         | 
| 150 | 
            +
                        ])
         | 
| 151 | 
            +
                    else:
         | 
| 152 | 
            +
                        raise NotImplementedError(
         | 
| 153 | 
            +
                            "activation incorrectly specified. check the config file and look for 'activation'."
         | 
| 154 | 
            +
                        )
         | 
| 155 | 
            +
             | 
| 156 | 
            +
                def forward(self, x):
         | 
| 157 | 
            +
                    for c, a in zip(self.convs, self.activations):
         | 
| 158 | 
            +
                        xt = a(x)
         | 
| 159 | 
            +
                        xt = c(xt)
         | 
| 160 | 
            +
                        x = xt + x
         | 
| 161 | 
            +
             | 
| 162 | 
            +
                    return x
         | 
| 163 | 
            +
             | 
| 164 | 
            +
                def remove_weight_norm(self):
         | 
| 165 | 
            +
                    for l in self.convs:
         | 
| 166 | 
            +
                        remove_parametrizations(l, 'weight')
         | 
| 167 | 
            +
             | 
| 168 | 
            +
             | 
| 169 | 
            +
            class BigVGANVocoder(torch.nn.Module):
         | 
| 170 | 
            +
                # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks.
         | 
| 171 | 
            +
                def __init__(self, h):
         | 
| 172 | 
            +
                    super().__init__()
         | 
| 173 | 
            +
                    self.h = h
         | 
| 174 | 
            +
             | 
| 175 | 
            +
                    self.num_kernels = len(h.resblock_kernel_sizes)
         | 
| 176 | 
            +
                    self.num_upsamples = len(h.upsample_rates)
         | 
| 177 | 
            +
             | 
| 178 | 
            +
                    # pre conv
         | 
| 179 | 
            +
                    self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3))
         | 
| 180 | 
            +
             | 
| 181 | 
            +
                    # define which AMPBlock to use. BigVGAN uses AMPBlock1 as default
         | 
| 182 | 
            +
                    resblock = AMPBlock1 if h.resblock == '1' else AMPBlock2
         | 
| 183 | 
            +
             | 
| 184 | 
            +
                    # transposed conv-based upsamplers. does not apply anti-aliasing
         | 
| 185 | 
            +
                    self.ups = nn.ModuleList()
         | 
| 186 | 
            +
                    for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
         | 
| 187 | 
            +
                        self.ups.append(
         | 
| 188 | 
            +
                            nn.ModuleList([
         | 
| 189 | 
            +
                                weight_norm(
         | 
| 190 | 
            +
                                    ConvTranspose1d(h.upsample_initial_channel // (2**i),
         | 
| 191 | 
            +
                                                    h.upsample_initial_channel // (2**(i + 1)),
         | 
| 192 | 
            +
                                                    k,
         | 
| 193 | 
            +
                                                    u,
         | 
| 194 | 
            +
                                                    padding=(k - u) // 2))
         | 
| 195 | 
            +
                            ]))
         | 
| 196 | 
            +
             | 
| 197 | 
            +
                    # residual blocks using anti-aliased multi-periodicity composition modules (AMP)
         | 
| 198 | 
            +
                    self.resblocks = nn.ModuleList()
         | 
| 199 | 
            +
                    for i in range(len(self.ups)):
         | 
| 200 | 
            +
                        ch = h.upsample_initial_channel // (2**(i + 1))
         | 
| 201 | 
            +
                        for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
         | 
| 202 | 
            +
                            self.resblocks.append(resblock(h, ch, k, d, activation=h.activation))
         | 
| 203 | 
            +
             | 
| 204 | 
            +
                    # post conv
         | 
| 205 | 
            +
                    if h.activation == "snake":  # periodic nonlinearity with snake function and anti-aliasing
         | 
| 206 | 
            +
                        activation_post = activations.Snake(ch, alpha_logscale=h.snake_logscale)
         | 
| 207 | 
            +
                        self.activation_post = Activation1d(activation=activation_post)
         | 
| 208 | 
            +
                    elif h.activation == "snakebeta":  # periodic nonlinearity with snakebeta function and anti-aliasing
         | 
| 209 | 
            +
                        activation_post = activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale)
         | 
| 210 | 
            +
                        self.activation_post = Activation1d(activation=activation_post)
         | 
| 211 | 
            +
                    else:
         | 
| 212 | 
            +
                        raise NotImplementedError(
         | 
| 213 | 
            +
                            "activation incorrectly specified. check the config file and look for 'activation'."
         | 
| 214 | 
            +
                        )
         | 
| 215 | 
            +
             | 
| 216 | 
            +
                    self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
         | 
| 217 | 
            +
             | 
| 218 | 
            +
                    # weight initialization
         | 
| 219 | 
            +
                    for i in range(len(self.ups)):
         | 
| 220 | 
            +
                        self.ups[i].apply(init_weights)
         | 
| 221 | 
            +
                    self.conv_post.apply(init_weights)
         | 
| 222 | 
            +
             | 
| 223 | 
            +
                def forward(self, x):
         | 
| 224 | 
            +
                    # pre conv
         | 
| 225 | 
            +
                    x = self.conv_pre(x)
         | 
| 226 | 
            +
             | 
| 227 | 
            +
                    for i in range(self.num_upsamples):
         | 
| 228 | 
            +
                        # upsampling
         | 
| 229 | 
            +
                        for i_up in range(len(self.ups[i])):
         | 
| 230 | 
            +
                            x = self.ups[i][i_up](x)
         | 
| 231 | 
            +
                        # AMP blocks
         | 
| 232 | 
            +
                        xs = None
         | 
| 233 | 
            +
                        for j in range(self.num_kernels):
         | 
| 234 | 
            +
                            if xs is None:
         | 
| 235 | 
            +
                                xs = self.resblocks[i * self.num_kernels + j](x)
         | 
| 236 | 
            +
                            else:
         | 
| 237 | 
            +
                                xs += self.resblocks[i * self.num_kernels + j](x)
         | 
| 238 | 
            +
                        x = xs / self.num_kernels
         | 
| 239 | 
            +
             | 
| 240 | 
            +
                    # post conv
         | 
| 241 | 
            +
                    x = self.activation_post(x)
         | 
| 242 | 
            +
                    x = self.conv_post(x)
         | 
| 243 | 
            +
                    x = torch.tanh(x)
         | 
| 244 | 
            +
             | 
| 245 | 
            +
                    return x
         | 
| 246 | 
            +
             | 
| 247 | 
            +
                def remove_weight_norm(self):
         | 
| 248 | 
            +
                    print('Removing weight norm...')
         | 
| 249 | 
            +
                    for l in self.ups:
         | 
| 250 | 
            +
                        for l_i in l:
         | 
| 251 | 
            +
                            remove_parametrizations(l_i, 'weight')
         | 
| 252 | 
            +
                    for l in self.resblocks:
         | 
| 253 | 
            +
                        l.remove_weight_norm()
         | 
| 254 | 
            +
                    remove_parametrizations(self.conv_pre, 'weight')
         | 
| 255 | 
            +
                    remove_parametrizations(self.conv_post, 'weight')
         | 
    	
        mmaudio/ext/bigvgan/utils.py
    ADDED
    
    | @@ -0,0 +1,31 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Adapted from https://github.com/jik876/hifi-gan under the MIT license.
         | 
| 2 | 
            +
            #   LICENSE is in incl_licenses directory.
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import os
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            import torch
         | 
| 7 | 
            +
            from torch.nn.utils.parametrizations import weight_norm
         | 
| 8 | 
            +
             | 
| 9 | 
            +
             | 
| 10 | 
            +
            def init_weights(m, mean=0.0, std=0.01):
         | 
| 11 | 
            +
                classname = m.__class__.__name__
         | 
| 12 | 
            +
                if classname.find("Conv") != -1:
         | 
| 13 | 
            +
                    m.weight.data.normal_(mean, std)
         | 
| 14 | 
            +
             | 
| 15 | 
            +
             | 
| 16 | 
            +
            def apply_weight_norm(m):
         | 
| 17 | 
            +
                classname = m.__class__.__name__
         | 
| 18 | 
            +
                if classname.find("Conv") != -1:
         | 
| 19 | 
            +
                    weight_norm(m)
         | 
| 20 | 
            +
             | 
| 21 | 
            +
             | 
| 22 | 
            +
            def get_padding(kernel_size, dilation=1):
         | 
| 23 | 
            +
                return int((kernel_size * dilation - dilation) / 2)
         | 
| 24 | 
            +
             | 
| 25 | 
            +
             | 
| 26 | 
            +
            def load_checkpoint(filepath, device):
         | 
| 27 | 
            +
                assert os.path.isfile(filepath)
         | 
| 28 | 
            +
                print("Loading '{}'".format(filepath))
         | 
| 29 | 
            +
                checkpoint_dict = torch.load(filepath, map_location=device)
         | 
| 30 | 
            +
                print("Complete.")
         | 
| 31 | 
            +
                return checkpoint_dict
         | 
    	
        mmaudio/ext/bigvgan_v2/LICENSE
    ADDED
    
    | @@ -0,0 +1,21 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            MIT License
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            Copyright (c) 2024 NVIDIA CORPORATION.
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            Permission is hereby granted, free of charge, to any person obtaining a copy
         | 
| 6 | 
            +
            of this software and associated documentation files (the "Software"), to deal
         | 
| 7 | 
            +
            in the Software without restriction, including without limitation the rights
         | 
| 8 | 
            +
            to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
         | 
| 9 | 
            +
            copies of the Software, and to permit persons to whom the Software is
         | 
| 10 | 
            +
            furnished to do so, subject to the following conditions:
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            The above copyright notice and this permission notice shall be included in all
         | 
| 13 | 
            +
            copies or substantial portions of the Software.
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
         | 
| 16 | 
            +
            IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
         | 
| 17 | 
            +
            FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
         | 
| 18 | 
            +
            AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
         | 
| 19 | 
            +
            LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
         | 
| 20 | 
            +
            OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
         | 
| 21 | 
            +
            SOFTWARE.
         | 
    	
        mmaudio/ext/bigvgan_v2/__init__.py
    ADDED
    
    | 
            File without changes
         | 
    	
        mmaudio/ext/bigvgan_v2/activations.py
    ADDED
    
    | @@ -0,0 +1,126 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
         | 
| 2 | 
            +
            #   LICENSE is in incl_licenses directory.
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import torch
         | 
| 5 | 
            +
            from torch import nn, sin, pow
         | 
| 6 | 
            +
            from torch.nn import Parameter
         | 
| 7 | 
            +
             | 
| 8 | 
            +
             | 
class Snake(nn.Module):
    """
    Sine-based periodic activation function with a per-channel parameter.

    Shape:
        - Input: (B, C, T)
        - Output: (B, C, T), same shape as the input
    Parameters:
        - alpha - parameter of size ``in_features`` (trainable by default)
    References:
        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
        https://arxiv.org/abs/2006.08195
    Examples:
        >>> a1 = Snake(256)
        >>> x = torch.randn(4, 256, 100)
        >>> x = a1(x)
    """

    def __init__(
        self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
    ):
        """
        Initialization.
        INPUT:
            - in_features: size of the ``alpha`` parameter (the channel dimension)
            - alpha: scale applied to the initial parameter values
            - alpha_trainable: value assigned to ``alpha.requires_grad``
            - alpha_logscale: if True, the parameter is stored in log scale and
              ``exp`` is applied to it in ``forward``
            alpha is initialized to 1 by default, higher values = higher-frequency.
            In log scale the initial tensor is zeros times ``alpha``, i.e. always zeros.
        """
        super().__init__()
        self.in_features = in_features
        self.alpha_logscale = alpha_logscale

        # Log-scale parameters start from zeros, linear-scale ones from ones;
        # either base tensor is then multiplied by ``alpha``.
        make_base = torch.zeros if alpha_logscale else torch.ones
        self.alpha = Parameter(make_base(in_features) * alpha)
        self.alpha.requires_grad = alpha_trainable

        # Added to the denominator in forward().
        self.no_div_by_zero = 0.000000001

    def forward(self, x):
        """
        Forward pass of the function.
        Applies the function to the input elementwise.
        Snake := x + 1/a * sin^2 (xa)
        """
        # Add a leading and a trailing axis so alpha lines up with x as [B, C, T].
        freq = self.alpha[None, ..., None]
        if self.alpha_logscale:
            freq = torch.exp(freq)
        periodic = torch.pow(torch.sin(x * freq), 2)
        return x + (1.0 / (freq + self.no_div_by_zero)) * periodic
| 63 | 
            +
             | 
| 64 | 
            +
             | 
class SnakeBeta(nn.Module):
    """
    A modified Snake function which uses separate parameters for the magnitude of the periodic components

    Shape:
        - Input: (B, C, T)
        - Output: (B, C, T), same shape as the input
    Parameters:
        - alpha - parameter that controls frequency (trainable by default)
        - beta - parameter that controls magnitude (trainable by default)
    References:
        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
        https://arxiv.org/abs/2006.08195
    Examples:
        >>> a1 = SnakeBeta(256)
        >>> x = torch.randn(4, 256, 100)
        >>> x = a1(x)
    """

    def __init__(
        self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
    ):
        """
        Initialization.
        INPUT:
            - in_features: size of the ``alpha`` and ``beta`` parameters (the channel dimension)
            - alpha: scale applied to the initial values of BOTH ``alpha`` and ``beta``
            - alpha_trainable: value assigned to ``requires_grad`` of both parameters
            - alpha_logscale: if True, both parameters are stored in log scale and
              ``exp`` is applied to them in ``forward``
            alpha is initialized to 1 by default, higher values = higher-frequency.
            beta is initialized to 1 by default, higher values = higher-magnitude.
            In log scale the initial tensors are zeros times ``alpha``, i.e. always zeros.
        """
        super().__init__()
        self.in_features = in_features
        self.alpha_logscale = alpha_logscale

        # Log-scale parameters start from zeros, linear-scale ones from ones;
        # both alpha and beta are scaled by the same ``alpha`` argument.
        make_base = torch.zeros if alpha_logscale else torch.ones
        self.alpha = Parameter(make_base(in_features) * alpha)
        self.beta = Parameter(make_base(in_features) * alpha)

        # A single flag controls the trainability of both parameters.
        self.alpha.requires_grad = alpha_trainable
        self.beta.requires_grad = alpha_trainable

        # Added to the denominator in forward().
        self.no_div_by_zero = 0.000000001

    def forward(self, x):
        """
        Forward pass of the function.
        Applies the function to the input elementwise.
        SnakeBeta := x + 1/b * sin^2 (xa)
        """
        # Add a leading and a trailing axis so the parameters line up with x as [B, C, T].
        freq = self.alpha[None, ..., None]
        magnitude = self.beta[None, ..., None]
        if self.alpha_logscale:
            freq = torch.exp(freq)
            magnitude = torch.exp(magnitude)
        periodic = torch.pow(torch.sin(x * freq), 2)
        return x + (1.0 / (magnitude + self.no_div_by_zero)) * periodic
    	
        mmaudio/ext/bigvgan_v2/alias_free_activation/cuda/__init__.py
    ADDED
    
    | 
            File without changes
         | 
    	
        mmaudio/ext/bigvgan_v2/alias_free_activation/cuda/activation1d.py
    ADDED
    
    | @@ -0,0 +1,77 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Copyright (c) 2024 NVIDIA CORPORATION.
         | 
| 2 | 
            +
            #   Licensed under the MIT license.
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import torch
         | 
| 5 | 
            +
            import torch.nn as nn
         | 
| 6 | 
            +
            from alias_free_activation.torch.resample import UpSample1d, DownSample1d
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            # load fused CUDA kernel: this enables importing anti_alias_activation_cuda
         | 
| 9 | 
            +
            from alias_free_activation.cuda import load
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            anti_alias_activation_cuda = load.load()
         | 
| 12 | 
            +
             | 
| 13 | 
            +
             | 
class FusedAntiAliasActivation(torch.autograd.Function):
    """
    Autograd function that forwards to the fused anti-alias activation extension.

    Assumes filter size 12, replication padding on upsampling/downsampling, and logscale alpha/beta parameters as inputs.
    The hyperparameters are hard-coded in the kernel to maximize speed.
    NOTE: The fused kernel is incorrect for Activation1d with different hyperparameters.

    Only the forward pass is available; ``backward`` always raises
    ``NotImplementedError``.
    """

    @staticmethod
    def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta):
        """Run the extension's ``forward`` on the given tensors and return its result."""
        return anti_alias_activation_cuda.forward(
            inputs, up_ftr, down_ftr, alpha, beta
        )

    @staticmethod
    def backward(ctx, output_grads):
        """Not supported: the fused path provides no gradient computation."""
        # The original body had an unreachable ``return`` after this raise
        # (and with 3 entries for 5 forward inputs); it has been removed.
        raise NotImplementedError(
            "FusedAntiAliasActivation does not implement a backward pass"
        )
| 33 | 
            +
             | 
| 34 | 
            +
             | 
class Activation1d(nn.Module):
    """
    Applies an activation between an upsampling and a downsampling stage.

    With ``fused=False`` the three stages run as separate modules
    (``upsample`` -> ``act`` -> ``downsample``).  With ``fused=True`` the
    whole chain is delegated to ``FusedAntiAliasActivation``; that path reads
    ``alpha``, ``alpha_logscale`` and (unless the activation's class is named
    "Snake") ``beta`` from the activation.
    """

    def __init__(
        self,
        activation,
        up_ratio: int = 2,
        down_ratio: int = 2,
        up_kernel_size: int = 12,
        down_kernel_size: int = 12,
        fused: bool = True,
    ):
        super().__init__()
        self.up_ratio = up_ratio
        self.down_ratio = down_ratio
        self.act = activation
        self.upsample = UpSample1d(up_ratio, up_kernel_size)
        self.downsample = DownSample1d(down_ratio, down_kernel_size)

        # Selects between the fused CUDA kernel and the module-by-module path.
        self.fused = fused

    def forward(self, x):
        # Unfused path: plain composition of the three stages.
        if not self.fused:
            return self.downsample(self.act(self.upsample(x)))

        activation = self.act
        # Snake has a single parameter, so it doubles as beta; any other
        # activation is expected to expose its own ``beta``.
        if activation.__class__.__name__ == "Snake":
            log_beta = activation.alpha.data
        else:
            log_beta = activation.beta.data
        log_alpha = activation.alpha.data

        # The kernel applies exp() itself, so linear-scale parameters are
        # converted with a log to cancel it out.
        if not activation.alpha_logscale:
            log_alpha = torch.log(log_alpha)
            log_beta = torch.log(log_beta)

        return FusedAntiAliasActivation.apply(
            x, self.upsample.filter, self.downsample.lowpass.filter, log_alpha, log_beta
        )
    	
        mmaudio/ext/bigvgan_v2/alias_free_activation/cuda/anti_alias_activation.cpp
    ADDED
    
    | @@ -0,0 +1,23 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            /* coding=utf-8
         | 
| 2 | 
            +
             * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
         | 
| 3 | 
            +
             *
         | 
| 4 | 
            +
             * Licensed under the Apache License, Version 2.0 (the "License");
         | 
| 5 | 
            +
             * you may not use this file except in compliance with the License.
         | 
| 6 | 
            +
             * You may obtain a copy of the License at
         | 
| 7 | 
            +
             *
         | 
| 8 | 
            +
             *     http://www.apache.org/licenses/LICENSE-2.0
         | 
| 9 | 
            +
             *
         | 
| 10 | 
            +
             * Unless required by applicable law or agreed to in writing, software
         | 
| 11 | 
            +
             * distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 12 | 
            +
             * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 13 | 
            +
             * See the License for the specific language governing permissions and
         | 
| 14 | 
            +
             * limitations under the License.
         | 
| 15 | 
            +
             */
         | 
| 16 | 
            +
             | 
| 17 | 
            +
             #include <torch/extension.h>
         | 
| 18 | 
            +
             | 
// Forward declaration of the CUDA entry point. Its definition is not in this
// file (presumably it lives in anti_alias_activation_cuda.cu -- TODO confirm).
extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta);

// Python bindings: exposes fwd_cuda as `forward` on the extension module.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)");
}
    	
        mmaudio/ext/bigvgan_v2/alias_free_activation/cuda/anti_alias_activation_cuda.cu
    ADDED
    
    | @@ -0,0 +1,246 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            /* coding=utf-8
         | 
| 2 | 
            +
             * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
         | 
| 3 | 
            +
             *
         | 
| 4 | 
            +
             * Licensed under the Apache License, Version 2.0 (the "License");
         | 
| 5 | 
            +
             * you may not use this file except in compliance with the License.
         | 
| 6 | 
            +
             * You may obtain a copy of the License at
         | 
| 7 | 
            +
             *
         | 
| 8 | 
            +
             *     http://www.apache.org/licenses/LICENSE-2.0
         | 
| 9 | 
            +
             *
         | 
| 10 | 
            +
             * Unless required by applicable law or agreed to in writing, software
         | 
| 11 | 
            +
             * distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 12 | 
            +
             * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 13 | 
            +
             * See the License for the specific language governing permissions and
         | 
| 14 | 
            +
             * limitations under the License.
         | 
| 15 | 
            +
             */
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            #include <ATen/ATen.h>
         | 
| 18 | 
            +
            #include <cuda.h>
         | 
| 19 | 
            +
            #include <cuda_runtime.h>
         | 
| 20 | 
            +
            #include <cuda_fp16.h>
         | 
| 21 | 
            +
            #include <cuda_profiler_api.h>
         | 
| 22 | 
            +
            #include <ATen/cuda/CUDAContext.h>
         | 
| 23 | 
            +
            #include <torch/extension.h>
         | 
| 24 | 
            +
            #include "type_shim.h"
         | 
| 25 | 
            +
            #include <assert.h>
         | 
| 26 | 
            +
            #include <cfloat>
         | 
| 27 | 
            +
            #include <limits>
         | 
| 28 | 
            +
            #include <stdint.h>
         | 
| 29 | 
            +
            #include <c10/macros/Macros.h>
         | 
| 30 | 
            +
             | 
| 31 | 
            +
            namespace
         | 
| 32 | 
            +
            {
         | 
| 33 | 
            +
                // Hard-coded hyperparameters
         | 
| 34 | 
            +
                // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and
         | 
| 35 | 
            +
                constexpr int ELEMENTS_PER_LDG_STG = 1; //(WARP_ITERATIONS < 4) ? 1 : 4;
         | 
| 36 | 
            +
                constexpr int BUFFER_SIZE = 32;
         | 
| 37 | 
            +
                constexpr int FILTER_SIZE = 12;
         | 
| 38 | 
            +
                constexpr int HALF_FILTER_SIZE = 6;
         | 
| 39 | 
            +
                constexpr int UPSAMPLE_REPLICATION_PAD = 5; // 5 on each side, matching torch impl
         | 
| 40 | 
            +
                constexpr int DOWNSAMPLE_REPLICATION_PAD_LEFT = 5; // matching torch impl
         | 
| 41 | 
            +
                constexpr int DOWNSAMPLE_REPLICATION_PAD_RIGHT = 6; // matching torch impl
         | 
| 42 | 
            +
             | 
| 43 | 
            +
                template <typename input_t, typename output_t, typename acc_t>
         | 
| 44 | 
            +
                __global__ void anti_alias_activation_forward(
         | 
| 45 | 
            +
                    output_t *dst,
         | 
| 46 | 
            +
                    const input_t *src,
         | 
| 47 | 
            +
                    const input_t *up_ftr,
         | 
| 48 | 
            +
                    const input_t *down_ftr,
         | 
| 49 | 
            +
                    const input_t *alpha,
         | 
| 50 | 
            +
                    const input_t *beta,
         | 
| 51 | 
            +
                    int batch_size,
         | 
| 52 | 
            +
                    int channels,
         | 
| 53 | 
            +
                    int seq_len)
         | 
| 54 | 
            +
                {
         | 
| 55 | 
            +
                    // Up and downsample filters
         | 
| 56 | 
            +
                    input_t up_filter[FILTER_SIZE];
         | 
| 57 | 
            +
                    input_t down_filter[FILTER_SIZE];
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                    // Load data from global memory including extra indices reserved for replication paddings
         | 
| 60 | 
            +
                    input_t elements[2 * FILTER_SIZE + 2 * BUFFER_SIZE + 2 * UPSAMPLE_REPLICATION_PAD] = {0};
         | 
| 61 | 
            +
                    input_t intermediates[2 * FILTER_SIZE + 2 * BUFFER_SIZE + DOWNSAMPLE_REPLICATION_PAD_LEFT + DOWNSAMPLE_REPLICATION_PAD_RIGHT] = {0};
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                    // Output stores downsampled output before writing to dst
         | 
| 64 | 
            +
                    output_t output[BUFFER_SIZE];
         | 
| 65 | 
            +
             | 
| 66 | 
            +
                    // blockDim/threadIdx = (128, 1, 1)
         | 
| 67 | 
            +
                    // gridDim/blockIdx = (seq_blocks, channels, batches)
         | 
| 68 | 
            +
                    int block_offset = (blockIdx.x * 128 * BUFFER_SIZE + seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
         | 
| 69 | 
            +
                    int local_offset = threadIdx.x * BUFFER_SIZE;
         | 
| 70 | 
            +
                    int seq_offset = blockIdx.x * 128 * BUFFER_SIZE + local_offset;
         | 
| 71 | 
            +
             | 
| 72 | 
            +
                    // intermediate have double the seq_len
         | 
| 73 | 
            +
                    int intermediate_local_offset = threadIdx.x * BUFFER_SIZE * 2;
         | 
| 74 | 
            +
                    int intermediate_seq_offset = blockIdx.x * 128 * BUFFER_SIZE * 2 + intermediate_local_offset;
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                    // Get values needed for replication padding before moving pointer
         | 
| 77 | 
            +
                    const input_t *right_most_pntr = src + (seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
         | 
| 78 | 
            +
                    input_t seq_left_most_value = right_most_pntr[0];
         | 
| 79 | 
            +
                    input_t seq_right_most_value = right_most_pntr[seq_len - 1];
         | 
| 80 | 
            +
             | 
| 81 | 
            +
                    // Move src and dst pointers
         | 
| 82 | 
            +
                    src += block_offset + local_offset;
         | 
| 83 | 
            +
                    dst += block_offset + local_offset;
         | 
| 84 | 
            +
             | 
| 85 | 
            +
                    // Alpha and beta values for snake activations. Applies exp by default
         | 
| 86 | 
            +
                    alpha = alpha + blockIdx.y;
         | 
| 87 | 
            +
                    input_t alpha_val = expf(alpha[0]);
         | 
| 88 | 
            +
                    beta = beta + blockIdx.y;
         | 
| 89 | 
            +
                    input_t beta_val = expf(beta[0]);
         | 
| 90 | 
            +
             | 
| 91 | 
            +
                    #pragma unroll
         | 
| 92 | 
            +
                    for (int it = 0; it < FILTER_SIZE; it += 1)
         | 
| 93 | 
            +
                    {
         | 
| 94 | 
            +
                        up_filter[it] = up_ftr[it];
         | 
| 95 | 
            +
                        down_filter[it] = down_ftr[it];
         | 
| 96 | 
            +
                    }
         | 
| 97 | 
            +
             | 
| 98 | 
            +
                    // Apply replication padding for upsampling, matching torch impl
         | 
| 99 | 
            +
                    #pragma unroll
         | 
| 100 | 
            +
                    for (int it = -HALF_FILTER_SIZE; it < BUFFER_SIZE + HALF_FILTER_SIZE; it += 1)
         | 
| 101 | 
            +
                    {
         | 
| 102 | 
            +
                        int element_index = seq_offset + it; // index for element
         | 
| 103 | 
            +
                        if ((element_index < 0) && (element_index >= -UPSAMPLE_REPLICATION_PAD))
         | 
| 104 | 
            +
                        {
         | 
| 105 | 
            +
                            elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_left_most_value;
         | 
| 106 | 
            +
                        }
         | 
| 107 | 
            +
                        if ((element_index >= seq_len) && (element_index < seq_len + UPSAMPLE_REPLICATION_PAD))
         | 
| 108 | 
            +
                        {
         | 
| 109 | 
            +
                            elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_right_most_value;
         | 
| 110 | 
            +
                        }
         | 
| 111 | 
            +
                        if ((element_index >= 0) && (element_index < seq_len))
         | 
| 112 | 
            +
                        {
         | 
| 113 | 
            +
                            elements[2 * (HALF_FILTER_SIZE + it)] = 2 * src[it];
         | 
| 114 | 
            +
                        }
         | 
| 115 | 
            +
                    }
         | 
| 116 | 
            +
             | 
| 117 | 
            +
                    // Apply upsampling strided convolution and write to intermediates. It reserves DOWNSAMPLE_REPLICATION_PAD_LEFT for replication padding of the downsampling conv later
         | 
| 118 | 
            +
                    #pragma unroll
         | 
| 119 | 
            +
                    for (int it = 0; it < (2 * BUFFER_SIZE + 2 * FILTER_SIZE); it += 1)
         | 
| 120 | 
            +
                    {
         | 
| 121 | 
            +
                        input_t acc = 0.0;
         | 
| 122 | 
            +
                        int element_index = intermediate_seq_offset + it; // index for intermediate
         | 
| 123 | 
            +
                        #pragma unroll
         | 
| 124 | 
            +
                        for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
         | 
| 125 | 
            +
                        {
         | 
| 126 | 
            +
                            if ((element_index + f_idx) >= 0)
         | 
| 127 | 
            +
                            {
         | 
| 128 | 
            +
                                acc += up_filter[f_idx] * elements[it + f_idx];
         | 
| 129 | 
            +
                            }
         | 
| 130 | 
            +
                        }
         | 
| 131 | 
            +
                        intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] = acc;
         | 
| 132 | 
            +
                    }
         | 
| 133 | 
            +
             | 
| 134 | 
            +
                    // Apply activation function. It reserves DOWNSAMPLE_REPLICATION_PAD_LEFT and DOWNSAMPLE_REPLICATION_PAD_RIGHT for replication padding of the downsampling conv later
         | 
| 135 | 
            +
                    double no_div_by_zero = 0.000000001;
         | 
| 136 | 
            +
                    #pragma unroll
         | 
| 137 | 
            +
                    for (int it = 0; it < 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it += 1)
         | 
| 138 | 
            +
                    {
         | 
| 139 | 
            +
                        intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] += (1.0 / (beta_val + no_div_by_zero)) * sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val) * sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val);
         | 
| 140 | 
            +
                    }
         | 
| 141 | 
            +
             | 
| 142 | 
            +
                    // Apply replication padding before downsampling conv from intermediates
         | 
| 143 | 
            +
                    #pragma unroll
         | 
| 144 | 
            +
                    for (int it = 0; it < DOWNSAMPLE_REPLICATION_PAD_LEFT; it += 1)
         | 
| 145 | 
            +
                    {
         | 
| 146 | 
            +
                        intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT];
         | 
| 147 | 
            +
                    }
         | 
| 148 | 
            +
                    #pragma unroll
         | 
| 149 | 
            +
                    for (int it = DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it < DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE + DOWNSAMPLE_REPLICATION_PAD_RIGHT; it += 1)
         | 
| 150 | 
            +
                    {
         | 
| 151 | 
            +
                        intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE - 1];
         | 
| 152 | 
            +
                    }
         | 
| 153 | 
            +
             | 
| 154 | 
            +
                    // Apply downsample strided convolution (assuming stride=2) from intermediates
         | 
| 155 | 
            +
                    #pragma unroll
         | 
| 156 | 
            +
                    for (int it = 0; it < BUFFER_SIZE; it += 1)
         | 
| 157 | 
            +
                    {
         | 
| 158 | 
            +
                        input_t acc = 0.0;
         | 
| 159 | 
            +
                        #pragma unroll
         | 
| 160 | 
            +
                        for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
         | 
| 161 | 
            +
                        {
         | 
| 162 | 
            +
                            // Add constant DOWNSAMPLE_REPLICATION_PAD_RIGHT to match torch implementation
         | 
| 163 | 
            +
                            acc += down_filter[f_idx] * intermediates[it * 2 + f_idx + DOWNSAMPLE_REPLICATION_PAD_RIGHT];
         | 
| 164 | 
            +
                        }
         | 
| 165 | 
            +
                        output[it] = acc;
         | 
| 166 | 
            +
                    }
         | 
| 167 | 
            +
             | 
| 168 | 
            +
                    // Write output to dst
         | 
| 169 | 
            +
                    #pragma unroll
         | 
| 170 | 
            +
                    for (int it = 0;  it < BUFFER_SIZE;  it += ELEMENTS_PER_LDG_STG)
         | 
| 171 | 
            +
                    {
         | 
| 172 | 
            +
                        int element_index = seq_offset + it;
         | 
| 173 | 
            +
                        if (element_index < seq_len)
         | 
| 174 | 
            +
                        {
         | 
| 175 | 
            +
                            dst[it] = output[it];
         | 
| 176 | 
            +
                        }
         | 
| 177 | 
            +
                    }
         | 
| 178 | 
            +
             | 
| 179 | 
            +
                }
         | 
| 180 | 
            +
             | 
| 181 | 
            +
                template <typename input_t, typename output_t, typename acc_t>
         | 
| 182 | 
            +
                void dispatch_anti_alias_activation_forward(
         | 
| 183 | 
            +
                    output_t *dst,
         | 
| 184 | 
            +
                    const input_t *src,
         | 
| 185 | 
            +
                    const input_t *up_ftr,
         | 
| 186 | 
            +
                    const input_t *down_ftr,
         | 
| 187 | 
            +
                    const input_t *alpha,
         | 
| 188 | 
            +
                    const input_t *beta,
         | 
| 189 | 
            +
                    int batch_size,
         | 
| 190 | 
            +
                    int channels,
         | 
| 191 | 
            +
                    int seq_len)
         | 
| 192 | 
            +
                {
         | 
| 193 | 
            +
                    if (seq_len == 0)
         | 
| 194 | 
            +
                    {
         | 
| 195 | 
            +
                        return;
         | 
| 196 | 
            +
                    }
         | 
| 197 | 
            +
                    else
         | 
| 198 | 
            +
                    {
         | 
| 199 | 
            +
                        // Use 128 threads per block to maximize gpu utilization
         | 
| 200 | 
            +
                        constexpr int threads_per_block = 128;
         | 
| 201 | 
            +
                        constexpr int seq_len_per_block = 4096;
         | 
| 202 | 
            +
                        int blocks_per_seq_len = (seq_len + seq_len_per_block - 1) / seq_len_per_block;
         | 
| 203 | 
            +
                        dim3 blocks(blocks_per_seq_len, channels, batch_size);
         | 
| 204 | 
            +
                        dim3 threads(threads_per_block, 1, 1);
         | 
| 205 | 
            +
             | 
| 206 | 
            +
                        anti_alias_activation_forward<input_t, output_t, acc_t>
         | 
| 207 | 
            +
                            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, up_ftr, down_ftr, alpha, beta, batch_size, channels, seq_len);
         | 
| 208 | 
            +
                    }
         | 
| 209 | 
            +
                }
         | 
| 210 | 
            +
            }
         | 
| 211 | 
            +
             | 
| 212 | 
            +
            extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta)
         | 
| 213 | 
            +
            {
         | 
| 214 | 
            +
                // Input is a 3d tensor with dimensions [batches, channels, seq_len]
         | 
| 215 | 
            +
                const int batches = input.size(0);
         | 
| 216 | 
            +
                const int channels = input.size(1);
         | 
| 217 | 
            +
                const int seq_len = input.size(2);
         | 
| 218 | 
            +
             | 
| 219 | 
            +
                // Output
         | 
| 220 | 
            +
                auto act_options = input.options().requires_grad(false);
         | 
| 221 | 
            +
             | 
| 222 | 
            +
                torch::Tensor anti_alias_activation_results =
         | 
| 223 | 
            +
                    torch::empty({batches, channels, seq_len}, act_options);
         | 
| 224 | 
            +
             | 
| 225 | 
            +
                void *input_ptr = static_cast<void *>(input.data_ptr());
         | 
| 226 | 
            +
                void *up_filter_ptr = static_cast<void *>(up_filter.data_ptr());
         | 
| 227 | 
            +
                void *down_filter_ptr = static_cast<void *>(down_filter.data_ptr());
         | 
| 228 | 
            +
                void *alpha_ptr = static_cast<void *>(alpha.data_ptr());
         | 
| 229 | 
            +
                void *beta_ptr = static_cast<void *>(beta.data_ptr());
         | 
| 230 | 
            +
                void *anti_alias_activation_results_ptr = static_cast<void *>(anti_alias_activation_results.data_ptr());
         | 
| 231 | 
            +
             | 
| 232 | 
            +
                DISPATCH_FLOAT_HALF_AND_BFLOAT(
         | 
| 233 | 
            +
                    input.scalar_type(),
         | 
| 234 | 
            +
                    "dispatch anti alias activation_forward",
         | 
| 235 | 
            +
                    dispatch_anti_alias_activation_forward<scalar_t, scalar_t, float>(
         | 
| 236 | 
            +
                        reinterpret_cast<scalar_t *>(anti_alias_activation_results_ptr),
         | 
| 237 | 
            +
                        reinterpret_cast<const scalar_t *>(input_ptr),
         | 
| 238 | 
            +
                        reinterpret_cast<const scalar_t *>(up_filter_ptr),
         | 
| 239 | 
            +
                        reinterpret_cast<const scalar_t *>(down_filter_ptr),
         | 
| 240 | 
            +
                        reinterpret_cast<const scalar_t *>(alpha_ptr),
         | 
| 241 | 
            +
                        reinterpret_cast<const scalar_t *>(beta_ptr),
         | 
| 242 | 
            +
                        batches,
         | 
| 243 | 
            +
                        channels,
         | 
| 244 | 
            +
                        seq_len););
         | 
| 245 | 
            +
                return anti_alias_activation_results;
         | 
| 246 | 
            +
            }
         | 
    	
        mmaudio/ext/bigvgan_v2/alias_free_activation/cuda/compat.h
    ADDED
    
    | @@ -0,0 +1,29 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            /* coding=utf-8
         | 
| 2 | 
            +
             * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
         | 
| 3 | 
            +
             *
         | 
| 4 | 
            +
             * Licensed under the Apache License, Version 2.0 (the "License");
         | 
| 5 | 
            +
             * you may not use this file except in compliance with the License.
         | 
| 6 | 
            +
             * You may obtain a copy of the License at
         | 
| 7 | 
            +
             *
         | 
| 8 | 
            +
             *     http://www.apache.org/licenses/LICENSE-2.0
         | 
| 9 | 
            +
             *
         | 
| 10 | 
            +
             * Unless required by applicable law or agreed to in writing, software
         | 
| 11 | 
            +
             * distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 12 | 
            +
             * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 13 | 
            +
             * See the License for the specific language governing permissions and
         | 
| 14 | 
            +
             * limitations under the License.
         | 
| 15 | 
            +
             */
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            /*This code is copied from NVIDIA apex:
         | 
| 18 | 
            +
             *     https://github.com/NVIDIA/apex
         | 
| 19 | 
            +
             *     with minor changes. */
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            #ifndef TORCH_CHECK
         | 
| 22 | 
            +
            #define TORCH_CHECK AT_CHECK
         | 
| 23 | 
            +
            #endif
         | 
| 24 | 
            +
             | 
| 25 | 
            +
            #ifdef VERSION_GE_1_3
         | 
| 26 | 
            +
            #define DATA_PTR data_ptr
         | 
| 27 | 
            +
            #else
         | 
| 28 | 
            +
            #define DATA_PTR data
         | 
| 29 | 
            +
            #endif
         | 
    	
        mmaudio/ext/bigvgan_v2/alias_free_activation/cuda/load.py
    ADDED
    
    | @@ -0,0 +1,86 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Copyright (c) 2024 NVIDIA CORPORATION.
         | 
| 2 | 
            +
            #   Licensed under the MIT license.
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import os
         | 
| 5 | 
            +
            import pathlib
         | 
| 6 | 
            +
            import subprocess
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            from torch.utils import cpp_extension
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            """
         | 
| 11 | 
            +
            Setting this param to a list has the problem of generating different compilation commands (with a different order of architectures), leading to recompilation of fused kernels.
         | 
| 12 | 
            +
            Set it to an empty string to avoid recompilation and assign arch flags explicitly in extra_cuda_cflags below
         | 
| 13 | 
            +
            """
         | 
| 14 | 
            +
            os.environ["TORCH_CUDA_ARCH_LIST"] = ""
         | 
| 15 | 
            +
             | 
| 16 | 
            +
             | 
| 17 | 
            +
            def load():
         | 
| 18 | 
            +
                # Check if cuda 11 is installed for compute capability 8.0
         | 
| 19 | 
            +
                cc_flag = []
         | 
| 20 | 
            +
                _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
         | 
| 21 | 
            +
                if int(bare_metal_major) >= 11:
         | 
| 22 | 
            +
                    cc_flag.append("-gencode")
         | 
| 23 | 
            +
                    cc_flag.append("arch=compute_80,code=sm_80")
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                # Build path
         | 
| 26 | 
            +
                srcpath = pathlib.Path(__file__).parent.absolute()
         | 
| 27 | 
            +
                buildpath = srcpath / "build"
         | 
| 28 | 
            +
                _create_build_dir(buildpath)
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                # Helper function to build the kernels.
         | 
| 31 | 
            +
                def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
         | 
| 32 | 
            +
                    return cpp_extension.load(
         | 
| 33 | 
            +
                        name=name,
         | 
| 34 | 
            +
                        sources=sources,
         | 
| 35 | 
            +
                        build_directory=buildpath,
         | 
| 36 | 
            +
                        extra_cflags=[
         | 
| 37 | 
            +
                            "-O3",
         | 
| 38 | 
            +
                        ],
         | 
| 39 | 
            +
                        extra_cuda_cflags=[
         | 
| 40 | 
            +
                            "-O3",
         | 
| 41 | 
            +
                            "-gencode",
         | 
| 42 | 
            +
                            "arch=compute_70,code=sm_70",
         | 
| 43 | 
            +
                            "--use_fast_math",
         | 
| 44 | 
            +
                        ]
         | 
| 45 | 
            +
                        + extra_cuda_flags
         | 
| 46 | 
            +
                        + cc_flag,
         | 
| 47 | 
            +
                        verbose=True,
         | 
| 48 | 
            +
                    )
         | 
| 49 | 
            +
             | 
| 50 | 
            +
                extra_cuda_flags = [
         | 
| 51 | 
            +
                    "-U__CUDA_NO_HALF_OPERATORS__",
         | 
| 52 | 
            +
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
         | 
| 53 | 
            +
                    "--expt-relaxed-constexpr",
         | 
| 54 | 
            +
                    "--expt-extended-lambda",
         | 
| 55 | 
            +
                ]
         | 
| 56 | 
            +
             | 
| 57 | 
            +
                sources = [
         | 
| 58 | 
            +
                    srcpath / "anti_alias_activation.cpp",
         | 
| 59 | 
            +
                    srcpath / "anti_alias_activation_cuda.cu",
         | 
| 60 | 
            +
                ]
         | 
| 61 | 
            +
                anti_alias_activation_cuda = _cpp_extention_load_helper(
         | 
| 62 | 
            +
                    "anti_alias_activation_cuda", sources, extra_cuda_flags
         | 
| 63 | 
            +
                )
         | 
| 64 | 
            +
             | 
| 65 | 
            +
                return anti_alias_activation_cuda
         | 
| 66 | 
            +
             | 
| 67 | 
            +
             | 
| 68 | 
            +
            def _get_cuda_bare_metal_version(cuda_dir):
         | 
| 69 | 
            +
                raw_output = subprocess.check_output(
         | 
| 70 | 
            +
                    [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
         | 
| 71 | 
            +
                )
         | 
| 72 | 
            +
                output = raw_output.split()
         | 
| 73 | 
            +
                release_idx = output.index("release") + 1
         | 
| 74 | 
            +
                release = output[release_idx].split(".")
         | 
| 75 | 
            +
                bare_metal_major = release[0]
         | 
| 76 | 
            +
                bare_metal_minor = release[1][0]
         | 
| 77 | 
            +
             | 
| 78 | 
            +
                return raw_output, bare_metal_major, bare_metal_minor
         | 
| 79 | 
            +
             | 
| 80 | 
            +
             | 
| 81 | 
            +
            def _create_build_dir(buildpath):
         | 
| 82 | 
            +
                try:
         | 
| 83 | 
            +
                    os.mkdir(buildpath)
         | 
| 84 | 
            +
                except OSError:
         | 
| 85 | 
            +
                    if not os.path.isdir(buildpath):
         | 
| 86 | 
            +
                        print(f"Creation of the build directory {buildpath} failed")
         | 
    	
        mmaudio/ext/bigvgan_v2/alias_free_activation/cuda/type_shim.h
    ADDED
    
    | @@ -0,0 +1,92 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            /* coding=utf-8
         | 
| 2 | 
            +
             * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
         | 
| 3 | 
            +
             *
         | 
| 4 | 
            +
             * Licensed under the Apache License, Version 2.0 (the "License");
         | 
| 5 | 
            +
             * you may not use this file except in compliance with the License.
         | 
| 6 | 
            +
             * You may obtain a copy of the License at
         | 
| 7 | 
            +
             *
         | 
| 8 | 
            +
             *     http://www.apache.org/licenses/LICENSE-2.0
         | 
| 9 | 
            +
             *
         | 
| 10 | 
            +
             * Unless required by applicable law or agreed to in writing, software
         | 
| 11 | 
            +
             * distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 12 | 
            +
             * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 13 | 
            +
             * See the License for the specific language governing permissions and
         | 
| 14 | 
            +
             * limitations under the License.
         | 
| 15 | 
            +
             */
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            #include <ATen/ATen.h>
         | 
| 18 | 
            +
            #include "compat.h"
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            #define DISPATCH_FLOAT_HALF_AND_BFLOAT(TYPE, NAME, ...)                 \
         | 
| 21 | 
            +
            	switch (TYPE)                                                       \
         | 
| 22 | 
            +
            	{                                                                   \
         | 
| 23 | 
            +
            	case at::ScalarType::Float:                                         \
         | 
| 24 | 
            +
            	{                                                                   \
         | 
| 25 | 
            +
            		using scalar_t = float;                                         \
         | 
| 26 | 
            +
            		__VA_ARGS__;                                                    \
         | 
| 27 | 
            +
            		break;                                                          \
         | 
| 28 | 
            +
            	}                                                                   \
         | 
| 29 | 
            +
            	case at::ScalarType::Half:                                          \
         | 
| 30 | 
            +
            	{                                                                   \
         | 
| 31 | 
            +
            		using scalar_t = at::Half;                                      \
         | 
| 32 | 
            +
            		__VA_ARGS__;                                                    \
         | 
| 33 | 
            +
            		break;                                                          \
         | 
| 34 | 
            +
            	}                                                                   \
         | 
| 35 | 
            +
            	case at::ScalarType::BFloat16:                                      \
         | 
| 36 | 
            +
            	{                                                                   \
         | 
| 37 | 
            +
            		using scalar_t = at::BFloat16;                                  \
         | 
| 38 | 
            +
            		__VA_ARGS__;                                                    \
         | 
| 39 | 
            +
            		break;                                                          \
         | 
| 40 | 
            +
            	}                                                                   \
         | 
| 41 | 
            +
            	default:                                                            \
         | 
| 42 | 
            +
            		AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
         | 
| 43 | 
            +
            	}
         | 
| 44 | 
            +
             | 
| 45 | 
            +
            #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
         | 
| 46 | 
            +
            	switch (TYPEIN)                                                            \
         | 
| 47 | 
            +
            	{                                                                          \
         | 
| 48 | 
            +
            	case at::ScalarType::Float:                                                \
         | 
| 49 | 
            +
            	{                                                                          \
         | 
| 50 | 
            +
            		using scalar_t_in = float;                                             \
         | 
| 51 | 
            +
            		switch (TYPEOUT)                                                       \
         | 
| 52 | 
            +
            		{                                                                      \
         | 
| 53 | 
            +
            		case at::ScalarType::Float:                                            \
         | 
| 54 | 
            +
            		{                                                                      \
         | 
| 55 | 
            +
            			using scalar_t_out = float;                                        \
         | 
| 56 | 
            +
            			__VA_ARGS__;                                                       \
         | 
| 57 | 
            +
            			break;                                                             \
         | 
| 58 | 
            +
            		}                                                                      \
         | 
| 59 | 
            +
            		case at::ScalarType::Half:                                             \
         | 
| 60 | 
            +
            		{                                                                      \
         | 
| 61 | 
            +
            			using scalar_t_out = at::Half;                                     \
         | 
| 62 | 
            +
            			__VA_ARGS__;                                                       \
         | 
| 63 | 
            +
            			break;                                                             \
         | 
| 64 | 
            +
            		}                                                                      \
         | 
| 65 | 
            +
            		case at::ScalarType::BFloat16:                                         \
         | 
| 66 | 
            +
            		{                                                                      \
         | 
| 67 | 
            +
            			using scalar_t_out = at::BFloat16;                                 \
         | 
| 68 | 
            +
            			__VA_ARGS__;                                                       \
         | 
| 69 | 
            +
            			break;                                                             \
         | 
| 70 | 
            +
            		}                                                                      \
         | 
| 71 | 
            +
            		default:                                                               \
         | 
| 72 | 
            +
            			AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
         | 
| 73 | 
            +
            		}                                                                      \
         | 
| 74 | 
            +
            		break;                                                                 \
         | 
| 75 | 
            +
            	}                                                                          \
         | 
| 76 | 
            +
            	case at::ScalarType::Half:                                                 \
         | 
| 77 | 
            +
            	{                                                                          \
         | 
| 78 | 
            +
            		using scalar_t_in = at::Half;                                          \
         | 
| 79 | 
            +
            		using scalar_t_out = at::Half;                                         \
         | 
| 80 | 
            +
            		__VA_ARGS__;                                                           \
         | 
| 81 | 
            +
            		break;                                                                 \
         | 
| 82 | 
            +
            	}                                                                          \
         | 
| 83 | 
            +
            	case at::ScalarType::BFloat16:                                             \
         | 
| 84 | 
            +
            	{                                                                          \
         | 
| 85 | 
            +
            		using scalar_t_in = at::BFloat16;                                      \
         | 
| 86 | 
            +
            		using scalar_t_out = at::BFloat16;                                     \
         | 
| 87 | 
            +
            		__VA_ARGS__;                                                           \
         | 
| 88 | 
            +
            		break;                                                                 \
         | 
| 89 | 
            +
            	}                                                                          \
         | 
| 90 | 
            +
            	default:                                                                   \
         | 
| 91 | 
            +
            		AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'");      \
         | 
| 92 | 
            +
            	}
         | 
    	
        mmaudio/ext/bigvgan_v2/alias_free_activation/torch/__init__.py
    ADDED
    
    | @@ -0,0 +1,6 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
         | 
| 2 | 
            +
            #   LICENSE is in incl_licenses directory.
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            from .filter import *
         | 
| 5 | 
            +
            from .resample import *
         | 
| 6 | 
            +
            from .act import *
         | 
    	
        mmaudio/ext/bigvgan_v2/alias_free_activation/torch/act.py
    ADDED
    
    | @@ -0,0 +1,32 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
         | 
| 2 | 
            +
            #   LICENSE is in incl_licenses directory.
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import torch.nn as nn
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            from mmaudio.ext.bigvgan_v2.alias_free_activation.torch.resample import (DownSample1d, UpSample1d)
         | 
| 7 | 
            +
             | 
| 8 | 
            +
             | 
| 9 | 
            +
            class Activation1d(nn.Module):
                """Apply an activation between an upsampling and a downsampling stage.

                The input is first passed through ``UpSample1d``, then through
                ``activation``, and finally through ``DownSample1d``.

                Args:
                    activation: Callable applied to the upsampled signal.
                    up_ratio: Ratio forwarded to ``UpSample1d``.
                    down_ratio: Ratio forwarded to ``DownSample1d``.
                    up_kernel_size: Kernel size forwarded to ``UpSample1d``.
                    down_kernel_size: Kernel size forwarded to ``DownSample1d``.
                """

                def __init__(self,
                             activation,
                             up_ratio: int = 2,
                             down_ratio: int = 2,
                             up_kernel_size: int = 12,
                             down_kernel_size: int = 12):
                    super().__init__()
                    self.up_ratio, self.down_ratio = up_ratio, down_ratio
                    # Sub-modules are assigned in the order act -> upsample -> downsample
                    # so the module registration order stays as before.
                    self.act = activation
                    self.upsample = UpSample1d(up_ratio, up_kernel_size)
                    self.downsample = DownSample1d(down_ratio, down_kernel_size)

                # x: [B,C,T]
                def forward(self, x):
                    """Return ``downsample(act(upsample(x)))``."""
                    return self.downsample(self.act(self.upsample(x)))
         | 
    	
        mmaudio/ext/bigvgan_v2/alias_free_activation/torch/filter.py
    ADDED
    
    | @@ -0,0 +1,101 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
         | 
| 2 | 
            +
            #   LICENSE is in incl_licenses directory.
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import torch
         | 
| 5 | 
            +
            import torch.nn as nn
         | 
| 6 | 
            +
            import torch.nn.functional as F
         | 
| 7 | 
            +
            import math
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            # Use the native implementation whenever this torch build exposes one.
            if hasattr(torch, "sinc"):
                sinc = torch.sinc
            else:
                # This code is adopted from adefossez's julius.core.sinc under the MIT License
                # https://adefossez.github.io/julius/julius/core.html
                #   LICENSE is in incl_licenses directory.
                def sinc(x: torch.Tensor):
                    """
                    Normalised sinc: sin(pi * x) / (pi * x), with the value 1 at x == 0.
                    __Warning__: unlike julius.sinc, the argument is multiplied by `pi` here!
                    """
                    at_origin = torch.tensor(1.0, device=x.device, dtype=x.dtype)
                    elsewhere = torch.sin(math.pi * x) / math.pi / x
                    # torch.where picks the finite limit value where the quotient is 0/0.
                    return torch.where(x == 0, at_origin, elsewhere)
         | 
| 25 | 
            +
             | 
| 26 | 
            +
             | 
| 27 | 
            +
            # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
         | 
| 28 | 
            +
            # https://adefossez.github.io/julius/julius/lowpass.html
         | 
| 29 | 
            +
            #   LICENSE is in incl_licenses directory.
         | 
| 30 | 
            +
            def kaiser_sinc_filter1d(
                cutoff, half_width, kernel_size
            ):  # return filter [1,1,kernel_size]
                """Build a Kaiser-windowed sinc low-pass kernel.

                Args:
                    cutoff: Cutoff frequency. ``0`` yields an all-zero kernel. Assumed to
                        be a normalised frequency (callers in this file pass
                        ``0.5 / ratio``) -- TODO confirm units.
                    half_width: Transition-band half width; only used to pick the Kaiser
                        window's ``beta``.
                    kernel_size: Number of taps. Both even and odd sizes are supported.

                Returns:
                    A floating-point tensor of shape ``[1, 1, kernel_size]``. For a
                    non-zero ``cutoff`` the taps are normalised to sum to 1.
                """
                even = kernel_size % 2 == 0
                half_size = kernel_size // 2

                # For kaiser window: estimate the attenuation A from the transition
                # width, then map A to beta with Kaiser's piecewise formula.
                delta_f = 4 * half_width
                A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
                if A > 50.0:
                    beta = 0.1102 * (A - 8.7)
                elif A >= 21.0:
                    beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
                else:
                    beta = 0.0
                window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)

                if cutoff == 0:
                    # Degenerate case: nothing passes. Build the zeros from the window
                    # so the result is floating point for odd kernel sizes as well.
                    filter_ = torch.zeros_like(window)
                else:
                    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
                    if even:
                        # Even length: sample positions are centred between taps.
                        time = torch.arange(-half_size, half_size) + 0.5
                    else:
                        time = torch.arange(kernel_size) - half_size
                    filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
                    # Normalize filter to have sum = 1, otherwise we will have a small
                    # leakage of the constant component in the input signal.
                    filter_ /= filter_.sum()

                # Reshape on every path. Previously this happened only in the non-zero
                # branch, so cutoff == 0 raised UnboundLocalError at the return.
                return filter_.view(1, 1, kernel_size)
         | 
| 63 | 
            +
             | 
| 64 | 
            +
             | 
| 65 | 
            +
            class LowPassFilter1d(nn.Module):
                """Per-channel 1-D low-pass filter backed by a Kaiser-windowed sinc kernel.

                The same ``[1, 1, kernel_size]`` kernel is applied to every channel
                through a grouped convolution.
                """

                def __init__(self,
                             cutoff=0.5,
                             half_width=0.6,
                             stride: int = 1,
                             padding: bool = True,
                             padding_mode: str = "replicate",
                             kernel_size: int = 12):
                    """
                    kernel_size should be an even number for the stylegan3 setup; this
                    implementation also accepts odd numbers.

                    Raises:
                        ValueError: if ``cutoff`` is negative or greater than 0.5.
                    """
                    super().__init__()
                    if cutoff < 0.0:
                        raise ValueError("Minimum cutoff must be larger than zero.")
                    if cutoff > 0.5:
                        raise ValueError("A cutoff above 0.5 does not make sense.")

                    is_even = kernel_size % 2 == 0
                    half = kernel_size // 2

                    self.kernel_size = kernel_size
                    self.even = is_even
                    # An even kernel pads one sample less on the left than on the right.
                    self.pad_left = half - int(is_even)
                    self.pad_right = half
                    self.stride = stride
                    self.padding = padding
                    self.padding_mode = padding_mode
                    self.register_buffer("filter", kaiser_sinc_filter1d(cutoff, half_width, kernel_size))

                # Input [B, C, T]
                def forward(self, x):
                    """Filter ``x`` channel by channel, padding first when enabled."""
                    _, channels, _ = x.shape

                    padded = x
                    if self.padding:
                        padded = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)

                    # One copy of the kernel per channel; groups=channels keeps channels independent.
                    kernel = self.filter.expand(channels, -1, -1)
                    return F.conv1d(padded, kernel, stride=self.stride, groups=channels)
         | 
    	
        mmaudio/ext/bigvgan_v2/alias_free_activation/torch/resample.py
    ADDED
    
    | @@ -0,0 +1,54 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
         | 
| 2 | 
            +
            #   LICENSE is in incl_licenses directory.
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import torch.nn as nn
         | 
| 5 | 
            +
            from torch.nn import functional as F
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            from mmaudio.ext.bigvgan_v2.alias_free_activation.torch.filter import (LowPassFilter1d,
         | 
| 8 | 
            +
                                                                                   kaiser_sinc_filter1d)
         | 
| 9 | 
            +
             | 
| 10 | 
            +
             | 
| 11 | 
            +
            class UpSample1d(nn.Module):
                """Upsample a signal by ``ratio`` with a Kaiser-windowed sinc kernel.

                Args:
                    ratio: Upsampling factor, also used as the transposed-convolution stride.
                    kernel_size: Number of filter taps. Defaults to
                        ``int(6 * ratio // 2) * 2`` when ``None``.
                """

                def __init__(self, ratio=2, kernel_size=None):
                    super().__init__()
                    self.ratio = ratio
                    self.kernel_size = (int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size)
                    self.stride = ratio
                    # Number of replicated samples added on each side before the
                    # transposed convolution.
                    self.pad = self.kernel_size // ratio - 1
                    # Number of output samples trimmed from each end afterwards.
                    self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
                    self.pad_right = (self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2)
                    kernel = kaiser_sinc_filter1d(cutoff=0.5 / ratio,
                                                  half_width=0.6 / ratio,
                                                  kernel_size=self.kernel_size)
                    self.register_buffer("filter", kernel)

                # x: [B, C, T]
                def forward(self, x):
                    """Return the upsampled signal with the padded edges trimmed off."""
                    _, C, _ = x.shape

                    x = F.pad(x, (self.pad, self.pad), mode="replicate")
                    # The result is multiplied by ratio after the transposed convolution.
                    x = self.ratio * F.conv_transpose1d(
                        x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
                    # A stop of -0 equals 0 and would return an empty tensor (this
                    # happens when kernel_size == ratio), so use an open-ended slice
                    # when there is nothing to trim on the right.
                    stop = -self.pad_right if self.pad_right > 0 else None
                    return x[..., self.pad_left:stop]
         | 
| 36 | 
            +
             | 
| 37 | 
            +
             | 
| 38 | 
            +
            class DownSample1d(nn.Module):
                """Downsample a signal by ``ratio`` using a strided ``LowPassFilter1d``.

                Args:
                    ratio: Downsampling factor, used as the low-pass filter's stride.
                    kernel_size: Number of filter taps. Defaults to
                        ``int(6 * ratio // 2) * 2`` when ``None``.
                """

                def __init__(self, ratio=2, kernel_size=None):
                    super().__init__()
                    if kernel_size is None:
                        kernel_size = int(6 * ratio // 2) * 2
                    self.ratio = ratio
                    self.kernel_size = kernel_size
                    self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio,
                                                   half_width=0.6 / ratio,
                                                   stride=ratio,
                                                   kernel_size=kernel_size)

                def forward(self, x):
                    """Return ``x`` after the strided low-pass filter."""
                    return self.lowpass(x)
         | 
