EzAudio

Sleeping

App Files Files Community

OpenSound committed on Sep 16, 2024

Commit

b9d6819

verified ·

1 Parent(s): 2c654bd

Upload 84 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

api.py +117 -0
src/.idea/.gitignore +8 -0
src/.idea/inspectionProfiles/Project_Default.xml +34 -0
src/.idea/inspectionProfiles/profiles_settings.xml +6 -0
src/.idea/misc.xml +7 -0
src/.idea/modules.xml +8 -0
src/.idea/src.iml +12 -0
src/.idea/workspace.xml +128 -0
src/inference.py +169 -0
src/models/blocks.py +325 -0
src/models/conditioners.py +180 -0
src/models/udit.py +356 -0
src/models/utils/.ipynb_checkpoints/__init__-checkpoint.py +0 -0
src/models/utils/.ipynb_checkpoints/attention-checkpoint.py +290 -0
src/models/utils/.ipynb_checkpoints/modules-checkpoint.py +374 -0
src/models/utils/.ipynb_checkpoints/rotary-checkpoint.py +91 -0
src/models/utils/.ipynb_checkpoints/span_mask-checkpoint.py +146 -0
src/models/utils/.ipynb_checkpoints/timm-checkpoint.py +114 -0
src/models/utils/__init__.py +0 -0
src/models/utils/__pycache__/__init__.cpython-310.pyc +0 -0
src/models/utils/__pycache__/__init__.cpython-311.pyc +0 -0
src/models/utils/__pycache__/attention.cpython-310.pyc +0 -0
src/models/utils/__pycache__/attention.cpython-311.pyc +0 -0
src/models/utils/__pycache__/modules.cpython-310.pyc +0 -0
src/models/utils/__pycache__/modules.cpython-311.pyc +0 -0
src/models/utils/__pycache__/rotary.cpython-310.pyc +0 -0
src/models/utils/__pycache__/rotary.cpython-311.pyc +0 -0
src/models/utils/__pycache__/span_mask.cpython-310.pyc +0 -0
src/models/utils/__pycache__/span_mask.cpython-311.pyc +0 -0
src/models/utils/__pycache__/timm.cpython-310.pyc +0 -0
src/models/utils/__pycache__/timm.cpython-311.pyc +0 -0
src/models/utils/attention.py +290 -0
src/models/utils/bk/.ipynb_checkpoints/attention-checkpoint.py +99 -0
src/models/utils/bk/.ipynb_checkpoints/llama_rotary-checkpoint.py +74 -0
src/models/utils/bk/__pycache__/rotary.cpython-311.pyc +0 -0
src/models/utils/bk/attention.py +99 -0
src/models/utils/bk/llama_rotary.py +74 -0
src/models/utils/modules.py +374 -0
src/models/utils/rotary.py +91 -0
src/models/utils/span_mask.py +146 -0
src/models/utils/timm.py +114 -0
src/modules/autoencoder_wrapper.py +83 -0
src/modules/clap_wrapper.py +0 -0
src/modules/dac/__init__.py +16 -0
src/modules/dac/__main__.py +36 -0
src/modules/dac/compare/__init__.py +0 -0
src/modules/dac/compare/encodec.py +54 -0
src/modules/dac/model/__init__.py +4 -0
src/modules/dac/model/base.py +294 -0
src/modules/dac/model/dac.py +364 -0

api.py ADDED Viewed

	@@ -0,0 +1,117 @@

+import os
+import torch
+import random
+import numpy as np
+import gradio as gr
+import soundfile as sf
+from transformers import T5Tokenizer, T5EncoderModel
+from diffusers import DDIMScheduler
+from src.models.conditioners import MaskDiT
+from src.modules.autoencoder_wrapper import Autoencoder
+from src.inference import inference
+from src.utils import load_yaml_with_includes
+# Load model and configs
+def load_models(config_name, ckpt_path, vae_path, device):
+    params = load_yaml_with_includes(config_name)
+    # Load codec model
+    autoencoder = Autoencoder(ckpt_path=vae_path,
+                              model_type=params['autoencoder']['name'],
+                              quantization_first=params['autoencoder']['q_first']).to(device)
+    autoencoder.eval()
+    # Load text encoder
+    tokenizer = T5Tokenizer.from_pretrained(params['text_encoder']['model'])
+    text_encoder = T5EncoderModel.from_pretrained(params['text_encoder']['model']).to(device)
+    text_encoder.eval()
+    # Load main U-Net model
+    unet = MaskDiT(**params['model']).to(device)
+    unet.load_state_dict(torch.load(ckpt_path)['model'])
+    unet.eval()
+    # Load noise scheduler
+    noise_scheduler = DDIMScheduler(**params['diff'])
+    return autoencoder, unet, tokenizer, text_encoder, noise_scheduler, params
+# Largest seed offered by the UI and drawn by random.randint: the int32 maximum.
+MAX_SEED = np.iinfo(np.int32).max
+# Model and config paths
+config_name = 'ckpts/ezaudio-xl.yml'
+ckpt_path = 'ckpts/s3/ezaudio_s3_xl.pt'
+vae_path = 'ckpts/vae/1m.pt'
+save_path = 'output/'
+# Created at import time, even though the code that writes into it is currently disabled.
+os.makedirs(save_path, exist_ok=True)
+# Use the GPU when one is visible, otherwise fall back to CPU.
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+# All models are loaded once at import time and shared by every request.
+autoencoder, unet, tokenizer, text_encoder, noise_scheduler, params = load_models(config_name, ckpt_path, vae_path,
+                                                                                  device)
+# One throw-away add_noise call on random tensors; the result is discarded.
+# NOTE(review): presumably a warm-up of the scheduler on `device` - confirm it is still needed.
+latents = torch.randn((1, 128, 128), device=device)
+noise = torch.randn_like(latents)
+timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (1,), device=device)
+_ = noise_scheduler.add_noise(latents, noise, timesteps)
+# Inference function
+def generate_audio(text, length,
+                   guidance_scale, guidance_rescale, ddim_steps, eta,
+                   random_seed, randomize_seed):
+    neg_text = None
+    length = length * params['autoencoder']['latent_sr']
+    if randomize_seed:
+        random_seed = random.randint(0, MAX_SEED)
+    pred = inference(autoencoder, unet, None, None,
+                     tokenizer, text_encoder,
+                     params, noise_scheduler,
+                     text, neg_text,
+                     length,
+                     guidance_scale, guidance_rescale,
+                     ddim_steps, eta, random_seed,
+                     device)
+    pred = pred.cpu().numpy().squeeze(0).squeeze(0)
+    # output_file = f"{save_path}/{text}.wav"
+    # sf.write(output_file, pred, samplerate=params['autoencoder']['sr'])
+    return params['autoencoder']['sr'], pred
+# Gradio Interface
+def gradio_interface():
+    # Input components
+    text_input = gr.Textbox(label="Text Prompt", value="the sound of dog barking")
+    length_input = gr.Slider(minimum=1, maximum=10, step=1, value=10, label="Audio Length (in seconds)")
+    # Advanced settings
+    guidance_scale_input = gr.Slider(minimum=1.0, maximum=10, step=0.1, value=5, label="Guidance Scale")
+    guidance_rescale_input = gr.Slider(minimum=0.0, maximum=1, step=0.05, value=0.75, label="Guidance Rescale")
+    ddim_steps_input = gr.Slider(minimum=25, maximum=200, step=5, value=100, label="DDIM Steps")
+    eta_input = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=1, label="Eta")
+    random_seed_input = gr.Slider(minimum=0, maximum=MAX_SEED, step=1, value=0,)
+    randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
+    # Output component
+    output_audio = gr.Audio(label="Converted Audio", type="numpy")
+    # Interface
+    gr.Interface(
+        fn=generate_audio,
+        inputs=[text_input, length_input, guidance_scale_input, guidance_rescale_input, ddim_steps_input, eta_input,
+                random_seed_input, randomize_seed],
+        outputs=output_audio,
+        title="EzAudio Text-to-Audio Generator",
+        description="Generate audio from text using a diffusion model. Adjust advanced settings for more control.",
+        allow_flagging="never"
+    ).launch()
+if __name__ == "__main__":
+    gradio_interface()

src/.idea/.gitignore ADDED Viewed

	@@ -0,0 +1,8 @@

+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml

src/.idea/inspectionProfiles/Project_Default.xml ADDED Viewed

	@@ -0,0 +1,34 @@

+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="21">
+            <item index="0" class="java.lang.String" itemvalue="numba" />
+            <item index="1" class="java.lang.String" itemvalue="scipy" />
+            <item index="2" class="java.lang.String" itemvalue="decorator" />
+            <item index="3" class="java.lang.String" itemvalue="six" />
+            <item index="4" class="java.lang.String" itemvalue="joblib" />
+            <item index="5" class="java.lang.String" itemvalue="threadpoolctl" />
+            <item index="6" class="java.lang.String" itemvalue="scikit-learn" />
+            <item index="7" class="java.lang.String" itemvalue="python-dateutil" />
+            <item index="8" class="java.lang.String" itemvalue="cffi" />
+            <item index="9" class="java.lang.String" itemvalue="SoundFile" />
+            <item index="10" class="java.lang.String" itemvalue="audioread" />
+            <item index="11" class="java.lang.String" itemvalue="kiwisolver" />
+            <item index="12" class="java.lang.String" itemvalue="cycler" />
+            <item index="13" class="java.lang.String" itemvalue="llvmlite" />
+            <item index="14" class="java.lang.String" itemvalue="mido" />
+            <item index="15" class="java.lang.String" itemvalue="matplotlib" />
+            <item index="16" class="java.lang.String" itemvalue="resampy" />
+            <item index="17" class="java.lang.String" itemvalue="librosa" />
+            <item index="18" class="java.lang.String" itemvalue="pyparsing" />
+            <item index="19" class="java.lang.String" itemvalue="pretty-midi" />
+            <item index="20" class="java.lang.String" itemvalue="Pillow" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>

src/.idea/inspectionProfiles/profiles_settings.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

src/.idea/misc.xml ADDED Viewed

	@@ -0,0 +1,7 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Black">
+    <option name="sdkName" value="Python 3.10" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
+</project>

src/.idea/modules.xml ADDED Viewed

	@@ -0,0 +1,8 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/src.iml" filepath="$PROJECT_DIR$/.idea/src.iml" />
+    </modules>
+  </component>
+</project>

src/.idea/src.iml ADDED Viewed

	@@ -0,0 +1,12 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="format" value="PLAIN" />
+    <option name="myDocStringFormat" value="Plain" />
+  </component>
+</module>

src/.idea/workspace.xml ADDED Viewed

	@@ -0,0 +1,128 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="AutoImportSettings">
+    <option name="autoReloadType" value="SELECTIVE" />
+  </component>
+  <component name="ChangeListManager">
+    <list default="true" id="cb82860d-7ce6-451e-932b-96d3a6e7b20d" name="Changes" comment="" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="ProjectColorInfo"><![CDATA[{
+  "associatedIndex": 4
+}]]></component>
+  <component name="ProjectId" id="2m8UaG5ZprDRDpwT0ASOoKWJVNg" />
+  <component name="ProjectViewState">
+    <option name="hideEmptyMiddlePackages" value="true" />
+    <option name="showLibraryContents" value="true" />
+  </component>
+  <component name="PropertiesComponent"><![CDATA[{
+  "keyToString": {
+    "Python.api.executor": "Run",
+    "Python.clean.executor": "Run",
+    "Python.gradio.executor": "Run",
+    "RunOnceActivity.ShowReadmeOnStart": "true",
+    "node.js.detected.package.eslint": "true",
+    "node.js.detected.package.tslint": "true",
+    "node.js.selected.package.eslint": "(autodetect)",
+    "node.js.selected.package.tslint": "(autodetect)",
+    "nodejs_package_manager_path": "npm",
+    "settings.editor.selected.configurable": "com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable",
+    "vue.rearranger.settings.migration": "true"
+  }
+}]]></component>
+  <component name="RunManager" selected="Python.api">
+    <configuration name="api" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="src" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/.." />
+      <option name="IS_MODULE_SDK" value="false" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/../api.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <configuration name="clean" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="src" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/../ckpts/vae" />
+      <option name="IS_MODULE_SDK" value="false" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/../ckpts/vae/clean.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <configuration name="gradio" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="src" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/.." />
+      <option name="IS_MODULE_SDK" value="false" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/../gradio.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <recent_temporary>
+      <list>
+        <item itemvalue="Python.api" />
+        <item itemvalue="Python.gradio" />
+        <item itemvalue="Python.clean" />
+      </list>
+    </recent_temporary>
+  </component>
+  <component name="SharedIndexes">
+    <attachedChunks>
+      <set>
+        <option value="bundled-js-predefined-1d06a55b98c1-74d2a5396914-JavaScript-PY-241.14494.241" />
+        <option value="bundled-python-sdk-0509580d9d50-28c9f5db9ffe-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-241.14494.241" />
+      </set>
+    </attachedChunks>
+  </component>
+  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="cb82860d-7ce6-451e-932b-96d3a6e7b20d" name="Changes" comment="" />
+      <created>1726457759523</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1726457759523</updated>
+      <workItem from="1726457760668" duration="3668000" />
+    </task>
+    <servers />
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="3" />
+  </component>
+</project>

src/inference.py ADDED Viewed

	@@ -0,0 +1,169 @@

+import os
+import random
+import pandas as pd
+import torch
+import librosa
+import numpy as np
+import soundfile as sf
+from tqdm import tqdm
+from utils import scale_shift_re
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+@torch.no_grad()
+def inference(autoencoder, unet, gt, gt_mask,
+              tokenizer, text_encoder,
+              params, noise_scheduler,
+              text_raw, neg_text=None,
+              audio_frames=500,
+              guidance_scale=3, guidance_rescale=0.0,
+              ddim_steps=50, eta=1, random_seed=2024,
+              device='cuda',
+              ):
+    """Sample latents with the scheduler loop and decode them with the autoencoder.
+
+    Args:
+        autoencoder: called as `autoencoder(embedding=...)` to decode the final latents.
+        unet: denoiser; called once per timestep and put in eval mode here.
+        gt, gt_mask: optional reference latents and boolean mask. When `gt` is given,
+            positions where `gt_mask` is False are overwritten with `gt` after sampling.
+        tokenizer, text_encoder: text conditioning stack. With `tokenizer=None` the
+            text inputs and `guidance_scale` are both set to None.
+        params: config dict; reads `text_encoder.max_length`, `model.out_chans`,
+            `autoencoder.scale` and `autoencoder.shift`.
+        noise_scheduler: provides `set_timesteps`, `scale_model_input` and `step`.
+        text_raw: prompt(s) passed to the tokenizer.
+        neg_text: negative prompt(s); defaults to `[""]`.
+        audio_frames: length of the last axis of the initial noise.
+        guidance_scale: classifier-free guidance weight; any falsy value disables guidance.
+        guidance_rescale: when > 0, the guided output goes through `rescale_noise_cfg`.
+        ddim_steps: number of scheduler timesteps.
+        eta: forwarded to `noise_scheduler.step`.
+        random_seed: generator seed; None draws a non-deterministic seed.
+        device: device for the generator and the tokenized tensors.
+
+    Returns:
+        The output of `autoencoder(embedding=pred)`.
+    """
+    if neg_text is None:
+        neg_text = [""]
+    if tokenizer is not None:
+        # Conditional branch: tokenize and encode the prompt.
+        text_batch = tokenizer(text_raw,
+                               max_length=params['text_encoder']['max_length'],
+                               padding="max_length", truncation=True, return_tensors="pt")
+        text, text_mask = text_batch.input_ids.to(device), text_batch.attention_mask.to(device).bool()
+        text = text_encoder(input_ids=text, attention_mask=text_mask).last_hidden_state
+        # Unconditional branch: same procedure on the negative prompt.
+        uncond_text_batch = tokenizer(neg_text,
+                                      max_length=params['text_encoder']['max_length'],
+                                      padding="max_length", truncation=True, return_tensors="pt")
+        uncond_text, uncond_text_mask = uncond_text_batch.input_ids.to(device), uncond_text_batch.attention_mask.to(device).bool()
+        uncond_text = text_encoder(input_ids=uncond_text,
+                                   attention_mask=uncond_text_mask).last_hidden_state
+    else:
+        # No text stack: disable guidance so the uncond_* names are never needed below.
+        text, text_mask = None, None
+        guidance_scale = None
+    codec_dim = params['model']['out_chans']
+    unet.eval()
+    # One generator drives both the initial noise and the scheduler steps.
+    if random_seed is not None:
+        generator = torch.Generator(device=device).manual_seed(random_seed)
+    else:
+        generator = torch.Generator(device=device)
+        generator.seed()
+    noise_scheduler.set_timesteps(ddim_steps)
+    # init noise
+    noise = torch.randn((1, codec_dim, audio_frames), generator=generator, device=device)
+    latents = noise
+    for t in noise_scheduler.timesteps:
+        latents = noise_scheduler.scale_model_input(latents, t)
+        if guidance_scale:
+            # Classifier-free guidance: run text and negative-text inputs as one batch of two.
+            latents_combined = torch.cat([latents, latents], dim=0)
+            text_combined = torch.cat([text, uncond_text], dim=0)
+            text_mask_combined = torch.cat([text_mask, uncond_text_mask], dim=0)
+            if gt is not None:
+                gt_combined = torch.cat([gt, gt], dim=0)
+                gt_mask_combined = torch.cat([gt_mask, gt_mask], dim=0)
+            else:
+                gt_combined = None
+                gt_mask_combined = None
+            output_combined, _ = unet(latents_combined, t, text_combined, context_mask=text_mask_combined,
+                                      cls_token=None, gt=gt_combined, mae_mask_infer=gt_mask_combined)
+            # First half of the batch is the text branch, second half the negative-text branch.
+            output_text, output_uncond = torch.chunk(output_combined, 2, dim=0)
+            output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
+            if guidance_rescale > 0.0:
+                output_pred = rescale_noise_cfg(output_pred, output_text,
+                                                guidance_rescale=guidance_rescale)
+        else:
+            # Single forward pass without guidance; the returned mask is not used.
+            output_pred, mae_mask = unet(latents, t, text, context_mask=text_mask,
+                                         cls_token=None, gt=gt, mae_mask_infer=gt_mask)
+        latents = noise_scheduler.step(model_output=output_pred, timestep=t,
+                                       sample=latents,
+                                       eta=eta, generator=generator).prev_sample
+    # Apply the configured scale/shift via scale_shift_re before decoding.
+    pred = scale_shift_re(latents, params['autoencoder']['scale'],
+                          params['autoencoder']['shift'])
+    if gt is not None:
+        # Keep the reference wherever the mask is False; only masked positions stay generated.
+        pred[~gt_mask] = gt[~gt_mask]
+    pred_wav = autoencoder(embedding=pred)
+    return pred_wav
+@torch.no_grad()
+def eval_udit(autoencoder, unet,
+              tokenizer, text_encoder,
+              params, noise_scheduler,
+              val_df, subset,
+              audio_frames, mae=False,
+              guidance_scale=3, guidance_rescale=0.0,
+              ddim_steps=50, eta=1, random_seed=2023,
+              device='cuda',
+              epoch=0, save_path='logs/eval/', val_num=5):
+    """Generate audio for the first rows of a validation CSV and write them as WAV files.
+
+    Args:
+        autoencoder, unet, tokenizer, text_encoder, params, noise_scheduler:
+            forwarded to `inference`; `autoencoder(audio=...)` is also used to
+            encode the reference audio when `mae` is true.
+        val_df: path of a CSV with at least 'split' and 'caption' columns
+            (plus 'audio_length' and 'audio_path' when `mae` is true).
+        subset: value of the 'split' column to keep.
+        audio_frames: latent length forwarded to `inference`.
+        mae: when true, also load the reference audio and build a random mask
+            so that `inference` receives `gt` and `gt_mask`.
+        guidance_scale, guidance_rescale, ddim_steps, eta, random_seed, device:
+            forwarded to `inference`; the same seed is used for every row.
+        epoch: appended to `save_path` to form the output directory.
+        save_path: output directory prefix.
+        val_num: the loop stops once this many rows have been processed.
+
+    Returns:
+        None. Files named after each caption are written under `save_path + str(epoch) + '/'`.
+    """
+    val_df = pd.read_csv(val_df)
+    val_df = val_df[val_df['split'] == subset]
+    if mae:
+        # Rows with audio_length == 0 are dropped in mae mode.
+        val_df = val_df[val_df['audio_length'] != 0]
+    save_path = save_path + str(epoch) + '/'
+    os.makedirs(save_path, exist_ok=True)
+    for i in tqdm(range(len(val_df))):
+        row = val_df.iloc[i]
+        text = [row['caption']]
+        if mae:
+            audio_path = params['data']['val_dir'] + str(row['audio_path'])
+            gt, sr = librosa.load(audio_path, sr=params['data']['sr'])
+            # Peak-normalise; the epsilon avoids dividing by zero on silent clips.
+            gt = gt / (np.max(np.abs(gt)) + 1e-9)
+            # The caption is used verbatim as the file name.
+            sf.write(save_path + text[0] + '_gt.wav', gt, samplerate=params['data']['sr'])
+            # Pad with zeros or truncate to exactly 10 seconds of samples.
+            num_samples = 10 * sr
+            if len(gt) < num_samples:
+                padding = num_samples - len(gt)
+                gt = np.pad(gt, (0, padding), 'constant')
+            else:
+                gt = gt[:num_samples]
+            gt = torch.tensor(gt).unsqueeze(0).unsqueeze(1).to(device)
+            # Encode the waveform; the result is unpacked as a 3-axis tensor below.
+            gt = autoencoder(audio=gt)
+            B, D, L = gt.shape
+            mask_len = int(L * 0.2)
+            gt_mask = torch.zeros(B, D, L).to(device)
+            # Two random spans, each 20% of the last axis, are set to 1; they may overlap.
+            # Uses the global `random` state, which is not seeded in this function.
+            for _ in range(2):
+                start = random.randint(0, L - mask_len)
+                gt_mask[:, :, start:start + mask_len] = 1
+            gt_mask = gt_mask.bool()
+        else:
+            gt = None
+            gt_mask = None
+        pred = inference(autoencoder, unet, gt, gt_mask,
+                         tokenizer, text_encoder,
+                         params, noise_scheduler,
+                         text, neg_text=None,
+                         audio_frames=audio_frames,
+                         guidance_scale=guidance_scale, guidance_rescale=guidance_rescale,
+                         ddim_steps=ddim_steps, eta=eta, random_seed=random_seed,
+                         device=device)
+        # Drop the two leading axes before writing the waveform.
+        pred = pred.cpu().numpy().squeeze(0).squeeze(0)
+        sf.write(save_path + text[0] + '.wav', pred, samplerate=params['data']['sr'])
+        # Stop after val_num rows (at least one row is always processed).
+        if i + 1 >= val_num:
+            break

src/models/blocks.py ADDED Viewed

	@@ -0,0 +1,325 @@

+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+from .utils.attention import Attention, JointAttention
+from .utils.modules import unpatchify, FeedForward
+from .utils.modules import film_modulate
+class AdaLN(nn.Module):
+    """Produce the six modulation vectors consumed by a transformer block.
+
+    `forward` always returns a tensor reshaped to (B, 6, -1). How it is built
+    depends on `ada_mode`:
+      - 'ada': a per-block Linear(dim, 6 * dim) applied to `time_token`.
+      - 'ada_single': an externally supplied `time_ada` plus a learned (6, dim) table.
+      - 'ada_lora' / 'ada_lora_bias': `time_ada` plus a low-rank update computed
+        from `time_token` and scaled by alpha / r; the '_bias' variant also adds
+        a learned (6, dim) table.
+    Any other mode raises NotImplementedError.
+    """
+    def __init__(self, dim, ada_mode='ada', r=None, alpha=None):
+        """
+        Args:
+            dim: feature width of the block being modulated.
+            ada_mode: one of 'ada', 'ada_single', 'ada_lora', 'ada_lora_bias'.
+            r: low-rank size; must be set for the two lora modes.
+            alpha: numerator of the lora scaling factor alpha / r.
+        """
+        super().__init__()
+        self.ada_mode = ada_mode
+        # Stays None unless the selected mode owns a learned table.
+        self.scale_shift_table = None
+        if ada_mode == 'ada':
+            # move nn.silu outside
+            self.time_ada = nn.Linear(dim, 6 * dim, bias=True)
+        elif ada_mode == 'ada_single':
+            # adaln used in pixel-art alpha
+            self.scale_shift_table = nn.Parameter(torch.zeros(6, dim))
+        elif ada_mode in ['ada_lora', 'ada_lora_bias']:
+            self.lora_a = nn.Linear(dim, r * 6, bias=False)
+            self.lora_b = nn.Linear(r * 6, dim * 6, bias=False)
+            self.scaling = alpha / r
+            if ada_mode == 'ada_lora_bias':
+                # take bias out for consistency
+                self.scale_shift_table = nn.Parameter(torch.zeros(6, dim))
+        else:
+            raise NotImplementedError
+    def forward(self, time_token=None, time_ada=None):
+        """Return the modulation tensor reshaped to (B, 6, -1).
+
+        Args:
+            time_token: timestep embedding; required for 'ada' and the lora modes.
+            time_ada: shared modulation input; required for 'ada_single' and the
+                lora modes, and must be None for 'ada'.
+        """
+        if self.ada_mode == 'ada':
+            assert time_ada is None
+            B = time_token.shape[0]
+            time_ada = self.time_ada(time_token).reshape(B, 6, -1)
+        elif self.ada_mode == 'ada_single':
+            B = time_ada.shape[0]
+            time_ada = time_ada.reshape(B, 6, -1)
+            # [None] adds a leading batch axis so the table broadcasts over B.
+            time_ada = self.scale_shift_table[None] + time_ada
+        elif self.ada_mode in ['ada_lora', 'ada_lora_bias']:
+            B = time_ada.shape[0]
+            # Low-rank, per-block correction added to the shared modulation.
+            time_ada_lora = self.lora_b(self.lora_a(time_token)) * self.scaling
+            time_ada = time_ada + time_ada_lora
+            time_ada = time_ada.reshape(B, 6, -1)
+            if self.scale_shift_table is not None:
+                time_ada = self.scale_shift_table[None] + time_ada
+        else:
+            raise NotImplementedError
+        return time_ada
+class DiTBlock(nn.Module):
+    """
+    A modified PixArt block with adaptive layer norm (adaLN-single) conditioning.
+
+    Order of operations in `_forward`: optional skip fusion, self-attention,
+    optional cross-attention over `context`, then the feed-forward layer. Each
+    sub-layer is applied as a residual update. When `time_fusion != 'token'`,
+    the self-attention and feed-forward inputs are modulated by shift/scale
+    from `AdaLN` and their outputs are weighted by `(1 - gate)`.
+    """
+    def __init__(self, dim, context_dim=None,
+                 num_heads=8, mlp_ratio=4.,
+                 qkv_bias=False, qk_scale=None, qk_norm=None,
+                 act_layer='gelu', norm_layer=nn.LayerNorm,
+                 time_fusion='none',
+                 ada_lora_rank=None, ada_lora_alpha=None,
+                 skip=False, skip_norm=False,
+                 rope_mode='none',
+                 context_norm=False,
+                 use_checkpoint=False):
+        """
+        Args:
+            dim: token feature width.
+            context_dim: width of the cross-attention context; None disables cross-attention.
+            num_heads, qkv_bias, qk_scale, qk_norm: forwarded to `Attention`.
+            mlp_ratio: forwarded to `FeedForward` as `mult`.
+            act_layer: forwarded to `FeedForward` as `activation_fn`.
+            norm_layer: constructor used for every normalisation layer.
+            time_fusion: 'token' disables AdaLN; any other value is passed to
+                `AdaLN` as `ada_mode`.
+            ada_lora_rank, ada_lora_alpha: forwarded to `AdaLN` as `r` and `alpha`.
+            skip: when true, a Linear(2 * dim, dim) fuses `x` with a skip input.
+            skip_norm: when true, the concatenated skip input is normalised first.
+            rope_mode: forwarded to the self-attention only; cross-attention uses 'none'.
+            context_norm: when true, `context` is normalised before cross-attention.
+            use_checkpoint: wrap `_forward` in `torch.utils.checkpoint.checkpoint`.
+        """
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(dim=dim,
+                              num_heads=num_heads,
+                              qkv_bias=qkv_bias, qk_scale=qk_scale,
+                              qk_norm=qk_norm,
+                              rope_mode=rope_mode)
+        if context_dim is not None:
+            self.use_context = True
+            self.cross_attn = Attention(dim=dim,
+                                        num_heads=num_heads,
+                                        context_dim=context_dim,
+                                        qkv_bias=qkv_bias, qk_scale=qk_scale,
+                                        qk_norm=qk_norm,
+                                        rope_mode='none')
+            self.norm2 = norm_layer(dim)
+            if context_norm:
+                self.norm_context = norm_layer(context_dim)
+            else:
+                self.norm_context = nn.Identity()
+        else:
+            # cross_attn, norm2 and norm_context are not created in this case.
+            self.use_context = False
+        self.norm3 = norm_layer(dim)
+        self.mlp = FeedForward(dim=dim, mult=mlp_ratio,
+                               activation_fn=act_layer, dropout=0)
+        # NOTE(review): the default time_fusion='none' is not 'token', so AdaLN is
+        # built with ada_mode='none', which hits AdaLN's NotImplementedError branch.
+        # Callers apparently always pass time_fusion explicitly - confirm.
+        self.use_adanorm = True if time_fusion != 'token' else False
+        if self.use_adanorm:
+            self.adaln = AdaLN(dim, ada_mode=time_fusion,
+                               r=ada_lora_rank, alpha=ada_lora_alpha)
+        if skip:
+            self.skip_norm = norm_layer(2 * dim) if skip_norm else nn.Identity()
+            self.skip_linear = nn.Linear(2 * dim, dim)
+        else:
+            self.skip_linear = None
+        self.use_checkpoint = use_checkpoint
+    def forward(self, x, time_token=None, time_ada=None,
+                skip=None, context=None,
+                x_mask=None, context_mask=None, extras=None):
+        """Run `_forward`, optionally under activation checkpointing.
+
+        Arguments are passed positionally in both branches, so their order must
+        match the signature of `_forward`.
+        """
+        if self.use_checkpoint:
+            return checkpoint(self._forward, x,
+                              time_token, time_ada, skip, context,
+                              x_mask, context_mask, extras,
+                              use_reentrant=False)
+        else:
+            return self._forward(x,
+                                 time_token, time_ada, skip, context,
+                                 x_mask, context_mask, extras)
+    def _forward(self, x, time_token=None, time_ada=None,
+                 skip=None, context=None,
+                 x_mask=None, context_mask=None, extras=None):
+        """Apply the block to `x` and return a tensor produced by residual updates.
+
+        Args:
+            x: input unpacked as (B, T, C).
+            time_token, time_ada: inputs for `AdaLN` (unused when AdaLN is disabled).
+            skip: tensor concatenated with `x` on the last axis; required when the
+                block was built with `skip=True`.
+            context: cross-attention context; required when `context_dim` was set.
+            x_mask: passed to self-attention as `context_mask`.
+            context_mask: passed to cross-attention as `context_mask`.
+            extras: forwarded unchanged to both attention layers.
+        """
+        B, T, C = x.shape
+        if self.skip_linear is not None:
+            assert skip is not None
+            # Fuse the skip input: concat on the feature axis, then project back to dim.
+            cat = torch.cat([x, skip], dim=-1)
+            cat = self.skip_norm(cat)
+            x = self.skip_linear(cat)
+        if self.use_adanorm:
+            time_ada = self.adaln(time_token, time_ada)
+            # Split the (B, 6, -1) modulation into six pieces along axis 1.
+            (shift_msa, scale_msa, gate_msa,
+             shift_mlp, scale_mlp, gate_mlp) = time_ada.chunk(6, dim=1)
+        # self attention
+        if self.use_adanorm:
+            x_norm = film_modulate(self.norm1(x), shift=shift_msa,
+                                   scale=scale_msa)
+            # The residual branch is weighted by (1 - gate), not by gate itself.
+            x = x + (1 - gate_msa) * self.attn(x_norm, context=None,
+                                               context_mask=x_mask,
+                                               extras=extras)
+        else:
+            x = x + self.attn(self.norm1(x), context=None, context_mask=x_mask,
+                              extras=extras)
+        # cross attention
+        if self.use_context:
+            assert context is not None
+            # Cross-attention is never modulated or gated by AdaLN.
+            x = x + self.cross_attn(x=self.norm2(x),
+                                    context=self.norm_context(context),
+                                    context_mask=context_mask, extras=extras)
+        # mlp
+        if self.use_adanorm:
+            x_norm = film_modulate(self.norm3(x), shift=shift_mlp, scale=scale_mlp)
+            x = x + (1 - gate_mlp) * self.mlp(x_norm)
+        else:
+            x = x + self.mlp(self.norm3(x))
+        return x
+class JointDiTBlock(nn.Module):
+    """
+    A modified PixArt block with adaptive layer norm (adaLN-single) conditioning.
+    """
+    def __init__(self, dim, context_dim=None,
+                 num_heads=8, mlp_ratio=4.,
+                 qkv_bias=False, qk_scale=None, qk_norm=None,
+                 act_layer='gelu', norm_layer=nn.LayerNorm,
+                 time_fusion='none',
+                 ada_lora_rank=None, ada_lora_alpha=None,
+                 skip=(False, False),
+                 rope_mode=False,
+                 context_norm=False,
+                 use_checkpoint=False,):
+        super().__init__()
+        # no cross attention
+        assert context_dim is None
+        self.attn_norm_x = norm_layer(dim)
+        self.attn_norm_c = norm_layer(dim)
+        self.attn = JointAttention(dim=dim,
+                                   num_heads=num_heads,
+                                   qkv_bias=qkv_bias, qk_scale=qk_scale,
+                                   qk_norm=qk_norm,
+                                   rope_mode=rope_mode)
+        self.ffn_norm_x = norm_layer(dim)
+        self.ffn_norm_c = norm_layer(dim)
+        self.mlp_x = FeedForward(dim=dim, mult=mlp_ratio,
+                                 activation_fn=act_layer, dropout=0)
+        self.mlp_c = FeedForward(dim=dim, mult=mlp_ratio,
+                                 activation_fn=act_layer, dropout=0)
+        # Zero-out the shift table
+        self.use_adanorm = True if time_fusion != 'token' else False
+        if self.use_adanorm:
+            self.adaln = AdaLN(dim, ada_mode=time_fusion,
+                               r=ada_lora_rank, alpha=ada_lora_alpha)
+        if skip is False:
+            skip_x, skip_c = False, False
+        else:
+            skip_x, skip_c = skip
+        self.skip_linear_x = nn.Linear(2 * dim, dim) if skip_x else None
+        self.skip_linear_c = nn.Linear(2 * dim, dim) if skip_c else None
+        self.use_checkpoint = use_checkpoint
+    def forward(self, x, time_token=None, time_ada=None,
+                skip=None, context=None,
+                x_mask=None, context_mask=None, extras=None):
+        if self.use_checkpoint:
+            return checkpoint(self._forward, x,
+                              time_token, time_ada, skip,
+                              context, x_mask, context_mask, extras,
+                              use_reentrant=False)
+        else:
+            return self._forward(x,
+                                 time_token, time_ada, skip,
+                                 context, x_mask, context_mask, extras)
+    def _forward(self, x, time_token=None, time_ada=None,
+                 skip=None, context=None,
+                 x_mask=None, context_mask=None, extras=None):
+        assert context is None and context_mask is None
+        context, x = x[:, :extras, :], x[:, extras:, :]
+        context_mask, x_mask = x_mask[:, :extras], x_mask[:, extras:]
+        if skip is not None:
+            skip_c, skip_x = skip[:, :extras, :], skip[:, extras:, :]
+        B, T, C = x.shape
+        if self.skip_linear_x is not None:
+            x = self.skip_linear_x(torch.cat([x, skip_x], dim=-1))
+        if self.skip_linear_c is not None:
+            context = self.skip_linear_c(torch.cat([context, skip_c], dim=-1))
+        if self.use_adanorm:
+            time_ada = self.adaln(time_token, time_ada)
+            (shift_msa, scale_msa, gate_msa,
+             shift_mlp, scale_mlp, gate_mlp) = time_ada.chunk(6, dim=1)
+        # self attention
+        x_norm = self.attn_norm_x(x)
+        c_norm = self.attn_norm_c(context)
+        if self.use_adanorm:
+            x_norm = film_modulate(x_norm, shift=shift_msa, scale=scale_msa)
+        x_out, c_out = self.attn(x_norm, context=c_norm,
+                                 x_mask=x_mask, context_mask=context_mask,
+                                 extras=extras)
+        if self.use_adanorm:
+            x = x + (1 - gate_msa) * x_out
+        else:
+            x = x + x_out
+        context = context + c_out
+        # mlp
+        if self.use_adanorm:
+            x_norm = film_modulate(self.ffn_norm_x(x),
+                                   shift=shift_mlp, scale=scale_mlp)
+            x = x + (1 - gate_mlp) * self.mlp_x(x_norm)
+        else:
+            x = x + self.mlp_x(self.ffn_norm_x(x))
+        c_norm = self.ffn_norm_c(context)
+        context = context + self.mlp_c(c_norm)
+        return torch.cat((context, x), dim=1)
+class FinalBlock(nn.Module):
+    def __init__(self, embed_dim, patch_size, in_chans,
+                 img_size,
+                 input_type='2d',
+                 norm_layer=nn.LayerNorm,
+                 use_conv=True,
+                 use_adanorm=True):
+        super().__init__()
+        self.in_chans = in_chans
+        self.img_size = img_size
+        self.input_type = input_type
+        self.norm = norm_layer(embed_dim)
+        if use_adanorm:
+            self.use_adanorm = True
+        else:
+            self.use_adanorm = False
+        if input_type == '2d':
+            self.patch_dim = patch_size ** 2 * in_chans
+            self.linear = nn.Linear(embed_dim, self.patch_dim, bias=True)
+            if use_conv:
+                self.final_layer = nn.Conv2d(self.in_chans, self.in_chans,
+                                             3, padding=1)
+            else:
+                self.final_layer = nn.Identity()
+        elif input_type == '1d':
+            self.patch_dim = patch_size * in_chans
+            self.linear = nn.Linear(embed_dim, self.patch_dim, bias=True)
+            if use_conv:
+                self.final_layer = nn.Conv1d(self.in_chans, self.in_chans,
+                                             3, padding=1)
+            else:
+                self.final_layer = nn.Identity()
+    def forward(self, x, time_ada=None, extras=0):
+        B, T, C = x.shape
+        x = x[:, extras:, :]
+        # only handle generation target
+        if self.use_adanorm:
+            shift, scale = time_ada.reshape(B, 2, -1).chunk(2, dim=1)
+            x = film_modulate(self.norm(x), shift, scale)
+        else:
+            x = self.norm(x)
+        x = self.linear(x)
+        x = unpatchify(x, self.in_chans, self.input_type, self.img_size)
+        x = self.final_layer(x)
+        return x

src/models/conditioners.py ADDED Viewed

	@@ -0,0 +1,180 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import repeat
+import math
+from .udit import UDiT
+from .utils.span_mask import compute_mask_indices
+class EmbeddingCFG(nn.Module):
+    """
+    Handles label dropout for classifier-free guidance.
+    """
+    # todo: support 2D input
+    def __init__(self, in_channels):
+        super().__init__()
+        self.cfg_embedding = nn.Parameter(
+            torch.randn(in_channels) / in_channels ** 0.5)
+    def token_drop(self, condition, condition_mask, cfg_prob):
+        """
+        Drops labels to enable classifier-free guidance.
+        """
+        b, t, device = condition.shape[0], condition.shape[1], condition.device
+        drop_ids = torch.rand(b, device=device) < cfg_prob
+        uncond = repeat(self.cfg_embedding, "c -> b t c", b=b, t=t)
+        condition = torch.where(drop_ids[:, None, None], uncond, condition)
+        if condition_mask is not None:
+            condition_mask[drop_ids] = False
+            condition_mask[drop_ids, 0] = True
+        return condition, condition_mask
+    def forward(self, condition, condition_mask, cfg_prob=0.0):
+        if condition_mask is not None:
+            condition_mask = condition_mask.clone()
+        if cfg_prob > 0:
+            condition, condition_mask = self.token_drop(condition,
+                                                        condition_mask,
+                                                        cfg_prob)
+        return condition, condition_mask
+class DiscreteCFG(nn.Module):
+    def __init__(self, replace_id=2):
+        super(DiscreteCFG, self).__init__()
+        self.replace_id = replace_id
+    def forward(self, context, context_mask, cfg_prob):
+        context = context.clone()
+        if context_mask is not None:
+            context_mask = context_mask.clone()
+        if cfg_prob > 0:
+            cfg_mask = torch.rand(len(context)) < cfg_prob
+            if torch.any(cfg_mask):
+                context[cfg_mask] = 0
+                context[cfg_mask, 0] = self.replace_id
+                if context_mask is not None:
+                    context_mask[cfg_mask] = False
+                    context_mask[cfg_mask, 0] = True
+        return context, context_mask
+class CFGModel(nn.Module):
+    def __init__(self, context_dim, backbone):
+        super().__init__()
+        self.model = backbone
+        self.context_cfg = EmbeddingCFG(context_dim)
+    def forward(self, x, timesteps,
+                context, x_mask=None, context_mask=None,
+                cfg_prob=0.0):
+        context = self.context_cfg(context, cfg_prob)
+        x = self.model(x=x, timesteps=timesteps,
+                       context=context,
+                       x_mask=x_mask, context_mask=context_mask)
+        return x
+class ConcatModel(nn.Module):
+    def __init__(self, backbone, in_dim, stride=[]):
+        super().__init__()
+        self.model = backbone
+        self.downsample_layers = nn.ModuleList()
+        for i, s in enumerate(stride):
+            downsample_layer = nn.Conv1d(
+                in_dim,
+                in_dim * 2,
+                kernel_size=2 * s,
+                stride=s,
+                padding=math.ceil(s / 2),
+            )
+            self.downsample_layers.append(downsample_layer)
+            in_dim = in_dim * 2
+        self.context_cfg = EmbeddingCFG(in_dim)
+    def forward(self, x, timesteps,
+                context, x_mask=None,
+                cfg=False, cfg_prob=0.0):
+        # todo: support 2D input
+        # x: B, C, L
+        # context: B, C, L
+        for downsample_layer in self.downsample_layers:
+            context = downsample_layer(context)
+        context = context.transpose(1, 2)
+        context = self.context_cfg(caption=context,
+                                   cfg=cfg, cfg_prob=cfg_prob)
+        context = context.transpose(1, 2)
+        assert context.shape[-1] == x.shape[-1]
+        x = torch.cat([context, x], dim=1)
+        x = self.model(x=x, timesteps=timesteps,
+                       context=None, x_mask=x_mask, context_mask=None)
+        return x
+class MaskDiT(nn.Module):
+    def __init__(self, mae=False, mae_prob=0.5, mask_ratio=[0.25, 1.0], mask_span=10, **kwargs):
+        super().__init__()
+        self.model = UDiT(**kwargs)
+        self.mae = mae
+        if self.mae:
+            out_channel = kwargs.pop('out_chans', None)
+            self.mask_embed = nn.Parameter(torch.zeros((out_channel)))
+            self.mae_prob = mae_prob
+            self.mask_ratio = mask_ratio
+            self.mask_span = mask_span
+    def random_masking(self, gt, mask_ratios, mae_mask_infer=None):
+        B, D, L = gt.shape
+        if mae_mask_infer is None:
+            # mask = torch.rand(B, L).to(gt.device) < mask_ratios.unsqueeze(1)
+            mask_ratios = mask_ratios.cpu().numpy()
+            mask = compute_mask_indices(shape=[B, L],
+                                        padding_mask=None,
+                                        mask_prob=mask_ratios,
+                                        mask_length=self.mask_span,
+                                        mask_type="static",
+                                        mask_other=0.0,
+                                        min_masks=1,
+                                        no_overlap=False,
+                                        min_space=0,)
+            mask = mask.unsqueeze(1).expand_as(gt)
+        else:
+            mask = mae_mask_infer
+            mask = mask.expand_as(gt)
+        gt[mask] = self.mask_embed.view(1, D, 1).expand_as(gt)[mask]
+        return gt, mask.type_as(gt)
+    def forward(self, x, timesteps, context,
+                x_mask=None, context_mask=None, cls_token=None,
+                gt=None, mae_mask_infer=None):
+        mae_mask = torch.ones_like(x)
+        if self.mae:
+            if gt is not None:
+                B, D, L = gt.shape
+                mask_ratios = torch.FloatTensor(B).uniform_(*self.mask_ratio).to(gt.device)
+                gt, mae_mask = self.random_masking(gt, mask_ratios, mae_mask_infer)
+                # apply mae only to the selected batches
+                if mae_mask_infer is None:
+                    # determine mae batch
+                    mae_batch = torch.rand(B) < self.mae_prob
+                    gt[~mae_batch] = self.mask_embed.view(1, D, 1).expand_as(gt)[~mae_batch]
+                    mae_mask[~mae_batch] = 1.0
+            else:
+                B, D, L = x.shape
+                gt = self.mask_embed.view(1, D, 1).expand_as(x)
+            x = torch.cat([x, gt, mae_mask[:, 0:1, :]], dim=1)
+        x = self.model(x=x, timesteps=timesteps, context=context,
+                       x_mask=x_mask, context_mask=context_mask,
+                       cls_token=cls_token)
+        # print(mae_mask[:, 0, :].sum(dim=-1))
+        return x, mae_mask

src/models/udit.py ADDED Viewed

	@@ -0,0 +1,356 @@

+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+import math
+from .utils.modules import PatchEmbed, TimestepEmbedder
+from .utils.modules import PE_wrapper, RMSNorm
+from .blocks import DiTBlock, JointDiTBlock, FinalBlock
+class UDiT(nn.Module):
+    def __init__(self,
+                 img_size=224, patch_size=16, in_chans=3,
+                 input_type='2d', out_chans=None,
+                 embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.,
+                 qkv_bias=False, qk_scale=None, qk_norm=None,
+                 act_layer='gelu', norm_layer='layernorm',
+                 context_norm=False,
+                 use_checkpoint=False,
+                 # time fusion ada or token
+                 time_fusion='token',
+                 ada_lora_rank=None, ada_lora_alpha=None,
+                 cls_dim=None,
+                 # max length is only used for concat
+                 context_dim=768, context_fusion='concat',
+                 context_max_length=128, context_pe_method='sinu',
+                 pe_method='abs', rope_mode='none',
+                 use_conv=True,
+                 skip=True, skip_norm=True):
+        super().__init__()
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        # input
+        self.in_chans = in_chans
+        self.input_type = input_type
+        if self.input_type == '2d':
+            num_patches = (img_size[0] // patch_size) * (img_size[1] // patch_size)
+        elif self.input_type == '1d':
+            num_patches = img_size // patch_size
+        self.patch_embed = PatchEmbed(patch_size=patch_size, in_chans=in_chans,
+                                      embed_dim=embed_dim, input_type=input_type)
+        out_chans = in_chans if out_chans is None else out_chans
+        self.out_chans = out_chans
+        # position embedding
+        self.rope = rope_mode
+        self.x_pe = PE_wrapper(dim=embed_dim, method=pe_method,
+                               length=num_patches)
+        print(f'x position embedding: {pe_method}')
+        print(f'rope mode: {self.rope}')
+        # time embed
+        self.time_embed = TimestepEmbedder(embed_dim)
+        self.time_fusion = time_fusion
+        self.use_adanorm = False
+        # cls embed
+        if cls_dim is not None:
+            self.cls_embed = nn.Sequential(
+                nn.Linear(cls_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),)
+        else:
+            self.cls_embed = None
+        # time fusion
+        if time_fusion == 'token':
+            # put token at the beginning of sequence
+            self.extras = 2 if self.cls_embed else 1
+            self.time_pe = PE_wrapper(dim=embed_dim, method='abs', length=self.extras)
+        elif time_fusion in ['ada', 'ada_single', 'ada_lora', 'ada_lora_bias']:
+            self.use_adanorm = True
+            # aviod  repetitive silu for each adaln block
+            self.time_act = nn.SiLU()
+            self.extras = 0
+            self.time_ada_final = nn.Linear(embed_dim, 2 * embed_dim, bias=True)
+            if time_fusion in ['ada_single', 'ada_lora', 'ada_lora_bias']:
+                # shared adaln
+                self.time_ada = nn.Linear(embed_dim, 6 * embed_dim, bias=True)
+            else:
+                self.time_ada = None
+        else:
+            raise NotImplementedError
+        print(f'time fusion mode: {self.time_fusion}')
+        # context
+        # use a simple projection
+        self.use_context = False
+        self.context_cross = False
+        self.context_max_length = context_max_length
+        self.context_fusion = 'none'
+        if context_dim is not None:
+            self.use_context = True
+            self.context_embed = nn.Sequential(
+                nn.Linear(context_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),)
+            self.context_fusion = context_fusion
+            if context_fusion == 'concat' or context_fusion == 'joint':
+                self.extras += context_max_length
+                self.context_pe = PE_wrapper(dim=embed_dim,
+                                             method=context_pe_method,
+                                             length=context_max_length)
+                # no cross attention layers
+                context_dim = None
+            elif context_fusion == 'cross':
+                self.context_pe = PE_wrapper(dim=embed_dim,
+                                             method=context_pe_method,
+                                             length=context_max_length)
+                self.context_cross = True
+                context_dim = embed_dim
+            else:
+                raise NotImplementedError
+        print(f'context fusion mode: {context_fusion}')
+        print(f'context position embedding: {context_pe_method}')
+        if self.context_fusion == 'joint':
+            Block = JointDiTBlock
+            self.use_skip = skip[0]
+        else:
+            Block = DiTBlock
+            self.use_skip = skip
+        # norm layers
+        if norm_layer == 'layernorm':
+            norm_layer = nn.LayerNorm
+        elif norm_layer == 'rmsnorm':
+            norm_layer = RMSNorm
+        else:
+            raise NotImplementedError
+        print(f'use long skip connection: {skip}')
+        self.in_blocks = nn.ModuleList([
+            Block(
+                dim=embed_dim, context_dim=context_dim, num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias, qk_scale=qk_scale, qk_norm=qk_norm,
+                act_layer=act_layer, norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_lora_rank=ada_lora_rank, ada_lora_alpha=ada_lora_alpha,
+                skip=False, skip_norm=False,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint)
+            for _ in range(depth // 2)])
+        self.mid_block = Block(
+            dim=embed_dim, context_dim=context_dim, num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias, qk_scale=qk_scale, qk_norm=qk_norm,
+            act_layer=act_layer, norm_layer=norm_layer,
+            time_fusion=time_fusion,
+            ada_lora_rank=ada_lora_rank, ada_lora_alpha=ada_lora_alpha,
+            skip=False, skip_norm=False,
+            rope_mode=self.rope,
+            context_norm=context_norm,
+            use_checkpoint=use_checkpoint)
+        self.out_blocks = nn.ModuleList([
+            Block(
+                dim=embed_dim, context_dim=context_dim, num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias, qk_scale=qk_scale, qk_norm=qk_norm,
+                act_layer=act_layer, norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_lora_rank=ada_lora_rank, ada_lora_alpha=ada_lora_alpha,
+                skip=skip, skip_norm=skip_norm,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint)
+            for _ in range(depth // 2)])
+        # FinalLayer block
+        self.use_conv = use_conv
+        self.final_block = FinalBlock(embed_dim=embed_dim,
+                                      patch_size=patch_size,
+                                      img_size=img_size,
+                                      in_chans=out_chans,
+                                      input_type=input_type,
+                                      norm_layer=norm_layer,
+                                      use_conv=use_conv,
+                                      use_adanorm=self.use_adanorm)
+        self.initialize_weights()
+    def _init_ada(self):
+        if self.time_fusion == 'ada':
+            nn.init.constant_(self.time_ada_final.weight, 0)
+            nn.init.constant_(self.time_ada_final.bias, 0)
+            for block in self.in_blocks:
+                nn.init.constant_(block.adaln.time_ada.weight, 0)
+                nn.init.constant_(block.adaln.time_ada.bias, 0)
+            nn.init.constant_(self.mid_block.adaln.time_ada.weight, 0)
+            nn.init.constant_(self.mid_block.adaln.time_ada.bias, 0)
+            for block in self.out_blocks:
+                nn.init.constant_(block.adaln.time_ada.weight, 0)
+                nn.init.constant_(block.adaln.time_ada.bias, 0)
+        elif self.time_fusion == 'ada_single':
+            nn.init.constant_(self.time_ada.weight, 0)
+            nn.init.constant_(self.time_ada.bias, 0)
+            nn.init.constant_(self.time_ada_final.weight, 0)
+            nn.init.constant_(self.time_ada_final.bias, 0)
+        elif self.time_fusion in ['ada_lora', 'ada_lora_bias']:
+            nn.init.constant_(self.time_ada.weight, 0)
+            nn.init.constant_(self.time_ada.bias, 0)
+            nn.init.constant_(self.time_ada_final.weight, 0)
+            nn.init.constant_(self.time_ada_final.bias, 0)
+            for block in self.in_blocks:
+                nn.init.kaiming_uniform_(block.adaln.lora_a.weight,
+                                         a=math.sqrt(5))
+                nn.init.constant_(block.adaln.lora_b.weight, 0)
+            nn.init.kaiming_uniform_(self.mid_block.adaln.lora_a.weight,
+                                     a=math.sqrt(5))
+            nn.init.constant_(self.mid_block.adaln.lora_b.weight, 0)
+            for block in self.out_blocks:
+                nn.init.kaiming_uniform_(block.adaln.lora_a.weight,
+                                         a=math.sqrt(5))
+                nn.init.constant_(block.adaln.lora_b.weight, 0)
+    def initialize_weights(self):
+        # Basic init for all layers
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+        # init patch Conv like Linear
+        w = self.patch_embed.proj.weight.data
+        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+        nn.init.constant_(self.patch_embed.proj.bias, 0)
+        # Zero-out AdaLN
+        if self.use_adanorm:
+            self._init_ada()
+        # Zero-out Cross Attention
+        if self.context_cross:
+            for block in self.in_blocks:
+                nn.init.constant_(block.cross_attn.proj.weight, 0)
+                nn.init.constant_(block.cross_attn.proj.bias, 0)
+            nn.init.constant_(self.mid_block.cross_attn.proj.weight, 0)
+            nn.init.constant_(self.mid_block.cross_attn.proj.bias, 0)
+            for block in self.out_blocks:
+                nn.init.constant_(block.cross_attn.proj.weight, 0)
+                nn.init.constant_(block.cross_attn.proj.bias, 0)
+        # Zero-out cls embedding
+        if self.cls_embed:
+            if self.use_adanorm:
+                nn.init.constant_(self.cls_embed[-1].weight, 0)
+                nn.init.constant_(self.cls_embed[-1].bias, 0)
+        # Zero-out Output
+        # might not zero-out this when using v-prediction
+        # it could be good when using noise-prediction
+        # nn.init.constant_(self.final_block.linear.weight, 0)
+        # nn.init.constant_(self.final_block.linear.bias, 0)
+        # if self.use_conv:
+        #     nn.init.constant_(self.final_block.final_layer.weight.data, 0)
+        #     nn.init.constant_(self.final_block.final_layer.bias, 0)
+        # init out Conv
+        if self.use_conv:
+            nn.init.xavier_uniform_(self.final_block.final_layer.weight)
+            nn.init.constant_(self.final_block.final_layer.bias, 0)
+    def _concat_x_context(self, x, context, x_mask=None, context_mask=None):
+        assert context.shape[-2] == self.context_max_length
+        # Check if either x_mask or context_mask is provided
+        B = x.shape[0]
+        # Create default masks if they are not provided
+        if x_mask is None:
+            x_mask = torch.ones(B, x.shape[-2], device=x.device).bool()
+        if context_mask is None:
+            context_mask = torch.ones(B, context.shape[-2],
+                                      device=context.device).bool()
+        # Concatenate the masks along the second dimension (dim=1)
+        x_mask = torch.cat([context_mask, x_mask], dim=1)
+        # Concatenate context and x along the second dimension (dim=1)
+        x = torch.cat((context, x), dim=1)
+        return x, x_mask
+    def forward(self, x, timesteps, context,
+                x_mask=None, context_mask=None,
+                cls_token=None
+               ):
+        # make it compatible with int time step during inference
+        if timesteps.dim() == 0:
+            timesteps = timesteps.expand(x.shape[0]).to(x.device, dtype=torch.long)
+        x = self.patch_embed(x)
+        x = self.x_pe(x)
+        B, L, D = x.shape
+        if self.use_context:
+            context_token = self.context_embed(context)
+            context_token = self.context_pe(context_token)
+            if self.context_fusion == 'concat' or self.context_fusion == 'joint':
+                x, x_mask = self._concat_x_context(x=x, context=context_token,
+                                                   x_mask=x_mask,
+                                                   context_mask=context_mask)
+                context_token, context_mask = None, None
+        else:
+            context_token, context_mask = None, None
+        time_token = self.time_embed(timesteps)
+        if self.cls_embed:
+            cls_token = self.cls_embed(cls_token)
+        time_ada = None
+        time_ada_final = None
+        if self.use_adanorm:
+            if self.cls_embed:
+                time_token = time_token + cls_token
+            time_token = self.time_act(time_token)
+            time_ada_final = self.time_ada_final(time_token)
+            if self.time_ada is not None:
+                time_ada = self.time_ada(time_token)
+        else:
+            time_token = time_token.unsqueeze(dim=1)
+            if self.cls_embed:
+                cls_token = cls_token.unsqueeze(dim=1)
+                time_token = torch.cat([time_token, cls_token], dim=1)
+            time_token = self.time_pe(time_token)
+            x = torch.cat((time_token, x), dim=1)
+            if x_mask is not None:
+                x_mask = torch.cat(
+                    [torch.ones(B, time_token.shape[1], device=x_mask.device).bool(),
+                     x_mask], dim=1)
+            time_token = None
+        skips = []
+        for blk in self.in_blocks:
+            x = blk(x=x, time_token=time_token, time_ada=time_ada,
+                    skip=None, context=context_token,
+                    x_mask=x_mask, context_mask=context_mask,
+                    extras=self.extras)
+            if self.use_skip:
+                skips.append(x)
+        x = self.mid_block(x=x, time_token=time_token, time_ada=time_ada,
+                           skip=None, context=context_token,
+                           x_mask=x_mask, context_mask=context_mask,
+                           extras=self.extras)
+        for blk in self.out_blocks:
+            skip = skips.pop() if self.use_skip else None
+            x = blk(x=x, time_token=time_token, time_ada=time_ada,
+                    skip=skip, context=context_token,
+                    x_mask=x_mask, context_mask=context_mask,
+                    extras=self.extras)
+        x = self.final_block(x, time_ada=time_ada_final, extras=self.extras)
+        return x

src/models/utils/.ipynb_checkpoints/__init__-checkpoint.py ADDED Viewed

File without changes

src/models/utils/.ipynb_checkpoints/attention-checkpoint.py ADDED Viewed

	@@ -0,0 +1,290 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import einops
+from einops import rearrange, repeat
+from inspect import isfunction
+from .rotary import RotaryEmbedding
+from .modules import RMSNorm
+if hasattr(nn.functional, 'scaled_dot_product_attention'):
+    ATTENTION_MODE = 'flash'
+else:
+    ATTENTION_MODE = 'math'
+print(f'attention mode is {ATTENTION_MODE}')
+def add_mask(sim, mask):
+    b, ndim = sim.shape[0], mask.ndim
+    if ndim == 3:
+        mask = rearrange(mask, "b n m -> b 1 n m")
+    if ndim == 2:
+        mask = repeat(mask, "n m -> b 1 n m", b=b)
+    max_neg_value = -torch.finfo(sim.dtype).max
+    sim = sim.masked_fill(~mask, max_neg_value)
+    return sim
+def create_mask(q_shape, k_shape, device, q_mask=None, k_mask=None):
+    def default(val, d):
+        return val if val is not None else (d() if isfunction(d) else d)
+    b, i, j, device = q_shape[0], q_shape[-2], k_shape[-2], device
+    q_mask = default(q_mask, torch.ones((b, i), device=device, dtype=torch.bool))
+    k_mask = default(k_mask, torch.ones((b, j), device=device, dtype=torch.bool))
+    attn_mask = rearrange(q_mask, 'b i -> b 1 i 1') * rearrange(k_mask, 'b j -> b 1 1 j')
+    return attn_mask
+class Attention(nn.Module):
+    def __init__(self, dim, context_dim=None, num_heads=8,
+                 qkv_bias=False, qk_scale=None, qk_norm=None,
+                 attn_drop=0., proj_drop=0., rope_mode='none'):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+        if context_dim is None:
+            self.cross_attn = False
+        else:
+            self.cross_attn = True
+        context_dim = dim if context_dim is None else context_dim
+        self.to_q = nn.Linear(dim, dim, bias=qkv_bias)
+        self.to_k = nn.Linear(context_dim, dim, bias=qkv_bias)
+        self.to_v = nn.Linear(context_dim, dim, bias=qkv_bias)
+        if qk_norm is None:
+            self.norm_q = nn.Identity()
+            self.norm_k = nn.Identity()
+        elif qk_norm == 'layernorm':
+            self.norm_q = nn.LayerNorm(head_dim)
+            self.norm_k = nn.LayerNorm(head_dim)
+        elif qk_norm == 'rmsnorm':
+            self.norm_q = RMSNorm(head_dim)
+            self.norm_k = RMSNorm(head_dim)
+        else:
+            raise NotImplementedError
+        self.attn_drop_p = attn_drop
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        if self.cross_attn:
+            assert rope_mode == 'none'
+        self.rope_mode = rope_mode
+        if self.rope_mode == 'shared' or self.rope_mode == 'x_only':
+            self.rotary = RotaryEmbedding(dim=head_dim)
+        elif self.rope_mode == 'dual':
+            self.rotary_x = RotaryEmbedding(dim=head_dim)
+            self.rotary_c = RotaryEmbedding(dim=head_dim)
+    def _rotary(self, q, k, extras):
+        """
+        Apply rotary position embeddings to ``q`` and ``k`` according to ``self.rope_mode``.
+        Both tensors are sliced along dim 2 (the sequence axis of a B, H, L, D layout).
+        :param q: query tensor.
+        :param k: key tensor.
+        :param extras: number of leading positions along dim 2 that are treated
+                       separately from the rest (presumably prepended
+                       conditioning tokens -- confirm with callers).
+        :return: ``(q, k)`` after the rotation selected by ``rope_mode``:
+                 'none' leaves both untouched, 'shared' rotates the full sequences
+                 with one embedding, 'x_only' rotates only positions ``extras:``,
+                 'dual' rotates the two segments with separate embeddings.
+        :raises NotImplementedError: for any other ``rope_mode``.
+        """
+        mode = self.rope_mode
+        if mode == 'none':
+            return q, k
+        if mode == 'shared':
+            q, k = self.rotary(q=q, k=k)
+            return q, k
+        if mode not in ('x_only', 'dual'):
+            raise NotImplementedError
+        # Split both tensors into the leading `extras` positions and the rest.
+        head_q, tail_q = q[:, :, :extras, :], q[:, :, extras:, :]
+        head_k, tail_k = k[:, :, :extras, :], k[:, :, extras:, :]
+        if mode == 'x_only':
+            # Leading positions stay unrotated.
+            tail_q, tail_k = self.rotary(q=tail_q, k=tail_k)
+        else:
+            # Each segment has its own embedding, so positions restart at 0 in both.
+            tail_q, tail_k = self.rotary_x(q=tail_q, k=tail_k)
+            head_q, head_k = self.rotary_c(q=head_q, k=head_k)
+        q = torch.cat((head_q, tail_q), dim=2)
+        k = torch.cat((head_k, tail_k), dim=2)
+        return q, k
+    def _attn(self, q, k, v, mask_binary):
+        """
+        Run multi-head attention with the backend selected by ``ATTENTION_MODE``.
+        :param q: queries laid out as (B, H, L, D), as the rearrange pattern below expects.
+        :param k: keys in the same layout.
+        :param v: values in the same layout.
+        :param mask_binary: optional attention mask, or None for no masking.
+        :return: attention output with heads merged, shape (B, L, H * D).
+        :raises NotImplementedError: if ``ATTENTION_MODE`` is neither 'flash' nor 'math'.
+        """
+        if ATTENTION_MODE == 'flash':
+            # The functional API applies dropout whenever dropout_p > 0, regardless of
+            # the module's train/eval state, so it must be switched off explicitly.
+            dropout_p = self.attn_drop_p if self.training else 0.
+            # NOTE(review): self.scale is not forwarded here, so a custom qk_scale
+            # only takes effect in 'math' mode.
+            x = F.scaled_dot_product_attention(q, k, v,
+                                               dropout_p=dropout_p,
+                                               attn_mask=mask_binary)
+            x = einops.rearrange(x, 'B H L D -> B L (H D)')
+        elif ATTENTION_MODE == 'math':
+            attn = (q @ k.transpose(-2, -1)) * self.scale
+            attn = add_mask(attn, mask_binary) if mask_binary is not None else attn
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            # attn @ v is already (B, H, L, D). Transposing it before this rearrange
+            # would swap heads and length twice and produce (B, H, L * D).
+            x = einops.rearrange(attn @ v, 'B H L D -> B L (H D)')
+        else:
+            raise NotImplementedError
+        return x
+    def forward(self, x, context=None, context_mask=None, extras=0):
+        """
+        Attend from ``x`` to ``context`` (or to ``x`` itself when ``context`` is None).
+        :param x: input of shape (B, L, C); queries are projected from it.
+        :param context: optional source of keys/values; defaults to ``x``.
+        :param context_mask: optional mask over the key/value positions, passed to
+                             ``create_mask`` as the key mask.
+        :param extras: forwarded to ``_rotary`` (number of leading positions handled
+                       separately by the rotary embedding).
+        :return: tensor after attention, output projection and projection dropout.
+        """
+        # Unpacking doubles as a check that x is 3-D.
+        _batch, _length, _channels = x.shape
+        source = x if context is None else context
+        mask_binary = None
+        if context_mask is not None:
+            mask_binary = create_mask(x.shape, source.shape,
+                                      x.device, None, context_mask)
+        heads = self.num_heads
+        split = 'B L (H D) -> B H L D'
+        q = einops.rearrange(self.to_q(x), split, H=heads)
+        k = einops.rearrange(self.to_k(source), split, H=heads)
+        v = einops.rearrange(self.to_v(source), split, H=heads)
+        # q/k normalisation acts on the per-head feature axis, before the rotation.
+        q, k = self._rotary(self.norm_q(q), self.norm_k(k), extras)
+        out = self._attn(q, k, v, mask_binary)
+        return self.proj_drop(self.proj(out))
+class JointAttention(nn.Module):
+    """
+    Joint attention over a context sequence and an input sequence.
+    ``x`` and ``context`` each get their own q/k/v projections, q/k norms and
+    output projection. Their heads are concatenated as ``[context, x]`` along the
+    sequence axis and mixed by a single attention call, then split back.
+    """
+    def __init__(self, dim, num_heads=8,
+                 qkv_bias=False, qk_scale=None, qk_norm=None,
+                 attn_drop=0., proj_drop=0.,
+                 rope_mode='none'):
+        """
+        :param dim: feature size of both ``x`` and ``context``.
+        :param num_heads: number of attention heads; head size is ``dim // num_heads``.
+        :param qkv_bias: whether the q/k/v projections carry a bias.
+        :param qk_scale: optional logit scale; defaults to ``head_dim ** -0.5``.
+        :param qk_norm: None, 'layernorm' or 'rmsnorm' (applied per head to q and k).
+        :param attn_drop: dropout probability on the attention weights.
+        :param proj_drop: dropout probability after the output projections.
+        :param rope_mode: 'none', 'shared', 'x_only' or 'dual' (see ``_rotary``).
+        """
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        # Only the 'math' attention path reads self.scale.
+        self.scale = qk_scale or head_dim ** -0.5
+        self.to_qx, self.to_kx, self.to_vx = self._make_qkv_layers(dim, qkv_bias)
+        self.to_qc, self.to_kc, self.to_vc = self._make_qkv_layers(dim, qkv_bias)
+        self.norm_qx, self.norm_kx = self._make_norm_layers(qk_norm, head_dim)
+        self.norm_qc, self.norm_kc = self._make_norm_layers(qk_norm, head_dim)
+        self.attn_drop_p = attn_drop
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj_x = nn.Linear(dim, dim)
+        self.proj_drop_x = nn.Dropout(proj_drop)
+        self.proj_c = nn.Linear(dim, dim)
+        self.proj_drop_c = nn.Dropout(proj_drop)
+        self.rope_mode = rope_mode
+        if self.rope_mode == 'shared' or self.rope_mode == 'x_only':
+            self.rotary = RotaryEmbedding(dim=head_dim)
+        elif self.rope_mode == 'dual':
+            self.rotary_x = RotaryEmbedding(dim=head_dim)
+            self.rotary_c = RotaryEmbedding(dim=head_dim)
+    def _make_qkv_layers(self, dim, qkv_bias):
+        """Return three independent ``dim -> dim`` linear layers (query, key, value)."""
+        return (nn.Linear(dim, dim, bias=qkv_bias),
+                nn.Linear(dim, dim, bias=qkv_bias),
+                nn.Linear(dim, dim, bias=qkv_bias))
+    def _make_norm_layers(self, qk_norm, head_dim):
+        """
+        Build the (query, key) normalisation pair selected by ``qk_norm``.
+        :raises NotImplementedError: if ``qk_norm`` is not None, 'layernorm' or 'rmsnorm'.
+        """
+        if qk_norm is None:
+            norm_q = nn.Identity()
+            norm_k = nn.Identity()
+        elif qk_norm == 'layernorm':
+            norm_q = nn.LayerNorm(head_dim)
+            norm_k = nn.LayerNorm(head_dim)
+        elif qk_norm == 'rmsnorm':
+            norm_q = RMSNorm(head_dim)
+            norm_k = RMSNorm(head_dim)
+        else:
+            raise NotImplementedError
+        return norm_q, norm_k
+    def _rotary(self, q, k, extras):
+        """
+        Apply rotary embeddings to the concatenated ``q``/``k`` per ``self.rope_mode``.
+        ``extras`` is the number of leading positions along dim 2 that 'x_only'
+        leaves unrotated and 'dual' rotates with a separate embedding.
+        :raises NotImplementedError: for an unknown ``rope_mode``.
+        """
+        if self.rope_mode == 'shared':
+            q, k = self.rotary(q=q, k=k)
+        elif self.rope_mode == 'x_only':
+            q_x, k_x = self.rotary(q=q[:, :, extras:, :], k=k[:, :, extras:, :])
+            q_c, k_c = q[:, :, :extras, :], k[:, :, :extras, :]
+            q = torch.cat((q_c, q_x), dim=2)
+            k = torch.cat((k_c, k_x), dim=2)
+        elif self.rope_mode == 'dual':
+            q_x, k_x = self.rotary_x(q=q[:, :, extras:, :], k=k[:, :, extras:, :])
+            q_c, k_c = self.rotary_c(q=q[:, :, :extras, :], k=k[:, :, :extras, :])
+            q = torch.cat((q_c, q_x), dim=2)
+            k = torch.cat((k_c, k_x), dim=2)
+        elif self.rope_mode == 'none':
+            pass
+        else:
+            raise NotImplementedError
+        return q, k
+    def _attn(self, q, k, v, mask_binary):
+        """
+        Run attention with the backend selected by ``ATTENTION_MODE``.
+        Inputs are (B, H, L, D); the result has heads merged into (B, L, H * D).
+        :raises NotImplementedError: if ``ATTENTION_MODE`` is neither 'flash' nor 'math'.
+        """
+        if ATTENTION_MODE == 'flash':
+            # The functional API applies dropout whenever dropout_p > 0, regardless of
+            # the module's train/eval state, so it must be switched off explicitly.
+            dropout_p = self.attn_drop_p if self.training else 0.
+            x = F.scaled_dot_product_attention(q, k, v,
+                                               dropout_p=dropout_p,
+                                               attn_mask=mask_binary)
+            x = einops.rearrange(x, 'B H L D -> B L (H D)')
+        elif ATTENTION_MODE == 'math':
+            attn = (q @ k.transpose(-2, -1)) * self.scale
+            attn = add_mask(attn, mask_binary) if mask_binary is not None else attn
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            # attn @ v is already (B, H, L, D). Transposing it before this rearrange
+            # would swap heads and length twice and produce (B, H, L * D).
+            x = einops.rearrange(attn @ v, 'B H L D -> B L (H D)')
+        else:
+            raise NotImplementedError
+        return x
+    def _cat_mask(self, x, context, x_mask=None, context_mask=None):
+        """
+        Concatenate the two masks in ``[context, x]`` order along dim 1.
+        A missing mask is replaced by an all-True mask of the matching length.
+        """
+        B = x.shape[0]
+        if x_mask is None:
+            x_mask = torch.ones(B, x.shape[-2], device=x.device).bool()
+        if context_mask is None:
+            context_mask = torch.ones(B, context.shape[-2], device=context.device).bool()
+        mask = torch.cat([context_mask, x_mask], dim=1)
+        return mask
+    def forward(self, x, context, x_mask=None, context_mask=None, extras=0):
+        """
+        Jointly attend over ``context`` and ``x``.
+        :param x: input sequence, shape (B, Lx, C).
+        :param context: context sequence, shape (B, Lc, C).
+        :param x_mask: optional mask over ``x`` positions.
+        :param context_mask: optional mask over ``context`` positions.
+        :param extras: forwarded to ``_rotary``.
+        :return: ``(x, context)`` after attention and their separate output projections.
+        """
+        B, Lx, C = x.shape
+        _, Lc, _ = context.shape
+        if x_mask is not None or context_mask is not None:
+            mask = self._cat_mask(x, context,
+                                  x_mask=x_mask,
+                                  context_mask=context_mask)
+            # Queries and keys both span the concatenated sequence.
+            shape = [B, Lx+Lc, C]
+            mask_binary = create_mask(q_shape=shape, k_shape=shape,
+                                      device=x.device,
+                                      q_mask=None, k_mask=mask)
+        else:
+            mask_binary = None
+        qx, kx, vx = self.to_qx(x), self.to_kx(x), self.to_vx(x)
+        qc, kc, vc = self.to_qc(context), self.to_kc(context), self.to_vc(context)
+        qx, kx, vx = map(lambda t: einops.rearrange(t, 'B L (H D) -> B H L D',
+                                                    H=self.num_heads), [qx, kx, vx])
+        qc, kc, vc = map(lambda t: einops.rearrange(t, 'B L (H D) -> B H L D',
+                                                    H=self.num_heads), [qc, kc, vc])
+        qx, kx = self.norm_qx(qx), self.norm_kx(kx)
+        qc, kc = self.norm_qc(qc), self.norm_kc(kc)
+        # Context tokens come first, matching the mask order built in _cat_mask.
+        q, k, v = (torch.cat([qc, qx], dim=2),
+                   torch.cat([kc, kx], dim=2),
+                   torch.cat([vc, vx], dim=2))
+        q, k = self._rotary(q, k, extras)
+        x = self._attn(q, k, v, mask_binary)
+        # Split the joint output back into its context and input parts.
+        context, x = x[:, :Lc, :], x[:, Lc:, :]
+        x = self.proj_x(x)
+        x = self.proj_drop_x(x)
+        context = self.proj_c(context)
+        context = self.proj_drop_c(context)
+        return x, context

src/models/utils/.ipynb_checkpoints/modules-checkpoint.py ADDED Viewed

	@@ -0,0 +1,374 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch.cuda.amp import autocast
+import math
+import einops
+from einops import rearrange, repeat
+from inspect import isfunction
+from .timm import trunc_normal_
+# disable in checkpoint mode
+# @torch.jit.script
+def film_modulate(x, shift, scale):
+    """Apply a FiLM-style affine modulation and return ``x * (1 + scale) + shift``."""
+    gain = 1 + scale
+    modulated = x * gain
+    return modulated + shift
+def timestep_embedding(timesteps, dim, max_period=10000):
+    """
+    Build sinusoidal embeddings for a batch of timesteps.
+    :param timesteps: 1-D tensor of N (possibly fractional) timesteps.
+    :param dim: width of the returned embedding.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :return: [N x dim] float tensor laid out as all cosines, then all sines,
+             plus one trailing zero column when ``dim`` is odd.
+    """
+    n_freqs = dim // 2
+    exponents = -math.log(max_period) * torch.arange(start=0, end=n_freqs, dtype=torch.float32) / n_freqs
+    freqs = torch.exp(exponents).to(device=timesteps.device)
+    angles = timesteps[:, None].float() * freqs[None]
+    pieces = [torch.cos(angles), torch.sin(angles)]
+    if dim % 2:
+        # Pad odd widths with a zero column so the output is exactly `dim` wide.
+        pieces.append(torch.zeros_like(angles[:, :1]))
+    return torch.cat(pieces, dim=-1)
+class TimestepEmbedder(nn.Module):
+    """
+    Maps scalar timesteps to vectors: sinusoidal features followed by a two-layer MLP.
+    """
+    def __init__(self, hidden_size, frequency_embedding_size=256,
+                 out_size=None):
+        """
+        :param hidden_size: width of the MLP's hidden layer.
+        :param frequency_embedding_size: width of the sinusoidal features fed to the MLP.
+        :param out_size: width of the output; defaults to ``hidden_size``.
+        """
+        super().__init__()
+        self.frequency_embedding_size = frequency_embedding_size
+        target_size = hidden_size if out_size is None else out_size
+        layers = [
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, target_size, bias=True),
+        ]
+        self.mlp = nn.Sequential(*layers)
+    def forward(self, t):
+        """Embed timesteps ``t`` and return the MLP output."""
+        # Cast the features to the MLP's parameter dtype before the first matmul.
+        param_dtype = self.mlp[0].weight.dtype
+        freq_features = timestep_embedding(t, self.frequency_embedding_size)
+        return self.mlp(freq_features.type(param_dtype))
+def patchify(imgs, patch_size, input_type='2d'):
+    """
+    Cut a batch into non-overlapping patches and flatten each patch.
+    :param imgs: (B, C, H, W) for '2d' or (B, C, H) for '1d'; spatial sizes must be
+                 multiples of ``patch_size``.
+    :param patch_size: edge length of one patch.
+    :param input_type: '2d' or '1d'.
+    :return: (B, num_patches, patch_elems * C) tensor.
+    :raises NotImplementedError: for any other ``input_type`` (previously this fell
+                                 through to an unbound-variable error).
+    """
+    if input_type == '2d':
+        return einops.rearrange(imgs, 'B C (h p1) (w p2) -> B (h w) (p1 p2 C)', p1=patch_size, p2=patch_size)
+    if input_type == '1d':
+        return einops.rearrange(imgs, 'B C (h p1) -> B h (p1 C)', p1=patch_size)
+    raise NotImplementedError(f"unsupported input_type: {input_type!r}")
+def unpatchify(x, channels=3, input_type='2d', img_size=None):
+    """
+    Inverse of ``patchify``: fold flattened patches back into a batch.
+    :param x: (B, num_patches, patch_elems * channels) tensor.
+    :param channels: number of channels C in the reconstructed output.
+    :param input_type: '2d' or '1d'.
+    :param img_size: (height, width) of the output; required for '2d' because the
+                     patch grid is not assumed to be square. Unused for '1d'.
+    :return: (B, C, H, W) for '2d' or (B, C, H) for '1d'.
+    :raises ValueError: if ``input_type`` is '2d' and ``img_size`` is None.
+    :raises NotImplementedError: for any other ``input_type`` (previously the input
+                                 was returned unchanged without any warning).
+    """
+    if input_type == '2d':
+        if img_size is None:
+            raise ValueError("img_size=(height, width) is required when input_type='2d'")
+        patch_size = int((x.shape[2] // channels) ** 0.5)
+        h, w = img_size[0] // patch_size, img_size[1] // patch_size
+        assert h * w == x.shape[1] and patch_size ** 2 * channels == x.shape[2]
+        return einops.rearrange(x, 'B (h w) (p1 p2 C) -> B C (h p1) (w p2)', h=h,
+                                p1=patch_size, p2=patch_size)
+    if input_type == '1d':
+        patch_size = int((x.shape[2] // channels))
+        h = x.shape[1]
+        assert patch_size * channels == x.shape[2]
+        return einops.rearrange(x, 'B h (p1 C) -> B C (h p1)', h=h, p1=patch_size)
+    raise NotImplementedError(f"unsupported input_type: {input_type!r}")
+class PatchEmbed(nn.Module):
+    """
+    Patch embedding: a strided convolution that turns each non-overlapping patch
+    of the input into one ``embed_dim``-sized token.
+    """
+    def __init__(self, patch_size, in_chans=3, embed_dim=768, input_type='2d'):
+        """
+        :param patch_size: patch edge length (used as both kernel size and stride).
+        :param in_chans: number of input channels.
+        :param embed_dim: size of each output token.
+        :param input_type: '2d' for (B, C, H, W) inputs, '1d' for (B, C, H) inputs.
+        :raises NotImplementedError: for any other ``input_type`` (previously the
+                                     module was built without a projection and
+                                     only failed inside ``forward``).
+        """
+        super().__init__()
+        self.patch_size = patch_size
+        self.input_type = input_type
+        if input_type == '2d':
+            self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=True)
+        elif input_type == '1d':
+            self.proj = nn.Conv1d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=True)
+        else:
+            raise NotImplementedError(f"unsupported input_type: {input_type!r}")
+    def forward(self, x):
+        """Return tokens of shape (B, num_patches, embed_dim)."""
+        if self.input_type == '2d':
+            B, C, H, W = x.shape
+            assert H % self.patch_size == 0 and W % self.patch_size == 0
+        elif self.input_type == '1d':
+            B, C, H = x.shape
+            assert H % self.patch_size == 0
+        # Flatten the patch grid and move channels last: (B, E, ...) -> (B, N, E).
+        x = self.proj(x).flatten(2).transpose(1, 2)
+        return x
+class PositionalConvEmbedding(nn.Module):
+    """
+    Relative positional embedding used in HuBERT: a weight-normalised grouped
+    1-D convolution followed by GELU, returning as many frames as it receives.
+    """
+    def __init__(self, dim=768, kernel_size=128, groups=16):
+        """
+        :param dim: number of input and output channels.
+        :param kernel_size: convolution kernel width (even or odd).
+        :param groups: number of convolution groups; must divide ``dim``.
+        """
+        super().__init__()
+        self.conv = nn.Conv1d(
+            dim,
+            dim,
+            kernel_size=kernel_size,
+            padding=kernel_size // 2,
+            groups=groups,
+            bias=True
+        )
+        self.conv = nn.utils.parametrizations.weight_norm(self.conv, name="weight", dim=2)
+        # With padding = kernel_size // 2 an even kernel yields T + 1 frames and an
+        # odd kernel exactly T, so only even kernels need the last frame removed.
+        self.num_trim = 1 if kernel_size % 2 == 0 else 0
+    def forward(self, x):
+        """Return the positional features for ``x`` of shape (B, C, T)."""
+        # B C T
+        x = self.conv(x)
+        if self.num_trim:
+            x = x[:, :, :-self.num_trim]
+        return F.gelu(x)
+class SinusoidalPositionalEncoding(nn.Module):
+    """
+    Fixed sinusoidal positional encoding, stored as the buffer ``pe`` of shape
+    (1, length, dim) and added to the input in ``forward``.
+    """
+    def __init__(self, dim, length):
+        """
+        :param dim: feature size of the encoding (even or odd).
+        :param length: number of positions to precompute.
+        """
+        super().__init__()
+        self.length = length
+        self.dim = dim
+        self.register_buffer('pe', self._generate_positional_encoding(length, dim))
+    def _generate_positional_encoding(self, length, dim):
+        """Return the (1, length, dim) table: sines in even columns, cosines in odd ones."""
+        pe = torch.zeros(length, dim)
+        position = torch.arange(0, length, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, dim, 2).float() * (-math.log(10000.0) / dim))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        # For odd dim there is one fewer odd column than even ones, so drop the
+        # last frequency; for even dim the slice is a no-op.
+        pe[:, 1::2] = torch.cos(position * div_term[:dim // 2])
+        pe = pe.unsqueeze(0)
+        return pe
+    def forward(self, x):
+        """Add the first ``x.size(1)`` rows of the table to ``x`` (B, L, dim)."""
+        x = x + self.pe[:, :x.size(1)]
+        return x
+class PE_wrapper(nn.Module):
+    """
+    Selects one positional-embedding strategy by name:
+    'abs' (learned table), 'conv' (PositionalConvEmbedding),
+    'sinu' (SinusoidalPositionalEncoding) or 'none' (identity).
+    """
+    def __init__(self, dim=768, method='abs', length=None, **kwargs):
+        """
+        :param dim: feature size of the embedding.
+        :param method: 'abs', 'conv', 'sinu' or 'none'.
+        :param length: number of positions; used by 'abs' and 'sinu'.
+        :param kwargs: forwarded to ``PositionalConvEmbedding`` for 'conv'.
+        :raises NotImplementedError: for an unknown ``method``.
+        """
+        super().__init__()
+        self.method = method
+        if method == 'none':
+            # skip pe
+            self.id = nn.Identity()
+            return
+        if method == 'sinu':
+            self.sinu_pe = SinusoidalPositionalEncoding(dim=dim, length=length)
+            return
+        if method == 'conv':
+            self.conv_pe = PositionalConvEmbedding(dim=dim, **kwargs)
+            return
+        if method != 'abs':
+            raise NotImplementedError
+        # init absolute pe like UViT
+        self.length = length
+        self.abs_pe = nn.Parameter(torch.zeros(1, length, dim))
+        trunc_normal_(self.abs_pe, std=.02)
+    def forward(self, x):
+        """
+        Return ``x`` with the configured positional information applied.
+        For 'abs', ``x`` is (B, L, C) with ``L <= length``.
+        NOTE(review): the conv embedding documents its input as B C T while the
+        'abs' branch treats dim 1 as the sequence axis -- confirm the layout
+        callers use with method='conv'.
+        """
+        method = self.method
+        if method == 'abs':
+            _, seq_len, _ = x.shape
+            assert seq_len <= self.length
+            return x + self.abs_pe[:, :seq_len, :]
+        if method == 'conv':
+            return x + self.conv_pe(x)
+        if method == 'sinu':
+            return self.sinu_pe(x)
+        if method == 'none':
+            return self.id(x)
+        raise NotImplementedError
+class RMSNorm(torch.nn.Module):
+    """Root-mean-square normalisation over the last axis with a learnable per-feature gain."""
+    def __init__(self, dim: int, eps: float = 1e-6):
+        """
+        Args:
+            dim (int): Size of the last axis of the input.
+            eps (float, optional): Added to the mean square before the reciprocal
+                square root for numerical stability. Default is 1e-6.
+        Attributes:
+            eps (float): The stability constant.
+            weight (nn.Parameter): Learnable gain of shape (dim,), initialised to ones.
+        """
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+    def _norm(self, x):
+        """
+        Divide ``x`` by the root mean square of its last axis.
+        Args:
+            x (torch.Tensor): The input tensor.
+        Returns:
+            torch.Tensor: The normalised tensor, same shape as ``x``.
+        """
+        mean_square = x.pow(2).mean(-1, keepdim=True)
+        return x * torch.rsqrt(mean_square + self.eps)
+    def forward(self, x):
+        """
+        Normalise in float32, cast back to the input dtype, then apply the gain.
+        Args:
+            x (torch.Tensor): The input tensor.
+        Returns:
+            torch.Tensor: The normalised and scaled tensor.
+        """
+        normed = self._norm(x.float())
+        return normed.type_as(x) * self.weight
+class GELU(nn.Module):
+    """Linear projection followed by GELU."""
+    def __init__(self, dim_in: int, dim_out: int, approximate: str = "none",
+                 bias: bool = True):
+        """
+        :param dim_in: input feature size.
+        :param dim_out: output feature size.
+        :param approximate: passed to ``F.gelu`` as its ``approximate`` argument.
+        :param bias: whether the projection has a bias.
+        """
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
+        self.approximate = approximate
+    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
+        """Apply GELU, going through float32 on MPS devices."""
+        if gate.device.type == "mps":
+            # mps: gelu is not implemented for float16
+            upcast = gate.to(dtype=torch.float32)
+            return F.gelu(upcast, approximate=self.approximate).to(dtype=gate.dtype)
+        return F.gelu(gate, approximate=self.approximate)
+    def forward(self, hidden_states):
+        """Project ``hidden_states`` and apply the activation."""
+        return self.gelu(self.proj(hidden_states))
+class GEGLU(nn.Module):
+    """Gated GELU unit: one projection yields a value half and a gate half."""
+    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
+        """
+        :param dim_in: input feature size.
+        :param dim_out: output feature size (the projection is ``2 * dim_out`` wide).
+        :param bias: whether the projection has a bias.
+        """
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
+    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
+        """Apply GELU, going through float32 on MPS devices."""
+        if gate.device.type == "mps":
+            # mps: gelu is not implemented for float16
+            upcast = gate.to(dtype=torch.float32)
+            return F.gelu(upcast).to(dtype=gate.dtype)
+        return F.gelu(gate)
+    def forward(self, hidden_states):
+        """Return the value half multiplied by the GELU of the gate half."""
+        value, gate = self.proj(hidden_states).chunk(2, dim=-1)
+        return value * self.gelu(gate)
+class ApproximateGELU(nn.Module):
+    """Linear projection followed by the sigmoid approximation ``x * sigmoid(1.702 * x)``."""
+    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
+        """
+        :param dim_in: input feature size.
+        :param dim_out: output feature size.
+        :param bias: whether the projection has a bias.
+        """
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Project ``x`` and apply the approximate GELU."""
+        projected = self.proj(x)
+        gate = torch.sigmoid(1.702 * projected)
+        return projected * gate
+# disable in checkpoint mode
+# @torch.jit.script
+def snake_beta(x, alpha, beta):
+    """Return ``x + beta * sin(alpha * x) ** 2`` (periodic term scaled by ``beta``)."""
+    sine = torch.sin(x * alpha)
+    periodic = sine.pow(2)
+    return x + beta * periodic
+class Snake(nn.Module):
+    """Linear projection followed by the ``snake_beta`` activation with per-feature alpha/beta."""
+    def __init__(self, dim_in, dim_out, bias,
+                 alpha_trainable=True):
+        """
+        :param dim_in: input feature size.
+        :param dim_out: output feature size.
+        :param bias: whether the projection has a bias.
+        :param alpha_trainable: whether ``alpha`` and ``beta`` (both of them)
+                                receive gradients.
+        """
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
+        # Both parameters start at one and broadcast over (batch, length, feature).
+        self.alpha = nn.Parameter(torch.ones(1, 1, dim_out), requires_grad=alpha_trainable)
+        self.beta = nn.Parameter(torch.ones(1, 1, dim_out), requires_grad=alpha_trainable)
+    def forward(self, x):
+        """Project ``x`` and apply the snake activation."""
+        return snake_beta(self.proj(x), self.alpha, self.beta)
+class GESnake(nn.Module):
+    """Gated snake unit: one projection yields a value half and a snake-activated gate half."""
+    def __init__(self, dim_in, dim_out, bias,
+                 alpha_trainable=True):
+        """
+        :param dim_in: input feature size.
+        :param dim_out: output feature size (the projection is ``2 * dim_out`` wide).
+        :param bias: whether the projection has a bias.
+        :param alpha_trainable: whether ``alpha`` and ``beta`` (both of them)
+                                receive gradients.
+        """
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
+        self.alpha = nn.Parameter(torch.ones(1, 1, dim_out), requires_grad=alpha_trainable)
+        self.beta = nn.Parameter(torch.ones(1, 1, dim_out), requires_grad=alpha_trainable)
+    def forward(self, x):
+        """Return the value half multiplied by the snake activation of the gate half."""
+        value, gate = self.proj(x).chunk(2, dim=-1)
+        return value * snake_beta(gate, self.alpha, self.beta)
+class FeedForward(nn.Module):
+    """
+    Transformer feed-forward block: activation-with-projection, dropout, output
+    projection and an optional final dropout.
+    """
+    def __init__(
+        self,
+        dim,
+        dim_out=None,
+        mult=4,
+        dropout=0.0,
+        activation_fn="geglu",
+        final_dropout=False,
+        inner_dim=None,
+        bias=True,
+    ):
+        """
+        :param dim: input feature size.
+        :param dim_out: output feature size; defaults to ``dim``.
+        :param mult: hidden width multiplier, used when ``inner_dim`` is None.
+        :param dropout: dropout probability.
+        :param activation_fn: one of 'gelu', 'gelu-approximate', 'geglu',
+                              'geglu-approximate', 'snake', 'gesnake'.
+        :param final_dropout: append a second dropout after the output projection.
+        :param inner_dim: explicit hidden width; overrides ``dim * mult``.
+        :param bias: whether the linear layers have a bias.
+        :raises NotImplementedError: for an unknown ``activation_fn``.
+        """
+        super().__init__()
+        inner_dim = int(dim * mult) if inner_dim is None else inner_dim
+        dim_out = dim if dim_out is None else dim_out
+        # Factories are lazy so that only the selected activation module is built.
+        builders = {
+            "gelu": lambda: GELU(dim, inner_dim, bias=bias),
+            "gelu-approximate": lambda: GELU(dim, inner_dim, approximate="tanh", bias=bias),
+            "geglu": lambda: GEGLU(dim, inner_dim, bias=bias),
+            "geglu-approximate": lambda: ApproximateGELU(dim, inner_dim, bias=bias),
+            "snake": lambda: Snake(dim, inner_dim, bias=bias),
+            "gesnake": lambda: GESnake(dim, inner_dim, bias=bias),
+        }
+        if activation_fn not in builders:
+            raise NotImplementedError
+        layers = [
+            # project in
+            builders[activation_fn](),
+            # project dropout
+            nn.Dropout(dropout),
+            # project out
+            nn.Linear(inner_dim, dim_out, bias=bias),
+        ]
+        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
+        if final_dropout:
+            layers.append(nn.Dropout(dropout))
+        self.net = nn.ModuleList(layers)
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Run ``hidden_states`` through every layer of ``self.net`` in order."""
+        out = hidden_states
+        for layer in self.net:
+            out = layer(out)
+        return out

src/models/utils/.ipynb_checkpoints/rotary-checkpoint.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import torch
+"this rope is faster than llama rope with jit script"
+def rotate_half(x):
+    """Split the last axis into halves ``(a, b)`` and return ``(-b, a)`` concatenated."""
+    first, second = x.chunk(2, dim=-1)
+    return torch.cat((second.neg(), first), dim=-1)
+# disable in checkpoint mode
+# @torch.jit.script
+def apply_rotary_pos_emb(x, cos, sin):
+    """
+    Rotate ``x`` with precomputed cosine/sine tables.
+    The tables are cut to the sequence length of ``x`` (its second-to-last axis)
+    so one pair of tables can serve a query and key of different lengths.
+    """
+    # NOTE: This could probably be moved to Triton
+    seq_len = x.shape[-2]
+    cos, sin = cos[:, :, :seq_len, :], sin[:, :, :seq_len, :]
+    rotated = rotate_half(x)
+    return (x * cos) + (rotated * sin)
+class RotaryEmbedding(torch.nn.Module):
+    """
+    The rotary position embeddings from RoFormer_ (Su et. al).
+    A crucial insight from the method is that the query and keys are
+    transformed by rotation matrices which depend on the relative positions.
+    Other implementations are available in the Rotary Transformer repo_ and in
+    GPT-NeoX_, GPT-NeoX was an inspiration
+    .. _RoFormer: https://arxiv.org/abs/2104.09864
+    .. _repo: https://github.com/ZhuiyiTechnology/roformer
+    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
+    .. warning: Please note that this embedding is not registered on purpose, as it is transformative
+        (it does not create the embedding dimension) and will likely be picked up (imported) on an ad-hoc basis
+    """
+    def __init__(self, dim: int):
+        """
+        :param dim: size of the per-head feature axis the rotation is applied to.
+        """
+        super().__init__()
+        # Generate and save the inverse frequency buffer (non trainable)
+        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer("inv_freq", inv_freq)
+        self._seq_len_cached = None
+        self._cos_cached = None
+        self._sin_cached = None
+    def _update_cos_sin_tables(self, x, seq_dimension=-2, seq_len=None):
+        """
+        Return cached (cos, sin) tables, rebuilding them when needed.
+        :param x: reference tensor (B, H, L, D) supplying device and dtype.
+        :param seq_dimension: axis of ``x`` holding the sequence length.
+        :param seq_len: number of positions to cover; defaults to the length of ``x``.
+        :return: ``(cos, sin)``, each of shape (1, 1, seq_len, D).
+        """
+        # expect input: B, H, L, D
+        if seq_len is None:
+            seq_len = x.shape[seq_dimension]
+        # Reset the tables if the sequence length has changed,
+        # or if we're on a new device (possibly due to tracing for instance)
+        # also make sure dtype wont change
+        if (
+            seq_len != self._seq_len_cached
+            or self._cos_cached.device != x.device
+            or self._cos_cached.dtype != x.dtype
+        ):
+            self._seq_len_cached = seq_len
+            t = torch.arange(seq_len, device=x.device, dtype=torch.float32)
+            freqs = torch.einsum("i,j->ij", t, self.inv_freq.to(x.dtype))
+            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+            self._cos_cached = emb.cos()[None, None, :, :].to(x.dtype)
+            self._sin_cached = emb.sin()[None, None, :, :].to(x.dtype)
+        return self._cos_cached, self._sin_cached
+    def forward(self, q, k):
+        """
+        Rotate ``q`` and, when given, ``k``.
+        The rotation is computed in float32 and cast back to each input's dtype.
+        :param q: query tensor (B, H, L, D).
+        :param k: key tensor (B, H, L', D), or None.
+        :return: ``(q_rotated, k_rotated)``; the second item is None when ``k`` is None.
+        """
+        q_float = q.float()
+        # Size the tables for the longer of the two sequences; apply_rotary_pos_emb
+        # slices them per tensor, so a key longer than the query also works.
+        seq_len = q.shape[-2] if k is None else max(q.shape[-2], k.shape[-2])
+        cos, sin = self._update_cos_sin_tables(
+            q_float, seq_dimension=-2, seq_len=seq_len
+        )
+        q_rot = apply_rotary_pos_emb(q_float, cos, sin).type_as(q)
+        if k is None:
+            return q_rot, None
+        k_rot = apply_rotary_pos_emb(k.float(), cos, sin).type_as(k)
+        return q_rot, k_rot

src/models/utils/.ipynb_checkpoints/span_mask-checkpoint.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import numpy as np
+import torch
+from typing import Optional, Tuple
+def compute_mask_indices(
+    shape: Tuple[int, int],
+    padding_mask: Optional[torch.Tensor],
+    mask_prob: float,
+    mask_length: int,
+    mask_type: str = "static",
+    mask_other: float = 0.0,
+    min_masks: int = 0,
+    no_overlap: bool = False,
+    min_space: int = 0,
+) -> torch.Tensor:
+    """
+    Computes random mask spans for a given shape
+    Args:
+        shape: the shape for which to compute masks.
+            should be of size 2 where first element is batch size and 2nd is timesteps
+        padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
+        mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
+            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
+            however due to overlaps, the actual number will be smaller (unless no_overlap is True).
+            may be a single float or a sequence with one value per batch row
+        mask_length: length of each masked span (mean / lambda for the random mask types)
+        mask_type: how to compute mask lengths
+            static = fixed size
+            uniform = sample from uniform distribution [mask_other, mask_length*2]
+            normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
+            poisson = sample from poisson distribution with lambda = mask length
+        mask_other: secondary parameter of the length distribution (see mask_type)
+        min_masks: minimum number of masked spans
+        no_overlap: if true, will switch to an alternative recursive algorithm that prevents spans from overlapping
+        min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
+    Returns:
+        boolean torch.Tensor of the given shape, True at masked positions
+    """
+    bsz, all_sz = shape
+    mask = np.full((bsz, all_sz), False)
+    # Convert mask_prob to a NumPy array
+    mask_prob = np.array(mask_prob)
+    # Calculate all_num_mask for each element in the batch
+    all_num_mask = np.floor(mask_prob * all_sz / float(mask_length) + np.random.rand(bsz)).astype(int)
+    # Apply the max operation with min_masks for each element
+    all_num_mask = np.maximum(min_masks, all_num_mask)
+    # One probability per batch row, whether mask_prob was a scalar or a sequence.
+    row_probs = np.broadcast_to(mask_prob, (bsz,))
+    mask_idcs = []
+    for i in range(bsz):
+        if padding_mask is not None:
+            sz = all_sz - padding_mask[i].long().sum().item()
+            num_mask = int(
+                # add a random number for probabilistic rounding
+                row_probs[i] * sz / float(mask_length)
+                + np.random.rand()
+            )
+            num_mask = max(min_masks, num_mask)
+        else:
+            sz = all_sz
+            num_mask = all_num_mask[i]
+        if mask_type == "static":
+            lengths = np.full(num_mask, mask_length)
+        elif mask_type == "uniform":
+            lengths = np.random.randint(mask_other, mask_length * 2 + 1, size=num_mask)
+        elif mask_type == "normal":
+            lengths = np.random.normal(mask_length, mask_other, size=num_mask)
+            lengths = [max(1, int(round(x))) for x in lengths]
+        elif mask_type == "poisson":
+            lengths = np.random.poisson(mask_length, size=num_mask)
+            lengths = [int(round(x)) for x in lengths]
+        else:
+            raise Exception("unknown mask selection " + mask_type)
+        if num_mask == 0:
+            # Nothing to mask in this row; the code below assumes at least one span.
+            mask_idcs.append(np.empty(0, dtype=int))
+            continue
+        if sum(lengths) == 0:
+            lengths[0] = min(mask_length, sz - 1)
+        if no_overlap:
+            mask_idc = []
+            def arrange(s, e, length, keep_length):
+                # randint needs low < high; a part exactly `length` long has a
+                # single possible start position.
+                span_start = np.random.randint(s, e - length) if e - length > s else s
+                mask_idc.extend(span_start + offset for offset in range(length))
+                new_parts = []
+                if span_start - s - min_space >= keep_length:
+                    new_parts.append((s, span_start - min_space + 1))
+                if e - span_start - keep_length - min_space > keep_length:
+                    new_parts.append((span_start + length + min_space, e))
+                return new_parts
+            parts = [(0, sz)]
+            min_length = min(lengths)
+            for length in sorted(lengths, reverse=True):
+                # Parts too short for this span get weight 0 and are never chosen.
+                # (The builtin int replaces np.int, which NumPy 1.24 removed.)
+                lens = np.fromiter(
+                    (e - s if e - s >= length + min_space else 0 for s, e in parts),
+                    int,
+                )
+                l_sum = np.sum(lens)
+                if l_sum == 0:
+                    break
+                probs = lens / np.sum(lens)
+                c = np.random.choice(len(parts), p=probs)
+                s, e = parts.pop(c)
+                parts.extend(arrange(s, e, length, min_length))
+            # An explicit integer dtype keeps an empty result usable as an index.
+            mask_idc = np.asarray(mask_idc, dtype=int)
+        else:
+            min_len = min(lengths)
+            if sz - min_len <= num_mask:
+                min_len = sz - num_mask - 1
+            mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
+            mask_idc = np.asarray(
+                [
+                    mask_idc[j] + offset
+                    for j in range(len(mask_idc))
+                    for offset in range(lengths[j])
+                ],
+                dtype=int,
+            )
+        # Drop indices beyond the unpadded length and merge overlapping spans.
+        mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
+    for i, mask_idc in enumerate(mask_idcs):
+        mask[i, mask_idc] = True
+    return torch.tensor(mask)
+if __name__ == '__main__':
+    # Manual smoke test: four rows of 500 steps with a per-row masking probability.
+    demo_kwargs = dict(
+        shape=[4, 500],
+        padding_mask=None,
+        mask_prob=[0.65, 0.5, 0.65, 0.65],
+        mask_length=10,
+        mask_type="static",
+        mask_other=0.0,
+        min_masks=1,
+        no_overlap=False,
+        min_space=0,
+    )
+    mask = compute_mask_indices(**demo_kwargs)
+    print(mask)
+    # Number of masked positions in each row.
+    print(mask.sum(dim=1))

src/models/utils/.ipynb_checkpoints/timm-checkpoint.py ADDED Viewed

	@@ -0,0 +1,114 @@

+# code from timm 0.3.2
+import torch
+import torch.nn as nn
+import math
+import warnings
+def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+    """
+    Fill ``tensor`` in place with samples from N(mean, std^2) restricted to [a, b].
+    Samples are produced by inverse-CDF sampling followed by a clamp, all under
+    ``torch.no_grad()``. Warns when ``mean`` lies more than 2 std outside [a, b].
+    :return: the same ``tensor`` object.
+    """
+    # Cut & paste from PyTorch official master until it's in a few official releases - RW
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    sqrt2 = math.sqrt(2.)
+    def _std_normal_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1. + math.erf(x / sqrt2)) / 2.
+    mean_far_outside = (mean < a - 2 * std) or (mean > b + 2 * std)
+    if mean_far_outside:
+        warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+                      "The distribution of values may be incorrect.",
+                      stacklevel=2)
+    with torch.no_grad():
+        # CDF values of the two cut-offs.
+        lower = _std_normal_cdf((a - mean) / std)
+        upper = _std_normal_cdf((b - mean) / std)
+        # Uniform in [2l-1, 2u-1] -> erfinv gives a truncated standard normal ->
+        # scale and shift to the requested mean/std.
+        tensor.uniform_(2 * lower - 1, 2 * upper - 1)
+        tensor.erfinv_()
+        tensor.mul_(std * sqrt2)
+        tensor.add_(mean)
+        # Clamp to ensure it's in the proper range
+        tensor.clamp_(min=a, max=b)
+    return tensor
+def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
+    # type: (Tensor, float, float, float, float) -> Tensor
+    r"""Fill ``tensor`` in place with values from a truncated normal distribution.
+    Values follow :math:`\mathcal{N}(\text{mean}, \text{std}^2)` restricted to
+    :math:`[a, b]`. The work is delegated to ``_no_grad_trunc_normal_``; the
+    method works best when :math:`a \leq \text{mean} \leq b`.
+    Args:
+        tensor: an n-dimensional `torch.Tensor`
+        mean: the mean of the normal distribution
+        std: the standard deviation of the normal distribution
+        a: the minimum cutoff value
+        b: the maximum cutoff value
+    Returns:
+        the input ``tensor``, filled in place
+    Examples:
+        >>> w = torch.empty(3, 5)
+        >>> trunc_normal_(w)
+    """
+    return _no_grad_trunc_normal_(tensor, mean=mean, std=std, a=a, b=b)
+def drop_path(x, drop_prob: float = 0., training: bool = False):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
+    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
+    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
+    'survival rate' as the argument.
+    """
+    if drop_prob == 0. or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
+    random_tensor.floor_()  # binarize
+    output = x.div(keep_prob) * random_tensor
+    return output
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
+    """
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+class Mlp(nn.Module):
+    def __init__(self, in_features, hidden_features=None, out_features=None,
+                 act_layer=nn.GELU, drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x

src/models/utils/__init__.py ADDED Viewed

File without changes

src/models/utils/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (178 Bytes). View file

src/models/utils/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (177 Bytes). View file

src/models/utils/__pycache__/attention.cpython-310.pyc ADDED Viewed

Binary file (7.61 kB). View file

src/models/utils/__pycache__/attention.cpython-311.pyc ADDED Viewed

Binary file (16.7 kB). View file

src/models/utils/__pycache__/modules.cpython-310.pyc ADDED Viewed

Binary file (13.2 kB). View file

src/models/utils/__pycache__/modules.cpython-311.pyc ADDED Viewed

Binary file (24 kB). View file

src/models/utils/__pycache__/rotary.cpython-310.pyc ADDED Viewed

Binary file (2.81 kB). View file

src/models/utils/__pycache__/rotary.cpython-311.pyc ADDED Viewed

Binary file (4.99 kB). View file

src/models/utils/__pycache__/span_mask.cpython-310.pyc ADDED Viewed

Binary file (4.75 kB). View file

src/models/utils/__pycache__/span_mask.cpython-311.pyc ADDED Viewed

Binary file (8.51 kB). View file

src/models/utils/__pycache__/timm.cpython-310.pyc ADDED Viewed

Binary file (4.22 kB). View file

src/models/utils/__pycache__/timm.cpython-311.pyc ADDED Viewed

Binary file (6.46 kB). View file

src/models/utils/attention.py ADDED Viewed

	@@ -0,0 +1,290 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import einops
+from einops import rearrange, repeat
+from inspect import isfunction
+from .rotary import RotaryEmbedding
+from .modules import RMSNorm
+if hasattr(nn.functional, 'scaled_dot_product_attention'):
+    ATTENTION_MODE = 'flash'
+else:
+    ATTENTION_MODE = 'math'
+print(f'attention mode is {ATTENTION_MODE}')
+def add_mask(sim, mask):
+    b, ndim = sim.shape[0], mask.ndim
+    if ndim == 3:
+        mask = rearrange(mask, "b n m -> b 1 n m")
+    if ndim == 2:
+        mask = repeat(mask, "n m -> b 1 n m", b=b)
+    max_neg_value = -torch.finfo(sim.dtype).max
+    sim = sim.masked_fill(~mask, max_neg_value)
+    return sim
+def create_mask(q_shape, k_shape, device, q_mask=None, k_mask=None):
+    def default(val, d):
+        return val if val is not None else (d() if isfunction(d) else d)
+    b, i, j, device = q_shape[0], q_shape[-2], k_shape[-2], device
+    q_mask = default(q_mask, torch.ones((b, i), device=device, dtype=torch.bool))
+    k_mask = default(k_mask, torch.ones((b, j), device=device, dtype=torch.bool))
+    attn_mask = rearrange(q_mask, 'b i -> b 1 i 1') * rearrange(k_mask, 'b j -> b 1 1 j')
+    return attn_mask
+class Attention(nn.Module):
+    def __init__(self, dim, context_dim=None, num_heads=8,
+                 qkv_bias=False, qk_scale=None, qk_norm=None,
+                 attn_drop=0., proj_drop=0., rope_mode='none'):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+        if context_dim is None:
+            self.cross_attn = False
+        else:
+            self.cross_attn = True
+        context_dim = dim if context_dim is None else context_dim
+        self.to_q = nn.Linear(dim, dim, bias=qkv_bias)
+        self.to_k = nn.Linear(context_dim, dim, bias=qkv_bias)
+        self.to_v = nn.Linear(context_dim, dim, bias=qkv_bias)
+        if qk_norm is None:
+            self.norm_q = nn.Identity()
+            self.norm_k = nn.Identity()
+        elif qk_norm == 'layernorm':
+            self.norm_q = nn.LayerNorm(head_dim)
+            self.norm_k = nn.LayerNorm(head_dim)
+        elif qk_norm == 'rmsnorm':
+            self.norm_q = RMSNorm(head_dim)
+            self.norm_k = RMSNorm(head_dim)
+        else:
+            raise NotImplementedError
+        self.attn_drop_p = attn_drop
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        if self.cross_attn:
+            assert rope_mode == 'none'
+        self.rope_mode = rope_mode
+        if self.rope_mode == 'shared' or self.rope_mode == 'x_only':
+            self.rotary = RotaryEmbedding(dim=head_dim)
+        elif self.rope_mode == 'dual':
+            self.rotary_x = RotaryEmbedding(dim=head_dim)
+            self.rotary_c = RotaryEmbedding(dim=head_dim)
+    def _rotary(self, q, k, extras):
+        if self.rope_mode == 'shared':
+            q, k = self.rotary(q=q, k=k)
+        elif self.rope_mode == 'x_only':
+            q_x, k_x = self.rotary(q=q[:, :, extras:, :], k=k[:, :, extras:, :])
+            q_c, k_c = q[:, :, :extras, :], k[:, :, :extras, :]
+            q = torch.cat((q_c, q_x), dim=2)
+            k = torch.cat((k_c, k_x), dim=2)
+        elif self.rope_mode == 'dual':
+            q_x, k_x = self.rotary_x(q=q[:, :, extras:, :], k=k[:, :, extras:, :])
+            q_c, k_c = self.rotary_c(q=q[:, :, :extras, :], k=k[:, :, :extras, :])
+            q = torch.cat((q_c, q_x), dim=2)
+            k = torch.cat((k_c, k_x), dim=2)
+        elif self.rope_mode == 'none':
+            pass
+        else:
+            raise NotImplementedError
+        return q, k
+    def _attn(self, q, k, v, mask_binary):
+        if ATTENTION_MODE == 'flash':
+            x = F.scaled_dot_product_attention(q, k, v,
+                                               dropout_p=self.attn_drop_p,
+                                               attn_mask=mask_binary)
+            x = einops.rearrange(x, 'B H L D -> B L (H D)')
+        elif ATTENTION_MODE == 'math':
+            attn = (q @ k.transpose(-2, -1)) * self.scale
+            attn = add_mask(attn, mask_binary) if mask_binary is not None else attn
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = (attn @ v).transpose(1, 2)
+            x = einops.rearrange(x, 'B H L D -> B L (H D)')
+        else:
+            raise NotImplementedError
+        return x
+    def forward(self, x, context=None, context_mask=None, extras=0):
+        B, L, C = x.shape
+        if context is None:
+            context = x
+        q = self.to_q(x)
+        k = self.to_k(context)
+        v = self.to_v(context)
+        if context_mask is not None:
+            mask_binary = create_mask(x.shape, context.shape,
+                                      x.device, None, context_mask)
+        else:
+            mask_binary = None
+        q = einops.rearrange(q, 'B L (H D) -> B H L D', H=self.num_heads)
+        k = einops.rearrange(k, 'B L (H D) -> B H L D', H=self.num_heads)
+        v = einops.rearrange(v, 'B L (H D) -> B H L D', H=self.num_heads)
+        q = self.norm_q(q)
+        k = self.norm_k(k)
+        q, k = self._rotary(q, k, extras)
+        x = self._attn(q, k, v, mask_binary)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class JointAttention(nn.Module):
+    def __init__(self, dim, num_heads=8,
+                 qkv_bias=False, qk_scale=None, qk_norm=None,
+                 attn_drop=0., proj_drop=0.,
+                 rope_mode='none'):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+        self.to_qx, self.to_kx, self.to_vx = self._make_qkv_layers(dim, qkv_bias)
+        self.to_qc, self.to_kc, self.to_vc = self._make_qkv_layers(dim, qkv_bias)
+        self.norm_qx, self.norm_kx = self._make_norm_layers(qk_norm, head_dim)
+        self.norm_qc, self.norm_kc = self._make_norm_layers(qk_norm, head_dim)
+        self.attn_drop_p = attn_drop
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj_x = nn.Linear(dim, dim)
+        self.proj_drop_x = nn.Dropout(proj_drop)
+        self.proj_c = nn.Linear(dim, dim)
+        self.proj_drop_c = nn.Dropout(proj_drop)
+        self.rope_mode = rope_mode
+        if self.rope_mode == 'shared' or self.rope_mode == 'x_only':
+            self.rotary = RotaryEmbedding(dim=head_dim)
+        elif self.rope_mode == 'dual':
+            self.rotary_x = RotaryEmbedding(dim=head_dim)
+            self.rotary_c = RotaryEmbedding(dim=head_dim)
+    def _make_qkv_layers(self, dim, qkv_bias):
+        return (nn.Linear(dim, dim, bias=qkv_bias),
+                nn.Linear(dim, dim, bias=qkv_bias),
+                nn.Linear(dim, dim, bias=qkv_bias))
+    def _make_norm_layers(self, qk_norm, head_dim):
+        if qk_norm is None:
+            norm_q = nn.Identity()
+            norm_k = nn.Identity()
+        elif qk_norm == 'layernorm':
+            norm_q = nn.LayerNorm(head_dim)
+            norm_k = nn.LayerNorm(head_dim)
+        elif qk_norm == 'rmsnorm':
+            norm_q = RMSNorm(head_dim)
+            norm_k = RMSNorm(head_dim)
+        else:
+            raise NotImplementedError
+        return norm_q, norm_k
+    def _rotary(self, q, k, extras):
+        if self.rope_mode == 'shared':
+            q, k = self.rotary(q=q, k=k)
+        elif self.rope_mode == 'x_only':
+            q_x, k_x = self.rotary(q=q[:, :, extras:, :], k=k[:, :, extras:, :])
+            q_c, k_c = q[:, :, :extras, :], k[:, :, :extras, :]
+            q = torch.cat((q_c, q_x), dim=2)
+            k = torch.cat((k_c, k_x), dim=2)
+        elif self.rope_mode == 'dual':
+            q_x, k_x = self.rotary_x(q=q[:, :, extras:, :], k=k[:, :, extras:, :])
+            q_c, k_c = self.rotary_c(q=q[:, :, :extras, :], k=k[:, :, :extras, :])
+            q = torch.cat((q_c, q_x), dim=2)
+            k = torch.cat((k_c, k_x), dim=2)
+        elif self.rope_mode == 'none':
+            pass
+        else:
+            raise NotImplementedError
+        return q, k
+    def _attn(self, q, k, v, mask_binary):
+        if ATTENTION_MODE == 'flash':
+            x = F.scaled_dot_product_attention(q, k, v,
+                                               dropout_p=self.attn_drop_p,
+                                               attn_mask=mask_binary)
+            x = einops.rearrange(x, 'B H L D -> B L (H D)')
+        elif ATTENTION_MODE == 'math':
+            attn = (q @ k.transpose(-2, -1)) * self.scale
+            attn = add_mask(attn, mask_binary) if mask_binary is not None else attn
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = (attn @ v).transpose(1, 2)
+            x = einops.rearrange(x, 'B H L D -> B L (H D)')
+        else:
+            raise NotImplementedError
+        return x
+    def _cat_mask(self, x, context, x_mask=None, context_mask=None):
+        B = x.shape[0]
+        if x_mask is None:
+            x_mask = torch.ones(B, x.shape[-2], device=x.device).bool()
+        if context_mask is None:
+            context_mask = torch.ones(B, context.shape[-2], device=context.device).bool()
+        mask = torch.cat([context_mask, x_mask], dim=1)
+        return mask
+    def forward(self, x, context, x_mask=None, context_mask=None, extras=0):
+        B, Lx, C = x.shape
+        _, Lc, _ = context.shape
+        if x_mask is not None or context_mask is not None:
+            mask = self._cat_mask(x, context,
+                                  x_mask=x_mask,
+                                  context_mask=context_mask)
+            shape = [B, Lx+Lc, C]
+            mask_binary = create_mask(q_shape=shape, k_shape=shape,
+                                      device=x.device,
+                                      q_mask=None, k_mask=mask)
+        else:
+            mask_binary = None
+        qx, kx, vx = self.to_qx(x), self.to_kx(x), self.to_vx(x)
+        qc, kc, vc = self.to_qc(context), self.to_kc(context), self.to_vc(context)
+        qx, kx, vx = map(lambda t: einops.rearrange(t, 'B L (H D) -> B H L D',
+                                                    H=self.num_heads), [qx, kx, vx])
+        qc, kc, vc = map(lambda t: einops.rearrange(t, 'B L (H D) -> B H L D',
+                                                    H=self.num_heads), [qc, kc, vc])
+        qx, kx = self.norm_qx(qx), self.norm_kx(kx)
+        qc, kc = self.norm_qc(qc), self.norm_kc(kc)
+        q, k, v = (torch.cat([qc, qx], dim=2),
+                   torch.cat([kc, kx], dim=2),
+                   torch.cat([vc, vx], dim=2))
+        q, k = self._rotary(q, k, extras)
+        x = self._attn(q, k, v, mask_binary)
+        context, x = x[:, :Lc, :], x[:, Lc:, :]
+        x = self.proj_x(x)
+        x = self.proj_drop_x(x)
+        context = self.proj_c(context)
+        context = self.proj_drop_c(context)
+        return x, context

src/models/utils/bk/.ipynb_checkpoints/attention-checkpoint.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+import einops
+from einops import rearrange, repeat
+from inspect import isfunction
+from .rotary import RotaryEmbedding
+if hasattr(nn.functional, 'scaled_dot_product_attention'):
+    ATTENTION_MODE = 'flash'
+else:
+    ATTENTION_MODE = 'math'
+print(f'attention mode is {ATTENTION_MODE}')
+def add_mask(sim, mask):
+    b, ndim = sim.shape[0], mask.ndim
+    if ndim == 3:
+        mask = rearrange(mask, "b n m -> b 1 n m")
+    if ndim == 2:
+        mask = repeat(mask, "n m -> b 1 n m", b=b)
+    max_neg_value = -torch.finfo(sim.dtype).max
+    sim = sim.masked_fill(~mask, max_neg_value)
+    return sim
+def create_mask(q, k, q_mask=None, k_mask=None):
+    def default(val, d):
+        return val if val is not None else (d() if isfunction(d) else d)
+    b, i, j, device = q.shape[0], q.shape[-2], k.shape[-2], q.device
+    q_mask = default(q_mask, torch.ones((b, i), device=device, dtype=torch.bool))
+    k_mask = default(k_mask, torch.ones((b, j), device=device, dtype=torch.bool))
+    attn_mask = rearrange(q_mask, 'b i -> b 1 i 1') * rearrange(k_mask, 'b j -> b 1 1 j')
+    return attn_mask
+class Attention(nn.Module):
+    def __init__(self, dim, context_dim=None, num_heads=8, qkv_bias=False, qk_scale=None,
+                 attn_drop=0., proj_drop=0., use_rope=False):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+        context_dim = dim if context_dim is None else context_dim
+        self.to_q = nn.Linear(dim, dim, bias=qkv_bias)
+        self.to_k = nn.Linear(context_dim, dim, bias=qkv_bias)
+        self.to_v = nn.Linear(context_dim, dim, bias=qkv_bias)
+        self.attn_drop_p = attn_drop
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.use_rope = use_rope
+        if self.use_rope:
+            self.rotary = RotaryEmbedding(dim=head_dim)
+    def forward(self, x, context=None, context_mask=None):
+        B, L, C = x.shape
+        q = self.to_q(x)
+        if context is None:
+            context = x
+        else:
+            assert self.use_rope is False
+        k = self.to_k(context)
+        v = self.to_v(context)
+        if context_mask is not None:
+            mask_binary = create_mask(x, context, None, context_mask)
+        else:
+            mask_binary = None
+        q = einops.rearrange(q, 'B L (H D) -> B H L D', H=self.num_heads).float()
+        k = einops.rearrange(k, 'B L (H D) -> B H L D', H=self.num_heads).float()
+        v = einops.rearrange(v, 'B L (H D) -> B H L D', H=self.num_heads).float()
+        if self.use_rope:
+            q, k = self.rotary(q=q, k=k)
+        if ATTENTION_MODE == 'flash':
+            x = torch.nn.functional.scaled_dot_product_attention(q, k, v,
+                                                                 dropout_p=self.attn_drop_p,
+                                                                 attn_mask=mask_binary)
+            x = einops.rearrange(x, 'B H L D -> B L (H D)')
+        elif ATTENTION_MODE == 'math':
+            attn = (q @ k.transpose(-2, -1)) * self.scale
+            attn = add_mask(attn, mask_binary) if mask_binary is not None else attn
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = (attn @ v).transpose(1, 2).reshape(B, L, C)
+        else:
+            raise NotImplementedError
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x

src/models/utils/bk/.ipynb_checkpoints/llama_rotary-checkpoint.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import torch
+from typing import Tuple
+from rotary import RotaryEmbedding
+import time
+def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+    t = torch.arange(end, device=freqs.device, dtype=torch.float32)
+    freqs = torch.outer(t, freqs)
+    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64
+    return freqs_cis
+def reshape_for_broadcast(freqs_cis: torch.Tensor,
+                          x: torch.Tensor,):
+    ndim = x.ndim
+    assert 0 <= 1 < ndim
+    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
+    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
+    return freqs_cis.view(*shape)
+def compute_rope(q, freqs_cis):
+    return q * freqs_cis
+def apply_rotary_emb(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+    # xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+    xq1, xq2 = xq.chunk(2, dim=-1)
+    xq_ = torch.view_as_complex(torch.stack((xq1, xq2), dim=-1).float())
+    xk1, xk2 = xk.chunk(2, dim=-1)
+    xk_ = torch.view_as_complex(torch.stack((xk1, xk2), dim=-1).float())
+    freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
+    xq_out = torch.view_as_real(compute_rope(xq_, freqs_cis)).flatten(3)
+    xk_out = torch.view_as_real(compute_rope(xk_, freqs_cis)).flatten(3)
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+if __name__ == '__main__':
+    # Ad-hoc benchmark: time apply_rotary_emb against RotaryEmbedding on the
+    # same random inputs, then print one output of each. Requires a CUDA device.
+    # NOTE(review): no torch.cuda.synchronize() is called around the timed loops;
+    # if the kernels run asynchronously the wall-clock numbers may not reflect
+    # actual GPU time - confirm before relying on them.
+    # Move data to CUDA
+    freq_cis = precompute_freqs_cis(4, 5).cuda()
+    x = torch.rand(1, 5, 1, 4).cuda()
+    y = torch.rand(1, 5, 1, 4).cuda()
+    # First method
+    start_time = time.time()
+    for _ in range(20000):
+        x1, y1 = apply_rotary_emb(x, y, freq_cis)
+    end_time = time.time()
+    print(f"Method 1 time cost: {end_time - start_time} seconds")
+    # Prepare data for the second method: swap axes 1 and 2 of both inputs
+    x = x.permute(0, 2, 1, 3)
+    y = y.permute(0, 2, 1, 3)
+    rope = RotaryEmbedding(4).cuda()
+    # Second method
+    start_time = time.time()
+    for _ in range(20000):
+        x2, y2 = rope(x, y)
+    end_time = time.time()
+    print(f"Method 2 time cost: {end_time - start_time} seconds")
+    # Print the results
+    print(x1)
+    print(x2)

src/models/utils/bk/__pycache__/rotary.cpython-311.pyc ADDED Viewed

Binary file (4.8 kB). View file

src/models/utils/bk/attention.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+import einops
+from einops import rearrange, repeat
+from inspect import isfunction
+from .rotary import RotaryEmbedding
+if hasattr(nn.functional, 'scaled_dot_product_attention'):
+    ATTENTION_MODE = 'flash'
+else:
+    ATTENTION_MODE = 'math'
+print(f'attention mode is {ATTENTION_MODE}')
+def add_mask(sim, mask):
+    b, ndim = sim.shape[0], mask.ndim
+    if ndim == 3:
+        mask = rearrange(mask, "b n m -> b 1 n m")
+    if ndim == 2:
+        mask = repeat(mask, "n m -> b 1 n m", b=b)
+    max_neg_value = -torch.finfo(sim.dtype).max
+    sim = sim.masked_fill(~mask, max_neg_value)
+    return sim
+def create_mask(q, k, q_mask=None, k_mask=None):
+    def default(val, d):
+        return val if val is not None else (d() if isfunction(d) else d)
+    b, i, j, device = q.shape[0], q.shape[-2], k.shape[-2], q.device
+    q_mask = default(q_mask, torch.ones((b, i), device=device, dtype=torch.bool))
+    k_mask = default(k_mask, torch.ones((b, j), device=device, dtype=torch.bool))
+    attn_mask = rearrange(q_mask, 'b i -> b 1 i 1') * rearrange(k_mask, 'b j -> b 1 1 j')
+    return attn_mask
+class Attention(nn.Module):
+    def __init__(self, dim, context_dim=None, num_heads=8, qkv_bias=False, qk_scale=None,
+                 attn_drop=0., proj_drop=0., use_rope=False):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+        context_dim = dim if context_dim is None else context_dim
+        self.to_q = nn.Linear(dim, dim, bias=qkv_bias)
+        self.to_k = nn.Linear(context_dim, dim, bias=qkv_bias)
+        self.to_v = nn.Linear(context_dim, dim, bias=qkv_bias)
+        self.attn_drop_p = attn_drop
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.use_rope = use_rope
+        if self.use_rope:
+            self.rotary = RotaryEmbedding(dim=head_dim)
+    def forward(self, x, context=None, context_mask=None):
+        B, L, C = x.shape
+        q = self.to_q(x)
+        if context is None:
+            context = x
+        else:
+            assert self.use_rope is False
+        k = self.to_k(context)
+        v = self.to_v(context)
+        if context_mask is not None:
+            mask_binary = create_mask(x, context, None, context_mask)
+        else:
+            mask_binary = None
+        q = einops.rearrange(q, 'B L (H D) -> B H L D', H=self.num_heads).float()
+        k = einops.rearrange(k, 'B L (H D) -> B H L D', H=self.num_heads).float()
+        v = einops.rearrange(v, 'B L (H D) -> B H L D', H=self.num_heads).float()
+        if self.use_rope:
+            q, k = self.rotary(q=q, k=k)
+        if ATTENTION_MODE == 'flash':
+            x = torch.nn.functional.scaled_dot_product_attention(q, k, v,
+                                                                 dropout_p=self.attn_drop_p,
+                                                                 attn_mask=mask_binary)
+            x = einops.rearrange(x, 'B H L D -> B L (H D)')
+        elif ATTENTION_MODE == 'math':
+            attn = (q @ k.transpose(-2, -1)) * self.scale
+            attn = add_mask(attn, mask_binary) if mask_binary is not None else attn
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = (attn @ v).transpose(1, 2).reshape(B, L, C)
+        else:
+            raise NotImplementedError
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x

src/models/utils/bk/llama_rotary.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import torch
+from typing import Tuple
+from rotary import RotaryEmbedding
+import time
+def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+    t = torch.arange(end, device=freqs.device, dtype=torch.float32)
+    freqs = torch.outer(t, freqs)
+    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64
+    return freqs_cis
+def reshape_for_broadcast(freqs_cis: torch.Tensor,
+                          x: torch.Tensor,):
+    ndim = x.ndim
+    assert 0 <= 1 < ndim
+    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
+    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
+    return freqs_cis.view(*shape)
+def compute_rope(q, freqs_cis):
+    return q * freqs_cis
+def apply_rotary_emb(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+    # xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+    xq1, xq2 = xq.chunk(2, dim=-1)
+    xq_ = torch.view_as_complex(torch.stack((xq1, xq2), dim=-1).float())
+    xk1, xk2 = xk.chunk(2, dim=-1)
+    xk_ = torch.view_as_complex(torch.stack((xk1, xk2), dim=-1).float())
+    freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
+    xq_out = torch.view_as_real(compute_rope(xq_, freqs_cis)).flatten(3)
+    xk_out = torch.view_as_real(compute_rope(xk_, freqs_cis)).flatten(3)
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+if __name__ == '__main__':
+    # Ad-hoc benchmark: time apply_rotary_emb against RotaryEmbedding on the
+    # same random inputs, then print one output of each. Requires a CUDA device.
+    # NOTE(review): no torch.cuda.synchronize() is called around the timed loops;
+    # if the kernels run asynchronously the wall-clock numbers may not reflect
+    # actual GPU time - confirm before relying on them.
+    # Move data to CUDA
+    freq_cis = precompute_freqs_cis(4, 5).cuda()
+    x = torch.rand(1, 5, 1, 4).cuda()
+    y = torch.rand(1, 5, 1, 4).cuda()
+    # First method
+    start_time = time.time()
+    for _ in range(20000):
+        x1, y1 = apply_rotary_emb(x, y, freq_cis)
+    end_time = time.time()
+    print(f"Method 1 time cost: {end_time - start_time} seconds")
+    # Prepare data for the second method: swap axes 1 and 2 of both inputs
+    x = x.permute(0, 2, 1, 3)
+    y = y.permute(0, 2, 1, 3)
+    rope = RotaryEmbedding(4).cuda()
+    # Second method
+    start_time = time.time()
+    for _ in range(20000):
+        x2, y2 = rope(x, y)
+    end_time = time.time()
+    print(f"Method 2 time cost: {end_time - start_time} seconds")
+    # Print the results
+    print(x1)
+    print(x2)

src/models/utils/modules.py ADDED Viewed

	@@ -0,0 +1,374 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch.cuda.amp import autocast
+import math
+import einops
+from einops import rearrange, repeat
+from inspect import isfunction
+from .timm import trunc_normal_
+# disable in checkpoint mode
+# @torch.jit.script
+def film_modulate(x, shift, scale):
+    return x * (1 + scale) + shift
+def timestep_embedding(timesteps, dim, max_period=10000):
+    """
+    Create sinusoidal timestep embeddings.
+    :param timesteps: a 1-D Tensor of N indices, one per batch element.
+                      These may be fractional.
+    :param dim: the dimension of the output.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :return: an [N x dim] Tensor of positional embeddings.
+    """
+    half = dim // 2
+    freqs = torch.exp(
+        -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+    ).to(device=timesteps.device)
+    args = timesteps[:, None].float() * freqs[None]
+    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+    if dim % 2:
+        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    return embedding
+class TimestepEmbedder(nn.Module):
+    """
+    Embeds scalar timesteps into vector representations.
+    """
+    def __init__(self, hidden_size, frequency_embedding_size=256,
+                 out_size=None):
+        super().__init__()
+        if out_size is None:
+            out_size = hidden_size
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, out_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+    def forward(self, t):
+        t_freq = timestep_embedding(t, self.frequency_embedding_size).type(
+            self.mlp[0].weight.dtype)
+        t_emb = self.mlp(t_freq)
+        return t_emb
+def patchify(imgs, patch_size, input_type='2d'):
+    if input_type == '2d':
+        x = einops.rearrange(imgs, 'B C (h p1) (w p2) -> B (h w) (p1 p2 C)', p1=patch_size, p2=patch_size)
+    elif input_type == '1d':
+        x = einops.rearrange(imgs, 'B C (h p1) -> B h (p1 C)', p1=patch_size)
+    return x
+def unpatchify(x, channels=3, input_type='2d', img_size=None):
+    if input_type == '2d':
+        patch_size = int((x.shape[2] // channels) ** 0.5)
+        # h = w = int(x.shape[1] ** .5)
+        h, w = img_size[0] // patch_size, img_size[1] // patch_size
+        assert h * w == x.shape[1] and patch_size ** 2 * channels == x.shape[2]
+        x = einops.rearrange(x, 'B (h w) (p1 p2 C) -> B C (h p1) (w p2)', h=h,
+                             p1=patch_size, p2=patch_size)
+    elif input_type == '1d':
+        patch_size = int((x.shape[2] // channels))
+        h = x.shape[1]
+        assert patch_size * channels == x.shape[2]
+        x = einops.rearrange(x, 'B h (p1 C) -> B C (h p1)', h=h, p1=patch_size)
+    return x
+class PatchEmbed(nn.Module):
+    """
+     Image to Patch Embedding
+    """
+    def __init__(self, patch_size, in_chans=3, embed_dim=768, input_type='2d'):
+        super().__init__()
+        self.patch_size = patch_size
+        self.input_type = input_type
+        if input_type == '2d':
+            self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=True)
+        elif input_type == '1d':
+            self.proj = nn.Conv1d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=True)
+    def forward(self, x):
+        if self.input_type == '2d':
+            B, C, H, W = x.shape
+            assert H % self.patch_size == 0 and W % self.patch_size == 0
+        elif self.input_type == '1d':
+            B, C, H = x.shape
+            assert H % self.patch_size == 0
+        x = self.proj(x).flatten(2).transpose(1, 2)
+        return x
+class PositionalConvEmbedding(nn.Module):
+    """
+    Relative positional embedding used in HuBERT
+    """
+    def __init__(self, dim=768, kernel_size=128, groups=16):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            dim,
+            dim,
+            kernel_size=kernel_size,
+            padding=kernel_size // 2,
+            groups=groups,
+            bias=True
+        )
+        self.conv = nn.utils.parametrizations.weight_norm(self.conv, name="weight", dim=2)
+    def forward(self, x):
+        # B C T
+        x = self.conv(x)
+        x = F.gelu(x[:, :, :-1])
+        return x
+class SinusoidalPositionalEncoding(nn.Module):
+    def __init__(self, dim, length):
+        super(SinusoidalPositionalEncoding, self).__init__()
+        self.length = length
+        self.dim = dim
+        self.register_buffer('pe', self._generate_positional_encoding(length, dim))
+    def _generate_positional_encoding(self, length, dim):
+        pe = torch.zeros(length, dim)
+        position = torch.arange(0, length, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, dim, 2).float() * (-math.log(10000.0) / dim))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0)
+        return pe
+    def forward(self, x):
+        x = x + self.pe[:, :x.size(1)]
+        return x
+class PE_wrapper(nn.Module):
+    def __init__(self, dim=768, method='abs', length=None, **kwargs):
+        super().__init__()
+        self.method = method
+        if method == 'abs':
+            # init absolute pe like UViT
+            self.length = length
+            self.abs_pe = nn.Parameter(torch.zeros(1, length, dim))
+            trunc_normal_(self.abs_pe, std=.02)
+        elif method == 'conv':
+            self.conv_pe = PositionalConvEmbedding(dim=dim, **kwargs)
+        elif method == 'sinu':
+            self.sinu_pe = SinusoidalPositionalEncoding(dim=dim, length=length)
+        elif method == 'none':
+            # skip pe
+            self.id = nn.Identity()
+        else:
+            raise NotImplementedError
+    def forward(self, x):
+        if self.method == 'abs':
+            _, L, _ = x.shape
+            assert L <= self.length
+            x = x + self.abs_pe[:, :L, :]
+        elif self.method == 'conv':
+            x = x + self.conv_pe(x)
+        elif self.method == 'sinu':
+            x = self.sinu_pe(x)
+        elif self.method == 'none':
+            x = self.id(x)
+        else:
+            raise NotImplementedError
+        return x
+class RMSNorm(torch.nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        """
+        Initialize the RMSNorm normalization layer.
+        Args:
+            dim (int): The dimension of the input tensor.
+            eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
+        Attributes:
+            eps (float): A small value added to the denominator for numerical stability.
+            weight (nn.Parameter): Learnable scaling parameter.
+        """
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+    def _norm(self, x):
+        """
+        Apply the RMSNorm normalization to the input tensor.
+        Args:
+            x (torch.Tensor): The input tensor.
+        Returns:
+            torch.Tensor: The normalized tensor.
+        """
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+    def forward(self, x):
+        """
+        Forward pass through the RMSNorm layer.
+        Args:
+            x (torch.Tensor): The input tensor.
+        Returns:
+            torch.Tensor: The output tensor after applying RMSNorm.
+        """
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+class GELU(nn.Module):
+    def __init__(self, dim_in: int, dim_out: int, approximate: str = "none",
+                 bias: bool = True):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
+        self.approximate = approximate
+    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
+        if gate.device.type != "mps":
+            return F.gelu(gate, approximate=self.approximate)
+        # mps: gelu is not implemented for float16
+        return F.gelu(gate.to(dtype=torch.float32),
+                      approximate=self.approximate).to(dtype=gate.dtype)
+    def forward(self, hidden_states):
+        hidden_states = self.proj(hidden_states)
+        hidden_states = self.gelu(hidden_states)
+        return hidden_states
+class GEGLU(nn.Module):
+    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
+    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
+        if gate.device.type != "mps":
+            return F.gelu(gate)
+        # mps: gelu is not implemented for float16
+        return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
+    def forward(self, hidden_states):
+        hidden_states = self.proj(hidden_states)
+        hidden_states, gate = hidden_states.chunk(2, dim=-1)
+        return hidden_states * self.gelu(gate)
+class ApproximateGELU(nn.Module):
+    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.proj(x)
+        return x * torch.sigmoid(1.702 * x)
+# disable in checkpoint mode
+# @torch.jit.script
+def snake_beta(x, alpha, beta):
+    return x + beta * torch.sin(x * alpha).pow(2)
+class Snake(nn.Module):
+    def __init__(self, dim_in, dim_out, bias,
+                 alpha_trainable=True):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
+        self.alpha = nn.Parameter(torch.ones(1, 1, dim_out))
+        self.beta = nn.Parameter(torch.ones(1, 1, dim_out))
+        self.alpha.requires_grad = alpha_trainable
+        self.beta.requires_grad = alpha_trainable
+    def forward(self, x):
+        x = self.proj(x)
+        x = snake_beta(x, self.alpha, self.beta)
+        return x
+class GESnake(nn.Module):
+    def __init__(self, dim_in, dim_out, bias,
+                 alpha_trainable=True):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
+        self.alpha = nn.Parameter(torch.ones(1, 1, dim_out))
+        self.beta = nn.Parameter(torch.ones(1, 1, dim_out))
+        self.alpha.requires_grad = alpha_trainable
+        self.beta.requires_grad = alpha_trainable
+    def forward(self, x):
+        x = self.proj(x)
+        x, gate = x.chunk(2, dim=-1)
+        return x * snake_beta(gate, self.alpha, self.beta)
+class FeedForward(nn.Module):
+    def __init__(
+        self,
+        dim,
+        dim_out=None,
+        mult=4,
+        dropout=0.0,
+        activation_fn="geglu",
+        final_dropout=False,
+        inner_dim=None,
+        bias=True,
+    ):
+        super().__init__()
+        if inner_dim is None:
+            inner_dim = int(dim * mult)
+        dim_out = dim_out if dim_out is not None else dim
+        if activation_fn == "gelu":
+            act_fn = GELU(dim, inner_dim, bias=bias)
+        elif activation_fn == "gelu-approximate":
+            act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
+        elif activation_fn == "geglu":
+            act_fn = GEGLU(dim, inner_dim, bias=bias)
+        elif activation_fn == "geglu-approximate":
+            act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
+        elif activation_fn == "snake":
+            act_fn = Snake(dim, inner_dim, bias=bias)
+        elif activation_fn == "gesnake":
+            act_fn = GESnake(dim, inner_dim, bias=bias)
+        else:
+            raise NotImplementedError
+        self.net = nn.ModuleList([])
+        # project in
+        self.net.append(act_fn)
+        # project dropout
+        self.net.append(nn.Dropout(dropout))
+        # project out
+        self.net.append(nn.Linear(inner_dim, dim_out, bias=bias))
+        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
+        if final_dropout:
+            self.net.append(nn.Dropout(dropout))
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        for module in self.net:
+            hidden_states = module(hidden_states)
+        return hidden_states

src/models/utils/rotary.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import torch
+"this rope is faster than llama rope with jit script"
+def rotate_half(x):
+    x1, x2 = x.chunk(2, dim=-1)
+    return torch.cat((-x2, x1), dim=-1)
+# disable in checkpoint mode
+# @torch.jit.script
+def apply_rotary_pos_emb(x, cos, sin):
+    # NOTE: This could probably be moved to Triton
+    # Handle a possible sequence length mismatch in between q and k
+    cos = cos[:, :, : x.shape[-2], :]
+    sin = sin[:, :, : x.shape[-2], :]
+    return (x * cos) + (rotate_half(x) * sin)
+class RotaryEmbedding(torch.nn.Module):
+    """
+    The rotary position embeddings from RoFormer_ (Su et. al).
+    A crucial insight from the method is that the query and keys are
+    transformed by rotation matrices which depend on the relative positions.
+    Other implementations are available in the Rotary Transformer repo_ and in
+    GPT-NeoX_, GPT-NeoX was an inspiration
+    .. _RoFormer: https://arxiv.org/abs/2104.09864
+    .. _repo: https://github.com/ZhuiyiTechnology/roformer
+    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
+    .. warning: Please note that this embedding is not registered on purpose, as it is transformative
+        (it does not create the embedding dimension) and will likely be picked up (imported) on a ad-hoc basis
+    """
+    def __init__(self, dim: int):
+        super().__init__()
+        # Generate and save the inverse frequency buffer (non trainable)
+        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer("inv_freq", inv_freq)
+        self._seq_len_cached = None
+        self._cos_cached = None
+        self._sin_cached = None
+    def _update_cos_sin_tables(self, x, seq_dimension=-2):
+        # expect input: B, H, L, D
+        seq_len = x.shape[seq_dimension]
+        # Reset the tables if the sequence length has changed,
+        # or if we're on a new device (possibly due to tracing for instance)
+        # also make sure dtype wont change
+        if (
+            seq_len != self._seq_len_cached
+            or self._cos_cached.device != x.device
+            or self._cos_cached.dtype != x.dtype
+        ):
+            self._seq_len_cached = seq_len
+            t = torch.arange(
+                x.shape[seq_dimension], device=x.device, dtype=torch.float32
+            )
+            freqs = torch.einsum("i,j->ij", t, self.inv_freq.to(x.dtype))
+            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+            self._cos_cached = emb.cos()[None, None, :, :].to(x.dtype)
+            self._sin_cached = emb.sin()[None, None, :, :].to(x.dtype)
+        return self._cos_cached, self._sin_cached
+    def forward(self, q, k):
+        self._cos_cached, self._sin_cached = self._update_cos_sin_tables(
+            q.float(), seq_dimension=-2
+        )
+        if k is not None:
+            return (
+                apply_rotary_pos_emb(q.float(),
+                                     self._cos_cached,
+                                     self._sin_cached).type_as(q),
+                apply_rotary_pos_emb(k.float(),
+                                     self._cos_cached,
+                                     self._sin_cached).type_as(k),
+            )
+        else:
+            return (
+                apply_rotary_pos_emb(q.float(),
+                                     self._cos_cached,
+                                     self._sin_cached).type_as(q),
+                None
+            )

src/models/utils/span_mask.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import numpy as np
+import torch
+from typing import Optional, Tuple
+def compute_mask_indices(
+    shape: Tuple[int, int],
+    padding_mask: Optional[torch.Tensor],
+    mask_prob: float,
+    mask_length: int,
+    mask_type: str = "static",
+    mask_other: float = 0.0,
+    min_masks: int = 0,
+    no_overlap: bool = False,
+    min_space: int = 0,
+) -> np.ndarray:
+    """
+    Computes random mask spans for a given shape
+    Args:
+        shape: the the shape for which to compute masks.
+            should be of size 2 where first element is batch size and 2nd is timesteps
+        padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
+        mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
+            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
+            however due to overlaps, the actual number will be smaller (unless no_overlap is True)
+        mask_type: how to compute mask lengths
+            static = fixed size
+            uniform = sample from uniform distribution [mask_other, mask_length*2]
+            normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
+            poisson = sample from possion distribution with lambda = mask length
+        min_masks: minimum number of masked spans
+        no_overlap: if false, will switch to an alternative recursive algorithm that prevents spans from overlapping
+        min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
+    """
+    bsz, all_sz = shape
+    mask = np.full((bsz, all_sz), False)
+    # Convert mask_prob to a NumPy array
+    mask_prob = np.array(mask_prob)
+    # Calculate all_num_mask for each element in the batch
+    all_num_mask = np.floor(mask_prob * all_sz / float(mask_length) + np.random.rand(bsz)).astype(int)
+    # Apply the max operation with min_masks for each element
+    all_num_mask = np.maximum(min_masks, all_num_mask)
+    mask_idcs = []
+    for i in range(bsz):
+        if padding_mask is not None:
+            sz = all_sz - padding_mask[i].long().sum().item()
+            num_mask = int(
+                # add a random number for probabilistic rounding
+                mask_prob * sz / float(mask_length)
+                + np.random.rand()
+            )
+            num_mask = max(min_masks, num_mask)
+        else:
+            sz = all_sz
+            num_mask = all_num_mask[i]
+        if mask_type == "static":
+            lengths = np.full(num_mask, mask_length)
+        elif mask_type == "uniform":
+            lengths = np.random.randint(mask_other, mask_length * 2 + 1, size=num_mask)
+        elif mask_type == "normal":
+            lengths = np.random.normal(mask_length, mask_other, size=num_mask)
+            lengths = [max(1, int(round(x))) for x in lengths]
+        elif mask_type == "poisson":
+            lengths = np.random.poisson(mask_length, size=num_mask)
+            lengths = [int(round(x)) for x in lengths]
+        else:
+            raise Exception("unknown mask selection " + mask_type)
+        if sum(lengths) == 0:
+            lengths[0] = min(mask_length, sz - 1)
+        if no_overlap:
+            mask_idc = []
+            def arrange(s, e, length, keep_length):
+                span_start = np.random.randint(s, e - length)
+                mask_idc.extend(span_start + i for i in range(length))
+                new_parts = []
+                if span_start - s - min_space >= keep_length:
+                    new_parts.append((s, span_start - min_space + 1))
+                if e - span_start - keep_length - min_space > keep_length:
+                    new_parts.append((span_start + length + min_space, e))
+                return new_parts
+            parts = [(0, sz)]
+            min_length = min(lengths)
+            for length in sorted(lengths, reverse=True):
+                lens = np.fromiter(
+                    (e - s if e - s >= length + min_space else 0 for s, e in parts),
+                    np.int,
+                )
+                l_sum = np.sum(lens)
+                if l_sum == 0:
+                    break
+                probs = lens / np.sum(lens)
+                c = np.random.choice(len(parts), p=probs)
+                s, e = parts.pop(c)
+                parts.extend(arrange(s, e, length, min_length))
+            mask_idc = np.asarray(mask_idc)
+        else:
+            min_len = min(lengths)
+            if sz - min_len <= num_mask:
+                min_len = sz - num_mask - 1
+            mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
+            mask_idc = np.asarray(
+                [
+                    mask_idc[j] + offset
+                    for j in range(len(mask_idc))
+                    for offset in range(lengths[j])
+                ]
+            )
+        mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
+    # min_len = min([len(m) for m in mask_idcs])
+    for i, mask_idc in enumerate(mask_idcs):
+        # if len(mask_idc) > min_len:
+            # mask_idc = np.random.choice(mask_idc, min_len, replace=False)
+        mask[i, mask_idc] = True
+    return torch.tensor(mask)
+if __name__ == '__main__':
+    mask = compute_mask_indices(
+        shape=[4, 500],
+        padding_mask=None,
+        mask_prob=[0.65, 0.5, 0.65, 0.65],
+        mask_length=10,
+        mask_type="static",
+        mask_other=0.0,
+        min_masks=1,
+        no_overlap=False,
+        min_space=0,
+    )
+    print(mask)
+    print(mask.sum(dim=1))

src/models/utils/timm.py ADDED Viewed

	@@ -0,0 +1,114 @@

+# code from timm 0.3.2
+import torch
+import torch.nn as nn
+import math
+import warnings
+def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+    # Cut & paste from PyTorch official master until it's in a few official releases - RW
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    def norm_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1. + math.erf(x / math.sqrt(2.))) / 2.
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+                      "The distribution of values may be incorrect.",
+                      stacklevel=2)
+    with torch.no_grad():
+        # Values are generated by using a truncated uniform distribution and
+        # then using the inverse CDF for the normal distribution.
+        # Get upper and lower cdf values
+        l = norm_cdf((a - mean) / std)
+        u = norm_cdf((b - mean) / std)
+        # Uniformly fill tensor with values from [l, u], then translate to
+        # [2l-1, 2u-1].
+        tensor.uniform_(2 * l - 1, 2 * u - 1)
+        # Use inverse cdf transform for normal distribution to get truncated
+        # standard normal
+        tensor.erfinv_()
+        # Transform to proper mean, std
+        tensor.mul_(std * math.sqrt(2.))
+        tensor.add_(mean)
+        # Clamp to ensure it's in the proper range
+        tensor.clamp_(min=a, max=b)
+        return tensor
+def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
+    # type: (Tensor, float, float, float, float) -> Tensor
+    r"""Fills the input Tensor with values drawn from a truncated
+    normal distribution. The values are effectively drawn from the
+    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
+    with values outside :math:`[a, b]` redrawn until they are within
+    the bounds. The method used for generating the random values works
+    best when :math:`a \leq \text{mean} \leq b`.
+    Args:
+        tensor: an n-dimensional `torch.Tensor`
+        mean: the mean of the normal distribution
+        std: the standard deviation of the normal distribution
+        a: the minimum cutoff value
+        b: the maximum cutoff value
+    Examples:
+        >>> w = torch.empty(3, 5)
+        >>> nn.init.trunc_normal_(w)
+    """
+    return _no_grad_trunc_normal_(tensor, mean, std, a, b)
+def drop_path(x, drop_prob: float = 0., training: bool = False):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
+    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
+    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
+    'survival rate' as the argument.
+    """
+    if drop_prob == 0. or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
+    random_tensor.floor_()  # binarize
+    output = x.div(keep_prob) * random_tensor
+    return output
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
+    """
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+class Mlp(nn.Module):
+    def __init__(self, in_features, hidden_features=None, out_features=None,
+                 act_layer=nn.GELU, drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x

src/modules/autoencoder_wrapper.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import torch
+import torch.nn as nn
+from .dac import DAC
+from .stable_vae import load_vae
+class Autoencoder(nn.Module):
+    def __init__(self, ckpt_path, model_type='dac', quantization_first=False):
+        super(Autoencoder, self).__init__()
+        self.model_type = model_type
+        if self.model_type == 'dac':
+            model = DAC.load(ckpt_path)
+        elif self.model_type == 'stable_vae':
+            model = load_vae(ckpt_path)
+        else:
+            raise NotImplementedError(f"Model type not implemented: {self.model_type}")
+        self.ae = model.eval()
+        self.quantization_first = quantization_first
+        print(f'Autoencoder quantization first mode: {quantization_first}')
+    @torch.no_grad()
+    def forward(self, audio=None, embedding=None):
+        if self.model_type == 'dac':
+            return self.process_dac(audio, embedding)
+        elif self.model_type == 'encodec':
+            return self.process_encodec(audio, embedding)
+        elif self.model_type == 'stable_vae':
+            return self.process_stable_vae(audio, embedding)
+        else:
+            raise NotImplementedError(f"Model type not implemented: {self.model_type}")
+    def process_dac(self, audio=None, embedding=None):
+        if audio is not None:
+            z = self.ae.encoder(audio)
+            if self.quantization_first:
+                z, *_ = self.ae.quantizer(z, None)
+            return z
+        elif embedding is not None:
+            z = embedding
+            if self.quantization_first:
+                audio = self.ae.decoder(z)
+            else:
+                z, *_ = self.ae.quantizer(z, None)
+                audio = self.ae.decoder(z)
+            return audio
+        else:
+            raise ValueError("Either audio or embedding must be provided.")
+    def process_encodec(self, audio=None, embedding=None):
+        if audio is not None:
+            z = self.ae.encoder(audio)
+            if self.quantization_first:
+                code = self.ae.quantizer.encode(z)
+                z = self.ae.quantizer.decode(code)
+            return z
+        elif embedding is not None:
+            z = embedding
+            if self.quantization_first:
+                audio = self.ae.decoder(z)
+            else:
+                code = self.ae.quantizer.encode(z)
+                z = self.ae.quantizer.decode(code)
+                audio = self.ae.decoder(z)
+            return audio
+        else:
+            raise ValueError("Either audio or embedding must be provided.")
+    def process_stable_vae(self, audio=None, embedding=None):
+        if audio is not None:
+            z = self.ae.encoder(audio)
+            if self.quantization_first:
+                z = self.ae.bottleneck.encode(z)
+            return z
+        if embedding is not None:
+            z = embedding
+            if self.quantization_first:
+                audio = self.ae.decoder(z)
+            else:
+                z = self.ae.bottleneck.encode(z)
+                audio = self.ae.decoder(z)
+            return audio
+        else:
+            raise ValueError("Either audio or embedding must be provided.")

src/modules/clap_wrapper.py ADDED Viewed

File without changes

src/modules/dac/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# Package version string.
+__version__ = "1.0.0"
+# preserved here for legacy reasons
+__model_version__ = "latest"
+import audiotools
+# Extend audiotools' BaseModel module lists at import time, before the submodules
+# below are imported.
+# NOTE(review): presumably INTERN/EXTERN control which modules are bundled with or
+# left outside a packaged model - confirm against the audiotools documentation.
+audiotools.ml.BaseModel.INTERN += ["dac.**"]
+audiotools.ml.BaseModel.EXTERN += ["einops"]
+from . import nn
+from . import model
+from . import utils
+# Re-export the main classes at package level.
+from .model import DAC
+from .model import DACFile

src/modules/dac/__main__.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import sys
+import argbind
+from dac.utils import download
+from dac.utils.decode import decode
+from dac.utils.encode import encode
+STAGES = ["encode", "decode", "download"]
+def run(stage: str):
+    """Run stages.
+    Parameters
+    ----------
+    stage : str
+        Stage to run
+    """
+    if stage not in STAGES:
+        raise ValueError(f"Unknown command: {stage}. Allowed commands are {STAGES}")
+    stage_fn = globals()[stage]
+    if stage == "download":
+        stage_fn()
+        return
+    stage_fn()
+if __name__ == "__main__":
+    group = sys.argv.pop(1)
+    args = argbind.parse_args(group=group)
+    with argbind.scope(args):
+        run(group)

src/modules/dac/compare/__init__.py ADDED Viewed

File without changes

src/modules/dac/compare/encodec.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import torch
+from audiotools import AudioSignal
+from audiotools.ml import BaseModel
+from encodec import EncodecModel
+class Encodec(BaseModel):
+    """Wraps an ``EncodecModel`` behind a ``forward`` that takes and returns raw audio tensors."""
+    def __init__(self, sample_rate: int = 24000, bandwidth: float = 24.0):
+        """
+        Args:
+            sample_rate: 24000 selects the 24 kHz Encodec model; any other value
+                selects the 48 kHz model.
+            bandwidth: target bandwidth passed to ``set_target_bandwidth``.
+        """
+        super().__init__()
+        if sample_rate == 24000:
+            self.model = EncodecModel.encodec_model_24khz()
+        else:
+            self.model = EncodecModel.encodec_model_48khz()
+        self.model.set_target_bandwidth(bandwidth)
+        # NOTE(review): fixed at 44100 regardless of the `sample_rate` argument - confirm this is intended.
+        self.sample_rate = 44100
+    def forward(
+        self,
+        audio_data: torch.Tensor,
+        sample_rate: int = 44100,
+        n_quantizers: int = None,
+    ):
+        """
+        Resample the input to the model's rate, run the model, and resample the
+        result back to ``sample_rate``.
+        Args:
+            audio_data: audio tensor at ``sample_rate``.
+            sample_rate: sample rate of ``audio_data`` and of the returned audio.
+            n_quantizers: not used in this method.
+        Returns:
+            dict with the reconstructed audio under the key "audio".
+        """
+        signal = AudioSignal(audio_data, sample_rate)
+        # the return value is discarded, so resample() is assumed to modify the signal in place - TODO confirm
+        signal.resample(self.model.sample_rate)
+        recons = self.model(signal.audio_data)
+        recons = AudioSignal(recons, self.model.sample_rate)
+        recons.resample(sample_rate)
+        return {"audio": recons.audio_data}
+# Manual check: print the model with per-module parameter counts, then run one forward pass.
+if __name__ == "__main__":
+    import numpy as np
+    from functools import partial
+    model = Encodec()
+    for n, m in model.named_modules():
+        o = m.extra_repr()
+        p = sum([np.prod(p.size()) for p in m.parameters()])
+        fn = lambda o, p: o + f" {p/1e6:<.3f}M params."
+        # partial binds this module's own text and count, so each module keeps its own values
+        setattr(m, "extra_repr", partial(fn, o=o, p=p))
+    print(model)
+    print("Total # of params: ", sum([np.prod(p.size()) for p in model.parameters()]))
+    # random input of shape (1, 1, 176400)
+    length = 88200 * 2
+    x = torch.randn(1, 1, length).to(model.device)
+    x.requires_grad_(True)
+    x.retain_grad()
+    # Make a forward pass
+    out = model(x)["audio"]
+    print(x.shape, out.shape)

src/modules/dac/model/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .base import CodecMixin
+from .base import DACFile
+from .dac import DAC
+from .discriminator import Discriminator

src/modules/dac/model/base.py ADDED Viewed

	@@ -0,0 +1,294 @@

+import math
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Union
+import numpy as np
+import torch
+import tqdm
+from audiotools import AudioSignal
+from torch import nn
+SUPPORTED_VERSIONS = ["1.0.0"]
+@dataclass
+class DACFile:
+    """Compressed audio codes plus the metadata stored alongside them in a ``.dac`` file."""
+    codes: torch.Tensor
+    # Metadata
+    chunk_length: int
+    original_length: int
+    # NOTE(review): annotated as float, but save() calls .numpy() on it, so it
+    # appears to be a tensor in practice - confirm.
+    input_db: float
+    channels: int
+    sample_rate: int
+    padding: bool
+    dac_version: str
+    def save(self, path):
+        """
+        Write the codes and metadata to ``path`` with the suffix forced to ``.dac``.
+        Codes are stored as uint16 and ``input_db`` as float32. The stored
+        ``dac_version`` is the last entry of ``SUPPORTED_VERSIONS``, not
+        ``self.dac_version``.
+        Returns:
+            The path that was written.
+        """
+        artifacts = {
+            "codes": self.codes.numpy().astype(np.uint16),
+            "metadata": {
+                "input_db": self.input_db.numpy().astype(np.float32),
+                "original_length": self.original_length,
+                "sample_rate": self.sample_rate,
+                "chunk_length": self.chunk_length,
+                "channels": self.channels,
+                "padding": self.padding,
+                "dac_version": SUPPORTED_VERSIONS[-1],
+            },
+        }
+        path = Path(path).with_suffix(".dac")
+        with open(path, "wb") as f:
+            np.save(f, artifacts)
+        return path
+    @classmethod
+    def load(cls, path):
+        """
+        Read a file written by ``save`` and rebuild the ``DACFile``.
+        Raises:
+            RuntimeError: if the stored ``dac_version`` is missing or not in
+                ``SUPPORTED_VERSIONS``.
+        """
+        # SECURITY: allow_pickle=True unpickles the file contents, which can execute
+        # arbitrary code - only load .dac files from trusted sources.
+        # The trailing [()] unwraps the 0-d object array back into the saved dict.
+        artifacts = np.load(path, allow_pickle=True)[()]
+        codes = torch.from_numpy(artifacts["codes"].astype(int))
+        if artifacts["metadata"].get("dac_version", None) not in SUPPORTED_VERSIONS:
+            raise RuntimeError(
+                f"Given file {path} can't be loaded with this version of descript-audio-codec."
+            )
+        return cls(codes=codes, **artifacts["metadata"])
+class CodecMixin:
+    @property
+    def padding(self):
+        if not hasattr(self, "_padding"):
+            self._padding = True
+        return self._padding
+    @padding.setter
+    def padding(self, value):
+        assert isinstance(value, bool)
+        layers = [
+            l for l in self.modules() if isinstance(l, (nn.Conv1d, nn.ConvTranspose1d))
+        ]
+        for layer in layers:
+            if value:
+                if hasattr(layer, "original_padding"):
+                    layer.padding = layer.original_padding
+            else:
+                layer.original_padding = layer.padding
+                layer.padding = tuple(0 for _ in range(len(layer.padding)))
+        self._padding = value
+    def get_delay(self):
+        # Any number works here, delay is invariant to input length
+        l_out = self.get_output_length(0)
+        L = l_out
+        layers = []
+        for layer in self.modules():
+            if isinstance(layer, (nn.Conv1d, nn.ConvTranspose1d)):
+                layers.append(layer)
+        for layer in reversed(layers):
+            d = layer.dilation[0]
+            k = layer.kernel_size[0]
+            s = layer.stride[0]
+            if isinstance(layer, nn.ConvTranspose1d):
+                L = ((L - d * (k - 1) - 1) / s) + 1
+            elif isinstance(layer, nn.Conv1d):
+                L = (L - 1) * s + d * (k - 1) + 1
+            L = math.ceil(L)
+        l_in = L
+        return (l_in - l_out) // 2
+    def get_output_length(self, input_length):
+        L = input_length
+        # Calculate output length
+        for layer in self.modules():
+            if isinstance(layer, (nn.Conv1d, nn.ConvTranspose1d)):
+                d = layer.dilation[0]
+                k = layer.kernel_size[0]
+                s = layer.stride[0]
+                if isinstance(layer, nn.Conv1d):
+                    L = ((L - d * (k - 1) - 1) / s) + 1
+                elif isinstance(layer, nn.ConvTranspose1d):
+                    L = (L - 1) * s + d * (k - 1) + 1
+                L = math.floor(L)
+        return L
+    @torch.no_grad()
+    def compress(
+        self,
+        audio_path_or_signal: Union[str, Path, AudioSignal],
+        win_duration: float = 1.0,
+        verbose: bool = False,
+        normalize_db: float = -16,
+        n_quantizers: int = None,
+    ) -> DACFile:
+        """Processes an audio signal from a file or AudioSignal object into
+        discrete codes. This function processes the signal in short windows,
+        using constant GPU memory.
+        Parameters
+        ----------
+        audio_path_or_signal : Union[str, Path, AudioSignal]
+            audio signal to compress; a str or Path is loaded with
+            ``AudioSignal.load_from_file_with_ffmpeg``
+        win_duration : float, optional
+            window duration in seconds, by default 1.0. If None, the whole
+            signal is treated as one window.
+        verbose : bool, optional
+            iterate the windows with ``tqdm.trange``, by default False
+        normalize_db : float, optional
+            normalize db, by default -16. If None, normalization is skipped.
+        n_quantizers : int, optional
+            forwarded to ``self.encode``; the stored codes are also cut to the
+            first ``n_quantizers`` entries along dim 1. By default None.
+        Returns
+        -------
+        DACFile
+            Object containing compressed codes and metadata
+            required for decompression
+        """
+        audio_signal = audio_path_or_signal
+        if isinstance(audio_signal, (str, Path)):
+            audio_signal = AudioSignal.load_from_file_with_ffmpeg(str(audio_signal))
+        self.eval()
+        original_padding = self.padding
+        original_device = audio_signal.device
+        # Work on a copy so the caller's signal is not modified.
+        audio_signal = audio_signal.clone()
+        original_sr = audio_signal.sample_rate
+        resample_fn = audio_signal.resample
+        loudness_fn = audio_signal.loudness
+        # Very long audio uses the ffmpeg versions.
+        # NOTE(review): the threshold is 10 * 60 * 60 (presumably seconds, i.e.
+        # 10 hours) although the original comment said "> 10 minutes" - confirm
+        # which one is intended before changing the value.
+        if audio_signal.signal_duration >= 10 * 60 * 60:
+            resample_fn = audio_signal.ffmpeg_resample
+            loudness_fn = audio_signal.ffmpeg_loudness
+        original_length = audio_signal.signal_length
+        resample_fn(self.sample_rate)
+        input_db = loudness_fn()
+        if normalize_db is not None:
+            audio_signal.normalize(normalize_db)
+        audio_signal.ensure_max_of_audio()
+        # Fold the channel axis into the batch axis: each channel is coded alone.
+        nb, nac, nt = audio_signal.audio_data.shape
+        audio_signal.audio_data = audio_signal.audio_data.reshape(nb * nac, 1, nt)
+        win_duration = (
+            audio_signal.signal_duration if win_duration is None else win_duration
+        )
+        # The padding mode is changed below; restore it even if encoding fails.
+        try:
+            if audio_signal.signal_duration <= win_duration:
+                # Unchunked compression (used if signal length < win duration)
+                self.padding = True
+                n_samples = nt
+                hop = nt
+            else:
+                # Chunked inference
+                self.padding = False
+                # Zero-pad signal on either side by the delay
+                audio_signal.zero_pad(self.delay, self.delay)
+                n_samples = int(win_duration * self.sample_rate)
+                # Round n_samples up to a multiple of the hop length
+                n_samples = int(
+                    math.ceil(n_samples / self.hop_length) * self.hop_length
+                )
+                hop = self.get_output_length(n_samples)
+            codes = []
+            range_fn = range if not verbose else tqdm.trange
+            for i in range_fn(0, nt, hop):
+                x = audio_signal[..., i : i + n_samples]
+                # The last window may be short; right-pad it to n_samples.
+                x = x.zero_pad(0, max(0, n_samples - x.shape[-1]))
+                audio_data = x.audio_data.to(self.device)
+                audio_data = self.preprocess(audio_data, self.sample_rate)
+                _, c, _, _, _ = self.encode(audio_data, n_quantizers)
+                codes.append(c.to(original_device))
+                chunk_length = c.shape[-1]
+            codes = torch.cat(codes, dim=-1)
+            # Apply the truncation before the file object is built; previously it
+            # ran afterwards on a local name and therefore had no effect.
+            if n_quantizers is not None:
+                codes = codes[:, :n_quantizers, :]
+            dac_file = DACFile(
+                codes=codes,
+                chunk_length=chunk_length,
+                original_length=original_length,
+                input_db=input_db,
+                channels=nac,
+                sample_rate=original_sr,
+                padding=self.padding,
+                dac_version=SUPPORTED_VERSIONS[-1],
+            )
+        finally:
+            self.padding = original_padding
+        return dac_file
+    @torch.no_grad()
+    def decompress(
+        self,
+        obj: Union[str, Path, DACFile],
+        verbose: bool = False,
+    ) -> AudioSignal:
+        """Reconstruct audio from a given .dac file
+        Parameters
+        ----------
+        obj : Union[str, Path, DACFile]
+            .dac file location or corresponding DACFile object.
+        verbose : bool, optional
+            Prints progress if True, by default False
+        Returns
+        -------
+        AudioSignal
+            Object with the reconstructed audio
+        """
+        self.eval()
+        if isinstance(obj, (str, Path)):
+            obj = DACFile.load(obj)
+        original_padding = self.padding
+        # Decode with the padding mode recorded in the file; the previous mode
+        # is restored even if decoding fails.
+        self.padding = obj.padding
+        try:
+            range_fn = range if not verbose else tqdm.trange
+            codes = obj.codes
+            original_device = codes.device
+            chunk_length = obj.chunk_length
+            recons = []
+            for i in range_fn(0, codes.shape[-1], chunk_length):
+                c = codes[..., i : i + chunk_length].to(self.device)
+                z = self.quantizer.from_codes(c)[0]
+                r = self.decode(z)
+                recons.append(r.to(original_device))
+            recons = torch.cat(recons, dim=-1)
+            recons = AudioSignal(recons, self.sample_rate)
+            resample_fn = recons.resample
+            loudness_fn = recons.loudness
+            # Very long audio uses the ffmpeg versions.
+            # NOTE(review): the threshold is 10 * 60 * 60 (presumably seconds,
+            # i.e. 10 hours) although the original comment said "> 10 minutes" -
+            # confirm which one is intended before changing the value.
+            if recons.signal_duration >= 10 * 60 * 60:
+                resample_fn = recons.ffmpeg_resample
+                loudness_fn = recons.ffmpeg_loudness
+            # Undo the loudness normalization and resampling done at compression.
+            recons.normalize(obj.input_db)
+            resample_fn(obj.sample_rate)
+            recons = recons[..., : obj.original_length]
+            # Return value is discarded.
+            # NOTE(review): presumably called for a side effect - confirm.
+            loudness_fn()
+            # Unfold the batch axis back into (batch, channels, samples).
+            recons.audio_data = recons.audio_data.reshape(
+                -1, obj.channels, obj.original_length
+            )
+        finally:
+            self.padding = original_padding
+        return recons

src/modules/dac/model/dac.py ADDED Viewed

	@@ -0,0 +1,364 @@

+import math
+from typing import List
+from typing import Union
+import numpy as np
+import torch
+from audiotools import AudioSignal
+from audiotools.ml import BaseModel
+from torch import nn
+from .base import CodecMixin
+from ..nn.layers import Snake1d
+from ..nn.layers import WNConv1d
+from ..nn.layers import WNConvTranspose1d
+from ..nn.quantize import ResidualVectorQuantize
+def init_weights(m):
+    """Initialise an ``nn.Conv1d``: truncated-normal weights (std 0.02), zero bias.
+    Modules that are not ``nn.Conv1d`` instances are left untouched, so the
+    function can be passed to ``nn.Module.apply``.
+    """
+    if isinstance(m, nn.Conv1d):
+        nn.init.trunc_normal_(m.weight, std=0.02)
+        # A convolution built with bias=False has bias None; skip it rather
+        # than crash inside nn.init.constant_.
+        if m.bias is not None:
+            nn.init.constant_(m.bias, 0)
+class ResidualUnit(nn.Module):
+    """Residual block: Snake1d, dilated kernel-7 conv, Snake1d, kernel-1 conv.
+    The block output is added to the input; when the output is shorter, the
+    input is cropped equally on both sides first.
+    """
+    def __init__(self, dim: int = 16, dilation: int = 1):
+        super().__init__()
+        kernel_size = 7
+        # Padding that keeps the length unchanged for the dilated convolution.
+        same_pad = ((kernel_size - 1) * dilation) // 2
+        layers = [
+            Snake1d(dim),
+            WNConv1d(
+                dim, dim, kernel_size=kernel_size, dilation=dilation, padding=same_pad
+            ),
+            Snake1d(dim),
+            WNConv1d(dim, dim, kernel_size=1),
+        ]
+        self.block = nn.Sequential(*layers)
+    def forward(self, x):
+        """Return the (possibly cropped) input plus the block output."""
+        residual = self.block(x)
+        trim = (x.shape[-1] - residual.shape[-1]) // 2
+        # Crop only when the block actually shortened the signal.
+        skip = x[..., trim:-trim] if trim > 0 else x
+        return skip + residual
+class EncoderBlock(nn.Module):
+    """Three residual units at ``dim // 2`` channels, then a strided convolution.
+    The final convolution maps ``dim // 2`` channels to ``dim`` channels with
+    kernel size ``2 * stride`` and the given stride.
+    """
+    def __init__(self, dim: int = 16, stride: int = 1):
+        super().__init__()
+        half = dim // 2
+        layers = []
+        for dilation in (1, 3, 9):
+            layers.append(ResidualUnit(half, dilation=dilation))
+        layers.append(Snake1d(half))
+        layers.append(
+            WNConv1d(
+                half,
+                dim,
+                kernel_size=2 * stride,
+                stride=stride,
+                padding=math.ceil(stride / 2),
+            )
+        )
+        self.block = nn.Sequential(*layers)
+    def forward(self, x):
+        """Apply the block to ``x``."""
+        return self.block(x)
+class Encoder(nn.Module):
+    """Convolutional encoder: input conv, one EncoderBlock per stride, output conv.
+    The channel count starts at ``d_model`` and doubles at every EncoderBlock;
+    the final width is stored in ``enc_dim``.
+    """
+    def __init__(
+        self,
+        d_model: int = 64,
+        strides: list = [2, 4, 8, 8],
+        d_latent: int = 64,
+    ):
+        super().__init__()
+        # First convolution: 1 input channel -> d_model channels
+        layers = [WNConv1d(1, d_model, kernel_size=7, padding=3)]
+        # One EncoderBlock per stride; each doubles the channel count
+        width = d_model
+        for stride in strides:
+            width = width * 2
+            layers.append(EncoderBlock(width, stride=stride))
+        # Last convolution projects to the latent dimension
+        layers.append(Snake1d(width))
+        layers.append(WNConv1d(width, d_latent, kernel_size=3, padding=1))
+        # Wrap the layers into nn.Sequential
+        self.block = nn.Sequential(*layers)
+        self.enc_dim = width
+    def forward(self, x):
+        """Apply the encoder stack to ``x``."""
+        return self.block(x)
+class DecoderBlock(nn.Module):
+    """Snake1d, a strided transposed convolution, then three residual units.
+    The transposed convolution maps ``input_dim`` to ``output_dim`` channels
+    with kernel size ``2 * stride`` and the given stride.
+    """
+    def __init__(self, input_dim: int = 16, output_dim: int = 8, stride: int = 1):
+        super().__init__()
+        layers = [
+            Snake1d(input_dim),
+            WNConvTranspose1d(
+                input_dim,
+                output_dim,
+                kernel_size=2 * stride,
+                stride=stride,
+                padding=math.ceil(stride / 2),
+            ),
+        ]
+        for dilation in (1, 3, 9):
+            layers.append(ResidualUnit(output_dim, dilation=dilation))
+        self.block = nn.Sequential(*layers)
+    def forward(self, x):
+        """Apply the block to ``x``."""
+        return self.block(x)
+class Decoder(nn.Module):
+    """Convolutional decoder: input conv, one DecoderBlock per rate, output conv + Tanh.
+    Parameters
+    ----------
+    input_channel : int
+        Number of channels of the incoming latent
+    channels : int
+        Channel count after the first convolution; halved by every DecoderBlock
+    rates : sequence of int
+        Stride of each DecoderBlock, in order
+    d_out : int, optional
+        Number of output channels, by default 1
+    """
+    def __init__(
+        self,
+        input_channel,
+        channels,
+        rates,
+        d_out: int = 1,
+    ):
+        super().__init__()
+        # Add first conv layer
+        layers = [WNConv1d(input_channel, channels, kernel_size=7, padding=3)]
+        # With an empty `rates` the loop never runs; start from `channels` so the
+        # final layers can still be built instead of hitting an unbound name.
+        output_dim = channels
+        # Add upsampling blocks, halving the channel count each time
+        for i, stride in enumerate(rates):
+            input_dim = channels // 2**i
+            output_dim = channels // 2 ** (i + 1)
+            layers += [DecoderBlock(input_dim, output_dim, stride)]
+        # Add final conv layer
+        layers += [
+            Snake1d(output_dim),
+            WNConv1d(output_dim, d_out, kernel_size=7, padding=3),
+            nn.Tanh(),
+        ]
+        self.model = nn.Sequential(*layers)
+    def forward(self, x):
+        """Apply the decoder stack to ``x``."""
+        return self.model(x)
+class DAC(BaseModel, CodecMixin):
+    """Audio codec model: Encoder -> ResidualVectorQuantize -> Decoder.
+    ``hop_length`` is the product of ``encoder_rates``; inputs are right-padded
+    to a multiple of it before encoding. When ``latent_dim`` is None it is set
+    to ``encoder_dim * 2 ** len(encoder_rates)``.
+    """
+    def __init__(
+        self,
+        encoder_dim: int = 64,
+        encoder_rates: List[int] = [2, 4, 8, 8],
+        latent_dim: int = None,
+        decoder_dim: int = 1536,
+        decoder_rates: List[int] = [8, 8, 4, 2],
+        n_codebooks: int = 9,
+        codebook_size: int = 1024,
+        codebook_dim: Union[int, list] = 8,
+        quantizer_dropout: bool = False,
+        sample_rate: int = 44100,
+    ):
+        super().__init__()
+        self.encoder_dim = encoder_dim
+        self.encoder_rates = encoder_rates
+        self.decoder_dim = decoder_dim
+        self.decoder_rates = decoder_rates
+        self.sample_rate = sample_rate
+        if latent_dim is None:
+            latent_dim = encoder_dim * (2 ** len(encoder_rates))
+        self.latent_dim = latent_dim
+        self.hop_length = np.prod(encoder_rates)
+        self.encoder = Encoder(encoder_dim, encoder_rates, latent_dim)
+        self.n_codebooks = n_codebooks
+        self.codebook_size = codebook_size
+        self.codebook_dim = codebook_dim
+        self.quantizer = ResidualVectorQuantize(
+            input_dim=latent_dim,
+            n_codebooks=n_codebooks,
+            codebook_size=codebook_size,
+            codebook_dim=codebook_dim,
+            quantizer_dropout=quantizer_dropout,
+        )
+        self.decoder = Decoder(
+            latent_dim,
+            decoder_dim,
+            decoder_rates,
+        )
+        # (a second, redundant `self.sample_rate = sample_rate` was removed here)
+        self.apply(init_weights)
+        self.delay = self.get_delay()
+    def preprocess(self, audio_data, sample_rate):
+        """Right-pad audio so its last dimension is a multiple of ``hop_length``.
+        Parameters
+        ----------
+        audio_data : Tensor
+            Audio data; only the last dimension is padded
+        sample_rate : int
+            Sample rate of the audio. If None, ``self.sample_rate`` is assumed;
+            any other value must equal ``self.sample_rate``.
+        Returns
+        -------
+        Tensor
+            The padded audio data
+        """
+        if sample_rate is None:
+            sample_rate = self.sample_rate
+        assert sample_rate == self.sample_rate
+        length = audio_data.shape[-1]
+        right_pad = math.ceil(length / self.hop_length) * self.hop_length - length
+        audio_data = nn.functional.pad(audio_data, (0, right_pad))
+        return audio_data
+    def encode(
+        self,
+        audio_data: torch.Tensor,
+        n_quantizers: int = None,
+    ):
+        """Encode given audio data and return quantized latent codes
+        Parameters
+        ----------
+        audio_data : Tensor[B x 1 x T]
+            Audio data to encode
+        n_quantizers : int, optional
+            Number of quantizers to use, by default None
+            If None, all quantizers are used.
+        Returns
+        -------
+        tuple
+            ``(z, codes, latents, commitment_loss, codebook_loss)`` as returned
+            by the quantizer:
+            z : Tensor[B x D x T]
+                Quantized continuous representation of input
+            codes : Tensor[B x N x T]
+                Codebook indices for each codebook
+                (quantized discrete representation of input)
+            latents : Tensor[B x N*D x T]
+                Projected latents (continuous representation of input before quantization)
+            commitment_loss : Tensor[1]
+                Commitment loss to train encoder to predict vectors closer to codebook
+                entries
+            codebook_loss : Tensor[1]
+                Codebook loss to update the codebook
+        """
+        z = self.encoder(audio_data)
+        z, codes, latents, commitment_loss, codebook_loss = self.quantizer(
+            z, n_quantizers
+        )
+        return z, codes, latents, commitment_loss, codebook_loss
+    def decode(self, z: torch.Tensor):
+        """Decode given latent codes and return audio data
+        Parameters
+        ----------
+        z : Tensor[B x D x T]
+            Quantized continuous representation of input
+        Returns
+        -------
+        Tensor
+            Decoded audio data (the raw decoder output; it is not trimmed to
+            any target length here)
+        """
+        return self.decoder(z)
+    def forward(
+        self,
+        audio_data: torch.Tensor,
+        sample_rate: int = None,
+        n_quantizers: int = None,
+    ):
+        """Model forward pass
+        Parameters
+        ----------
+        audio_data : Tensor[B x 1 x T]
+            Audio data to encode
+        sample_rate : int, optional
+            Sample rate of audio data in Hz, by default None
+            If None, defaults to `self.sample_rate`
+        n_quantizers : int, optional
+            Number of quantizers to use, by default None.
+            If None, all quantizers are used.
+        Returns
+        -------
+        dict
+            A dictionary with the following keys:
+            "audio" : Tensor[B x 1 x length]
+                Decoded audio data, trimmed to the length of the input.
+            "z" : Tensor[B x D x T]
+                Quantized continuous representation of input
+            "codes" : Tensor[B x N x T]
+                Codebook indices for each codebook
+                (quantized discrete representation of input)
+            "latents" : Tensor[B x N*D x T]
+                Projected latents (continuous representation of input before quantization)
+            "vq/commitment_loss" : Tensor[1]
+                Commitment loss to train encoder to predict vectors closer to codebook
+                entries
+            "vq/codebook_loss" : Tensor[1]
+                Codebook loss to update the codebook
+        """
+        # Remember the unpadded length so the output can be trimmed back to it.
+        length = audio_data.shape[-1]
+        audio_data = self.preprocess(audio_data, sample_rate)
+        z, codes, latents, commitment_loss, codebook_loss = self.encode(
+            audio_data, n_quantizers
+        )
+        x = self.decode(z)
+        return {
+            "audio": x[..., :length],
+            "z": z,
+            "codes": codes,
+            "latents": latents,
+            "vq/commitment_loss": commitment_loss,
+            "vq/codebook_loss": codebook_loss,
+        }
+if __name__ == "__main__":
+    import numpy as np
+    from functools import partial
+    model = DAC().to("cpu")
+    # Make every module's repr end with its parameter count in millions.
+    def _with_param_count(o, p):
+        return o + f" {p/1e6:<.3f}M params."
+    for _name, module in model.named_modules():
+        base_repr = module.extra_repr()
+        n_params = sum(np.prod(w.size()) for w in module.parameters())
+        module.extra_repr = partial(_with_param_count, o=base_repr, p=n_params)
+    print(model)
+    print("Total # of params: ", sum(np.prod(w.size()) for w in model.parameters()))
+    n_samples = 88200 * 2
+    x = torch.randn(1, 1, n_samples).to(model.device)
+    x.requires_grad_(True)
+    x.retain_grad()
+    # Forward pass through the whole model
+    out = model(x)["audio"]
+    print("Input shape:", x.shape)
+    print("Output shape:", out.shape)
+    # Back-propagate a one-hot gradient placed at the middle output sample
+    probe = torch.zeros_like(out)
+    middle = probe.shape[-1] // 2
+    probe[:, :, middle] = 1
+    out.backward(probe)
+    # Count input positions with a non-zero gradient
+    touched = x.grad.squeeze(0)
+    touched = (touched != 0).sum(0)  # sum across features
+    rf = (touched != 0).sum()
+    print(f"Receptive field: {rf.item()}")
+    # Round-trip one minute of noise through compress / decompress
+    x = AudioSignal(torch.randn(1, 1, 44100 * 60), 44100)
+    compressed = model.compress(x, verbose=True)
+    model.decompress(compressed, verbose=True)