SFMTS committed
Commit 497b0f7 · verified · 1 Parent(s): a07122f

Upload 9 files

DUR_0.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ce46bc0fdc9188d555f17432387a1336aec7511e59a5b50d1e98de1d6c2c09d
+ size 1124100
D_0.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0150bb3e70144be31a4faa57a7d2e80ed6427e5cb6c30dc39b23debf53c6fcf2
+ size 187270328
G_0.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:83ee428db6803c667c067f8d98c0db42c4c3e6711fa5ce584789eb84793c738e
+ size 116087820
WD_0.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9b7279846e957cfd392fcd4ebabf0668ced2cd9526b4d811308b4629130858ce
+ size 4695736
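
The four .safetensors entries above are Git LFS pointer files: the diff records only the LFS spec version, the object's SHA-256, and its size in bytes, while the actual weight blobs live in LFS storage. A minimal sketch for fetching one of the real files, assuming huggingface_hub is installed and with the repository id left as a placeholder:

# Sketch: download the real G_0.safetensors blob referenced by the LFS pointer above.
# "user/repo-name" is a placeholder; substitute the actual model repository id.
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(repo_id="user/repo-name", filename="G_0.safetensors")
print(ckpt_path)  # local cache path of the ~116 MB generator checkpoint
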
config.json ADDED
@@ -0,0 +1,114 @@
+ {
+   "model_name": "v1",
+   "train": {
+     "log_interval": 200,
+     "eval_interval": 1000,
+     "seed": 42,
+     "epochs": 10000,
+     "learning_rate": 0.0001,
+     "betas": [
+       0.8,
+       0.99
+     ],
+     "eps": 1e-09,
+     "batch_size": 1,
+     "bf16_run": false,
+     "fp16_run": false,
+     "lr_decay": 0.99996,
+     "segment_size": 16384,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0,
+     "c_commit": 100,
+     "skip_optimizer": false,
+     "freeze_ZH_bert": false,
+     "freeze_JP_bert": false,
+     "freeze_EN_bert": false,
+     "freeze_emo": false,
+     "freeze_style": false,
+     "freeze_decoder": false
+   },
+   "data": {
+     "use_jp_extra": false,
+     "training_files": "Data\\v1\\train.list",
+     "validation_files": "Data\\v1\\val.list",
+     "max_wav_value": 32768.0,
+     "sampling_rate": 44100,
+     "filter_length": 2048,
+     "hop_length": 512,
+     "win_length": 1024,
+     "n_mel_channels": 64,
+     "mel_fmin": 0.0,
+     "mel_fmax": null,
+     "add_blank": true,
+     "n_speakers": 1,
+     "cleaned_text": true,
+     "spk2id": {
+       "test": 0
+     }
+   },
+   "model": {
+     "use_spk_conditioned_encoder": true,
+     "use_noise_scaled_mas": true,
+     "use_mel_posterior_encoder": true,
+     "use_duration_discriminator": true,
+     "use_wavlm_discriminator": true,
+     "inter_channels": 128,
+     "hidden_channels": 128,
+     "filter_channels": 512,
+     "n_heads": 2,
+     "n_layers": 4,
+     "kernel_size": 3,
+     "p_dropout": 0.1,
+     "resblock": "1",
+     "resblock_kernel_sizes": [
+       3,
+       7,
+       11
+     ],
+     "resblock_dilation_sizes": [
+       [
+         1,
+         3,
+         5
+       ],
+       [
+         1,
+         3,
+         5
+       ],
+       [
+         1,
+         3,
+         5
+       ]
+     ],
+     "upsample_rates": [
+       8,
+       8,
+       2,
+       2,
+       2
+     ],
+     "upsample_initial_channel": 256,
+     "upsample_kernel_sizes": [
+       16,
+       16,
+       8,
+       2,
+       2
+     ],
+     "n_layers_q": 3,
+     "use_spectral_norm": true,
+     "gin_channels": 256,
+     "slm": {
+       "model": "./slm/wavlm-base-plus",
+       "sr": 16000,
+       "hidden": 768,
+       "nlayers": 13,
+       "initial_channel": 64
+     }
+   },
+   "version": "2.6.1"
+ }
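
config.json above is plain JSON, so it can be inspected directly. One useful sanity check (a sketch, assuming the file sits in the working directory): the decoder's total upsampling factor should equal data.hop_length, so that each spectrogram frame expands to exactly one hop of waveform samples.

import json
import math

# Load the uploaded training/model configuration.
with open("config.json", encoding="utf-8") as f:
    cfg = json.load(f)

# 8 * 8 * 2 * 2 * 2 == 512 == data.hop_length, so frames and samples line up.
assert math.prod(cfg["model"]["upsample_rates"]) == cfg["data"]["hop_length"]
print(cfg["model_name"], cfg["version"], cfg["data"]["sampling_rate"])  # v1 2.6.1 44100
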
models/models.py ADDED
@@ -0,0 +1,1189 @@
1
+ import math
2
+ from typing import Any, Optional
3
+
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import Conv1d, Conv2d, ConvTranspose1d
7
+ from torch.nn import functional as F
8
+ from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
9
+
10
+ from style_bert_vits2.models import attentions, commons, modules, monotonic_alignment
11
+ from style_bert_vits2.nlp.symbols import NUM_LANGUAGES, NUM_TONES, SYMBOLS
12
+
13
+
14
+ class DurationDiscriminator(nn.Module): # vits2
15
+ def __init__(
16
+ self,
17
+ in_channels: int,
18
+ filter_channels: int,
19
+ kernel_size: int,
20
+ p_dropout: float,
21
+ gin_channels: int = 0,
22
+ ) -> None:
23
+ super().__init__()
24
+
25
+ self.in_channels = in_channels
26
+ self.filter_channels = filter_channels
27
+ self.kernel_size = kernel_size
28
+ self.p_dropout = p_dropout
29
+ self.gin_channels = gin_channels
30
+
31
+ self.drop = nn.Dropout(p_dropout)
32
+ self.conv_1 = nn.Conv1d(
33
+ in_channels, filter_channels, kernel_size, padding=kernel_size // 2
34
+ )
35
+ self.norm_1 = modules.LayerNorm(filter_channels)
36
+ self.conv_2 = nn.Conv1d(
37
+ filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
38
+ )
39
+ self.norm_2 = modules.LayerNorm(filter_channels)
40
+ self.dur_proj = nn.Conv1d(1, filter_channels, 1)
41
+
42
+ self.pre_out_conv_1 = nn.Conv1d(
43
+ 2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
44
+ )
45
+ self.pre_out_norm_1 = modules.LayerNorm(filter_channels)
46
+ self.pre_out_conv_2 = nn.Conv1d(
47
+ filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
48
+ )
49
+ self.pre_out_norm_2 = modules.LayerNorm(filter_channels)
50
+
51
+ if gin_channels != 0:
52
+ self.cond = nn.Conv1d(gin_channels, in_channels, 1)
53
+
54
+ self.output_layer = nn.Sequential(nn.Linear(filter_channels, 1), nn.Sigmoid())
55
+
56
+ def forward_probability(
57
+ self,
58
+ x: torch.Tensor,
59
+ x_mask: torch.Tensor,
60
+ dur: torch.Tensor,
61
+ g: Optional[torch.Tensor] = None,
62
+ ) -> torch.Tensor:
63
+ dur = self.dur_proj(dur)
64
+ x = torch.cat([x, dur], dim=1)
65
+ x = self.pre_out_conv_1(x * x_mask)
66
+ x = torch.relu(x)
67
+ x = self.pre_out_norm_1(x)
68
+ x = self.drop(x)
69
+ x = self.pre_out_conv_2(x * x_mask)
70
+ x = torch.relu(x)
71
+ x = self.pre_out_norm_2(x)
72
+ x = self.drop(x)
73
+ x = x * x_mask
74
+ x = x.transpose(1, 2)
75
+ output_prob = self.output_layer(x)
76
+ return output_prob
77
+
78
+ def forward(
79
+ self,
80
+ x: torch.Tensor,
81
+ x_mask: torch.Tensor,
82
+ dur_r: torch.Tensor,
83
+ dur_hat: torch.Tensor,
84
+ g: Optional[torch.Tensor] = None,
85
+ ) -> list[torch.Tensor]:
86
+ x = torch.detach(x)
87
+ if g is not None:
88
+ g = torch.detach(g)
89
+ x = x + self.cond(g)
90
+ x = self.conv_1(x * x_mask)
91
+ x = torch.relu(x)
92
+ x = self.norm_1(x)
93
+ x = self.drop(x)
94
+ x = self.conv_2(x * x_mask)
95
+ x = torch.relu(x)
96
+ x = self.norm_2(x)
97
+ x = self.drop(x)
98
+
99
+ output_probs = []
100
+ for dur in [dur_r, dur_hat]:
101
+ output_prob = self.forward_probability(x, x_mask, dur, g)
102
+ output_probs.append(output_prob)
103
+
104
+ return output_probs
105
+
106
+
107
+ class TransformerCouplingBlock(nn.Module):
108
+ def __init__(
109
+ self,
110
+ channels: int,
111
+ hidden_channels: int,
112
+ filter_channels: int,
113
+ n_heads: int,
114
+ n_layers: int,
115
+ kernel_size: int,
116
+ p_dropout: float,
117
+ n_flows: int = 4,
118
+ gin_channels: int = 0,
119
+ share_parameter: bool = False,
120
+ ) -> None:
121
+ super().__init__()
122
+ self.channels = channels
123
+ self.hidden_channels = hidden_channels
124
+ self.kernel_size = kernel_size
125
+ self.n_layers = n_layers
126
+ self.n_flows = n_flows
127
+ self.gin_channels = gin_channels
128
+
129
+ self.flows = nn.ModuleList()
130
+
131
+ self.wn = (
132
+ # attentions.FFT(
133
+ # hidden_channels,
134
+ # filter_channels,
135
+ # n_heads,
136
+ # n_layers,
137
+ # kernel_size,
138
+ # p_dropout,
139
+ # isflow=True,
140
+ # gin_channels=self.gin_channels,
141
+ # )
142
+ None
143
+ if share_parameter
144
+ else None
145
+ )
146
+
147
+ for i in range(n_flows):
148
+ self.flows.append(
149
+ modules.TransformerCouplingLayer(
150
+ channels,
151
+ hidden_channels,
152
+ kernel_size,
153
+ n_layers,
154
+ n_heads,
155
+ p_dropout,
156
+ filter_channels,
157
+ mean_only=True,
158
+ wn_sharing_parameter=self.wn,
159
+ gin_channels=self.gin_channels,
160
+ )
161
+ )
162
+ self.flows.append(modules.Flip())
163
+
164
+ def forward(
165
+ self,
166
+ x: torch.Tensor,
167
+ x_mask: torch.Tensor,
168
+ g: Optional[torch.Tensor] = None,
169
+ reverse: bool = False,
170
+ ) -> torch.Tensor:
171
+ if not reverse:
172
+ for flow in self.flows:
173
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
174
+ else:
175
+ for flow in reversed(self.flows):
176
+ x = flow(x, x_mask, g=g, reverse=reverse)
177
+ return x
178
+
179
+
180
+ class StochasticDurationPredictor(nn.Module):
181
+ def __init__(
182
+ self,
183
+ in_channels: int,
184
+ filter_channels: int,
185
+ kernel_size: int,
186
+ p_dropout: float,
187
+ n_flows: int = 4,
188
+ gin_channels: int = 0,
189
+ ) -> None:
190
+ super().__init__()
191
+ filter_channels = in_channels  # this needs to be removed in a future version
192
+ self.in_channels = in_channels
193
+ self.filter_channels = filter_channels
194
+ self.kernel_size = kernel_size
195
+ self.p_dropout = p_dropout
196
+ self.n_flows = n_flows
197
+ self.gin_channels = gin_channels
198
+
199
+ self.log_flow = modules.Log()
200
+ self.flows = nn.ModuleList()
201
+ self.flows.append(modules.ElementwiseAffine(2))
202
+ for i in range(n_flows):
203
+ self.flows.append(
204
+ modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
205
+ )
206
+ self.flows.append(modules.Flip())
207
+
208
+ self.post_pre = nn.Conv1d(1, filter_channels, 1)
209
+ self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
210
+ self.post_convs = modules.DDSConv(
211
+ filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
212
+ )
213
+ self.post_flows = nn.ModuleList()
214
+ self.post_flows.append(modules.ElementwiseAffine(2))
215
+ for i in range(4):
216
+ self.post_flows.append(
217
+ modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
218
+ )
219
+ self.post_flows.append(modules.Flip())
220
+
221
+ self.pre = nn.Conv1d(in_channels, filter_channels, 1)
222
+ self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
223
+ self.convs = modules.DDSConv(
224
+ filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
225
+ )
226
+ if gin_channels != 0:
227
+ self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
228
+
229
+ def forward(
230
+ self,
231
+ x: torch.Tensor,
232
+ x_mask: torch.Tensor,
233
+ w: Optional[torch.Tensor] = None,
234
+ g: Optional[torch.Tensor] = None,
235
+ reverse: bool = False,
236
+ noise_scale: float = 1.0,
237
+ ) -> torch.Tensor:
238
+ x = torch.detach(x)
239
+ x = self.pre(x)
240
+ if g is not None:
241
+ g = torch.detach(g)
242
+ x = x + self.cond(g)
243
+ x = self.convs(x, x_mask)
244
+ x = self.proj(x) * x_mask
245
+
246
+ if not reverse:
247
+ flows = self.flows
248
+ assert w is not None
249
+
250
+ logdet_tot_q = 0
251
+ h_w = self.post_pre(w)
252
+ h_w = self.post_convs(h_w, x_mask)
253
+ h_w = self.post_proj(h_w) * x_mask
254
+ e_q = (
255
+ torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype)
256
+ * x_mask
257
+ )
258
+ z_q = e_q
259
+ for flow in self.post_flows:
260
+ z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
261
+ logdet_tot_q += logdet_q
262
+ z_u, z1 = torch.split(z_q, [1, 1], 1)
263
+ u = torch.sigmoid(z_u) * x_mask
264
+ z0 = (w - u) * x_mask
265
+ logdet_tot_q += torch.sum(
266
+ (F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]
267
+ )
268
+ logq = (
269
+ torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2])
270
+ - logdet_tot_q
271
+ )
272
+
273
+ logdet_tot = 0
274
+ z0, logdet = self.log_flow(z0, x_mask)
275
+ logdet_tot += logdet
276
+ z = torch.cat([z0, z1], 1)
277
+ for flow in flows:
278
+ z, logdet = flow(z, x_mask, g=x, reverse=reverse)
279
+ logdet_tot = logdet_tot + logdet
280
+ nll = (
281
+ torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2])
282
+ - logdet_tot
283
+ )
284
+ return nll + logq # [b]
285
+ else:
286
+ flows = list(reversed(self.flows))
287
+ flows = flows[:-2] + [flows[-1]] # remove a useless vflow
288
+ z = (
289
+ torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)
290
+ * noise_scale
291
+ )
292
+ for flow in flows:
293
+ z = flow(z, x_mask, g=x, reverse=reverse)
294
+ z0, z1 = torch.split(z, [1, 1], 1)
295
+ logw = z0
296
+ return logw
297
+
298
+
299
+ class DurationPredictor(nn.Module):
300
+ def __init__(
301
+ self,
302
+ in_channels: int,
303
+ filter_channels: int,
304
+ kernel_size: int,
305
+ p_dropout: float,
306
+ gin_channels: int = 0,
307
+ ) -> None:
308
+ super().__init__()
309
+
310
+ self.in_channels = in_channels
311
+ self.filter_channels = filter_channels
312
+ self.kernel_size = kernel_size
313
+ self.p_dropout = p_dropout
314
+ self.gin_channels = gin_channels
315
+
316
+ self.drop = nn.Dropout(p_dropout)
317
+ self.conv_1 = nn.Conv1d(
318
+ in_channels, filter_channels, kernel_size, padding=kernel_size // 2
319
+ )
320
+ self.norm_1 = modules.LayerNorm(filter_channels)
321
+ self.conv_2 = nn.Conv1d(
322
+ filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
323
+ )
324
+ self.norm_2 = modules.LayerNorm(filter_channels)
325
+ self.proj = nn.Conv1d(filter_channels, 1, 1)
326
+
327
+ if gin_channels != 0:
328
+ self.cond = nn.Conv1d(gin_channels, in_channels, 1)
329
+
330
+ def forward(
331
+ self, x: torch.Tensor, x_mask: torch.Tensor, g: Optional[torch.Tensor] = None
332
+ ) -> torch.Tensor:
333
+ x = torch.detach(x)
334
+ if g is not None:
335
+ g = torch.detach(g)
336
+ x = x + self.cond(g)
337
+ x = self.conv_1(x * x_mask)
338
+ x = torch.relu(x)
339
+ x = self.norm_1(x)
340
+ x = self.drop(x)
341
+ x = self.conv_2(x * x_mask)
342
+ x = torch.relu(x)
343
+ x = self.norm_2(x)
344
+ x = self.drop(x)
345
+ x = self.proj(x * x_mask)
346
+ return x * x_mask
347
+
348
+
349
+ class Bottleneck(nn.Sequential):
350
+ def __init__(self, in_dim: int, hidden_dim: int) -> None:
351
+ c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
352
+ c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
353
+ super().__init__(c_fc1, c_fc2)
354
+
355
+
356
+ class Block(nn.Module):
357
+ def __init__(self, in_dim: int, hidden_dim: int) -> None:
358
+ super().__init__()
359
+ self.norm = nn.LayerNorm(in_dim)
360
+ self.mlp = MLP(in_dim, hidden_dim)
361
+
362
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
363
+ x = x + self.mlp(self.norm(x))
364
+ return x
365
+
366
+
367
+ class MLP(nn.Module):
368
+ def __init__(self, in_dim: int, hidden_dim: int) -> None:
369
+ super().__init__()
370
+ self.c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
371
+ self.c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
372
+ self.c_proj = nn.Linear(hidden_dim, in_dim, bias=False)
373
+
374
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
375
+ x = F.silu(self.c_fc1(x)) * self.c_fc2(x)
376
+ x = self.c_proj(x)
377
+ return x
378
+
379
+ class TextEncoder(nn.Module):
380
+ def __init__(
381
+ self,
382
+ n_vocab: int,
383
+ out_channels: int,
384
+ hidden_channels: int,
385
+ filter_channels: int,
386
+ n_heads: int,
387
+ n_layers: int,
388
+ kernel_size: int,
389
+ p_dropout: float,
390
+ n_speakers: int,
391
+ gin_channels: int = 0,
392
+ ) -> None:
393
+ super().__init__()
394
+ self.n_vocab = n_vocab
395
+ self.out_channels = out_channels
396
+ self.hidden_channels = hidden_channels
397
+ self.filter_channels = filter_channels
398
+ self.n_heads = n_heads
399
+ self.n_layers = n_layers
400
+ self.kernel_size = kernel_size
401
+ self.p_dropout = p_dropout
402
+ self.gin_channels = gin_channels
403
+ self.emb = nn.Embedding(len(SYMBOLS), hidden_channels)
404
+ nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
405
+ self.tone_emb = nn.Embedding(NUM_TONES, hidden_channels)
406
+ nn.init.normal_(self.tone_emb.weight, 0.0, hidden_channels**-0.5)
407
+ self.language_emb = nn.Embedding(NUM_LANGUAGES, hidden_channels)
408
+ nn.init.normal_(self.language_emb.weight, 0.0, hidden_channels**-0.5)
409
+ self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
410
+ self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
411
+ self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
412
+ self.style_proj = nn.Linear(256, hidden_channels)
413
+
414
+ self.encoder = attentions.Encoder(
415
+ hidden_channels,
416
+ filter_channels,
417
+ n_heads,
418
+ n_layers,
419
+ kernel_size,
420
+ p_dropout,
421
+ gin_channels=self.gin_channels,
422
+ )
423
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
424
+
425
+ def forward(
426
+ self,
427
+ x: torch.Tensor,
428
+ x_lengths: torch.Tensor,
429
+ tone: torch.Tensor,
430
+ language: torch.Tensor,
431
+ bert: torch.Tensor,
432
+ ja_bert: torch.Tensor,
433
+ en_bert: torch.Tensor,
434
+ style_vec: torch.Tensor,
435
+ sid: torch.Tensor,
436
+ g: Optional[torch.Tensor] = None,
437
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
438
+ bert_emb = self.bert_proj(bert).transpose(1, 2)
439
+ ja_bert_emb = self.ja_bert_proj(ja_bert).transpose(1, 2)
440
+ en_bert_emb = self.en_bert_proj(en_bert).transpose(1, 2)
441
+ style_emb = self.style_proj(style_vec.unsqueeze(1))
442
+
443
+ x = (
444
+ self.emb(x)
445
+ + self.tone_emb(tone)
446
+ + self.language_emb(language)
447
+ + bert_emb
448
+ + ja_bert_emb
449
+ + en_bert_emb
450
+ + style_emb
451
+ ) * math.sqrt(
452
+ self.hidden_channels
453
+ ) # [b, t, h]
454
+ x = torch.transpose(x, 1, -1) # [b, h, t]
455
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
456
+ x.dtype
457
+ )
458
+
459
+ x = self.encoder(x * x_mask, x_mask, g=g)
460
+ stats = self.proj(x) * x_mask
461
+
462
+ m, logs = torch.split(stats, self.out_channels, dim=1)
463
+ return x, m, logs, x_mask
464
+
465
+
466
+ class ResidualCouplingBlock(nn.Module):
467
+ def __init__(
468
+ self,
469
+ channels: int,
470
+ hidden_channels: int,
471
+ kernel_size: int,
472
+ dilation_rate: int,
473
+ n_layers: int,
474
+ n_flows: int = 4,
475
+ gin_channels: int = 0,
476
+ ) -> None:
477
+ super().__init__()
478
+ self.channels = channels
479
+ self.hidden_channels = hidden_channels
480
+ self.kernel_size = kernel_size
481
+ self.dilation_rate = dilation_rate
482
+ self.n_layers = n_layers
483
+ self.n_flows = n_flows
484
+ self.gin_channels = gin_channels
485
+
486
+ self.flows = nn.ModuleList()
487
+ for i in range(n_flows):
488
+ self.flows.append(
489
+ modules.ResidualCouplingLayer(
490
+ channels,
491
+ hidden_channels,
492
+ kernel_size,
493
+ dilation_rate,
494
+ n_layers,
495
+ gin_channels=gin_channels,
496
+ mean_only=True,
497
+ )
498
+ )
499
+ self.flows.append(modules.Flip())
500
+
501
+ def forward(
502
+ self,
503
+ x: torch.Tensor,
504
+ x_mask: torch.Tensor,
505
+ g: Optional[torch.Tensor] = None,
506
+ reverse: bool = False,
507
+ ) -> torch.Tensor:
508
+ if not reverse:
509
+ for flow in self.flows:
510
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
511
+ else:
512
+ for flow in reversed(self.flows):
513
+ x = flow(x, x_mask, g=g, reverse=reverse)
514
+ return x
515
+
516
+
517
+ class PosteriorEncoder(nn.Module):
518
+ def __init__(
519
+ self,
520
+ in_channels: int,
521
+ out_channels: int,
522
+ hidden_channels: int,
523
+ kernel_size: int,
524
+ dilation_rate: int,
525
+ n_layers: int,
526
+ gin_channels: int = 0,
527
+ ) -> None:
528
+ super().__init__()
529
+ self.in_channels = in_channels
530
+ self.out_channels = out_channels
531
+ self.hidden_channels = hidden_channels
532
+ self.kernel_size = kernel_size
533
+ self.dilation_rate = dilation_rate
534
+ self.n_layers = n_layers
535
+ self.gin_channels = gin_channels
536
+
537
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
538
+ self.enc = modules.WN(
539
+ hidden_channels,
540
+ kernel_size,
541
+ dilation_rate,
542
+ n_layers,
543
+ gin_channels=gin_channels,
544
+ )
545
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
546
+
547
+ def forward(
548
+ self,
549
+ x: torch.Tensor,
550
+ x_lengths: torch.Tensor,
551
+ g: Optional[torch.Tensor] = None,
552
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
553
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
554
+ x.dtype
555
+ )
556
+ x = self.pre(x) * x_mask
557
+ x = self.enc(x, x_mask, g=g)
558
+ stats = self.proj(x) * x_mask
559
+ m, logs = torch.split(stats, self.out_channels, dim=1)
560
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
561
+ return z, m, logs, x_mask
562
+
563
+
564
+ class Generator(torch.nn.Module):
565
+ def __init__(
566
+ self,
567
+ initial_channel: int,
568
+ resblock_str: str,
569
+ resblock_kernel_sizes: list[int],
570
+ resblock_dilation_sizes: list[list[int]],
571
+ upsample_rates: list[int],
572
+ upsample_initial_channel: int,
573
+ upsample_kernel_sizes: list[int],
574
+ gin_channels: int = 0,
575
+ ) -> None:
576
+ super(Generator, self).__init__()
577
+ self.num_kernels = len(resblock_kernel_sizes)
578
+ self.num_upsamples = len(upsample_rates)
579
+ self.conv_pre = Conv1d(
580
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
581
+ )
582
+ resblock = modules.ResBlock1 if resblock_str == "1" else modules.ResBlock2
583
+
584
+ self.ups = nn.ModuleList()
585
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
586
+ self.ups.append(
587
+ weight_norm(
588
+ ConvTranspose1d(
589
+ upsample_initial_channel // (2**i),
590
+ upsample_initial_channel // (2 ** (i + 1)),
591
+ k,
592
+ u,
593
+ padding=(k - u) // 2,
594
+ )
595
+ )
596
+ )
597
+
598
+ self.resblocks = nn.ModuleList()
599
+ ch = None
600
+ for i in range(len(self.ups)):
601
+ ch = upsample_initial_channel // (2 ** (i + 1))
602
+ for j, (k, d) in enumerate(
603
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
604
+ ):
605
+ self.resblocks.append(resblock(ch, k, d)) # type: ignore
606
+
607
+ assert ch is not None
608
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
609
+ self.ups.apply(commons.init_weights)
610
+
611
+ if gin_channels != 0:
612
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
613
+
614
+ def forward(
615
+ self, x: torch.Tensor, g: Optional[torch.Tensor] = None
616
+ ) -> torch.Tensor:
617
+ x = self.conv_pre(x)
618
+ if g is not None:
619
+ x = x + self.cond(g)
620
+
621
+ for i in range(self.num_upsamples):
622
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
623
+ x = self.ups[i](x)
624
+ xs = None
625
+ for j in range(self.num_kernels):
626
+ if xs is None:
627
+ xs = self.resblocks[i * self.num_kernels + j](x)
628
+ else:
629
+ xs += self.resblocks[i * self.num_kernels + j](x)
630
+ assert xs is not None
631
+ x = xs / self.num_kernels
632
+ x = F.leaky_relu(x)
633
+ x = self.conv_post(x)
634
+ x = torch.tanh(x)
635
+
636
+ return x
637
+
638
+ def remove_weight_norm(self) -> None:
639
+ print("Removing weight norm...")
640
+ for layer in self.ups:
641
+ remove_weight_norm(layer)
642
+ for layer in self.resblocks:
643
+ layer.remove_weight_norm()
644
+
645
+
646
+ class DiscriminatorP(torch.nn.Module):
647
+ def __init__(
648
+ self,
649
+ period: int,
650
+ kernel_size: int = 5,
651
+ stride: int = 3,
652
+ use_spectral_norm: bool = False,
653
+ ) -> None:
654
+ super(DiscriminatorP, self).__init__()
655
+ self.period = period
656
+ self.use_spectral_norm = use_spectral_norm
657
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm
658
+ self.convs = nn.ModuleList(
659
+ [
660
+ norm_f(
661
+ Conv2d(
662
+ 1,
663
+ 32,
664
+ (kernel_size, 1),
665
+ (stride, 1),
666
+ padding=(commons.get_padding(kernel_size, 1), 0),
667
+ )
668
+ ),
669
+ norm_f(
670
+ Conv2d(
671
+ 32,
672
+ 128,
673
+ (kernel_size, 1),
674
+ (stride, 1),
675
+ padding=(commons.get_padding(kernel_size, 1), 0),
676
+ )
677
+ ),
678
+ norm_f(
679
+ Conv2d(
680
+ 128,
681
+ 512,
682
+ (kernel_size, 1),
683
+ (stride, 1),
684
+ padding=(commons.get_padding(kernel_size, 1), 0),
685
+ )
686
+ ),
687
+ norm_f(
688
+ Conv2d(
689
+ 512,
690
+ 1024,
691
+ (kernel_size, 1),
692
+ (stride, 1),
693
+ padding=(commons.get_padding(kernel_size, 1), 0),
694
+ )
695
+ ),
696
+ norm_f(
697
+ Conv2d(
698
+ 1024,
699
+ 1024,
700
+ (kernel_size, 1),
701
+ 1,
702
+ padding=(commons.get_padding(kernel_size, 1), 0),
703
+ )
704
+ ),
705
+ ]
706
+ )
707
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
708
+
709
+ def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, list[torch.Tensor]]:
710
+ fmap = []
711
+
712
+ # 1d to 2d
713
+ b, c, t = x.shape
714
+ if t % self.period != 0: # pad first
715
+ n_pad = self.period - (t % self.period)
716
+ x = F.pad(x, (0, n_pad), "reflect")
717
+ t = t + n_pad
718
+ x = x.view(b, c, t // self.period, self.period)
719
+
720
+ for layer in self.convs:
721
+ x = layer(x)
722
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
723
+ fmap.append(x)
724
+ x = self.conv_post(x)
725
+ fmap.append(x)
726
+ x = torch.flatten(x, 1, -1)
727
+
728
+ return x, fmap
729
+
730
+
731
+ class DiscriminatorS(torch.nn.Module):
732
+ def __init__(self, use_spectral_norm: bool = False) -> None:
733
+ super(DiscriminatorS, self).__init__()
734
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm
735
+ self.convs = nn.ModuleList(
736
+ [
737
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
738
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
739
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
740
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
741
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
742
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
743
+ ]
744
+ )
745
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
746
+
747
+ def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, list[torch.Tensor]]:
748
+ fmap = []
749
+
750
+ for layer in self.convs:
751
+ x = layer(x)
752
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
753
+ fmap.append(x)
754
+ x = self.conv_post(x)
755
+ fmap.append(x)
756
+ x = torch.flatten(x, 1, -1)
757
+
758
+ return x, fmap
759
+
760
+
761
+ class MultiPeriodDiscriminator(torch.nn.Module):
762
+ def __init__(self, use_spectral_norm: bool = False) -> None:
763
+ super(MultiPeriodDiscriminator, self).__init__()
764
+ periods = [2, 3, 5, 7, 11]
765
+
766
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
767
+ discs = discs + [
768
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
769
+ ]
770
+ self.discriminators = nn.ModuleList(discs)
771
+
772
+ def forward(
773
+ self,
774
+ y: torch.Tensor,
775
+ y_hat: torch.Tensor,
776
+ ) -> tuple[
777
+ list[torch.Tensor], list[torch.Tensor], list[torch.Tensor], list[torch.Tensor]
778
+ ]:
779
+ y_d_rs = []
780
+ y_d_gs = []
781
+ fmap_rs = []
782
+ fmap_gs = []
783
+ for i, d in enumerate(self.discriminators):
784
+ y_d_r, fmap_r = d(y)
785
+ y_d_g, fmap_g = d(y_hat)
786
+ y_d_rs.append(y_d_r)
787
+ y_d_gs.append(y_d_g)
788
+ fmap_rs.append(fmap_r)
789
+ fmap_gs.append(fmap_g)
790
+
791
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
792
+
793
+
794
+ class WavLMDiscriminator(nn.Module):
795
+ """docstring for Discriminator."""
796
+
797
+ def __init__(
798
+ self,
799
+ slm_hidden: int = 768,
800
+ slm_layers: int = 13,
801
+ initial_channel: int = 64,
802
+ use_spectral_norm: bool = False,
803
+ ) -> None:
804
+ super(WavLMDiscriminator, self).__init__()
805
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm
806
+ self.pre = norm_f(
807
+ Conv1d(slm_hidden * slm_layers, initial_channel, 1, 1, padding=0)
808
+ )
809
+
810
+ self.convs = nn.ModuleList(
811
+ [
812
+ norm_f(
813
+ nn.Conv1d(
814
+ initial_channel, initial_channel * 2, kernel_size=5, padding=2
815
+ )
816
+ ),
817
+ norm_f(
818
+ nn.Conv1d(
819
+ initial_channel * 2,
820
+ initial_channel * 4,
821
+ kernel_size=5,
822
+ padding=2,
823
+ )
824
+ ),
825
+ norm_f(
826
+ nn.Conv1d(initial_channel * 4, initial_channel * 4, 5, 1, padding=2)
827
+ ),
828
+ ]
829
+ )
830
+
831
+ self.conv_post = norm_f(Conv1d(initial_channel * 4, 1, 3, 1, padding=1))
832
+
833
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
834
+ x = self.pre(x)
835
+
836
+ fmap = []
837
+ for l in self.convs:
838
+ x = l(x)
839
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
840
+ fmap.append(x)
841
+ x = self.conv_post(x)
842
+ x = torch.flatten(x, 1, -1)
843
+
844
+ return x
845
+
846
+
847
+
848
+ class ReferenceEncoder(nn.Module):
849
+ """
850
+ inputs --- [N, Ty/r, n_mels*r] mels
851
+ outputs --- [N, ref_enc_gru_size]
852
+ """
853
+
854
+ def __init__(self, spec_channels: int, gin_channels: int = 0) -> None:
855
+ super().__init__()
856
+ self.spec_channels = spec_channels
857
+ ref_enc_filters = [32, 32, 64, 64, 128, 128]
858
+ K = len(ref_enc_filters)
859
+ filters = [1] + ref_enc_filters
860
+ convs = [
861
+ weight_norm(
862
+ nn.Conv2d(
863
+ in_channels=filters[i],
864
+ out_channels=filters[i + 1],
865
+ kernel_size=(3, 3),
866
+ stride=(2, 2),
867
+ padding=(1, 1),
868
+ )
869
+ )
870
+ for i in range(K)
871
+ ]
872
+ self.convs = nn.ModuleList(convs)
873
+ # self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)])
874
+
875
+ out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K)
876
+ self.gru = nn.GRU(
877
+ input_size=ref_enc_filters[-1] * out_channels,
878
+ hidden_size=256 // 2,
879
+ batch_first=True,
880
+ )
881
+ self.proj = nn.Linear(128, gin_channels)
882
+
883
+ def forward(
884
+ self, inputs: torch.Tensor, mask: Optional[torch.Tensor] = None
885
+ ) -> torch.Tensor:
886
+ N = inputs.size(0)
887
+ out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs]
888
+ for conv in self.convs:
889
+ out = conv(out)
890
+ # out = wn(out)
891
+ out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K]
892
+
893
+ out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K]
894
+ T = out.size(1)
895
+ N = out.size(0)
896
+ out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K]
897
+
898
+ self.gru.flatten_parameters()
899
+ memory, out = self.gru(out) # out --- [1, N, 128]
900
+
901
+ return self.proj(out.squeeze(0))
902
+
903
+ def calculate_channels(
904
+ self, L: int, kernel_size: int, stride: int, pad: int, n_convs: int
905
+ ) -> int:
906
+ for i in range(n_convs):
907
+ L = (L - kernel_size + 2 * pad) // stride + 1
908
+ return L
909
+
910
+
911
+ class SynthesizerTrn(nn.Module):
912
+ """
913
+ Synthesizer for Training
914
+ """
915
+
916
+ def __init__(
917
+ self,
918
+ n_vocab: int,
919
+ spec_channels: int,
920
+ segment_size: int,
921
+ inter_channels: int,
922
+ hidden_channels: int,
923
+ filter_channels: int,
924
+ n_heads: int,
925
+ n_layers: int,
926
+ kernel_size: int,
927
+ p_dropout: float,
928
+ resblock: str,
929
+ resblock_kernel_sizes: list[int],
930
+ resblock_dilation_sizes: list[list[int]],
931
+ upsample_rates: list[int],
932
+ upsample_initial_channel: int,
933
+ upsample_kernel_sizes: list[int],
934
+ n_speakers: int = 256,
935
+ gin_channels: int = 256,
936
+ use_sdp: bool = True,
937
+ n_flow_layer: int = 4,
938
+ n_layers_trans_flow: int = 6,
939
+ flow_share_parameter: bool = False,
940
+ use_transformer_flow: bool = True,
941
+ **kwargs: Any,
942
+ ) -> None:
943
+ super().__init__()
944
+ self.n_vocab = n_vocab
945
+ self.spec_channels = spec_channels
946
+ self.inter_channels = inter_channels
947
+ self.hidden_channels = hidden_channels
948
+ self.filter_channels = filter_channels
949
+ self.n_heads = n_heads
950
+ self.n_layers = n_layers
951
+ self.kernel_size = kernel_size
952
+ self.p_dropout = p_dropout
953
+ self.resblock = resblock
954
+ self.resblock_kernel_sizes = resblock_kernel_sizes
955
+ self.resblock_dilation_sizes = resblock_dilation_sizes
956
+ self.upsample_rates = upsample_rates
957
+ self.upsample_initial_channel = upsample_initial_channel
958
+ self.upsample_kernel_sizes = upsample_kernel_sizes
959
+ self.segment_size = segment_size
960
+ self.n_speakers = n_speakers
961
+ self.gin_channels = gin_channels
962
+ self.n_layers_trans_flow = n_layers_trans_flow
963
+ self.use_spk_conditioned_encoder = kwargs.get(
964
+ "use_spk_conditioned_encoder", True
965
+ )
966
+ self.use_sdp = use_sdp
967
+ self.use_noise_scaled_mas = kwargs.get("use_noise_scaled_mas", False)
968
+ self.mas_noise_scale_initial = kwargs.get("mas_noise_scale_initial", 0.01)
969
+ self.noise_scale_delta = kwargs.get("noise_scale_delta", 2e-6)
970
+ self.current_mas_noise_scale = self.mas_noise_scale_initial
971
+ if self.use_spk_conditioned_encoder and gin_channels > 0:
972
+ self.enc_gin_channels = gin_channels
973
+ self.enc_p = TextEncoder(
974
+ n_vocab,
975
+ inter_channels,
976
+ hidden_channels,
977
+ filter_channels,
978
+ n_heads,
979
+ n_layers,
980
+ kernel_size,
981
+ p_dropout,
982
+ self.n_speakers,
983
+ gin_channels=self.enc_gin_channels,
984
+ )
985
+ self.dec = Generator(
986
+ inter_channels,
987
+ resblock,
988
+ resblock_kernel_sizes,
989
+ resblock_dilation_sizes,
990
+ upsample_rates,
991
+ upsample_initial_channel,
992
+ upsample_kernel_sizes,
993
+ gin_channels=gin_channels,
994
+ )
995
+ self.enc_q = PosteriorEncoder(
996
+ spec_channels,
997
+ inter_channels,
998
+ hidden_channels,
999
+ 5,
1000
+ 1,
1001
+ 16,
1002
+ gin_channels=gin_channels,
1003
+ )
1004
+ if use_transformer_flow:
1005
+ self.flow = TransformerCouplingBlock(
1006
+ inter_channels,
1007
+ hidden_channels,
1008
+ filter_channels,
1009
+ n_heads,
1010
+ n_layers_trans_flow,
1011
+ 5,
1012
+ p_dropout,
1013
+ n_flow_layer,
1014
+ gin_channels=gin_channels,
1015
+ share_parameter=flow_share_parameter,
1016
+ )
1017
+ else:
1018
+ self.flow = ResidualCouplingBlock(
1019
+ inter_channels,
1020
+ hidden_channels,
1021
+ 5,
1022
+ 1,
1023
+ n_flow_layer,
1024
+ gin_channels=gin_channels,
1025
+ )
1026
+ self.sdp = StochasticDurationPredictor(
1027
+ hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels
1028
+ )
1029
+ self.dp = DurationPredictor(
1030
+ hidden_channels, 256, 3, 0.5, gin_channels=gin_channels
1031
+ )
1032
+
1033
+ if n_speakers >= 1:
1034
+ self.emb_g = nn.Embedding(n_speakers, gin_channels)
1035
+ else:
1036
+ self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)
1037
+
1038
+ def forward(
1039
+ self,
1040
+ x: torch.Tensor,
1041
+ x_lengths: torch.Tensor,
1042
+ y: torch.Tensor,
1043
+ y_lengths: torch.Tensor,
1044
+ sid: torch.Tensor,
1045
+ tone: torch.Tensor,
1046
+ language: torch.Tensor,
1047
+ bert: torch.Tensor,
1048
+ ja_bert: torch.Tensor,
1049
+ en_bert: torch.Tensor,
1050
+ style_vec: torch.Tensor,
1051
+ ) -> tuple[
1052
+ torch.Tensor,
1053
+ torch.Tensor,
1054
+ torch.Tensor,
1055
+ torch.Tensor,
1056
+ torch.Tensor,
1057
+ torch.Tensor,
1058
+ torch.Tensor,
1059
+ tuple[torch.Tensor, ...],
1060
+ tuple[torch.Tensor, ...],
1061
+ ]:
1062
+ if self.n_speakers > 0:
1063
+ g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
1064
+ else:
1065
+ g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
1066
+ x, m_p, logs_p, x_mask = self.enc_p(
1067
+ x, x_lengths, tone, language, bert, ja_bert, en_bert, style_vec, sid, g=g
1068
+ )
1069
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
1070
+ z_p = self.flow(z, y_mask, g=g)
1071
+
1072
+ with torch.no_grad():
1073
+ # negative cross-entropy
1074
+ s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t]
1075
+ neg_cent1 = torch.sum(
1076
+ -0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True
1077
+ ) # [b, 1, t_s]
1078
+ neg_cent2 = torch.matmul(
1079
+ -0.5 * (z_p**2).transpose(1, 2), s_p_sq_r
1080
+ ) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
1081
+ neg_cent3 = torch.matmul(
1082
+ z_p.transpose(1, 2), (m_p * s_p_sq_r)
1083
+ ) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
1084
+ neg_cent4 = torch.sum(
1085
+ -0.5 * (m_p**2) * s_p_sq_r, [1], keepdim=True
1086
+ ) # [b, 1, t_s]
1087
+ neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4
1088
+ if self.use_noise_scaled_mas:
1089
+ epsilon = (
1090
+ torch.std(neg_cent)
1091
+ * torch.randn_like(neg_cent)
1092
+ * self.current_mas_noise_scale
1093
+ )
1094
+ neg_cent = neg_cent + epsilon
1095
+
1096
+ attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
1097
+ attn = (
1098
+ monotonic_alignment.maximum_path(neg_cent, attn_mask.squeeze(1))
1099
+ .unsqueeze(1)
1100
+ .detach()
1101
+ )
1102
+
1103
+ w = attn.sum(2)
1104
+
1105
+ l_length_sdp = self.sdp(x, x_mask, w, g=g)
1106
+ l_length_sdp = l_length_sdp / torch.sum(x_mask)
1107
+
1108
+ logw_ = torch.log(w + 1e-6) * x_mask
1109
+ logw = self.dp(x, x_mask, g=g)
1110
+ # logw_sdp = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=1.0)
1111
+ l_length_dp = torch.sum((logw - logw_) ** 2, [1, 2]) / torch.sum(
1112
+ x_mask
1113
+ ) # for averaging
1114
+ # l_length_sdp += torch.sum((logw_sdp - logw_) ** 2, [1, 2]) / torch.sum(x_mask)
1115
+
1116
+ l_length = l_length_dp + l_length_sdp
1117
+
1118
+ # expand prior
1119
+ m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
1120
+ logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)
1121
+
1122
+ z_slice, ids_slice = commons.rand_slice_segments(
1123
+ z, y_lengths, self.segment_size
1124
+ )
1125
+ o = self.dec(z_slice, g=g)
1126
+ return (
1127
+ o,
1128
+ l_length,
1129
+ attn,
1130
+ ids_slice,
1131
+ x_mask,
1132
+ y_mask,
1133
+ (z, z_p, m_p, logs_p, m_q, logs_q), # type: ignore
1134
+ (x, logw, logw_), # , logw_sdp),
1135
+ g,
1136
+ )
1137
+
1138
+
1139
+ def infer(
1140
+ self,
1141
+ x: torch.Tensor,
1142
+ x_lengths: torch.Tensor,
1143
+ sid: torch.Tensor,
1144
+ tone: torch.Tensor,
1145
+ language: torch.Tensor,
1146
+ bert: torch.Tensor,
1147
+ ja_bert: torch.Tensor,
1148
+ en_bert: torch.Tensor,
1149
+ style_vec: torch.Tensor,
1150
+ noise_scale: float = 0.667,
1151
+ length_scale: float = 1.0,
1152
+ noise_scale_w: float = 0.8,
1153
+ max_len: Optional[int] = None,
1154
+ sdp_ratio: float = 0.0,
1155
+ y: Optional[torch.Tensor] = None,
1156
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, tuple[torch.Tensor, ...]]:
1157
+ # x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, tone, language, bert)
1158
+ # g = self.gst(y)
1159
+ if self.n_speakers > 0:
1160
+ g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
1161
+ else:
1162
+ assert y is not None
1163
+ g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
1164
+ x, m_p, logs_p, x_mask = self.enc_p(
1165
+ x, x_lengths, tone, language, bert, ja_bert, en_bert, style_vec, sid, g=g
1166
+ )
1167
+ logw = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * (
1168
+ sdp_ratio
1169
+ ) + self.dp(x, x_mask, g=g) * (1 - sdp_ratio)
1170
+ w = torch.exp(logw) * x_mask * length_scale
1171
+ w_ceil = torch.ceil(w)
1172
+ y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
1173
+ y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(
1174
+ x_mask.dtype
1175
+ )
1176
+ attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
1177
+ attn = commons.generate_path(w_ceil, attn_mask)
1178
+
1179
+ m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(
1180
+ 1, 2
1181
+ ) # [b, t', t], [b, t, d] -> [b, d, t']
1182
+ logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(
1183
+ 1, 2
1184
+ ) # [b, t', t], [b, t, d] -> [b, d, t']
1185
+
1186
+ z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
1187
+ z = self.flow(z_p, y_mask, g=g, reverse=True)
1188
+ o = self.dec((z * y_mask)[:, :, :max_len], g=g)
1189
+ return o, attn, y_mask, (z, z_p, m_p, logs_p)
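
models/models.py defines the complete VITS2-style stack: the BERT- and style-conditioned TextEncoder, the coupling flows, the stochastic and deterministic duration predictors, the HiFi-GAN Generator, and the period/scale/WavLM discriminators. A hedged sketch of constructing SynthesizerTrn from the config above, following the usual VITS convention for spec_channels and the segment size in frames; the exact call in train_ms.py may differ, and the checkpoint-loading hint is commented out because the layout inside G_0.safetensors is not documented here.

import json

from style_bert_vits2.models.models import SynthesizerTrn
from style_bert_vits2.nlp.symbols import SYMBOLS

with open("config.json", encoding="utf-8") as f:
    cfg = json.load(f)

net_g = SynthesizerTrn(
    len(SYMBOLS),                            # n_vocab
    cfg["data"]["filter_length"] // 2 + 1,   # spec_channels: 2048 // 2 + 1 = 1025
    cfg["train"]["segment_size"] // cfg["data"]["hop_length"],  # segment size in frames: 32
    n_speakers=cfg["data"]["n_speakers"],
    **cfg["model"],                          # unknown keys are absorbed by **kwargs
)

# If G_0.safetensors stores a plain state_dict, the weights could be loaded with:
# from safetensors.torch import load_file
# net_g.load_state_dict(load_file("G_0.safetensors"), strict=False)
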
nlp/japanese/normalizer.py ADDED
@@ -0,0 +1,176 @@
1
+ """
2
+ 記号類の正規化変換マップの; : 「 」 括弧全般の扱いを変更
3
+ 記号類の正規化変換マップに、= < > # ^ *を追加
4
+
5
+
6
+ """
7
+
8
+ import re
9
+ import unicodedata
10
+
11
+ from num2words import num2words
12
+
13
+ from style_bert_vits2.nlp.symbols import PUNCTUATIONS
14
+
15
+
16
+ def normalize_text(text: str) -> str:
17
+ """
18
+ 日本語のテキストを正規化する。
19
+ 結果は、ちょうど次の文字のみからなる:
20
+ - ひらがな
21
+ - カタカナ(全角長音記号「ー」が入る!)
22
+ - 漢字
23
+ - 半角アルファベット(大文字と小文字)
24
+ - ギリシャ文字
25
+ - `.` (句点`。`や`…`の一部や改行等)
26
+ - `,` (読点`、`や`:`等)
27
+ - `?` (疑問符`?`)
28
+ - `!` (感嘆符`!`)
29
+ - `'` (`「`や`」`等)
30
+ - `-` (`―`(ダッシュ、長音記号ではない)や`-`等)
31
+
32
+ 注意点:
33
+ - 三点リーダー`…`は`...`に変換される(`なるほど…。` → `なるほど....`)
34
+ - 数字は漢字に変換される(`1,100円` → `千百円`、`52.34` → `五十二点三四`)
35
+ - 読点や疑問符等の位置・個数等は保持される(`??あ、、!!!` → `??あ,,!!!`)
36
+
37
+ Args:
38
+ text (str): 正規化するテキスト
39
+
40
+ Returns:
41
+ str: 正規化されたテキスト
42
+ """
43
+
44
+ res = unicodedata.normalize("NFKC", text) # ここでアルファベットは半角になる
45
+ res = __convert_numbers_to_words(res) # 「100円」→「百円」等
46
+ # 「~」と「〜」と「~」も長音記号として扱う
47
+ res = res.replace("~", "ー")
48
+ res = res.replace("~", "ー")
49
+ res = res.replace("〜", "ー")
50
+
51
+ res = replace_punctuation(res) # 句読点等正規化、読めない文字を削除
52
+
53
+ # 結合文字の濁点・半濁点を削除
54
+ # 通常の「ば」等はそのままのこされる、「あ゛」は上で「あ゙」になりここで「あ」になる
55
+ res = res.replace("\u3099", "") # 結合文字の濁点を削除、る゙ → る
56
+ res = res.replace("\u309A", "") # 結合文字の半濁点を削除、な゚ → な
57
+ return res
58
+
59
+
60
+ def replace_punctuation(text: str) -> str:
61
+ """
62
+ 句読点等を「.」「,」「!」「?」「'」「-」に正規化し、OpenJTalk で読みが取得できるもののみ残す:
63
+ 漢字・平仮名・カタカナ、アルファベット、ギリシャ文字
64
+
65
+ Args:
66
+ text (str): 正規化するテキスト
67
+
68
+ Returns:
69
+ str: 正規化されたテキスト
70
+ """
71
+
72
+ # 記号類の正規化変換マップ
73
+ REPLACE_MAP = {
74
+ ":": ":",
75
+ ";": ";",
76
+ ",": ",",
77
+ "。": ".",
78
+ "!": "!",
79
+ "?": "?",
80
+ "\n": ".",
81
+ ".": ".",
82
+ "…": "...",
83
+ "···": "...",
84
+ "・・・": "...",
85
+ "·": ",",
86
+ "・": ",",
87
+ "、": ",",
88
+ "$": ".",
89
+ "“": "'",
90
+ "”": "'",
91
+ '"': "'",
92
+ "‘": "'",
93
+ "’": "'",
94
+ "(": "(",
95
+ ")": ")",
96
+ "(": "(",
97
+ ")": ")",
98
+ "《": "(",
99
+ "》": ")",
100
+ "【": "(",
101
+ "】": ")",
102
+ "[": "(",
103
+ "]": ")",
104
+ # NFKC 正規化後のハイフン・ダッシュの変種を全て通常半角ハイフン - \u002d に変換
105
+ "\u02d7": "\u002d", # ˗, Modifier Letter Minus Sign
106
+ "\u2010": "\u002d", # ‐, Hyphen,
107
+ # "\u2011": "\u002d", # ‑, Non-Breaking Hyphen, NFKC により \u2010 に変換される
108
+ "\u2012": "\u002d", # ‒, Figure Dash
109
+ "\u2013": "\u002d", # –, En Dash
110
+ "\u2014": "\u002d", # —, Em Dash
111
+ "\u2015": "\u002d", # ―, Horizontal Bar
112
+ "\u2043": "\u002d", # ⁃, Hyphen Bullet
113
+ "\u2212": "\u002d", # −, Minus Sign
114
+ "\u23af": "\u002d", # ⎯, Horizontal Line Extension
115
+ "\u23e4": "\u002d", # ⏤, Straightness
116
+ "\u2500": "\u002d", # ─, Box Drawings Light Horizontal
117
+ "\u2501": "\u002d", # ━, Box Drawings Heavy Horizontal
118
+ "\u2e3a": "\u002d", # ⸺, Two-Em Dash
119
+ "\u2e3b": "\u002d", # ⸻, Three-Em Dash
120
+ # "~": "-", # これは長音記号「ー」として扱うよう変更
121
+ # "~": "-", # これも長音記号「ー」として扱うよう変更
122
+ "「": "'",
123
+ "」": "'",
124
+ "=": "=",
125
+ "<": "<",
126
+ ">": ">",
127
+ "#": "#",
128
+ "^": "^",
129
+ "*": "*",
130
+ }
131
+
132
+ pattern = re.compile("|".join(re.escape(p) for p in REPLACE_MAP.keys()))
133
+
134
+ # 句読点を辞書で置換
135
+ replaced_text = pattern.sub(lambda x: REPLACE_MAP[x.group()], text)
136
+
137
+ replaced_text = re.sub(
138
+ # ↓ ひらがな、カタカナ、漢字
139
+ r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
140
+ # ↓ 半角アルファベット(大文字と小文字)
141
+ + r"\u0041-\u005A\u0061-\u007A"
142
+ # ↓ 全角アルファベット(大文字と小文字)
143
+ + r"\uFF21-\uFF3A\uFF41-\uFF5A"
144
+ # ↓ ギリシャ文字
145
+ + r"\u0370-\u03FF\u1F00-\u1FFF"
146
+ # ↓ "!", "?", "…", ",", ".", "'", "-", 但し`…`はすでに`...`に変換されている
147
+ + "".join(PUNCTUATIONS) + r"]+",
148
+ # 上述以外の文字を削除
149
+ "",
150
+ replaced_text,
151
+ )
152
+
153
+ return replaced_text
154
+
155
+
156
+ def __convert_numbers_to_words(text: str) -> str:
157
+ """
158
+ 記号や数字を日本語の文字表現に変換する。
159
+
160
+ Args:
161
+ text (str): 変換するテキスト
162
+
163
+ Returns:
164
+ str: 変換されたテキスト
165
+ """
166
+
167
+ NUMBER_WITH_SEPARATOR_PATTERN = re.compile("[0-9]{1,3}(,[0-9]{3})+")
168
+ CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
169
+ CURRENCY_PATTERN = re.compile(r"([$¥£€])([0-9.]*[0-9])")
170
+ NUMBER_PATTERN = re.compile(r"[0-9]+(\.[0-9]+)?")
171
+
172
+ res = NUMBER_WITH_SEPARATOR_PATTERN.sub(lambda m: m[0].replace(",", ""), text)
173
+ res = CURRENCY_PATTERN.sub(lambda m: m[2] + CURRENCY_MAP.get(m[1], m[1]), res)
174
+ res = NUMBER_PATTERN.sub(lambda m: num2words(m[0], lang="ja"), res)
175
+
176
+ return res
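
normalize_text() documents its behaviour in its own docstring; a short usage sketch follows (the import path is assumed from the style_bert_vits2 package layout used elsewhere in this commit, and the expected outputs follow the docstring's examples):

from style_bert_vits2.nlp.japanese.normalizer import normalize_text  # assumed import path

print(normalize_text("なるほど…。"))   # -> なるほど....   (ellipsis and 。 become ASCII dots)
print(normalize_text("1,100円です!"))  # -> 千百円です!    (digits to kanji, full-width ! to !)
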
nlp/symbols.py ADDED
@@ -0,0 +1,259 @@
1
+ """
2
+ PUNCTUATIONSに ":", ";", "=", "#", "<", ">", "^", "(", ")", "*"を追加
3
+
4
+ """
5
+
6
+ # Punctuations
7
+ PUNCTUATIONS = ["!", "?", "…", ",", ".", "'", "-", ":", ";", "=", "#", "<", ">", "^", "(", ")", "*"]
8
+
9
+ # Punctuations and special tokens
10
+ PUNCTUATION_SYMBOLS = PUNCTUATIONS + ["SP", "UNK"]
11
+
12
+ # Padding
13
+ PAD = "_"
14
+
15
+ # Chinese symbols
16
+ ZH_SYMBOLS = [
17
+ "E",
18
+ "En",
19
+ "a",
20
+ "ai",
21
+ "an",
22
+ "ang",
23
+ "ao",
24
+ "b",
25
+ "c",
26
+ "ch",
27
+ "d",
28
+ "e",
29
+ "ei",
30
+ "en",
31
+ "eng",
32
+ "er",
33
+ "f",
34
+ "g",
35
+ "h",
36
+ "i",
37
+ "i0",
38
+ "ia",
39
+ "ian",
40
+ "iang",
41
+ "iao",
42
+ "ie",
43
+ "in",
44
+ "ing",
45
+ "iong",
46
+ "ir",
47
+ "iu",
48
+ "j",
49
+ "k",
50
+ "l",
51
+ "m",
52
+ "n",
53
+ "o",
54
+ "ong",
55
+ "ou",
56
+ "p",
57
+ "q",
58
+ "r",
59
+ "s",
60
+ "sh",
61
+ "t",
62
+ "u",
63
+ "ua",
64
+ "uai",
65
+ "uan",
66
+ "uang",
67
+ "ui",
68
+ "un",
69
+ "uo",
70
+ "v",
71
+ "van",
72
+ "ve",
73
+ "vn",
74
+ "w",
75
+ "x",
76
+ "y",
77
+ "z",
78
+ "zh",
79
+ "AA",
80
+ "EE",
81
+ "OO",
82
+ ]
83
+ NUM_ZH_TONES = 6
84
+
85
+ # Japanese
86
+ JP_SYMBOLS = [
87
+ "N",
88
+ "a",
89
+ "a:",
90
+ "b",
91
+ "by",
92
+ "ch",
93
+ "d",
94
+ "dy",
95
+ "e",
96
+ "e:",
97
+ "f",
98
+ "g",
99
+ "gy",
100
+ "h",
101
+ "hy",
102
+ "i",
103
+ "i:",
104
+ "j",
105
+ "k",
106
+ "ky",
107
+ "m",
108
+ "my",
109
+ "n",
110
+ "ny",
111
+ "o",
112
+ "o:",
113
+ "p",
114
+ "py",
115
+ "q",
116
+ "r",
117
+ "ry",
118
+ "s",
119
+ "sh",
120
+ "t",
121
+ "ts",
122
+ "ty",
123
+ "u",
124
+ "u:",
125
+ "w",
126
+ "y",
127
+ "z",
128
+ "zy",
129
+ "aa",
130
+ "ae",
131
+ "ah",
132
+ "ao",
133
+ "aw",
134
+ "ay",
135
+ "dh",
136
+ "eh",
137
+ "er",
138
+ "ey",
139
+ "hh",
140
+ "ih",
141
+ "iy",
142
+ "jh",
143
+ "l",
144
+ "ng",
145
+ "ow",
146
+ "oy",
147
+ "sh",
148
+ "th",
149
+ "uh",
150
+ "uw",
151
+ "V",
152
+ "zh",
153
+ "E",
154
+ "En",
155
+ "ai",
156
+ "an",
157
+ "ang",
158
+ "c",
159
+ "ei",
160
+ "en",
161
+ "eng",
162
+ "i0",
163
+ "ia",
164
+ "ian",
165
+ "iang",
166
+ "iao",
167
+ "ie",
168
+ "in",
169
+ "ing",
170
+ "iong",
171
+ "ir",
172
+ "iu",
173
+ "ong",
174
+ "ou",
175
+ "ua",
176
+ "uai",
177
+ "uan",
178
+ "uang",
179
+ "ui",
180
+ "un",
181
+ "uo",
182
+ "v",
183
+ "van",
184
+ "ve",
185
+ "vn",
186
+ "AA",
187
+ "EE",
188
+ "OO",
189
+ ]
190
+ NUM_JP_TONES = 12
191
+
192
+ # English
193
+ EN_SYMBOLS = [
194
+ "aa",
195
+ "ae",
196
+ "ah",
197
+ "ao",
198
+ "aw",
199
+ "ay",
200
+ "b",
201
+ "ch",
202
+ "d",
203
+ "dh",
204
+ "eh",
205
+ "er",
206
+ "ey",
207
+ "f",
208
+ "g",
209
+ "hh",
210
+ "ih",
211
+ "iy",
212
+ "jh",
213
+ "k",
214
+ "l",
215
+ "m",
216
+ "n",
217
+ "ng",
218
+ "ow",
219
+ "oy",
220
+ "p",
221
+ "r",
222
+ "s",
223
+ "sh",
224
+ "t",
225
+ "th",
226
+ "uh",
227
+ "uw",
228
+ "V",
229
+ "w",
230
+ "y",
231
+ "z",
232
+ "zh",
233
+ ]
234
+ NUM_EN_TONES = 4
235
+
236
+ # Combine all symbols
237
+ NORMAL_SYMBOLS = sorted(set(ZH_SYMBOLS + JP_SYMBOLS + EN_SYMBOLS))
238
+ SYMBOLS = [PAD] + NORMAL_SYMBOLS + PUNCTUATION_SYMBOLS
239
+ SIL_PHONEMES_IDS = [SYMBOLS.index(i) for i in PUNCTUATION_SYMBOLS]
240
+
241
+ # Combine all tones
242
+ NUM_TONES = NUM_ZH_TONES + NUM_JP_TONES + NUM_EN_TONES
243
+
244
+ # Language maps
245
+ LANGUAGE_ID_MAP = {"ZH": 0, "JP": 1, "EN": 2}
246
+ NUM_LANGUAGES = len(LANGUAGE_ID_MAP.keys())
247
+
248
+ # Language tone start map
249
+ LANGUAGE_TONE_START_MAP = {
250
+ "ZH": 0,
251
+ "JP": NUM_ZH_TONES,
252
+ "EN": NUM_ZH_TONES + NUM_JP_TONES,
253
+ }
254
+
255
+
256
+ if __name__ == "__main__":
257
+ a = set(ZH_SYMBOLS)
258
+ b = set(EN_SYMBOLS)
259
+ print(sorted(a & b))
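
nlp/symbols.py only defines constants, so the bookkeeping between them can be verified directly (a quick sketch; the import path is assumed from the package layout):

from style_bert_vits2.nlp.symbols import (  # assumed import path
    LANGUAGE_TONE_START_MAP,
    NUM_EN_TONES,
    NUM_JP_TONES,
    NUM_TONES,
    NUM_ZH_TONES,
    PAD,
    SYMBOLS,
)

assert SYMBOLS[0] == PAD                                        # padding sits at index 0
assert NUM_TONES == NUM_ZH_TONES + NUM_JP_TONES + NUM_EN_TONES  # 6 + 12 + 4 == 22
assert LANGUAGE_TONE_START_MAP["EN"] == NUM_ZH_TONES + NUM_JP_TONES
print(f"{len(SYMBOLS)} symbols, {NUM_TONES} tone classes")
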
train_ms.py ADDED
@@ -0,0 +1,1128 @@
1
+ import argparse
2
+ import datetime
3
+ import gc
4
+ import os
5
+ import platform
6
+
7
+ import torch
8
+ import torch.distributed as dist
9
+ from huggingface_hub import HfApi
10
+ from torch.cuda.amp import GradScaler, autocast
11
+ from torch.nn import functional as F
12
+ from torch.nn.parallel import DistributedDataParallel as DDP
13
+ from torch.utils.data import DataLoader
14
+ from torch.utils.tensorboard import SummaryWriter
15
+ from tqdm import tqdm
16
+
17
+ # logging.getLogger("numba").setLevel(logging.WARNING)
18
+ import default_style
19
+ from config import get_config
20
+ from data_utils import (
21
+ DistributedBucketSampler,
22
+ TextAudioSpeakerCollate,
23
+ TextAudioSpeakerLoader,
24
+ )
25
+ from losses import WavLMLoss, discriminator_loss, feature_loss, generator_loss, kl_loss
26
+ from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
27
+ from style_bert_vits2.logging import logger
28
+ from style_bert_vits2.models import commons, utils
29
+ from style_bert_vits2.models.hyper_parameters import HyperParameters
30
+ from style_bert_vits2.models.models import (
31
+ DurationDiscriminator,
32
+ MultiPeriodDiscriminator,
33
+ SynthesizerTrn,
34
+ WavLMDiscriminator,
35
+ )
36
+ from style_bert_vits2.nlp.symbols import SYMBOLS
37
+ from style_bert_vits2.utils.stdout_wrapper import SAFE_STDOUT
38
+
39
+
40
+ torch.backends.cuda.matmul.allow_tf32 = True
41
+ torch.backends.cudnn.allow_tf32 = (
42
+ True  # If you encounter training problems, try disabling TF32.
43
+ )
44
+ torch.set_float32_matmul_precision("medium")
45
+ torch.backends.cuda.sdp_kernel("flash")
46
+ torch.backends.cuda.enable_flash_sdp(True)
47
+ torch.backends.cuda.enable_mem_efficient_sdp(
48
+ True
49
+ ) # Not available if torch version is lower than 2.0
50
+ torch.backends.cuda.enable_math_sdp(True)
51
+
52
+ config = get_config()
53
+ global_step = 0
54
+
55
+ api = HfApi()
56
+
57
+
58
+ def run():
59
+ # Command line configuration is not recommended unless necessary, use config.yml
60
+ parser = argparse.ArgumentParser()
61
+ parser.add_argument(
62
+ "-c",
63
+ "--config",
64
+ type=str,
65
+ default=config.train_ms_config.config_path,
66
+ help="JSON file for configuration",
67
+ )
68
+ parser.add_argument(
69
+ "-m",
70
+ "--model",
71
+ type=str,
72
+ help="Dataset folder path. Note that data is no longer placed under /logs by default; if you configure this from the command line, give the path relative to the project root.",
73
+ default=config.dataset_path,
74
+ )
75
+ parser.add_argument(
76
+ "--assets_root",
77
+ type=str,
78
+ help="Root directory of model assets needed for inference.",
79
+ default=config.assets_root,
80
+ )
81
+ parser.add_argument(
82
+ "--skip_default_style",
83
+ action="store_true",
84
+ help="Skip saving default style config and mean vector.",
85
+ )
86
+ parser.add_argument(
87
+ "--no_progress_bar",
88
+ action="store_true",
89
+ help="Do not show the progress bar while training.",
90
+ )
91
+ parser.add_argument(
92
+ "--speedup",
93
+ action="store_true",
94
+ help="Speed up training by disabling logging and evaluation.",
95
+ )
96
+ parser.add_argument(
97
+ "--repo_id",
98
+ help="Huggingface model repo id to backup the model.",
99
+ default=None,
100
+ )
101
+ parser.add_argument(
102
+ "--not_use_custom_batch_sampler",
103
+ help="Don't use custom batch sampler for training, which was used in the version < 2.5",
104
+ action="store_true",
105
+ )
106
+ args = parser.parse_args()
107
+
108
+ # Set log file
109
+ model_dir = os.path.join(args.model, config.train_ms_config.model_dir)
110
+ timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
111
+ logger.add(os.path.join(args.model, f"train_{timestamp}.log"))
112
+
113
+ # Parsing environment variables
114
+ envs = config.train_ms_config.env
115
+ for env_name, env_value in envs.items():
116
+ if env_name not in os.environ.keys():
117
+ logger.info(f"Loading configuration from config {env_value!s}")
118
+ os.environ[env_name] = str(env_value)
119
+ logger.info(
120
+ "Loading environment variables \nMASTER_ADDR: {},\nMASTER_PORT: {},\nWORLD_SIZE: {},\nRANK: {},\nLOCAL_RANK: {}".format(
121
+ os.environ["MASTER_ADDR"],
122
+ os.environ["MASTER_PORT"],
123
+ os.environ["WORLD_SIZE"],
124
+ os.environ["RANK"],
125
+ os.environ["LOCAL_RANK"],
126
+ )
127
+ )
128
+
129
+ backend = "nccl"
130
+ if platform.system() == "Windows":
131
+ backend = "gloo" # On Windows, switch to the gloo backend.
132
+ dist.init_process_group(
133
+ backend=backend,
134
+ init_method="env://",
135
+ timeout=datetime.timedelta(seconds=300),
136
+ ) # Use torchrun instead of mp.spawn
137
+ rank = dist.get_rank()
138
+ local_rank = int(os.environ["LOCAL_RANK"])
139
+ n_gpus = dist.get_world_size()
140
+
141
+ hps = HyperParameters.load_from_json(args.config)
142
+ # This is needed because we have to pass values to `train_and_evaluate()`
143
+ hps.model_dir = model_dir
144
+ hps.speedup = args.speedup
145
+ hps.repo_id = args.repo_id
146
+
147
+ # Check whether the two config paths are the same
148
+ if os.path.realpath(args.config) != os.path.realpath(
149
+ config.train_ms_config.config_path
150
+ ):
151
+ with open(args.config, encoding="utf-8") as f:
152
+ data = f.read()
153
+ os.makedirs(os.path.dirname(config.train_ms_config.config_path), exist_ok=True)
154
+ with open(config.train_ms_config.config_path, "w", encoding="utf-8") as f:
155
+ f.write(data)
156
+
157
+ """
158
+ Path constants are a bit complicated...
159
+ TODO: Refactor or rename these?
160
+ (Both `config.yml` and `config.json` are used, which is confusing I think.)
161
+
162
+ args.model: For saving all info needed for training.
163
+ default: `Data/{model_name}`.
164
+ hps.model_dir := model_dir: For saving checkpoints (for resuming training).
165
+ default: `Data/{model_name}/models`.
166
+ (Use `hps` since we have to pass `model_dir` to `train_and_evaluate()`.)
167
+
168
+ args.assets_root: The root directory of model assets needed for inference.
169
+ default: config.assets_root == `model_assets`.
170
+
171
+ config.out_dir: The directory for model assets of this model (for inference).
172
+ default: `model_assets/{model_name}`.
173
+ """
174
+
175
+ if args.repo_id is not None:
176
+ # First try to upload config.json to check if the repo exists
177
+ try:
178
+ api.upload_file(
179
+ path_or_fileobj=args.config,
180
+ path_in_repo=f"Data/{config.model_name}/config.json",
181
+ repo_id=hps.repo_id,
182
+ )
183
+ except Exception as e:
184
+ logger.error(e)
185
+ logger.error(
186
+ f"Failed to upload files to the repo {hps.repo_id}. Please check if the repo exists and you have logged in using `huggingface-cli login`."
187
+ )
188
+ raise e
189
+ # Upload Data dir for resuming training
190
+ api.upload_folder(
191
+ repo_id=hps.repo_id,
192
+ folder_path=config.dataset_path,
193
+ path_in_repo=f"Data/{config.model_name}",
194
+ delete_patterns="*.pth", # Only keep the latest checkpoint
195
+ run_as_future=True,
196
+ )
197
+
198
+ os.makedirs(config.out_dir, exist_ok=True)
199
+
200
+ if not args.skip_default_style:
201
+ default_style.save_styles_by_dirs(
202
+ os.path.join(args.model, "wavs"),
203
+ config.out_dir,
204
+ config_path=args.config,
205
+ config_output_path=os.path.join(config.out_dir, "config.json"),
206
+ )
207
+
208
+ torch.manual_seed(hps.train.seed)
209
+ torch.cuda.set_device(local_rank)
210
+
211
+ global global_step
212
+ writer = None
213
+ writer_eval = None
214
+ if rank == 0 and not args.speedup:
215
+ # logger = utils.get_logger(hps.model_dir)
216
+ # logger.info(hps)
217
+ utils.check_git_hash(model_dir)
218
+ writer = SummaryWriter(log_dir=model_dir)
219
+ writer_eval = SummaryWriter(log_dir=os.path.join(model_dir, "eval"))
220
+ train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data)
221
+ collate_fn = TextAudioSpeakerCollate()
222
+ if not args.not_use_custom_batch_sampler:
223
+ train_sampler = DistributedBucketSampler(
224
+ train_dataset,
225
+ hps.train.batch_size,
226
+ [32, 300, 400, 500, 600, 700, 800, 900, 1000],
227
+ num_replicas=n_gpus,
228
+ rank=rank,
229
+ shuffle=True,
230
+ )
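+ # Note: DistributedBucketSampler presumably groups utterances into the length
+ # buckets listed above so that each batch holds similarly sized samples (less
+ # padding), which is why batch_size is given to the sampler rather than to the DataLoader below.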
231
+ train_loader = DataLoader(
232
+ train_dataset,
233
+ # Try setting num_workers to 1 to reduce memory consumption
234
+ # num_workers=min(config.train_ms_config.num_workers, os.cpu_count() // 2),
235
+ num_workers=1,
236
+ shuffle=False,
237
+ pin_memory=True,
238
+ collate_fn=collate_fn,
239
+ batch_sampler=train_sampler,
240
+ # batch_size=hps.train.batch_size,
241
+ persistent_workers=True,
242
+ # This is also commented out to try to reduce memory consumption
243
+ # prefetch_factor=6,
244
+ )
245
+ else:
246
+ train_loader = DataLoader(
247
+ train_dataset,
248
+ # Try setting num_workers to 1 to reduce memory consumption
249
+ # num_workers=min(config.train_ms_config.num_workers, os.cpu_count() // 2),
250
+ num_workers=1,
251
+ shuffle=True,
252
+ pin_memory=True,
253
+ collate_fn=collate_fn,
254
+ # batch_sampler=train_sampler,
255
+ batch_size=hps.train.batch_size,
256
+ persistent_workers=True,
257
+ # This is also commented out to try to reduce memory consumption
258
+ # prefetch_factor=6,
259
+ )
260
+ eval_dataset = None
261
+ eval_loader = None
262
+ if rank == 0 and not args.speedup:
263
+ eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)
264
+ eval_loader = DataLoader(
265
+ eval_dataset,
266
+ num_workers=0,
267
+ shuffle=False,
268
+ batch_size=1,
269
+ pin_memory=True,
270
+ drop_last=False,
271
+ collate_fn=collate_fn,
272
+ )
273
+ if hps.model.use_noise_scaled_mas is True:
274
+ logger.info("Using noise scaled MAS for VITS2")
275
+ mas_noise_scale_initial = 0.01
276
+ noise_scale_delta = 2e-6
277
+ else:
278
+ logger.info("Using normal MAS for VITS1")
279
+ mas_noise_scale_initial = 0.0
280
+ noise_scale_delta = 0.0
281
+ if hps.model.use_duration_discriminator is True:
282
+ logger.info("Using duration discriminator for VITS2")
283
+ net_dur_disc = DurationDiscriminator(
284
+ hps.model.hidden_channels,
285
+ hps.model.hidden_channels,
286
+ 3,
287
+ 0.1,
288
+ gin_channels=hps.model.gin_channels if hps.data.n_speakers != 0 else 0,
289
+ ).cuda(local_rank)
290
+
291
+ else:
292
+ net_dur_disc = None
293
+ if hps.model.use_wavlm_discriminator is True:
294
+ net_wd = WavLMDiscriminator(
295
+ hps.model.slm.hidden, hps.model.slm.nlayers, hps.model.slm.initial_channel
296
+ ).cuda(local_rank)
297
+ else:
298
+ net_wd = None
299
+
300
+
301
+
302
+ if hps.model.use_spk_conditioned_encoder is True:
303
+ if hps.data.n_speakers == 0:
304
+ raise ValueError(
305
+ "n_speakers must be > 0 when using spk conditioned encoder to train multi-speaker model"
306
+ )
307
+ else:
308
+ logger.info("Using normal encoder for VITS1")
309
+
310
+ net_g = SynthesizerTrn(
311
+ len(SYMBOLS),
312
+ hps.data.filter_length // 2 + 1,
313
+ hps.train.segment_size // hps.data.hop_length,
314
+ n_speakers=hps.data.n_speakers,
315
+ mas_noise_scale_initial=mas_noise_scale_initial,
316
+ noise_scale_delta=noise_scale_delta,
317
+ # Pass every value under hps.model as an argument
318
+ use_spk_conditioned_encoder=hps.model.use_spk_conditioned_encoder,
319
+ use_noise_scaled_mas=hps.model.use_noise_scaled_mas,
320
+ use_mel_posterior_encoder=hps.model.use_mel_posterior_encoder,
321
+ use_duration_discriminator=hps.model.use_duration_discriminator,
322
+ use_wavlm_discriminator=hps.model.use_wavlm_discriminator,
323
+ inter_channels=hps.model.inter_channels,
324
+ hidden_channels=hps.model.hidden_channels,
325
+ filter_channels=hps.model.filter_channels,
326
+ n_heads=hps.model.n_heads,
327
+ n_layers=hps.model.n_layers,
328
+ kernel_size=hps.model.kernel_size,
329
+ p_dropout=hps.model.p_dropout,
330
+ resblock=hps.model.resblock,
331
+ resblock_kernel_sizes=hps.model.resblock_kernel_sizes,
332
+ resblock_dilation_sizes=hps.model.resblock_dilation_sizes,
333
+ upsample_rates=hps.model.upsample_rates,
334
+ upsample_initial_channel=hps.model.upsample_initial_channel,
335
+ upsample_kernel_sizes=hps.model.upsample_kernel_sizes,
336
+ n_layers_q=hps.model.n_layers_q,
337
+ use_spectral_norm=hps.model.use_spectral_norm,
338
+ gin_channels=hps.model.gin_channels,
339
+ slm=hps.model.slm,
340
+ ).cuda(local_rank)
341
+
342
+ if getattr(hps.train, "freeze_ZH_bert", False):
343
+ logger.info("Freezing ZH bert encoder !!!")
344
+ for param in net_g.enc_p.bert_proj.parameters():
345
+ param.requires_grad = False
346
+
347
+ if getattr(hps.train, "freeze_EN_bert", False):
348
+ logger.info("Freezing EN bert encoder !!!")
349
+ for param in net_g.enc_p.en_bert_proj.parameters():
350
+ param.requires_grad = False
351
+
352
+ if getattr(hps.train, "freeze_JP_bert", False):
353
+ logger.info("Freezing JP bert encoder !!!")
354
+ for param in net_g.enc_p.ja_bert_proj.parameters():
355
+ param.requires_grad = False
356
+ if getattr(hps.train, "freeze_style", False):
357
+ logger.info("Freezing style encoder !!!")
358
+ for param in net_g.enc_p.style_proj.parameters():
359
+ param.requires_grad = False
360
+
361
+ if getattr(hps.train, "freeze_decoder", False):
362
+ logger.info("Freezing decoder !!!")
363
+ for param in net_g.dec.parameters():
364
+ param.requires_grad = False
365
+
366
+ net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(local_rank)
367
+ optim_g = torch.optim.AdamW(
368
+ filter(lambda p: p.requires_grad, net_g.parameters()),
369
+ hps.train.learning_rate,
370
+ betas=hps.train.betas,
371
+ eps=hps.train.eps,
372
+ )
373
+ optim_d = torch.optim.AdamW(
374
+ net_d.parameters(),
375
+ hps.train.learning_rate,
376
+ betas=hps.train.betas,
377
+ eps=hps.train.eps,
378
+ )
379
+ if net_dur_disc is not None:
380
+ optim_dur_disc = torch.optim.AdamW(
381
+ net_dur_disc.parameters(),
382
+ hps.train.learning_rate,
383
+ betas=hps.train.betas,
384
+ eps=hps.train.eps,
385
+ )
386
+ else:
387
+ optim_dur_disc = None
388
+
389
+
390
+
391
+ if net_wd is not None:
392
+ optim_wd = torch.optim.AdamW(
393
+ net_wd.parameters(),
394
+ hps.train.learning_rate,
395
+ betas=hps.train.betas,
396
+ eps=hps.train.eps,
397
+ )
398
+ else:
399
+ optim_wd = None
400
+
401
+
402
+ net_g = DDP(net_g, device_ids=[local_rank])
403
+ net_d = DDP(net_d, device_ids=[local_rank])
404
+ dur_resume_lr = None
405
+ if net_dur_disc is not None:
406
+ net_dur_disc = DDP(
407
+ net_dur_disc, device_ids=[local_rank], find_unused_parameters=True
408
+ )
409
+
410
+ if net_wd is not None:
411
+ net_wd = DDP(
412
+ net_wd,
413
+ device_ids=[local_rank],
414
+ # bucket_cap_mb=512
415
+ )
416
+
417
+
418
+
419
+ if utils.is_resuming(model_dir):
420
+ if net_dur_disc is not None:
421
+ _, _, dur_resume_lr, epoch_str = utils.checkpoints.load_checkpoint(
422
+ utils.checkpoints.get_latest_checkpoint_path(model_dir, "DUR_*.pth"),
423
+ net_dur_disc,
424
+ optim_dur_disc,
425
+ skip_optimizer=hps.train.skip_optimizer,
426
+ )
427
+ if not optim_dur_disc.param_groups[0].get("initial_lr"):
428
+ optim_dur_disc.param_groups[0]["initial_lr"] = dur_resume_lr
429
+
430
+ if net_wd is not None:
431
+ try:
432
+ _, optim_wd, wd_resume_lr, epoch_str = (
433
+ utils.checkpoints.load_checkpoint(
434
+ utils.checkpoints.get_latest_checkpoint_path(
435
+ model_dir, "WD_*.pth"
436
+ ),
437
+ net_wd,
438
+ optim_wd,
439
+ skip_optimizer=hps.train.skip_optimizer,
440
+ )
441
+ )
442
+ if not optim_wd.param_groups[0].get("initial_lr"):
443
+ optim_wd.param_groups[0]["initial_lr"] = wd_resume_lr
444
+ except:
445
+ if not optim_wd.param_groups[0].get("initial_lr"):
446
+ optim_wd.param_groups[0]["initial_lr"] = wd_resume_lr
447
+ logger.info("Initialize wavlm")
448
+
449
+
450
+ _, optim_g, g_resume_lr, epoch_str = utils.checkpoints.load_checkpoint(
451
+ utils.checkpoints.get_latest_checkpoint_path(model_dir, "G_*.pth"),
452
+ net_g,
453
+ optim_g,
454
+ skip_optimizer=hps.train.skip_optimizer,
455
+ )
456
+ _, optim_d, d_resume_lr, epoch_str = utils.checkpoints.load_checkpoint(
457
+ utils.checkpoints.get_latest_checkpoint_path(model_dir, "D_*.pth"),
458
+ net_d,
459
+ optim_d,
460
+ skip_optimizer=hps.train.skip_optimizer,
461
+ )
462
+ if not optim_g.param_groups[0].get("initial_lr"):
463
+ optim_g.param_groups[0]["initial_lr"] = g_resume_lr
464
+ if not optim_d.param_groups[0].get("initial_lr"):
465
+ optim_d.param_groups[0]["initial_lr"] = d_resume_lr
466
+
467
+ epoch_str = max(epoch_str, 1)
468
+ # global_step = (epoch_str - 1) * len(train_loader)
469
+ global_step = int(
470
+ utils.get_steps(
471
+ utils.checkpoints.get_latest_checkpoint_path(model_dir, "G_*.pth")
472
+ )
473
+ )
474
+ logger.info(
475
+ f"******************Found the model. Current epoch is {epoch_str}, gloabl step is {global_step}*********************"
476
+ )
477
+ else:
478
+ try:
479
+ _ = utils.safetensors.load_safetensors(
480
+ os.path.join(model_dir, "G_0.safetensors"), net_g
481
+ )
482
+ _ = utils.safetensors.load_safetensors(
483
+ os.path.join(model_dir, "D_0.safetensors"), net_d
484
+ )
485
+ if net_dur_disc is not None:
486
+ _ = utils.safetensors.load_safetensors(
487
+ os.path.join(model_dir, "DUR_0.safetensors"), net_dur_disc
488
+ )
489
+
490
+ if net_wd is not None:
491
+ _ = utils.safetensors.load_safetensors(
492
+ os.path.join(model_dir, "WD_0.safetensors"), net_wd
493
+ )
494
+
495
+ logger.info("Loaded the pretrained models.")
496
+ except Exception as e:
497
+ logger.warning(e)
498
+ logger.warning(
499
+ "It seems that you are not using the pretrained models, so we will train from scratch."
500
+ )
501
+ finally:
502
+ epoch_str = 1
503
+ global_step = 0
504
+
505
+ def lr_lambda(epoch):
506
+ """
507
+ Learning rate scheduler for warmup and exponential decay.
508
+ - During the warmup period, the learning rate increases linearly.
509
+ - After the warmup period, the learning rate decreases exponentially.
510
+ """
511
+ if epoch < hps.train.warmup_epochs:
512
+ return float(epoch) / float(max(1, hps.train.warmup_epochs))
513
+ else:
514
+ return hps.train.lr_decay ** (epoch - hps.train.warmup_epochs)
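+ # Worked example with hypothetical values warmup_epochs=5 and lr_decay=0.999:
+ # lr_lambda(2) = 2 / 5 = 0.4 (linear warmup),
+ # lr_lambda(5) = 0.999 ** 0 = 1.0,
+ # lr_lambda(10) = 0.999 ** 5 ≈ 0.995 (exponential decay of the base learning rate).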
515
+
516
+ scheduler_last_epoch = epoch_str - 2
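+ # Presumably: LambdaLR counts epochs from 0 while epoch_str starts at 1, so the
+ # last completed epoch index is epoch_str - 2 (equal to -1, LambdaLR's fresh-start
+ # default, when epoch_str == 1).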
517
+ scheduler_g = torch.optim.lr_scheduler.LambdaLR(
518
+ optim_g, lr_lambda=lr_lambda, last_epoch=scheduler_last_epoch
519
+ )
520
+ scheduler_d = torch.optim.lr_scheduler.LambdaLR(
521
+ optim_d, lr_lambda=lr_lambda, last_epoch=scheduler_last_epoch
522
+ )
523
+ if net_dur_disc is not None:
524
+ scheduler_dur_disc = torch.optim.lr_scheduler.LambdaLR(
525
+ optim_dur_disc, lr_lambda=lr_lambda, last_epoch=scheduler_last_epoch
526
+ )
527
+ else:
528
+ scheduler_dur_disc = None
529
+
530
+
531
+ if net_wd is not None:
532
+ scheduler_wd = torch.optim.lr_scheduler.LambdaLR(
533
+ optim_wd, lr_lambda=lr_lambda, last_epoch=scheduler_last_epoch
534
+ )
535
+ wl = WavLMLoss(
536
+ hps.model.slm.model,
537
+ net_wd,
538
+ hps.data.sampling_rate,
539
+ hps.model.slm.sr,
540
+ ).to(local_rank)
541
+ else:
542
+ scheduler_wd = None
543
+ wl = None
544
+
545
+
546
+
547
+ scaler = GradScaler(enabled=hps.train.bf16_run)
548
+ logger.info("Start training.")
549
+
550
+ diff = abs(
551
+ epoch_str * len(train_loader) - (hps.train.epochs + 1) * len(train_loader)
552
+ )
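+ # diff simplifies to (hps.train.epochs + 1 - epoch_str) * len(train_loader), i.e.
+ # the number of remaining training steps; together with the already completed
+ # global_step it gives the progress-bar total below.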
553
+ pbar = None
554
+ if not args.no_progress_bar:
555
+ pbar = tqdm(
556
+ total=global_step + diff,
557
+ initial=global_step,
558
+ smoothing=0.05,
559
+ file=SAFE_STDOUT,
560
+ )
561
+ initial_step = global_step
562
+
563
+ for epoch in range(epoch_str, hps.train.epochs + 1):
564
+ if rank == 0:
565
+ train_and_evaluate(
566
+ rank,
567
+ local_rank,
568
+ epoch,
569
+ hps,
570
+ [net_g, net_d, net_dur_disc, net_wd, wl],
571
+ [optim_g, optim_d, optim_dur_disc, optim_wd],
572
+ [scheduler_g, scheduler_d, scheduler_dur_disc, scheduler_wd],
573
+ scaler,
574
+ [train_loader, eval_loader],
575
+ logger,
576
+ [writer, writer_eval],
577
+ pbar,
578
+ initial_step,
579
+ )
580
+ else:
581
+ train_and_evaluate(
582
+ rank,
583
+ local_rank,
584
+ epoch,
585
+ hps,
586
+ [net_g, net_d, net_dur_disc, net_wd, wl],
587
+ [optim_g, optim_d, optim_dur_disc, optim_wd],
588
+ [scheduler_g, scheduler_d, scheduler_dur_disc, scheduler_wd],
589
+ scaler,
590
+ [train_loader, None],
591
+ None,
592
+ None,
593
+ pbar,
594
+ initial_step,
595
+ )
596
+ scheduler_g.step()
597
+ scheduler_d.step()
598
+ if net_dur_disc is not None:
599
+ scheduler_dur_disc.step()
600
+ if net_wd is not None:
601
+ scheduler_wd.step()
602
+ if epoch == hps.train.epochs:
603
+ # Save the final models
604
+ assert optim_g is not None
605
+ utils.checkpoints.save_checkpoint(
606
+ net_g,
607
+ optim_g,
608
+ hps.train.learning_rate,
609
+ epoch,
610
+ os.path.join(model_dir, f"G_{global_step}.pth"),
611
+ )
612
+ assert optim_d is not None
613
+ utils.checkpoints.save_checkpoint(
614
+ net_d,
615
+ optim_d,
616
+ hps.train.learning_rate,
617
+ epoch,
618
+ os.path.join(model_dir, f"D_{global_step}.pth"),
619
+ )
620
+ if net_dur_disc is not None:
621
+ assert optim_dur_disc is not None
622
+ utils.checkpoints.save_checkpoint(
623
+ net_dur_disc,
624
+ optim_dur_disc,
625
+ hps.train.learning_rate,
626
+ epoch,
627
+ os.path.join(model_dir, f"DUR_{global_step}.pth"),
628
+ )
629
+
630
+
631
+ if net_wd is not None:
632
+ assert optim_wd is not None
633
+ utils.checkpoints.save_checkpoint(
634
+ net_wd,
635
+ optim_wd,
636
+ hps.train.learning_rate,
637
+ epoch,
638
+ os.path.join(model_dir, f"WD_{global_step}.pth"),
639
+ )
640
+
641
+
642
+ utils.safetensors.save_safetensors(
643
+ net_g,
644
+ epoch,
645
+ os.path.join(
646
+ config.out_dir,
647
+ f"{config.model_name}_e{epoch}_s{global_step}.safetensors",
648
+ ),
649
+ for_infer=True,
650
+ )
651
+ if hps.repo_id is not None:
652
+ future1 = api.upload_folder(
653
+ repo_id=hps.repo_id,
654
+ folder_path=config.dataset_path,
655
+ path_in_repo=f"Data/{config.model_name}",
656
+ delete_patterns="*.pth", # Only keep the latest checkpoint
657
+ run_as_future=True,
658
+ )
659
+ future2 = api.upload_folder(
660
+ repo_id=hps.repo_id,
661
+ folder_path=config.out_dir,
662
+ path_in_repo=f"model_assets/{config.model_name}",
663
+ run_as_future=True,
664
+ )
665
+ try:
666
+ future1.result()
667
+ future2.result()
668
+ except Exception as e:
669
+ logger.error(e)
670
+
671
+ if pbar is not None:
672
+ pbar.close()
673
+
674
+
675
+ def train_and_evaluate(
676
+ rank,
677
+ local_rank,
678
+ epoch,
679
+ hps: HyperParameters,
680
+ nets,
681
+ optims,
682
+ schedulers,
683
+ scaler,
684
+ loaders,
685
+ logger,
686
+ writers,
687
+ pbar: tqdm,
688
+ initial_step: int,
689
+ ):
690
+ net_g, net_d, net_dur_disc, net_wd, wl = nets
691
+ optim_g, optim_d, optim_dur_disc, optim_wd = optims
692
+ scheduler_g, scheduler_d, scheduler_dur_disc, scheduler_wd = schedulers
693
+ train_loader, eval_loader = loaders
694
+ if writers is not None:
695
+ writer, writer_eval = writers
696
+
697
+ train_loader.batch_sampler.set_epoch(epoch)
698
+ global global_step
699
+
700
+ net_g.train()
701
+ net_d.train()
702
+ if net_dur_disc is not None:
703
+ net_dur_disc.train()
704
+ if net_wd is not None:
705
+ net_wd.train()
706
+ for batch_idx, (
707
+ x,
708
+ x_lengths,
709
+ spec,
710
+ spec_lengths,
711
+ y,
712
+ y_lengths,
713
+ speakers,
714
+ tone,
715
+ language,
716
+ bert,
717
+ ja_bert,
718
+ en_bert,
719
+ style_vec,
720
+ ) in enumerate(train_loader):
721
+ if net_g.module.use_noise_scaled_mas:
722
+ current_mas_noise_scale = (
723
+ net_g.module.mas_noise_scale_initial
724
+ - net_g.module.noise_scale_delta * global_step
725
+ )
726
+ net_g.module.current_mas_noise_scale = max(current_mas_noise_scale, 0.0)
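+ # With the defaults set in run() (initial 0.01, delta 2e-6 per step), the MAS
+ # noise scale decays linearly and is clamped to 0 after 0.01 / 2e-6 = 5000 steps.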
727
+ x, x_lengths = x.cuda(local_rank, non_blocking=True), x_lengths.cuda(
728
+ local_rank, non_blocking=True
729
+ )
730
+ spec, spec_lengths = spec.cuda(
731
+ local_rank, non_blocking=True
732
+ ), spec_lengths.cuda(local_rank, non_blocking=True)
733
+ y, y_lengths = y.cuda(local_rank, non_blocking=True), y_lengths.cuda(
734
+ local_rank, non_blocking=True
735
+ )
736
+ speakers = speakers.cuda(local_rank, non_blocking=True)
737
+ tone = tone.cuda(local_rank, non_blocking=True)
738
+ language = language.cuda(local_rank, non_blocking=True)
739
+ bert = bert.cuda(local_rank, non_blocking=True)
740
+ ja_bert = ja_bert.cuda(local_rank, non_blocking=True)
741
+ en_bert = en_bert.cuda(local_rank, non_blocking=True)
742
+ style_vec = style_vec.cuda(local_rank, non_blocking=True)
743
+
744
+ with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16):
745
+ (
746
+ y_hat,
747
+ l_length,
748
+ attn,
749
+ ids_slice,
750
+ x_mask,
751
+ z_mask,
752
+ (z, z_p, m_p, logs_p, m_q, logs_q),
753
+ (hidden_x, logw, logw_), # , logw_sdp),
754
+ g,
755
+ ) = net_g(
756
+ x,
757
+ x_lengths,
758
+ spec,
759
+ spec_lengths,
760
+ speakers,
761
+ tone,
762
+ language,
763
+ bert,
764
+ ja_bert,
765
+ en_bert,
766
+ style_vec,
767
+ )
768
+ mel = spec_to_mel_torch(
769
+ spec,
770
+ hps.data.filter_length,
771
+ hps.data.n_mel_channels,
772
+ hps.data.sampling_rate,
773
+ hps.data.mel_fmin,
774
+ hps.data.mel_fmax,
775
+ )
776
+ y_mel = commons.slice_segments(
777
+ mel, ids_slice, hps.train.segment_size // hps.data.hop_length
778
+ )
779
+ y_hat_mel = mel_spectrogram_torch(
780
+ y_hat.squeeze(1).float(),
781
+ hps.data.filter_length,
782
+ hps.data.n_mel_channels,
783
+ hps.data.sampling_rate,
784
+ hps.data.hop_length,
785
+ hps.data.win_length,
786
+ hps.data.mel_fmin,
787
+ hps.data.mel_fmax,
788
+ )
789
+
790
+ y = commons.slice_segments(
791
+ y, ids_slice * hps.data.hop_length, hps.train.segment_size
792
+ ) # slice
793
+
794
+ # Discriminator
795
+ y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
796
+ with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16):
797
+ loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
798
+ y_d_hat_r, y_d_hat_g
799
+ )
800
+ loss_disc_all = loss_disc
801
+ if net_dur_disc is not None:
802
+ y_dur_hat_r, y_dur_hat_g = net_dur_disc(
803
+ hidden_x.detach(), x_mask.detach(), logw.detach(), logw_.detach()
804
+ )
805
+ with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16):
806
+ # TODO: The mean should probably be taken using the mask, but for now just take the mean over everything
807
+ (
808
+ loss_dur_disc,
809
+ losses_dur_disc_r,
810
+ losses_dur_disc_g,
811
+ ) = discriminator_loss(y_dur_hat_r, y_dur_hat_g)
812
+ loss_dur_disc_all = loss_dur_disc
813
+ optim_dur_disc.zero_grad()
814
+ scaler.scale(loss_dur_disc_all).backward()
815
+ scaler.unscale_(optim_dur_disc)
816
+ commons.clip_grad_value_(net_dur_disc.parameters(), None)
817
+ scaler.step(optim_dur_disc)
818
+
819
+
820
+ if net_wd is not None:
821
+ # logger.debug(f"y.shape: {y.shape}, y_hat.shape: {y_hat.shape}")
822
+ # shape: (batch, 1, time)
823
+ with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16):
824
+ loss_slm = wl.discriminator(
825
+ y.detach().squeeze(1), y_hat.detach().squeeze(1)
826
+ ).mean()
827
+ optim_wd.zero_grad()
828
+ scaler.scale(loss_slm).backward()
829
+ scaler.unscale_(optim_wd)
830
+ # torch.nn.utils.clip_grad_norm_(parameters=net_wd.parameters(), max_norm=200)
831
+ grad_norm_wd = commons.clip_grad_value_(net_wd.parameters(), None)
832
+ scaler.step(optim_wd)
833
+
834
+
835
+
836
+ optim_d.zero_grad()
837
+ scaler.scale(loss_disc_all).backward()
838
+ scaler.unscale_(optim_d)
839
+ if getattr(hps.train, "bf16_run", False):
840
+ torch.nn.utils.clip_grad_norm_(parameters=net_d.parameters(), max_norm=200)
841
+ grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
842
+ scaler.step(optim_d)
843
+
844
+ with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16):
845
+ # Generator
846
+ y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
847
+ if net_dur_disc is not None:
848
+ y_dur_hat_r, y_dur_hat_g = net_dur_disc(hidden_x, x_mask, logw, logw_)
849
+ if net_wd is not None:
850
+ loss_lm = wl(y.detach().squeeze(1), y_hat.squeeze(1)).mean()
851
+ loss_lm_gen = wl.generator(y_hat.squeeze(1))
852
+ with autocast(enabled=hps.train.bf16_run, dtype=torch.bfloat16):
853
+ loss_dur = torch.sum(l_length.float())
854
+ loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
855
+ loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
856
+
857
+ loss_fm = feature_loss(fmap_r, fmap_g)
858
+ loss_gen, losses_gen = generator_loss(y_d_hat_g)
859
+ loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl
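+ # Note: loss_mel and loss_kl above are already weighted by hps.train.c_mel and
+ # hps.train.c_kl respectively.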
860
+ if net_dur_disc is not None:
+ loss_dur_gen, losses_dur_gen = generator_loss(y_dur_hat_g)
+ loss_gen_all += loss_dur_gen
+ if net_wd is not None:
+ loss_gen_all += loss_lm + loss_lm_gen
+
868
+ optim_g.zero_grad()
869
+ scaler.scale(loss_gen_all).backward()
870
+ scaler.unscale_(optim_g)
871
+ if getattr(hps.train, "bf16_run", False):
872
+ torch.nn.utils.clip_grad_norm_(parameters=net_g.parameters(), max_norm=500)
873
+ grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
874
+ scaler.step(optim_g)
875
+ scaler.update()
876
+
877
+ if rank == 0:
878
+ if global_step % hps.train.log_interval == 0 and not hps.speedup:
879
+ lr = optim_g.param_groups[0]["lr"]
880
+ losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl]
881
+ # logger.info(
882
+ # "Train Epoch: {} [{:.0f}%]".format(
883
+ # epoch, 100.0 * batch_idx / len(train_loader)
884
+ # )
885
+ # )
886
+ # logger.info([x.item() for x in losses] + [global_step, lr])
887
+
888
+ scalar_dict = {
889
+ "loss/g/total": loss_gen_all,
890
+ "loss/d/total": loss_disc_all,
891
+ "learning_rate": lr,
892
+ "grad_norm_d": grad_norm_d,
893
+ "grad_norm_g": grad_norm_g,
894
+ }
895
+ scalar_dict.update(
896
+ {
897
+ "loss/g/fm": loss_fm,
898
+ "loss/g/mel": loss_mel,
899
+ "loss/g/dur": loss_dur,
900
+ "loss/g/kl": loss_kl,
901
+ }
902
+ )
903
+ scalar_dict.update({f"loss/g/{i}": v for i, v in enumerate(losses_gen)})
904
+ scalar_dict.update(
905
+ {f"loss/d_r/{i}": v for i, v in enumerate(losses_disc_r)}
906
+ )
907
+ scalar_dict.update(
908
+ {f"loss/d_g/{i}": v for i, v in enumerate(losses_disc_g)}
909
+ )
910
+
911
+ if net_wd is not None:
912
+ scalar_dict.update(
913
+ {
914
+ "loss/wd/total": loss_slm,
915
+ "grad_norm_wd": grad_norm_wd,
916
+ "loss/g/lm": loss_lm,
917
+ "loss/g/lm_gen": loss_lm_gen,
918
+ }
919
+ )
920
+
921
+ # The logging below seems computationally heavy and probably nobody looks at it, so it is commented out
922
+ # image_dict = {
923
+ # "slice/mel_org": utils.plot_spectrogram_to_numpy(
924
+ # y_mel[0].data.cpu().numpy()
925
+ # ),
926
+ # "slice/mel_gen": utils.plot_spectrogram_to_numpy(
927
+ # y_hat_mel[0].data.cpu().numpy()
928
+ # ),
929
+ # "all/mel": utils.plot_spectrogram_to_numpy(
930
+ # mel[0].data.cpu().numpy()
931
+ # ),
932
+ # "all/attn": utils.plot_alignment_to_numpy(
933
+ # attn[0, 0].data.cpu().numpy()
934
+ # ),
935
+ # }
936
+ utils.summarize(
937
+ writer=writer,
938
+ global_step=global_step,
939
+ # images=image_dict,
940
+ scalars=scalar_dict,
941
+ )
942
+
943
+ if (
944
+ global_step % hps.train.eval_interval == 0
945
+ and global_step != 0
946
+ and initial_step != global_step
947
+ ):
948
+ if not hps.speedup:
949
+ evaluate(hps, net_g, eval_loader, writer_eval)
950
+ assert hps.model_dir is not None
951
+ utils.checkpoints.save_checkpoint(
952
+ net_g,
953
+ optim_g,
954
+ hps.train.learning_rate,
955
+ epoch,
956
+ os.path.join(hps.model_dir, f"G_{global_step}.pth"),
957
+ )
958
+ utils.checkpoints.save_checkpoint(
959
+ net_d,
960
+ optim_d,
961
+ hps.train.learning_rate,
962
+ epoch,
963
+ os.path.join(hps.model_dir, f"D_{global_step}.pth"),
964
+ )
965
+ if net_dur_disc is not None:
966
+ utils.checkpoints.save_checkpoint(
967
+ net_dur_disc,
968
+ optim_dur_disc,
969
+ hps.train.learning_rate,
970
+ epoch,
971
+ os.path.join(hps.model_dir, f"DUR_{global_step}.pth"),
972
+ )
973
+ if net_wd is not None:
974
+ utils.checkpoints.save_checkpoint(
975
+ net_wd,
976
+ optim_wd,
977
+ hps.train.learning_rate,
978
+ epoch,
979
+ os.path.join(hps.model_dir, f"WD_{global_step}.pth"),
980
+ )
981
+ keep_ckpts = config.train_ms_config.keep_ckpts
982
+ if keep_ckpts > 0:
983
+ utils.checkpoints.clean_checkpoints(
984
+ model_dir_path=hps.model_dir,
985
+ n_ckpts_to_keep=keep_ckpts,
986
+ sort_by_time=True,
987
+ )
988
+ # Save safetensors (for inference) to `model_assets/{model_name}`
989
+ utils.safetensors.save_safetensors(
990
+ net_g,
991
+ epoch,
992
+ os.path.join(
993
+ config.out_dir,
994
+ f"{config.model_name}_e{epoch}_s{global_step}.safetensors",
995
+ ),
996
+ for_infer=True,
997
+ )
998
+ if hps.repo_id is not None:
999
+ api.upload_folder(
1000
+ repo_id=hps.repo_id,
1001
+ folder_path=config.dataset_path,
1002
+ path_in_repo=f"Data/{config.model_name}",
1003
+ delete_patterns="*.pth", # Only keep the latest checkpoint
1004
+ run_as_future=True,
1005
+ )
1006
+ api.upload_folder(
1007
+ repo_id=hps.repo_id,
1008
+ folder_path=config.out_dir,
1009
+ path_in_repo=f"model_assets/{config.model_name}",
1010
+ run_as_future=True,
1011
+ )
1012
+
1013
+ global_step += 1
1014
+ if pbar is not None:
1015
+ pbar.set_description(
1016
+ f"Epoch {epoch}({100.0 * batch_idx / len(train_loader):.0f}%)/{hps.train.epochs}"
1017
+ )
1018
+ pbar.update()
1019
+ # The upstream repo says to remove this for speed, so it was going to be removed,
1020
+ # but it may reduce memory usage, so it is kept here for now
1021
+ gc.collect()
1022
+ torch.cuda.empty_cache()
1023
+ if pbar is None and rank == 0:
1024
+ logger.info(f"====> Epoch: {epoch}, step: {global_step}")
1025
+
1026
+
1027
+ def evaluate(hps, generator, eval_loader, writer_eval):
1028
+ generator.eval()
1029
+ image_dict = {}
1030
+ audio_dict = {}
1031
+ print()
1032
+ logger.info("Evaluating ...")
1033
+ with torch.no_grad():
1034
+ for batch_idx, (
1035
+ x,
1036
+ x_lengths,
1037
+ spec,
1038
+ spec_lengths,
1039
+ y,
1040
+ y_lengths,
1041
+ speakers,
1042
+ tone,
1043
+ language,
1044
+ bert,
1045
+ ja_bert,
1046
+ en_bert,
1047
+ style_vec,
1048
+ ) in enumerate(eval_loader):
1049
+ x, x_lengths = x.cuda(), x_lengths.cuda()
1050
+ spec, spec_lengths = spec.cuda(), spec_lengths.cuda()
1051
+ y, y_lengths = y.cuda(), y_lengths.cuda()
1052
+ speakers = speakers.cuda()
1053
+ bert = bert.cuda()
1054
+ ja_bert = ja_bert.cuda()
1055
+ en_bert = en_bert.cuda()
1056
+ tone = tone.cuda()
1057
+ language = language.cuda()
1058
+ style_vec = style_vec.cuda()
1059
+ for use_sdp in [True, False]:
1060
+ y_hat, attn, mask, *_ = generator.module.infer(
1061
+ x,
1062
+ x_lengths,
1063
+ speakers,
1064
+ tone,
1065
+ language,
1066
+ bert,
1067
+ ja_bert,
1068
+ en_bert,
1069
+ style_vec,
1070
+ y=spec,
1071
+ max_len=1000,
1072
+ sdp_ratio=0.0 if not use_sdp else 1.0,
1073
+ )
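+ # Presumably sdp_ratio=1.0 synthesizes with the stochastic duration predictor and
+ # 0.0 with the deterministic one, so both variants are written to TensorBoard.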
1074
+ y_hat_lengths = mask.sum([1, 2]).long() * hps.data.hop_length
1075
+ # The logging below seems computationally heavy and probably nobody looks at it, so it is commented out
1076
+ # mel = spec_to_mel_torch(
1077
+ # spec,
1078
+ # hps.data.filter_length,
1079
+ # hps.data.n_mel_channels,
1080
+ # hps.data.sampling_rate,
1081
+ # hps.data.mel_fmin,
1082
+ # hps.data.mel_fmax,
1083
+ # )
1084
+ # y_hat_mel = mel_spectrogram_torch(
1085
+ # y_hat.squeeze(1).float(),
1086
+ # hps.data.filter_length,
1087
+ # hps.data.n_mel_channels,
1088
+ # hps.data.sampling_rate,
1089
+ # hps.data.hop_length,
1090
+ # hps.data.win_length,
1091
+ # hps.data.mel_fmin,
1092
+ # hps.data.mel_fmax,
1093
+ # )
1094
+ # image_dict.update(
1095
+ # {
1096
+ # f"gen/mel_{batch_idx}": utils.plot_spectrogram_to_numpy(
1097
+ # y_hat_mel[0].cpu().numpy()
1098
+ # )
1099
+ # }
1100
+ # )
1101
+ # image_dict.update(
1102
+ # {
1103
+ # f"gt/mel_{batch_idx}": utils.plot_spectrogram_to_numpy(
1104
+ # mel[0].cpu().numpy()
1105
+ # )
1106
+ # }
1107
+ # )
1108
+ audio_dict.update(
1109
+ {
1110
+ f"gen/audio_{batch_idx}_{use_sdp}": y_hat[
1111
+ 0, :, : y_hat_lengths[0]
1112
+ ]
1113
+ }
1114
+ )
1115
+ audio_dict.update({f"gt/audio_{batch_idx}": y[0, :, : y_lengths[0]]})
1116
+
1117
+ utils.summarize(
1118
+ writer=writer_eval,
1119
+ global_step=global_step,
1120
+ images=image_dict,
1121
+ audios=audio_dict,
1122
+ audio_sampling_rate=hps.data.sampling_rate,
1123
+ )
1124
+ generator.train()
1125
+
1126
+
1127
+ if __name__ == "__main__":
1128
+ run()