jiaqili3 committed
Commit d949a00 · 1 Parent(s): addb7e5
models/tts/valle_v2.1/base_trainer.py ADDED
@@ -0,0 +1,810 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import os
8
+ import random
9
+ import shutil
10
+ import time
11
+ from abc import abstractmethod
12
+ from pathlib import Path
13
+ import math
14
+ import accelerate
15
+ import json5
16
+ import numpy as np
17
+ import torch
18
+ from accelerate.logging import get_logger
19
+ from accelerate.utils import ProjectConfiguration
20
+ from torch.utils.data import ConcatDataset, DataLoader
21
+ from tqdm import tqdm
22
+
23
+ from models.base.base_sampler import build_samplers
24
+ from optimizer.optimizers import NoamLR
25
+
26
+
27
+ class MainProcessLogger:
28
+ def __init__(self, is_main_process=True, name=None, **kwargs):
29
+ import logging
30
+
31
+ if name is None:
32
+ logger = logging.getLogger(__name__)
33
+ else:
34
+ logger = logging.getLogger(name)
35
+ self.logger = logger
36
+ self.is_main_process = is_main_process
37
+
38
+ def info(self, msg):
39
+ if self.is_main_process:
40
+ print(msg)
41
+ # self.logger.info(msg)
42
+
43
+ def debug(self, msg):
44
+ if self.is_main_process:
45
+ print(msg)
46
+ # self.logger.debug(msg)
47
+
48
+ def warning(self, msg):
49
+ if self.is_main_process:
50
+ print(msg)
51
+ # self.logger.warning(msg)
52
+
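A small sketch of how this print-based wrapper behaves (illustrative only; inside the trainer the flag comes from `accelerator.is_main_process`, here it is passed by hand):

# Illustrative usage of MainProcessLogger, outside of any Accelerate context.
worker_logger = MainProcessLogger(is_main_process=False, name="demo")
worker_logger.info("suppressed on non-main processes")   # prints nothing
main_logger = MainProcessLogger(is_main_process=True, name="demo")
main_logger.info("printed once, from the main process")  # prints the message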
53
+
54
+ class BaseTrainer(object):
55
+ r"""The base trainer for all tasks. Any trainer should inherit from this class."""
56
+
57
+ def __init__(self, args=None, cfg=None):
58
+ super().__init__()
59
+
60
+ self.args = args
61
+ self.cfg = cfg
62
+
63
+ cfg.exp_name = args.exp_name
64
+
65
+ # init with accelerate
66
+ self._init_accelerator()
67
+ self.accelerator.wait_for_everyone()
68
+
69
+ # Use accelerate logger for distributed training
70
+ with self.accelerator.main_process_first():
71
+ self.logger = MainProcessLogger(
72
+ self.accelerator.is_main_process,
73
+ name=args.exp_name,
74
+ log_level=args.log_level,
75
+ )
76
+
77
+ # Log some info
78
+ self.logger.info("=" * 56)
79
+ self.logger.info("||\t\t" + "New training process started." + "\t\t||")
80
+ self.logger.info("=" * 56)
81
+ self.logger.info("\n")
82
+ self.logger.debug(f"Using {args.log_level.upper()} logging level.")
83
+ self.logger.info(f"Experiment name: {args.exp_name}")
84
+ self.logger.info(f"Experiment directory: {self.exp_dir}")
85
+ self.checkpoint_dir = os.path.join(self.exp_dir, "checkpoint")
86
+ if self.accelerator.is_main_process:
87
+ os.makedirs(self.checkpoint_dir, exist_ok=True)
88
+ self.logger.debug(f"Checkpoint directory: {self.checkpoint_dir}")
89
+
90
+ # init counts
91
+ self.batch_count: int = 0
92
+ self.step: int = 0
93
+ self.epoch: int = 0
94
+ self.max_epoch = (
95
+ self.cfg.train.max_epoch if self.cfg.train.max_epoch > 0 else float("inf")
96
+ )
97
+ self.logger.info(
98
+ "Max epoch: {}".format(
99
+ self.max_epoch if self.max_epoch < float("inf") else "Unlimited"
100
+ )
101
+ )
102
+
103
+ # Check values
104
+ if self.accelerator.is_main_process:
105
+ self.__check_basic_configs()
106
+ # Set runtime configs
107
+ self.save_checkpoint_stride = self.cfg.train.save_checkpoint_stride
108
+ self.checkpoints_path = [
109
+ [] for _ in range(len(self.save_checkpoint_stride))
110
+ ]
111
+ self.keep_last = [
112
+ i if i > 0 else float("inf") for i in self.cfg.train.keep_last
113
+ ]
114
+ self.run_eval = self.cfg.train.run_eval
115
+
116
+ # set random seed
117
+ with self.accelerator.main_process_first():
118
+ start = time.monotonic_ns()
119
+ self._set_random_seed(args.seed)
120
+ end = time.monotonic_ns()
121
+ self.logger.debug(
122
+ f"Setting random seed done in {(end - start) / 1e6:.2f}ms"
123
+ )
124
+ self.logger.debug(f"Random seed: {args.seed}")
125
+
126
+ # setup data_loader
127
+ with self.accelerator.main_process_first():
128
+ self.logger.info("Building dataset...")
129
+ start = time.monotonic_ns()
130
+ self.train_dataloader, self.valid_dataloader = self._build_dataloader()
131
+ end = time.monotonic_ns()
132
+ self.logger.info(f"Building dataset done in {(end - start) / 1e6:.2f}ms")
133
+
134
+ # setup model
135
+ with self.accelerator.main_process_first():
136
+ self.logger.info("Building model...")
137
+ start = time.monotonic_ns()
138
+ self.model = self._build_model()
139
+ end = time.monotonic_ns()
140
+ self.logger.debug(self.model)
141
+ self.logger.info(f"Building model done in {(end - start) / 1e6:.2f}ms")
142
+ self.logger.info(
143
+ f"Model parameters: {self.__count_parameters(self.model)/1e6:.2f}M"
144
+ )
145
+ # optimizer & scheduler
146
+ with self.accelerator.main_process_first():
147
+ self.logger.info("Building optimizer and scheduler...")
148
+ start = time.monotonic_ns()
149
+ self.optimizer = self._build_optimizer()
150
+ self.scheduler = self._build_scheduler()
151
+ end = time.monotonic_ns()
152
+ self.logger.info(
153
+ f"Building optimizer and scheduler done in {(end - start) / 1e6:.2f}ms"
154
+ )
155
+
156
+ # accelerate prepare
157
+ self.logger.info("Initializing accelerate...")
158
+ start = time.monotonic_ns()
159
+ self._accelerator_prepare()
160
+ end = time.monotonic_ns()
161
+ self.logger.info(f"Initializing accelerate done in {(end - start) / 1e6:.2f}ms")
162
+
163
+ # create criterion
164
+ with self.accelerator.main_process_first():
165
+ self.logger.info("Building criterion...")
166
+ start = time.monotonic_ns()
167
+ self.criterion = self._build_criterion()
168
+ end = time.monotonic_ns()
169
+ self.logger.info(f"Building criterion done in {(end - start) / 1e6:.2f}ms")
170
+
171
+ # Resume or Finetune
172
+ with self.accelerator.main_process_first():
173
+ if args.resume:
174
+ if args.resume_from_ckpt_path == "":
175
+ ## Automatically resume according to the current experiment name
176
+ self.logger.info(
177
+ "Automatically resuming from latest checkpoint in {}...".format(
178
+ self.checkpoint_dir
179
+ )
180
+ )
181
+ start = time.monotonic_ns()
182
+ ckpt_path = self._load_model(
183
+ checkpoint_dir=self.checkpoint_dir, resume_type=args.resume_type
184
+ )
185
+ end = time.monotonic_ns()
186
+ self.logger.info(
187
+ f"Resuming from checkpoint done in {(end - start) / 1e6:.2f}ms"
188
+ )
189
+ else:
190
+ ## Resume from the given checkpoint path
191
+ if not os.path.exists(args.resume_from_ckpt_path):
192
+ raise ValueError(
193
+ "[Error] The resumed checkpoint path {} does not exist.".format(
194
+ args.resume_from_ckpt_path
195
+ )
196
+ )
197
+ self.logger.info(
198
+ "Resuming from {}...".format(args.resume_from_ckpt_path)
199
+ )
200
+ start = time.monotonic_ns()
201
+ ckpt_path = self._load_model(
202
+ checkpoint_path=args.resume_from_ckpt_path,
203
+ resume_type=args.resume_type,
204
+ )
205
+ end = time.monotonic_ns()
206
+ self.logger.info(
207
+ f"Resuming from checkpoint done in {(end - start) / 1e6:.2f}ms"
208
+ )
209
+
210
+ # save config file path
211
+ self.config_save_path = os.path.join(self.exp_dir, "args.json")
212
+
213
+ def _accelerator_prepare(self):
214
+ (
215
+ self.train_dataloader,
216
+ self.valid_dataloader,
217
+ self.model,
218
+ self.optimizer,
219
+ self.scheduler,
220
+ ) = self.accelerator.prepare(
221
+ self.train_dataloader,
222
+ self.valid_dataloader,
223
+ self.model,
224
+ self.optimizer,
225
+ self.scheduler,
226
+ )
227
+
228
+ ### Following are abstract methods that should be implemented in child classes ###
229
+ @abstractmethod
230
+ def _build_dataset(self):
231
+ r"""Build dataset for model training/validating/evaluating."""
232
+ pass
233
+
234
+ @staticmethod
235
+ @abstractmethod
236
+ def _build_criterion():
237
+ r"""Build criterion function for model loss calculation."""
238
+ pass
239
+
240
+ @abstractmethod
241
+ def _build_model(self):
242
+ r"""Build model for training/validating/evaluating."""
243
+ pass
244
+
245
+ @abstractmethod
246
+ def _forward_step(self, batch):
247
+ r"""One forward step of the neural network. This abstract method aims to
+ unify ``_train_step`` and ``_valid_step`` and avoid redundant implementations.
+ However, for special cases that need different forward-step patterns for
+ training and validation, you can override this method with ``pass`` and
+ implement ``_train_step`` and ``_valid_step`` separately.
252
+ """
253
+ pass
254
+
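The abstract methods above are the whole contract a concrete trainer has to fill in. A minimal sketch of a subclass is shown below; the `Toy*` names and the tiny model are hypothetical placeholders, not part of this commit, and instantiating the class still needs the usual `args`/`cfg` objects.

# Minimal illustrative subclass (hypothetical names, assumption: BaseTrainer as defined above).
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset


class ToyDataset(Dataset):
    """Placeholder dataset: 32 random (x, y) pairs."""
    def __init__(self, cfg, dataset_name, is_valid=False):
        self.x = torch.randn(32, 8)
        self.y = torch.randn(32, 1)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return {"x": self.x[idx], "y": self.y[idx]}


class ToyCollator:
    """Placeholder collator: stacks a list of dicts into batched tensors."""
    def __init__(self, cfg):
        pass

    def __call__(self, items):
        return {k: torch.stack([item[k] for item in items]) for k in items[0]}


class ToyTrainer(BaseTrainer):
    def _build_dataset(self):
        # _build_dataloader expects a (DatasetClass, CollatorClass) pair.
        return ToyDataset, ToyCollator

    def _build_criterion(self):
        return F.mse_loss

    def _build_model(self):
        return torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU(), torch.nn.Linear(8, 1))

    def _forward_step(self, batch):
        # Must return a scalar loss; used by both _train_step and _valid_step.
        pred = self.model(batch["x"])
        return self.criterion(pred, batch["y"])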
255
+ def save_checkpoint(self):
256
+ if self.accelerator.is_main_process:
257
+ keep_last = self.keep_last[0]
258
+ # List all checkpoint folders under self.checkpoint_dir
259
+ all_ckpts = os.listdir(self.checkpoint_dir)
260
+ all_ckpts = filter(lambda x: x.startswith("epoch"), all_ckpts)
261
+ all_ckpts = list(all_ckpts)
262
+ if len(all_ckpts) > keep_last:
263
+ # Keep only the newest `keep_last` folders in self.checkpoint_dir, sorted by step ("epoch-{:04d}_step-{:07d}_loss-{:.6f}")
264
+ all_ckpts = sorted(
265
+ all_ckpts, key=lambda x: int(x.split("_")[1].split("-")[1])
266
+ )
267
+ for ckpt in all_ckpts[:-keep_last]:
268
+ shutil.rmtree(os.path.join(self.checkpoint_dir, ckpt))
269
+ checkpoint_filename = "epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
270
+ self.epoch, self.step, self.current_loss
271
+ )
272
+ path = os.path.join(self.checkpoint_dir, checkpoint_filename)
273
+ self.logger.info("Saving state to {}...".format(path))
274
+ self.accelerator.save_state(path)
275
+ self.logger.info("Finished saving state.")
276
+
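save_checkpoint prunes old folders by parsing the step back out of the `epoch-{:04d}_step-{:07d}_loss-{:.6f}` name, and `_load_model` below relies on the same convention when resuming. A small self-contained check of that round trip (the numbers are made up for illustration):

# Round-trip check of the checkpoint naming convention used above.
name = "epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(3, 12500, 1.234567)
print(name)                                      # epoch-0003_step-0012500_loss-1.234567
step = int(name.split("_")[1].split("-")[1])     # sort key used in save_checkpoint
epoch = int(name.split("_")[-3].split("-")[-1])  # parsed again when resuming
print(step, epoch)                               # 12500 3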
277
+ @abstractmethod
278
+ def _save_auxiliary_states(self):
279
+ r"""To save some auxiliary states when saving model's ckpt"""
280
+ pass
281
+
282
+ def echo_log(self, losses, mode="Training"):
283
+ message = [
284
+ "{} - Epoch {} Step {}: [{:.3f} s/step]".format(
285
+ mode, self.epoch + 1, self.step, self.time_window.average
286
+ )
287
+ ]
288
+
289
+ for key in sorted(losses.keys()):
290
+ if isinstance(losses[key], dict):
291
+ for k, v in losses[key].items():
292
+ message.append(
293
+ str(k).split("/")[-1] + "=" + str(round(float(v), 5))
294
+ )
295
+ else:
296
+ message.append(
297
+ str(key).split("/")[-1] + "=" + str(round(float(losses[key]), 5))
298
+ )
299
+ self.logger.info(", ".join(message))
300
+
301
+ ### Abstract methods end ###
302
+
303
+ ### THIS IS MAIN ENTRY ###
304
+ def train_loop(self):
305
+ r"""Training loop. The public entry of training process."""
306
+ # Wait everyone to prepare before we move on
307
+ self.accelerator.wait_for_everyone()
308
+ # dump config file
309
+ if self.accelerator.is_main_process:
310
+ self.__dump_cfg(self.config_save_path)
311
+ self.model.train()
312
+ self.optimizer.zero_grad()
313
+ while self.epoch < self.max_epoch:
314
+ self.logger.info("\n")
315
+ self.logger.info("-" * 32)
316
+ self.logger.info("Epoch {}: ".format(self.epoch))
317
+
318
+ ### TODO: change the return values of _train_epoch() to a loss dict, or (total_loss, loss_dict)
319
+ ### It's inconvenient for the model with multiple losses
320
+ # Do training & validating epoch
321
+ train_loss = self._train_epoch()
322
+ self.logger.info(" |- Train/Loss: {:.6f}".format(train_loss))
323
+ valid_loss = self._valid_epoch()
324
+ self.logger.info(" |- Valid/Loss: {:.6f}".format(valid_loss))
325
+ self.accelerator.log(
326
+ {"Epoch/Train Loss": train_loss, "Epoch/Valid Loss": valid_loss},
327
+ step=self.epoch,
328
+ )
329
+
330
+ self.accelerator.wait_for_everyone()
331
+
332
+ # Update info for each epoch
333
+ self.epoch += 1
334
+
335
+ # Finish training and save final checkpoint
336
+ self.accelerator.wait_for_everyone()
337
+ if self.accelerator.is_main_process:
338
+ self.accelerator.save_state(
339
+ os.path.join(
340
+ self.checkpoint_dir,
341
+ "final_epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
342
+ self.epoch, self.step, valid_loss
343
+ ),
344
+ )
345
+ )
346
+ self._save_auxiliary_states()
347
+
348
+ self.accelerator.end_training()
349
+
350
+ def get_lr(self, it):
351
+ # 1) linear warmup for warmup_iters steps
352
+ if it < self.cfg.train.scheduler.warmup_steps:
353
+ return self.cfg.train.adamw.lr * it / self.cfg.train.scheduler.warmup_steps
354
+ # 2) if it > lr_decay_iters, return min learning rate
355
+ if it > self.cfg.train.scheduler.total_steps:
356
+ return self.cfg.train.scheduler.min_lr
357
+ # 3) in between, use cosine decay down to min learning rate
358
+ decay_ratio = (it - self.cfg.train.scheduler.warmup_steps) / (
359
+ self.cfg.train.scheduler.total_steps - self.cfg.train.scheduler.warmup_steps
360
+ )
361
+ assert 0 <= decay_ratio <= 1
362
+ coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
363
+ return self.cfg.train.scheduler.min_lr + coeff * (
364
+ self.cfg.train.adamw.lr - self.cfg.train.scheduler.min_lr
365
+ )
366
+
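For intuition, the warmup-plus-cosine schedule in `get_lr` can be reproduced standalone. The hyperparameters below (peak 2e-4, min 5e-5, 8k warmup, 400k total steps) mirror the values in `cfg/base.yaml` from this commit; treat them as an example, not a requirement.

import math

def cosine_lr(it, peak_lr=2e-4, min_lr=5e-5, warmup_steps=8000, total_steps=400000):
    """Standalone copy of the schedule implemented by BaseTrainer.get_lr."""
    if it < warmup_steps:                   # 1) linear warmup
        return peak_lr * it / warmup_steps
    if it > total_steps:                    # 2) floor after total_steps
        return min_lr
    ratio = (it - warmup_steps) / (total_steps - warmup_steps)
    coeff = 0.5 * (1.0 + math.cos(math.pi * ratio))  # 3) cosine decay, 1 -> 0
    return min_lr + coeff * (peak_lr - min_lr)

for step in (0, 4000, 8000, 204000, 400000, 500000):
    print(step, f"{cosine_lr(step):.2e}")
# 0 -> 0, 4000 -> 1e-4, 8000 -> 2e-4, midpoint -> 1.25e-4, 400000 and beyond -> 5e-5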
367
+ ### Following are methods that can be used directly in child classes ###
368
+ def _train_epoch(self):
369
+ r"""Training epoch. Should return average loss of a batch (sample) over
370
+ one epoch. See ``train_loop`` for usage.
371
+ """
372
+ self.model.train()
373
+ epoch_sum_loss: float = 0.0
374
+ ema_loss = None
375
+
376
+ # profiler
377
+ start_this_step_time = time.time()
378
+ finish_last_step_time = time.time()
379
+
380
+ for batch in tqdm(
381
+ self.train_dataloader,
382
+ desc=f"Training Epoch {self.epoch}",
383
+ unit="batch",
384
+ colour="GREEN",
385
+ leave=False,
386
+ dynamic_ncols=True,
387
+ smoothing=0.04,
388
+ disable=not self.accelerator.is_main_process,
389
+ ):
390
+ assert batch is not None
391
+
392
+ # start_this_step_time = time.time()
393
+ # print(f'load batch took: {start_this_step_time - finish_last_step_time:.6f}s')
394
+
395
+ # update learning rate
396
+ lr = self.get_lr(self.step)
397
+ for param_group in self.optimizer.param_groups:
398
+ param_group["lr"] = lr
399
+
400
+ # Do training step and BP
401
+ with self.accelerator.accumulate(self.model):
402
+ loss = self._train_step(batch)
403
+ self.current_loss = loss.item()
404
+ ema_loss = (
405
+ 0.99 * ema_loss + 0.01 * self.current_loss
406
+ if ema_loss is not None
407
+ else self.current_loss
408
+ )
409
+ self.accelerator.backward(loss)
410
+ if self.accelerator.sync_gradients:
411
+ self.accelerator.clip_grad_norm_(self.model.parameters(), 1.0)
412
+ self.optimizer.step()
413
+ self.optimizer.zero_grad()
414
+ self.batch_count += 1
415
+
416
+ # if self.accelerator.is_main_process:
417
+ # print(self.current_loss)
418
+
419
+ if self.accelerator.sync_gradients:
420
+ if self.step % self.cfg.train.save_checkpoint_stride[0] == 0:
421
+ self.accelerator.wait_for_everyone()
422
+ if self.accelerator.is_main_process:
423
+ try:
424
+ self.save_checkpoint()
425
+ except Exception as e:
+ self.logger.info(f"Failed to save checkpoint ({e}), resuming training...")
427
+ if self.accelerator.is_main_process:
428
+ if self.step % 100 == 0:
429
+ self.logger.info(f"EMA Loss: {ema_loss:.6f}")
430
+ self.accelerator.log(
431
+ {
432
+ "Step/Train Loss": loss,
433
+ "Step/Learning Rate": self.optimizer.param_groups[0]["lr"],
434
+ },
435
+ step=self.step,
436
+ )
437
+ epoch_sum_loss += loss.item()
438
+ self.step += 1
439
+
440
+ # finish_last_step_time = time.time()
441
+ # print(f'load took: {finish_last_step_time - start_this_step_time:.6f}s')
442
+ return (
443
+ epoch_sum_loss
444
+ / len(self.train_dataloader)
445
+ * self.cfg.train.gradient_accumulation_step
446
+ )
447
+
448
+ @torch.inference_mode()
449
+ def _valid_epoch(self):
450
+ r"""Validation epoch. Should return the average loss of a batch (sample) over
451
+ one epoch. See ``train_loop`` for usage.
452
+ """
453
+ self.model.eval()
454
+ epoch_sum_loss = 0.0
455
+ for batch in tqdm(
456
+ self.valid_dataloader,
457
+ desc=f"Validating Epoch {self.epoch}",
458
+ unit="batch",
459
+ colour="GREEN",
460
+ leave=False,
461
+ dynamic_ncols=True,
462
+ smoothing=0.04,
463
+ disable=not self.accelerator.is_main_process,
464
+ ):
465
+ batch_loss = self._valid_step(batch)
466
+ epoch_sum_loss += batch_loss.item()
467
+
468
+ return epoch_sum_loss / len(self.valid_dataloader)
469
+
470
+ def _train_step(self, batch):
471
+ r"""Training forward step. Should return average loss of a sample over
472
+ one batch. Calling ``_forward_step`` is recommended except in special cases.
473
+ See ``_train_epoch`` for usage.
474
+ """
475
+ return self._forward_step(batch)
476
+
477
+ @torch.inference_mode()
478
+ def _valid_step(self, batch):
479
+ r"""Validation forward step. Should return the average loss of a sample over
+ one batch. Calling ``_forward_step`` is recommended except in special cases.
+ See ``_valid_epoch`` for usage.
482
+ """
483
+ return self._forward_step(batch)
484
+
485
+ def _load_model(
486
+ self,
487
+ checkpoint_dir: str = None,
488
+ checkpoint_path: str = None,
489
+ resume_type: str = "",
490
+ ):
491
+ r"""Load model from checkpoint. If checkpoint_path is None, it will
492
+ load the latest checkpoint in checkpoint_dir. If checkpoint_path is not
493
+ None, it will load the checkpoint specified by checkpoint_path. **Only use this
494
+ method after** ``accelerator.prepare()``.
495
+ """
496
+ if checkpoint_path is None:
497
+ try:
498
+ all_ckpts = os.listdir(checkpoint_dir)
499
+ all_ckpts = filter(lambda x: x.startswith("epoch"), all_ckpts)
500
+ ls = list(all_ckpts)
501
+ ls = [os.path.join(checkpoint_dir, i) for i in ls]
502
+ ls.sort(
503
+ key=lambda x: int(x.split("_")[-2].split("-")[-1]), reverse=True
504
+ )
505
+ checkpoint_path = ls[0]
506
+ self.logger.info("Resume from {}".format(checkpoint_path))
507
+ except Exception as e:
508
+ print(
509
+ "Failed to load checkpoint from {}, starting FROM SCRATCH...".format(
510
+ checkpoint_dir
511
+ )
512
+ )
513
+ return None
514
+
515
+ if resume_type in ["resume", ""]:
516
+ # Load all the things, including model weights, optimizer, scheduler, and random states.
517
+ self.accelerator.load_state(input_dir=checkpoint_path)
518
+
519
+ # set epoch and step
520
+ self.epoch = int(checkpoint_path.split("_")[-3].split("-")[-1]) + 1
521
+ self.step = int(checkpoint_path.split("_")[-2].split("-")[-1]) + 1
522
+
523
+ elif resume_type == "finetune":
524
+ # Load only the model weights
525
+ accelerate.load_checkpoint_and_dispatch(
526
+ self.accelerator.unwrap_model(self.model),
527
+ os.path.join(checkpoint_path, "pytorch_model.bin"),
528
+ )
529
+ self.logger.info("Load model weights for finetune...")
530
+
531
+ else:
532
+ raise ValueError("Resume_type must be `resume` or `finetune`.")
533
+
534
+ return checkpoint_path
535
+
536
+ # TODO: LEGACY CODE
537
+ def _build_dataloader(self):
538
+ Dataset, Collator = self._build_dataset()
539
+
540
+ # build dataset instance for each dataset and combine them by ConcatDataset
541
+ datasets_list = []
542
+ for dataset in self.cfg.dataset:
543
+ subdataset = Dataset(self.cfg, dataset, is_valid=False)
544
+ datasets_list.append(subdataset)
545
+ train_dataset = ConcatDataset(datasets_list)
546
+ train_collate = Collator(self.cfg)
547
+ _, batch_sampler = build_samplers(train_dataset, self.cfg, self.logger, "train")
548
+ self.logger.debug(f"train batch_sampler: {list(batch_sampler)}")
549
+ self.logger.debug(f"length: {train_dataset.cumulative_sizes}")
550
+ # TODO: use config instead of (sampler, shuffle, drop_last, batch_size)
551
+ train_loader = DataLoader(
552
+ train_dataset,
553
+ collate_fn=train_collate,
554
+ batch_sampler=batch_sampler,
555
+ num_workers=self.cfg.train.dataloader.num_worker,
556
+ pin_memory=self.cfg.train.dataloader.pin_memory,
557
+ )
558
+
559
+ # Build valid dataloader
560
+ datasets_list = []
561
+ for dataset in self.cfg.dataset:
562
+ subdataset = Dataset(self.cfg, dataset, is_valid=True)
563
+ datasets_list.append(subdataset)
564
+ valid_dataset = ConcatDataset(datasets_list)
565
+ valid_collate = Collator(self.cfg)
566
+ _, batch_sampler = build_samplers(valid_dataset, self.cfg, self.logger, "valid")
567
+ self.logger.debug(f"valid batch_sampler: {list(batch_sampler)}")
568
+ self.logger.debug(f"length: {valid_dataset.cumulative_sizes}")
569
+ valid_loader = DataLoader(
570
+ valid_dataset,
571
+ collate_fn=valid_collate,
572
+ batch_sampler=batch_sampler,
573
+ num_workers=self.cfg.train.dataloader.num_worker,
574
+ pin_memory=self.cfg.train.dataloader.pin_memory,
575
+ )
576
+ return train_loader, valid_loader
577
+
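The loaders above concatenate one sub-dataset per entry in `cfg.dataset` and log `cumulative_sizes`, which marks where each sub-dataset ends inside the combined index space. A tiny standalone illustration:

import torch
from torch.utils.data import ConcatDataset, TensorDataset

part_a = TensorDataset(torch.zeros(3, 1))   # 3 items
part_b = TensorDataset(torch.ones(5, 1))    # 5 items
combined = ConcatDataset([part_a, part_b])

print(len(combined))               # 8
print(combined.cumulative_sizes)   # [3, 8] -- the value logged as "length" above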
578
+ @staticmethod
579
+ def _set_random_seed(seed):
580
+ r"""Set random seed for all possible random modules."""
581
+ random.seed(seed)
582
+ np.random.seed(seed)
583
+ torch.random.manual_seed(seed)
584
+
585
+ def _check_nan(self, loss, y_pred, y_gt):
586
+ if torch.any(torch.isnan(loss)):
587
+ self.logger.fatal("Fatal Error: Training stopped because the loss is NaN!")
588
+ self.logger.error("loss = {:.6f}".format(loss.item()), in_order=True)
589
+ if torch.any(torch.isnan(y_pred)):
590
+ self.logger.error(
591
+ f"y_pred has Nan: {torch.any(torch.isnan(y_pred))}", in_order=True
592
+ )
593
+ else:
594
+ self.logger.debug(
595
+ f"y_pred has Nan: {torch.any(torch.isnan(y_pred))}", in_order=True
596
+ )
597
+ if torch.any(torch.isnan(y_gt)):
598
+ self.logger.error(
599
+ f"y_gt has Nan: {torch.any(torch.isnan(y_gt))}", in_order=True
600
+ )
601
+ else:
602
+ self.logger.debug(
603
+ f"y_gt has nan: {torch.any(torch.isnan(y_gt))}", in_order=True
604
+ )
605
+ if torch.any(torch.isnan(y_pred)):
606
+ self.logger.error(f"y_pred: {y_pred}", in_order=True)
607
+ else:
608
+ self.logger.debug(f"y_pred: {y_pred}", in_order=True)
609
+ if torch.any(torch.isnan(y_gt)):
610
+ self.logger.error(f"y_gt: {y_gt}", in_order=True)
611
+ else:
612
+ self.logger.debug(f"y_gt: {y_gt}", in_order=True)
613
+
614
+ # TODO: still OK to save tracking?
615
+ self.accelerator.end_training()
616
+ raise RuntimeError("Loss is NaN! See the log for more info.")
617
+
618
+ ### Protected methods end ###
619
+
620
+ ## Following are private methods ##
621
+ ## !!! These are inconvenient for GAN-based model training. It'd be better to move these to svc_trainer.py if needed.
622
+ def _build_optimizer(self):
623
+ r"""Build optimizer for model."""
624
+ # Make case-insensitive matching
625
+ if self.cfg.train.optimizer.lower() == "adadelta":
626
+ optimizer = torch.optim.Adadelta(
627
+ self.model.parameters(), **self.cfg.train.adadelta
628
+ )
629
+ self.logger.info("Using Adadelta optimizer.")
630
+ elif self.cfg.train.optimizer.lower() == "adagrad":
631
+ optimizer = torch.optim.Adagrad(
632
+ self.model.parameters(), **self.cfg.train.adagrad
633
+ )
634
+ self.logger.info("Using Adagrad optimizer.")
635
+ elif self.cfg.train.optimizer.lower() == "adam":
636
+ optimizer = torch.optim.Adam(self.model.parameters(), **self.cfg.train.adam)
637
+ self.logger.info("Using Adam optimizer.")
638
+ elif self.cfg.train.optimizer.lower() == "adamw":
639
+ optimizer = torch.optim.AdamW(
640
+ self.model.parameters(), **self.cfg.train.adamw
641
+ )
642
+ elif self.cfg.train.optimizer.lower() == "sparseadam":
643
+ optimizer = torch.optim.SparseAdam(
644
+ self.model.parameters(), **self.cfg.train.sparseadam
645
+ )
646
+ elif self.cfg.train.optimizer.lower() == "adamax":
647
+ optimizer = torch.optim.Adamax(
648
+ self.model.parameters(), **self.cfg.train.adamax
649
+ )
650
+ elif self.cfg.train.optimizer.lower() == "asgd":
651
+ optimizer = torch.optim.ASGD(self.model.parameters(), **self.cfg.train.asgd)
652
+ elif self.cfg.train.optimizer.lower() == "lbfgs":
653
+ optimizer = torch.optim.LBFGS(
654
+ self.model.parameters(), **self.cfg.train.lbfgs
655
+ )
656
+ elif self.cfg.train.optimizer.lower() == "nadam":
657
+ optimizer = torch.optim.NAdam(
658
+ self.model.parameters(), **self.cfg.train.nadam
659
+ )
660
+ elif self.cfg.train.optimizer.lower() == "radam":
661
+ optimizer = torch.optim.RAdam(
662
+ self.model.parameters(), **self.cfg.train.radam
663
+ )
664
+ elif self.cfg.train.optimizer.lower() == "rmsprop":
665
+ optimizer = torch.optim.RMSprop(
666
+ self.model.parameters(), **self.cfg.train.rmsprop
667
+ )
668
+ elif self.cfg.train.optimizer.lower() == "rprop":
669
+ optimizer = torch.optim.Rprop(
670
+ self.model.parameters(), **self.cfg.train.rprop
671
+ )
672
+ elif self.cfg.train.optimizer.lower() == "sgd":
673
+ optimizer = torch.optim.SGD(self.model.parameters(), **self.cfg.train.sgd)
674
+ else:
675
+ raise NotImplementedError(
676
+ f"Optimizer {self.cfg.train.optimizer} not supported yet!"
677
+ )
678
+ return optimizer
679
+
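`_build_optimizer` dispatches on the `cfg.train.optimizer` string and forwards the matching sub-dict as keyword arguments, so with the `cfg/base.yaml` in this commit (`optimizer: adamW`, `adamw: {lr: 2e-4}`) the call reduces to roughly the sketch below. Note that during training `get_lr` overwrites the learning rate every step anyway.

import torch

model = torch.nn.Linear(8, 1)      # stand-in for self.model
adamw_cfg = {"lr": 2e-4}           # cfg.train.adamw from cfg/base.yaml
optimizer = torch.optim.AdamW(model.parameters(), **adamw_cfg)
print(optimizer.defaults["lr"])    # 0.0002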
680
+ def _build_scheduler(self):
681
+ r"""Build scheduler for optimizer."""
682
+ # Make case-insensitive matching
683
+ if self.cfg.train.scheduler.lower() == "lambdalr":
684
+ scheduler = torch.optim.lr_scheduler.LambdaLR(
685
+ self.optimizer, **self.cfg.train.lambdalr
686
+ )
687
+ elif self.cfg.train.scheduler.lower() == "multiplicativelr":
688
+ scheduler = torch.optim.lr_scheduler.MultiplicativeLR(
689
+ self.optimizer, **self.cfg.train.multiplicativelr
690
+ )
691
+ elif self.cfg.train.scheduler.lower() == "steplr":
692
+ scheduler = torch.optim.lr_scheduler.StepLR(
693
+ self.optimizer, **self.cfg.train.steplr
694
+ )
695
+ elif self.cfg.train.scheduler.lower() == "multisteplr":
696
+ scheduler = torch.optim.lr_scheduler.MultiStepLR(
697
+ self.optimizer, **self.cfg.train.multisteplr
698
+ )
699
+ elif self.cfg.train.scheduler.lower() == "constantlr":
700
+ scheduler = torch.optim.lr_scheduler.ConstantLR(
701
+ self.optimizer, **self.cfg.train.constantlr
702
+ )
703
+ elif self.cfg.train.scheduler.lower() == "linearlr":
704
+ scheduler = torch.optim.lr_scheduler.LinearLR(
705
+ self.optimizer, **self.cfg.train.linearlr
706
+ )
707
+ elif self.cfg.train.scheduler.lower() == "exponentiallr":
708
+ scheduler = torch.optim.lr_scheduler.ExponentialLR(
709
+ self.optimizer, **self.cfg.train.exponentiallr
710
+ )
711
+ elif self.cfg.train.scheduler.lower() == "polynomiallr":
712
+ scheduler = torch.optim.lr_scheduler.PolynomialLR(
713
+ self.optimizer, **self.cfg.train.polynomiallr
714
+ )
715
+ elif self.cfg.train.scheduler.lower() == "cosineannealinglr":
716
+ scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
717
+ self.optimizer, **self.cfg.train.cosineannealinglr
718
+ )
719
+ elif self.cfg.train.scheduler.lower() == "sequentiallr":
720
+ scheduler = torch.optim.lr_scheduler.SequentialLR(
721
+ self.optimizer, **self.cfg.train.sequentiallr
722
+ )
723
+ elif self.cfg.train.scheduler.lower() == "reducelronplateau":
724
+ scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
725
+ self.optimizer, **self.cfg.train.reducelronplateau
726
+ )
727
+ elif self.cfg.train.scheduler.lower() == "cycliclr":
728
+ scheduler = torch.optim.lr_scheduler.CyclicLR(
729
+ self.optimizer, **self.cfg.train.cycliclr
730
+ )
731
+ elif self.cfg.train.scheduler.lower() == "onecyclelr":
732
+ scheduler = torch.optim.lr_scheduler.OneCycleLR(
733
+ self.optimizer, **self.cfg.train.onecyclelr
734
+ )
735
+ elif self.cfg.train.scheduler.lower() == "cosineannearingwarmrestarts":
736
+ scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
737
+ self.optimizer, **self.cfg.train.cosineannearingwarmrestarts
738
+ )
739
+ elif self.cfg.train.scheduler.lower() == "noamlr":
740
+ scheduler = NoamLR(self.optimizer, **self.cfg.train.lr_scheduler)
741
+ else:
742
+ raise NotImplementedError(
743
+ f"Scheduler {self.cfg.train.scheduler} not supported yet!"
744
+ )
745
+ return scheduler
746
+
747
+ def _init_accelerator(self):
748
+ self.exp_dir = os.path.join(
749
+ os.path.abspath(self.cfg.log_dir), self.args.exp_name
750
+ )
751
+ project_config = ProjectConfiguration(
752
+ project_dir=self.exp_dir,
753
+ logging_dir=os.path.join(self.exp_dir, "log"),
754
+ )
755
+ from accelerate import DistributedDataParallelKwargs
756
+
757
+ kwargs = DistributedDataParallelKwargs(
758
+ find_unused_parameters=self.cfg.train.find_unused_parameters
759
+ )
760
+
761
+ self.accelerator = accelerate.Accelerator(
762
+ gradient_accumulation_steps=self.cfg.train.gradient_accumulation_step,
763
+ log_with=self.cfg.train.tracker,
764
+ project_config=project_config,
765
+ kwargs_handlers=[kwargs],
766
+ )
767
+ if self.accelerator.is_main_process:
768
+ os.makedirs(project_config.project_dir, exist_ok=True)
769
+ os.makedirs(project_config.logging_dir, exist_ok=True)
770
+ with self.accelerator.main_process_first():
771
+ self.accelerator.init_trackers(self.args.exp_name)
772
+
773
+ def __check_basic_configs(self):
774
+ if self.cfg.train.gradient_accumulation_step <= 0:
775
+ self.logger.fatal("Invalid gradient_accumulation_step value!")
776
+ self.logger.error(
777
+ f"Invalid gradient_accumulation_step value: {self.cfg.train.gradient_accumulation_step}. It should be positive."
778
+ )
779
+ self.accelerator.end_training()
780
+ raise ValueError(
781
+ f"Invalid gradient_accumulation_step value: {self.cfg.train.gradient_accumulation_step}. It should be positive."
782
+ )
783
+ # TODO: check other values
784
+
785
+ @staticmethod
786
+ def __count_parameters(model):
787
+ model_param = 0.0
788
+ if isinstance(model, dict):
789
+ for key, value in model.items():
790
+ model_param += sum(p.numel() for p in model[key].parameters())
791
+ else:
792
+ model_param = sum(p.numel() for p in model.parameters())
793
+ return model_param
794
+
795
+ def __dump_cfg(self, path):
796
+ os.makedirs(os.path.dirname(path), exist_ok=True)
797
+ json5.dump(
798
+ self.cfg,
799
+ open(path, "w"),
800
+ indent=4,
801
+ sort_keys=True,
802
+ ensure_ascii=False,
803
+ quote_keys=True,
804
+ )
805
+
806
+ @torch.inference_mode()
807
+ def test_loop(self):
808
+ pass
809
+
810
+ ### Private methods end ###
models/tts/valle_v2.1/cfg/base.yaml ADDED
@@ -0,0 +1,136 @@
1
+ dataset_folder: /gluster-tts/emilia
2
+ w2v_path: /gluster-tts/jiaqi_repos/w2v-bert-2
3
+ ckpt_root_path: /gluster-tts/jiaqi_repos/soundstorm_ckpts
4
+ log_dir: /gluster-tts/jiaqi_repos/tmp_checkpoints
5
+ max_tokens: 9000
6
+
7
+ # fixed params cosyvoice
8
+ # sample_rate: 22050
9
+ text_encoder_input_size: 512
10
+ llm_input_size: 1536
11
+ llm_output_size: 1536
12
+ spk_embed_dim: 192
13
+
14
+ transformer_model:
15
+ _target_: models.tts.bpe_text2semantic.llm.TransformerLM
16
+ text_encoder_input_size: ${text_encoder_input_size}
17
+ llm_input_size: ${llm_input_size}
18
+ llm_output_size: ${llm_output_size}
19
+ text_token_size: 51866
20
+ speech_token_size: 8192
21
+ length_normalized_loss: true
22
+ lsm_weight: 0
23
+ spk_embed_dim: ${spk_embed_dim}
24
+ text_encoder:
25
+ _target_: cosyvoice.transformer.encoder.ConformerEncoder
26
+ input_size: ${text_encoder_input_size}
27
+ output_size: 1024
28
+ attention_heads: 16
29
+ linear_units: 4096
30
+ num_blocks: 6
31
+ dropout_rate: 0.1
32
+ positional_dropout_rate: 0.1
33
+ attention_dropout_rate: 0
34
+ normalize_before: true
35
+ input_layer: 'linear'
36
+ pos_enc_layer_type: 'rel_pos_espnet'
37
+ selfattention_layer_type: 'rel_selfattn'
38
+ use_cnn_module: false
39
+ macaron_style: false
40
+ use_dynamic_chunk: false
41
+ use_dynamic_left_chunk: false
42
+ static_chunk_size: 1
43
+ llm:
44
+ _target_: cosyvoice.transformer.encoder.TransformerEncoder
45
+ input_size: ${llm_input_size}
46
+ output_size: ${llm_output_size}
47
+ attention_heads: 16
48
+ linear_units: 4096
49
+ num_blocks: 12
50
+ dropout_rate: 0.1
51
+ positional_dropout_rate: 0.1
52
+ attention_dropout_rate: 0
53
+ input_layer: 'linear_legacy'
54
+ pos_enc_layer_type: 'rel_pos_espnet'
55
+ selfattention_layer_type: 'rel_selfattn'
56
+ static_chunk_size: 1
57
+
58
+
59
+ args:
60
+ exp_name: text2semantic
61
+ log_level: DEBUG
62
+ seed: 22
63
+ resume: false
64
+ resume_type: resume
65
+ resume_from_ckpt_path: ""
66
+ preprocess_cfg:
67
+ w2v_path: ${w2v_path}
68
+ preprocess:
69
+ sample_rate: 16000
70
+ min_dur: 3
71
+ max_dur: 30
72
+ hop_size: 320
73
+ cfg:
74
+ w2v_path: ${w2v_path}
75
+ log_dir: ${log_dir}
76
+ dataset:
77
+ _target_: models.tts.text2semantic.emilia_dataset.T2SDataset
78
+ cache_folder: ${dataset_folder}/
79
+ cfg: ${preprocess_cfg}
80
+ mnt_path: ${dataset_folder}/output_gzips/
81
+ # collator:
82
+ # _target_: models.tts.text2semantic.emilia_dataset.T2SCollator
83
+ collator:
84
+ _target_: models.tts.bpe_text2semantic.collator.T2SCollatorDynamic
85
+ max_tokens: 13000
86
+ tokenizer:
87
+ _target_: whisper.tokenizer.get_tokenizer
88
+ multilingual: True
89
+ num_languages: 100
90
+ language: 'en'
91
+ task: 'transcribe'
92
+ train:
93
+ gradient_accumulation_step: 1
94
+ find_unused_parameters: true
95
+ tracker: tensorboard
96
+ max_epoch: 1000
97
+ save_checkpoint_stride:
98
+ - 2000
99
+ keep_last: [1]
100
+ run_eval: true
101
+ dataloader:
102
+ num_worker: 0
103
+ pin_memory: false
104
+ persistent_workers: false
105
+ use_dynamic_batchsize: true
106
+ optimizer: adamW
107
+ adamw:
108
+ lr: 2e-4
109
+ scheduler:
110
+ warmup_steps: 8000
111
+ total_steps: 400000
112
+ min_lr: 5e-5
113
+ exponentiallr:
114
+ gamma: 0.999999
115
+ batch_size: 10
116
+ max_tokens: ${max_tokens}
117
+ max_sentences: 64
118
+ model: ${transformer_model}
119
+ kmeans:
120
+ type: repcodec
121
+ w2v_path: ${w2v_path}
122
+ stat_mean_var_path: ${ckpt_root_path}/semantic_kmeans/emilia_wav2vec2bert_stats_10k.pt
123
+ repcodec:
124
+ codebook_size: 8192
125
+ hidden_size: 1024
126
+ codebook_dim: 8
127
+ vocos_dim: 384
128
+ vocos_intermediate_dim: 2048
129
+ vocos_num_layers: 12
130
+ pretrained_path: ${ckpt_root_path}/repcodec_emilia_50k_8192_norm_8d/86k_steps/model.safetensors
131
+
132
+
133
+ trainer:
134
+ _target_: models.tts.bpe_text2semantic.t2s_trainer.T2STrainer
135
+ args: ${args}
136
+ cfg: ${cfg}
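The `${...}` interpolations and `_target_` keys in this file follow OmegaConf/Hydra conventions. The loading code is not part of this commit, so the sketch below is an assumption about how such a config is typically consumed; the file path and the recursive instantiation of every nested `_target_` are assumptions too.

# Hedged sketch: not shown in this commit; assumes OmegaConf + Hydra are used.
from omegaconf import OmegaConf
from hydra.utils import instantiate

cfg = OmegaConf.load("models/tts/valle_v2.1/cfg/base.yaml")
print(cfg.cfg.train.scheduler.warmup_steps)   # 8000, after ${...} resolution
trainer = instantiate(cfg.trainer)            # would build T2STrainer(args=..., cfg=...)
trainer.train_loop()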
models/tts/valle_v2.1/emilia_dataset_whole.py ADDED
@@ -0,0 +1,393 @@
1
+ import oss2 #pip install oss2
2
+ import io
3
+ import librosa
4
+ import torch
5
+ import json
6
+ import tqdm
7
+ import numpy as np
8
+ import logging
9
+ import pickle
10
+ import os
11
+ import time
12
+ from torch.utils.data import Dataset
13
+ from utils.g2p.g2p import phonemizer_g2p
14
+ from multiprocessing import Pool
15
+ import concurrent.futures
16
+ from pathlib import Path
17
+
18
+ from models.base.new_trainer import MainProcessLogger
19
+
20
+ # class PhonemizerWarningFilter(logging.Filter):
21
+ # def filter(self, record):
22
+ # # 只过滤 phonemizer 中的 WARNING 级别日志
23
+ # if record.name == 'phonemizer' and record.levelno == logging.WARNING:
24
+ # return False
25
+ # return False
26
+
27
+ # logger = logging.getLogger('phonemizer')
28
+ # filter = PhonemizerWarningFilter()
29
+ # logger.addFilter(filter)
30
+ # logging.basicConfig(level=logging.INFO)
31
+ # logger = logging.getLogger(__name__)
32
+ logger = MainProcessLogger(is_main_process=False)
33
+
34
+ os.environ['PHONEMIZER_ESPEAK_LIBRARY'] = '/usr/lib/x86_64-linux-gnu/libespeak-ng.so.1'
35
+ os.environ['PHONEMIZER_ESPEAK_PATH'] = '/usr/bin/espeak-ng'
36
+
37
+ LANG2CODE = {
38
+ 'zh': 349,
39
+ 'en': 350,
40
+ 'ja': 351,
41
+ 'ko': 352,
42
+ 'fr': 353,
43
+ 'de': 354,
44
+ }
45
+
46
+ AK = "LTAI5tJU3mNZASp8kUwWFjcq"
47
+ SK = "Ukhy7qWtMgwYVIMJSK3LTBpi1MLYrd"
48
+ bucket_name = "pjlab-3090-openmmlabpartner"
49
+ MOUNT_PATH = "/mnt/data/oss_beijing/"
50
+ data_json_path = '/mnt/petrelfs/hehaorui/jiaqi/Emilia-44.7k.json.gz'
51
+ num_token_per_second = 75
52
+ default_sr = 24000 # it may need to change sampling rate
53
+ duration_setting = {'min': 4, 'max': 20}
54
+
55
+ class EmiliaDataset(Dataset):
56
+ def __init__(self,
57
+ access_key_id=AK,
58
+ access_key_secret=SK,
59
+ bucket_name=bucket_name,
60
+ cache_type='path',
61
+ **kwargs): # 'path' or 'meta'
62
+ self.cache_type = cache_type
63
+
64
+ # Initialize OSS client
65
+ self.init_client(access_key_id, access_key_secret, bucket_name)
66
+ self.json_paths = []
67
+ self.wav_paths = []
68
+ self.language_list = ['en'] # Data language list
69
+ self.wav_path_index2duration = []
70
+ self.wav_path_index2phonelen = []
71
+ self.index2num_frames = []
72
+
73
+ self.json_path2meta = {}
74
+ self.json2filtered_idx = {}
75
+
76
+ self.cache_folder = '/mnt/petrelfs/hehaorui/jiaqi/tmp/emilia-cache-en'
77
+ Path(self.cache_folder).mkdir(parents=True, exist_ok=True)
78
+
79
+ self.wav_paths_cache = os.path.join(self.cache_folder, "wav_paths_cache.pkl")
80
+ self.json_paths_cache = os.path.join(self.cache_folder, "json_paths_cache.pkl")
81
+ self.duration_cache = os.path.join(self.cache_folder, "duration_cache.pkl")
82
+ self.phone_count_cache = os.path.join(self.cache_folder, "phone_count_cache.pkl")
83
+ self.json_path2meta_cache = os.path.join(self.cache_folder, "json_path2meta.pkl")
84
+
85
+ if cache_type == 'path':
86
+ if os.path.exists(self.wav_paths_cache) and os.path.exists(self.json_paths_cache) and os.path.exists(self.duration_cache) and os.path.exists(self.phone_count_cache):
87
+ self.load_cached_paths()
88
+ else:
89
+ logger.info("No cache exists")
90
+ self.get_all_paths_from_json(data_json_path)
91
+ self.save_cached_paths()
92
+ elif cache_type == 'meta':
93
+ if os.path.exists(self.wav_paths_cache) and os.path.exists(self.json_paths_cache):
94
+ self.load_cached_paths()
95
+ else:
96
+ logger.info("No cache exists")
97
+ self.get_all_paths_from_json(data_json_path)
98
+ self.save_cached_paths()
99
+ else:
100
+ logger.info("Incorrect cache loading way")
101
+ exit()
102
+
103
+ if cache_type == 'meta':
104
+ if os.path.exists(self.json_path2meta_cache):
105
+ self.load_path2meta()
106
+ else:
107
+ self.get_jsoncache_multiprocess(pool_size=8)
108
+
109
+ self.num_frame_indices = np.array(sorted(range(len(self.index2num_frames)), key=lambda k: self.index2num_frames[k]))
110
+
111
+
112
+ def init_client(self, access_key_id, access_key_secret, bucket_name):
113
+
114
+ logger.info("Start to initialize OSS client")
115
+ self.auth = oss2.Auth(access_key_id, access_key_secret)
116
+ self.bucket = oss2.Bucket(self.auth, "https://oss-cn-beijing.aliyuncs.com", bucket_name)
117
+ logger.info("OSS client initialized successfully")
118
+
119
+ def load_cached_paths(self):
120
+ logger.info("Loaded paths from cache files")
121
+ with open(self.wav_paths_cache, "rb") as f:
122
+ self.wav_paths = pickle.load(f)
123
+ with open(self.json_paths_cache, "rb") as f:
124
+ self.json_paths = pickle.load(f)
125
+ if self.cache_type == 'path':
126
+ with open(self.duration_cache, "rb") as f:
127
+ self.wav_path_index2duration = pickle.load(f)
128
+ with open(self.phone_count_cache, "rb") as f:
129
+ self.wav_path_index2phonelen = pickle.load(f)
130
+ for duration, phone_count in zip(self.wav_path_index2duration, self.wav_path_index2phonelen):
131
+ self.index2num_frames.append(duration * num_token_per_second + phone_count)
132
+ logger.info("All paths got successfully")
133
+ logger.info("Number of wavs: %d, Number of jsons: %d"
134
+ % (len(self.wav_paths), len(self.json_paths)))
135
+
136
+ def save_cached_paths(self):
137
+ with open(self.wav_paths_cache, "wb") as f:
138
+ pickle.dump(self.wav_paths, f)
139
+ with open(self.json_paths_cache, "wb") as f:
140
+ pickle.dump(self.json_paths, f)
141
+ if self.cache_type == 'path':
142
+ with open(self.duration_cache, "wb") as f:
143
+ pickle.dump(self.wav_path_index2duration, f)
144
+ with open(self.phone_count_cache, "wb") as f:
145
+ pickle.dump(self.wav_path_index2phonelen, f)
146
+ logger.info("Saved paths to cache files")
147
+
148
+ # Load JSON data from a compressed GZIP file
149
+ def load_compressed_json(self, filename):
150
+ import gzip
151
+ with gzip.open(filename, "rt", encoding="utf-8") as f:
152
+ return json.load(f)
153
+
154
+ def get_path_from_json(self, data):
155
+ if data['language'][0] not in self.language_list:
156
+ return
157
+ self.json_paths.append(data['json_path'])
158
+ is_exists = True
159
+ try:
160
+ if not self.bucket.object_exists(data['wav_path'][0]):
161
+ is_exists = False
162
+ except oss2.api.Exception as e:
163
+ is_exists = False
164
+ remove_idx = []
165
+ for wav, duration, phone_count in zip(data['wav_path'], data['duration'], data['phone_count']):
166
+ if duration < duration_setting['min'] or duration > duration_setting['max']:
167
+ idx = wav.split("_")[-1].split(".")[0]
168
+ remove_idx.append(idx)
169
+ continue
170
+ if is_exists:
171
+ self.wav_paths.append(wav)
172
+ else:
173
+ if '.mp3' in wav:
174
+ wav = wav.replace('.mp3', '.wav')
175
+ self.wav_paths.append(wav)
176
+ else:
177
+ wav = wav.replace('.wav', '.mp3')
178
+ self.wav_paths.append(wav)
179
+ self.wav_path_index2duration.append(duration)
180
+ self.wav_path_index2phonelen.append(phone_count)
181
+ self.index2num_frames.append(duration * num_token_per_second + phone_count)
182
+
183
+ self.json2filtered_idx[data['json_path']] = [int(i) for i in data['filtered_idx'].split(',') if i not in remove_idx]
184
+ if not self.json2filtered_idx[data['json_path']]:
185
+ self.json_paths.pop()
186
+
187
+ def get_all_paths_from_json(self, json_path):
188
+
189
+ data_list = self.load_compressed_json(json_path)
190
+ with concurrent.futures.ThreadPoolExecutor() as executor:
191
+ futures = [executor.submit(self.get_path_from_json, data) for data in tqdm.tqdm(data_list)]
192
+ data = [future.result() for future in tqdm.tqdm(futures)]
193
+
194
+ # Only 'meta' cache type use
195
+ def get_phone_count_and_duration(self, meta, idx_list):
196
+ new_meta = {}
197
+ if meta[0]['language'] not in self.language_list:
198
+ new_meta['0'] = meta[0]
199
+ return new_meta
200
+ text_list = []
201
+ for i in idx_list:
202
+ text_list.append(meta[i]['text'])
203
+ token_id = self.g2p(text_list, meta[0]['language'])[1]
204
+ for i, token in zip(idx_list, token_id):
205
+ nm = {}
206
+ nm['language'] = meta[i]['language']
207
+ nm['phone_id'] = token
208
+ nm['phone_count'] = len(token)
209
+ nm['duration'] = meta[i]['end'] - meta[i]['start']
210
+ new_meta[str(i)] = nm
211
+ del meta
212
+ return new_meta
213
+
214
+ # Only 'meta' cache type use
215
+ def process_json_cache(self, json_path):
216
+ default_meta = [{'text': '-1', 'language': 'others'}]
217
+ try:
218
+ file_bytes = self.bucket.get_object(json_path)
219
+ buffer = io.BytesIO(file_bytes.read())
220
+ json_cache = json.load(buffer)
221
+ del buffer, file_bytes
222
+ if json_cache is None:
223
+ logger.info("json is none")
224
+ elif isinstance(json_cache, (dict, list)) and not json_cache:
225
+ logger.info("json is none")
226
+ else:
227
+ return json_cache
228
+ except oss2.exceptions.NoSuchKey as e:
229
+ logger.info(
230
+ "Not found: http_status={0}, request_id={1}".format(e.status, e.request_id))
231
+ except Exception as e:
232
+ logger.info("Error json: {} error: {}".format(json_path, e))
233
+ return default_meta
234
+
235
+ # Only 'meta' cache type use
236
+ def get_jsoncache_multiprocess(self, pool_size):
237
+ logger.info("Start to build json pool")
238
+ logger.info("Start to get json cache")
239
+ json2meta = []
240
+ json_data = []
241
+ tmp_json_cache = os.path.join(self.cache_folder, 'json_cache.pkl')
242
+ if os.path.exists(tmp_json_cache):
243
+ with open(tmp_json_cache, 'rb') as f:
244
+ json_data = pickle.load(f)
245
+ logging.info("Load json_cache.pkl")
246
+ else:
247
+ with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as executor:
248
+ futures = [executor.submit(self.process_json_cache, path) for path in self.json_paths]
249
+ json_data = [future.result() for future in tqdm.tqdm(futures)]
250
+ with open(tmp_json_cache, 'wb') as f:
251
+ pickle.dump(json_data, f)
252
+ logging.info("Save json_cache.pkl")
253
+ logging.info("Get meta from cache")
254
+ for json, path in tqdm.tqdm(zip(json_data, self.json_paths), total=len(json_data)):
255
+ # print(json)
256
+ json2meta.append(self.get_phone_count_and_duration(json, self.json2filtered_idx[path]))
257
+ error_json_path_list = []
258
+ for i in range(len(json2meta)):
259
+ if not json2meta[i]:
260
+ error_json_path_list.append(self.json_paths[i])
261
+ elif json2meta[i][next(iter(json2meta[i]))]['language'] not in self.language_list:
262
+ language = json2meta[i][next(iter(json2meta[i]))]['language']
263
+ logger.info("{} is not in language list".format(language))
264
+ error_json_path_list.append(self.json_paths[i])
265
+ else:
266
+ self.json_path2meta[self.json_paths[i]] = json2meta[i]
267
+ logger.info("Remove error json path {}".format(error_json_path_list))
268
+ error_wav_path_list = []
269
+ for error in tqdm.tqdm(error_json_path_list):
270
+ self.json_paths.remove(error)
271
+ error = error.split('.json')[0]
272
+ for wav in self.wav_paths:
273
+ if error in wav:
274
+ error_wav_path_list.append(wav)
275
+ logger.info("Remove error wav path {}".format(error_wav_path_list))
276
+ for error in tqdm.tqdm(error_wav_path_list):
277
+ self.wav_paths.remove(error)
278
+ logger.info("Update cache")
279
+ with open(self.wav_paths_cache, "wb") as f:
280
+ pickle.dump(self.wav_paths, f)
281
+ with open(self.json_paths_cache, "wb") as f:
282
+ pickle.dump(self.json_paths, f)
283
+ with open(self.json_path2meta_cache, "wb") as f:
284
+ pickle.dump(self.json_path2meta, f)
285
+ logger.info("Json cache write to json_path2meta.pkl successfully")
286
+ del json2meta, error_wav_path_list, error_json_path_list
287
+
288
+ # Only 'meta' cache type use
289
+ def load_path2meta(self):
290
+ logger.info("Loaded meta from cache files")
291
+ self.json_path2meta = pickle.load(open(self.json_path2meta_cache, "rb"))
292
+ for path in self.wav_paths:
293
+ meta = self.get_meta_from_wav_path(path)
294
+ duration = meta['duration']
295
+ phone_count = meta['phone_count']
296
+ self.wav_path_index2duration.append(duration)
297
+ self.wav_path_index2phonelen.append(phone_count)
298
+ self.index2num_frames.append(duration * num_token_per_second + phone_count)
299
+
300
+ def get_meta_from_wav_path(self, wav_path):
301
+ index = int(wav_path.split("_")[-1].split(".")[0])
302
+ audio_name = "_".join(wav_path.split("/")[-1].split("_")[:-1])
303
+ dir_name = "/".join(wav_path.split("/")[:-1])
304
+ json_name = audio_name + ".json"
305
+ json_path = dir_name + "/" + json_name
306
+ meta = None
307
+ if self.cache_type == 'meta':
308
+ meta = self.json_path2meta[json_path][str(index)]
309
+ return meta
310
+ elif self.cache_type == 'path':
311
+ try:
312
+ file_bytes = self.bucket.get_object(json_path)
313
+ buffer = io.BytesIO(file_bytes.read())
314
+ meta = json.load(buffer)[index]
315
+ except oss2.exceptions.NoSuchKey as e:
316
+ logger.info(
317
+ "Not found: http_status={0}, request_id={1}".format(e.status, e.request_id))
318
+ except Exception as e:
319
+ logger.info("Error json: {} error: {}".format(json_path, e))
320
+ del index, audio_name, dir_name, json_name, json_path
321
+ return meta
322
+
323
+ def g2p(self, text, language):
324
+ return phonemizer_g2p(text, language)
325
+
326
+ def get_num_frames(self, index):
327
+ return self.wav_path_index2duration[index] * num_token_per_second + self.wav_path_index2phonelen[index]
328
+
329
+ def __len__(self):
330
+ return self.wav_paths.__len__()
331
+
332
+ def __getitem__(self, idx):
333
+
334
+ wav_path = self.wav_paths[idx]
335
+ file_bytes = None
336
+ position = np.where(self.num_frame_indices == idx)[0][0]
337
+ try:
338
+ random_index = np.random.choice(self.num_frame_indices[:position])
339
+ except Exception:  # e.g. empty slice when idx is already the shortest item
340
+ random_index = np.random.choice(self.num_frame_indices)
341
+ del position
342
+ try:
343
+ for i in range(2):
344
+ try:
345
+ file_bytes = self.bucket.get_object(wav_path.replace("_new", ""))
346
+ break
347
+ except Exception as e:
348
+ logger.info(f"[Filter meta func] Error is {e}")
349
+ time.sleep(i)
350
+ logger.info("retry")
351
+ except Exception:
+ logger.info("Getting data from OSS failed; sampling another item.")
353
+ return self.__getitem__(random_index)
354
+
355
+ meta = self.get_meta_from_wav_path(wav_path)
356
+ if file_bytes is not None and meta is not None:
357
+ try:
358
+ buffer = io.BytesIO(file_bytes.read())
359
+ speech, sr = librosa.load(buffer, sr=default_sr)
360
+ except Exception:
361
+ return self.__getitem__(random_index)
362
+ assert sr == 24000
363
+
364
+ shape = speech.shape
365
+ pad_shape = ((shape[0] // 320) + 1) * 320 - shape[0]
366
+ speech = np.pad(speech, (0, pad_shape), mode='constant')
367
+ del buffer, pad_shape, shape
368
+ if speech.shape[0] < default_sr * duration_setting['min'] or speech.shape[0] > default_sr * duration_setting['max']:
+ logger.info("Wav length is outside the allowed duration range")
370
+ return self.__getitem__(random_index)
371
+ else:
372
+ speech_tensor = torch.tensor(speech, dtype=torch.float32)
373
+
374
+ phone_id = self.g2p(meta['text'], meta['language'])[1] if self.cache_type == 'path' else meta['phone_id']
375
+ phone_id = torch.tensor(phone_id, dtype=torch.long)
376
+ phone_id = torch.cat([torch.tensor(LANG2CODE[meta['language']], dtype=torch.long).reshape(1), phone_id]) # add language token
377
+ return dict(
378
+ speech=speech_tensor,
379
+ phone=phone_id,
380
+ text=meta['text'],
381
+ language=meta['language'],
382
+ )
383
+ else:
384
+ logger.info("Failed to get file after retries.")
385
+ return self.__getitem__(random_index)
386
+
387
+ if __name__ == '__main__':
388
+
389
+ dataset = EmiliaDataset(AK, SK, bucket_name)
390
+ # print(dataset.__getitem__(0))
391
+ for batch in dataset:
392
+ breakpoint()
393
+ print()
models/tts/valle_v2.1/g2p_processor.py ADDED
@@ -0,0 +1,363 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import numpy as np
8
+ import os
9
+ import torch
10
+ import copy
11
+ from g2p_en import G2p
12
+ import re
13
+ import unicodedata
14
+ from g2p_en import G2p
15
+ from g2p_en.expand import normalize_numbers
16
+
17
+ g2p = G2p()
18
+
19
+ PHONE_SET = [
20
+ "!",
21
+ ",",
22
+ ".",
23
+ ".B",
24
+ ":",
25
+ "<BOS>",
26
+ "<EOS>",
27
+ "<PAD>",
28
+ "<UNK>",
29
+ "?",
30
+ "AA0B",
31
+ "AA0E",
32
+ "AA0I",
33
+ "AA1B",
34
+ "AA1E",
35
+ "AA1I",
36
+ "AA2B",
37
+ "AA2E",
38
+ "AA2I",
39
+ "AE0B",
40
+ "AE0E",
41
+ "AE0I",
42
+ "AE1B",
43
+ "AE1E",
44
+ "AE1I",
45
+ "AE2B",
46
+ "AE2E",
47
+ "AE2I",
48
+ "AH0B",
49
+ "AH0E",
50
+ "AH0I",
51
+ "AH1B",
52
+ "AH1E",
53
+ "AH1I",
54
+ "AH2B",
55
+ "AH2E",
56
+ "AH2I",
57
+ "AO0B",
58
+ "AO0E",
59
+ "AO0I",
60
+ "AO1",
61
+ "AO1B",
62
+ "AO1E",
63
+ "AO1I",
64
+ "AO2B",
65
+ "AO2E",
66
+ "AO2I",
67
+ "AW0B",
68
+ "AW0E",
69
+ "AW0I",
70
+ "AW1B",
71
+ "AW1E",
72
+ "AW1I",
73
+ "AW2B",
74
+ "AW2E",
75
+ "AW2I",
76
+ "AY0B",
77
+ "AY0E",
78
+ "AY0I",
79
+ "AY1B",
80
+ "AY1E",
81
+ "AY1I",
82
+ "AY2B",
83
+ "AY2E",
84
+ "AY2I",
85
+ "BB",
86
+ "BE",
87
+ "BI",
88
+ "CHB",
89
+ "CHE",
90
+ "CHI",
91
+ "DB",
92
+ "DE",
93
+ "DHB",
94
+ "DHE",
95
+ "DHI",
96
+ "DI",
97
+ "EH0B",
98
+ "EH0E",
99
+ "EH0I",
100
+ "EH1B",
101
+ "EH1E",
102
+ "EH1I",
103
+ "EH2B",
104
+ "EH2E",
105
+ "EH2I",
106
+ "ER0B",
107
+ "ER0E",
108
+ "ER0I",
109
+ "ER1B",
110
+ "ER1E",
111
+ "ER1I",
112
+ "ER2B",
113
+ "ER2E",
114
+ "ER2I",
115
+ "EY0B",
116
+ "EY0E",
117
+ "EY0I",
118
+ "EY1B",
119
+ "EY1E",
120
+ "EY1I",
121
+ "EY2B",
122
+ "EY2E",
123
+ "EY2I",
124
+ "FB",
125
+ "FE",
126
+ "FI",
127
+ "GB",
128
+ "GE",
129
+ "GI",
130
+ "HHB",
131
+ "HHE",
132
+ "HHI",
133
+ "IH0B",
134
+ "IH0E",
135
+ "IH0I",
136
+ "IH1B",
137
+ "IH1E",
138
+ "IH1I",
139
+ "IH2B",
140
+ "IH2E",
141
+ "IH2I",
142
+ "IY0B",
143
+ "IY0E",
144
+ "IY0I",
145
+ "IY1B",
146
+ "IY1E",
147
+ "IY1I",
148
+ "IY2B",
149
+ "IY2E",
150
+ "IY2I",
151
+ "JHB",
152
+ "JHE",
153
+ "JHI",
154
+ "KB",
155
+ "KE",
156
+ "KI",
157
+ "L",
158
+ "LB",
159
+ "LE",
160
+ "LI",
161
+ "MB",
162
+ "ME",
163
+ "MI",
164
+ "NB",
165
+ "NE",
166
+ "NGB",
167
+ "NGE",
168
+ "NGI",
169
+ "NI",
170
+ "OW0B",
171
+ "OW0E",
172
+ "OW0I",
173
+ "OW1B",
174
+ "OW1E",
175
+ "OW1I",
176
+ "OW2B",
177
+ "OW2E",
178
+ "OW2I",
179
+ "OY0B",
180
+ "OY0E",
181
+ "OY0I",
182
+ "OY1B",
183
+ "OY1E",
184
+ "OY1I",
185
+ "OY2B",
186
+ "OY2E",
187
+ "OY2I",
188
+ "PB",
189
+ "PE",
190
+ "PI",
191
+ "RB",
192
+ "RE",
193
+ "RI",
194
+ "SB",
195
+ "SE",
196
+ "SHB",
197
+ "SHE",
198
+ "SHI",
199
+ "SI",
200
+ "TB",
201
+ "TE",
202
+ "THB",
203
+ "THE",
204
+ "THI",
205
+ "TI",
206
+ "UH0B",
207
+ "UH0E",
208
+ "UH0I",
209
+ "UH1B",
210
+ "UH2B",
211
+ "UH1E",
212
+ "UH1I",
213
+ "UH2E",
214
+ "UH2I",
215
+ "UW0B",
216
+ "UW0E",
217
+ "UW0I",
218
+ "UW1B",
219
+ "UW1E",
220
+ "UW1I",
221
+ "UW2B",
222
+ "UW2E",
223
+ "UW2I",
224
+ "VB",
225
+ "VE",
226
+ "VI",
227
+ "WB",
228
+ "WE",
229
+ "WI",
230
+ "YB",
231
+ "YE",
232
+ "YI",
233
+ "ZB",
234
+ "ZE",
235
+ "ZHB",
236
+ "ZHE",
237
+ "ZHI",
238
+ "ZI",
239
+ "|",
240
+ ]
241
+ PHPONE2ID = {PHONE_SET[i]: i for i in range(len(PHONE_SET))}
242
+
243
+ PUNCS = "!,.?;:"
244
+
245
+
246
+ def is_sil_phoneme(p):
247
+ return p == "" or not p[0].isalpha()
248
+
249
+
250
+ def add_bdr(txt_struct):
251
+ txt_struct_ = []
252
+ for i, ts in enumerate(txt_struct):
253
+ txt_struct_.append(ts)
254
+ if (
255
+ i != len(txt_struct) - 1
256
+ and not is_sil_phoneme(txt_struct[i][0])
257
+ and not is_sil_phoneme(txt_struct[i + 1][0])
258
+ ):
259
+ txt_struct_.append(["|", ["|"]])
260
+ return txt_struct_
261
+
262
+
263
+ def preprocess_text(text):
264
+ text = normalize_numbers(text)
265
+ text = "".join(
266
+ char
267
+ for char in unicodedata.normalize("NFD", text)
268
+ if unicodedata.category(char) != "Mn"
269
+ ) # Strip accents
270
+ text = text.lower()
271
+ text = re.sub("['\"()]+", "", text)
272
+ text = re.sub("[-]+", " ", text)
273
+ text = re.sub(f"[^ a-z{PUNCS}]", "", text)
274
+ text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text) # !! -> !
275
+ text = re.sub(f"([{PUNCS}])+", r"\1", text) # !! -> !
276
+ text = text.replace("i.e.", "that is")
277
+ text = text.replace("i.e.", "that is")
278
+ text = text.replace("etc.", "etc")
279
+ text = re.sub(f"([{PUNCS}])", r" ", text) # remove punctuations for now
280
+ text = re.sub(rf"\s+", r" ", text)
281
+ return text
282
+
283
+
284
+ def postprocess(txt_struct):
285
+ while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[0][0]):
286
+ txt_struct = txt_struct[1:]
287
+ while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[-1][0]):
288
+ txt_struct = txt_struct[:-1]
289
+ txt_struct = add_bdr(txt_struct)
290
+ txt_struct = [["<BOS>", ["<BOS>"]]] + txt_struct + [["<EOS>", ["<EOS>"]]]
291
+ return txt_struct
292
+
293
+
294
+ def process(txt, g2p):
295
+ txt = preprocess_text(txt).strip()
296
+ phs = g2p(txt)
297
+ txt_struct = [[w, []] for w in txt.split(" ")]
298
+ i_word = 0
299
+ for p in phs:
300
+ if p == " ":
301
+ i_word += 1
302
+ else:
303
+ txt_struct[i_word][1].append(p)
304
+
305
+ txt_struct_ret = copy.deepcopy(txt_struct)
306
+
307
+ for i_word in range(len(txt_struct)):
308
+ if not is_sil_phoneme(txt_struct[i_word][0]):
309
+ if len(txt_struct[i_word][1]) > 1:
310
+ txt_struct_ret[i_word][1][0] += "B"
311
+ for i in range(1, len(txt_struct[i_word][1]) - 1):
312
+ txt_struct_ret[i_word][1][i] += "I"
313
+ txt_struct_ret[i_word][1][-1] += "E"
314
+ else:
315
+ txt_struct_ret[i_word][1][0] += "B"
316
+
317
+ txt_struct_ret = postprocess(txt_struct_ret)
318
+
319
+ return txt_struct_ret, txt
320
+
321
+
322
+ def test():
323
+ g2p = G2p()
324
+ txt = "This is a test sentence."
325
+ txt_struct, txt = process(txt, g2p)
326
+ print(txt_struct)
327
+ print(txt)
328
+ phone_seq = [p for w in txt_struct for p in w[1]]
329
+ print(phone_seq)
330
+ phone_id = [PHPONE2ID[p] for p in phone_seq]
331
+ print(phone_id)
332
+
333
+
334
+ class G2pProcessor:
335
+ def __init__(self):
336
+ self.g2p = G2p()
337
+
338
+ def __call__(self, txt, lang="en"):
339
+ return self.txt2phoneid(txt)
340
+
341
+ def txt2phoneid(self, txt):
342
+ txt_struct, txt = process(txt, self.g2p)
343
+ phone_seq = [p for w in txt_struct for p in w[1]]
344
+ phone_id = [PHPONE2ID[p] for p in phone_seq]
345
+ return None, phone_id
346
+
347
+ def phoneid2txt(self, phone_id):
348
+ txt = []
349
+ for i in phone_id:
350
+ txt.append(PHONE_SET[i])
351
+ return txt
352
+
353
+
354
+ if __name__ == "__main__":
355
+ g2p = G2pProcessor()
356
+ txt = "This is a test sentence."
357
+ phoneid = g2p.txt2phoneid(txt)[1]
358
+ # output: [5, 73, 118, 175, 218, 116, 213, 218, 28, 218, 180, 82, 179, 181, 218, 174, 82, 149, 185, 30, 149, 175, 6]
359
+ # print(phoneid)
360
+ print(g2p.phoneid2txt(phoneid))
361
+ # output: ['<BOS>', 'DHB', 'IH1I', 'SE', '|', 'IH1B', 'ZE', '|', 'AH0B', '|', 'TB', 'EH1I', 'SI', 'TE', '|', 'SB', 'EH1I', 'NI', 'TI', 'AH0I', 'NI', 'SE', '<EOS>']
362
+ print(len(PHONE_SET))
363
+ # output: 219
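
Note on the phone inventory above: apart from the raw ARPAbet symbols, each entry carries a word-position suffix that `process` attaches per word: the first phone of a word gets "B", interior phones get "I", and the last phone gets "E" (single-phone words get "B" only). A minimal standalone sketch of that rule follows; the helper name `tag_word_position` is illustrative and not part of the file.

```python
# Sketch of the word-position tagging behind entries like "TB", "EH1I", "SI", "TE".
def tag_word_position(phones):
    """Append 'B' to the first phone, 'I' to interior phones, 'E' to the last phone."""
    if len(phones) == 1:
        return [phones[0] + "B"]  # single-phone words are tagged 'B' only
    return (
        [phones[0] + "B"]
        + [p + "I" for p in phones[1:-1]]
        + [phones[-1] + "E"]
    )

print(tag_word_position(["T", "EH1", "S", "T"]))  # ['TB', 'EH1I', 'SI', 'TE']
print(tag_word_position(["AH0"]))                 # ['AH0B']
```

This matches the `__main__` output above, where "test" becomes `'TB', 'EH1I', 'SI', 'TE'` and `"|"` marks word boundaries.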
models/tts/valle_v2.1/libritts_dataset.py ADDED
@@ -0,0 +1,271 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import random
7
+ import torch
8
+ from torch.nn.utils.rnn import pad_sequence
9
+ from utils.data_utils import *
10
+ from tqdm import tqdm
11
+ from g2p_en import G2p
12
+ import librosa
13
+ from torch.utils.data import Dataset
14
+ import pandas as pd
15
+ import time
16
+ import io
17
+
18
+ SAMPLE_RATE = 16000
19
+ # g2p
20
+ from .g2p_processor import G2pProcessor
21
+
22
+ phonemizer_g2p = G2pProcessor()
23
+
24
+
25
+ class VALLEDataset(Dataset):
26
+ def __init__(self, args):
27
+ print(f"Initializing VALLEDataset")
28
+ self.dataset_list = args.dataset_list
29
+
30
+ print(f"using sampling rate {SAMPLE_RATE}")
31
+
32
+ # set dataframe column names
33
+ book_col_name = [
34
+ "ID",
35
+ "Original_text",
36
+ "Normalized_text",
37
+ "Aligned_or_not",
38
+ "Start_time",
39
+ "End_time",
40
+ "Signal_to_noise_ratio",
41
+ ]
42
+ trans_col_name = [
43
+ "ID",
44
+ "Original_text",
45
+ "Normalized_text",
46
+ "Dir_path",
47
+ "Duration",
48
+ ]
49
+ self.metadata_cache = pd.DataFrame(columns=book_col_name)
50
+ self.trans_cache = pd.DataFrame(columns=trans_col_name)
51
+ # dataset_cache_dir = args.cache_dir # cache_dir
52
+ # print(f"args.cache_dir = ", args.cache_dir)
53
+ # os.makedirs(dataset_cache_dir, exist_ok=True)
54
+
55
+ ######## add data dir to dataset2dir ##########
56
+ self.dataset2dir = {
57
+ "dev-clean": f"{args.data_dir}/dev-clean",
58
+ "dev-other": f"{args.data_dir}/dev-other",
59
+ "test-clean": f"{args.data_dir}/test-clean",
60
+ "test-other": f"{args.data_dir}/test-other",
61
+ "train-clean-100": f"{args.data_dir}/train-clean-100",
62
+ "train-clean-360": f"{args.data_dir}/train-clean-360",
63
+ "train-other-500": f"{args.data_dir}/train-other-500",
64
+ }
65
+
66
+ ###### load metadata and transcripts #####
67
+ for dataset_name in self.dataset_list:
68
+ print("Initializing dataset: ", dataset_name)
69
+ # get [book,transcripts,audio] files list
70
+ self.book_files_list = self.get_metadata_files(
71
+ self.dataset2dir[dataset_name]
72
+ )
73
+ self.trans_files_list = self.get_trans_files(self.dataset2dir[dataset_name])
74
+
75
+ ## create metadata_cache (the book.tsv files are unfiltered and some referenced files do not exist, but they contain Duration and Signal_to_noise_ratio)
76
+ print("reading paths for dataset...")
77
+ for book_path in tqdm(self.book_files_list):
78
+ tmp_cache = pd.read_csv(
79
+ book_path, sep="\t", names=book_col_name, quoting=3
80
+ )
81
+ self.metadata_cache = pd.concat(
82
+ [self.metadata_cache, tmp_cache], ignore_index=True
83
+ )
84
+ self.metadata_cache.set_index("ID", inplace=True)
85
+
86
+ ## create transcripts (the trans.tsv file)
87
+ print("creating transcripts for dataset...")
88
+ for trans_path in tqdm(self.trans_files_list):
89
+ tmp_cache = pd.read_csv(
90
+ trans_path, sep="\t", names=trans_col_name, quoting=3
91
+ )
92
+ tmp_cache["Dir_path"] = os.path.dirname(trans_path)
93
+ self.trans_cache = pd.concat(
94
+ [self.trans_cache, tmp_cache], ignore_index=True
95
+ )
96
+ self.trans_cache.set_index("ID", inplace=True)
97
+
98
+ ## calc duration
99
+ self.trans_cache["Duration"] = (
100
+ self.metadata_cache.End_time[self.trans_cache.index]
101
+ - self.metadata_cache.Start_time[self.trans_cache.index]
102
+ )
103
+ ## add fullpath
104
+ # self.trans_cache['Full_path'] = os.path.join(self.dataset2dir[dataset_name],self.trans_cache['ID'])
105
+
106
+ # filter_by_duration: filter_out files with duration < 3.0 or > 15.0
107
+ print(f"Filtering files with duration between 3.0 and 15.0 seconds")
108
+ print(f"Before filtering: {len(self.trans_cache)}")
109
+ self.trans_cache = self.trans_cache[
110
+ (self.trans_cache["Duration"] >= 3.0)
111
+ & (self.trans_cache["Duration"] <= 15.0)
112
+ ]
113
+ print(f"After filtering: {len(self.trans_cache)}")
114
+
115
+ def get_metadata_files(self, directory):
116
+ book_files = []
117
+ for root, _, files in os.walk(directory):
118
+ for file in files:
119
+ if file.endswith(".book.tsv") and file[0] != ".":
120
+ rel_path = os.path.join(root, file)
121
+ book_files.append(rel_path)
122
+ return book_files
123
+
124
+ def get_trans_files(self, directory):
125
+ trans_files = []
126
+ for root, _, files in os.walk(directory):
127
+ for file in files:
128
+ if file.endswith(".trans.tsv") and file[0] != ".":
129
+ rel_path = os.path.join(root, file)
130
+ trans_files.append(rel_path)
131
+ return trans_files
132
+
133
+ def get_audio_files(self, directory):
134
+ audio_files = []
135
+ for root, _, files in os.walk(directory):
136
+ for file in files:
137
+ if file.endswith((".flac", ".wav", ".opus")):
138
+ rel_path = os.path.relpath(os.path.join(root, file), directory)
139
+ audio_files.append(rel_path)
140
+ return audio_files
141
+
142
+ def get_num_frames(self, index):
143
+ # get_num_frames(durations) by index
144
+ duration = self.trans_cache["Duration"][index]  # Duration is stored in trans_cache
145
+ # num_frames = duration * SAMPLE_RATE
146
+ num_frames = int(duration * 75)
147
+
148
+ # file_rel_path = self.meta_data_cache['relpath'][index]
149
+ # uid = file_rel_path.rstrip('.flac').split('/')[-1]
150
+ # num_frames += len(self.transcripts[uid])
151
+ return num_frames
152
+
153
+ def __len__(self):
154
+ return len(self.trans_cache)
155
+
156
+ def __getitem__(self, idx):
157
+ # Get the file rel path
158
+ file_dir_path = self.trans_cache["Dir_path"].iloc[idx]
159
+ # Get uid
160
+ uid = self.trans_cache.index[idx]
161
+ # Get the file name from cache uid
162
+ file_name = uid + ".wav"
163
+ # Get the full file path
164
+ full_file_path = os.path.join(file_dir_path, file_name)
165
+
166
+ # get phone
167
+ phone = self.trans_cache["Normalized_text"][uid]
168
+ phone = phonemizer_g2p(phone, "en")[1]
169
+ # load speech
170
+ speech, _ = librosa.load(full_file_path, sr=SAMPLE_RATE)
171
+ # if self.resample_to_24k:
172
+ # speech = librosa.resample(speech, orig_sr=SAMPLE_RATE, target_sr=24000)
173
+ # speech = torch.tensor(speech, dtype=torch.float32)
174
+ # pad speech to multiples of 200
175
+
176
+ # remainder = speech.size(0) % 200
177
+ # if remainder > 0:
178
+ # pad = 200 - remainder
179
+ # speech = torch.cat([speech, torch.zeros(pad, dtype=torch.float32)], dim=0)
180
+
181
+ # inputs = self._get_reference_vc(speech, hop_length=200)
182
+ inputs = {}
183
+ # Get the speaker id
184
+ # speaker = self.meta_data_cache['speaker'][idx]
185
+ # speaker_id = self.speaker2id[speaker]
186
+ # inputs["speaker_id"] = speaker_id
187
+ inputs["speech"] = speech  # 16kHz speech (loaded at SAMPLE_RATE), [T]
188
+ inputs["phone"] = phone # [T]
189
+ return inputs
190
+
191
+
192
+ def _is_batch_full(batch, num_tokens, max_tokens, max_sentences):
193
+ if len(batch) == 0:
194
+ return 0
195
+ if len(batch) == max_sentences:
196
+ return 1
197
+ if num_tokens > max_tokens:
198
+ return 1
199
+ return 0
200
+
201
+
202
+ def batch_by_size(
203
+ indices,
204
+ num_tokens_fn,
205
+ max_tokens=None,
206
+ max_sentences=None,
207
+ required_batch_size_multiple=1,
208
+ ):
209
+ """
210
+ Yield mini-batches of indices bucketed by size. Batches may contain
211
+ sequences of different lengths.
212
+
213
+ Args:
214
+ indices (List[int]): ordered list of dataset indices
215
+ num_tokens_fn (callable): function that returns the number of tokens at
216
+ a given index
217
+ max_tokens (int, optional): max number of tokens in each batch
218
+ (default: None).
219
+ max_sentences (int, optional): max number of sentences in each
220
+ batch (default: None).
221
+ required_batch_size_multiple (int, optional): require batch size to
222
+ be a multiple of N (default: 1).
223
+ """
224
+ bsz_mult = required_batch_size_multiple
225
+
226
+ sample_len = 0
227
+ sample_lens = []
228
+ batch = []
229
+ batches = []
230
+ for i in range(len(indices)):
231
+ idx = indices[i]
232
+ num_tokens = num_tokens_fn(idx)
233
+ sample_lens.append(num_tokens)
234
+ sample_len = max(sample_len, num_tokens)
235
+
236
+ assert (
237
+ sample_len <= max_tokens
238
+ ), "sentence at index {} of size {} exceeds max_tokens " "limit of {}!".format(
239
+ idx, sample_len, max_tokens
240
+ )
241
+ num_tokens = (len(batch) + 1) * sample_len
242
+
243
+ if _is_batch_full(batch, num_tokens, max_tokens, max_sentences):
244
+ mod_len = max(
245
+ bsz_mult * (len(batch) // bsz_mult),
246
+ len(batch) % bsz_mult,
247
+ )
248
+ batches.append(batch[:mod_len])
249
+ batch = batch[mod_len:]
250
+ sample_lens = sample_lens[mod_len:]
251
+ sample_len = max(sample_lens) if len(sample_lens) > 0 else 0
252
+ batch.append(idx)
253
+ if len(batch) > 0:
254
+ batches.append(batch)
255
+ return batches
256
+
257
+
258
+ def test():
259
+ from utils.util import load_config
260
+
261
+ cfg = load_config("./egs/tts/VALLE_V2/exp_ar_libritts.json")
262
+ dataset = VALLEDataset(cfg.dataset)
263
+ metadata_cache = dataset.metadata_cache
264
+ trans_cache = dataset.trans_cache
265
+ print(trans_cache.head(10))
266
+ # print(dataset.book_files_list)
267
+ breakpoint()
268
+
269
+
270
+ if __name__ == "__main__":
271
+ test()
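
A rough usage sketch of the dynamic batching above: `batch_by_size` grows each batch until adding the next item would push `(len(batch) + 1) * longest_item_in_batch` past `max_tokens` (or past `max_sentences`), then closes the batch on a `required_batch_size_multiple` boundary. The snippet assumes `batch_by_size` and `_is_batch_full` from this file are available in the current session (e.g. copied into a REPL); the toy lengths are made up.

```python
lengths = [100, 120, 80, 300, 60, 90]  # pretend per-utterance frame counts
batches = batch_by_size(
    indices=list(range(len(lengths))),
    num_tokens_fn=lambda i: lengths[i],
    max_tokens=400,      # cap on (items in batch) * (longest item in the batch)
    max_sentences=4,     # optional hard cap on items per batch
)
print(batches)           # [[0, 1, 2], [3], [4, 5]]
```

The 300-frame utterance both closes the first batch and ends up alone, which is how the sampler keeps the padded token count per batch roughly bounded.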
models/tts/valle_v2.1/modeling_llama.py ADDED
@@ -0,0 +1,1043 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ # This code is modified from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
6
+
7
+ # Original work copyright
8
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
9
+ #
10
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
11
+ # and OPT implementations in this library. It has been modified from its
12
+ # original forms to accommodate minor architectural differences compared
13
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
14
+ #
15
+ # Licensed under the Apache License, Version 2.0 (the "License");
16
+ # you may not use this file except in compliance with the License.
17
+ # You may obtain a copy of the License at
18
+ #
19
+ # http://www.apache.org/licenses/LICENSE-2.0
20
+ #
21
+ # Unless required by applicable law or agreed to in writing, software
22
+ # distributed under the License is distributed on an "AS IS" BASIS,
23
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24
+ # See the License for the specific language governing permissions and
25
+ # limitations under the License.
26
+ """ PyTorch LLaMA model."""
27
+ import math
28
+ from typing import List, Optional, Tuple, Union
29
+
30
+ import torch
31
+ import torch.utils.checkpoint
32
+ from torch import nn
33
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
34
+
35
+ from transformers.models.llama.modeling_llama import ACT2FN
36
+ from transformers.models.llama.modeling_llama import (
37
+ BaseModelOutputWithPast,
38
+ CausalLMOutputWithPast,
39
+ SequenceClassifierOutputWithPast,
40
+ )
41
+ from transformers.models.llama.modeling_llama import PreTrainedModel
42
+ from transformers.models.llama.modeling_llama import (
43
+ add_start_docstrings,
44
+ add_start_docstrings_to_model_forward,
45
+ logging,
46
+ replace_return_docstrings,
47
+ )
48
+ from transformers.models.llama.modeling_llama import LlamaConfig
49
+
50
+
51
+ logger = logging.get_logger(__name__)
52
+
53
+ _CONFIG_FOR_DOC = "LlamaConfig"
54
+
55
+
56
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
57
+ def _make_causal_mask(
58
+ input_ids_shape: torch.Size,
59
+ dtype: torch.dtype,
60
+ device: torch.device,
61
+ past_key_values_length: int = 0,
62
+ ):
63
+ """
64
+ Make causal mask used for bi-directional self-attention.
65
+ """
66
+ bsz, tgt_len = input_ids_shape
67
+ mask = torch.full(
68
+ (tgt_len, tgt_len),
69
+ torch.tensor(torch.finfo(dtype).min, device=device),
70
+ device=device,
71
+ )
72
+ mask_cond = torch.arange(mask.size(-1), device=device)
73
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
74
+ mask = mask.to(dtype)
75
+
76
+ if past_key_values_length > 0:
77
+ mask = torch.cat(
78
+ [
79
+ torch.zeros(
80
+ tgt_len, past_key_values_length, dtype=dtype, device=device
81
+ ),
82
+ mask,
83
+ ],
84
+ dim=-1,
85
+ )
86
+ return mask[None, None, :, :].expand(
87
+ bsz, 1, tgt_len, tgt_len + past_key_values_length
88
+ )
89
+
90
+
91
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
92
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
93
+ """
94
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
95
+ """
96
+ bsz, src_len = mask.size()
97
+ tgt_len = tgt_len if tgt_len is not None else src_len
98
+
99
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
100
+
101
+ inverted_mask = 1.0 - expanded_mask
102
+
103
+ return inverted_mask.masked_fill(
104
+ inverted_mask.to(torch.bool), torch.finfo(dtype).min
105
+ )
106
+
107
+
108
+ class LlamaRMSNorm(nn.Module):
109
+ def __init__(self, hidden_size, eps=1e-6):
110
+ """
111
+ LlamaRMSNorm is equivalent to T5LayerNorm
112
+ """
113
+ super().__init__()
114
+ self.weight = nn.Parameter(torch.ones(hidden_size))
115
+ self.variance_epsilon = eps
116
+
117
+ def forward(self, hidden_states):
118
+ input_dtype = hidden_states.dtype
119
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
120
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
121
+
122
+ return (self.weight * hidden_states).to(input_dtype)
123
+
124
+
125
+ class LlamaRotaryEmbedding(torch.nn.Module):
126
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
127
+ super().__init__()
128
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
129
+ self.register_buffer("inv_freq", inv_freq)
130
+
131
+ # Build here to make `torch.jit.trace` work.
132
+ self.max_seq_len_cached = max_position_embeddings
133
+ t = torch.arange(
134
+ self.max_seq_len_cached,
135
+ device=self.inv_freq.device,
136
+ dtype=self.inv_freq.dtype,
137
+ )
138
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
139
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
140
+ emb = torch.cat((freqs, freqs), dim=-1)
141
+ self.register_buffer(
142
+ "cos_cached", emb.cos()[None, None, :, :], persistent=False
143
+ )
144
+ self.register_buffer(
145
+ "sin_cached", emb.sin()[None, None, :, :], persistent=False
146
+ )
147
+
148
+ def forward(self, x, seq_len=None):
149
+ # x: [bs, num_attention_heads, seq_len, head_size]
150
+ # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
151
+ if seq_len > self.max_seq_len_cached:
152
+ self.max_seq_len_cached = seq_len
153
+ t = torch.arange(
154
+ self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype
155
+ )
156
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
157
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
158
+ emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
159
+ self.register_buffer(
160
+ "cos_cached", emb.cos()[None, None, :, :], persistent=False
161
+ )
162
+ self.register_buffer(
163
+ "sin_cached", emb.sin()[None, None, :, :], persistent=False
164
+ )
165
+ return (
166
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
167
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
168
+ )
169
+
170
+
171
+ def rotate_half(x):
172
+ """Rotates half the hidden dims of the input."""
173
+ x1 = x[..., : x.shape[-1] // 2]
174
+ x2 = x[..., x.shape[-1] // 2 :]
175
+ return torch.cat((-x2, x1), dim=-1)
176
+
177
+
178
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
179
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
180
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
181
+ sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
182
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
183
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
184
+ q_embed = (q * cos) + (rotate_half(q) * sin)
185
+ k_embed = (k * cos) + (rotate_half(k) * sin)
186
+ return q_embed, k_embed
187
+
188
+
189
+ class LlamaMLP(nn.Module):
190
+ def __init__(
191
+ self,
192
+ hidden_size: int,
193
+ intermediate_size: int,
194
+ hidden_act: str,
195
+ ):
196
+ super().__init__()
197
+ self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
198
+ self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
199
+ self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
200
+ self.act_fn = ACT2FN[hidden_act]
201
+
202
+ def forward(self, x):
203
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
204
+
205
+
206
+ class LlamaAttention(nn.Module):
207
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
208
+
209
+ def __init__(self, config: LlamaConfig, **kwargs):
210
+ super().__init__()
211
+ self.config = config
212
+ self.hidden_size = config.hidden_size
213
+ self.num_heads = config.num_attention_heads
214
+ self.head_dim = self.hidden_size // self.num_heads
215
+ self.max_position_embeddings = config.max_position_embeddings
216
+
217
+ if (self.head_dim * self.num_heads) != self.hidden_size:
218
+ raise ValueError(
219
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
220
+ f" and `num_heads`: {self.num_heads})."
221
+ )
222
+ self.q_proj = nn.Linear(
223
+ self.hidden_size, self.num_heads * self.head_dim, bias=False
224
+ )
225
+ self.k_proj = nn.Linear(
226
+ self.hidden_size, self.num_heads * self.head_dim, bias=False
227
+ )
228
+ self.v_proj = nn.Linear(
229
+ self.hidden_size, self.num_heads * self.head_dim, bias=False
230
+ )
231
+ self.o_proj = nn.Linear(
232
+ self.num_heads * self.head_dim, self.hidden_size, bias=False
233
+ )
234
+ self.rotary_emb = LlamaRotaryEmbedding(
235
+ self.head_dim, max_position_embeddings=self.max_position_embeddings
236
+ )
237
+
238
+ if "layer_idx" in kwargs:
239
+ self.layer_idx = kwargs["layer_idx"]
240
+
241
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
242
+ return (
243
+ tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
244
+ .transpose(1, 2)
245
+ .contiguous()
246
+ )
247
+
248
+ def forward(
249
+ self,
250
+ hidden_states: torch.Tensor,
251
+ attention_mask: Optional[torch.Tensor] = None,
252
+ position_ids: Optional[torch.LongTensor] = None,
253
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
254
+ output_attentions: bool = False,
255
+ use_cache: bool = False,
256
+ **kwargs,
257
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
258
+ bsz, q_len, _ = hidden_states.size()
259
+
260
+ query_states = (
261
+ self.q_proj(hidden_states)
262
+ .view(bsz, q_len, self.num_heads, self.head_dim)
263
+ .transpose(1, 2)
264
+ )
265
+ key_states = (
266
+ self.k_proj(hidden_states)
267
+ .view(bsz, q_len, self.num_heads, self.head_dim)
268
+ .transpose(1, 2)
269
+ )
270
+ value_states = (
271
+ self.v_proj(hidden_states)
272
+ .view(bsz, q_len, self.num_heads, self.head_dim)
273
+ .transpose(1, 2)
274
+ )
275
+
276
+ kv_seq_len = key_states.shape[-2]
277
+ if past_key_value is not None:
278
+ kv_seq_len += past_key_value[0].shape[-2]
279
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
280
+ query_states, key_states = apply_rotary_pos_emb(
281
+ query_states, key_states, cos, sin, position_ids
282
+ )
283
+ # [bsz, nh, t, hd]
284
+
285
+ if past_key_value is not None:
286
+ # reuse k, v, self_attention
287
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
288
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
289
+
290
+ past_key_value = (key_states, value_states) if use_cache else None
291
+
292
+ attn_weights = torch.matmul(
293
+ query_states, key_states.transpose(2, 3)
294
+ ) / math.sqrt(self.head_dim)
295
+
296
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
297
+ raise ValueError(
298
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
299
+ f" {attn_weights.size()}"
300
+ )
301
+
302
+ if attention_mask is not None:
303
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
304
+ raise ValueError(
305
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
306
+ )
307
+ attn_weights = attn_weights + attention_mask
308
+ attn_weights = torch.max(
309
+ attn_weights,
310
+ torch.tensor(
311
+ torch.finfo(attn_weights.dtype).min, device=attn_weights.device
312
+ ),
313
+ )
314
+
315
+ unnormed_attn_weights = attn_weights
316
+
317
+ # upcast attention to fp32
318
+ attn_weights = nn.functional.softmax(
319
+ attn_weights, dim=-1, dtype=torch.float32
320
+ ).to(query_states.dtype)
321
+ attn_output = torch.matmul(attn_weights, value_states)
322
+
323
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
324
+ raise ValueError(
325
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
326
+ f" {attn_output.size()}"
327
+ )
328
+
329
+ attn_output = attn_output.transpose(1, 2)
330
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
331
+
332
+ attn_output = self.o_proj(attn_output)
333
+
334
+ if not output_attentions:
335
+ attn_weights = None
336
+
337
+ return attn_output, unnormed_attn_weights, past_key_value
338
+
339
+
340
+ class LlamaDecoderLayer(nn.Module):
341
+ def __init__(self, config: LlamaConfig, **kwargs):
342
+ super().__init__()
343
+ self.hidden_size = config.hidden_size
344
+ self.self_attn = LlamaAttention(config=config)
345
+ self.mlp = LlamaMLP(
346
+ hidden_size=self.hidden_size,
347
+ intermediate_size=config.intermediate_size,
348
+ hidden_act=config.hidden_act,
349
+ )
350
+ self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
351
+ self.post_attention_layernorm = LlamaRMSNorm(
352
+ config.hidden_size, eps=config.rms_norm_eps
353
+ )
354
+
355
+ def forward(
356
+ self,
357
+ hidden_states: torch.Tensor,
358
+ attention_mask: Optional[torch.Tensor] = None,
359
+ position_ids: Optional[torch.LongTensor] = None,
360
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
361
+ output_attentions: Optional[bool] = False,
362
+ use_cache: Optional[bool] = False,
363
+ ) -> Tuple[
364
+ torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
365
+ ]:
366
+ """
367
+ Args:
368
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
369
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
370
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
371
+ output_attentions (`bool`, *optional*):
372
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
373
+ returned tensors for more detail.
374
+ use_cache (`bool`, *optional*):
375
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
376
+ (see `past_key_values`).
377
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
378
+ """
379
+
380
+ residual = hidden_states
381
+
382
+ hidden_states = self.input_layernorm(hidden_states)
383
+
384
+ # Self Attention
385
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
386
+ hidden_states=hidden_states,
387
+ attention_mask=attention_mask,
388
+ position_ids=position_ids,
389
+ past_key_value=past_key_value,
390
+ output_attentions=output_attentions,
391
+ use_cache=use_cache,
392
+ )
393
+ hidden_states = residual + hidden_states
394
+
395
+ # Fully Connected
396
+ residual = hidden_states
397
+ hidden_states = self.post_attention_layernorm(hidden_states)
398
+ hidden_states = self.mlp(hidden_states)
399
+ hidden_states = residual + hidden_states
400
+
401
+ outputs = (hidden_states,)
402
+
403
+ if output_attentions:
404
+ outputs += (self_attn_weights,)
405
+
406
+ if use_cache:
407
+ outputs += (present_key_value,)
408
+
409
+ return outputs
410
+
411
+
412
+ LLAMA_START_DOCSTRING = r"""
413
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
414
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
415
+ etc.)
416
+
417
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
418
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
419
+ and behavior.
420
+
421
+ Parameters:
422
+ config ([`LlamaConfig`]):
423
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
424
+ load the weights associated with the model, only the configuration. Check out the
425
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
426
+ """
427
+
428
+
429
+ @add_start_docstrings(
430
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
431
+ LLAMA_START_DOCSTRING,
432
+ )
433
+ class LlamaPreTrainedModel(PreTrainedModel):
434
+ config_class = LlamaConfig
435
+ base_model_prefix = "model"
436
+ supports_gradient_checkpointing = True
437
+ _no_split_modules = ["LlamaDecoderLayer"]
438
+ _skip_keys_device_placement = "past_key_values"
439
+ _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
440
+
441
+ def _init_weights(self, module):
442
+ std = self.config.initializer_range
443
+ if isinstance(module, nn.Linear):
444
+ module.weight.data.normal_(mean=0.0, std=std)
445
+ if module.bias is not None:
446
+ module.bias.data.zero_()
447
+ elif isinstance(module, nn.Embedding):
448
+ module.weight.data.normal_(mean=0.0, std=std)
449
+ if module.padding_idx is not None:
450
+ module.weight.data[module.padding_idx].zero_()
451
+
452
+ def _set_gradient_checkpointing(self, module, value=False):
453
+ if isinstance(module, LlamaModel):
454
+ module.gradient_checkpointing = value
455
+
456
+
457
+ LLAMA_INPUTS_DOCSTRING = r"""
458
+ Args:
459
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
460
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
461
+ it.
462
+
463
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
464
+ [`PreTrainedTokenizer.__call__`] for details.
465
+
466
+ [What are input IDs?](../glossary#input-ids)
467
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
468
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
469
+
470
+ - 1 for tokens that are **not masked**,
471
+ - 0 for tokens that are **masked**.
472
+
473
+ [What are attention masks?](../glossary#attention-mask)
474
+
475
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
476
+ [`PreTrainedTokenizer.__call__`] for details.
477
+
478
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
479
+ `past_key_values`).
480
+
481
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
482
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
483
+ information on the default strategy.
484
+
485
+ - 1 indicates the head is **not masked**,
486
+ - 0 indicates the head is **masked**.
487
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
488
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
489
+ config.n_positions - 1]`.
490
+
491
+ [What are position IDs?](../glossary#position-ids)
492
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
493
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
494
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
495
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
496
+
497
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
498
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
499
+
500
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
501
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
502
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
503
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
504
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
505
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
506
+ model's internal embedding lookup matrix.
507
+ use_cache (`bool`, *optional*):
508
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
509
+ `past_key_values`).
510
+ output_attentions (`bool`, *optional*):
511
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
512
+ tensors for more detail.
513
+ output_hidden_states (`bool`, *optional*):
514
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
515
+ more detail.
516
+ return_dict (`bool`, *optional*):
517
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
518
+ """
519
+
520
+
521
+ @add_start_docstrings(
522
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
523
+ LLAMA_START_DOCSTRING,
524
+ )
525
+ class LlamaModel(LlamaPreTrainedModel):
526
+ """
527
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
528
+
529
+ Args:
530
+ config: LlamaConfig
531
+ """
532
+
533
+ def __init__(self, config: LlamaConfig):
534
+ super().__init__(config)
535
+ self.padding_idx = config.pad_token_id
536
+ self.vocab_size = config.vocab_size
537
+
538
+ self.embed_tokens = nn.Embedding(
539
+ config.vocab_size, config.hidden_size, self.padding_idx
540
+ )
541
+ self.layers = nn.ModuleList(
542
+ [LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]
543
+ )
544
+ self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
545
+
546
+ self.gradient_checkpointing = False
547
+ # Initialize weights and apply final processing
548
+ self.post_init()
549
+
550
+ def get_input_embeddings(self):
551
+ return self.embed_tokens
552
+
553
+ def set_input_embeddings(self, value):
554
+ self.embed_tokens = value
555
+
556
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
557
+ def _prepare_decoder_attention_mask(
558
+ self, attention_mask, input_shape, inputs_embeds, past_key_values_length
559
+ ):
560
+ # create causal mask
561
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
562
+ combined_attention_mask = None
563
+ if input_shape[-1] > 1:
564
+ combined_attention_mask = _make_causal_mask(
565
+ input_shape,
566
+ inputs_embeds.dtype,
567
+ device=inputs_embeds.device,
568
+ past_key_values_length=past_key_values_length,
569
+ )
570
+
571
+ if attention_mask is not None:
572
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
573
+ expanded_attn_mask = _expand_mask(
574
+ attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
575
+ ).to(inputs_embeds.device)
576
+ combined_attention_mask = (
577
+ expanded_attn_mask
578
+ if combined_attention_mask is None
579
+ else expanded_attn_mask + combined_attention_mask
580
+ )
581
+
582
+ return combined_attention_mask
583
+
584
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
585
+ def forward(
586
+ self,
587
+ input_ids: torch.LongTensor = None,
588
+ attention_mask: Optional[torch.Tensor] = None,
589
+ position_ids: Optional[torch.LongTensor] = None,
590
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
591
+ inputs_embeds: Optional[torch.FloatTensor] = None,
592
+ use_cache: Optional[bool] = None,
593
+ output_attentions: Optional[bool] = None,
594
+ output_hidden_states: Optional[bool] = None,
595
+ return_dict: Optional[bool] = None,
596
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
597
+ output_attentions = (
598
+ output_attentions
599
+ if output_attentions is not None
600
+ else self.config.output_attentions
601
+ )
602
+ output_hidden_states = (
603
+ output_hidden_states
604
+ if output_hidden_states is not None
605
+ else self.config.output_hidden_states
606
+ )
607
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
608
+
609
+ return_dict = (
610
+ return_dict if return_dict is not None else self.config.use_return_dict
611
+ )
612
+
613
+ # retrieve input_ids and inputs_embeds
614
+ if input_ids is not None and inputs_embeds is not None:
615
+ raise ValueError(
616
+ "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
617
+ )
618
+ elif input_ids is not None:
619
+ batch_size, seq_length = input_ids.shape
620
+ elif inputs_embeds is not None:
621
+ batch_size, seq_length, _ = inputs_embeds.shape
622
+ else:
623
+ raise ValueError(
624
+ "You have to specify either decoder_input_ids or decoder_inputs_embeds"
625
+ )
626
+
627
+ seq_length_with_past = seq_length
628
+ past_key_values_length = 0
629
+
630
+ if past_key_values is not None:
631
+ past_key_values_length = past_key_values[0][0].shape[2]
632
+ seq_length_with_past = seq_length_with_past + past_key_values_length
633
+
634
+ if position_ids is None:
635
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
636
+ position_ids = torch.arange(
637
+ past_key_values_length,
638
+ seq_length + past_key_values_length,
639
+ dtype=torch.long,
640
+ device=device,
641
+ )
642
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
643
+ else:
644
+ position_ids = position_ids.view(-1, seq_length).long()
645
+
646
+ if inputs_embeds is None:
647
+ inputs_embeds = self.embed_tokens(input_ids)
648
+ # embed positions
649
+ if attention_mask is None:
650
+ attention_mask = torch.ones(
651
+ (batch_size, seq_length_with_past),
652
+ dtype=torch.bool,
653
+ device=inputs_embeds.device,
654
+ )
655
+ attention_mask = self._prepare_decoder_attention_mask(
656
+ attention_mask,
657
+ (batch_size, seq_length),
658
+ inputs_embeds,
659
+ past_key_values_length,
660
+ )
661
+
662
+ hidden_states = inputs_embeds
663
+
664
+ if self.gradient_checkpointing and self.training:
665
+ if use_cache:
666
+ logger.warning_once(
667
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
668
+ )
669
+ use_cache = False
670
+
671
+ # decoder layers
672
+ all_hidden_states = () if output_hidden_states else None
673
+ all_self_attns = () if output_attentions else None
674
+ next_decoder_cache = () if use_cache else None
675
+
676
+ for idx, decoder_layer in enumerate(self.layers):
677
+ if output_hidden_states:
678
+ all_hidden_states += (hidden_states,)
679
+
680
+ past_key_value = (
681
+ past_key_values[idx] if past_key_values is not None else None
682
+ )
683
+
684
+ if self.gradient_checkpointing and self.training:
685
+
686
+ def create_custom_forward(module):
687
+ def custom_forward(*inputs):
688
+ # None for past_key_value
689
+ return module(*inputs, output_attentions, None)
690
+
691
+ return custom_forward
692
+
693
+ layer_outputs = torch.utils.checkpoint.checkpoint(
694
+ create_custom_forward(decoder_layer),
695
+ hidden_states,
696
+ attention_mask,
697
+ position_ids,
698
+ None,
699
+ )
700
+ else:
701
+ layer_outputs = decoder_layer(
702
+ hidden_states,
703
+ attention_mask=attention_mask,
704
+ position_ids=position_ids,
705
+ past_key_value=past_key_value,
706
+ output_attentions=output_attentions,
707
+ use_cache=use_cache,
708
+ )
709
+
710
+ hidden_states = layer_outputs[0]
711
+
712
+ if use_cache:
713
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
714
+
715
+ if output_attentions:
716
+ all_self_attns += (layer_outputs[1],)
717
+
718
+ hidden_states = self.norm(hidden_states)
719
+
720
+ # add hidden states from the last decoder layer
721
+ if output_hidden_states:
722
+ all_hidden_states += (hidden_states,)
723
+
724
+ next_cache = next_decoder_cache if use_cache else None
725
+ if not return_dict:
726
+ return tuple(
727
+ v
728
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
729
+ if v is not None
730
+ )
731
+ return BaseModelOutputWithPast(
732
+ last_hidden_state=hidden_states,
733
+ past_key_values=next_cache,
734
+ hidden_states=all_hidden_states,
735
+ attentions=all_self_attns,
736
+ )
737
+
738
+
739
+ class LlamaForCausalLM(LlamaPreTrainedModel):
740
+ def __init__(self, config):
741
+ super().__init__(config)
742
+ self.model = LlamaModel(config)
743
+
744
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
745
+
746
+ # Initialize weights and apply final processing
747
+ self.post_init()
748
+
749
+ def get_input_embeddings(self):
750
+ return self.model.embed_tokens
751
+
752
+ def set_input_embeddings(self, value):
753
+ self.model.embed_tokens = value
754
+
755
+ def get_output_embeddings(self):
756
+ return self.lm_head
757
+
758
+ def set_output_embeddings(self, new_embeddings):
759
+ self.lm_head = new_embeddings
760
+
761
+ def set_decoder(self, decoder):
762
+ self.model = decoder
763
+
764
+ def get_decoder(self):
765
+ return self.model
766
+
767
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
768
+ @replace_return_docstrings(
769
+ output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
770
+ )
771
+ def forward(
772
+ self,
773
+ input_ids: torch.LongTensor = None,
774
+ attention_mask: Optional[torch.Tensor] = None,
775
+ position_ids: Optional[torch.LongTensor] = None,
776
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
777
+ inputs_embeds: Optional[torch.FloatTensor] = None,
778
+ labels: Optional[torch.LongTensor] = None,
779
+ use_cache: Optional[bool] = None,
780
+ output_attentions: Optional[bool] = None,
781
+ output_hidden_states: Optional[bool] = None,
782
+ return_dict: Optional[bool] = None,
783
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
784
+ r"""
785
+ Args:
786
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
787
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
788
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
789
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
790
+
791
+ Returns:
792
+
793
+ Example:
794
+
795
+ ```python
796
+ >>> from transformers import AutoTokenizer, LlamaForCausalLM
797
+
798
+ >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
799
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
800
+
801
+ >>> prompt = "Hey, are you consciours? Can you talk to me?"
802
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
803
+
804
+ >>> # Generate
805
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
806
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
807
+ "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
808
+ ```"""
809
+
810
+ output_attentions = (
811
+ output_attentions
812
+ if output_attentions is not None
813
+ else self.config.output_attentions
814
+ )
815
+ output_hidden_states = (
816
+ output_hidden_states
817
+ if output_hidden_states is not None
818
+ else self.config.output_hidden_states
819
+ )
820
+ return_dict = (
821
+ return_dict if return_dict is not None else self.config.use_return_dict
822
+ )
823
+
824
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
825
+ outputs = self.model(
826
+ input_ids=input_ids,
827
+ attention_mask=attention_mask,
828
+ position_ids=position_ids,
829
+ past_key_values=past_key_values,
830
+ inputs_embeds=inputs_embeds,
831
+ use_cache=use_cache,
832
+ output_attentions=output_attentions,
833
+ output_hidden_states=output_hidden_states,
834
+ return_dict=return_dict,
835
+ )
836
+
837
+ hidden_states = outputs[0]
838
+ logits = self.lm_head(hidden_states)
839
+
840
+ loss = None
841
+ if labels is not None:
842
+ # Shift so that tokens < n predict n
843
+ shift_logits = logits[..., :-1, :].contiguous()
844
+ shift_labels = labels[..., 1:].contiguous()
845
+ # Flatten the tokens
846
+ loss_fct = CrossEntropyLoss()
847
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
848
+ shift_labels = shift_labels.view(-1)
849
+ # Enable model parallelism
850
+ shift_labels = shift_labels.to(shift_logits.device)
851
+ loss = loss_fct(shift_logits, shift_labels)
852
+
853
+ if not return_dict:
854
+ output = (logits,) + outputs[1:]
855
+ return (loss,) + output if loss is not None else output
856
+
857
+ return CausalLMOutputWithPast(
858
+ loss=loss,
859
+ logits=logits,
860
+ past_key_values=outputs.past_key_values,
861
+ hidden_states=outputs.hidden_states,
862
+ attentions=outputs.attentions,
863
+ )
864
+
865
+ def prepare_inputs_for_generation(
866
+ self,
867
+ input_ids,
868
+ past_key_values=None,
869
+ attention_mask=None,
870
+ inputs_embeds=None,
871
+ **kwargs,
872
+ ):
873
+ if past_key_values:
874
+ input_ids = input_ids[:, -1:]
875
+
876
+ position_ids = kwargs.get("position_ids", None)
877
+ if attention_mask is not None and position_ids is None:
878
+ # create position_ids on the fly for batch generation
879
+ position_ids = attention_mask.long().cumsum(-1) - 1
880
+ position_ids.masked_fill_(attention_mask == 0, 1)
881
+ if past_key_values:
882
+ position_ids = position_ids[:, -1].unsqueeze(-1)
883
+
884
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
885
+ if inputs_embeds is not None and past_key_values is None:
886
+ model_inputs = {"inputs_embeds": inputs_embeds}
887
+ else:
888
+ model_inputs = {"input_ids": input_ids}
889
+
890
+ model_inputs.update(
891
+ {
892
+ "position_ids": position_ids,
893
+ "past_key_values": past_key_values,
894
+ "use_cache": kwargs.get("use_cache"),
895
+ "attention_mask": attention_mask,
896
+ }
897
+ )
898
+ return model_inputs
899
+
900
+ @staticmethod
901
+ def _reorder_cache(past_key_values, beam_idx):
902
+ reordered_past = ()
903
+ for layer_past in past_key_values:
904
+ reordered_past += (
905
+ tuple(
906
+ past_state.index_select(0, beam_idx) for past_state in layer_past
907
+ ),
908
+ )
909
+ return reordered_past
910
+
911
+
912
+ @add_start_docstrings(
913
+ """
914
+ The LLaMa Model transformer with a sequence classification head on top (linear layer).
915
+
916
+ [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
917
+ (e.g. GPT-2) do.
918
+
919
+ Since it does classification on the last token, it requires to know the position of the last token. If a
920
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
921
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
922
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
923
+ each row of the batch).
924
+ """,
925
+ LLAMA_START_DOCSTRING,
926
+ )
927
+ class LlamaForSequenceClassification(LlamaPreTrainedModel):
928
+ _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
929
+
930
+ def __init__(self, config):
931
+ super().__init__(config)
932
+ self.num_labels = config.num_labels
933
+ self.model = LlamaModel(config)
934
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
935
+
936
+ # Initialize weights and apply final processing
937
+ self.post_init()
938
+
939
+ def get_input_embeddings(self):
940
+ return self.model.embed_tokens
941
+
942
+ def set_input_embeddings(self, value):
943
+ self.model.embed_tokens = value
944
+
945
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
946
+ def forward(
947
+ self,
948
+ input_ids: torch.LongTensor = None,
949
+ attention_mask: Optional[torch.Tensor] = None,
950
+ position_ids: Optional[torch.LongTensor] = None,
951
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
952
+ inputs_embeds: Optional[torch.FloatTensor] = None,
953
+ labels: Optional[torch.LongTensor] = None,
954
+ use_cache: Optional[bool] = None,
955
+ output_attentions: Optional[bool] = None,
956
+ output_hidden_states: Optional[bool] = None,
957
+ return_dict: Optional[bool] = None,
958
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
959
+ r"""
960
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
961
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
962
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
963
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
964
+ """
965
+ return_dict = (
966
+ return_dict if return_dict is not None else self.config.use_return_dict
967
+ )
968
+
969
+ transformer_outputs = self.model(
970
+ input_ids,
971
+ attention_mask=attention_mask,
972
+ position_ids=position_ids,
973
+ past_key_values=past_key_values,
974
+ inputs_embeds=inputs_embeds,
975
+ use_cache=use_cache,
976
+ output_attentions=output_attentions,
977
+ output_hidden_states=output_hidden_states,
978
+ return_dict=return_dict,
979
+ )
980
+ hidden_states = transformer_outputs[0]
981
+ logits = self.score(hidden_states)
982
+
983
+ if input_ids is not None:
984
+ batch_size = input_ids.shape[0]
985
+ else:
986
+ batch_size = inputs_embeds.shape[0]
987
+
988
+ if self.config.pad_token_id is None and batch_size != 1:
989
+ raise ValueError(
990
+ "Cannot handle batch sizes > 1 if no padding token is defined."
991
+ )
992
+ if self.config.pad_token_id is None:
993
+ sequence_lengths = -1
994
+ else:
995
+ if input_ids is not None:
996
+ sequence_lengths = (
997
+ torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
998
+ ).to(logits.device)
999
+ else:
1000
+ sequence_lengths = -1
1001
+
1002
+ pooled_logits = logits[
1003
+ torch.arange(batch_size, device=logits.device), sequence_lengths
1004
+ ]
1005
+
1006
+ loss = None
1007
+ if labels is not None:
1008
+ labels = labels.to(logits.device)
1009
+ if self.config.problem_type is None:
1010
+ if self.num_labels == 1:
1011
+ self.config.problem_type = "regression"
1012
+ elif self.num_labels > 1 and (
1013
+ labels.dtype == torch.long or labels.dtype == torch.int
1014
+ ):
1015
+ self.config.problem_type = "single_label_classification"
1016
+ else:
1017
+ self.config.problem_type = "multi_label_classification"
1018
+
1019
+ if self.config.problem_type == "regression":
1020
+ loss_fct = MSELoss()
1021
+ if self.num_labels == 1:
1022
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1023
+ else:
1024
+ loss = loss_fct(pooled_logits, labels)
1025
+ elif self.config.problem_type == "single_label_classification":
1026
+ loss_fct = CrossEntropyLoss()
1027
+ loss = loss_fct(
1028
+ pooled_logits.view(-1, self.num_labels), labels.view(-1)
1029
+ )
1030
+ elif self.config.problem_type == "multi_label_classification":
1031
+ loss_fct = BCEWithLogitsLoss()
1032
+ loss = loss_fct(pooled_logits, labels)
1033
+ if not return_dict:
1034
+ output = (pooled_logits,) + transformer_outputs[1:]
1035
+ return ((loss,) + output) if loss is not None else output
1036
+
1037
+ return SequenceClassifierOutputWithPast(
1038
+ loss=loss,
1039
+ logits=pooled_logits,
1040
+ past_key_values=transformer_outputs.past_key_values,
1041
+ hidden_states=transformer_outputs.hidden_states,
1042
+ attentions=transformer_outputs.attentions,
1043
+ )
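
The file above follows the standard LLaMA decoder recipe (RMSNorm, rotary position embeddings, causal masking); one visible local change is that `LlamaAttention.forward` returns the pre-softmax attention scores (`unnormed_attn_weights`) rather than the normalized ones. Below is a small shape-check sketch for the rotary helpers, assuming `torch` and `transformers` are installed and the working directory is the repo root; the module is loaded by file path because the directory name `valle_v2.1` contains a dot and cannot be imported with dotted syntax.

```python
import importlib.util
import torch

# Load the modified modeling_llama module by file path.
spec = importlib.util.spec_from_file_location(
    "valle_modeling_llama", "models/tts/valle_v2.1/modeling_llama.py"
)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)

bsz, n_heads, seq_len, head_dim = 2, 4, 8, 16
q = torch.randn(bsz, n_heads, seq_len, head_dim)
k = torch.randn(bsz, n_heads, seq_len, head_dim)

rope = mod.LlamaRotaryEmbedding(head_dim, max_position_embeddings=32)
cos, sin = rope(q, seq_len=seq_len)  # each [1, 1, seq_len, head_dim]
position_ids = torch.arange(seq_len).unsqueeze(0).expand(bsz, -1)
q_rot, k_rot = mod.apply_rotary_pos_emb(q, k, cos, sin, position_ids)
print(q_rot.shape, k_rot.shape)      # both torch.Size([2, 4, 8, 16])
```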
models/tts/valle_v2.1/train.py ADDED
@@ -0,0 +1,19 @@
1
+ from omegaconf import DictConfig, OmegaConf
2
+ from typing import Optional
3
+ import hydra
4
+ import os
5
+
6
+
7
+ def train(cfg):
8
+ trainer = hydra.utils.instantiate(cfg.trainer)
9
+ trainer.train_loop()
10
+
11
+
12
+ @hydra.main(version_base="1.3", config_path="./cfg", config_name="base.yaml")
13
+ def main(cfg: DictConfig) -> Optional[float]:
14
+ # train the model
15
+ train(cfg)
16
+
17
+
18
+ if __name__ == "__main__":
19
+ main()
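
The script above is a thin Hydra entry point: `@hydra.main` loads `./cfg/base.yaml` (resolved relative to this file) and `hydra.utils.instantiate(cfg.trainer)` builds whatever class the config's `_target_` names, passing the remaining keys as constructor arguments. A minimal sketch of that mechanism follows; the `_target_` is a stand-in (`collections.OrderedDict`), not the repo's actual trainer class, and the real config file is not part of this diff.

```python
from hydra.utils import instantiate
from omegaconf import OmegaConf

# Stand-in config: in the real setup, cfg.trainer._target_ would name the VALL-E trainer class.
cfg = OmegaConf.create(
    {"trainer": {"_target_": "collections.OrderedDict", "exp_name": "demo"}}
)
obj = instantiate(cfg.trainer)  # imports collections.OrderedDict, calls it with exp_name="demo"
print(type(obj).__name__, dict(obj))  # OrderedDict {'exp_name': 'demo'}
```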
models/tts/valle_v2.1/valle_ar.py ADDED
@@ -0,0 +1,302 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from .modeling_llama import LlamaConfig, LlamaForCausalLM, LlamaModel
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import numpy as np
10
+ import os
11
+ import torch.nn as nn
12
+
13
+
14
+ class ValleAR(nn.Module):
15
+ def __init__(
16
+ self,
17
+ phone_vocab_size=256,
18
+ target_vocab_size=1024,
19
+ hidden_size=1024,
20
+ intermediate_size=4096,
21
+ num_hidden_layers=12,
22
+ num_attention_heads=16,
23
+ pad_token_id=1281,
24
+ bos_target_id=1282,
25
+ eos_target_id=1283,
26
+ bos_phone_id=1284,
27
+ eos_phone_id=1285,
28
+ use_input_embeds=False,
29
+ emb_dim=256,
30
+ **kwargs,
31
+ ):
32
+ super(ValleAR, self).__init__()
33
+ self.config = LlamaConfig(
34
+ vocab_size=phone_vocab_size + target_vocab_size + 10,
35
+ hidden_size=hidden_size,
36
+ intermediate_size=intermediate_size,
37
+ num_hidden_layers=num_hidden_layers,
38
+ num_attention_heads=num_attention_heads,
39
+ pad_token_id=pad_token_id,
40
+ bos_token_id=bos_target_id,
41
+ eos_token_id=eos_target_id,
42
+ )
43
+ self.phone_vocab_size = phone_vocab_size
44
+ self.target_vocab_size = target_vocab_size
45
+ self.pad_token_id = pad_token_id
46
+ self.bos_target_id = bos_target_id
47
+ self.eos_target_id = eos_target_id
48
+ self.bos_phone_id = bos_phone_id
49
+ self.eos_phone_id = eos_phone_id
50
+ self.model = LlamaForCausalLM(self.config)
51
+
52
+ self.use_input_embeds = use_input_embeds
53
+
54
+ # no input embedding is used to provide speaker information
55
+ if self.use_input_embeds:
56
+ self.emb_linear = nn.Linear(emb_dim, hidden_size)
57
+ self.emb_linear.weight.data.normal_(mean=0.0, std=0.01)
58
+ self.emb_linear.bias.data.zero_()
59
+
60
+ def forward(
61
+ self, phone_ids, phone_mask, target_ids, target_mask, input_embeds=None
62
+ ):
63
+ if input_embeds is not None:
64
+ input_embeds = self.emb_linear(input_embeds)
65
+ phone_ids, phone_mask, phone_label = self.add_phone_eos_bos_label(
66
+ phone_ids,
67
+ phone_mask,
68
+ self.eos_phone_id,
69
+ self.bos_phone_id,
70
+ self.pad_token_id,
71
+ )
72
+ target_ids, target_mask, target_label = self.add_target_eos_bos_label(
73
+ target_ids,
74
+ target_mask,
75
+ self.eos_target_id,
76
+ self.bos_target_id,
77
+ self.pad_token_id,
78
+ )
79
+ input_token_ids = torch.cat([phone_ids, target_ids], dim=-1)
80
+ attention_mask = torch.cat([phone_mask, target_mask], dim=-1)
81
+ # breakpoint()
82
+ if input_embeds is not None:
83
+ raise NotImplementedError
84
+ attention_mask = torch.cat(
85
+ [
86
+ torch.ones(
87
+ (input_embeds.shape[0], input_embeds.shape[1]),
88
+ dtype=attention_mask.dtype,
89
+ device=attention_mask.device,
90
+ ),
91
+ attention_mask,
92
+ ],
93
+ dim=-1,
94
+ )
95
+ labels = torch.cat([phone_label, target_label], dim=-1)
96
+ if input_embeds is not None:
97
+ raise NotImplementedError
98
+ labels = torch.cat(
99
+ [
100
+ -100
101
+ * torch.ones(
102
+ (input_embeds.shape[0], input_embeds.shape[1]),
103
+ dtype=labels.dtype,
104
+ device=labels.device,
105
+ ),
106
+ labels,
107
+ ],
108
+ dim=-1,
109
+ )
110
+
111
+ if input_embeds is not None:
112
+ raise NotImplementedError
113
+ inputs_embeds = torch.cat(
114
+ [input_embeds, self.model.model.embed_tokens(input_token_ids)], dim=1
115
+ )
116
+ out = self.model(
117
+ inputs_embeds=inputs_embeds,
118
+ attention_mask=attention_mask,
119
+ labels=labels,
120
+ return_dict=True,
121
+ )
122
+ return out
123
+
124
+ out = self.model(
125
+ input_token_ids,
126
+ attention_mask=attention_mask,
127
+ labels=labels,
128
+ return_dict=True,
129
+ )
130
+
131
+ # calcualte top1, top5, top10 accuracy
132
+ logits = out.logits
133
+ logits = logits[:, -target_ids.shape[1] :]
134
+ top1_acc = logits.argmax(-1)[..., :-1] == target_ids[:, 1:]
135
+ top1_acc = (top1_acc * target_mask[..., :-1]).sum() / target_mask.sum()
136
+
137
+ top5_acc = torch.topk(logits[..., :-1, :], 5, dim=-1)[1]
138
+ top5_acc = top5_acc == target_ids[:, 1:].unsqueeze(-1)
139
+ top5_acc = (
140
+ top5_acc * target_mask[..., :-1].unsqueeze(-1)
141
+ ).sum() / target_mask.sum()
142
+
143
+ top10_acc = torch.topk(logits[..., :-1, :], 10, dim=-1)[1]
144
+ top10_acc = top10_acc == target_ids[:, 1:].unsqueeze(-1)
145
+ top10_acc = (
146
+ top10_acc * target_mask[..., :-1].unsqueeze(-1)
147
+ ).sum() / target_mask.sum()
148
+
149
+ out.top1_acc = top1_acc
150
+ out.top5_acc = top5_acc
151
+ out.top10_acc = top10_acc
152
+
153
+ return out
154
+
155
+ def add_phone_eos_bos_label(
156
+ self, phone_ids, phone_mask, phone_eos_id, phone_bos_id, pad_token_id
157
+ ):
158
+ # phone_ids: [B, T]
159
+ # phone_mask: [B, T]
160
+
161
+ phone_ids = phone_ids + self.target_vocab_size * phone_mask
162
+
163
+ phone_ids = phone_ids * phone_mask
164
+ phone_ids = F.pad(phone_ids, (0, 1), value=0) + phone_eos_id * F.pad(
165
+ 1 - phone_mask, (0, 1), value=1
166
+ ) # make pad token eos token, add eos token at the end
167
+ phone_mask = F.pad(phone_mask, (1, 0), value=1) # add eos mask
168
+ phone_ids = phone_ids * phone_mask + pad_token_id * (
169
+ 1 - phone_mask
170
+ ) # restore pad token ids
171
+ phone_ids = F.pad(phone_ids, (1, 0), value=phone_bos_id) # add bos token
172
+ phone_mask = F.pad(phone_mask, (1, 0), value=1) # add bos mask
173
+ phone_label = -100 * torch.ones_like(
174
+ phone_ids
175
+ ) # loss for entire phone is not computed (passed to llama)
176
+ return phone_ids, phone_mask, phone_label
177
+
178
+ def add_target_eos_bos_label(
179
+ self, target_ids, target_mask, target_eos_id, target_bos_id, pad_token_id
180
+ ):
181
+ # target_ids: [B, T]
182
+ # target_mask: [B, T]
183
+ target_ids = target_ids * target_mask
184
+ target_ids = F.pad(target_ids, (0, 1), value=0) + target_eos_id * F.pad(
185
+ 1 - target_mask, (0, 1), value=1
186
+ )
187
+ target_mask = F.pad(target_mask, (1, 0), value=1)
188
+ target_ids = target_ids * target_mask + pad_token_id * (1 - target_mask)
189
+ target_ids = F.pad(target_ids, (1, 0), value=target_bos_id)
190
+ target_mask = F.pad(target_mask, (1, 0), value=1)
191
+ target_label = target_ids * target_mask + (-100) * (
192
+ 1 - target_mask
193
+ ) # loss for target is computed on unmasked tokens
194
+ return target_ids, target_mask, target_label
195
+
196
+ def sample_hf(
197
+ self,
198
+ phone_ids, # the phones of prompt and target should be concatenated together
199
+ prompt_ids,
200
+ inputs_embeds=None,
201
+ max_length=2000,
202
+ temperature=1.0,
203
+ top_k=100,
204
+ top_p=0.9,
205
+ repeat_penalty=1.0,
206
+ num_beams=1,
207
+ ):
208
+ if inputs_embeds is not None:
209
+ inputs_embeds = self.emb_linear(inputs_embeds)
210
+ phone_mask = torch.ones_like(phone_ids)
211
+ prompt_mask = torch.ones_like(prompt_ids)
212
+ phone_ids, _, _ = self.add_phone_eos_bos_label(
213
+ phone_ids,
214
+ phone_mask,
215
+ self.eos_phone_id,
216
+ self.bos_phone_id,
217
+ self.pad_token_id,
218
+ )
219
+ prompt_ids, _, _ = self.add_target_eos_bos_label(
220
+ prompt_ids,
221
+ prompt_mask,
222
+ self.eos_target_id,
223
+ self.bos_target_id,
224
+ self.pad_token_id,
225
+ )
226
+ prompt_ids = prompt_ids[:, :-1] # remove end token. Make it continue mode
227
+
228
+ input_token_ids = torch.cat([phone_ids, prompt_ids], dim=-1)
229
+
230
+ if inputs_embeds is not None:
231
+ raise NotImplementedError
232
+ inputs_embeds = torch.cat(
233
+ [inputs_embeds, self.model.model.embed_tokens(input_token_ids)], dim=1
234
+ )
235
+ generated_ids = self.model.generate(
236
+ inputs_embeds=inputs_embeds,
237
+ do_sample=True,
238
+ max_length=max_length,
239
+ pad_token_id=self.pad_token_id,
240
+ eos_token_id=self.eos_target_id,
241
+ temperature=temperature,
242
+ top_k=top_k,
243
+ top_p=top_p,
244
+ repetition_penalty=repeat_penalty,
245
+ )
246
+ gen_tokens = generated_ids[:, :-1]
247
+ return gen_tokens
248
+
249
+ input_length = input_token_ids.shape[1]
250
+ generated_ids = self.model.generate(
251
+ input_token_ids,
252
+ do_sample=True,
253
+ max_length=max_length,
254
+ pad_token_id=self.pad_token_id,
255
+ eos_token_id=self.eos_target_id,
256
+ temperature=temperature,
257
+ top_k=top_k,
258
+ top_p=top_p,
259
+ repetition_penalty=repeat_penalty,
260
+ num_beams=num_beams,
261
+ )
262
+
263
+ gen_tokens = generated_ids[:, input_length:-1]
264
+
265
+ return gen_tokens
266
+
267
+
268
+ def test():
269
+ model = ValleAR()
270
+
271
+ phone_ids = torch.LongTensor([[1, 2, 3, 4, 5, 0], [1, 2, 3, 4, 5, 6]])
272
+ phone_mask = torch.LongTensor([[1, 1, 1, 0, 0, 0], [1, 1, 1, 0, 0, 0]])
273
+ target_ids = torch.LongTensor([765, 234, 123, 234, 123, 599]).expand(2, -1)
274
+ target_mask = torch.LongTensor([1, 1, 1, 1, 0, 0]).expand(2, -1)
275
+
276
+ optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
277
+
278
+ for i in range(15):
279
+ optimizer.zero_grad()
280
+ out = model(
281
+ phone_ids=phone_ids,
282
+ phone_mask=phone_mask,
283
+ target_ids=target_ids,
284
+ target_mask=target_mask,
285
+ )
286
+ loss = out.loss
287
+
288
+ loss.backward()
289
+
290
+ optimizer.step()
291
+
292
+ print(f"iter={i}, {loss}.")
293
+
294
+ phone_ids = torch.LongTensor([1, 2, 3]).reshape(1, -1)
295
+ target_ids = torch.LongTensor([765, 234]).reshape(1, -1)
296
+ sampled = model.sample_hf(phone_ids, target_ids)
297
+
298
+ breakpoint()
299
+
300
+
301
+ if __name__ == "__main__":
302
+ test()
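The token bookkeeping in add_phone_eos_bos_label is easiest to check on a toy input. The sketch below, not part of the commit, replays the same F.pad arithmetic with the default special ids of ValleAR so the result can be verified by hand: phones are shifted above the codec codebook, the first padded slot becomes EOS, the remaining padding becomes PAD, and BOS is prepended.

import torch
import torch.nn.functional as F

# default special ids from ValleAR.__init__
TARGET_VOCAB, BOS_PHONE, EOS_PHONE, PAD = 1024, 1284, 1285, 1281

phone_ids = torch.LongTensor([[5, 7, 9, 0, 0]])
phone_mask = torch.LongTensor([[1, 1, 1, 0, 0]])

ids = phone_ids + TARGET_VOCAB * phone_mask                    # shift phones above the codec codebook
ids = ids * phone_mask
ids = F.pad(ids, (0, 1), value=0) + EOS_PHONE * F.pad(1 - phone_mask, (0, 1), value=1)
mask = F.pad(phone_mask, (1, 0), value=1)                      # first padded slot becomes EOS
ids = ids * mask + PAD * (1 - mask)                            # remaining padded slots become PAD
ids = F.pad(ids, (1, 0), value=BOS_PHONE)                      # prepend BOS
mask = F.pad(mask, (1, 0), value=1)

print(ids)   # tensor([[1284, 1029, 1031, 1033, 1285, 1281, 1281]])
print(mask)  # tensor([[1, 1, 1, 1, 1, 0, 0]])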
models/tts/valle_v2.1/valle_ar_trainer.py ADDED
@@ -0,0 +1,371 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import os
8
+ import shutil
9
+ import torch
10
+ import time
11
+ from pathlib import Path
12
+ import torch
13
+ from tqdm import tqdm
14
+ import torch.nn as nn
15
+ from .base_trainer import BaseTrainer
16
+
17
+
18
+ def make_pad_mask(
19
+ lengths: torch.Tensor, max_len: int = 0, left_pad=False
20
+ ) -> torch.Tensor:
21
+ """
22
+ Args:
23
+ lengths:
24
+ A 1-D tensor containing sentence lengths.
25
+ max_len:
26
+ The length of masks.
27
+ left_pad:
28
+ A boolean indicating whether to left pad the mask.
29
+ Returns:
30
+ Return a 2-D bool tensor, where masked positions
31
+ are filled with `True` and non-masked positions are
32
+ filled with `False`.
33
+
34
+ >>> lengths = torch.tensor([1, 3, 2, 5])
35
+ >>> make_pad_mask(lengths)
36
+ tensor([[False, True, True, True, True],
37
+ [False, False, False, True, True],
38
+ [False, False, True, True, True],
39
+ [False, False, False, False, False]])
40
+ """
41
+ assert lengths.ndim == 1, lengths.ndim
42
+ max_len = max(max_len, lengths.max())
43
+ n = lengths.size(0)
44
+ seq_range = torch.arange(0, max_len, device=lengths.device)
45
+ expanded_lengths = seq_range.unsqueeze(0).expand(n, max_len)
46
+ mask = expanded_lengths >= lengths.unsqueeze(-1)
47
+
48
+ if left_pad:
49
+ mask = mask.flip(dims=[1])
50
+
51
+ return mask
52
+
53
+
54
+ class ValleARTrainer(BaseTrainer):
55
+ def __init__(self, args=None, cfg=None):
56
+ super().__init__(args, cfg)
57
+ if self.cfg.use_speechtokenizer:
58
+ from models.codec.speechtokenizer.model import SpeechTokenizer
59
+
60
+ config_path = "./ckpts/speechtokenizer_hubert_avg/config.json"
61
+ ckpt_path = "./ckpts/speechtokenizer_hubert_avg/SpeechTokenizer.pt"
62
+ assert os.path.isfile(
63
+ config_path
64
+ ), f"codec model {config_path} not found! Download with huggingface-cli download fnlp/SpeechTokenizer speechtokenizer_hubert_avg/SpeechTokenizer.pt speechtokenizer_hubert_avg/config.json --local-dir ckpts"
65
+ assert os.path.isfile(
66
+ ckpt_path
67
+ ), f"codec model {ckpt_path} not found! Download with huggingface-cli download fnlp/SpeechTokenizer speechtokenizer_hubert_avg/SpeechTokenizer.pt speechtokenizer_hubert_avg/config.json --local-dir ckpts"
68
+ self.codec_encoder = SpeechTokenizer.load_from_checkpoint(
69
+ config_path, ckpt_path
70
+ )
71
+ self.codec_encoder.eval()
72
+ self.codec_encoder.to(self.accelerator.device)
73
+ print(f"Loaded SpeechTokenizer from {config_path} and {ckpt_path}")
74
+ else:
75
+ from encodec import EncodecModel
76
+
77
+ with self.accelerator.main_process_first():
78
+ self.codec_encoder = EncodecModel.encodec_model_24khz()
79
+ self.codec_encoder.set_target_bandwidth(6.0)
80
+ self.codec_encoder.to(self.accelerator.device)
81
+ self.codec_decoder = None
82
+ print("Loaded EncodecModel")
83
+ self.top1_accuracies = []
84
+ self.top5_accuracies = []
85
+ self.top10_accuracies = []
86
+
87
+ if hasattr(self.cfg, "flatten_first_2_layers"):
88
+ self.flatten_first_2_layers = self.cfg.flatten_first_2_layers
89
+ print("flattened:", self.flatten_first_2_layers)
90
+ else:
91
+ self.flatten_first_2_layers = False
92
+
93
+ if hasattr(self.cfg, "num_prediction_heads"):
94
+ self.num_prediction_heads = self.cfg.num_prediction_heads
95
+ print("num_prediction_heads:", self.num_prediction_heads)
96
+
97
+ def _accelerator_prepare(self):
98
+ # if self.accelerator.is_main_process:
99
+ # breakpoint()
100
+ # self.accelerator.wait_for_everyone()
101
+
102
+ (
103
+ self.model,
104
+ self.optimizer,
105
+ ) = self.accelerator.prepare(
106
+ self.model,
107
+ self.optimizer,
108
+ )
109
+
110
+ def _build_criterion(self):
111
+ pass # loss is directly returned from model
112
+
113
+ def _build_scheduler(self):
114
+ from transformers import (
115
+ get_cosine_schedule_with_warmup,
116
+ get_constant_schedule_with_warmup,
117
+ )
118
+
119
+ return get_cosine_schedule_with_warmup(
120
+ self.optimizer,
121
+ num_warmup_steps=self.cfg.train.scheduler.warmup_steps,
122
+ num_training_steps=self.cfg.train.scheduler.total_steps,
123
+ )
124
+
125
+ def _build_model(self):
126
+ if hasattr(self.cfg.model, "num_prediction_heads"):
127
+ from .valle_ar_multihead import ValleAR
128
+ else:
129
+ from .valle_ar import ValleAR
130
+ return ValleAR(**self.cfg.model)
131
+
132
+ def _train_step(self, batch):
133
+ # inference codec
134
+ """Returns: dict('speech', 'speech_len', 'phone_ids', 'phone_lens')
135
+ speech: [B, T]
136
+ speech_len: [B]
137
+ phone_ids: [B, T]
138
+ phone_lens: [B]
139
+ """
140
+ device = self.accelerator.device
141
+ for k, v in batch.items():
142
+ if isinstance(v, torch.Tensor):
143
+ batch[k] = v.to(device)
144
+ with torch.no_grad():
145
+ if self.cfg.use_speechtokenizer:
146
+ # Extract discrete codes from SpeechTokenizer
147
+ vq_id = self.codec_encoder.encode(
148
+ batch["speech"].unsqueeze(1)
149
+ ) # [B,1,T] -> (n_q, B, T)
150
+ else:
151
+ vq_id = self.codec_encoder.encode(batch["speech"].unsqueeze(1))
152
+ vq_id = torch.cat([encoded[0] for encoded in vq_id], dim=-1).transpose(
153
+ 0, 1
154
+ )
155
+
156
+ # recovered_audio = self.codec_decoder(vq_emb, vq=False)
157
+ # torchaudio.save('a.wav', recovered_audio[0], 16000)
158
+ # vq_id: [8, B, T//320]
159
+ if self.flatten_first_2_layers:
160
+ first_layer = vq_id[0]
161
+ second_layer = vq_id[1]
162
+ # flatten the first two layers
163
+ batch["speech"] = torch.stack(
164
+ [first_layer, second_layer], dim=-1
165
+ ).flatten(-2, -1)
166
+ batch["speech_len"] = batch["speech_len"] // 160
167
+ elif hasattr(self.cfg.model, "num_prediction_heads"):
168
+ batch["speech"] = vq_id[:2] # first two layers
169
+ batch["speech_len"] = (
170
+ batch["speech_len"] // 320
171
+ ) # our codec downsamples 320x
172
+ else:
173
+ batch["speech"] = vq_id[0] # use first layer
174
+ batch["speech_len"] = (
175
+ batch["speech_len"] // 320
176
+ ) # our codec downsamples 320x
177
+ assert batch["speech_len"].max() <= batch["speech"].shape[-1]
178
+
179
+ phone_mask = 1 - make_pad_mask(
180
+ batch["phone_lens"], max_len=batch["phone_ids"].size(1), left_pad=False
181
+ ).to(torch.long)
182
+ speech_mask = 1 - make_pad_mask(
183
+ batch["speech_len"], max_len=batch["speech"].size(1)
184
+ ).to(torch.long)
185
+
186
+ out = self.model(
187
+ phone_ids=batch["phone_ids"],
188
+ phone_mask=phone_mask,
189
+ target_ids=batch["speech"],
190
+ target_mask=speech_mask,
191
+ )
192
+ loss = out.loss
193
+ # if self.accelerator.is_main_process:
194
+ # print(loss)
195
+ # if hasattr(out, 'top1_acc'):
196
+ # self.top1_accuracies.append(out.top1_acc)
197
+ # self.top5_accuracies.append(out.top5_acc)
198
+ # self.top10_accuracies.append(out.top10_acc)
199
+ # print(f'avgs: top1: {sum(self.top1_accuracies)/len(self.top1_accuracies)}, top5: {sum(self.top5_accuracies)/len(self.top5_accuracies)}, top10: {sum(self.top10_accuracies)/len(self.top10_accuracies)}')
200
+ # breakpoint()
201
+ return loss
202
+
203
+ ##########add your own dataloader to the trainer#############
204
+ def _build_dataloader(self):
205
+ from torch.utils.data import ConcatDataset, DataLoader
206
+
207
+ if self.cfg.train.dataset.name == "emilia":
208
+ from .emilia_dataset import EmiliaDataset as VALLEDataset
209
+
210
+ train_dataset = VALLEDataset()
211
+ elif self.cfg.train.dataset.name == "mls":
212
+ from .mls_dataset import VALLEDataset as VALLEDataset
213
+
214
+ train_dataset = VALLEDataset(self.cfg.dataset, resample_to_24k=False)
215
+ elif self.cfg.train.dataset.name == "libritts":
216
+ from .libritts_dataset import VALLEDataset as VALLEDataset
217
+
218
+ train_dataset = VALLEDataset(self.cfg.dataset)
219
+
220
+ from .valle_collator import VALLECollator
221
+ import numpy as np
222
+
223
+ print("length of train_dataset:", len(train_dataset))
224
+
225
+ collator = VALLECollator()
226
+
227
+ if self.cfg.train.dataset.use_dynamic_batchsize:
228
+ if self.accelerator.is_main_process:
229
+ self.logger.info("Use Dynamic Batchsize......")
230
+ from .mls_dataset import batch_by_size
231
+
232
+ batch_sampler = batch_by_size(
233
+ train_dataset.num_frame_indices,
234
+ train_dataset.get_num_frames,
235
+ max_tokens=self.cfg.train.max_tokens * self.accelerator.num_processes,
236
+ max_sentences=self.cfg.train.max_sentences
237
+ * self.accelerator.num_processes,
238
+ required_batch_size_multiple=self.accelerator.num_processes,
239
+ )
240
+ np.random.shuffle(batch_sampler)
241
+ print(batch_sampler[0])
242
+ batches = [
243
+ x[
244
+ self.accelerator.local_process_index :: self.accelerator.num_processes
245
+ ]
246
+ for x in batch_sampler
247
+ if len(x) % self.accelerator.num_processes == 0
248
+ ]
249
+ from models.base.base_sampler import VariableSampler
250
+
251
+ train_loader = DataLoader(
252
+ train_dataset,
253
+ collate_fn=collator,
254
+ num_workers=self.cfg.train.dataloader.num_worker,
255
+ batch_sampler=VariableSampler(
256
+ batches, drop_last=True, use_random_sampler=True
257
+ ),
258
+ pin_memory=self.cfg.train.dataloader.pin_memory,
259
+ persistent_workers=self.cfg.train.dataloader.persistent_workers,
260
+ prefetch_factor=4,
261
+ )
262
+ print(
263
+ f"process {self.accelerator.local_process_index} has {len(batches)} batches"
264
+ )
265
+ self.accelerator.wait_for_everyone()
266
+
267
+ else:
268
+ sampler = torch.utils.data.distributed.DistributedSampler(
269
+ train_dataset,
270
+ num_replicas=self.accelerator.num_processes,
271
+ rank=self.accelerator.local_process_index,
272
+ shuffle=True,
273
+ )
274
+ train_loader = DataLoader(
275
+ train_dataset,
276
+ batch_size=self.cfg.train.batch_size,
277
+ num_workers=self.cfg.train.dataloader.num_worker,
278
+ pin_memory=self.cfg.train.dataloader.pin_memory,
279
+ collate_fn=collator,
280
+ sampler=sampler,
281
+ )
282
+ print(
283
+ f"process {self.accelerator.local_process_index} has {len(train_loader)} batches"
284
+ )
285
+
286
+ return train_loader, None
287
+
288
+ def _test_step(self, batch):
289
+ # inference codec
290
+ """Returns: dict('speech', 'speech_len', 'phone_ids', 'phone_lens')
291
+ speech: [B, T]
292
+ speech_len: [B]
293
+ phone_ids: [B, T]
294
+ phone_lens: [B]
295
+ """
296
+ import torchaudio
297
+
298
+ device = self.accelerator.device
299
+ for k, v in batch.items():
300
+ if isinstance(v, torch.Tensor):
301
+ batch[k] = v.to(device)
302
+ with torch.no_grad():
303
+ if self.cfg.use_speechtokenizer:
304
+ # Extract discrete codes from SpeechTokenizer
305
+ vq_id = self.codec_encoder.encode(
306
+ batch["speech"].unsqueeze(1)
307
+ ) # [B,1,T] -> (n_q, B, T)
308
+ else:
309
+ vq_id = self.codec_encoder.encode(batch["speech"].unsqueeze(1))
310
+ vq_id = torch.cat([encoded[0] for encoded in vq_id], dim=-1).transpose(
311
+ 0, 1
312
+ )
313
+ # recovered_audio = self.codec_decoder(vq_emb, vq=False)
314
+ # torchaudio.save('a.wav', recovered_audio[0], 16000)
315
+ # vq_id: [8, B, T//320]
316
+
317
+ # vq_emb = self.codec_decoder.quantizer.vq2emb(vq=vq_id[:1], n_quantizers=1)
318
+ # recovered_audio = self.codec_decoder(vq_emb, vq=False)
319
+ # recovered_audio.shape: torch.Size([1, 1, 50200])
320
+
321
+ if self.flatten_first_2_layers:
322
+ first_layer = vq_id[0]
323
+ second_layer = vq_id[1]
324
+ # flatten the first two layers
325
+ batch["speech"] = torch.stack(
326
+ [first_layer, second_layer], dim=-1
327
+ ).flatten(-2, -1)
328
+ batch["speech_len"] = batch["speech_len"] // 160
329
+ elif hasattr(self.cfg.model, "num_prediction_heads"):
330
+ batch["speech"] = vq_id[:2] # first two layers
331
+ batch["speech_len"] = (
332
+ batch["speech_len"] // 320
333
+ ) # our codec downsamples 320x
334
+ else:
335
+ batch["speech"] = vq_id[0] # use first layer
336
+ batch["speech_len"] = (
337
+ batch["speech_len"] // 320
338
+ ) # our codec downsamples 320x
339
+
340
+ # save gt
341
+ breakpoint()
342
+ recovered_audio = self.codec_encoder.decode(vq_id[:1, :1])
343
+ # recovered_audio = self.codec_encoder.decode([(vq_id[:1].transpose(0,1), None)])
344
+ torchaudio.save("gt.wav", recovered_audio[0].cpu(), 16000)
345
+ out_vq_ids = self.model.sample_hf(
346
+ batch["phone_ids"][:1, ...], batch["speech"][:1, :225], temperature=0.9
347
+ )
348
+ # out_vq_ids = torch.cat([batch['speech'][:1, :225], out_vq_ids[:1, ...]], dim=1)
349
+
350
+ # reconstruct waveform from tokens
351
+ recovered_audio = self.codec_encoder.decode(out_vq_ids.unsqueeze(0))
352
+ # recovered_audio = self.codec_encoder.decode([(out_vq_ids, None)])
353
+ torchaudio.save("a.wav", recovered_audio[0].cpu(), 16000)
354
+ breakpoint()
355
+ print()
356
+
357
+ @torch.inference_mode()
358
+ def _valid_epoch(self):
359
+ r"""Testing epoch. Should return average loss of a batch (sample) over
360
+ one epoch. See ``train_loop`` for usage.
361
+ """
362
+ epoch_sum_loss = 0.0
363
+ return epoch_sum_loss
364
+
365
+ def _inference(self):
366
+ pass
367
+
368
+ def test_loop(self):
369
+ self.model.eval()
370
+ for batch in self.train_dataloader:
371
+ self._test_step(batch)
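ValleARTrainer reads a number of cfg fields that are defined by the experiment config rather than by this commit. The sketch below only lists the keys referenced in the code above, with placeholder values; it is not the project's actual configuration.

from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "use_speechtokenizer": True,        # SpeechTokenizer codes vs. EnCodec codes
        "flatten_first_2_layers": False,    # optional: interleave codec layers 0 and 1
        "model": {"phone_vocab_size": 300, "target_vocab_size": 1024},
        "train": {
            "batch_size": 8,
            "max_tokens": 8000,             # only used with dynamic batching
            "max_sentences": 64,
            "dataset": {"name": "mls", "use_dynamic_batchsize": True},
            "dataloader": {"num_worker": 4, "pin_memory": True, "persistent_workers": True},
            "scheduler": {"warmup_steps": 1000, "total_steps": 100000},
        },
    }
)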
models/tts/valle_v2.1/valle_collator.py ADDED
@@ -0,0 +1,57 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ from torch.nn.utils.rnn import pad_sequence
8
+
9
+
10
+ class VALLECollator:
11
+ def __init__(self, cfg=None):
12
+ self.cfg = cfg
13
+
14
+ def __call__(self, batch):
15
+ """Returns: dict('speech', 'speech_len', 'phone_ids', 'phone_lens')
16
+ speech: [B, T]
17
+ speech_len: [B]
18
+ phone_ids: [B, T]
19
+ phone_lens: [B]
20
+ """
21
+ assert len(batch) != 0, "batch is empty before None checking"
22
+ batch = [b for b in batch if b is not None]
23
+ assert len(batch) != 0, "batch is empty after None checking"
24
+ packed_batch_features = {}
25
+
26
+ # Function to handle tensor copying
27
+ def process_tensor(data, dtype=torch.float32):
28
+ if isinstance(data, torch.Tensor):
29
+ return data.detach()
30
+ else:
31
+ return torch.tensor(data, dtype=dtype)
32
+
33
+ # Process 'speech' data
34
+ speeches = [process_tensor(b["speech"]) for b in batch]
35
+ packed_batch_features["speech_len"] = torch.tensor(
36
+ [len(s) for s in speeches], dtype=torch.long
37
+ )
38
+ packed_batch_features["speech"] = pad_sequence(
39
+ speeches, batch_first=True, padding_value=0
40
+ )
41
+
42
+ # right-padding 'phone' data
43
+ phones = [process_tensor(b["phone"], dtype=torch.long) for b in batch]
44
+ packed_batch_features["phone_lens"] = torch.tensor(
45
+ [len(phone) for phone in phones], dtype=torch.long
46
+ )
47
+ packed_batch_features["phone_ids"] = pad_sequence(
48
+ phones, batch_first=True, padding_value=0
49
+ )
50
+
51
+ # # Process 'phone' data, with left padding
52
+ # phones = [process_tensor(b['phone'], dtype=torch.long).flip(0) for b in batch] # first reverse the whole sequence
53
+ # packed_batch_features['phone_lens'] = torch.tensor([len(phone) for phone in phones], dtype=torch.long)
54
+ # packed_batch_features['phone_ids'] = pad_sequence(phones, batch_first=True, padding_value=0) # do the right padding
55
+ # packed_batch_features['phone_ids'] = packed_batch_features['phone_ids'].flip(1) # flip back to original order (left padding)
56
+
57
+ return packed_batch_features
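For reference, this is what the collator's padding does on a toy two-item batch; the sketch below is illustrative only and reproduces the same pad_sequence calls rather than importing the class.

import torch
from torch.nn.utils.rnn import pad_sequence

batch = [
    {"speech": torch.randn(16000), "phone": [4, 8, 15]},
    {"speech": torch.randn(24000), "phone": [16, 23, 42, 7, 9]},
]
speeches = [b["speech"] for b in batch]
phones = [torch.tensor(b["phone"], dtype=torch.long) for b in batch]

speech = pad_sequence(speeches, batch_first=True, padding_value=0)    # [2, 24000]
phone_ids = pad_sequence(phones, batch_first=True, padding_value=0)   # [2, 5]
speech_len = torch.tensor([len(s) for s in speeches])                 # tensor([16000, 24000])
phone_lens = torch.tensor([len(p) for p in phones])                   # tensor([3, 5])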
models/tts/valle_v2.1/valle_inference.py ADDED
@@ -0,0 +1,169 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torchaudio
8
+
9
+
10
+ class ValleInference(torch.nn.Module):
11
+ def __init__(
12
+ self,
13
+ use_vocos=False,
14
+ use_speechtokenizer=True,
15
+ ar_path=None,
16
+ nar_path=None,
17
+ speechtokenizer_path=None,
18
+ device="cuda",
19
+ ):
20
+ super().__init__()
21
+
22
+ self.device = device
23
+
24
+ # prepare pretrained VALLE AR model
25
+ from .valle_ar import ValleAR
26
+
27
+ self.ar_model = ValleAR(
28
+ phone_vocab_size=300,
29
+ target_vocab_size=1024,
30
+ pad_token_id=1324,
31
+ bos_target_id=1325,
32
+ eos_target_id=1326,
33
+ bos_phone_id=1327,
34
+ eos_phone_id=1328,
35
+ bos_prompt_id=1329,
36
+ eos_prompt_id=1330,
37
+ num_hidden_layers=16,
38
+ )
39
+ # change the following path to your trained model path
40
+ assert ar_path is not None
41
+ self.ar_model.load_state_dict(torch.load(ar_path, map_location="cpu"))
42
+ self.ar_model.eval().to(self.device)
43
+
44
+ # prepare pretrained VALLE NAR model
45
+ from .valle_nar import ValleNAR
46
+
47
+ self.nar_model = ValleNAR(
48
+ phone_vocab_size=300,
49
+ target_vocab_size=1024,
50
+ pad_token_id=1324,
51
+ bos_target_id=1325,
52
+ eos_target_id=1326,
53
+ bos_phone_id=1327,
54
+ eos_phone_id=1328,
55
+ bos_prompt_id=1329,
56
+ eos_prompt_id=1330,
57
+ num_hidden_layers=16,
58
+ )
59
+ assert nar_path is not None
60
+ self.nar_model.load_state_dict(torch.load(nar_path, map_location="cpu"))
61
+ self.nar_model.eval().to(self.device)
62
+
63
+ # prepare codec encoder
64
+ assert not (
65
+ use_speechtokenizer and use_vocos
66
+ ), "Only one of use_speechtokenizer and use_vocos can be True"
67
+ self.use_speechtokenizer = use_speechtokenizer
68
+ if use_speechtokenizer:
69
+ from models.codec.speechtokenizer.model import SpeechTokenizer
70
+
71
+ # download from https://huggingface.co/fnlp/SpeechTokenizer/tree/main/speechtokenizer_hubert_avg
72
+ config_path = speechtokenizer_path + "/config.json"
73
+ ckpt_path = speechtokenizer_path + "/SpeechTokenizer.pt"
74
+ self.codec_encoder = SpeechTokenizer.load_from_checkpoint(
75
+ config_path, ckpt_path
76
+ )
77
+ self.codec_encoder.eval()
78
+ self.codec_encoder.to(device)
79
+ print(f"Loaded SpeechTokenizer from {config_path} and {ckpt_path}")
80
+ else:
81
+ # use Encodec
82
+ from encodec import EncodecModel
83
+
84
+ self.codec_encoder = EncodecModel.encodec_model_24khz()
85
+ self.codec_encoder.set_target_bandwidth(6.0)
86
+ self.codec_encoder.to(self.device)
87
+ if use_vocos:
88
+ from vocos import Vocos
89
+
90
+ self.codec_decoder = Vocos.from_pretrained(
91
+ "charactr/vocos-encodec-24khz"
92
+ )
93
+ self.codec_decoder.to(self.device)
94
+ print("Loaded Vocos")
95
+ print("Loaded EncodecModel")
96
+
97
+ self.use_vocos = use_vocos
98
+
99
+ def decode(self, vq_ids):
100
+ """vq_ids.shape: [8, B, T],
101
+ returns: [B, 1, T]"""
102
+ if self.use_speechtokenizer:
103
+ # infer speechtokenizer
104
+ return self.codec_encoder.decode(vq_ids) # [B, 1, T]
105
+ else:
106
+ if not self.use_vocos:
107
+ # plain EnCodec decoder
108
+ return self.codec_encoder.decode([(vq_ids.transpose(0, 1), None)])
109
+ else:
110
+ # Vocos decoder
111
+ features = self.codec_decoder.codes_to_features(vq_ids.squeeze(1))
112
+ bandwidth_id = torch.tensor([2], device=vq_ids.device)
113
+ return self.codec_decoder.decode(
114
+ features, bandwidth_id=bandwidth_id
115
+ ).unsqueeze(0)
116
+
117
+ def forward(self, batch, chunk_configs: list, return_prompt=False, prompt_len=None):
118
+ """batch: dict(
119
+ speech: [B, T]
120
+ phone_ids: [B, T]
121
+ )
122
+ returns: [B, 1, T] audio
123
+ """
124
+ if prompt_len is None:
125
+ prompt_len = 100000 # no prompt length limiting
126
+ for k, v in batch.items():
127
+ if isinstance(v, torch.Tensor):
128
+ batch[k] = v.to(self.device)
129
+ with torch.no_grad():
130
+ if self.use_speechtokenizer:
131
+ vq_id = self.codec_encoder.encode(
132
+ batch["speech"].unsqueeze(1)
133
+ ) # [B,1,T] -> (n_q, B, T)
134
+ else:
135
+ vq_id = self.codec_encoder.encode(batch["speech"].unsqueeze(1))
136
+ vq_id = torch.cat([encoded[0] for encoded in vq_id], dim=-1).transpose(
137
+ 0, 1
138
+ )
139
+
140
+ # typically we only require one config in the chunk,
141
+ # but we can also use multiple configs to, for example, use different sampling temperature at different positions
142
+ for chunk in chunk_configs:
143
+ ar_vq_ids = self.ar_model.sample_hf(
144
+ batch["phone_ids"],
145
+ vq_id[0, :, :prompt_len],
146
+ top_p=chunk["top_p"],
147
+ top_k=chunk["top_k"],
148
+ temperature=chunk["temperature"],
149
+ num_beams=chunk["num_beams"],
150
+ repeat_penalty=chunk["repeat_penalty"],
151
+ max_length=chunk["max_length"],
152
+ )
153
+ # recovered_audio_ar = self.decode(ar_vq_ids.unsqueeze(0))
154
+ # torchaudio.save('recovered_audio_ar.wav', recovered_audio_ar[0].cpu(), 24000)
155
+
156
+ nar_vq_ids = self.nar_model.sample_hf(
157
+ phone_ids=batch["phone_ids"],
158
+ prompt_ids=vq_id[:, :, :prompt_len],
159
+ first_stage_ids=ar_vq_ids,
160
+ # first_stage_ids=vq_id[0, :, prompt_len:],
161
+ )
162
+
163
+ if return_prompt:
164
+ nar_vq_ids = torch.cat(
165
+ [vq_id[..., :prompt_len], nar_vq_ids], dim=-1
166
+ )
167
+
168
+ recovered_audio = self.decode(nar_vq_ids)
169
+ return recovered_audio # [B, 1, T]
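A hedged sketch of driving ValleInference.forward. The checkpoint paths, the phone tokenization, and the sampling values are placeholders, so the model construction is left commented out; the chunk_configs keys match the ones read in the sampling loop above.

import torch
import torchaudio

# model = ValleInference(
#     ar_path="ckpts/valle_ar.bin",                              # placeholder paths
#     nar_path="ckpts/valle_nar.bin",
#     speechtokenizer_path="ckpts/speechtokenizer_hubert_avg",
# )

batch = {
    "speech": torch.randn(1, 16000 * 3),           # ~3 s enrollment prompt waveform
    "phone_ids": torch.randint(0, 300, (1, 50)),   # prompt + target phones, already tokenized
}
chunk_configs = [
    dict(top_p=0.9, top_k=40, temperature=0.95, num_beams=1,
         repeat_penalty=1.0, max_length=2000)
]
# audio = model(batch, chunk_configs, prompt_len=150)            # [B, 1, T]
# torchaudio.save("out.wav", audio[0].cpu(), 16000)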
models/tts/valle_v2.1/valle_nar.py ADDED
@@ -0,0 +1,801 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from transformers import LlamaConfig, LlamaForCausalLM, LlamaModel
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import numpy as np
10
+ import os
11
+ import torch.nn as nn
12
+ from typing import List, Optional, Tuple, Union
13
+
14
+ from transformers.models.llama.modeling_llama import LlamaDecoderLayer
15
+
16
+ NUM_QUANTIZERS = 8 # number of quantizers in total, currently assumes first layer AR.
17
+ START_QUANTIZATION_LAYER = 1 # start quantization layer
18
+ END_QUANTIZATION_LAYER = 7 # end quantization layer
19
+
20
+
21
+ class LlamaAdaptiveRMSNorm(nn.Module):
22
+ def __init__(self, hidden_size=1024, eps=1e-9, dim_cond=1024):
23
+ super().__init__()
24
+ self.to_weight = nn.Linear(dim_cond, hidden_size)
25
+ nn.init.normal_(self.to_weight.weight, mean=0.0, std=0.02)
26
+ # nn.init.zeros_(self.to_weight.weight)
27
+ # nn.init.ones_(self.to_weight.bias)
28
+ self.variance_epsilon = eps
29
+ self._is_hf_initialized = True # disable automatic init
30
+
31
+ def forward(self, hidden_states, cond_embedding):
32
+ input_dtype = hidden_states.dtype
33
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
34
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
35
+
36
+ weight = self.to_weight(cond_embedding)
37
+
38
+ return (weight * hidden_states).to(input_dtype)
39
+
40
+
41
+ class LlamaNARDecoderLayer(LlamaDecoderLayer):
42
+ def __init__(self, config: LlamaConfig):
43
+ """Override to adaptive layer norm"""
44
+ super().__init__(config=config, layer_idx=0) # init attention, mlp, etc.
45
+ self.input_layernorm = LlamaAdaptiveRMSNorm(
46
+ config.hidden_size, eps=config.rms_norm_eps, dim_cond=config.hidden_size
47
+ )
48
+ self.post_attention_layernorm = LlamaAdaptiveRMSNorm(
49
+ config.hidden_size, eps=config.rms_norm_eps, dim_cond=config.hidden_size
50
+ )
51
+
52
+ # add `cond` in forward function
53
+ def forward(
54
+ self,
55
+ hidden_states: torch.Tensor,
56
+ cond_embedding: torch.Tensor,
57
+ attention_mask: Optional[torch.Tensor] = None,
58
+ position_ids: Optional[torch.LongTensor] = None,
59
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
60
+ output_attentions: Optional[bool] = False,
61
+ use_cache: Optional[bool] = False,
62
+ ) -> Tuple[
63
+ torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
64
+ ]:
65
+ """
66
+ Args:
67
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
68
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
69
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
70
+ output_attentions (`bool`, *optional*):
71
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
72
+ returned tensors for more detail.
73
+ use_cache (`bool`, *optional*):
74
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
75
+ (see `past_key_values`).
76
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
77
+ """
78
+
79
+ residual = hidden_states
80
+
81
+ hidden_states = self.input_layernorm(
82
+ hidden_states, cond_embedding=cond_embedding
83
+ )
84
+
85
+ # Self Attention
86
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
87
+ hidden_states=hidden_states,
88
+ attention_mask=attention_mask,
89
+ position_ids=position_ids,
90
+ past_key_value=past_key_value,
91
+ output_attentions=output_attentions,
92
+ use_cache=use_cache,
93
+ )
94
+ hidden_states = residual + hidden_states
95
+
96
+ # Fully Connected
97
+ residual = hidden_states
98
+ hidden_states = self.post_attention_layernorm(
99
+ hidden_states, cond_embedding=cond_embedding
100
+ )
101
+ hidden_states = self.mlp(hidden_states)
102
+ hidden_states = residual + hidden_states
103
+
104
+ outputs = (hidden_states,)
105
+
106
+ if output_attentions:
107
+ outputs += (self_attn_weights,)
108
+
109
+ if use_cache:
110
+ outputs += (present_key_value,)
111
+
112
+ return outputs
113
+
114
+
115
+ from transformers.models.llama.modeling_llama import BaseModelOutputWithPast
116
+
117
+
118
+ class MultiEmbedding(nn.Module):
119
+ """Embedding for multiple quantization layers, summing up the embeddings of each layer."""
120
+
121
+ def __init__(
122
+ self,
123
+ num_embeddings=1034,
124
+ embedding_dim=1024,
125
+ num_quantization_layers=NUM_QUANTIZERS,
126
+ ):
127
+ super().__init__()
128
+ self.embeddings = nn.ModuleList(
129
+ [
130
+ nn.Embedding(num_embeddings, embedding_dim)
131
+ for _ in range(num_quantization_layers)
132
+ ]
133
+ )
134
+
135
+ # initialize embeddings
136
+ for i in range(num_quantization_layers):
137
+ self.embeddings[i].weight.data.normal_(mean=0.0, std=0.02)
138
+ self._is_hf_initialized = True # disable automatic init
139
+
140
+ def forward(self, input_ids):
141
+ """Input: [num_quant, B, T] -> Output: [B, T, H]"""
142
+ num_quant, B, T = input_ids.shape
143
+ summed_embeddings = torch.zeros(
144
+ B, T, self.embeddings[0].embedding_dim, device=input_ids.device
145
+ )
146
+ for i in range(num_quant):
147
+ summed_embeddings += self.embeddings[i](input_ids[i])
148
+ return summed_embeddings
149
+
150
+
151
+ class LlamaNARModel(LlamaModel):
152
+ def __init__(self, config):
153
+ """Adding adaptive layer norm, conditional embeddings, and multi-level input embeddings to the decoder layer"""
154
+ super().__init__(config)
155
+ self.layers = nn.ModuleList(
156
+ [LlamaNARDecoderLayer(config) for _ in range(config.num_hidden_layers)]
157
+ )
158
+ self.norm = LlamaAdaptiveRMSNorm(
159
+ config.hidden_size, eps=config.rms_norm_eps, dim_cond=config.hidden_size
160
+ )
161
+
162
+ self.embed_cond = nn.Embedding(
163
+ NUM_QUANTIZERS, config.hidden_size
164
+ ) # embedding over the quantization-layer index (layers 1-7 are predicted)
165
+
166
+ for layer in self.layers:
167
+ layer.input_layernorm = LlamaAdaptiveRMSNorm(
168
+ config.hidden_size, eps=config.rms_norm_eps, dim_cond=config.hidden_size
169
+ )
170
+ layer.post_attention_layernorm = LlamaAdaptiveRMSNorm(
171
+ config.hidden_size, eps=config.rms_norm_eps, dim_cond=config.hidden_size
172
+ )
173
+
174
+ self.post_init()
175
+
176
+ def _prepare_decoder_attention_mask(
177
+ self, attention_mask, input_shape, inputs_embeds, past_key_values_length
178
+ ):
179
+ # create noncausal mask
180
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
181
+ combined_attention_mask = None
182
+
183
+ def _expand_mask(
184
+ mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None
185
+ ):
186
+ """
187
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
188
+ """
189
+ bsz, src_len = mask.size()
190
+ tgt_len = tgt_len if tgt_len is not None else src_len
191
+
192
+ expanded_mask = (
193
+ mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
194
+ )
195
+
196
+ inverted_mask = 1.0 - expanded_mask
197
+
198
+ return inverted_mask.masked_fill(
199
+ inverted_mask.to(torch.bool), torch.finfo(dtype).min
200
+ )
201
+
202
+ if attention_mask is not None:
203
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
204
+ expanded_attn_mask = _expand_mask(
205
+ attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
206
+ ).to(inputs_embeds.device)
207
+ combined_attention_mask = (
208
+ expanded_attn_mask
209
+ if combined_attention_mask is None
210
+ else expanded_attn_mask + combined_attention_mask
211
+ )
212
+
213
+ return combined_attention_mask
214
+
215
+ def forward(
216
+ self,
217
+ input_ids: torch.LongTensor = None, # [num_quant, B, T]
218
+ cond: torch.LongTensor = None, # index for conditional embeddings, [B]
219
+ attention_mask: Optional[torch.Tensor] = None,
220
+ position_ids: Optional[torch.LongTensor] = None,
221
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
222
+ inputs_embeds: Optional[torch.FloatTensor] = None,
223
+ use_cache: Optional[bool] = None,
224
+ output_attentions: Optional[bool] = None,
225
+ output_hidden_states: Optional[bool] = None,
226
+ return_dict: Optional[bool] = None,
227
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
228
+
229
+ # retrieve some shape info
230
+ batch_size, seq_length, _ = input_ids.shape
231
+
232
+ inputs_embeds = input_ids # [B, T, H]
233
+ # embed cond
234
+ cond_embedding = self.embed_cond(cond) # [B, H]
235
+
236
+ output_attentions = (
237
+ output_attentions
238
+ if output_attentions is not None
239
+ else self.config.output_attentions
240
+ )
241
+ output_hidden_states = (
242
+ output_hidden_states
243
+ if output_hidden_states is not None
244
+ else self.config.output_hidden_states
245
+ )
246
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
247
+
248
+ return_dict = (
249
+ return_dict if return_dict is not None else self.config.use_return_dict
250
+ )
251
+
252
+ seq_length_with_past = seq_length
253
+ past_key_values_length = 0
254
+
255
+ if past_key_values is not None:
256
+ past_key_values_length = past_key_values[0][0].shape[2]
257
+ seq_length_with_past = seq_length_with_past + past_key_values_length
258
+
259
+ if position_ids is None:
260
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
261
+ position_ids = torch.arange(
262
+ past_key_values_length,
263
+ seq_length + past_key_values_length,
264
+ dtype=torch.long,
265
+ device=device,
266
+ )
267
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
268
+ else:
269
+ position_ids = position_ids.view(-1, seq_length).long()
270
+
271
+ # embed positions
272
+ if attention_mask is None:
273
+ attention_mask = torch.ones(
274
+ (batch_size, seq_length_with_past),
275
+ dtype=torch.bool,
276
+ device=inputs_embeds.device,
277
+ )
278
+ attention_mask = self._prepare_decoder_attention_mask(
279
+ attention_mask,
280
+ (batch_size, seq_length),
281
+ inputs_embeds,
282
+ past_key_values_length,
283
+ )
284
+
285
+ hidden_states = inputs_embeds
286
+
287
+ if self.gradient_checkpointing and self.training:
288
+ if use_cache:
289
+ use_cache = False
290
+
291
+ # decoder layers
292
+ all_hidden_states = () if output_hidden_states else None
293
+ all_self_attns = () if output_attentions else None
294
+ next_decoder_cache = () if use_cache else None
295
+
296
+ for idx, decoder_layer in enumerate(self.layers):
297
+ if output_hidden_states:
298
+ all_hidden_states += (hidden_states,)
299
+
300
+ past_key_value = (
301
+ past_key_values[idx] if past_key_values is not None else None
302
+ )
303
+
304
+ if self.gradient_checkpointing and self.training:
305
+ raise NotImplementedError
306
+
307
+ def create_custom_forward(module):
308
+ def custom_forward(*inputs):
309
+ # None for past_key_value
310
+ return module(*inputs, output_attentions, None)
311
+
312
+ return custom_forward
313
+
314
+ layer_outputs = torch.utils.checkpoint.checkpoint(
315
+ create_custom_forward(decoder_layer),
316
+ hidden_states,
317
+ attention_mask,
318
+ position_ids,
319
+ None,
320
+ )
321
+ else:
322
+ layer_outputs = decoder_layer(
323
+ hidden_states,
324
+ attention_mask=attention_mask,
325
+ position_ids=position_ids,
326
+ past_key_value=past_key_value,
327
+ output_attentions=output_attentions,
328
+ use_cache=use_cache,
329
+ cond_embedding=cond_embedding, # using cond embed
330
+ )
331
+
332
+ hidden_states = layer_outputs[0]
333
+
334
+ if use_cache:
335
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
336
+
337
+ if output_attentions:
338
+ all_self_attns += (layer_outputs[1],)
339
+
340
+ hidden_states = self.norm(hidden_states, cond_embedding=cond_embedding)
341
+
342
+ # add hidden states from the last decoder layer
343
+ if output_hidden_states:
344
+ all_hidden_states += (hidden_states,)
345
+
346
+ next_cache = next_decoder_cache if use_cache else None
347
+ if not return_dict:
348
+ return tuple(
349
+ v
350
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
351
+ if v is not None
352
+ )
353
+ return BaseModelOutputWithPast(
354
+ last_hidden_state=hidden_states,
355
+ past_key_values=next_cache,
356
+ hidden_states=all_hidden_states,
357
+ attentions=all_self_attns,
358
+ )
359
+
360
+
361
+ from transformers.models.llama.modeling_llama import LlamaPreTrainedModel
362
+ from transformers.models.llama.modeling_llama import CrossEntropyLoss
363
+ from easydict import EasyDict as edict
364
+
365
+
366
+ class LlamaForNARModeling(LlamaPreTrainedModel):
367
+ def __init__(self, config):
368
+ super().__init__(config)
369
+ self.model = LlamaNARModel(config)
370
+
371
+ self.lm_head = nn.ModuleList(
372
+ [
373
+ nn.Linear(config.hidden_size, config.vocab_size, bias=False)
374
+ for i in range(END_QUANTIZATION_LAYER - START_QUANTIZATION_LAYER + 1)
375
+ ]
376
+ )
377
+
378
+ # Initialize weights and apply final processing
379
+ self.post_init()
380
+
381
+ def forward(
382
+ self,
383
+ cond: torch.LongTensor, # added
384
+ prediction_target: torch.LongTensor = None, # added. No shifting. -100 means no loss
385
+ input_ids: torch.LongTensor = None, # expect an embedding, [B, T, H]
386
+ attention_mask: Optional[torch.Tensor] = None,
387
+ position_ids: Optional[torch.LongTensor] = None,
388
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
389
+ inputs_embeds: Optional[torch.FloatTensor] = None,
390
+ # labels: Optional[torch.LongTensor] = None,
391
+ use_cache: Optional[bool] = None,
392
+ output_attentions: Optional[bool] = None,
393
+ output_hidden_states: Optional[bool] = None,
394
+ return_dict: Optional[bool] = None,
395
+ ):
396
+ """Prediction target: [B, T]"""
397
+ output_attentions = (
398
+ output_attentions
399
+ if output_attentions is not None
400
+ else self.config.output_attentions
401
+ )
402
+ output_hidden_states = (
403
+ output_hidden_states
404
+ if output_hidden_states is not None
405
+ else self.config.output_hidden_states
406
+ )
407
+ return_dict = (
408
+ return_dict if return_dict is not None else self.config.use_return_dict
409
+ )
410
+
411
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
412
+ outputs = self.model(
413
+ cond=cond, # added
414
+ input_ids=input_ids,
415
+ attention_mask=attention_mask,
416
+ position_ids=position_ids,
417
+ past_key_values=past_key_values,
418
+ inputs_embeds=inputs_embeds,
419
+ use_cache=use_cache,
420
+ output_attentions=output_attentions,
421
+ output_hidden_states=output_hidden_states,
422
+ return_dict=return_dict,
423
+ )
424
+
425
+ hidden_states = outputs[0]
426
+ logits = self.lm_head[cond - START_QUANTIZATION_LAYER](hidden_states)
427
+
428
+ loss = None
429
+ loss_fct = CrossEntropyLoss()
430
+
431
+ if prediction_target is not None:
432
+ # calculate loss if prediction_target is provided
433
+ logits_tmp = logits.view(-1, logits.size(-1))
434
+ prediction_target = prediction_target.view(-1)
435
+ loss = loss_fct(logits_tmp, prediction_target)
436
+
437
+ return edict(
438
+ loss=loss,
439
+ logits=logits,
440
+ )
441
+
442
+
443
+ class ValleNAR(nn.Module):
444
+ def __init__(
445
+ self,
446
+ phone_vocab_size=256,
447
+ target_vocab_size=1024,
448
+ hidden_size=1024,
449
+ intermediate_size=4096,
450
+ num_hidden_layers=12,
451
+ num_attention_heads=16,
452
+ pad_token_id=1024 + 256,
453
+ bos_target_id=1282,
454
+ eos_target_id=1283,
455
+ bos_phone_id=1284,
456
+ eos_phone_id=1285,
457
+ bos_prompt_id=1286,
458
+ eos_prompt_id=1287,
459
+ use_input_embeds=False,
460
+ emb_dim=256,
461
+ ):
462
+ super(ValleNAR, self).__init__()
463
+ self.config = LlamaConfig(
464
+ vocab_size=phone_vocab_size + target_vocab_size + 10,
465
+ hidden_size=hidden_size,
466
+ intermediate_size=intermediate_size,
467
+ num_hidden_layers=num_hidden_layers,
468
+ num_attention_heads=num_attention_heads,
469
+ pad_token_id=pad_token_id,
470
+ bos_token_id=bos_target_id,
471
+ eos_token_id=eos_target_id,
472
+ use_cache=False,
473
+ )
474
+ self.phone_vocab_size = phone_vocab_size
475
+ self.target_vocab_size = target_vocab_size
476
+ self.pad_token_id = pad_token_id
477
+ self.bos_target_id = bos_target_id
478
+ self.eos_target_id = eos_target_id
479
+ self.bos_phone_id = bos_phone_id
480
+ self.eos_phone_id = eos_phone_id
481
+ self.bos_prompt_id = bos_prompt_id
482
+ self.eos_prompt_id = eos_prompt_id
483
+ self.model = LlamaForNARModeling(self.config)
484
+
485
+ self.use_input_embeds = use_input_embeds
486
+
487
+ self.phone_embedder = nn.Embedding(
488
+ self.phone_vocab_size + 10, hidden_size
489
+ ) # use phone_embedder to embed all eos, bos tokens
490
+ self.prompt_embedder = MultiEmbedding(
491
+ num_embeddings=self.target_vocab_size,
492
+ embedding_dim=hidden_size,
493
+ num_quantization_layers=NUM_QUANTIZERS,
494
+ )
495
+ self.phone_embedder.weight.data.normal_(mean=0.0, std=0.02)
496
+
497
+ # use linear mask schedule when training
498
+ # another option is uniform
499
+ self.mask_layer_schedule = "uniform"
500
+
501
+ # no input embedding is used to provide speaker information
502
+ if self.use_input_embeds:
503
+ self.emb_linear = nn.Linear(emb_dim, hidden_size)
504
+ self.emb_linear.weight.data.normal_(mean=0.0, std=0.01)
505
+ self.emb_linear.bias.data.zero_()
506
+
507
+ def forward(
508
+ self,
509
+ phone_ids,
510
+ phone_mask,
511
+ target_ids,
512
+ target_mask,
513
+ target_quantization_layer=None,
514
+ prompt_len=None,
515
+ dropout=0.0,
516
+ ):
517
+ """
518
+ phone_ids: [B, T]
519
+ phone_mask: [B, T]
520
+ target_ids: [8,B,T]
521
+ target_mask: [B, T]
522
+ dropout: rate of dropping out the target tokens
523
+ """
524
+ assert (target_ids < 1024).all(), "target_ids should be less than 1024"
525
+ phone_ids = phone_ids + self.target_vocab_size
526
+ phone_ids = phone_ids * phone_mask + (1 - phone_mask) * self.pad_token_id
527
+ # assert (phone_ids >= 1024).all(), "phone_ids should be greater than 1024"
528
+ # phone_ids, phone_mask, phone_label = self.add_phone_eos_bos_label(
529
+ # phone_ids,
530
+ # phone_mask,
531
+ # self.eos_phone_id,
532
+ # self.bos_phone_id,
533
+ # self.pad_token_id,
534
+ # )
535
+ phone_label = -100 * (1 - phone_mask)
536
+ # get phone embedding
537
+ phone_embedding = self.phone_embedder(
538
+ phone_ids - self.target_vocab_size
539
+ ) # [B, T, H]
540
+
541
+ if prompt_len is not None:
542
+ assert not self.training # inference stage fix prompt len to input
543
+ NUM_PROMPT_TOKENS = prompt_len
544
+ else:
545
+ assert self.training
546
+ # randomly select a prompt length
547
+ assert self.training # randomize prompt len in training
548
+ NUM_PROMPT_TOKENS = np.random.randint(
549
+ min(target_ids.shape[-1] // 4, 5), target_ids.shape[-1] // 2
550
+ )
551
+
552
+ # extract 8-level prompts
553
+ prompt_tokens = target_ids[:, :, :NUM_PROMPT_TOKENS] # [Q, B, T]
554
+ prompt_mask = torch.ones_like(prompt_tokens[0])
555
+ prompt_label = -100 * prompt_mask
556
+ # get prompt embedding
557
+ prompt_embedding = self.prompt_embedder(prompt_tokens) # [B, T, H]
558
+
559
+ # randomly select a target qnt layer to predict
560
+ # total quant layer is 0 to 7
561
+ if target_quantization_layer is None:
562
+ if self.mask_layer_schedule == "linear":
563
+ weights = torch.tensor(
564
+ [
565
+ NUM_QUANTIZERS - i
566
+ for i in range(
567
+ START_QUANTIZATION_LAYER, END_QUANTIZATION_LAYER + 1
568
+ )
569
+ ]
570
+ )
571
+ weights = weights / weights.sum()
572
+ mask_layer = (
573
+ torch.multinomial(weights, 1, replacement=True)
574
+ + START_QUANTIZATION_LAYER
575
+ )
576
+ assert (
577
+ mask_layer >= START_QUANTIZATION_LAYER
578
+ and mask_layer <= END_QUANTIZATION_LAYER
579
+ )
580
+ target_quantization_layer = mask_layer.item()
581
+ elif self.mask_layer_schedule == "cosine":
582
+ weights = torch.tensor(
583
+ [
584
+ np.cos(i / NUM_QUANTIZERS * np.pi / 2)
585
+ for i in range(
586
+ START_QUANTIZATION_LAYER, END_QUANTIZATION_LAYER + 1
587
+ )
588
+ ]
589
+ )
590
+ weights = weights / weights.sum()
591
+ mask_layer = (
592
+ torch.multinomial(weights, 1, replacement=True)
593
+ + START_QUANTIZATION_LAYER
594
+ )
595
+ assert (
596
+ mask_layer >= START_QUANTIZATION_LAYER
597
+ and mask_layer <= END_QUANTIZATION_LAYER
598
+ )
599
+ target_quantization_layer = mask_layer.item()
600
+ breakpoint()
601
+ elif self.mask_layer_schedule == "uniform":
602
+ target_quantization_layer = np.random.randint(
603
+ START_QUANTIZATION_LAYER, END_QUANTIZATION_LAYER + 1
604
+ )
605
+
606
+ # print(f'target layer: {target_quantization_layer}')
607
+ # prompt of the target part
608
+ target_prompt_ids = target_ids[
609
+ :target_quantization_layer, :, NUM_PROMPT_TOKENS:
610
+ ]
611
+
612
+ def randomly_set_elements(tensor, fraction, value):
613
+ """
614
+ Randomly set a fraction of the elements in a tensor to a specific value.
615
+
616
+ Args:
617
+ tensor (torch.Tensor): The input tensor.
618
+ fraction (float): The fraction of elements to set to the specified value (between 0 and 1).
619
+ value (float or int): The value to set the elements to.
620
+
621
+ Returns:
622
+ torch.Tensor: The tensor with some elements set to the specified value.
623
+ """
624
+ # Create a mask with the same shape as the tensor
625
+ mask = torch.rand_like(tensor, dtype=torch.float32) < fraction
626
+ # Clone the tensor to avoid modifying the original tensor
627
+ result_tensor = tensor.clone()
628
+ # Set the elements where the mask is True to the specified value
629
+ result_tensor[mask] = value
630
+ return result_tensor
631
+
632
+ if dropout != 0.0:
633
+ target_prompt_ids = randomly_set_elements(
634
+ target_prompt_ids, dropout, self.target_vocab_size
635
+ )
636
+
637
+ target_embedding = self.prompt_embedder(target_prompt_ids)
638
+
639
+ # mask of the target part
640
+ target_mask = target_mask[:, NUM_PROMPT_TOKENS:]
641
+
642
+ target_labels = target_ids[
643
+ target_quantization_layer, :, NUM_PROMPT_TOKENS:
644
+ ] * target_mask + (-100 * (1 - target_mask))
645
+
646
+ # input embeddings
647
+ input_embeddings = torch.cat(
648
+ [phone_embedding, prompt_embedding, target_embedding], dim=1
649
+ )
650
+ input_mask = torch.cat([phone_mask, prompt_mask, target_mask], dim=1) # [B, T]
651
+ prediction_target = torch.cat(
652
+ [phone_label, prompt_label, target_labels], dim=1
653
+ ) # [B, T]
654
+
655
+ out = self.model(
656
+ cond=torch.tensor(
657
+ target_quantization_layer,
658
+ device=prediction_target.device,
659
+ dtype=torch.long,
660
+ ),
661
+ input_ids=input_embeddings,
662
+ prediction_target=prediction_target,
663
+ attention_mask=input_mask,
664
+ return_dict=True,
665
+ )
666
+ logits = out.logits[:, -target_embedding.shape[1] :, :]
667
+ targets = prediction_target[..., -target_embedding.shape[1] :]
668
+ top1_acc = logits.argmax(-1) == targets
669
+ top1_acc = (top1_acc * target_mask).sum() / target_mask.sum()
670
+
671
+ top5_acc = (logits.topk(5, dim=-1).indices == targets.unsqueeze(-1)).any(-1)
672
+ top5_acc = (top5_acc * target_mask).sum() / target_mask.sum()
673
+
674
+ top10_acc = (logits.topk(10, dim=-1).indices == targets.unsqueeze(-1)).any(-1)
675
+ top10_acc = (top10_acc * target_mask).sum() / target_mask.sum()
676
+
677
+ out.target_quantization_layer = target_quantization_layer
678
+ out.top1_acc = top1_acc
679
+ out.top5_acc = top5_acc
680
+ out.top10_acc = top10_acc
681
+
682
+ return out
683
+
684
+ def add_phone_eos_bos_label(
685
+ self, phone_ids, phone_mask, phone_eos_id, phone_bos_id, pad_token_id
686
+ ):
687
+ # phone_ids: [B, T]
688
+ # phone_mask: [B, T]
689
+
690
+ phone_ids = phone_ids + self.target_vocab_size * phone_mask
691
+
692
+ phone_ids = phone_ids * phone_mask
693
+ phone_ids = F.pad(phone_ids, (0, 1), value=0) + phone_eos_id * F.pad(
694
+ 1 - phone_mask, (0, 1), value=1
695
+ ) # make pad token eos token, add eos token at the end
696
+ phone_mask = F.pad(phone_mask, (1, 0), value=1) # add eos mask
697
+ phone_ids = phone_ids * phone_mask + pad_token_id * (
698
+ 1 - phone_mask
699
+ ) # restore pad token ids
700
+ phone_ids = F.pad(phone_ids, (1, 0), value=phone_bos_id) # add bos token
701
+ phone_mask = F.pad(phone_mask, (1, 0), value=1) # add bos mask
702
+ phone_label = -100 * torch.ones_like(
703
+ phone_ids
704
+ ) # loss for entire phone is not computed (passed to llama)
705
+ return phone_ids, phone_mask, phone_label
706
+
707
+ @torch.no_grad()
708
+ def sample_hf(
709
+ self,
710
+ phone_ids, # [B, T]
711
+ prompt_ids, # [8, B, T]
712
+ first_stage_ids, # [B, T]
713
+ top_k=50,
714
+ top_p=1,
715
+ temperature=1.1,
716
+ first_stage_ids_gt=None, # [Q, B, T]
717
+ first_stage_ids_gt_end_layer=None, # 2 to 8
718
+ ):
719
+ """
720
+ phone_ids: [B, T]
721
+ prompt_ids: [8, B, T]
722
+ first_stage_ids: [B, T] result from first quant layer. Should be continuation of prompt_ids
723
+ """
724
+ phone_mask = torch.ones_like(phone_ids, dtype=torch.long)
725
+
726
+ assert prompt_ids.shape[-1] >= 5, "prompt_ids should have at least 5 tokens"
727
+ target_ids = torch.cat(
728
+ [prompt_ids, first_stage_ids.expand(prompt_ids.shape[0], -1, -1)], dim=-1
729
+ )
730
+ target_mask = torch.ones_like(target_ids[0], dtype=torch.long)
731
+
732
+ if first_stage_ids_gt is not None:
733
+ target_ids[
734
+ :first_stage_ids_gt_end_layer, :, -first_stage_ids_gt.shape[-1] :
735
+ ] = first_stage_ids_gt[:first_stage_ids_gt_end_layer]
736
+
737
+ gen_len = first_stage_ids.shape[-1]
738
+
739
+ start_qnt_layer = 1
740
+ if first_stage_ids_gt_end_layer is not None:
741
+ start_qnt_layer = first_stage_ids_gt_end_layer
742
+ for qnt_level in range(start_qnt_layer, 8):
743
+ out = self.forward(
744
+ phone_ids=phone_ids,
745
+ phone_mask=phone_mask,
746
+ target_ids=target_ids,
747
+ target_mask=target_mask,
748
+ target_quantization_layer=qnt_level,
749
+ prompt_len=prompt_ids.shape[-1],
750
+ )
751
+ logits = out.logits
752
+ gen_tokens = torch.argmax(logits, dim=-1).reshape(-1)[
753
+ -gen_len:
754
+ ] # [T], generated tokens in this level
755
+
756
+ # overwrite the target_ids with the generated tokens
757
+ target_ids[qnt_level, :, -gen_len:] = gen_tokens
758
+
759
+ return target_ids[:, :, -gen_len:]
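+ # sample_hf generates the residual layers one at a time: for each quantization level
+ # (start_qnt_layer..7) the model runs once in NAR mode, the greedy argmax tokens overwrite
+ # that level of target_ids, and later levels are therefore conditioned on everything
+ # generated so far. Only the last gen_len frames (the continuation) are returned.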
760
+
761
+
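+ # test() is a quick overfitting sanity check: it fits the model on a single random batch
+ # for 200 steps, then verifies that greedy sampling reproduces the memorised last 10 frames.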
762
+ def test():
763
+ model = ValleNAR().cuda()
764
+
765
+ phone_ids = torch.LongTensor([1, 2, 3, 4, 5]).reshape(1, -1).cuda()
766
+ phone_mask = torch.LongTensor([1, 1, 1, 1, 1]).reshape(1, -1).cuda()
767
+ target_ids = torch.randint(high=1024, size=(8, 1, 250), dtype=torch.long).cuda()
768
+ target_mask = torch.ones(1, 250, dtype=torch.long).cuda()
769
+ optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
770
+
771
+ for i in range(200):
772
+ optimizer.zero_grad()
773
+ out = model(
774
+ phone_ids=phone_ids,
775
+ phone_mask=phone_mask,
776
+ target_ids=target_ids,
777
+ target_mask=target_mask,
778
+ # target_quantization_layer=1+i%6,
779
+ )
780
+ loss = out.loss
781
+
782
+ loss.backward()
783
+
784
+ optimizer.step()
785
+
786
+ print(f"iter={i}, {loss}.")
787
+ target_ids_short = target_ids[:, :, :240]
788
+
789
+ model.eval()
790
+ sampled = model.sample_hf(
791
+ phone_ids, prompt_ids=target_ids_short, first_stage_ids=target_ids[0, :, 240:]
792
+ )
793
+
794
+ print(target_ids[:, :, -10:])
795
+ print(sampled)
796
+
797
+ print((sampled == target_ids[:, :, -10:]).all())
798
+
799
+
800
+ if __name__ == "__main__":
801
+ test()
models/tts/valle_v2.1/valle_nar_trainer.py ADDED
@@ -0,0 +1,205 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torchaudio
8
+ import numpy as np
9
+ import time
10
+ from .valle_ar_trainer import ValleARTrainer, make_pad_mask
11
+
12
+
13
+ class ValleNARTrainer(ValleARTrainer):
14
+ def __init__(self, args=None, cfg=None):
15
+ super().__init__(args, cfg)
16
+ print("simple NAR")
17
+ self.top1_accuracies = {
18
+ 1: [],
19
+ 2: [],
20
+ 3: [],
21
+ 4: [],
22
+ 5: [],
23
+ 6: [],
24
+ 7: [],
25
+ }
26
+ self.top5_accuracies = {
27
+ 1: [],
28
+ 2: [],
29
+ 3: [],
30
+ 4: [],
31
+ 5: [],
32
+ 6: [],
33
+ 7: [],
34
+ }
35
+ self.top10_accuracies = {
36
+ 1: [],
37
+ 2: [],
38
+ 3: [],
39
+ 4: [],
40
+ 5: [],
41
+ 6: [],
42
+ 7: [],
43
+ }
44
+
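+ # The dicts above are buckets for per-layer top-k accuracy (NAR quantization layers 1-7;
+ # layer 0 comes from the separate AR stage). The code that fills them further down is
+ # currently commented out, and the metrics are logged to the accelerator instead.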
45
+ def _build_model(self):
46
+ from .valle_nar import ValleNAR
47
+
48
+ return ValleNAR(**self.cfg.model)
49
+
50
+ def _train_step(self, batch):
51
+ # encode speech with the codec into discrete tokens
52
+ """batch: dict('speech', 'speech_len', 'phone_ids', 'phone_lens')
53
+ speech: [B, T]
54
+ speech_len: [B]
55
+ phone_ids: [B, T]
56
+ phone_lens: [B]
57
+ """
58
+ device = self.accelerator.device
59
+ for k, v in batch.items():
60
+ if isinstance(v, torch.Tensor):
61
+ batch[k] = v.to(device)
62
+
63
+ with torch.no_grad():
64
+ if self.cfg.use_speechtokenizer:
65
+ # Extract discrete codes from SpeechTokenizer
66
+ # 16k
67
+ vq_id = self.codec_encoder.encode(
68
+ batch["speech"].unsqueeze(1)
69
+ ) # [B, 1, T] -> (n_q, B, T)
70
+ # RVQ_1 = codes[:1, :, :] # contains content info and can be treated as semantic tokens
71
+ # RVQ_supplement = codes[1:, :, :] # contains timbre info and completes what the first quantizer loses
72
+ # Concatenate the semantic tokens (RVQ_1) with the supplementary timbre tokens before decoding:
73
+ # wav = self.codec_encoder.decode(vq_id)
74
+ # torchaudio.save('a.wav', wav[0].cpu(), 16000)
75
+
76
+ # # Decoding from RVQ-i:j tokens from the ith quantizers to the jth quantizers
77
+ # wav = model.decode(codes[i: (j + 1)], st=i)
78
+ else:
79
+ # using encodec, 24k
80
+ vq_id = self.codec_encoder.encode(batch["speech"].unsqueeze(1))
81
+ vq_id = torch.cat([encoded[0] for encoded in vq_id], dim=-1).transpose(
82
+ 0, 1
83
+ )
84
+
85
+ # recovered_audio = self.codec_decoder(vq_emb, vq=False)
86
+ # torchaudio.save('a.wav', recovered_audio[0], 16000)
87
+ # vq_id: [8, B, T//320]
88
+ batch["speech"] = vq_id
89
+ batch["speech_len"] = batch["speech_len"] // 320 # our codec downsamples 320x
90
+ assert batch["speech_len"].max() <= batch["speech"].shape[-1]
91
+
92
+ phone_mask = 1 - make_pad_mask(
93
+ batch["phone_lens"], max_len=batch["phone_ids"].size(1), left_pad=False
94
+ ).to(torch.long)
95
+ speech_mask = 1 - make_pad_mask(
96
+ batch["speech_len"], max_len=batch["speech"].size(-1)
97
+ ).to(torch.long)
98
+
99
+ np.random.seed(int(time.time()) - 5 * self.accelerator.process_index)  # decorrelate the numpy RNG across processes
100
+
101
+ if hasattr(self.cfg.train, "dropout"):
102
+ dropout = self.cfg.train.dropout
103
+ else:
104
+ dropout = 0.0
105
+
106
+ out = self.model(
107
+ phone_ids=batch["phone_ids"],
108
+ phone_mask=phone_mask,
109
+ target_ids=batch["speech"],
110
+ target_mask=speech_mask,
111
+ dropout=dropout,
112
+ )
113
+ loss = out.loss
114
+
115
+ self.accelerator.log(
116
+ {f"Train/NAR L{out.target_quantization_layer} Top1 acc": out.top1_acc},
117
+ step=self.step,
118
+ )
119
+ self.accelerator.log(
120
+ {f"Train/NAR L{out.target_quantization_layer} Top5 acc": out.top5_acc},
121
+ step=self.step,
122
+ )
123
+ self.accelerator.log(
124
+ {f"Train/NAR L{out.target_quantization_layer} Top10 acc": out.top10_acc},
125
+ step=self.step,
126
+ )
127
+
128
+ # if hasattr(out, 'top1_acc'):
129
+ # idx = out.target_quantization_layer
130
+ # self.top1_accuracies[idx].append(out.top1_acc)
131
+ # self.top5_accuracies[idx].append(out.top5_acc)
132
+ # self.top10_accuracies[idx].append(out.top10_acc)
133
+ # if len(self.top1_accuracies[idx]) >= 160:
134
+ # breakpoint()
135
+ # if self.accelerator.is_main_process:
136
+ # print(loss)
137
+ return loss
138
+
139
+ def _test_step(self, batch):
140
+ # encode speech with the codec into discrete tokens
141
+ """batch: dict('speech', 'speech_len', 'phone_ids', 'phone_lens')
142
+ speech: [B, T]
143
+ speech_len: [B]
144
+ phone_ids: [B, T]
145
+ phone_lens: [B]
146
+ """
147
+ import torchaudio
148
+
149
+ device = self.accelerator.device
150
+ for k, v in batch.items():
151
+ if isinstance(v, torch.Tensor):
152
+ batch[k] = v.to(device)
153
+ with torch.no_grad():
154
+ if self.cfg.use_speechtokenizer:
155
+ # Extract discrete codes from SpeechTokenizer
156
+ # 16k
157
+ vq_id = self.codec_encoder.encode(
158
+ batch["speech"].unsqueeze(1)
159
+ ) # [B,1,T] -> (n_q, B, T)
160
+ # Concatenating semantic tokens (RVQ_1) and supplementary timbre tokens and then decoding
161
+ # wav = self.codec_encoder.decode(vq_id)
162
+ # torchaudio.save('a.wav', wav[0].cpu(), 16000)
163
+
164
+ else:
165
+ vq_id = self.codec_encoder.encode(batch["speech"].unsqueeze(1))
166
+ vq_id = torch.cat([encoded[0] for encoded in vq_id], dim=-1).transpose(
167
+ 0, 1
168
+ )
169
+ # recovered_audio = self.codec_encoder.decode([(vq_id.transpose(0,1), None)])
170
+ # recovered_audio = self.codec_decoder(vq_emb, vq=False)
171
+ # torchaudio.save('a.wav', recovered_audio[0], 16000)
172
+ # vq_id: [8, B, T//200]
173
+
174
+ # vq_emb = self.codec_decoder.quantizer.vq2emb(vq=vq_id[:1], n_quantizers=1)
175
+ # recovered_audio = self.codec_decoder(vq_emb, vq=False)
176
+ # recovered_audio.shape: torch.Size([1, 1, 50200])
177
+
178
+ batch["speech"] = vq_id
179
+
180
+ # save gt
181
+ if self.cfg.use_speechtokenizer:
182
+ recovered_audio = self.codec_encoder.decode(vq_id)
183
+ else:
184
+ recovered_audio = self.codec_encoder.decode(
185
+ [(vq_id.transpose(0, 1), None)]
186
+ )
187
+ torchaudio.save("gt.wav", recovered_audio[0].cpu(), 16000)
188
+ self.model.eval()
189
+ out_vq_ids = self.model.sample_hf(
190
+ phone_ids=batch["phone_ids"][:1],
191
+ prompt_ids=batch["speech"][:, :1, :150],
192
+ first_stage_ids=batch["speech"][0, :1, 150:],
193
+ )
194
+ # breakpoint()
195
+ # out_vq_ids = torch.cat([batch['speech'][:, :225], out_vq_ids], dim=1)
196
+
197
+ # reconstruct audio from the generated tokens
198
+ if self.cfg.use_speechtokenizer:
199
+ recovered_audio = self.codec_encoder.decode(out_vq_ids)
200
+ else:
201
+ recovered_audio = self.codec_encoder.decode(
202
+ [(out_vq_ids.transpose(0, 1)[:1], None)]
203
+ )
204
+ torchaudio.save("a.wav", recovered_audio[0].cpu(), 16000)
205
+ breakpoint()
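+ # _test_step is a manual listening check: it writes the ground-truth reconstruction to
+ # gt.wav, resynthesises the NAR output to a.wav, and stops at a breakpoint for inspection.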
utils/g2p/g2p.py ADDED
@@ -0,0 +1,321 @@
1
+ import json
2
+ import re
3
+ import os
4
+ from typing import List, Pattern, Union
5
+ from phonemizer.utils import list2str, str2list
6
+ from phonemizer.backend import EspeakBackend
7
+ from phonemizer.backend.espeak.language_switch import LanguageSwitch
8
+ from phonemizer.backend.espeak.words_mismatch import WordMismatch
9
+ from phonemizer.punctuation import Punctuation
10
+ from phonemizer.separator import Separator
11
+ import jieba
12
+ import cn2an
13
+
14
+ # List of (Latin alphabet, bopomofo) pairs:
15
+ _latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
16
+ ('a', 'ㄟˉ'),
17
+ ('b', 'ㄅㄧˋ'),
18
+ ('c', 'ㄙㄧˉ'),
19
+ ('d', 'ㄉㄧˋ'),
20
+ ('e', 'ㄧˋ'),
21
+ ('f', 'ㄝˊㄈㄨˋ'),
22
+ ('g', 'ㄐㄧˋ'),
23
+ ('h', 'ㄝˇㄑㄩˋ'),
24
+ ('i', 'ㄞˋ'),
25
+ ('j', 'ㄐㄟˋ'),
26
+ ('k', 'ㄎㄟˋ'),
27
+ ('l', 'ㄝˊㄛˋ'),
28
+ ('m', 'ㄝˊㄇㄨˋ'),
29
+ ('n', 'ㄣˉ'),
30
+ ('o', 'ㄡˉ'),
31
+ ('p', 'ㄆㄧˉ'),
32
+ ('q', 'ㄎㄧㄡˉ'),
33
+ ('r', 'ㄚˋ'),
34
+ ('s', 'ㄝˊㄙˋ'),
35
+ ('t', 'ㄊㄧˋ'),
36
+ ('u', 'ㄧㄡˉ'),
37
+ ('v', 'ㄨㄧˉ'),
38
+ ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
39
+ ('x', 'ㄝˉㄎㄨˋㄙˋ'),
40
+ ('y', 'ㄨㄞˋ'),
41
+ ('z', 'ㄗㄟˋ')
42
+ ]]
43
+
44
+ # List of (bopomofo, ipa) pairs:
45
+ _bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
46
+ ('ㄅㄛ', 'p⁼wo'),
47
+ ('ㄆㄛ', 'pʰwo'),
48
+ ('ㄇㄛ', 'mwo'),
49
+ ('ㄈㄛ', 'fwo'),
50
+ ('ㄧㄢ', '|jɛn'),
51
+ ('ㄩㄢ', '|ɥæn'),
52
+ ('ㄧㄣ', '|in'),
53
+ ('ㄩㄣ', '|ɥn'),
54
+ ('ㄧㄥ', '|iŋ'),
55
+ ('ㄨㄥ', '|ʊŋ'),
56
+ ('ㄩㄥ', '|jʊŋ'),
57
+ # Add
58
+ ('ㄧㄚ', '|ia'),
59
+ ('ㄧㄝ', '|iɛ'),
60
+ ('ㄧㄠ', '|iɑʊ'),
61
+ ('ㄧㄡ', '|ioʊ'),
62
+ ('ㄧㄤ', '|iɑŋ'),
63
+ ('ㄨㄚ', '|ua'),
64
+ ('ㄨㄛ', '|uo'),
65
+ ('ㄨㄞ', '|uaɪ'),
66
+ ('ㄨㄟ', '|ueɪ'),
67
+ ('ㄨㄢ', '|uan'),
68
+ ('ㄨㄣ', '|uən'),
69
+ ('ㄨㄤ', '|uɑŋ'),
70
+ ('ㄩㄝ', '|ɥɛ'),
71
+ # End
72
+ ('ㄅ', 'p⁼'),
73
+ ('ㄆ', 'pʰ'),
74
+ ('ㄇ', 'm'),
75
+ ('ㄈ', 'f'),
76
+ ('ㄉ', 't⁼'),
77
+ ('ㄊ', 'tʰ'),
78
+ ('ㄋ', 'n'),
79
+ ('ㄌ', 'l'),
80
+ ('ㄍ', 'k⁼'),
81
+ ('ㄎ', 'kʰ'),
82
+ ('ㄏ', 'x'),
83
+ ('ㄐ', 'tʃ⁼'),
84
+ ('ㄑ', 'tʃʰ'),
85
+ ('ㄒ', 'ʃ'),
86
+ ('ㄓ', 'ts`⁼'),
87
+ ('ㄔ', 'ts`ʰ'),
88
+ ('ㄕ', 's`'),
89
+ ('ㄖ', 'ɹ`'),
90
+ ('ㄗ', 'ts⁼'),
91
+ ('ㄘ', 'tsʰ'),
92
+ ('ㄙ', '|s'),
93
+ ('ㄚ', '|a'),
94
+ ('ㄛ', '|o'),
95
+ ('ㄜ', '|ə'),
96
+ ('ㄝ', '|ɛ'),
97
+ ('ㄞ', '|aɪ'),
98
+ ('ㄟ', '|eɪ'),
99
+ ('ㄠ', '|ɑʊ'),
100
+ ('ㄡ', '|oʊ'),
101
+ ('ㄢ', '|an'),
102
+ ('ㄣ', '|ən'),
103
+ ('ㄤ', '|ɑŋ'),
104
+ ('ㄥ', '|əŋ'),
105
+ ('ㄦ', 'əɹ'),
106
+ ('ㄧ', '|i'),
107
+ ('ㄨ', '|u'),
108
+ ('ㄩ', '|ɥ'),
109
+ ('ˉ', '→|'),
110
+ ('ˊ', '↑|'),
111
+ ('ˇ', '↓↑|'),
112
+ ('ˋ', '↓|'),
113
+ ('˙', '|'),
114
+ (',', ','),
115
+ ('。', '.'),
116
+ ('!', '!'),
117
+ ('?', '?'),
118
+ ('—', '-'),
119
+ ]]
120
+
121
+ # Convert numbers to Chinese pronunciation
122
+ def number_to_chinese(text):
123
+ numbers = re.findall(r'\d+(?:\.?\d+)?', text)
124
+ for number in numbers:
125
+ text = text.replace(number, cn2an.an2cn(number), 1)
126
+ return text
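+ # e.g. number_to_chinese("我有2只猫") should yield "我有二只猫"; the exact spelling of the
+ # digits follows cn2an's default mode.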
127
+
128
+ # Word segmentation, then convert the Chinese characters to pinyin (bopomofo)
129
+ def chinese_to_bopomofo(text):
130
+ from pypinyin import lazy_pinyin, BOPOMOFO
131
+ text = text.replace('、', ',').replace(';', ',').replace(':', ',')
132
+ text = re.sub(r"\s+", "", text)
133
+ words = jieba.lcut(text, cut_all=False)
134
+ text = ''
135
+ for word in words:
136
+ bopomofos = lazy_pinyin(word, BOPOMOFO)
137
+ if not re.search('[\u4e00-\u9fff]', word):
138
+ text += word
139
+ continue
140
+ for i in range(len(bopomofos)):
141
+ bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i])
142
+ if text != '':
143
+ text += '|'
144
+ text += '|'.join(bopomofos)
145
+ return text
146
+
147
+ # Convert latin pronunciation to pinyin (bopomofo)
148
+ def latin_to_bopomofo(text):
149
+ for regex, replacement in _latin_to_bopomofo:
150
+ text = re.sub(regex, replacement, text)
151
+ return text
152
+
153
+ # Convert pinyin (bopomofo) to IPA
154
+ def bopomofo_to_ipa(text):
155
+ for regex, replacement in _bopomofo_to_ipa:
156
+ text = re.sub(regex, replacement, text)
157
+ return text
158
+
159
+ def _chinese_to_ipa(text):
160
+ text = number_to_chinese(text.strip())
161
+ text = chinese_to_bopomofo(text)
162
+ text = latin_to_bopomofo(text)
163
+ text = bopomofo_to_ipa(text)
164
+ text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
165
+ r'\1ɹ\2', text)
166
+ text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
167
+ text = re.sub(r'^\||[^\w\s_,\.\?!\|\'→↓↑⁼ʰ`]', '', text)
168
+ text = re.sub(r'([,.!?])', r'|\1', text)
169
+ text = re.sub(r'\|+', '|', text)
170
+ return text
171
+
172
+ # Convert Chinese to IPA
173
+ def chinese_to_ipa(text, text_tokenizer):
174
+ # phonemes = text_tokenizer(text.strip())
175
+ if type(text) == str:
176
+ return _chinese_to_ipa(text)
177
+ else:
178
+ result_ph = []
179
+ for t in text:
180
+ result_ph.append(_chinese_to_ipa(t))
181
+ return result_ph
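+ # Note that the Chinese path never touches the passed-in text_tokenizer: zh text goes
+ # through jieba + pypinyin and the bopomofo/IPA tables at the top of this file, while the
+ # espeak backend is only used for the other languages.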
182
+
183
+
184
+ _special_map = [
185
+ ('t|ɹ', 'tɹ'),
186
+ ('d|ɹ', 'dɹ'),
187
+ ('t|s', 'ts'),
188
+ ('d|z', 'dz'),
189
+ ('ɐ', 'ɚ'),
190
+ ('ᵻ', 'ɪ'),
191
+ ('əl', 'l'),
192
+ ('x', 'k'),
193
+ ('ɬ', 'l'),
194
+ ('ʔ', 't'),
195
+ ('n̩', 'n'),
196
+ ('oː|ɹ', 'oːɹ')
197
+ ]
198
+
199
+ # special map
200
+ def special_map(text):
201
+ for regex, replacement in _special_map:
202
+ regex = regex.replace("|", "\\|")  # escape "|" so it is treated literally in the regex below
203
+ while re.search(r'(^|[_|]){}([_|]|$)'.format(regex), text):
204
+ text = re.sub(r'(^|[_|]){}([_|]|$)'.format(regex), r'\1{}\2'.format(replacement), text)
205
+ text = re.sub(r'([,.!?])', r'|\1', text)
206
+ return text
207
+
208
+ def english_to_ipa(text, text_tokenizer):
209
+ # text = _english_to_ipa(text)
210
+ phonemes = text_tokenizer(text)
211
+ if type(text) == str:
212
+ return special_map(phonemes)
213
+ else:
214
+ result_ph = []
215
+ for phone in phonemes:
216
+ result_ph.append(special_map(phone))
217
+ return result_ph
218
+
219
+ def cjekfd_cleaners(text, language, text_tokenizers):
220
+
221
+ if language == 'zh':
222
+ return chinese_to_ipa(text, text_tokenizers['zh'])
223
+ elif language == 'en':
224
+ return english_to_ipa(text, text_tokenizers['en'])
225
+ else:
226
+ raise Exception('Unknown language: %s' % language)
228
+
229
+ class TextTokenizer:
230
+ """Phonemize Text."""
231
+
232
+ def __init__(
233
+ self,
234
+ language="en-us",
235
+ backend="espeak",
236
+ separator=Separator(word="|_|", syllable="-", phone="|"),
237
+ preserve_punctuation=False,
238
+ punctuation_marks: Union[str, Pattern] = Punctuation.default_marks(),
239
+ with_stress: bool = False,
240
+ tie: Union[bool, str] = False,
241
+ language_switch: LanguageSwitch = "remove-flags",
242
+ words_mismatch: WordMismatch = "ignore",
243
+ ) -> None:
244
+ self.backend = EspeakBackend(
245
+ language,
246
+ punctuation_marks=punctuation_marks,
247
+ preserve_punctuation=preserve_punctuation,
248
+ with_stress=with_stress,
249
+ tie=tie,
250
+ language_switch=language_switch,
251
+ words_mismatch=words_mismatch,
252
+ )
253
+
254
+ self.separator = separator
255
+
256
+ def __call__(self, text, strip=True) -> List[str]:
257
+
258
+ text_type = type(text)
259
+ text = [re.sub(r'[^\w\s_,\.\?!\|\']', '', line.strip()) for line in str2list(text)]
260
+ phonemized = self.backend.phonemize(
261
+ text, separator=self.separator, strip=strip, njobs=1
262
+ )
263
+ if text_type == str:
264
+ return list2str(phonemized)
265
+ return phonemized
266
+
267
+ class PhonemeBpeTokenizer:
268
+
269
+ def __init__(self, vacab_path="./utils/g2p/mls_en.json"):
270
+ self.lang2backend = {
271
+ 'zh': "cmn",
272
+ 'ja': "ja",
273
+ "en": "en-us",
274
+ "fr": "fr-fr",
275
+ "ko": "ko",
276
+ "de": "de",
277
+ }
278
+ self.text_tokenizers = {}
279
+ self.int_text_tokenizers()
280
+
281
+ with open(vacab_path, 'r') as f:
282
+ json_data = f.read()
283
+ data = json.loads(json_data)
284
+ self.vocab = data['vocab']
285
+
286
+ def int_text_tokenizers(self):
287
+ for key, value in self.lang2backend.items():
288
+ self.text_tokenizers[key] = TextTokenizer(language=value)
289
+
290
+ def tokenize(self, text, language):
291
+
292
+ # 1. convert text to phoneme
293
+ phonemes = self._clean_text(text, language, ['cjekfd_cleaners'])
294
+ # print('clean text: ', phonemes)
295
+
296
+ # 2. tokenize phonemes
297
+ phoneme_tokens = self.phoneme2token(phonemes)
298
+
299
+ return phonemes, phoneme_tokens
300
+
301
+ def _clean_text(self, text, language, cleaner_names):
302
+
303
+ text = cjekfd_cleaners(text, language, self.text_tokenizers)
304
+ return text
305
+
306
+ def phoneme2token(self, phonemes):
307
+ tokens = []
308
+ if isinstance(phonemes, list):
309
+ for phone in phonemes:
310
+ phonemes_split = phone.split("|")
311
+ tokens.append([self.vocab[p] for p in phonemes_split if p in self.vocab])
312
+ else:
313
+ phonemes_split = phonemes.split("|")
314
+ tokens = [self.vocab[p] for p in phonemes_split if p in self.vocab]
315
+ return tokens
316
+
317
+ text_tokenizer = PhonemeBpeTokenizer()
318
+
319
+ def phonemizer_g2p(text, language):
320
+
321
+ return text_tokenizer.tokenize(text=text, language=language)
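+ # Illustrative usage (assumes an espeak-ng backend is installed; the exact phonemes depend
+ # on the backend and voice):
+ #   phonemes, tokens = phonemizer_g2p("hello world", "en")
+ #   # phonemes is the "|"-separated IPA string, tokens are the matching ids from mls_en.json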
utils/g2p/mls_emilia.json ADDED
@@ -0,0 +1,335 @@
1
+ {
2
+ "[UNK]": 0,
3
+ "_": 1,
4
+ "b": 2,
5
+ "d": 3,
6
+ "f": 4,
7
+ "h": 5,
8
+ "i": 6,
9
+ "j": 7,
10
+ "k": 8,
11
+ "l": 9,
12
+ "m": 10,
13
+ "n": 11,
14
+ "p": 12,
15
+ "r": 13,
16
+ "s": 14,
17
+ "t": 15,
18
+ "v": 16,
19
+ "w": 17,
20
+ "x": 18,
21
+ "z": 19,
22
+ "æ": 20,
23
+ "ç": 21,
24
+ "ð": 22,
25
+ "ŋ": 23,
26
+ "ɐ": 24,
27
+ "ɔ": 25,
28
+ "ə": 26,
29
+ "ɚ": 27,
30
+ "ɛ": 28,
31
+ "ɡ": 29,
32
+ "ɪ": 30,
33
+ "ɬ": 31,
34
+ "ɹ": 32,
35
+ "ɾ": 33,
36
+ "ʃ": 34,
37
+ "ʊ": 35,
38
+ "ʌ": 36,
39
+ "ʒ": 37,
40
+ "ʔ": 38,
41
+ "θ": 39,
42
+ "ᵻ": 40,
43
+ "aɪ": 41,
44
+ "aʊ": 42,
45
+ "dʒ": 43,
46
+ "eɪ": 44,
47
+ "iə": 45,
48
+ "iː": 46,
49
+ "n̩": 47,
50
+ "oʊ": 48,
51
+ "oː": 49,
52
+ "tʃ": 50,
53
+ "uː": 51,
54
+ "ææ": 52,
55
+ "ɐɐ": 53,
56
+ "ɑː": 54,
57
+ "ɑ̃": 55,
58
+ "ɔɪ": 56,
59
+ "ɔː": 57,
60
+ "ɔ̃": 58,
61
+ "əl": 59,
62
+ "ɛɹ": 60,
63
+ "ɜː": 61,
64
+ "ɡʲ": 62,
65
+ "ɪɹ": 63,
66
+ "ʊɹ": 64,
67
+ "aɪə": 65,
68
+ "aɪɚ": 66,
69
+ "iːː": 67,
70
+ "oːɹ": 68,
71
+ "ɑːɹ": 69,
72
+ "ɔːɹ": 70,
73
+
74
+ "1": 71,
75
+ "a": 72,
76
+ "e": 73,
77
+ "o": 74,
78
+ "q": 75,
79
+ "u": 76,
80
+ "y": 77,
81
+ "ɑ": 78,
82
+ "ɒ": 79,
83
+ "ɕ": 80,
84
+ "ɣ": 81,
85
+ "ɫ": 82,
86
+ "ɯ": 83,
87
+ "ʐ": 84,
88
+ "ʲ": 85,
89
+ "a1": 86,
90
+ "a2": 87,
91
+ "a5": 88,
92
+ "ai": 89,
93
+ "aɜ": 90,
94
+ "aː": 91,
95
+ "ei": 92,
96
+ "eə": 93,
97
+ "i.": 94,
98
+ "i1": 95,
99
+ "i2": 96,
100
+ "i5": 97,
101
+ "io": 98,
102
+ "iɑ": 99,
103
+ "iɛ": 100,
104
+ "iɜ": 101,
105
+ "i̪": 102,
106
+ "kh": 103,
107
+ "nʲ": 104,
108
+ "o1": 105,
109
+ "o2": 106,
110
+ "o5": 107,
111
+ "ou": 108,
112
+ "oɜ": 109,
113
+ "ph": 110,
114
+ "s.": 111,
115
+ "th": 112,
116
+ "ts": 113,
117
+ "tɕ": 114,
118
+ "u1": 115,
119
+ "u2": 116,
120
+ "u5": 117,
121
+ "ua": 118,
122
+ "uo": 119,
123
+ "uə": 120,
124
+ "uɜ": 121,
125
+ "y1": 122,
126
+ "y2": 123,
127
+ "y5": 124,
128
+ "yu": 125,
129
+ "yæ": 126,
130
+ "yə": 127,
131
+ "yɛ": 128,
132
+ "yɜ": 129,
133
+ "ŋɜ": 130,
134
+ "ŋʲ": 131,
135
+ "ɑ1": 132,
136
+ "ɑ2": 133,
137
+ "ɑ5": 134,
138
+ "ɑu": 135,
139
+ "ɑɜ": 136,
140
+ "ɑʲ": 137,
141
+ "ə1": 138,
142
+ "ə2": 139,
143
+ "ə5": 140,
144
+ "ər": 141,
145
+ "əɜ": 142,
146
+ "əʊ": 143,
147
+ "ʊə": 144,
148
+ "ai1": 145,
149
+ "ai2": 146,
150
+ "ai5": 147,
151
+ "aiɜ": 148,
152
+ "ei1": 149,
153
+ "ei2": 150,
154
+ "ei5": 151,
155
+ "eiɜ": 152,
156
+ "i.1": 153,
157
+ "i.2": 154,
158
+ "i.5": 155,
159
+ "i.ɜ": 156,
160
+ "io5": 157,
161
+ "iou": 158,
162
+ "iɑ1": 159,
163
+ "iɑ2": 160,
164
+ "iɑ5": 161,
165
+ "iɑɜ": 162,
166
+ "iɛ1": 163,
167
+ "iɛ2": 164,
168
+ "iɛ5": 165,
169
+ "iɛɜ": 166,
170
+ "i̪1": 167,
171
+ "i̪2": 168,
172
+ "i̪5": 169,
173
+ "i̪ɜ": 170,
174
+ "onɡ": 171,
175
+ "ou1": 172,
176
+ "ou2": 173,
177
+ "ou5": 174,
178
+ "ouɜ": 175,
179
+ "ts.": 176,
180
+ "tsh": 177,
181
+ "tɕh": 178,
182
+ "u5ʲ": 179,
183
+ "ua1": 180,
184
+ "ua2": 181,
185
+ "ua5": 182,
186
+ "uai": 183,
187
+ "uaɜ": 184,
188
+ "uei": 185,
189
+ "uo1": 186,
190
+ "uo2": 187,
191
+ "uo5": 188,
192
+ "uoɜ": 189,
193
+ "uə1": 190,
194
+ "uə2": 191,
195
+ "uə5": 192,
196
+ "uəɜ": 193,
197
+ "yiɜ": 194,
198
+ "yu2": 195,
199
+ "yu5": 196,
200
+ "yæ2": 197,
201
+ "yæ5": 198,
202
+ "yæɜ": 199,
203
+ "yə2": 200,
204
+ "yə5": 201,
205
+ "yəɜ": 202,
206
+ "yɛ1": 203,
207
+ "yɛ2": 204,
208
+ "yɛ5": 205,
209
+ "yɛɜ": 206,
210
+ "ɑu1": 207,
211
+ "ɑu2": 208,
212
+ "ɑu5": 209,
213
+ "ɑuɜ": 210,
214
+ "ər1": 211,
215
+ "ər2": 212,
216
+ "ər5": 213,
217
+ "ərɜ": 214,
218
+ "əː1": 215,
219
+ "iou1": 216,
220
+ "iou2": 217,
221
+ "iou5": 218,
222
+ "iouɜ": 219,
223
+ "onɡ1": 220,
224
+ "onɡ2": 221,
225
+ "onɡ5": 222,
226
+ "onɡɜ": 223,
227
+ "ts.h": 224,
228
+ "uai2": 225,
229
+ "uai5": 226,
230
+ "uaiɜ": 227,
231
+ "uei1": 228,
232
+ "uei2": 229,
233
+ "uei5": 230,
234
+ "ueiɜ": 231,
235
+ "uoɜʲ": 232,
236
+ "yɛ5ʲ": 233,
237
+ "ɑu2ʲ": 234,
238
+
239
+ "2": 235,
240
+ "5": 236,
241
+ "ɜ": 237,
242
+ "ʂ": 238,
243
+ "dʑ": 239,
244
+ "iɪ": 240,
245
+ "uɪ": 241,
246
+ "xʲ": 242,
247
+ "ɑt": 243,
248
+ "ɛɜ": 244,
249
+ "ɛː": 245,
250
+ "ɪː": 246,
251
+ "phʲ": 247,
252
+ "ɑ5ʲ": 248,
253
+ "ɑuʲ": 249,
254
+ "ərə": 250,
255
+ "uozʰ": 251,
256
+ "ər1ʲ": 252,
257
+ "tɕhtɕh": 253,
258
+
259
+ "c": 254,
260
+ "ʋ": 255,
261
+ "ʍ": 256,
262
+ "ʑ": 257,
263
+ "ː": 258,
264
+ "aə": 259,
265
+ "eː": 260,
266
+ "hʲ": 261,
267
+ "iʊ": 262,
268
+ "kʲ": 263,
269
+ "lʲ": 264,
270
+ "oə": 265,
271
+ "oɪ": 266,
272
+ "oʲ": 267,
273
+ "pʲ": 268,
274
+ "sʲ": 269,
275
+ "u4": 270,
276
+ "uʲ": 271,
277
+ "yi": 272,
278
+ "yʲ": 273,
279
+ "ŋ2": 274,
280
+ "ŋ5": 275,
281
+ "ŋ̩": 276,
282
+ "ɑɪ": 277,
283
+ "ɑʊ": 278,
284
+ "ɕʲ": 279,
285
+ "ət": 280,
286
+ "əə": 281,
287
+ "əɪ": 282,
288
+ "əʲ": 283,
289
+ "ɛ1": 284,
290
+ "ɛ5": 285,
291
+ "aiə": 286,
292
+ "aiɪ": 287,
293
+ "azʰ": 288,
294
+ "eiə": 289,
295
+ "eiɪ": 290,
296
+ "eiʊ": 291,
297
+ "i.ə": 292,
298
+ "i.ɪ": 293,
299
+ "i.ʊ": 294,
300
+ "ioɜ": 295,
301
+ "izʰ": 296,
302
+ "iɑə": 297,
303
+ "iɑʊ": 298,
304
+ "iɑʲ": 299,
305
+ "iɛə": 300,
306
+ "iɛɪ": 301,
307
+ "iɛʊ": 302,
308
+ "i̪ə": 303,
309
+ "i̪ʊ": 304,
310
+ "khʲ": 305,
311
+ "ouʲ": 306,
312
+ "tsʲ": 307,
313
+ "u2ʲ": 308,
314
+ "uoɪ": 309,
315
+ "uzʰ": 310,
316
+ "uɜʲ": 311,
317
+ "yæɪ": 312,
318
+ "yəʊ": 313,
319
+ "ərt": 314,
320
+ "ərɪ": 315,
321
+ "ərʲ": 316,
322
+ "əːt": 317,
323
+ "iouə": 318,
324
+ "iouʊ": 319,
325
+ "iouʲ": 320,
326
+ "iɛzʰ": 321,
327
+ "onɡə": 322,
328
+ "onɡɪ": 323,
329
+ "onɡʊ": 324,
330
+ "ouzʰ": 325,
331
+ "uai1": 326,
332
+ "ueiɪ": 327,
333
+ "ɑuzʰ": 328,
334
+ "iouzʰ": 329
335
+ }
utils/g2p/mls_en.json ADDED
@@ -0,0 +1,323 @@
1
+ {
2
+ "vocab": {
3
+ ",": 0,
4
+ ".": 1,
5
+ "?": 2,
6
+ "!": 3,
7
+ "_": 4,
8
+ "iː": 5,
9
+ "ɪ": 6,
10
+ "ɜː": 7,
11
+ "ɚ": 8,
12
+ "oːɹ": 9,
13
+ "ɔː": 10,
14
+ "ɔːɹ": 11,
15
+ "ɑː": 12,
16
+ "uː": 13,
17
+ "ʊ": 14,
18
+ "ɑːɹ": 15,
19
+ "ʌ": 16,
20
+ "ɛ": 17,
21
+ "æ": 18,
22
+ "eɪ": 19,
23
+ "aɪ": 20,
24
+ "ɔɪ": 21,
25
+ "aʊ": 22,
26
+ "oʊ": 23,
27
+ "ɪɹ": 24,
28
+ "ɛɹ": 25,
29
+ "ʊɹ": 26,
30
+ "p": 27,
31
+ "b": 28,
32
+ "t": 29,
33
+ "d": 30,
34
+ "k": 31,
35
+ "ɡ": 32,
36
+ "f": 33,
37
+ "v": 34,
38
+ "θ": 35,
39
+ "ð": 36,
40
+ "s": 37,
41
+ "z": 38,
42
+ "ʃ": 39,
43
+ "ʒ": 40,
44
+ "h": 41,
45
+ "tʃ": 42,
46
+ "dʒ": 43,
47
+ "m": 44,
48
+ "n": 45,
49
+ "ŋ": 46,
50
+ "j": 47,
51
+ "w": 48,
52
+ "ɹ": 49,
53
+ "l": 50,
54
+ "tɹ": 51,
55
+ "dɹ": 52,
56
+ "ts": 53,
57
+ "dz": 54,
58
+ "i": 55,
59
+ "ɔ": 56,
60
+ "ə": 57,
61
+ "ɾ": 58,
62
+ "iə": 59,
63
+ "r": 60,
64
+ "u": 61,
65
+ "oː": 62,
66
+ "ɛː": 63,
67
+ "ɪː": 64,
68
+ "aɪə": 65,
69
+ "aɪɚ": 66,
70
+ "ɑ̃": 67,
71
+ "ç": 68,
72
+ "ɔ̃": 69,
73
+ "ææ": 70,
74
+ "ɐɐ": 71,
75
+ "ɡʲ": 72,
76
+ "nʲ": 73,
77
+ "iːː": 74,
78
+
79
+ "p⁼": 75,
80
+ "pʰ": 76,
81
+ "t⁼": 77,
82
+ "tʰ": 78,
83
+ "k⁼": 79,
84
+ "kʰ": 80,
85
+ "x": 81,
86
+ "tʃ⁼": 82,
87
+ "tʃʰ": 83,
88
+ "ts`⁼": 84,
89
+ "ts`ʰ": 85,
90
+ "s`": 86,
91
+ "ɹ`": 87,
92
+ "ts⁼": 88,
93
+ "tsʰ": 89,
94
+ "p⁼wo": 90,
95
+ "p⁼wo→": 91,
96
+ "p⁼wo↑": 92,
97
+ "p⁼wo↓↑": 93,
98
+ "p⁼wo↓": 94,
99
+ "pʰwo": 95,
100
+ "pʰwo→": 96,
101
+ "pʰwo↑": 97,
102
+ "pʰwo↓↑": 98,
103
+ "pʰwo↓": 99,
104
+ "mwo": 100,
105
+ "mwo→": 101,
106
+ "mwo↑": 102,
107
+ "mwo↓↑": 103,
108
+ "mwo↓": 104,
109
+ "fwo": 105,
110
+ "fwo→": 106,
111
+ "fwo↑": 107,
112
+ "fwo↓↑": 108,
113
+ "fwo↓": 109,
114
+ "jɛn": 110,
115
+ "jɛn→": 111,
116
+ "jɛn↑": 112,
117
+ "jɛn↓↑": 113,
118
+ "jɛn↓": 114,
119
+ "ɥæn": 115,
120
+ "ɥæn→": 116,
121
+ "ɥæn↑": 117,
122
+ "ɥæn↓↑": 118,
123
+ "ɥæn↓": 119,
124
+ "in": 120,
125
+ "in→": 121,
126
+ "in↑": 122,
127
+ "in↓↑": 123,
128
+ "in↓": 124,
129
+ "ɥn": 125,
130
+ "ɥn→": 126,
131
+ "ɥn↑": 127,
132
+ "ɥn↓↑": 128,
133
+ "ɥn↓": 129,
134
+ "iŋ": 130,
135
+ "iŋ→": 131,
136
+ "iŋ↑": 132,
137
+ "iŋ↓↑": 133,
138
+ "iŋ↓": 134,
139
+ "ʊŋ": 135,
140
+ "ʊŋ→": 136,
141
+ "ʊŋ↑": 137,
142
+ "ʊŋ↓↑": 138,
143
+ "ʊŋ↓": 139,
144
+ "jʊŋ": 140,
145
+ "jʊŋ→": 141,
146
+ "jʊŋ↑": 142,
147
+ "jʊŋ↓↑": 143,
148
+ "jʊŋ↓": 144,
149
+ "ia": 145,
150
+ "ia→": 146,
151
+ "ia↑": 147,
152
+ "ia↓↑": 148,
153
+ "ia↓": 149,
154
+ "iɛ": 150,
155
+ "iɛ→": 151,
156
+ "iɛ↑": 152,
157
+ "iɛ↓↑": 153,
158
+ "iɛ↓": 154,
159
+ "iɑʊ": 155,
160
+ "iɑʊ→": 156,
161
+ "iɑʊ↑": 157,
162
+ "iɑʊ↓↑": 158,
163
+ "iɑʊ↓": 159,
164
+ "ioʊ": 160,
165
+ "ioʊ→": 161,
166
+ "ioʊ↑": 162,
167
+ "ioʊ↓↑": 163,
168
+ "ioʊ↓": 164,
169
+ "iɑŋ": 165,
170
+ "iɑŋ→": 166,
171
+ "iɑŋ↑": 167,
172
+ "iɑŋ↓↑": 168,
173
+ "iɑŋ↓": 169,
174
+ "ua": 170,
175
+ "ua→": 171,
176
+ "ua↑": 172,
177
+ "ua↓↑": 173,
178
+ "ua↓": 174,
179
+ "uo": 175,
180
+ "uo→": 176,
181
+ "uo↑": 177,
182
+ "uo↓↑": 178,
183
+ "uo↓": 179,
184
+ "uaɪ": 180,
185
+ "uaɪ→": 181,
186
+ "uaɪ↑": 182,
187
+ "uaɪ↓↑": 183,
188
+ "uaɪ↓": 184,
189
+ "ueɪ": 185,
190
+ "ueɪ→": 186,
191
+ "ueɪ↑": 187,
192
+ "ueɪ↓↑": 188,
193
+ "ueɪ↓": 189,
194
+ "uan": 190,
195
+ "uan→": 191,
196
+ "uan↑": 192,
197
+ "uan↓↑": 193,
198
+ "uan↓": 194,
199
+ "uən": 195,
200
+ "uən→": 196,
201
+ "uən↑": 197,
202
+ "uən↓↑": 198,
203
+ "uən↓": 199,
204
+ "uɑŋ": 200,
205
+ "uɑŋ→": 201,
206
+ "uɑŋ↑": 202,
207
+ "uɑŋ↓↑": 203,
208
+ "uɑŋ↓": 204,
209
+ "ɥɛ": 205,
210
+ "ɥɛ→": 206,
211
+ "ɥɛ↑": 207,
212
+ "ɥɛ↓↑": 208,
213
+ "ɥɛ↓": 209,
214
+ "a": 210,
215
+ "a→": 211,
216
+ "a↑": 212,
217
+ "a↓↑": 213,
218
+ "a↓": 214,
219
+ "o": 215,
220
+ "o→": 216,
221
+ "o↑": 217,
222
+ "o↓↑": 218,
223
+ "o↓": 219,
224
+ "ə→": 220,
225
+ "ə↑": 221,
226
+ "ə↓↑": 222,
227
+ "ə↓": 223,
228
+ "ɛ→": 224,
229
+ "ɛ↑": 225,
230
+ "ɛ↓↑": 226,
231
+ "ɛ↓": 227,
232
+ "aɪ→": 228,
233
+ "aɪ↑": 229,
234
+ "aɪ↓↑": 230,
235
+ "aɪ↓": 231,
236
+ "eɪ→": 232,
237
+ "eɪ↑": 233,
238
+ "eɪ↓↑": 234,
239
+ "eɪ↓": 235,
240
+ "ɑʊ": 236,
241
+ "ɑʊ→": 237,
242
+ "ɑʊ↑": 238,
243
+ "ɑʊ↓↑": 239,
244
+ "ɑʊ↓": 240,
245
+ "oʊ→": 241,
246
+ "oʊ↑": 242,
247
+ "oʊ↓↑": 243,
248
+ "oʊ↓": 244,
249
+ "an": 245,
250
+ "an→": 246,
251
+ "an↑": 247,
252
+ "an↓↑": 248,
253
+ "an↓": 249,
254
+ "ən": 250,
255
+ "ən→": 251,
256
+ "ən↑": 252,
257
+ "ən↓↑": 253,
258
+ "ən↓": 254,
259
+ "ɑŋ": 255,
260
+ "ɑŋ→": 256,
261
+ "ɑŋ↑": 257,
262
+ "ɑŋ↓↑": 258,
263
+ "ɑŋ↓": 259,
264
+ "əŋ": 260,
265
+ "əŋ→": 261,
266
+ "əŋ↑": 262,
267
+ "əŋ↓↑": 263,
268
+ "əŋ↓": 264,
269
+ "əɹ": 265,
270
+ "əɹ→": 266,
271
+ "əɹ↑": 267,
272
+ "əɹ↓↑": 268,
273
+ "əɹ↓": 269,
274
+ "i→": 270,
275
+ "i↑": 271,
276
+ "i↓↑": 272,
277
+ "i↓": 273,
278
+ "u→": 274,
279
+ "u↑": 275,
280
+ "u↓↑": 276,
281
+ "u↓": 277,
282
+ "ɥ": 278,
283
+ "ɥ→": 279,
284
+ "ɥ↑": 280,
285
+ "ɥ↓↑": 281,
286
+ "ɥ↓": 282,
287
+ "ts`⁼ɹ": 283,
288
+ "ts`⁼ɹ→": 284,
289
+ "ts`⁼ɹ↑": 285,
290
+ "ts`⁼ɹ↓↑": 286,
291
+ "ts`⁼ɹ↓": 287,
292
+ "ts`ʰɹ": 288,
293
+ "ts`ʰɹ→": 289,
294
+ "ts`ʰɹ↑": 290,
295
+ "ts`ʰɹ↓↑": 291,
296
+ "ts`ʰɹ↓": 292,
297
+ "s`ɹ": 293,
298
+ "s`ɹ→": 294,
299
+ "s`ɹ↑": 295,
300
+ "s`ɹ↓↑": 296,
301
+ "s`ɹ↓": 297,
302
+ "ɹ`ɹ": 298,
303
+ "ɹ`ɹ→": 299,
304
+ "ɹ`ɹ↑": 300,
305
+ "ɹ`ɹ↓↑": 301,
306
+ "ɹ`ɹ↓": 302,
307
+ "ts⁼ɹ": 303,
308
+ "ts⁼ɹ→": 304,
309
+ "ts⁼ɹ↑": 305,
310
+ "ts⁼ɹ↓↑": 306,
311
+ "ts⁼ɹ↓": 307,
312
+ "tsʰɹ": 308,
313
+ "tsʰɹ→": 309,
314
+ "tsʰɹ↑": 310,
315
+ "tsʰɹ↓↑": 311,
316
+ "tsʰɹ↓": 312,
317
+ "sɹ": 313,
318
+ "sɹ→": 314,
319
+ "sɹ↑": 315,
320
+ "sɹ↓↑": 316,
321
+ "sɹ↓": 317
322
+ }
323
+ }