Upload folder using huggingface_hub
- .gitattributes +9 -0
- .ipynb_checkpoints/Spark_TTS_FT-checkpoint.ipynb +1211 -0
- .ipynb_checkpoints/config-checkpoint.yaml +7 -0
- BiCodec/config.yaml +60 -0
- BiCodec/model.safetensors +3 -0
- LLM/.gitattributes +36 -0
- LLM/added_tokens.json +0 -0
- LLM/chat_template.jinja +54 -0
- LLM/config.json +56 -0
- LLM/generation_config.json +8 -0
- LLM/merges.txt +0 -0
- LLM/model.safetensors +3 -0
- LLM/special_tokens_map.json +31 -0
- LLM/tokenizer.json +3 -0
- LLM/tokenizer_config.json +0 -0
- LLM/vocab.json +0 -0
- README.md +208 -0
- Spark_TTS_FT.ipynb +1732 -0
- config.yaml +7 -0
- src/figures/gradio_TTS.png +0 -0
- src/figures/gradio_control.png +0 -0
- src/figures/infer_control.png +3 -0
- src/figures/infer_voice_cloning.png +3 -0
- src/logo/HKUST.jpg +3 -0
- src/logo/NPU.jpg +3 -0
- src/logo/NTU.jpg +0 -0
- src/logo/SJU.jpg +3 -0
- src/logo/SparkAudio.jpg +0 -0
- src/logo/SparkAudio2.jpg +0 -0
- src/logo/SparkTTS.jpg +0 -0
- src/logo/SparkTTS.png +3 -0
- src/logo/mobvoi.jpg +3 -0
- src/logo/mobvoi.png +3 -0
- wav2vec2-large-xlsr-53/README.md +29 -0
- wav2vec2-large-xlsr-53/config.json +83 -0
- wav2vec2-large-xlsr-53/preprocessor_config.json +9 -0
- wav2vec2-large-xlsr-53/pytorch_model.bin +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+LLM/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+src/figures/infer_control.png filter=lfs diff=lfs merge=lfs -text
+src/figures/infer_voice_cloning.png filter=lfs diff=lfs merge=lfs -text
+src/logo/HKUST.jpg filter=lfs diff=lfs merge=lfs -text
+src/logo/NPU.jpg filter=lfs diff=lfs merge=lfs -text
+src/logo/SJU.jpg filter=lfs diff=lfs merge=lfs -text
+src/logo/SparkTTS.png filter=lfs diff=lfs merge=lfs -text
+src/logo/mobvoi.jpg filter=lfs diff=lfs merge=lfs -text
+src/logo/mobvoi.png filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/Spark_TTS_FT-checkpoint.ipynb
ADDED
@@ -0,0 +1,1211 @@
[markdown cell]
To run this, press "*Runtime*" and press "*Run all*" on a **free** Tesla T4 Google Colab instance!
<div class="align-center">
<a href="https://unsloth.ai/"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
<a href="https://discord.gg/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord button.png" width="145"></a>
<a href="https://docs.unsloth.ai/"><img src="https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true" width="125"></a> Join Discord if you need help + ⭐ <i>Star us on <a href="https://github.com/unslothai/unsloth">Github</a></i> ⭐
</div>

To install Unsloth on your own computer, follow the installation instructions on our Github page [here](https://docs.unsloth.ai/get-started/installing-+-updating).

You will learn how to do [data prep](#Data), how to [train](#Train), how to [run the model](#Inference), & [how to save it](#Save).
[markdown cell]
### News

[markdown cell]
Unsloth now supports Text-to-Speech (TTS) models. Read our [guide here](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning).

Read our **[Gemma 3N Guide](https://docs.unsloth.ai/basics/gemma-3n-how-to-run-and-fine-tune)** and check out our new **[Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs)** quants, which outperform other quantization methods!

Visit our docs for all our [model uploads](https://docs.unsloth.ai/get-started/all-our-models) and [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks).
[markdown cell]
### Installation
[code cell]
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    %pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use `pip install unsloth`.
    %pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    %pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    %pip install --no-deps unsloth
# Clone Spark-TTS for the BiCodec audio tokenizer code, plus its extra dependencies.
!git clone https://github.com/SparkAudio/Spark-TTS
%pip install omegaconf einx
[markdown cell]
### Unsloth

`FastModel` supports loading nearly any model now! This includes Vision and Text models!

Thank you to [Etherll](https://huggingface.co/Etherll) for creating this notebook!
[code cell]
from unsloth import FastModel
import torch
from huggingface_hub import snapshot_download

max_seq_length = 2048  # Choose any for long context!

fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",
    # Qwen3 new models
    "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    # Other very popular models!
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/Llama-3.3-70B",
    "unsloth/mistral-7b-instruct-v0.3",
    "unsloth/Phi-4",
]  # More models at https://huggingface.co/unsloth

# Download model and code
snapshot_download("unsloth/Spark-TTS-0.5B", local_dir = "Spark-TTS-0.5B")

model, tokenizer = FastModel.from_pretrained(
    model_name = "Spark-TTS-0.5B/LLM",
    max_seq_length = max_seq_length,
    dtype = torch.float32,   # Spark seems to only work on float32 for now
    full_finetuning = True,  # We support full finetuning now!
    load_in_4bit = False,
    # token = "hf_...",      # use one if using gated models like meta-llama/Llama-2-7b-hf
)

[output]
==((====))== Unsloth 2025.8.1: Fast Qwen2 patching. Transformers: 4.54.1.
Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Float16 full finetuning uses more memory since we upcast weights to float32.
[markdown cell]
We now add LoRA adapters so we only need to update 1 to 10% of all parameters! (Note that with `full_finetuning = True` above, this step is a no-op and all parameters are trained; see the output of the next cell.)
[code cell]
# LoRA does not work with float32; it only works with bfloat16!
model = FastModel.get_peft_model(
    model,
    r = 128,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 128,
    lora_dropout = 0,  # Supports any, but = 0 is optimized
    bias = "none",     # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

[output]
Unsloth: Full finetuning is enabled, so .get_peft_model has no effect
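Since full float32 finetuning is enabled above, `get_peft_model` is a no-op here. Below is a minimal sketch of how one might run an actual LoRA finetune instead, assuming a bfloat16-capable GPU (Ampere or newer); the parameter values simply mirror the cell above and are not prescribed by this notebook.

```python
# Hypothetical alternative: LoRA training instead of full float32 finetuning.
# Assumes a bfloat16-capable GPU; values copied from the cells above.
import torch
from unsloth import FastModel

model, tokenizer = FastModel.from_pretrained(
    model_name = "Spark-TTS-0.5B/LLM",
    max_seq_length = 2048,
    dtype = torch.bfloat16,    # LoRA path needs bfloat16, per the note above
    full_finetuning = False,   # enable PEFT so get_peft_model takes effect
    load_in_4bit = False,
)
model = FastModel.get_peft_model(
    model,
    r = 128,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 128,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)
```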
[markdown cell]
<a name="Data"></a>
### Data Prep

This run loads the `Balaji-1904/TTS_KN_DS_V1.1` dataset (the original template used `MrDragonFox/Elise`, which is designed for training TTS models). Ensure that your dataset follows the required format: **text, audio** for single-speaker models or **source, text, audio** for multi-speaker models. You can modify this section to accommodate your own dataset, but maintaining the correct structure is essential for optimal training.
[code cell]
from datasets import load_dataset
dataset = load_dataset("Balaji-1904/TTS_KN_DS_V1.1", split = "train")
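A quick sanity check on the loaded split can save a failed `map` later. This is an illustrative sketch, not part of the original notebook; `text` and `audio` are the column names the formatting function below expects.

```python
# Illustrative check that the dataset matches the expected text/audio layout.
print(dataset)                      # number of rows and column names
example = dataset[0]
print(example["text"])              # transcript for the first clip
print(example["audio"]["sampling_rate"], len(example["audio"]["array"]))
# Multi-speaker datasets would additionally carry a "source" (speaker) column.
```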
[code cell]
#@title Tokenization Function

import locale
import torchaudio.transforms as T
import os
import torch
import sys
import numpy as np
sys.path.append('Spark-TTS')
from sparktts.models.audio_tokenizer import BiCodecTokenizer
from sparktts.utils.audio import audio_volume_normalize

audio_tokenizer = BiCodecTokenizer("Spark-TTS-0.5B", "cuda")

def extract_wav2vec2_features(wavs: torch.Tensor) -> torch.Tensor:
    """Extract wav2vec2 features (mean of hidden states 11, 14 and 16)."""
    if wavs.shape[0] != 1:
        raise ValueError(f"Expected batch size 1, but got shape {wavs.shape}")
    wav_np = wavs.squeeze(0).cpu().numpy()

    processed = audio_tokenizer.processor(
        wav_np,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True,
    )
    input_values = processed.input_values
    input_values = input_values.to(audio_tokenizer.feature_extractor.device)

    model_output = audio_tokenizer.feature_extractor(input_values)

    if model_output.hidden_states is None:
        raise ValueError("Wav2Vec2Model did not return hidden states. Ensure config `output_hidden_states=True`.")

    num_layers = len(model_output.hidden_states)
    required_layers = [11, 14, 16]
    if any(l >= num_layers for l in required_layers):
        raise IndexError(f"Requested hidden state indices {required_layers} out of range for model with {num_layers} layers.")

    feats_mix = (
        model_output.hidden_states[11] + model_output.hidden_states[14] + model_output.hidden_states[16]
    ) / 3
    return feats_mix

def formatting_audio_func(example):
    text = f"{example['source']}: {example['text']}" if "source" in example else example["text"]
    audio_array = example["audio"]["array"]
    sampling_rate = example["audio"]["sampling_rate"]

    target_sr = audio_tokenizer.config['sample_rate']

    if sampling_rate != target_sr:
        resampler = T.Resample(orig_freq=sampling_rate, new_freq=target_sr)
        audio_tensor_temp = torch.from_numpy(audio_array).float()
        audio_array = resampler(audio_tensor_temp).numpy()

    if audio_tokenizer.config["volume_normalize"]:
        audio_array = audio_volume_normalize(audio_array)

    ref_wav_np = audio_tokenizer.get_ref_clip(audio_array)

    audio_tensor = torch.from_numpy(audio_array).unsqueeze(0).float().to(audio_tokenizer.device)
    ref_wav_tensor = torch.from_numpy(ref_wav_np).unsqueeze(0).float().to(audio_tokenizer.device)

    feat = extract_wav2vec2_features(audio_tensor)

    batch = {
        "wav": audio_tensor,
        "ref_wav": ref_wav_tensor,
        "feat": feat.to(audio_tokenizer.device),
    }

    semantic_token_ids, global_token_ids = audio_tokenizer.model.tokenize(batch)

    global_tokens = "".join(
        [f"<|bicodec_global_{i}|>" for i in global_token_ids.squeeze().cpu().numpy()]  # Squeeze batch dim
    )
    semantic_tokens = "".join(
        [f"<|bicodec_semantic_{i}|>" for i in semantic_token_ids.squeeze().cpu().numpy()]  # Squeeze batch dim
    )

    inputs = [
        "<|task_tts|>",
        "<|start_content|>",
        text,
        "<|end_content|>",
        "<|start_global_token|>",
        global_tokens,
        "<|end_global_token|>",
        "<|start_semantic_token|>",
        semantic_tokens,
        "<|end_semantic_token|>",
        "<|im_end|>"
    ]
    inputs = "".join(inputs)
    return {"text": inputs}

dataset = dataset.map(formatting_audio_func, remove_columns=["audio"])
print("Moving Bicodec model and Wav2Vec2Model to cpu.")
audio_tokenizer.model.cpu()
audio_tokenizer.feature_extractor.cpu()
torch.cuda.empty_cache()

[output]
FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.
Missing tensor: mel_transformer.spectrogram.window
Missing tensor: mel_transformer.mel_scale.fb
WARNING:datasets.fingerprint: Parameter 'function'=<function formatting_audio_func> couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work.
Map: 88% | 354/401 [03:01<00:22, 2.11 examples/s]
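After the `map` call, each row's `text` field is a single prompt string containing the transcript plus the BiCodec global and semantic tokens. A quick way to eyeball the result (illustrative, not in the original notebook):

```python
# Inspect the formatted training string produced by formatting_audio_func.
sample = dataset[0]["text"]
print(sample[:300])  # starts with <|task_tts|><|start_content|>...<|start_global_token|>...
print(sample.count("<|bicodec_global_"), "global tokens,",
      sample.count("<|bicodec_semantic_"), "semantic tokens")
```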
[markdown cell]
<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). The original template capped training at 60 steps to speed things up; here `max_steps` is commented out and the run does `num_train_epochs = 5` full passes instead. We also support TRL's `DPOTrainer`!
[code cell]
from trl import SFTConfig, SFTTrainer
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False,  # Can make training 5x faster for short sequences.
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 5,  # Number of full passes over the training set.
        # max_steps = 60,
        learning_rate = 2e-4,
        fp16 = False,  # We're training in full float32, so disable mixed precision
        bf16 = False,  # We're training in full float32, so disable mixed precision
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "tensorboard",  # Use this for WandB etc
    ),
)
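With these settings the effective batch size is per_device_train_batch_size × gradient_accumulation_steps = 2 × 4 = 8. Assuming the ~401-example split shown in the data-prep progress output, that is roughly ceil(401 / 8) = 51 optimizer steps per epoch, or about 255 steps over 5 epochs; a quick check:

```python
# Rough step count for this run; num_examples is taken from the Map progress bar above.
import math

num_examples = 401
effective_batch = 2 * 4                                      # per-device batch x grad accumulation
steps_per_epoch = math.ceil(num_examples / effective_batch)  # 51
print(effective_batch, steps_per_epoch, steps_per_epoch * 5) # 8 51 255
```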
[code cell]
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
[code cell]
trainer_stats = trainer.train()
[code cell]
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
[markdown cell]
<a name="Inference"></a>
### Inference
Let's run the model! You can change the prompts.
[code cell]
input_text = "Hey there my name is Elise, <giggles> and I'm a speech generation model that can sound like a person."

chosen_voice = None  # None for single-speaker
[code cell]
#@title Run Inference

import torch
import re
import numpy as np
from typing import Dict, Any
import torchaudio.transforms as T

FastModel.for_inference(model)  # Enable native 2x faster inference

@torch.inference_mode()
def generate_speech_from_text(
    text: str,
    temperature: float = 0.8,          # Generation temperature
    top_k: int = 50,                   # Generation top_k
    top_p: float = 1,                  # Generation top_p
    max_new_audio_tokens: int = 2048,  # Max tokens for audio part
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
) -> np.ndarray:
    """
    Generates speech audio from text using default voice control parameters.

    Args:
        text (str): The text input to be converted to speech.
        temperature (float): Sampling temperature for generation.
        top_k (int): Top-k sampling parameter.
        top_p (float): Top-p (nucleus) sampling parameter.
        max_new_audio_tokens (int): Max number of new tokens to generate (limits audio length).
        device (torch.device): Device to run inference on.

    Returns:
        np.ndarray: Generated waveform as a NumPy array.
    """
    torch.compiler.reset()

    prompt = "".join([
        "<|task_tts|>",
        "<|start_content|>",
        text,
        "<|end_content|>",
        "<|start_global_token|>"
    ])

    model_inputs = tokenizer([prompt], return_tensors="pt").to(device)

    print("Generating token sequence...")
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_audio_tokens,  # Limit generation length
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        eos_token_id=tokenizer.eos_token_id,  # Stop token
        pad_token_id=tokenizer.pad_token_id   # Use model's pad token id
    )
    print("Token sequence generated.")

    generated_ids_trimmed = generated_ids[:, model_inputs.input_ids.shape[1]:]

    predicts_text = tokenizer.batch_decode(generated_ids_trimmed, skip_special_tokens=False)[0]
    # print(f"\nGenerated Text (for parsing):\n{predicts_text}\n")  # Debugging

    # Extract semantic token IDs using regex
    semantic_matches = re.findall(r"<\|bicodec_semantic_(\d+)\|>", predicts_text)
    if not semantic_matches:
        print("Warning: No semantic tokens found in the generated output.")
        # Handle appropriately - perhaps return silence or raise an error
        return np.array([], dtype=np.float32)

    pred_semantic_ids = torch.tensor([int(token) for token in semantic_matches]).long().unsqueeze(0)  # Add batch dim

    # Extract global token IDs using regex (assuming controllable mode also generates these)
    global_matches = re.findall(r"<\|bicodec_global_(\d+)\|>", predicts_text)
    if not global_matches:
        print("Warning: No global tokens found in the generated output (controllable mode). Might use defaults or fail.")
        pred_global_ids = torch.zeros((1, 1), dtype=torch.long)
    else:
        pred_global_ids = torch.tensor([int(token) for token in global_matches]).long().unsqueeze(0)  # Add batch dim

    pred_global_ids = pred_global_ids.unsqueeze(0)  # Shape becomes (1, 1, N_global)

    print(f"Found {pred_semantic_ids.shape[1]} semantic tokens.")
    print(f"Found {pred_global_ids.shape[2]} global tokens.")

    # Detokenize using BiCodecTokenizer
    print("Detokenizing audio tokens...")
    # Ensure audio_tokenizer and its internal model are on the correct device
    audio_tokenizer.device = device
    audio_tokenizer.model.to(device)
    # Squeeze the extra dimension from global tokens as seen in the SparkTTS example
    wav_np = audio_tokenizer.detokenize(
        pred_global_ids.to(device).squeeze(0),  # Shape (1, N_global)
        pred_semantic_ids.to(device)            # Shape (1, N_semantic)
    )
    print("Detokenization complete.")

    return wav_np

if __name__ == "__main__":
    print(f"Generating speech for: '{input_text}'")
    text = f"{chosen_voice}: " + input_text if chosen_voice else input_text
    generated_waveform = generate_speech_from_text(text)  # pass the (optionally speaker-prefixed) text

    if generated_waveform.size > 0:
        import soundfile as sf
        output_filename = "generated_speech_controllable.wav"
        sample_rate = audio_tokenizer.config.get("sample_rate", 16000)
        sf.write(output_filename, generated_waveform, sample_rate)
        print(f"Audio saved to {output_filename}")

        # Optional: Play in notebook
        from IPython.display import Audio, display
        display(Audio(generated_waveform, rate=sample_rate))
    else:
        print("Audio generation failed (no tokens found?).")
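As a usage note (not part of the original cell), the same helper can be called over several lines of text or with different sampling settings; `temperature`, `top_k` and `top_p` are simply forwarded to `model.generate`.

```python
# Illustrative batch usage of the helper defined above.
import soundfile as sf

lines = [
    "Fine-tuning finished, this is the first test sentence.",
    "And this one is generated with a lower temperature.",
]
sr = audio_tokenizer.config.get("sample_rate", 16000)
for i, line in enumerate(lines):
    wav = generate_speech_from_text(line, temperature=0.6 if i else 0.8)
    if wav.size > 0:
        sf.write(f"sample_{i}.wav", wav, sr)
```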
[markdown cell]
<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Hugging Face's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!
[code cell]
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving
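A minimal sketch of reloading the saved checkpoint in a fresh session, assuming the `lora_model` directory produced above (with full finetuning enabled, this directory holds the full finetuned weights rather than adapters):

```python
# Reload the locally saved checkpoint for inference (illustrative, assumes ./lora_model exists).
import torch
from unsloth import FastModel

model, tokenizer = FastModel.from_pretrained(
    model_name = "lora_model",  # directory written by save_pretrained above
    max_seq_length = 2048,
    dtype = torch.float32,
    load_in_4bit = False,
)
FastModel.for_inference(model)
```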
[markdown cell]
### Saving to float16

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.
[code cell]
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False:
    model.save_pretrained("model")
    tokenizer.save_pretrained("model")
if False:
    model.push_to_hub("hf/model", token = "")
    tokenizer.push_to_hub("hf/model", token = "")

[output]
Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 15.1G
Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 3.99 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...
100%|██████████| 28/28 [00:01<00:00, 27.83it/s]
Unsloth: Saving tokenizer... Done.
Unsloth: Saving model/pytorch_model-00001-of-00002.bin...
Unsloth: Saving model/pytorch_model-00002-of-00002.bin...
Done.
[markdown cell]
And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/unsloth) channel! If you find any bugs, want to keep up with the latest LLM news, need help, or want to join projects, feel free to join our Discord!

Some other links:
1. Train your own reasoning model - Llama GRPO notebook [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-GRPO.ipynb)
2. Saving finetunes to Ollama. [Free notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
3. Llama 3.2 Vision finetuning - Radiography use case. [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb)
4. See notebooks for DPO, ORPO, Continued pretraining, conversational finetuning and more on our [documentation](https://docs.unsloth.ai/get-started/unsloth-notebooks)!

<div class="align-center">
  <a href="https://unsloth.ai"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
  <a href="https://discord.gg/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord.png" width="145"></a>
  <a href="https://docs.unsloth.ai/"><img src="https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true" width="125"></a>

  Join Discord if you need help + ⭐️ <i>Star us on <a href="https://github.com/unslothai/unsloth">Github</a></i> ⭐️
</div>
[notebook metadata]
Accelerator: GPU (Colab T4 / Kaggle Tesla T4); kernel "TTS_ft" (Python 3.12.3); nbformat 4.4. (ipywidgets display state for the dataset "Map" progress bar not reproduced.)
.ipynb_checkpoints/config-checkpoint.yaml
ADDED
@@ -0,0 +1,7 @@
1 |
+
highpass_cutoff_freq: 40
|
2 |
+
sample_rate: 16000
|
3 |
+
segment_duration: 2.4 # (s)
|
4 |
+
max_val_duration: 12 # (s)
|
5 |
+
latent_hop_length: 320
|
6 |
+
ref_segment_duration: 6
|
7 |
+
volume_normalize: true
|
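This checkpoint mirrors the top-level `config.yaml`: 16 kHz audio, 2.4 s training segments, a 6 s reference clip, and a 320-sample latent hop. As a minimal sketch (not part of the upload, and assuming `omegaconf`, which the fine-tuning notebook installs), the durations translate into sample counts like this:

```python
# Hypothetical helper, not shipped with the repo: load the audio config and
# derive segment lengths in samples from the durations given in seconds.
from omegaconf import OmegaConf

cfg = OmegaConf.load("config.yaml")                               # path assumed
segment_samples = int(cfg.segment_duration * cfg.sample_rate)     # 2.4 s -> 38400
ref_samples = int(cfg.ref_segment_duration * cfg.sample_rate)     # 6.0 s -> 96000
print(segment_samples, ref_samples, cfg.latent_hop_length)        # 38400 96000 320
```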
BiCodec/config.yaml
ADDED
@@ -0,0 +1,60 @@
1 |
+
audio_tokenizer:
|
2 |
+
mel_params:
|
3 |
+
sample_rate: 16000
|
4 |
+
n_fft: 1024
|
5 |
+
win_length: 640
|
6 |
+
hop_length: 320
|
7 |
+
mel_fmin: 10
|
8 |
+
mel_fmax: null
|
9 |
+
num_mels: 128
|
10 |
+
|
11 |
+
encoder:
|
12 |
+
input_channels: 1024
|
13 |
+
vocos_dim: 384
|
14 |
+
vocos_intermediate_dim: 2048
|
15 |
+
vocos_num_layers: 12
|
16 |
+
out_channels: 1024
|
17 |
+
sample_ratios: [1,1]
|
18 |
+
|
19 |
+
decoder:
|
20 |
+
input_channel: 1024
|
21 |
+
channels: 1536
|
22 |
+
rates: [8, 5, 4, 2]
|
23 |
+
kernel_sizes: [16,11,8,4]
|
24 |
+
|
25 |
+
quantizer:
|
26 |
+
input_dim: 1024
|
27 |
+
codebook_size: 8192
|
28 |
+
codebook_dim: 8
|
29 |
+
commitment: 0.25
|
30 |
+
codebook_loss_weight: 2.0
|
31 |
+
use_l2_normlize: True
|
32 |
+
threshold_ema_dead_code: 0.2
|
33 |
+
|
34 |
+
speaker_encoder:
|
35 |
+
input_dim: 128
|
36 |
+
out_dim: 1024
|
37 |
+
latent_dim: 128
|
38 |
+
token_num: 32
|
39 |
+
fsq_levels: [4, 4, 4, 4, 4, 4]
|
40 |
+
fsq_num_quantizers: 1
|
41 |
+
|
42 |
+
prenet:
|
43 |
+
input_channels: 1024
|
44 |
+
vocos_dim: 384
|
45 |
+
vocos_intermediate_dim: 2048
|
46 |
+
vocos_num_layers: 12
|
47 |
+
out_channels: 1024
|
48 |
+
condition_dim: 1024
|
49 |
+
sample_ratios: [1,1]
|
50 |
+
use_tanh_at_final: False
|
51 |
+
|
52 |
+
postnet:
|
53 |
+
input_channels: 1024
|
54 |
+
vocos_dim: 384
|
55 |
+
vocos_intermediate_dim: 2048
|
56 |
+
vocos_num_layers: 6
|
57 |
+
out_channels: 1024
|
58 |
+
use_tanh_at_final: False
|
59 |
+
|
60 |
+
|
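The `mel_params` block fixes a 128-bin mel spectrogram at 16 kHz with a 1024-point FFT, a 640-sample window, and a 320-sample hop (matching `latent_hop_length` above). BiCodec computes these features in its own pipeline; the sketch below is only an illustration, plugging the same values into a standard `torchaudio` transform:

```python
# Illustrative only: a torchaudio MelSpectrogram configured with the mel_params
# above. The real feature extraction lives inside BiCodec itself.
import torch
import torchaudio.transforms as T

mel = T.MelSpectrogram(
    sample_rate=16000,
    n_fft=1024,
    win_length=640,
    hop_length=320,   # one mel frame per latent hop
    f_min=10,
    f_max=None,       # mel_fmax: null -> Nyquist (8 kHz)
    n_mels=128,
)
wav = torch.randn(1, 16000)   # one second of dummy audio
print(mel(wav).shape)         # torch.Size([1, 128, 51])
```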
BiCodec/model.safetensors
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e9940cd48d4446e4340ced82d234bf5618350dd9f5db900ebe47a4fdb03867ec
|
3 |
+
size 625518756
|
LLM/.gitattributes
ADDED
@@ -0,0 +1,36 @@
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
LLM/added_tokens.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
LLM/chat_template.jinja
ADDED
@@ -0,0 +1,54 @@
1 |
+
{%- if tools %}
|
2 |
+
{{- '<|im_start|>system\n' }}
|
3 |
+
{%- if messages[0]['role'] == 'system' %}
|
4 |
+
{{- messages[0]['content'] }}
|
5 |
+
{%- else %}
|
6 |
+
{{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
|
7 |
+
{%- endif %}
|
8 |
+
{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
9 |
+
{%- for tool in tools %}
|
10 |
+
{{- "\n" }}
|
11 |
+
{{- tool | tojson }}
|
12 |
+
{%- endfor %}
|
13 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
14 |
+
{%- else %}
|
15 |
+
{%- if messages[0]['role'] == 'system' %}
|
16 |
+
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
|
17 |
+
{%- else %}
|
18 |
+
{{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
|
19 |
+
{%- endif %}
|
20 |
+
{%- endif %}
|
21 |
+
{%- for message in messages %}
|
22 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
|
23 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
24 |
+
{%- elif message.role == "assistant" %}
|
25 |
+
{{- '<|im_start|>' + message.role }}
|
26 |
+
{%- if message.content %}
|
27 |
+
{{- '\n' + message.content }}
|
28 |
+
{%- endif %}
|
29 |
+
{%- for tool_call in message.tool_calls %}
|
30 |
+
{%- if tool_call.function is defined %}
|
31 |
+
{%- set tool_call = tool_call.function %}
|
32 |
+
{%- endif %}
|
33 |
+
{{- '\n<tool_call>\n{"name": "' }}
|
34 |
+
{{- tool_call.name }}
|
35 |
+
{{- '", "arguments": ' }}
|
36 |
+
{{- tool_call.arguments | tojson }}
|
37 |
+
{{- '}\n</tool_call>' }}
|
38 |
+
{%- endfor %}
|
39 |
+
{{- '<|im_end|>\n' }}
|
40 |
+
{%- elif message.role == "tool" %}
|
41 |
+
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
|
42 |
+
{{- '<|im_start|>user' }}
|
43 |
+
{%- endif %}
|
44 |
+
{{- '\n<tool_response>\n' }}
|
45 |
+
{{- message.content }}
|
46 |
+
{{- '\n</tool_response>' }}
|
47 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
48 |
+
{{- '<|im_end|>\n' }}
|
49 |
+
{%- endif %}
|
50 |
+
{%- endif %}
|
51 |
+
{%- endfor %}
|
52 |
+
{%- if add_generation_prompt %}
|
53 |
+
{{- '<|im_start|>assistant\n' }}
|
54 |
+
{%- endif %}
|
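This is the stock Qwen2 chat template inherited from the base model; Spark-TTS builds its prompts from `<|task_tts|>`/`<|start_content|>`-style tokens instead (see the notebook below), but the template can still be exercised through the tokenizer. A hedged sketch, assuming the model has been downloaded to a local `Spark-TTS-0.5B/LLM` directory:

```python
# Sketch only: render the chat template above for a single user turn.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Spark-TTS-0.5B/LLM")   # local path assumed
messages = [{"role": "user", "content": "Hello"}]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # <|im_start|>system ... <|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n
```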
LLM/config.json
ADDED
@@ -0,0 +1,56 @@
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"Qwen2ForCausalLM"
|
4 |
+
],
|
5 |
+
"attention_dropout": 0.0,
|
6 |
+
"bos_token_id": 151643,
|
7 |
+
"eos_token_id": 151645,
|
8 |
+
"hidden_act": "silu",
|
9 |
+
"hidden_size": 896,
|
10 |
+
"initializer_range": 0.02,
|
11 |
+
"intermediate_size": 4864,
|
12 |
+
"layer_types": [
|
13 |
+
"full_attention",
|
14 |
+
"full_attention",
|
15 |
+
"full_attention",
|
16 |
+
"full_attention",
|
17 |
+
"full_attention",
|
18 |
+
"full_attention",
|
19 |
+
"full_attention",
|
20 |
+
"full_attention",
|
21 |
+
"full_attention",
|
22 |
+
"full_attention",
|
23 |
+
"full_attention",
|
24 |
+
"full_attention",
|
25 |
+
"full_attention",
|
26 |
+
"full_attention",
|
27 |
+
"full_attention",
|
28 |
+
"full_attention",
|
29 |
+
"full_attention",
|
30 |
+
"full_attention",
|
31 |
+
"full_attention",
|
32 |
+
"full_attention",
|
33 |
+
"full_attention",
|
34 |
+
"full_attention",
|
35 |
+
"full_attention",
|
36 |
+
"full_attention"
|
37 |
+
],
|
38 |
+
"max_position_embeddings": 32768,
|
39 |
+
"max_window_layers": 21,
|
40 |
+
"model_type": "qwen2",
|
41 |
+
"num_attention_heads": 14,
|
42 |
+
"num_hidden_layers": 24,
|
43 |
+
"num_key_value_heads": 2,
|
44 |
+
"pad_token_id": 151643,
|
45 |
+
"rms_norm_eps": 1e-06,
|
46 |
+
"rope_scaling": null,
|
47 |
+
"rope_theta": 1000000.0,
|
48 |
+
"sliding_window": null,
|
49 |
+
"tie_word_embeddings": true,
|
50 |
+
"torch_dtype": "float32",
|
51 |
+
"transformers_version": "4.54.1",
|
52 |
+
"unsloth_version": "2025.8.1",
|
53 |
+
"use_cache": true,
|
54 |
+
"use_sliding_window": false,
|
55 |
+
"vocab_size": 166000
|
56 |
+
}
|
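The config describes a Qwen2 causal LM with 24 layers, hidden size 896, and 14 attention heads (2 KV heads), i.e. the Qwen2.5-0.5B backbone, with the vocabulary grown to 166,000 entries to cover the added `<|bicodec_...|>` audio tokens used in the notebook below. A minimal sketch of loading it with plain `transformers` (the notebook itself uses Unsloth's `FastModel`); the local path is an assumption:

```python
# Sketch only: inspect the architecture and load the backbone with transformers.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

cfg = AutoConfig.from_pretrained("Spark-TTS-0.5B/LLM")       # local path assumed
print(cfg.model_type, cfg.hidden_size, cfg.num_hidden_layers, cfg.vocab_size)
# -> qwen2 896 24 166000

model = AutoModelForCausalLM.from_pretrained(
    "Spark-TTS-0.5B/LLM",
    torch_dtype=torch.float32,   # float32, as in the fine-tuning notebook
)
```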
LLM/generation_config.json
ADDED
@@ -0,0 +1,8 @@
1 |
+
{
|
2 |
+
"_from_model_config": true,
|
3 |
+
"bos_token_id": 151643,
|
4 |
+
"eos_token_id": 151645,
|
5 |
+
"max_length": 32768,
|
6 |
+
"pad_token_id": 151643,
|
7 |
+
"transformers_version": "4.54.1"
|
8 |
+
}
|
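These defaults (BOS/pad `151643`, EOS `151645`, max length 32,768) are picked up automatically by `model.generate()`; a short sketch of reading them explicitly, again assuming the local `Spark-TTS-0.5B/LLM` path:

```python
# Sketch only: load the generation defaults shipped next to the model weights.
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("Spark-TTS-0.5B/LLM")   # local path assumed
print(gen_cfg.bos_token_id, gen_cfg.eos_token_id, gen_cfg.pad_token_id, gen_cfg.max_length)
# -> 151643 151645 151643 32768
```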
LLM/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
LLM/model.safetensors
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de8e649c4c889e92eca6d18afbb7ea7be71ac874797c29e954f1ff89bfd4e237
|
3 |
+
size 2026568872
|
LLM/special_tokens_map.json
ADDED
@@ -0,0 +1,31 @@
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"<|im_start|>",
|
4 |
+
"<|im_end|>",
|
5 |
+
"<|object_ref_start|>",
|
6 |
+
"<|object_ref_end|>",
|
7 |
+
"<|box_start|>",
|
8 |
+
"<|box_end|>",
|
9 |
+
"<|quad_start|>",
|
10 |
+
"<|quad_end|>",
|
11 |
+
"<|vision_start|>",
|
12 |
+
"<|vision_end|>",
|
13 |
+
"<|vision_pad|>",
|
14 |
+
"<|image_pad|>",
|
15 |
+
"<|video_pad|>"
|
16 |
+
],
|
17 |
+
"eos_token": {
|
18 |
+
"content": "<|im_end|>",
|
19 |
+
"lstrip": false,
|
20 |
+
"normalized": false,
|
21 |
+
"rstrip": false,
|
22 |
+
"single_word": false
|
23 |
+
},
|
24 |
+
"pad_token": {
|
25 |
+
"content": "<|endoftext|>",
|
26 |
+
"lstrip": false,
|
27 |
+
"normalized": false,
|
28 |
+
"rstrip": false,
|
29 |
+
"single_word": false
|
30 |
+
}
|
31 |
+
}
|
LLM/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9c8b057d6ca205a429cc3428b9fc815f0d6ee1d53106dd5e5b129ef9db2ff057
|
3 |
+
size 14129172
|
LLM/tokenizer_config.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
LLM/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
README.md
ADDED
@@ -0,0 +1,208 @@
1 |
+
---
|
2 |
+
license: cc-by-nc-sa-4.0
|
3 |
+
language:
|
4 |
+
- en
|
5 |
+
- zh
|
6 |
+
tags:
|
7 |
+
- text-to-speech
|
8 |
+
library_tag: spark-tts
|
9 |
+
base_model:
|
10 |
+
- SparkAudio/Spark-TTS-0.5B
|
11 |
+
---
|
12 |
+
<div>
|
13 |
+
<p style="margin-bottom: 0; margin-top: 0;">
|
14 |
+
<strong>See <a href="https://huggingface.co/collections/unsloth/text-to-speech-tts-models-68007ab12522e96be1e02155">our collection</a> for all our TTS model uploads.</strong>
|
15 |
+
</p>
|
16 |
+
<p style="margin-bottom: 0;">
|
17 |
+
<em>Learn to fine-tune TTS models - <a href="https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning">Read our Guide</a>.</em>
|
18 |
+
</p>
|
19 |
+
<p style="margin-top: 0;margin-bottom: 0;">
|
20 |
+
<em><a href="https://docs.unsloth.ai/basics/unsloth-dynamic-v2.0-gguf">Unsloth Dynamic 2.0</a> achieves superior accuracy & outperforms other leading quants.</em>
|
21 |
+
</p>
|
22 |
+
<div style="display: flex; gap: 5px; align-items: center; ">
|
23 |
+
<a href="https://github.com/unslothai/unsloth/">
|
24 |
+
<img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="133">
|
25 |
+
</a>
|
26 |
+
<a href="https://discord.gg/unsloth">
|
27 |
+
<img src="https://github.com/unslothai/unsloth/raw/main/images/Discord%20button.png" width="173">
|
28 |
+
</a>
|
29 |
+
<a href="https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning">
|
30 |
+
<img src="https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/documentation%20green%20button.png" width="143">
|
31 |
+
</a>
|
32 |
+
</div>
|
33 |
+
<h1 style="margin-top: 0rem;">✨ Run & Fine-tune TTS models with Unsloth!</h1>
|
34 |
+
</div>
|
35 |
+
|
36 |
+
- Fine-tune TTS models for free using our Google [Colab notebooks here](https://docs.unsloth.ai/get-started/unsloth-notebooks#text-to-speech-tts-notebooks)!
|
37 |
+
- Read our Blog about TTS support: [unsloth.ai/blog/tts](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning)
|
38 |
+
|
39 |
+
| Unsloth supports | Free Notebooks | Performance | Memory use |
|
40 |
+
|-----------------|--------------------------------------------------------------------------------------------------------------------------|-------------|----------|
|
41 |
+
| **Spark-TTS** | [▶️ Start on Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Spark_TTS_(0_5B).ipynb) | 1.5x faster | 58% less |
|
42 |
+
| **Whisper Large V3** | [▶️ Start on Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Whisper.ipynb) | 1.5x faster | 50% less |
|
43 |
+
| **Qwen3 (14B)** | [▶️ Start on Colab](https://docs.unsloth.ai/get-started/unsloth-notebooks) | 2x faster | 70% less |
|
44 |
+
| **Llama 3.2 Vision (11B)** | [▶️ Start on Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb) | 1.8x faster | 50% less |
|
45 |
+
|
46 |
+
<div align="center">
|
47 |
+
<h1>
|
48 |
+
Spark-TTS
|
49 |
+
</h1>
|
50 |
+
<p>
|
51 |
+
Official model for <br>
|
52 |
+
<b><em>Spark-TTS: An Efficient LLM-Based Text-to-Speech Model with Single-Stream Decoupled Speech Tokens</em></b>
|
53 |
+
</p>
|
54 |
+
<p>
|
55 |
+
<img src="src/logo/SparkTTS.jpg" alt="Spark-TTS Logo" style="width: 200px; height: 200px;">
|
56 |
+
</p>
|
57 |
+
</div>
|
58 |
+
|
59 |
+
|
60 |
+
## Spark-TTS 🔥
|
61 |
+
|
62 |
+
### 👉🏻 [Spark-TTS Demos](https://sparkaudio.github.io/spark-tts/) 👈🏻
|
63 |
+
|
64 |
+
### 👉🏻 [Github Repo](https://github.com/SparkAudio/Spark-TTS) 👈🏻
|
65 |
+
|
66 |
+
### 👉🏻 [Paper](https://arxiv.org/pdf/2503.01710) 👈🏻
|
67 |
+
|
68 |
+
### Overview
|
69 |
+
|
70 |
+
Spark-TTS is an advanced text-to-speech system that uses the power of large language models (LLMs) for highly accurate and natural-sounding voice synthesis. It is designed to be efficient, flexible, and powerful for both research and production use.
|
71 |
+
|
72 |
+
### Key Features
|
73 |
+
|
74 |
+
- **Simplicity and Efficiency**: Built entirely on Qwen2.5, Spark-TTS eliminates the need for additional generation models like flow matching. Instead of relying on separate models to generate acoustic features, it directly reconstructs audio from the code predicted by the LLM. This approach streamlines the process, improving efficiency and reducing complexity.
|
75 |
+
- **High-Quality Voice Cloning**: Supports zero-shot voice cloning, which means it can replicate a speaker's voice even without specific training data for that voice. This is ideal for cross-lingual and code-switching scenarios, allowing for seamless transitions between languages and voices without requiring separate training for each one.
|
76 |
+
- **Bilingual Support**: Supports both Chinese and English, and is capable of zero-shot voice cloning for cross-lingual and code-switching scenarios, enabling the model to synthesize speech in multiple languages with high naturalness and accuracy.
|
77 |
+
- **Controllable Speech Generation**: Supports creating virtual speakers by adjusting parameters such as gender, pitch, and speaking rate.
|
78 |
+
|
79 |
+
---
|
80 |
+
|
81 |
+
<table align="center">
|
82 |
+
<tr>
|
83 |
+
<td align="center"><b>Inference Overview of Voice Cloning</b><br><img src="src/figures/infer_voice_cloning.png" width="80%" /></td>
|
84 |
+
</tr>
|
85 |
+
<tr>
|
86 |
+
<td align="center"><b>Inference Overview of Controlled Generation</b><br><img src="src/figures/infer_control.png" width="80%" /></td>
|
87 |
+
</tr>
|
88 |
+
</table>
|
89 |
+
|
90 |
+
|
91 |
+
## Install
|
92 |
+
**Clone and Install**
|
93 |
+
|
94 |
+
- Clone the repo
|
95 |
+
``` sh
|
96 |
+
git clone https://github.com/SparkAudio/Spark-TTS.git
|
97 |
+
cd Spark-TTS
|
98 |
+
```
|
99 |
+
|
100 |
+
- Install Conda: please see https://docs.conda.io/en/latest/miniconda.html
|
101 |
+
- Create Conda env:
|
102 |
+
|
103 |
+
``` sh
|
104 |
+
conda create -n sparktts -y python=3.12
|
105 |
+
conda activate sparktts
|
106 |
+
pip install -r requirements.txt
|
107 |
+
# If you are in mainland China, you can set the mirror as follows:
|
108 |
+
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
|
109 |
+
```
|
110 |
+
|
111 |
+
**Model Download**
|
112 |
+
|
113 |
+
Download via python:
|
114 |
+
```python
|
115 |
+
from huggingface_hub import snapshot_download
|
116 |
+
|
117 |
+
snapshot_download("SparkAudio/Spark-TTS-0.5B", local_dir="pretrained_models/Spark-TTS-0.5B")
|
118 |
+
```
|
119 |
+
|
120 |
+
Download via git clone:
|
121 |
+
```sh
|
122 |
+
mkdir -p pretrained_models
|
123 |
+
|
124 |
+
# Make sure you have git-lfs installed (https://git-lfs.com)
|
125 |
+
git lfs install
|
126 |
+
|
127 |
+
git clone https://huggingface.co/SparkAudio/Spark-TTS-0.5B pretrained_models/Spark-TTS-0.5B
|
128 |
+
```
|
129 |
+
|
130 |
+
**Basic Usage**
|
131 |
+
|
132 |
+
You can simply run the demo with the following commands:
|
133 |
+
``` sh
|
134 |
+
cd example
|
135 |
+
bash infer.sh
|
136 |
+
```
|
137 |
+
|
138 |
+
Alternatively, you can directly execute the following command in the command line to perform inference:
|
139 |
+
|
140 |
+
``` sh
|
141 |
+
python -m cli.inference \
|
142 |
+
--text "text to synthesis." \
|
143 |
+
--device 0 \
|
144 |
+
--save_dir "path/to/save/audio" \
|
145 |
+
--model_dir pretrained_models/Spark-TTS-0.5B \
|
146 |
+
--prompt_text "transcript of the prompt audio" \
|
147 |
+
--prompt_speech_path "path/to/prompt_audio"
|
148 |
+
```
|
149 |
+
|
150 |
+
**UI Usage**
|
151 |
+
|
152 |
+
You can start the web UI by running `python webui.py`, which allows you to perform Voice Cloning and Voice Creation. Voice Cloning supports uploading reference audio or recording it directly.
|
153 |
+
|
154 |
+
|
155 |
+
| **Voice Cloning** | **Voice Creation** |
|
156 |
+
|:-------------------:|:-------------------:|
|
157 |
+
|  |  |
|
158 |
+
|
159 |
+
|
160 |
+
## To-Do List
|
161 |
+
|
162 |
+
- [x] Release the Spark-TTS paper.
|
163 |
+
- [ ] Release the training code.
|
164 |
+
- [ ] Release the training dataset, VoxBox.
|
165 |
+
|
166 |
+
## Citation
|
167 |
+
|
168 |
+
```
|
169 |
+
@misc{wang2025sparktts,
|
170 |
+
title={Spark-TTS: An Efficient LLM-Based Text-to-Speech Model with Single-Stream Decoupled Speech Tokens},
|
171 |
+
author={Xinsheng Wang and Mingqi Jiang and Ziyang Ma and Ziyu Zhang and Songxiang Liu and Linqin Li and Zheng Liang and Qixi Zheng and Rui Wang and Xiaoqin Feng and Weizhen Bian and Zhen Ye and Sitong Cheng and Ruibin Yuan and Zhixian Zhao and Xinfa Zhu and Jiahao Pan and Liumeng Xue and Pengcheng Zhu and Yunlin Chen and Zhifei Li and Xie Chen and Lei Xie and Yike Guo and Wei Xue},
|
172 |
+
year={2025},
|
173 |
+
eprint={2503.01710},
|
174 |
+
archivePrefix={arXiv},
|
175 |
+
primaryClass={cs.SD},
|
176 |
+
url={https://arxiv.org/abs/2503.01710},
|
177 |
+
}
|
178 |
+
```
|
179 |
+
|
180 |
+
|
181 |
+
## ⚠ License Update
|
182 |
+
|
183 |
+
The model's license has been updated from Apache 2.0 to CC BY-NC-SA due to the licensing terms of some training data.
|
184 |
+
|
185 |
+
Key Changes:
|
186 |
+
|
187 |
+
- The model can only be used for non-commercial purposes.
|
188 |
+
|
189 |
+
- Any modifications or derivatives must also be released under CC BY-NC-SA 4.0.
|
190 |
+
|
191 |
+
- Proper attribution is required when using or modifying the model.
|
192 |
+
|
193 |
+
Please ensure compliance with the new license terms.
|
194 |
+
|
195 |
+
|
196 |
+
## ⚠️ Usage Disclaimer
|
197 |
+
|
198 |
+
This project provides a zero-shot voice cloning TTS model intended for academic research, educational purposes, and legitimate applications, such as personalized speech synthesis, assistive technologies, and linguistic research.
|
199 |
+
|
200 |
+
Please note:
|
201 |
+
|
202 |
+
- Do not use this model for unauthorized voice cloning, impersonation, fraud, scams, deepfakes, or any illegal activities.
|
203 |
+
|
204 |
+
- Ensure compliance with local laws and regulations when using this model and uphold ethical standards.
|
205 |
+
|
206 |
+
- The developers assume no liability for any misuse of this model.
|
207 |
+
|
208 |
+
We advocate for the responsible development and use of AI and encourage the community to uphold safety and ethical principles in AI research and applications. If you have any concerns regarding ethics or misuse, please contact us.
|
Spark_TTS_FT.ipynb
ADDED
@@ -0,0 +1,1732 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {
|
6 |
+
"id": "Qpw04rkbynx0"
|
7 |
+
},
|
8 |
+
"source": [
|
9 |
+
"To run this, press \"*Runtime*\" and press \"*Run all*\" on a **free** Tesla T4 Google Colab instance!\n",
|
10 |
+
"<div class=\"align-center\">\n",
|
11 |
+
"<a href=\"https://unsloth.ai/\"><img src=\"https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png\" width=\"115\"></a>\n",
|
12 |
+
"<a href=\"https://discord.gg/unsloth\"><img src=\"https://github.com/unslothai/unsloth/raw/main/images/Discord button.png\" width=\"145\"></a>\n",
|
13 |
+
"<a href=\"https://docs.unsloth.ai/\"><img src=\"https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true\" width=\"125\"></a></a> Join Discord if you need help + ⭐ <i>Star us on <a href=\"https://github.com/unslothai/unsloth\">Github</a> </i> ⭐\n",
|
14 |
+
"</div>\n",
|
15 |
+
"\n",
|
16 |
+
"To install Unsloth on your own computer, follow the installation instructions on our Github page [here](https://docs.unsloth.ai/get-started/installing-+-updating).\n",
|
17 |
+
"\n",
|
18 |
+
"You will learn how to do [data prep](#Data), how to [train](#Train), how to [run the model](#Inference), & [how to save it](#Save)\n"
|
19 |
+
]
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"cell_type": "markdown",
|
23 |
+
"metadata": {
|
24 |
+
"id": "5fs-yYEaynx1"
|
25 |
+
},
|
26 |
+
"source": [
|
27 |
+
"### News"
|
28 |
+
]
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"cell_type": "markdown",
|
32 |
+
"metadata": {
|
33 |
+
"id": "pyJK0UZaynx2"
|
34 |
+
},
|
35 |
+
"source": [
|
36 |
+
"Unsloth now supports Text-to-Speech (TTS) models. Read our [guide here](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning).\n",
|
37 |
+
"\n",
|
38 |
+
"Read our **[Gemma 3N Guide](https://docs.unsloth.ai/basics/gemma-3n-how-to-run-and-fine-tune)** and check out our new **[Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs)** quants which outperforms other quantization methods!\n",
|
39 |
+
"\n",
|
40 |
+
"Visit our docs for all our [model uploads](https://docs.unsloth.ai/get-started/all-our-models) and [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks).\n"
|
41 |
+
]
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"cell_type": "markdown",
|
45 |
+
"metadata": {
|
46 |
+
"id": "SDUHv0mwynx3"
|
47 |
+
},
|
48 |
+
"source": [
|
49 |
+
"### Installation"
|
50 |
+
]
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"cell_type": "code",
|
54 |
+
"execution_count": 1,
|
55 |
+
"metadata": {
|
56 |
+
"id": "MY4G3EIbynx3"
|
57 |
+
},
|
58 |
+
"outputs": [],
|
59 |
+
"source": [
|
60 |
+
"%%capture\n",
|
61 |
+
"import os\n",
|
62 |
+
"if \"COLAB_\" not in \"\".join(os.environ.keys()):\n",
|
63 |
+
" %pip install unsloth\n",
|
64 |
+
"else:\n",
|
65 |
+
" # Do this only in Colab notebooks! Otherwise use pip install unsloth\n",
|
66 |
+
" %pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo\n",
|
67 |
+
" %pip install sentencepiece protobuf \"datasets>=3.4.1,<4.0.0\" \"huggingface_hub>=0.34.0\" hf_transfer\n",
|
68 |
+
" %pip install --no-deps unsloth\n",
|
69 |
+
"%git clone https://github.com/SparkAudio/Spark-TTS\n",
|
70 |
+
"%pip install omegaconf einx"
|
71 |
+
]
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"cell_type": "code",
|
75 |
+
"execution_count": 2,
|
76 |
+
"metadata": {
|
77 |
+
"colab": {
|
78 |
+
"base_uri": "https://localhost:8080/"
|
79 |
+
},
|
80 |
+
"id": "QmUBVEnvCDJv",
|
81 |
+
"outputId": "42083a68-d3cc-48c9-d852-b60796377434"
|
82 |
+
},
|
83 |
+
"outputs": [
|
84 |
+
{
|
85 |
+
"name": "stdout",
|
86 |
+
"output_type": "stream",
|
87 |
+
"text": [
|
88 |
+
"🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.\n",
|
89 |
+
"🦥 Unsloth Zoo will now patch everything to make training faster!\n"
|
90 |
+
]
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"data": {
|
94 |
+
"application/vnd.jupyter.widget-view+json": {
|
95 |
+
"model_id": "9ad0d25a6f8549d1ac79addbe171b758",
|
96 |
+
"version_major": 2,
|
97 |
+
"version_minor": 0
|
98 |
+
},
|
99 |
+
"text/plain": [
|
100 |
+
".gitattributes: 0.00B [00:00, ?B/s]"
|
101 |
+
]
|
102 |
+
},
|
103 |
+
"metadata": {},
|
104 |
+
"output_type": "display_data"
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"data": {
|
108 |
+
"application/vnd.jupyter.widget-view+json": {
|
109 |
+
"model_id": "7e83dd9464b64a6d963c349d1660a28c",
|
110 |
+
"version_major": 2,
|
111 |
+
"version_minor": 0
|
112 |
+
},
|
113 |
+
"text/plain": [
|
114 |
+
"config.yaml: 0.00B [00:00, ?B/s]"
|
115 |
+
]
|
116 |
+
},
|
117 |
+
"metadata": {},
|
118 |
+
"output_type": "display_data"
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"data": {
|
122 |
+
"application/vnd.jupyter.widget-view+json": {
|
123 |
+
"model_id": "332e86b12a4c45a89a95f1f265ca0f12",
|
124 |
+
"version_major": 2,
|
125 |
+
"version_minor": 0
|
126 |
+
},
|
127 |
+
"text/plain": [
|
128 |
+
"BiCodec/model.safetensors: 0%| | 0.00/626M [00:00<?, ?B/s]"
|
129 |
+
]
|
130 |
+
},
|
131 |
+
"metadata": {},
|
132 |
+
"output_type": "display_data"
|
133 |
+
},
|
134 |
+
{
|
135 |
+
"data": {
|
136 |
+
"application/vnd.jupyter.widget-view+json": {
|
137 |
+
"model_id": "c1a54d8c9dc8472e8f0f37603ccd3904",
|
138 |
+
"version_major": 2,
|
139 |
+
"version_minor": 0
|
140 |
+
},
|
141 |
+
"text/plain": [
|
142 |
+
"added_tokens.json: 0.00B [00:00, ?B/s]"
|
143 |
+
]
|
144 |
+
},
|
145 |
+
"metadata": {},
|
146 |
+
"output_type": "display_data"
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"data": {
|
150 |
+
"application/vnd.jupyter.widget-view+json": {
|
151 |
+
"model_id": "8402d2f2ef204022b0727f2b09437bad",
|
152 |
+
"version_major": 2,
|
153 |
+
"version_minor": 0
|
154 |
+
},
|
155 |
+
"text/plain": [
|
156 |
+
"config.json: 0%| | 0.00/658 [00:00<?, ?B/s]"
|
157 |
+
]
|
158 |
+
},
|
159 |
+
"metadata": {},
|
160 |
+
"output_type": "display_data"
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"data": {
|
164 |
+
"application/vnd.jupyter.widget-view+json": {
|
165 |
+
"model_id": "43f438eabd1843cc8c5977f0ef6226ec",
|
166 |
+
"version_major": 2,
|
167 |
+
"version_minor": 0
|
168 |
+
},
|
169 |
+
"text/plain": [
|
170 |
+
"merges.txt: 0.00B [00:00, ?B/s]"
|
171 |
+
]
|
172 |
+
},
|
173 |
+
"metadata": {},
|
174 |
+
"output_type": "display_data"
|
175 |
+
},
|
176 |
+
{
|
177 |
+
"data": {
|
178 |
+
"application/vnd.jupyter.widget-view+json": {
|
179 |
+
"model_id": "87dce305eba54c1797547c06a2ab7cf6",
|
180 |
+
"version_major": 2,
|
181 |
+
"version_minor": 0
|
182 |
+
},
|
183 |
+
"text/plain": [
|
184 |
+
"LLM/model.safetensors: 0%| | 0.00/2.03G [00:00<?, ?B/s]"
|
185 |
+
]
|
186 |
+
},
|
187 |
+
"metadata": {},
|
188 |
+
"output_type": "display_data"
|
189 |
+
},
|
190 |
+
{
|
191 |
+
"data": {
|
192 |
+
"application/vnd.jupyter.widget-view+json": {
|
193 |
+
"model_id": "3ea6e51894454a5c82bb4cfe1fd0a47f",
|
194 |
+
"version_major": 2,
|
195 |
+
"version_minor": 0
|
196 |
+
},
|
197 |
+
"text/plain": [
|
198 |
+
"special_tokens_map.json: 0%| | 0.00/613 [00:00<?, ?B/s]"
|
199 |
+
]
|
200 |
+
},
|
201 |
+
"metadata": {},
|
202 |
+
"output_type": "display_data"
|
203 |
+
},
|
204 |
+
{
|
205 |
+
"data": {
|
206 |
+
"application/vnd.jupyter.widget-view+json": {
|
207 |
+
"model_id": "94e7da1bdc7549e0ba4dcd0b73d38667",
|
208 |
+
"version_major": 2,
|
209 |
+
"version_minor": 0
|
210 |
+
},
|
211 |
+
"text/plain": [
|
212 |
+
"LLM/tokenizer.json: 0%| | 0.00/14.1M [00:00<?, ?B/s]"
|
213 |
+
]
|
214 |
+
},
|
215 |
+
"metadata": {},
|
216 |
+
"output_type": "display_data"
|
217 |
+
},
|
218 |
+
{
|
219 |
+
"data": {
|
220 |
+
"application/vnd.jupyter.widget-view+json": {
|
221 |
+
"model_id": "1aa226f63eac4ee48537df6b26d921c1",
|
222 |
+
"version_major": 2,
|
223 |
+
"version_minor": 0
|
224 |
+
},
|
225 |
+
"text/plain": [
|
226 |
+
"tokenizer_config.json: 0.00B [00:00, ?B/s]"
|
227 |
+
]
|
228 |
+
},
|
229 |
+
"metadata": {},
|
230 |
+
"output_type": "display_data"
|
231 |
+
},
|
232 |
+
{
|
233 |
+
"data": {
|
234 |
+
"application/vnd.jupyter.widget-view+json": {
|
235 |
+
"model_id": "420eaeeb7bee4c21964c17968c266ac1",
|
236 |
+
"version_major": 2,
|
237 |
+
"version_minor": 0
|
238 |
+
},
|
239 |
+
"text/plain": [
|
240 |
+
"vocab.json: 0.00B [00:00, ?B/s]"
|
241 |
+
]
|
242 |
+
},
|
243 |
+
"metadata": {},
|
244 |
+
"output_type": "display_data"
|
245 |
+
},
|
246 |
+
{
|
247 |
+
"data": {
|
248 |
+
"application/vnd.jupyter.widget-view+json": {
|
249 |
+
"model_id": "bdcb3d5d6a8e4e969afa77631e7c3104",
|
250 |
+
"version_major": 2,
|
251 |
+
"version_minor": 0
|
252 |
+
},
|
253 |
+
"text/plain": [
|
254 |
+
"README.md: 0.00B [00:00, ?B/s]"
|
255 |
+
]
|
256 |
+
},
|
257 |
+
"metadata": {},
|
258 |
+
"output_type": "display_data"
|
259 |
+
},
|
260 |
+
{
|
261 |
+
"data": {
|
262 |
+
"application/vnd.jupyter.widget-view+json": {
|
263 |
+
"model_id": "1cd60c7dbe61410ca5bc61310367635a",
|
264 |
+
"version_major": 2,
|
265 |
+
"version_minor": 0
|
266 |
+
},
|
267 |
+
"text/plain": [
|
268 |
+
"config.yaml: 0%| | 0.00/169 [00:00<?, ?B/s]"
|
269 |
+
]
|
270 |
+
},
|
271 |
+
"metadata": {},
|
272 |
+
"output_type": "display_data"
|
273 |
+
},
|
274 |
+
{
|
275 |
+
"data": {
|
276 |
+
"application/vnd.jupyter.widget-view+json": {
|
277 |
+
"model_id": "0ea819afc66b437ca8b0dc7337f5ce5f",
|
278 |
+
"version_major": 2,
|
279 |
+
"version_minor": 0
|
280 |
+
},
|
281 |
+
"text/plain": [
|
282 |
+
"gradio_TTS.png: 0%| | 0.00/81.8k [00:00<?, ?B/s]"
|
283 |
+
]
|
284 |
+
},
|
285 |
+
"metadata": {},
|
286 |
+
"output_type": "display_data"
|
287 |
+
},
|
288 |
+
{
|
289 |
+
"data": {
|
290 |
+
"application/vnd.jupyter.widget-view+json": {
|
291 |
+
"model_id": "00f074bbbc5b44d59c590cc217187aa5",
|
292 |
+
"version_major": 2,
|
293 |
+
"version_minor": 0
|
294 |
+
},
|
295 |
+
"text/plain": [
|
296 |
+
"gradio_control.png: 0%| | 0.00/62.2k [00:00<?, ?B/s]"
|
297 |
+
]
|
298 |
+
},
|
299 |
+
"metadata": {},
|
300 |
+
"output_type": "display_data"
|
301 |
+
},
|
302 |
+
{
|
303 |
+
"data": {
|
304 |
+
"application/vnd.jupyter.widget-view+json": {
|
305 |
+
"model_id": "d050a4b7cf2b4f78af51986b9c2eee45",
|
306 |
+
"version_major": 2,
|
307 |
+
"version_minor": 0
|
308 |
+
},
|
309 |
+
"text/plain": [
|
310 |
+
"src/figures/infer_control.png: 0%| | 0.00/127k [00:00<?, ?B/s]"
|
311 |
+
]
|
312 |
+
},
|
313 |
+
"metadata": {},
|
314 |
+
"output_type": "display_data"
|
315 |
+
},
|
316 |
+
{
|
317 |
+
"data": {
|
318 |
+
"application/vnd.jupyter.widget-view+json": {
|
319 |
+
"model_id": "6ed5ce435b89443f9cca00ed1b97311e",
|
320 |
+
"version_major": 2,
|
321 |
+
"version_minor": 0
|
322 |
+
},
|
323 |
+
"text/plain": [
|
324 |
+
"src/figures/infer_voice_cloning.png: 0%| | 0.00/119k [00:00<?, ?B/s]"
|
325 |
+
]
|
326 |
+
},
|
327 |
+
"metadata": {},
|
328 |
+
"output_type": "display_data"
|
329 |
+
},
|
330 |
+
{
|
331 |
+
"data": {
|
332 |
+
"application/vnd.jupyter.widget-view+json": {
|
333 |
+
"model_id": "0a7db4ff0d204ed4839471cbd8ebefef",
|
334 |
+
"version_major": 2,
|
335 |
+
"version_minor": 0
|
336 |
+
},
|
337 |
+
"text/plain": [
|
338 |
+
"src/logo/HKUST.jpg: 0%| | 0.00/102k [00:00<?, ?B/s]"
|
339 |
+
]
|
340 |
+
},
|
341 |
+
"metadata": {},
|
342 |
+
"output_type": "display_data"
|
343 |
+
},
|
344 |
+
{
|
345 |
+
"data": {
|
346 |
+
"application/vnd.jupyter.widget-view+json": {
|
347 |
+
"model_id": "d7b682f3d5d142c68ec6bea0be196792",
|
348 |
+
"version_major": 2,
|
349 |
+
"version_minor": 0
|
350 |
+
},
|
351 |
+
"text/plain": [
|
352 |
+
"src/logo/NPU.jpg: 0%| | 0.00/152k [00:00<?, ?B/s]"
|
353 |
+
]
|
354 |
+
},
|
355 |
+
"metadata": {},
|
356 |
+
"output_type": "display_data"
|
357 |
+
},
|
358 |
+
{
|
359 |
+
"data": {
|
360 |
+
"application/vnd.jupyter.widget-view+json": {
|
361 |
+
"model_id": "bd49989b32d3492894bf08b084059ba6",
|
362 |
+
"version_major": 2,
|
363 |
+
"version_minor": 0
|
364 |
+
},
|
365 |
+
"text/plain": [
|
366 |
+
"NTU.jpg: 0%| | 0.00/77.6k [00:00<?, ?B/s]"
|
367 |
+
]
|
368 |
+
},
|
369 |
+
"metadata": {},
|
370 |
+
"output_type": "display_data"
|
371 |
+
},
|
372 |
+
{
|
373 |
+
"data": {
|
374 |
+
"application/vnd.jupyter.widget-view+json": {
|
375 |
+
"model_id": "b4576071c87448ef8ba94df410964d6c",
|
376 |
+
"version_major": 2,
|
377 |
+
"version_minor": 0
|
378 |
+
},
|
379 |
+
"text/plain": [
|
380 |
+
"src/logo/SJU.jpg: 0%| | 0.00/364k [00:00<?, ?B/s]"
|
381 |
+
]
|
382 |
+
},
|
383 |
+
"metadata": {},
|
384 |
+
"output_type": "display_data"
|
385 |
+
},
|
386 |
+
{
|
387 |
+
"data": {
|
388 |
+
"application/vnd.jupyter.widget-view+json": {
|
389 |
+
"model_id": "3dbdd98fca6741d2874849b2b26662db",
|
390 |
+
"version_major": 2,
|
391 |
+
"version_minor": 0
|
392 |
+
},
|
393 |
+
"text/plain": [
|
394 |
+
"SparkAudio.jpg: 0%| | 0.00/89.0k [00:00<?, ?B/s]"
|
395 |
+
]
|
396 |
+
},
|
397 |
+
"metadata": {},
|
398 |
+
"output_type": "display_data"
|
399 |
+
},
|
400 |
+
{
|
401 |
+
"data": {
|
402 |
+
"application/vnd.jupyter.widget-view+json": {
|
403 |
+
"model_id": "ce753e6904ff4dd4ae5c5824ac554d76",
|
404 |
+
"version_major": 2,
|
405 |
+
"version_minor": 0
|
406 |
+
},
|
407 |
+
"text/plain": [
|
408 |
+
"SparkAudio2.jpg: 0%| | 0.00/40.7k [00:00<?, ?B/s]"
|
409 |
+
]
|
410 |
+
},
|
411 |
+
"metadata": {},
|
412 |
+
"output_type": "display_data"
|
413 |
+
},
|
414 |
+
{
|
415 |
+
"data": {
|
416 |
+
"application/vnd.jupyter.widget-view+json": {
|
417 |
+
"model_id": "90c48554b64b46f388ee14df2c401a02",
|
418 |
+
"version_major": 2,
|
419 |
+
"version_minor": 0
|
420 |
+
},
|
421 |
+
"text/plain": [
|
422 |
+
"SparkTTS.jpg: 0%| | 0.00/52.5k [00:00<?, ?B/s]"
|
423 |
+
]
|
424 |
+
},
|
425 |
+
"metadata": {},
|
426 |
+
"output_type": "display_data"
|
427 |
+
},
|
428 |
+
{
|
429 |
+
"data": {
|
430 |
+
"application/vnd.jupyter.widget-view+json": {
|
431 |
+
"model_id": "059f5fe90c324bd7b0aef23095af1c21",
|
432 |
+
"version_major": 2,
|
433 |
+
"version_minor": 0
|
434 |
+
},
|
435 |
+
"text/plain": [
|
436 |
+
"src/logo/SparkTTS.png: 0%| | 0.00/102k [00:00<?, ?B/s]"
|
437 |
+
]
|
438 |
+
},
|
439 |
+
"metadata": {},
|
440 |
+
"output_type": "display_data"
|
441 |
+
},
|
442 |
+
{
|
443 |
+
"data": {
|
444 |
+
"application/vnd.jupyter.widget-view+json": {
|
445 |
+
"model_id": "ccf1938072024151ab5c50492866e253",
|
446 |
+
"version_major": 2,
|
447 |
+
"version_minor": 0
|
448 |
+
},
|
449 |
+
"text/plain": [
|
450 |
+
"src/logo/mobvoi.jpg: 0%| | 0.00/431k [00:00<?, ?B/s]"
|
451 |
+
]
|
452 |
+
},
|
453 |
+
"metadata": {},
|
454 |
+
"output_type": "display_data"
|
455 |
+
},
|
456 |
+
{
|
457 |
+
"data": {
|
458 |
+
"application/vnd.jupyter.widget-view+json": {
|
459 |
+
"model_id": "771681ce27b94c71a61da27b133427ac",
|
460 |
+
"version_major": 2,
|
461 |
+
"version_minor": 0
|
462 |
+
},
|
463 |
+
"text/plain": [
|
464 |
+
"src/logo/mobvoi.png: 0%| | 0.00/120k [00:00<?, ?B/s]"
|
465 |
+
]
|
466 |
+
},
|
467 |
+
"metadata": {},
|
468 |
+
"output_type": "display_data"
|
469 |
+
},
|
470 |
+
{
|
471 |
+
"data": {
|
472 |
+
"application/vnd.jupyter.widget-view+json": {
|
473 |
+
"model_id": "243ff52bb35242eeb330a2bb2ffe4166",
|
474 |
+
"version_major": 2,
|
475 |
+
"version_minor": 0
|
476 |
+
},
|
477 |
+
"text/plain": [
|
478 |
+
"README.md: 0.00B [00:00, ?B/s]"
|
479 |
+
]
|
480 |
+
},
|
481 |
+
"metadata": {},
|
482 |
+
"output_type": "display_data"
|
483 |
+
},
|
484 |
+
{
|
485 |
+
"data": {
|
486 |
+
"application/vnd.jupyter.widget-view+json": {
|
487 |
+
"model_id": "c17c5bd399fd411d8f2ee43f79539cca",
|
488 |
+
"version_major": 2,
|
489 |
+
"version_minor": 0
|
490 |
+
},
|
491 |
+
"text/plain": [
|
492 |
+
"config.json: 0.00B [00:00, ?B/s]"
|
493 |
+
]
|
494 |
+
},
|
495 |
+
"metadata": {},
|
496 |
+
"output_type": "display_data"
|
497 |
+
},
|
498 |
+
{
|
499 |
+
"data": {
|
500 |
+
"application/vnd.jupyter.widget-view+json": {
|
501 |
+
"model_id": "2d6ae8fc962b41aeb4ce1fec0d3f0864",
|
502 |
+
"version_major": 2,
|
503 |
+
"version_minor": 0
|
504 |
+
},
|
505 |
+
"text/plain": [
|
506 |
+
"preprocessor_config.json: 0%| | 0.00/212 [00:00<?, ?B/s]"
|
507 |
+
]
|
508 |
+
},
|
509 |
+
"metadata": {},
|
510 |
+
"output_type": "display_data"
|
511 |
+
},
|
512 |
+
{
|
513 |
+
"data": {
|
514 |
+
"application/vnd.jupyter.widget-view+json": {
|
515 |
+
"model_id": "f3394d8a215e406f8f50b8770dd354d3",
|
516 |
+
"version_major": 2,
|
517 |
+
"version_minor": 0
|
518 |
+
},
|
519 |
+
"text/plain": [
|
520 |
+
"wav2vec2-large-xlsr-53/pytorch_model.bin: 0%| | 0.00/1.27G [00:00<?, ?B/s]"
|
521 |
+
]
|
522 |
+
},
|
523 |
+
"metadata": {},
|
524 |
+
"output_type": "display_data"
|
525 |
+
},
|
526 |
+
{
|
527 |
+
"name": "stdout",
|
528 |
+
"output_type": "stream",
|
529 |
+
"text": [
|
530 |
+
"==((====))== Unsloth 2025.8.1: Fast Qwen2 patching. Transformers: 4.55.0.\n",
|
531 |
+
" \\\\ /| NVIDIA GeForce RTX 2080 SUPER. Num GPUs = 2. Max memory: 7.785 GB. Platform: Linux.\n",
|
532 |
+
"O^O/ \\_/ \\ Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1\n",
|
533 |
+
"\\ / Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]\n",
|
534 |
+
" \"-____-\" Free license: http://github.com/unslothai/unsloth\n",
|
535 |
+
"Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!\n",
|
536 |
+
"Unsloth: Float16 full finetuning uses more memory since we upcast weights to float32.\n"
|
537 |
+
]
|
538 |
+
}
|
539 |
+
],
|
540 |
+
"source": [
|
541 |
+
"from unsloth import FastModel\n",
|
542 |
+
"import torch\n",
|
543 |
+
"from huggingface_hub import snapshot_download\n",
|
544 |
+
"\n",
|
545 |
+
"max_seq_length = 2048 # Choose any for long context!\n",
|
546 |
+
"\n",
|
547 |
+
"fourbit_models = [\n",
|
548 |
+
" # 4bit dynamic quants for superior accuracy and low memory use\n",
|
549 |
+
" \"unsloth/gemma-3-4b-it-unsloth-bnb-4bit\",\n",
|
550 |
+
" \"unsloth/gemma-3-12b-it-unsloth-bnb-4bit\",\n",
|
551 |
+
" \"unsloth/gemma-3-27b-it-unsloth-bnb-4bit\",\n",
|
552 |
+
" # Qwen3 new models\n",
|
553 |
+
" \"unsloth/Qwen3-4B-unsloth-bnb-4bit\",\n",
|
554 |
+
" \"unsloth/Qwen3-8B-unsloth-bnb-4bit\",\n",
|
555 |
+
" # Other very popular models!\n",
|
556 |
+
" \"unsloth/Llama-3.1-8B\",\n",
|
557 |
+
" \"unsloth/Llama-3.2-3B\",\n",
|
558 |
+
" \"unsloth/Llama-3.3-70B\",\n",
|
559 |
+
" \"unsloth/mistral-7b-instruct-v0.3\",\n",
|
560 |
+
" \"unsloth/Phi-4\",\n",
|
561 |
+
"] # More models at https://huggingface.co/unsloth\n",
|
562 |
+
"\n",
|
563 |
+
"# Download model and code\n",
|
564 |
+
"snapshot_download(\"unsloth/Spark-TTS-0.5B\", local_dir = \"Spark-TTS-0.5B\")\n",
|
565 |
+
"\n",
|
566 |
+
"model, tokenizer = FastModel.from_pretrained(\n",
|
567 |
+
" model_name = f\"Spark-TTS-0.5B/LLM\",\n",
|
568 |
+
" max_seq_length = max_seq_length,\n",
|
569 |
+
" dtype = torch.float32, # Spark seems to only work on float32 for now\n",
|
570 |
+
" full_finetuning = True, # We support full finetuning now!\n",
|
571 |
+
" load_in_4bit = False,\n",
|
572 |
+
" #token = \"hf_...\", # use one if using gated models like meta-llama/Llama-2-7b-hf\n",
|
573 |
+
")"
|
574 |
+
]
|
575 |
+
},
|
576 |
+
{
|
577 |
+
"cell_type": "markdown",
|
578 |
+
"metadata": {
|
579 |
+
"id": "SXd9bTZd1aaL"
|
580 |
+
},
|
581 |
+
"source": [
|
582 |
+
"We now add LoRA adapters so we only need to update 1 to 10% of all parameters!"
|
583 |
+
]
|
584 |
+
},
|
585 |
+
{
|
586 |
+
"cell_type": "code",
|
587 |
+
"execution_count": 3,
|
588 |
+
"metadata": {
|
589 |
+
"colab": {
|
590 |
+
"base_uri": "https://localhost:8080/"
|
591 |
+
},
|
592 |
+
"id": "6bZsfBuZDeCL",
|
593 |
+
"outputId": "292447b8-fd80-4b8b-ba3f-4637a1045166"
|
594 |
+
},
|
595 |
+
"outputs": [
|
596 |
+
{
|
597 |
+
"name": "stdout",
|
598 |
+
"output_type": "stream",
|
599 |
+
"text": [
|
600 |
+
"Unsloth: Full finetuning is enabled, so .get_peft_model has no effect\n"
|
601 |
+
]
|
602 |
+
}
|
603 |
+
],
|
604 |
+
"source": [
|
605 |
+
"#LoRA does not work with float32 only works with bfloat16 !!!\n",
|
606 |
+
"model = FastModel.get_peft_model(\n",
|
607 |
+
" model,\n",
|
608 |
+
" r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128\n",
|
609 |
+
" target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
|
610 |
+
" \"gate_proj\", \"up_proj\", \"down_proj\",],\n",
|
611 |
+
" lora_alpha = 128,\n",
|
612 |
+
" lora_dropout = 0, # Supports any, but = 0 is optimized\n",
|
613 |
+
" bias = \"none\", # Supports any, but = \"none\" is optimized\n",
|
614 |
+
" # [NEW] \"unsloth\" uses 30% less VRAM, fits 2x larger batch sizes!\n",
|
615 |
+
" use_gradient_checkpointing = \"unsloth\", # True or \"unsloth\" for very long context\n",
|
616 |
+
" random_state = 3407,\n",
|
617 |
+
" use_rslora = False, # We support rank stabilized LoRA\n",
|
618 |
+
" loftq_config = None, # And LoftQ\n",
|
619 |
+
")"
|
620 |
+
]
|
621 |
+
},
|
622 |
+
{
|
623 |
+
"cell_type": "markdown",
|
624 |
+
"metadata": {
|
625 |
+
"id": "vITh0KVJ10qX"
|
626 |
+
},
|
627 |
+
"source": [
|
628 |
+
"<a name=\"Data\"></a>\n",
|
629 |
+
"### Data Prep \n",
|
630 |
+
"\n",
|
631 |
+
"We will use the `Balaji-1904/TTS_KN_DS_V1.1`, which is designed for training TTS models. Ensure that your dataset follows the required format: **text, audio** for single-speaker models or **source, text, audio** for multi-speaker models. You can modify this section to accommodate your own dataset, but maintaining the correct structure is essential for optimal training."
|
632 |
+
]
|
633 |
+
},
|
634 |
+
{
|
635 |
+
"cell_type": "code",
|
636 |
+
"execution_count": 4,
|
637 |
+
"metadata": {
|
638 |
+
"id": "LjY75GoYUCB8"
|
639 |
+
},
|
640 |
+
"outputs": [],
|
641 |
+
"source": [
|
642 |
+
"from datasets import load_dataset\n",
|
643 |
+
"dataset = load_dataset(\"Balaji-1904/TTS_KN_DS_V1.1\", split = \"train\")"
|
644 |
+
]
|
645 |
+
},
|
646 |
+
{
|
647 |
+
"cell_type": "code",
|
648 |
+
"execution_count": 5,
|
649 |
+
"metadata": {
|
650 |
+
"colab": {
|
651 |
+
"base_uri": "https://localhost:8080/",
|
652 |
+
"height": 173,
|
653 |
+
"referenced_widgets": [
|
654 |
+
"a3b0c0581f1f4c428baaadd8e9a39b6f",
|
655 |
+
"2315228ff2b141afabe1263471f5364b",
|
656 |
+
"0474debc340943bd85f3daf92aebf7aa",
|
657 |
+
"cff1b0fa2ea24f45aab26685353eefdd",
|
658 |
+
"b7e20be79df246f19b35114a690e44f0",
|
659 |
+
"426eb100a94642f79e6b99777406a265",
|
660 |
+
"a36b5cf197dd4bd9a7f70aa6671b804c",
|
661 |
+
"0de4d0f282404edfbc191dca73f15f35",
|
662 |
+
"e58b5ad2f781475d8af2ddb38009baa6",
|
663 |
+
"33fbacbb2aa146cd90586357eec1dc3e",
|
664 |
+
"930b4d1d5f4b494b830df4d4c398e67c"
|
665 |
+
]
|
666 |
+
},
|
667 |
+
"id": "zK94B-Pfioto",
|
668 |
+
"outputId": "3f11cf35-c173-410d-f709-43552323f26f"
|
669 |
+
},
|
670 |
+
"outputs": [
|
671 |
+
{
|
672 |
+
"ename": "ModuleNotFoundError",
|
673 |
+
"evalue": "No module named 'torchaudio'",
|
674 |
+
"output_type": "error",
|
675 |
+
"traceback": [
|
676 |
+
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
677 |
+
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
|
678 |
+
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m#@title Tokenization Function\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mlocale\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorchaudio\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mtransforms\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mT\u001b[39;00m\n\u001b[32m 5\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mos\u001b[39;00m\n\u001b[32m 6\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m\n",
|
679 |
+
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'torchaudio'"
|
680 |
+
]
|
681 |
+
}
|
682 |
+
],
|
683 |
+
"source": [
|
684 |
+
"#@title Tokenization Function\n",
|
685 |
+
"\n",
|
686 |
+
"import locale\n",
|
687 |
+
"import torchaudio.transforms as T\n",
|
688 |
+
"import os\n",
|
689 |
+
"import torch\n",
|
690 |
+
"import sys\n",
|
691 |
+
"import numpy as np\n",
|
692 |
+
"sys.path.append('Spark-TTS')\n",
|
693 |
+
"from sparktts.models.audio_tokenizer import BiCodecTokenizer\n",
|
694 |
+
"from sparktts.utils.audio import audio_volume_normalize\n",
|
695 |
+
"\n",
|
696 |
+
"audio_tokenizer = BiCodecTokenizer(\"Spark-TTS-0.5B\", \"cuda\")\n",
|
697 |
+
"def extract_wav2vec2_features( wavs: torch.Tensor) -> torch.Tensor:\n",
|
698 |
+
" \"\"\"extract wav2vec2 features\"\"\"\n",
|
699 |
+
"\n",
|
700 |
+
" if wavs.shape[0] != 1:\n",
|
701 |
+
"\n",
|
702 |
+
" raise ValueError(f\"Expected batch size 1, but got shape {wavs.shape}\")\n",
|
703 |
+
" wav_np = wavs.squeeze(0).cpu().numpy()\n",
|
704 |
+
"\n",
|
705 |
+
" processed = audio_tokenizer.processor(\n",
|
706 |
+
" wav_np,\n",
|
707 |
+
" sampling_rate=16000,\n",
|
708 |
+
" return_tensors=\"pt\",\n",
|
709 |
+
" padding=True,\n",
|
710 |
+
" )\n",
|
711 |
+
" input_values = processed.input_values\n",
|
712 |
+
"\n",
|
713 |
+
" input_values = input_values.to(audio_tokenizer.feature_extractor.device)\n",
|
714 |
+
"\n",
|
715 |
+
" model_output = audio_tokenizer.feature_extractor(\n",
|
716 |
+
" input_values,\n",
|
717 |
+
" )\n",
|
718 |
+
"\n",
|
719 |
+
"\n",
|
720 |
+
" if model_output.hidden_states is None:\n",
|
721 |
+
" raise ValueError(\"Wav2Vec2Model did not return hidden states. Ensure config `output_hidden_states=True`.\")\n",
|
722 |
+
"\n",
|
723 |
+
" num_layers = len(model_output.hidden_states)\n",
|
724 |
+
" required_layers = [11, 14, 16]\n",
|
725 |
+
" if any(l >= num_layers for l in required_layers):\n",
|
726 |
+
" raise IndexError(f\"Requested hidden state indices {required_layers} out of range for model with {num_layers} layers.\")\n",
|
727 |
+
"\n",
|
728 |
+
" feats_mix = (\n",
|
729 |
+
" model_output.hidden_states[11] + model_output.hidden_states[14] + model_output.hidden_states[16]\n",
|
730 |
+
" ) / 3\n",
|
731 |
+
"\n",
|
732 |
+
" return feats_mix\n",
|
733 |
+
"def formatting_audio_func(example):\n",
|
734 |
+
" text = f\"{example['source']}: {example['text']}\" if \"source\" in example else example[\"text\"]\n",
|
735 |
+
" audio_array = example[\"audio\"][\"array\"]\n",
|
736 |
+
" sampling_rate = example[\"audio\"][\"sampling_rate\"]\n",
|
737 |
+
"\n",
|
738 |
+
" target_sr = audio_tokenizer.config['sample_rate']\n",
|
739 |
+
"\n",
|
740 |
+
" if sampling_rate != target_sr:\n",
|
741 |
+
" resampler = T.Resample(orig_freq=sampling_rate, new_freq=target_sr)\n",
|
742 |
+
" audio_tensor_temp = torch.from_numpy(audio_array).float()\n",
|
743 |
+
" audio_array = resampler(audio_tensor_temp).numpy()\n",
|
744 |
+
"\n",
|
745 |
+
" if audio_tokenizer.config[\"volume_normalize\"]:\n",
|
746 |
+
" audio_array = audio_volume_normalize(audio_array)\n",
|
747 |
+
"\n",
|
748 |
+
" ref_wav_np = audio_tokenizer.get_ref_clip(audio_array)\n",
|
749 |
+
"\n",
|
750 |
+
" audio_tensor = torch.from_numpy(audio_array).unsqueeze(0).float().to(audio_tokenizer.device)\n",
|
751 |
+
" ref_wav_tensor = torch.from_numpy(ref_wav_np).unsqueeze(0).float().to(audio_tokenizer.device)\n",
|
752 |
+
"\n",
|
753 |
+
"\n",
|
754 |
+
" feat = extract_wav2vec2_features(audio_tensor)\n",
|
755 |
+
"\n",
|
756 |
+
" batch = {\n",
|
757 |
+
"\n",
|
758 |
+
" \"wav\": audio_tensor,\n",
|
759 |
+
" \"ref_wav\": ref_wav_tensor,\n",
|
760 |
+
" \"feat\": feat.to(audio_tokenizer.device),\n",
|
761 |
+
" }\n",
|
762 |
+
"\n",
|
763 |
+
"\n",
|
764 |
+
" semantic_token_ids, global_token_ids = audio_tokenizer.model.tokenize(batch)\n",
|
765 |
+
"\n",
|
766 |
+
" global_tokens = \"\".join(\n",
|
767 |
+
" [f\"<|bicodec_global_{i}|>\" for i in global_token_ids.squeeze().cpu().numpy()] # Squeeze batch dim\n",
|
768 |
+
" )\n",
|
769 |
+
" semantic_tokens = \"\".join(\n",
|
770 |
+
" [f\"<|bicodec_semantic_{i}|>\" for i in semantic_token_ids.squeeze().cpu().numpy()] # Squeeze batch dim\n",
|
771 |
+
" )\n",
|
772 |
+
"\n",
|
773 |
+
" inputs = [\n",
|
774 |
+
" \"<|task_tts|>\",\n",
|
775 |
+
" \"<|start_content|>\",\n",
|
776 |
+
" text,\n",
|
777 |
+
" \"<|end_content|>\",\n",
|
778 |
+
" \"<|start_global_token|>\",\n",
|
779 |
+
" global_tokens,\n",
|
780 |
+
" \"<|end_global_token|>\",\n",
|
781 |
+
" \"<|start_semantic_token|>\",\n",
|
782 |
+
" semantic_tokens,\n",
|
783 |
+
" \"<|end_semantic_token|>\",\n",
|
784 |
+
" \"<|im_end|>\"\n",
|
785 |
+
" ]\n",
|
786 |
+
" inputs = \"\".join(inputs)\n",
|
787 |
+
" return {\"text\": inputs}\n",
|
788 |
+
"\n",
|
789 |
+
"\n",
|
790 |
+
"dataset = dataset.map(formatting_audio_func, remove_columns=[\"audio\"])\n",
|
791 |
+
"print(\"Moving Bicodec model and Wav2Vec2Model to cpu.\")\n",
|
792 |
+
"audio_tokenizer.model.cpu()\n",
|
793 |
+
"audio_tokenizer.feature_extractor.cpu()\n",
|
794 |
+
"torch.cuda.empty_cache()"
|
795 |
+
]
|
796 |
+
},
|
797 |
+
{
|
798 |
+
"cell_type": "code",
|
799 |
+
"execution_count": 6,
|
800 |
+
"metadata": {},
|
801 |
+
"outputs": [
|
802 |
+
{
|
803 |
+
"name": "stdout",
|
804 |
+
"output_type": "stream",
|
805 |
+
"text": [
|
806 |
+
"Collecting torchaudio\n",
|
807 |
+
" Downloading torchaudio-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (7.2 kB)\n",
|
808 |
+
"Collecting torch==2.8.0 (from torchaudio)\n",
|
809 |
+
" Using cached torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)\n",
|
810 |
+
"Requirement already satisfied: filelock in /datadrive/jupyter/devbase/Balaji/TTS_ft/lib/python3.12/site-packages (from torch==2.8.0->torchaudio) (3.18.0)\n",
|
811 |
+
"Requirement already satisfied: typing-extensions>=4.10.0 in /datadrive/jupyter/devbase/Balaji/TTS_ft/lib/python3.12/site-packages (from torch==2.8.0->torchaudio) (4.14.1)\n",
|
812 |
+
"Requirement already satisfied: setuptools in /datadrive/jupyter/devbase/Balaji/TTS_ft/lib/python3.12/site-packages (from torch==2.8.0->torchaudio) (80.9.0)\n",
|
813 |
+
"Requirement already satisfied: sympy>=1.13.3 in /datadrive/jupyter/devbase/Balaji/TTS_ft/lib/python3.12/site-packages (from torch==2.8.0->torchaudio) (1.14.0)\n",
|
814 |
+
"Requirement already satisfied: networkx in /datadrive/jupyter/devbase/Balaji/TTS_ft/lib/python3.12/site-packages (from torch==2.8.0->torchaudio) (3.5)\n",
|
815 |
+
"Requirement already satisfied: jinja2 in /datadrive/jupyter/devbase/Balaji/TTS_ft/lib/python3.12/site-packages (from torch==2.8.0->torchaudio) (3.1.6)\n",
|
816 |
+
"Requirement already satisfied: fsspec in /datadrive/jupyter/devbase/Balaji/TTS_ft/lib/python3.12/site-packages (from torch==2.8.0->torchaudio) (2025.3.0)\n",
|
817 |
+
"Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch==2.8.0->torchaudio)\n",
|
818 |
+
" Using cached nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)\n",
|
819 |
+
"Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch==2.8.0->torchaudio)\n",
|
820 |
+
" Using cached nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)\n",
|
821 |
+
"Collecting nvidia-cuda-cupti-cu12==12.8.90 (from torch==2.8.0->torchaudio)\n",
|
822 |
+
" Using cached nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)\n",
|
823 |
+
"Collecting nvidia-cudnn-cu12==9.10.2.21 (from torch==2.8.0->torchaudio)\n",
|
824 |
+
" Using cached nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl.metadata (1.8 kB)\n",
|
825 |
+
"Collecting nvidia-cublas-cu12==12.8.4.1 (from torch==2.8.0->torchaudio)\n",
|
826 |
+
" Using cached nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl.metadata (1.7 kB)\n",
|
827 |
+
"Collecting nvidia-cufft-cu12==11.3.3.83 (from torch==2.8.0->torchaudio)\n",
|
828 |
+
" Using cached nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)\n",
|
829 |
+
"Collecting nvidia-curand-cu12==10.3.9.90 (from torch==2.8.0->torchaudio)\n",
|
830 |
+
" Using cached nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl.metadata (1.7 kB)\n",
|
831 |
+
"Collecting nvidia-cusolver-cu12==11.7.3.90 (from torch==2.8.0->torchaudio)\n",
|
832 |
+
" Using cached nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl.metadata (1.8 kB)\n",
|
833 |
+
"Collecting nvidia-cusparse-cu12==12.5.8.93 (from torch==2.8.0->torchaudio)\n",
|
834 |
+
" Using cached nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.8 kB)\n",
|
835 |
+
"Collecting nvidia-cusparselt-cu12==0.7.1 (from torch==2.8.0->torchaudio)\n",
|
836 |
+
" Using cached nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl.metadata (7.0 kB)\n",
|
837 |
+
"Collecting nvidia-nccl-cu12==2.27.3 (from torch==2.8.0->torchaudio)\n",
|
838 |
+
" Using cached nvidia_nccl_cu12-2.27.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)\n",
|
839 |
+
"Collecting nvidia-nvtx-cu12==12.8.90 (from torch==2.8.0->torchaudio)\n",
|
840 |
+
" Using cached nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.8 kB)\n",
|
841 |
+
"Collecting nvidia-nvjitlink-cu12==12.8.93 (from torch==2.8.0->torchaudio)\n",
|
842 |
+
" Using cached nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)\n",
|
843 |
+
"Collecting nvidia-cufile-cu12==1.13.1.3 (from torch==2.8.0->torchaudio)\n",
|
844 |
+
" Using cached nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)\n",
|
845 |
+
"Collecting triton==3.4.0 (from torch==2.8.0->torchaudio)\n",
|
846 |
+
" Using cached triton-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.7 kB)\n",
|
847 |
+
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /datadrive/jupyter/devbase/Balaji/TTS_ft/lib/python3.12/site-packages (from sympy>=1.13.3->torch==2.8.0->torchaudio) (1.3.0)\n",
|
848 |
+
"Requirement already satisfied: MarkupSafe>=2.0 in /datadrive/jupyter/devbase/Balaji/TTS_ft/lib/python3.12/site-packages (from jinja2->torch==2.8.0->torchaudio) (3.0.2)\n",
|
849 |
+
"Downloading torchaudio-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl (4.0 MB)\n",
|
850 |
+
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.0/4.0 MB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:01\u001b[0m\n",
|
851 |
+
"\u001b[?25hDownloading torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl (887.9 MB)\n",
|
852 |
+
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m887.9/887.9 MB\u001b[0m \u001b[31m979.7 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m eta \u001b[36m0:00:01\u001b[0m[36m0:00:19\u001b[0mm\n",
|
853 |
+
"\u001b[?25hDownloading nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl (594.3 MB)\n",
|
854 |
+
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m594.3/594.3 MB\u001b[0m \u001b[31m1.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:13\u001b[0m\n",
|
855 |
+
"\u001b[?25hDownloading nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (10.2 MB)\n",
|
856 |
+
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.2/10.2 MB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:01\u001b[0m\n",
|
857 |
+
"\u001b[?25hDownloading nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl (88.0 MB)\n",
|
858 |
+
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m88.0/88.0 MB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:02\u001b[0m\n",
|
859 |
+
"\u001b[?25hDownloading nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (954 kB)\n",
|
860 |
+
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m954.8/954.8 kB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m[36m0:00:01\u001b[0m[36m0:00:01\u001b[0m:01\u001b[0m\n",
|
861 |
+
"\u001b[?25hDownloading nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl (706.8 MB)\n",
|
862 |
+
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m706.8/706.8 MB\u001b[0m \u001b[31m1.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:15\u001b[0mm\n",
|
863 |
+
"\u001b[?25hDownloading nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (193.1 MB)\n",
|
864 |
+
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m193.1/193.1 MB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:05\u001b[0m\n",
|
865 |
+
"\u001b[?25hDownloading nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (1.2 MB)\n",
|
866 |
+
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n",
|
867 |
+
"\u001b[?25hDownloading nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl (63.6 MB)\n",
|
868 |
+
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m63.6/63.6 MB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:02\u001b[0m\n",
|
869 |
+
"\u001b[?25hDownloading nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl (267.5 MB)\n",
|
870 |
+
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m267.5/267.5 MB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:06\u001b[0m\n",
|
871 |
+
"\u001b[?25hDownloading nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (288.2 MB)\n",
|
872 |
+
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m288.2/288.2 MB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:07\u001b[0m\n",
|
873 |
+
"\u001b[?25hDownloading nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl (287.2 MB)\n",
|
874 |
+
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m287.2/287.2 MB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:06\u001b[0m\n",
|
875 |
+
"\u001b[?25hDownloading nvidia_nccl_cu12-2.27.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (322.4 MB)\n",
|
876 |
+
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m322.4/322.4 MB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:07\u001b[0m\n",
|
877 |
+
"\u001b[?25hDownloading nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl (39.3 MB)\n",
|
878 |
+
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m39.3/39.3 MB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:01\u001b[0m\n",
|
879 |
+
"\u001b[?25hDownloading nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (89 kB)\n",
|
880 |
+
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m90.0/90.0 kB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m:01\u001b[0m\n",
|
881 |
+
"\u001b[?25hDownloading triton-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (155.6 MB)\n",
|
882 |
+
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m155.6/155.6 MB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:04\u001b[0m\n",
|
883 |
+
"\u001b[?25hInstalling collected packages: nvidia-cusparselt-cu12, triton, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufile-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, nvidia-cusparse-cu12, nvidia-cufft-cu12, nvidia-cudnn-cu12, nvidia-cusolver-cu12, torch, torchaudio\n",
|
884 |
+
" Attempting uninstall: nvidia-cusparselt-cu12\n",
|
885 |
+
" Found existing installation: nvidia-cusparselt-cu12 0.6.3\n",
|
886 |
+
" Uninstalling nvidia-cusparselt-cu12-0.6.3:\n",
|
887 |
+
" Successfully uninstalled nvidia-cusparselt-cu12-0.6.3\n",
|
888 |
+
" Attempting uninstall: triton\n",
|
889 |
+
" Found existing installation: triton 3.3.1\n",
|
890 |
+
" Uninstalling triton-3.3.1:\n",
|
891 |
+
" Successfully uninstalled triton-3.3.1\n",
|
892 |
+
" Attempting uninstall: nvidia-nvtx-cu12\n",
|
893 |
+
" Found existing installation: nvidia-nvtx-cu12 12.6.77\n",
|
894 |
+
" Uninstalling nvidia-nvtx-cu12-12.6.77:\n",
|
895 |
+
" Successfully uninstalled nvidia-nvtx-cu12-12.6.77\n",
|
896 |
+
" Attempting uninstall: nvidia-nvjitlink-cu12\n",
|
897 |
+
" Found existing installation: nvidia-nvjitlink-cu12 12.6.85\n",
|
898 |
+
" Uninstalling nvidia-nvjitlink-cu12-12.6.85:\n",
|
899 |
+
" Successfully uninstalled nvidia-nvjitlink-cu12-12.6.85\n",
|
900 |
+
" Attempting uninstall: nvidia-nccl-cu12\n",
|
901 |
+
" Found existing installation: nvidia-nccl-cu12 2.26.2\n",
|
902 |
+
" Uninstalling nvidia-nccl-cu12-2.26.2:\n",
|
903 |
+
" Successfully uninstalled nvidia-nccl-cu12-2.26.2\n",
|
904 |
+
" Attempting uninstall: nvidia-curand-cu12\n",
|
905 |
+
" Found existing installation: nvidia-curand-cu12 10.3.7.77\n",
|
906 |
+
" Uninstalling nvidia-curand-cu12-10.3.7.77:\n",
|
907 |
+
" Successfully uninstalled nvidia-curand-cu12-10.3.7.77\n",
|
908 |
+
" Attempting uninstall: nvidia-cufile-cu12\n",
|
909 |
+
" Found existing installation: nvidia-cufile-cu12 1.11.1.6\n",
|
910 |
+
" Uninstalling nvidia-cufile-cu12-1.11.1.6:\n",
|
911 |
+
" Successfully uninstalled nvidia-cufile-cu12-1.11.1.6\n",
|
912 |
+
" Attempting uninstall: nvidia-cuda-runtime-cu12\n",
|
913 |
+
" Found existing installation: nvidia-cuda-runtime-cu12 12.6.77\n",
|
914 |
+
" Uninstalling nvidia-cuda-runtime-cu12-12.6.77:\n",
|
915 |
+
" Successfully uninstalled nvidia-cuda-runtime-cu12-12.6.77\n",
|
916 |
+
" Attempting uninstall: nvidia-cuda-nvrtc-cu12\n",
|
917 |
+
" Found existing installation: nvidia-cuda-nvrtc-cu12 12.6.77\n",
|
918 |
+
" Uninstalling nvidia-cuda-nvrtc-cu12-12.6.77:\n",
|
919 |
+
" Successfully uninstalled nvidia-cuda-nvrtc-cu12-12.6.77\n",
|
920 |
+
" Attempting uninstall: nvidia-cuda-cupti-cu12\n",
|
921 |
+
" Found existing installation: nvidia-cuda-cupti-cu12 12.6.80\n",
|
922 |
+
" Uninstalling nvidia-cuda-cupti-cu12-12.6.80:\n",
|
923 |
+
" Successfully uninstalled nvidia-cuda-cupti-cu12-12.6.80\n",
|
924 |
+
" Attempting uninstall: nvidia-cublas-cu12\n",
|
925 |
+
" Found existing installation: nvidia-cublas-cu12 12.6.4.1\n",
|
926 |
+
" Uninstalling nvidia-cublas-cu12-12.6.4.1:\n",
|
927 |
+
" Successfully uninstalled nvidia-cublas-cu12-12.6.4.1\n",
|
928 |
+
" Attempting uninstall: nvidia-cusparse-cu12\n",
|
929 |
+
" Found existing installation: nvidia-cusparse-cu12 12.5.4.2\n",
|
930 |
+
" Uninstalling nvidia-cusparse-cu12-12.5.4.2:\n",
|
931 |
+
" Successfully uninstalled nvidia-cusparse-cu12-12.5.4.2\n",
|
932 |
+
" Attempting uninstall: nvidia-cufft-cu12\n",
|
933 |
+
" Found existing installation: nvidia-cufft-cu12 11.3.0.4\n",
|
934 |
+
" Uninstalling nvidia-cufft-cu12-11.3.0.4:\n",
|
935 |
+
" Successfully uninstalled nvidia-cufft-cu12-11.3.0.4\n",
|
936 |
+
" Attempting uninstall: nvidia-cudnn-cu12\n",
|
937 |
+
" Found existing installation: nvidia-cudnn-cu12 9.5.1.17\n",
|
938 |
+
" Uninstalling nvidia-cudnn-cu12-9.5.1.17:\n",
|
939 |
+
" Successfully uninstalled nvidia-cudnn-cu12-9.5.1.17\n",
|
940 |
+
" Attempting uninstall: nvidia-cusolver-cu12\n",
|
941 |
+
" Found existing installation: nvidia-cusolver-cu12 11.7.1.2\n",
|
942 |
+
" Uninstalling nvidia-cusolver-cu12-11.7.1.2:\n",
|
943 |
+
" Successfully uninstalled nvidia-cusolver-cu12-11.7.1.2\n",
|
944 |
+
" Attempting uninstall: torch\n",
|
945 |
+
" Found existing installation: torch 2.7.1\n",
|
946 |
+
" Uninstalling torch-2.7.1:\n",
|
947 |
+
" Successfully uninstalled torch-2.7.1\n",
|
948 |
+
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
|
949 |
+
"xformers 0.0.31.post1 requires torch==2.7.1, but you have torch 2.8.0 which is incompatible.\n",
|
950 |
+
"torchvision 0.22.1 requires torch==2.7.1, but you have torch 2.8.0 which is incompatible.\u001b[0m\u001b[31m\n",
|
951 |
+
"\u001b[0mSuccessfully installed nvidia-cublas-cu12-12.8.4.1 nvidia-cuda-cupti-cu12-12.8.90 nvidia-cuda-nvrtc-cu12-12.8.93 nvidia-cuda-runtime-cu12-12.8.90 nvidia-cudnn-cu12-9.10.2.21 nvidia-cufft-cu12-11.3.3.83 nvidia-cufile-cu12-1.13.1.3 nvidia-curand-cu12-10.3.9.90 nvidia-cusolver-cu12-11.7.3.90 nvidia-cusparse-cu12-12.5.8.93 nvidia-cusparselt-cu12-0.7.1 nvidia-nccl-cu12-2.27.3 nvidia-nvjitlink-cu12-12.8.93 nvidia-nvtx-cu12-12.8.90 torch-2.8.0 torchaudio-2.8.0 triton-3.4.0\n",
|
952 |
+
"Note: you may need to restart the kernel to use updated packages.\n"
|
953 |
+
]
|
954 |
+
}
|
955 |
+
],
|
956 |
+
"source": [
|
957 |
+
"%pip install torchaudio"
|
958 |
+
]
|
959 |
+
},
|
960 |
+
{
|
961 |
+
"cell_type": "markdown",
|
962 |
+
"metadata": {
|
963 |
+
"id": "idAEIeSQ3xdS"
|
964 |
+
},
|
965 |
+
"source": [
|
966 |
+
"<a name=\"Train\"></a>\n",
|
967 |
+
"### Train the model\n",
|
968 |
+
"Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!"
|
969 |
+
]
|
970 |
+
},
|
971 |
+
{
|
972 |
+
"cell_type": "code",
|
973 |
+
"execution_count": null,
|
974 |
+
"metadata": {
|
975 |
+
"id": "95_Nn-89DhsL"
|
976 |
+
},
|
977 |
+
"outputs": [],
|
978 |
+
"source": [
|
979 |
+
"from trl import SFTConfig, SFTTrainer\n",
|
980 |
+
"trainer = SFTTrainer(\n",
|
981 |
+
" model = model,\n",
|
982 |
+
" tokenizer = tokenizer,\n",
|
983 |
+
" train_dataset = dataset,\n",
|
984 |
+
" dataset_text_field = \"text\",\n",
|
985 |
+
" max_seq_length = max_seq_length,\n",
|
986 |
+
" packing = False, # Can make training 5x faster for short sequences.\n",
|
987 |
+
" args = SFTConfig(\n",
|
988 |
+
" per_device_train_batch_size = 2,\n",
|
989 |
+
" gradient_accumulation_steps = 4,\n",
|
990 |
+
" warmup_steps = 5,\n",
|
991 |
+
" num_train_epochs = 5, # Set this for 1 full training run.\n",
|
992 |
+
" #max_steps = 60,\n",
|
993 |
+
" learning_rate = 1e-5,\n",
|
994 |
+
" fp16 = False, # We're doing full float32 s disable mixed precision\n",
|
995 |
+
" bf16 = False, # We're doing full float32 s disable mixed precision\n",
|
996 |
+
" logging_steps = 1,\n",
|
997 |
+
" optim = \"adamw_8bit\",\n",
|
998 |
+
" weight_decay = 0.01,\n",
|
999 |
+
" lr_scheduler_type = \"linear\",\n",
|
1000 |
+
" seed = 3407,\n",
|
1001 |
+
" output_dir = \"outputs\",\n",
|
1002 |
+
" report_to = \"tensorboard\", # Use this for WandB etc\n",
|
1003 |
+
" ),\n",
|
1004 |
+
")"
|
1005 |
+
]
|
1006 |
+
},
|
1007 |
+
{
|
1008 |
+
"cell_type": "code",
|
1009 |
+
"execution_count": null,
|
1010 |
+
"metadata": {
|
1011 |
+
"id": "2ejIt2xSNKKp"
|
1012 |
+
},
|
1013 |
+
"outputs": [],
|
1014 |
+
"source": [
|
1015 |
+
"# @title Show current memory stats\n",
|
1016 |
+
"gpu_stats = torch.cuda.get_device_properties(0)\n",
|
1017 |
+
"start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
|
1018 |
+
"max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)\n",
|
1019 |
+
"print(f\"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.\")\n",
|
1020 |
+
"print(f\"{start_gpu_memory} GB of memory reserved.\")"
|
1021 |
+
]
|
1022 |
+
},
|
1023 |
+
{
|
1024 |
+
"cell_type": "code",
|
1025 |
+
"execution_count": null,
|
1026 |
+
"metadata": {
|
1027 |
+
"id": "yqxqAZ7KJ4oL"
|
1028 |
+
},
|
1029 |
+
"outputs": [],
|
1030 |
+
"source": [
|
1031 |
+
"trainer_stats = trainer.train()"
|
1032 |
+
]
|
1033 |
+
},
|
1034 |
+
{
|
1035 |
+
"cell_type": "code",
|
1036 |
+
"execution_count": null,
|
1037 |
+
"metadata": {
|
1038 |
+
"cellView": "form",
|
1039 |
+
"id": "pCqnaKmlO1U9"
|
1040 |
+
},
|
1041 |
+
"outputs": [],
|
1042 |
+
"source": [
|
1043 |
+
"# @title Show final memory and time stats\n",
|
1044 |
+
"used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
|
1045 |
+
"used_memory_for_lora = round(used_memory - start_gpu_memory, 3)\n",
|
1046 |
+
"used_percentage = round(used_memory / max_memory * 100, 3)\n",
|
1047 |
+
"lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)\n",
|
1048 |
+
"print(f\"{trainer_stats.metrics['train_runtime']} seconds used for training.\")\n",
|
1049 |
+
"print(\n",
|
1050 |
+
" f\"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.\"\n",
|
1051 |
+
")\n",
|
1052 |
+
"print(f\"Peak reserved memory = {used_memory} GB.\")\n",
|
1053 |
+
"print(f\"Peak reserved memory for training = {used_memory_for_lora} GB.\")\n",
|
1054 |
+
"print(f\"Peak reserved memory % of max memory = {used_percentage} %.\")\n",
|
1055 |
+
"print(f\"Peak reserved memory for training % of max memory = {lora_percentage} %.\")"
|
1056 |
+
]
|
1057 |
+
},
|
1058 |
+
{
|
1059 |
+
"cell_type": "markdown",
|
1060 |
+
"metadata": {
|
1061 |
+
"id": "ekOmTR1hSNcr"
|
1062 |
+
},
|
1063 |
+
"source": [
|
1064 |
+
"<a name=\"Inference\"></a>\n",
|
1065 |
+
"### Inference\n",
|
1066 |
+
"Let's run the model! You can change the prompts\n"
|
1067 |
+
]
|
1068 |
+
},
|
1069 |
+
{
|
1070 |
+
"cell_type": "code",
|
1071 |
+
"execution_count": null,
|
1072 |
+
"metadata": {
|
1073 |
+
"id": "apUdB40Ep6Ki"
|
1074 |
+
},
|
1075 |
+
"outputs": [],
|
1076 |
+
"source": [
|
1077 |
+
"input_text = \"Hey there my name is Elise, <giggles> and I'm a speech generation model that can sound like a person.\"\n",
|
1078 |
+
"\n",
|
1079 |
+
"chosen_voice = None # None for single-speaker"
|
1080 |
+
]
|
1081 |
+
},
|
1082 |
+
{
|
1083 |
+
"cell_type": "code",
|
1084 |
+
"execution_count": null,
|
1085 |
+
"metadata": {
|
1086 |
+
"cellView": "form",
|
1087 |
+
"execution": {
|
1088 |
+
"iopub.execute_input": "2025-03-22T00:52:35.040842Z",
|
1089 |
+
"iopub.status.busy": "2025-03-22T00:52:35.040125Z",
|
1090 |
+
"iopub.status.idle": "2025-03-22T00:52:35.050560Z",
|
1091 |
+
"shell.execute_reply": "2025-03-22T00:52:35.049663Z",
|
1092 |
+
"shell.execute_reply.started": "2025-03-22T00:52:35.040818Z"
|
1093 |
+
},
|
1094 |
+
"id": "krYI8PrRJ6MX"
|
1095 |
+
},
|
1096 |
+
"outputs": [],
|
1097 |
+
"source": [
|
1098 |
+
"#@title Run Inference\n",
|
1099 |
+
"\n",
|
1100 |
+
"import torch\n",
|
1101 |
+
"import re\n",
|
1102 |
+
"import numpy as np\n",
|
1103 |
+
"from typing import Dict, Any\n",
|
1104 |
+
"import torchaudio.transforms as T\n",
|
1105 |
+
"\n",
|
1106 |
+
"FastModel.for_inference(model) # Enable native 2x faster inference\n",
|
1107 |
+
"\n",
|
1108 |
+
"@torch.inference_mode()\n",
|
1109 |
+
"def generate_speech_from_text(\n",
|
1110 |
+
" text: str,\n",
|
1111 |
+
" temperature: float = 0.8, # Generation temperature\n",
|
1112 |
+
" top_k: int = 50, # Generation top_k\n",
|
1113 |
+
" top_p: float = 1, # Generation top_p\n",
|
1114 |
+
" max_new_audio_tokens: int = 2048, # Max tokens for audio part\n",
|
1115 |
+
" device: torch.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
1116 |
+
") -> np.ndarray:\n",
|
1117 |
+
" \"\"\"\n",
|
1118 |
+
" Generates speech audio from text using default voice control parameters.\n",
|
1119 |
+
"\n",
|
1120 |
+
" Args:\n",
|
1121 |
+
" text (str): The text input to be converted to speech.\n",
|
1122 |
+
" temperature (float): Sampling temperature for generation.\n",
|
1123 |
+
" top_k (int): Top-k sampling parameter.\n",
|
1124 |
+
" top_p (float): Top-p (nucleus) sampling parameter.\n",
|
1125 |
+
" max_new_audio_tokens (int): Max number of new tokens to generate (limits audio length).\n",
|
1126 |
+
" device (torch.device): Device to run inference on.\n",
|
1127 |
+
"\n",
|
1128 |
+
" Returns:\n",
|
1129 |
+
" np.ndarray: Generated waveform as a NumPy array.\n",
|
1130 |
+
" \"\"\"\n",
|
1131 |
+
"\n",
|
1132 |
+
" torch.compiler.reset()\n",
|
1133 |
+
"\n",
|
1134 |
+
" prompt = \"\".join([\n",
|
1135 |
+
" \"<|task_tts|>\",\n",
|
1136 |
+
" \"<|start_content|>\",\n",
|
1137 |
+
" text,\n",
|
1138 |
+
" \"<|end_content|>\",\n",
|
1139 |
+
" \"<|start_global_token|>\"\n",
|
1140 |
+
" ])\n",
|
1141 |
+
"\n",
|
1142 |
+
" model_inputs = tokenizer([prompt], return_tensors=\"pt\").to(device)\n",
|
1143 |
+
"\n",
|
1144 |
+
" print(\"Generating token sequence...\")\n",
|
1145 |
+
" generated_ids = model.generate(\n",
|
1146 |
+
" **model_inputs,\n",
|
1147 |
+
" max_new_tokens=max_new_audio_tokens, # Limit generation length\n",
|
1148 |
+
" do_sample=True,\n",
|
1149 |
+
" temperature=temperature,\n",
|
1150 |
+
" top_k=top_k,\n",
|
1151 |
+
" top_p=top_p,\n",
|
1152 |
+
" eos_token_id=tokenizer.eos_token_id, # Stop token\n",
|
1153 |
+
" pad_token_id=tokenizer.pad_token_id # Use models pad token id\n",
|
1154 |
+
" )\n",
|
1155 |
+
" print(\"Token sequence generated.\")\n",
|
1156 |
+
"\n",
|
1157 |
+
"\n",
|
1158 |
+
" generated_ids_trimmed = generated_ids[:, model_inputs.input_ids.shape[1]:]\n",
|
1159 |
+
"\n",
|
1160 |
+
"\n",
|
1161 |
+
" predicts_text = tokenizer.batch_decode(generated_ids_trimmed, skip_special_tokens=False)[0]\n",
|
1162 |
+
" # print(f\"\\nGenerated Text (for parsing):\\n{predicts_text}\\n\") # Debugging\n",
|
1163 |
+
"\n",
|
1164 |
+
" # Extract semantic token IDs using regex\n",
|
1165 |
+
" semantic_matches = re.findall(r\"<\\|bicodec_semantic_(\\d+)\\|>\", predicts_text)\n",
|
1166 |
+
" if not semantic_matches:\n",
|
1167 |
+
" print(\"Warning: No semantic tokens found in the generated output.\")\n",
|
1168 |
+
" # Handle appropriately - perhaps return silence or raise error\n",
|
1169 |
+
" return np.array([], dtype=np.float32)\n",
|
1170 |
+
"\n",
|
1171 |
+
" pred_semantic_ids = torch.tensor([int(token) for token in semantic_matches]).long().unsqueeze(0) # Add batch dim\n",
|
1172 |
+
"\n",
|
1173 |
+
" # Extract global token IDs using regex (assuming controllable mode also generates these)\n",
|
1174 |
+
" global_matches = re.findall(r\"<\\|bicodec_global_(\\d+)\\|>\", predicts_text)\n",
|
1175 |
+
" if not global_matches:\n",
|
1176 |
+
" print(\"Warning: No global tokens found in the generated output (controllable mode). Might use defaults or fail.\")\n",
|
1177 |
+
" pred_global_ids = torch.zeros((1, 1), dtype=torch.long)\n",
|
1178 |
+
" else:\n",
|
1179 |
+
" pred_global_ids = torch.tensor([int(token) for token in global_matches]).long().unsqueeze(0) # Add batch dim\n",
|
1180 |
+
"\n",
|
1181 |
+
" pred_global_ids = pred_global_ids.unsqueeze(0) # Shape becomes (1, 1, N_global)\n",
|
1182 |
+
"\n",
|
1183 |
+
" print(f\"Found {pred_semantic_ids.shape[1]} semantic tokens.\")\n",
|
1184 |
+
" print(f\"Found {pred_global_ids.shape[2]} global tokens.\")\n",
|
1185 |
+
"\n",
|
1186 |
+
"\n",
|
1187 |
+
" # 5. Detokenize using BiCodecTokenizer\n",
|
1188 |
+
" print(\"Detokenizing audio tokens...\")\n",
|
1189 |
+
" # Ensure audio_tokenizer and its internal model are on the correct device\n",
|
1190 |
+
" audio_tokenizer.device = device\n",
|
1191 |
+
" audio_tokenizer.model.to(device)\n",
|
1192 |
+
" # Squeeze the extra dimension from global tokens as seen in SparkTTS example\n",
|
1193 |
+
" wav_np = audio_tokenizer.detokenize(\n",
|
1194 |
+
" pred_global_ids.to(device).squeeze(0), # Shape (1, N_global)\n",
|
1195 |
+
" pred_semantic_ids.to(device) # Shape (1, N_semantic)\n",
|
1196 |
+
" )\n",
|
1197 |
+
" print(\"Detokenization complete.\")\n",
|
1198 |
+
"\n",
|
1199 |
+
" return wav_np\n",
|
1200 |
+
"\n",
|
1201 |
+
"if __name__ == \"__main__\":\n",
|
1202 |
+
" print(f\"Generating speech for: '{input_text}'\")\n",
|
1203 |
+
" text = f\"{chosen_voice}: \" + input_text if chosen_voice else input_text\n",
|
1204 |
+
" generated_waveform = generate_speech_from_text(input_text)\n",
|
1205 |
+
"\n",
|
1206 |
+
" if generated_waveform.size > 0:\n",
|
1207 |
+
" import soundfile as sf\n",
|
1208 |
+
" output_filename = \"generated_speech_controllable.wav\"\n",
|
1209 |
+
" sample_rate = audio_tokenizer.config.get(\"sample_rate\", 16000)\n",
|
1210 |
+
" sf.write(output_filename, generated_waveform, sample_rate)\n",
|
1211 |
+
" print(f\"Audio saved to {output_filename}\")\n",
|
1212 |
+
"\n",
|
1213 |
+
" # Optional: Play in notebook\n",
|
1214 |
+
" from IPython.display import Audio, display\n",
|
1215 |
+
" display(Audio(generated_waveform, rate=sample_rate))\n",
|
1216 |
+
" else:\n",
|
1217 |
+
" print(\"Audio generation failed (no tokens found?).\")"
|
1218 |
+
]
|
1219 |
+
},
|
1220 |
+
{
|
1221 |
+
"cell_type": "markdown",
|
1222 |
+
"metadata": {
|
1223 |
+
"id": "uMuVrWbjAzhc"
|
1224 |
+
},
|
1225 |
+
"source": [
|
1226 |
+
"<a name=\"Save\"></a>\n",
|
1227 |
+
"### Saving, loading finetuned models\n",
|
1228 |
+
"To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.\n",
|
1229 |
+
"\n",
|
1230 |
+
"**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!"
|
1231 |
+
]
|
1232 |
+
},
|
1233 |
+
{
|
1234 |
+
"cell_type": "code",
|
1235 |
+
"execution_count": null,
|
1236 |
+
"metadata": {
|
1237 |
+
"id": "upcOlWe7A1vc"
|
1238 |
+
},
|
1239 |
+
"outputs": [],
|
1240 |
+
"source": [
|
1241 |
+
"model.save_pretrained(\"lora_model\") # Local saving\n",
|
1242 |
+
"tokenizer.save_pretrained(\"lora_model\")\n",
|
1243 |
+
"# model.push_to_hub(\"your_name/lora_model\", token = \"...\") # Online saving\n",
|
1244 |
+
"# tokenizer.push_to_hub(\"your_name/lora_model\", token = \"...\") # Online saving"
|
1245 |
+
]
|
1246 |
+
},
|
1247 |
+
{
|
1248 |
+
"cell_type": "markdown",
|
1249 |
+
"metadata": {
|
1250 |
+
"id": "f422JgM9sdVT"
|
1251 |
+
},
|
1252 |
+
"source": [
|
1253 |
+
"\n",
|
1254 |
+
"### Saving to float16\n",
|
1255 |
+
"\n",
|
1256 |
+
"We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens."
|
1257 |
+
]
|
1258 |
+
},
|
1259 |
+
{
|
1260 |
+
"cell_type": "code",
|
1261 |
+
"execution_count": null,
|
1262 |
+
"metadata": {
|
1263 |
+
"colab": {
|
1264 |
+
"base_uri": "https://localhost:8080/"
|
1265 |
+
},
|
1266 |
+
"id": "iHjt_SMYsd3P",
|
1267 |
+
"outputId": "bd8cccb7-6b95-45bf-80da-de120988447e"
|
1268 |
+
},
|
1269 |
+
"outputs": [
|
1270 |
+
{
|
1271 |
+
"name": "stderr",
|
1272 |
+
"output_type": "stream",
|
1273 |
+
"text": [
|
1274 |
+
"Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.\n",
|
1275 |
+
"We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.\n",
|
1276 |
+
"To force `safe_serialization`, set it to `None` instead.\n",
|
1277 |
+
"Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded\n",
|
1278 |
+
"model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.\n",
|
1279 |
+
"Unsloth: Will remove a cached repo with size 15.1G\n"
|
1280 |
+
]
|
1281 |
+
},
|
1282 |
+
{
|
1283 |
+
"name": "stdout",
|
1284 |
+
"output_type": "stream",
|
1285 |
+
"text": [
|
1286 |
+
"Unsloth: Merging 4bit and LoRA weights to 16bit...\n",
|
1287 |
+
"Unsloth: Will use up to 3.99 out of 12.67 RAM for saving.\n",
|
1288 |
+
"Unsloth: Saving model... This might take 5 minutes ...\n"
|
1289 |
+
]
|
1290 |
+
},
|
1291 |
+
{
|
1292 |
+
"name": "stderr",
|
1293 |
+
"output_type": "stream",
|
1294 |
+
"text": [
|
1295 |
+
"100%|██████████| 28/28 [00:01<00:00, 27.83it/s]\n"
|
1296 |
+
]
|
1297 |
+
},
|
1298 |
+
{
|
1299 |
+
"name": "stdout",
|
1300 |
+
"output_type": "stream",
|
1301 |
+
"text": [
|
1302 |
+
"Unsloth: Saving tokenizer... Done.\n",
|
1303 |
+
"Unsloth: Saving model/pytorch_model-00001-of-00002.bin...\n",
|
1304 |
+
"Unsloth: Saving model/pytorch_model-00002-of-00002.bin...\n",
|
1305 |
+
"Done.\n"
|
1306 |
+
]
|
1307 |
+
}
|
1308 |
+
],
|
1309 |
+
"source": [
|
1310 |
+
"# Merge to 16bit\n",
|
1311 |
+
"if False: model.save_pretrained_merged(\"model\", tokenizer, save_method = \"merged_16bit\",)\n",
|
1312 |
+
"if False: model.push_to_hub_merged(\"hf/model\", tokenizer, save_method = \"merged_16bit\", token = \"\")\n",
|
1313 |
+
"\n",
|
1314 |
+
"# Merge to 4bit\n",
|
1315 |
+
"if False: model.save_pretrained_merged(\"model\", tokenizer, save_method = \"merged_4bit\",)\n",
|
1316 |
+
"if False: model.push_to_hub_merged(\"hf/model\", tokenizer, save_method = \"merged_4bit\", token = \"\")\n",
|
1317 |
+
"\n",
|
1318 |
+
"# Just LoRA adapters\n",
|
1319 |
+
"if False:\n",
|
1320 |
+
" model.save_pretrained(\"model\")\n",
|
1321 |
+
" tokenizer.save_pretrained(\"model\")\n",
|
1322 |
+
"if False:\n",
|
1323 |
+
" model.push_to_hub(\"hf/model\", token = \"\")\n",
|
1324 |
+
" tokenizer.push_to_hub(\"hf/model\", token = \"\")\n"
|
1325 |
+
]
|
1326 |
+
},
|
1327 |
+
{
|
1328 |
+
"cell_type": "markdown",
|
1329 |
+
"metadata": {
|
1330 |
+
"id": "egOSE7Cgynx7"
|
1331 |
+
},
|
1332 |
+
"source": [
|
1333 |
+
"And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/unsloth) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!\n",
|
1334 |
+
"\n",
|
1335 |
+
"Some other links:\n",
|
1336 |
+
"1. Train your own reasoning model - Llama GRPO notebook [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-GRPO.ipynb)\n",
|
1337 |
+
"2. Saving finetunes to Ollama. [Free notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)\n",
|
1338 |
+
"3. Llama 3.2 Vision finetuning - Radiography use case. [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb)\n",
|
1339 |
+
"6. See notebooks for DPO, ORPO, Continued pretraining, conversational finetuning and more on our [documentation](https://docs.unsloth.ai/get-started/unsloth-notebooks)!\n",
|
1340 |
+
"\n",
|
1341 |
+
"<div class=\"align-center\">\n",
|
1342 |
+
" <a href=\"https://unsloth.ai\"><img src=\"https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png\" width=\"115\"></a>\n",
|
1343 |
+
" <a href=\"https://discord.gg/unsloth\"><img src=\"https://github.com/unslothai/unsloth/raw/main/images/Discord.png\" width=\"145\"></a>\n",
|
1344 |
+
" <a href=\"https://docs.unsloth.ai/\"><img src=\"https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true\" width=\"125\"></a>\n",
|
1345 |
+
"\n",
|
1346 |
+
" Join Discord if you need help + ⭐️ <i>Star us on <a href=\"https://github.com/unslothai/unsloth\">Github</a> </i> ⭐️\n",
|
1347 |
+
"</div>\n"
|
1348 |
+
]
|
1349 |
+
}
|
1350 |
+
],
|
1351 |
+
"metadata": {
|
1352 |
+
"accelerator": "GPU",
|
1353 |
+
"colab": {
|
1354 |
+
"gpuType": "T4",
|
1355 |
+
"provenance": []
|
1356 |
+
},
|
1357 |
+
"kaggle": {
|
1358 |
+
"accelerator": "nvidiaTeslaT4",
|
1359 |
+
"dataSources": [],
|
1360 |
+
"dockerImageVersionId": 30919,
|
1361 |
+
"isGpuEnabled": true,
|
1362 |
+
"isInternetEnabled": true,
|
1363 |
+
"language": "python",
|
1364 |
+
"sourceType": "notebook"
|
1365 |
+
},
|
1366 |
+
"kernelspec": {
|
1367 |
+
"display_name": "TTS_ft",
|
1368 |
+
"language": "python",
|
1369 |
+
"name": "tts_ft"
|
1370 |
+
},
|
1371 |
+
"language_info": {
|
1372 |
+
"codemirror_mode": {
|
1373 |
+
"name": "ipython",
|
1374 |
+
"version": 3
|
1375 |
+
},
|
1376 |
+
"file_extension": ".py",
|
1377 |
+
"mimetype": "text/x-python",
|
1378 |
+
"name": "python",
|
1379 |
+
"nbconvert_exporter": "python",
|
1380 |
+
"pygments_lexer": "ipython3",
|
1381 |
+
"version": "3.12.3"
|
1382 |
+
},
|
1383 |
+
"widgets": {
|
1384 |
+
"application/vnd.jupyter.widget-state+json": {
|
1385 |
+
"0474debc340943bd85f3daf92aebf7aa": {
|
1386 |
+
"model_module": "@jupyter-widgets/controls",
|
1387 |
+
"model_module_version": "1.5.0",
|
1388 |
+
"model_name": "FloatProgressModel",
|
1389 |
+
"state": {
|
1390 |
+
"_dom_classes": [],
|
1391 |
+
"_model_module": "@jupyter-widgets/controls",
|
1392 |
+
"_model_module_version": "1.5.0",
|
1393 |
+
"_model_name": "FloatProgressModel",
|
1394 |
+
"_view_count": null,
|
1395 |
+
"_view_module": "@jupyter-widgets/controls",
|
1396 |
+
"_view_module_version": "1.5.0",
|
1397 |
+
"_view_name": "ProgressView",
|
1398 |
+
"bar_style": "",
|
1399 |
+
"description": "",
|
1400 |
+
"description_tooltip": null,
|
1401 |
+
"layout": "IPY_MODEL_0de4d0f282404edfbc191dca73f15f35",
|
1402 |
+
"max": 401,
|
1403 |
+
"min": 0,
|
1404 |
+
"orientation": "horizontal",
|
1405 |
+
"style": "IPY_MODEL_e58b5ad2f781475d8af2ddb38009baa6",
|
1406 |
+
"value": 354
|
1407 |
+
}
|
1408 |
+
},
|
1409 |
+
"0de4d0f282404edfbc191dca73f15f35": {
|
1410 |
+
"model_module": "@jupyter-widgets/base",
|
1411 |
+
"model_module_version": "1.2.0",
|
1412 |
+
"model_name": "LayoutModel",
|
1413 |
+
"state": {
|
1414 |
+
"_model_module": "@jupyter-widgets/base",
|
1415 |
+
"_model_module_version": "1.2.0",
|
1416 |
+
"_model_name": "LayoutModel",
|
1417 |
+
"_view_count": null,
|
1418 |
+
"_view_module": "@jupyter-widgets/base",
|
1419 |
+
"_view_module_version": "1.2.0",
|
1420 |
+
"_view_name": "LayoutView",
|
1421 |
+
"align_content": null,
|
1422 |
+
"align_items": null,
|
1423 |
+
"align_self": null,
|
1424 |
+
"border": null,
|
1425 |
+
"bottom": null,
|
1426 |
+
"display": null,
|
1427 |
+
"flex": null,
|
1428 |
+
"flex_flow": null,
|
1429 |
+
"grid_area": null,
|
1430 |
+
"grid_auto_columns": null,
|
1431 |
+
"grid_auto_flow": null,
|
1432 |
+
"grid_auto_rows": null,
|
1433 |
+
"grid_column": null,
|
1434 |
+
"grid_gap": null,
|
1435 |
+
"grid_row": null,
|
1436 |
+
"grid_template_areas": null,
|
1437 |
+
"grid_template_columns": null,
|
1438 |
+
"grid_template_rows": null,
|
1439 |
+
"height": null,
|
1440 |
+
"justify_content": null,
|
1441 |
+
"justify_items": null,
|
1442 |
+
"left": null,
|
1443 |
+
"margin": null,
|
1444 |
+
"max_height": null,
|
1445 |
+
"max_width": null,
|
1446 |
+
"min_height": null,
|
1447 |
+
"min_width": null,
|
1448 |
+
"object_fit": null,
|
1449 |
+
"object_position": null,
|
1450 |
+
"order": null,
|
1451 |
+
"overflow": null,
|
1452 |
+
"overflow_x": null,
|
1453 |
+
"overflow_y": null,
|
1454 |
+
"padding": null,
|
1455 |
+
"right": null,
|
1456 |
+
"top": null,
|
1457 |
+
"visibility": null,
|
1458 |
+
"width": null
|
1459 |
+
}
|
1460 |
+
},
|
1461 |
+
"2315228ff2b141afabe1263471f5364b": {
|
1462 |
+
"model_module": "@jupyter-widgets/controls",
|
1463 |
+
"model_module_version": "1.5.0",
|
1464 |
+
"model_name": "HTMLModel",
|
1465 |
+
"state": {
|
1466 |
+
"_dom_classes": [],
|
1467 |
+
"_model_module": "@jupyter-widgets/controls",
|
1468 |
+
"_model_module_version": "1.5.0",
|
1469 |
+
"_model_name": "HTMLModel",
|
1470 |
+
"_view_count": null,
|
1471 |
+
"_view_module": "@jupyter-widgets/controls",
|
1472 |
+
"_view_module_version": "1.5.0",
|
1473 |
+
"_view_name": "HTMLView",
|
1474 |
+
"description": "",
|
1475 |
+
"description_tooltip": null,
|
1476 |
+
"layout": "IPY_MODEL_426eb100a94642f79e6b99777406a265",
|
1477 |
+
"placeholder": "",
|
1478 |
+
"style": "IPY_MODEL_a36b5cf197dd4bd9a7f70aa6671b804c",
|
1479 |
+
"value": "Map: 88%"
|
1480 |
+
}
|
1481 |
+
},
|
1482 |
+
"33fbacbb2aa146cd90586357eec1dc3e": {
|
1483 |
+
"model_module": "@jupyter-widgets/base",
|
1484 |
+
"model_module_version": "1.2.0",
|
1485 |
+
"model_name": "LayoutModel",
|
1486 |
+
"state": {
|
1487 |
+
"_model_module": "@jupyter-widgets/base",
|
1488 |
+
"_model_module_version": "1.2.0",
|
1489 |
+
"_model_name": "LayoutModel",
|
1490 |
+
"_view_count": null,
|
1491 |
+
"_view_module": "@jupyter-widgets/base",
|
1492 |
+
"_view_module_version": "1.2.0",
|
1493 |
+
"_view_name": "LayoutView",
|
1494 |
+
"align_content": null,
|
1495 |
+
"align_items": null,
|
1496 |
+
"align_self": null,
|
1497 |
+
"border": null,
|
1498 |
+
"bottom": null,
|
1499 |
+
"display": null,
|
1500 |
+
"flex": null,
|
1501 |
+
"flex_flow": null,
|
1502 |
+
"grid_area": null,
|
1503 |
+
"grid_auto_columns": null,
|
1504 |
+
"grid_auto_flow": null,
|
1505 |
+
"grid_auto_rows": null,
|
1506 |
+
"grid_column": null,
|
1507 |
+
"grid_gap": null,
|
1508 |
+
"grid_row": null,
|
1509 |
+
"grid_template_areas": null,
|
1510 |
+
"grid_template_columns": null,
|
1511 |
+
"grid_template_rows": null,
|
1512 |
+
"height": null,
|
1513 |
+
"justify_content": null,
|
1514 |
+
"justify_items": null,
|
1515 |
+
"left": null,
|
1516 |
+
"margin": null,
|
1517 |
+
"max_height": null,
|
1518 |
+
"max_width": null,
|
1519 |
+
"min_height": null,
|
1520 |
+
"min_width": null,
|
1521 |
+
"object_fit": null,
|
1522 |
+
"object_position": null,
|
1523 |
+
"order": null,
|
1524 |
+
"overflow": null,
|
1525 |
+
"overflow_x": null,
|
1526 |
+
"overflow_y": null,
|
1527 |
+
"padding": null,
|
1528 |
+
"right": null,
|
1529 |
+
"top": null,
|
1530 |
+
"visibility": null,
|
1531 |
+
"width": null
|
1532 |
+
}
|
1533 |
+
},
|
1534 |
+
"426eb100a94642f79e6b99777406a265": {
|
1535 |
+
"model_module": "@jupyter-widgets/base",
|
1536 |
+
"model_module_version": "1.2.0",
|
1537 |
+
"model_name": "LayoutModel",
|
1538 |
+
"state": {
|
1539 |
+
"_model_module": "@jupyter-widgets/base",
|
1540 |
+
"_model_module_version": "1.2.0",
|
1541 |
+
"_model_name": "LayoutModel",
|
1542 |
+
"_view_count": null,
|
1543 |
+
"_view_module": "@jupyter-widgets/base",
|
1544 |
+
"_view_module_version": "1.2.0",
|
1545 |
+
"_view_name": "LayoutView",
|
1546 |
+
"align_content": null,
|
1547 |
+
"align_items": null,
|
1548 |
+
"align_self": null,
|
1549 |
+
"border": null,
|
1550 |
+
"bottom": null,
|
1551 |
+
"display": null,
|
1552 |
+
"flex": null,
|
1553 |
+
"flex_flow": null,
|
1554 |
+
"grid_area": null,
|
1555 |
+
"grid_auto_columns": null,
|
1556 |
+
"grid_auto_flow": null,
|
1557 |
+
"grid_auto_rows": null,
|
1558 |
+
"grid_column": null,
|
1559 |
+
"grid_gap": null,
|
1560 |
+
"grid_row": null,
|
1561 |
+
"grid_template_areas": null,
|
1562 |
+
"grid_template_columns": null,
|
1563 |
+
"grid_template_rows": null,
|
1564 |
+
"height": null,
|
1565 |
+
"justify_content": null,
|
1566 |
+
"justify_items": null,
|
1567 |
+
"left": null,
|
1568 |
+
"margin": null,
|
1569 |
+
"max_height": null,
|
1570 |
+
"max_width": null,
|
1571 |
+
"min_height": null,
|
1572 |
+
"min_width": null,
|
1573 |
+
"object_fit": null,
|
1574 |
+
"object_position": null,
|
1575 |
+
"order": null,
|
1576 |
+
"overflow": null,
|
1577 |
+
"overflow_x": null,
|
1578 |
+
"overflow_y": null,
|
1579 |
+
"padding": null,
|
1580 |
+
"right": null,
|
1581 |
+
"top": null,
|
1582 |
+
"visibility": null,
|
1583 |
+
"width": null
|
1584 |
+
}
|
1585 |
+
},
|
1586 |
+
"930b4d1d5f4b494b830df4d4c398e67c": {
|
1587 |
+
"model_module": "@jupyter-widgets/controls",
|
1588 |
+
"model_module_version": "1.5.0",
|
1589 |
+
"model_name": "DescriptionStyleModel",
|
1590 |
+
"state": {
|
1591 |
+
"_model_module": "@jupyter-widgets/controls",
|
1592 |
+
"_model_module_version": "1.5.0",
|
1593 |
+
"_model_name": "DescriptionStyleModel",
|
1594 |
+
"_view_count": null,
|
1595 |
+
"_view_module": "@jupyter-widgets/base",
|
1596 |
+
"_view_module_version": "1.2.0",
|
1597 |
+
"_view_name": "StyleView",
|
1598 |
+
"description_width": ""
|
1599 |
+
}
|
1600 |
+
},
|
1601 |
+
"a36b5cf197dd4bd9a7f70aa6671b804c": {
|
1602 |
+
"model_module": "@jupyter-widgets/controls",
|
1603 |
+
"model_module_version": "1.5.0",
|
1604 |
+
"model_name": "DescriptionStyleModel",
|
1605 |
+
"state": {
|
1606 |
+
"_model_module": "@jupyter-widgets/controls",
|
1607 |
+
"_model_module_version": "1.5.0",
|
1608 |
+
"_model_name": "DescriptionStyleModel",
|
1609 |
+
"_view_count": null,
|
1610 |
+
"_view_module": "@jupyter-widgets/base",
|
1611 |
+
"_view_module_version": "1.2.0",
|
1612 |
+
"_view_name": "StyleView",
|
1613 |
+
"description_width": ""
|
1614 |
+
}
|
1615 |
+
},
|
1616 |
+
"a3b0c0581f1f4c428baaadd8e9a39b6f": {
|
1617 |
+
"model_module": "@jupyter-widgets/controls",
|
1618 |
+
"model_module_version": "1.5.0",
|
1619 |
+
"model_name": "HBoxModel",
|
1620 |
+
"state": {
|
1621 |
+
"_dom_classes": [],
|
1622 |
+
"_model_module": "@jupyter-widgets/controls",
|
1623 |
+
"_model_module_version": "1.5.0",
|
1624 |
+
"_model_name": "HBoxModel",
|
1625 |
+
"_view_count": null,
|
1626 |
+
"_view_module": "@jupyter-widgets/controls",
|
1627 |
+
"_view_module_version": "1.5.0",
|
1628 |
+
"_view_name": "HBoxView",
|
1629 |
+
"box_style": "",
|
1630 |
+
"children": [
|
1631 |
+
"IPY_MODEL_2315228ff2b141afabe1263471f5364b",
|
1632 |
+
"IPY_MODEL_0474debc340943bd85f3daf92aebf7aa",
|
1633 |
+
"IPY_MODEL_cff1b0fa2ea24f45aab26685353eefdd"
|
1634 |
+
],
|
1635 |
+
"layout": "IPY_MODEL_b7e20be79df246f19b35114a690e44f0"
|
1636 |
+
}
|
1637 |
+
},
|
1638 |
+
"b7e20be79df246f19b35114a690e44f0": {
|
1639 |
+
"model_module": "@jupyter-widgets/base",
|
1640 |
+
"model_module_version": "1.2.0",
|
1641 |
+
"model_name": "LayoutModel",
|
1642 |
+
"state": {
|
1643 |
+
"_model_module": "@jupyter-widgets/base",
|
1644 |
+
"_model_module_version": "1.2.0",
|
1645 |
+
"_model_name": "LayoutModel",
|
1646 |
+
"_view_count": null,
|
1647 |
+
"_view_module": "@jupyter-widgets/base",
|
1648 |
+
"_view_module_version": "1.2.0",
|
1649 |
+
"_view_name": "LayoutView",
|
1650 |
+
"align_content": null,
|
1651 |
+
"align_items": null,
|
1652 |
+
"align_self": null,
|
1653 |
+
"border": null,
|
1654 |
+
"bottom": null,
|
1655 |
+
"display": null,
|
1656 |
+
"flex": null,
|
1657 |
+
"flex_flow": null,
|
1658 |
+
"grid_area": null,
|
1659 |
+
"grid_auto_columns": null,
|
1660 |
+
"grid_auto_flow": null,
|
1661 |
+
"grid_auto_rows": null,
|
1662 |
+
"grid_column": null,
|
1663 |
+
"grid_gap": null,
|
1664 |
+
"grid_row": null,
|
1665 |
+
"grid_template_areas": null,
|
1666 |
+
"grid_template_columns": null,
|
1667 |
+
"grid_template_rows": null,
|
1668 |
+
"height": null,
|
1669 |
+
"justify_content": null,
|
1670 |
+
"justify_items": null,
|
1671 |
+
"left": null,
|
1672 |
+
"margin": null,
|
1673 |
+
"max_height": null,
|
1674 |
+
"max_width": null,
|
1675 |
+
"min_height": null,
|
1676 |
+
"min_width": null,
|
1677 |
+
"object_fit": null,
|
1678 |
+
"object_position": null,
|
1679 |
+
"order": null,
|
1680 |
+
"overflow": null,
|
1681 |
+
"overflow_x": null,
|
1682 |
+
"overflow_y": null,
|
1683 |
+
"padding": null,
|
1684 |
+
"right": null,
|
1685 |
+
"top": null,
|
1686 |
+
"visibility": null,
|
1687 |
+
"width": null
|
1688 |
+
}
|
1689 |
+
},
|
1690 |
+
"cff1b0fa2ea24f45aab26685353eefdd": {
|
1691 |
+
"model_module": "@jupyter-widgets/controls",
|
1692 |
+
"model_module_version": "1.5.0",
|
1693 |
+
"model_name": "HTMLModel",
|
1694 |
+
"state": {
|
1695 |
+
"_dom_classes": [],
|
1696 |
+
"_model_module": "@jupyter-widgets/controls",
|
1697 |
+
"_model_module_version": "1.5.0",
|
1698 |
+
"_model_name": "HTMLModel",
|
1699 |
+
"_view_count": null,
|
1700 |
+
"_view_module": "@jupyter-widgets/controls",
|
1701 |
+
"_view_module_version": "1.5.0",
|
1702 |
+
"_view_name": "HTMLView",
|
1703 |
+
"description": "",
|
1704 |
+
"description_tooltip": null,
|
1705 |
+
"layout": "IPY_MODEL_33fbacbb2aa146cd90586357eec1dc3e",
|
1706 |
+
"placeholder": "",
|
1707 |
+
"style": "IPY_MODEL_930b4d1d5f4b494b830df4d4c398e67c",
|
1708 |
+
"value": " 354/401 [03:01<00:22, 2.11 examples/s]"
|
1709 |
+
}
|
1710 |
+
},
|
1711 |
+
"e58b5ad2f781475d8af2ddb38009baa6": {
|
1712 |
+
"model_module": "@jupyter-widgets/controls",
|
1713 |
+
"model_module_version": "1.5.0",
|
1714 |
+
"model_name": "ProgressStyleModel",
|
1715 |
+
"state": {
|
1716 |
+
"_model_module": "@jupyter-widgets/controls",
|
1717 |
+
"_model_module_version": "1.5.0",
|
1718 |
+
"_model_name": "ProgressStyleModel",
|
1719 |
+
"_view_count": null,
|
1720 |
+
"_view_module": "@jupyter-widgets/base",
|
1721 |
+
"_view_module_version": "1.2.0",
|
1722 |
+
"_view_name": "StyleView",
|
1723 |
+
"bar_color": null,
|
1724 |
+
"description_width": ""
|
1725 |
+
}
|
1726 |
+
}
|
1727 |
+
}
|
1728 |
+
}
|
1729 |
+
},
|
1730 |
+
"nbformat": 4,
|
1731 |
+
"nbformat_minor": 4
|
1732 |
+
}
|
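A quick reference for the prompt layout the notebook above relies on: `formatting_audio_func` builds the full training string (text followed by the BiCodec global and semantic audio tokens), while `generate_speech_from_text` feeds only the prefix up to `<|start_global_token|>` and lets the fine-tuned LLM continue with the audio tokens. A minimal sketch of that layout (the special-token names come from the notebook; the placeholder variables are illustrative only):

```python
# Illustrative placeholders; in the notebook these come from BiCodecTokenizer.
text = "Hello world."
global_tokens = "<|bicodec_global_0|><|bicodec_global_1|>"        # speaker/global codes
semantic_tokens = "<|bicodec_semantic_0|><|bicodec_semantic_1|>"  # content codes

# Training example (formatting_audio_func): the LLM learns to emit the audio tokens.
train_text = (
    "<|task_tts|><|start_content|>" + text + "<|end_content|>"
    "<|start_global_token|>" + global_tokens + "<|end_global_token|>"
    "<|start_semantic_token|>" + semantic_tokens + "<|end_semantic_token|>"
    "<|im_end|>"
)

# Inference prompt (generate_speech_from_text): stop after <|start_global_token|> so the
# model generates global + semantic tokens, which are then parsed and detokenized to audio.
infer_prompt = "<|task_tts|><|start_content|>" + text + "<|end_content|><|start_global_token|>"
```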
config.yaml
ADDED
@@ -0,0 +1,7 @@
|
1 |
+
highpass_cutoff_freq: 40
|
2 |
+
sample_rate: 16000
|
3 |
+
segment_duration: 2.4 # (s)
|
4 |
+
max_val_duration: 12 # (s)
|
5 |
+
latent_hop_length: 320
|
6 |
+
ref_segment_duration: 6
|
7 |
+
volume_normalize: true
|
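These values are presumably the audio front-end settings read by `BiCodecTokenizer` in the notebook above (`audio_tokenizer.config['sample_rate']`, `volume_normalize`, and the reference clip from `get_ref_clip`). A minimal sketch of how they fit together, assuming `latent_hop_length` is a hop size in samples:

```python
# Hypothetical usage sketch; the file name matches the config.yaml added above.
import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

# Reference clip length in samples: 6 s * 16000 Hz = 96000 samples.
ref_samples = int(cfg["ref_segment_duration"] * cfg["sample_rate"])
# Latent frame rate if latent_hop_length is a hop in samples: 16000 / 320 = 50 frames/s.
latent_frames_per_second = cfg["sample_rate"] / cfg["latent_hop_length"]
print(ref_samples, latent_frames_per_second)
```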
src/figures/gradio_TTS.png
ADDED (image)
src/figures/gradio_control.png
ADDED (image)
src/figures/infer_control.png
ADDED (image, Git LFS)
src/figures/infer_voice_cloning.png
ADDED (image, Git LFS)
src/logo/HKUST.jpg
ADDED (image, Git LFS)
src/logo/NPU.jpg
ADDED (image, Git LFS)
src/logo/NTU.jpg
ADDED (image)
src/logo/SJU.jpg
ADDED (image, Git LFS)
src/logo/SparkAudio.jpg
ADDED (image)
src/logo/SparkAudio2.jpg
ADDED (image)
src/logo/SparkTTS.jpg
ADDED (image)
src/logo/SparkTTS.png
ADDED (image, Git LFS)
src/logo/mobvoi.jpg
ADDED (image, Git LFS)
src/logo/mobvoi.png
ADDED (image, Git LFS)
wav2vec2-large-xlsr-53/README.md
ADDED
@@ -0,0 +1,29 @@
|
1 |
+
---
|
2 |
+
language: multilingual
|
3 |
+
datasets:
|
4 |
+
- common_voice
|
5 |
+
tags:
|
6 |
+
- speech
|
7 |
+
license: apache-2.0
|
8 |
+
---
|
9 |
+
|
10 |
+
# Wav2Vec2-XLSR-53
|
11 |
+
|
12 |
+
[Facebook's XLSR-Wav2Vec2](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/)
|
13 |
+
|
14 |
+
The base model pretrained on 16kHz sampled speech audio. When using the model make sure that your speech input is also sampled at 16Khz. Note that this model should be fine-tuned on a downstream task, like Automatic Speech Recognition. Check out [this blog](https://huggingface.co/blog/fine-tune-wav2vec2-english) for more information.
|
15 |
+
|
16 |
+
[Paper](https://arxiv.org/abs/2006.13979)
|
17 |
+
|
18 |
+
Authors: Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli
|
19 |
+
|
20 |
+
**Abstract**
|
21 |
+
This paper presents XLSR which learns cross-lingual speech representations by pretraining a single model from the raw waveform of speech in multiple languages. We build on wav2vec 2.0 which is trained by solving a contrastive task over masked latent speech representations and jointly learns a quantization of the latents shared across languages. The resulting model is fine-tuned on labeled data and experiments show that cross-lingual pretraining significantly outperforms monolingual pretraining. On the CommonVoice benchmark, XLSR shows a relative phoneme error rate reduction of 72% compared to the best known results. On BABEL, our approach improves word error rate by 16% relative compared to a comparable system. Our approach enables a single multilingual speech recognition model which is competitive to strong individual models. Analysis shows that the latent discrete speech representations are shared across languages with increased sharing for related languages. We hope to catalyze research in low-resource speech understanding by releasing XLSR-53, a large model pretrained in 53 languages.
|
22 |
+
|
23 |
+
The original model can be found under https://github.com/pytorch/fairseq/tree/master/examples/wav2vec#wav2vec-20.
|
24 |
+
|
25 |
+
# Usage
|
26 |
+
|
27 |
+
See [this notebook](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_Tune_XLSR_Wav2Vec2_on_Turkish_ASR_with_%F0%9F%A4%97_Transformers.ipynb) for more information on how to fine-tune the model.
|
28 |
+
|
29 |
+

|
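In the fine-tuning notebook above, this checkpoint is used as a frozen feature extractor rather than for ASR: the hidden states of layers 11, 14 and 16 are averaged to form the features fed to BiCodec. A minimal sketch of that usage with `transformers`, assuming the local folder name used in this repo (loading `Wav2Vec2Model` from a `Wav2Vec2ForPreTraining` checkpoint will warn about unused pre-training heads, which is expected here):

```python
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

processor = Wav2Vec2FeatureExtractor.from_pretrained("wav2vec2-large-xlsr-53")
model = Wav2Vec2Model.from_pretrained("wav2vec2-large-xlsr-53", output_hidden_states=True)
model.eval()

wav = torch.zeros(16000)  # one second of silence at 16 kHz as a placeholder input
inputs = processor(wav.numpy(), sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    out = model(inputs.input_values)

# Average of hidden states 11, 14 and 16, as in the notebook's extract_wav2vec2_features.
feats = (out.hidden_states[11] + out.hidden_states[14] + out.hidden_states[16]) / 3
print(feats.shape)  # (1, num_frames, 1024)
```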
wav2vec2-large-xlsr-53/config.json
ADDED
@@ -0,0 +1,83 @@
|
1 |
+
{
|
2 |
+
"activation_dropout": 0.0,
|
3 |
+
"apply_spec_augment": true,
|
4 |
+
"architectures": [
|
5 |
+
"Wav2Vec2ForPreTraining"
|
6 |
+
],
|
7 |
+
"attention_dropout": 0.1,
|
8 |
+
"bos_token_id": 1,
|
9 |
+
"codevector_dim": 768,
|
10 |
+
"contrastive_logits_temperature": 0.1,
|
11 |
+
"conv_bias": true,
|
12 |
+
"conv_dim": [
|
13 |
+
512,
|
14 |
+
512,
|
15 |
+
512,
|
16 |
+
512,
|
17 |
+
512,
|
18 |
+
512,
|
19 |
+
512
|
20 |
+
],
|
21 |
+
"conv_kernel": [
|
22 |
+
10,
|
23 |
+
3,
|
24 |
+
3,
|
25 |
+
3,
|
26 |
+
3,
|
27 |
+
2,
|
28 |
+
2
|
29 |
+
],
|
30 |
+
"conv_stride": [
|
31 |
+
5,
|
32 |
+
2,
|
33 |
+
2,
|
34 |
+
2,
|
35 |
+
2,
|
36 |
+
2,
|
37 |
+
2
|
38 |
+
],
|
39 |
+
"ctc_loss_reduction": "sum",
|
40 |
+
"ctc_zero_infinity": false,
|
41 |
+
"diversity_loss_weight": 0.1,
|
42 |
+
"do_stable_layer_norm": true,
|
43 |
+
"eos_token_id": 2,
|
44 |
+
"feat_extract_activation": "gelu",
|
45 |
+
"feat_extract_dropout": 0.0,
|
46 |
+
"feat_extract_norm": "layer",
|
47 |
+
"feat_proj_dropout": 0.1,
|
48 |
+
"feat_quantizer_dropout": 0.0,
|
49 |
+
"final_dropout": 0.0,
|
50 |
+
"gradient_checkpointing": false,
|
51 |
+
"hidden_act": "gelu",
|
52 |
+
"hidden_dropout": 0.1,
|
53 |
+
"hidden_size": 1024,
|
54 |
+
"initializer_range": 0.02,
|
55 |
+
"intermediate_size": 4096,
|
56 |
+
"layer_norm_eps": 1e-05,
|
57 |
+
"layerdrop": 0.1,
|
58 |
+
"mask_channel_length": 10,
|
59 |
+
"mask_channel_min_space": 1,
|
60 |
+
"mask_channel_other": 0.0,
|
61 |
+
"mask_channel_prob": 0.0,
|
62 |
+
"mask_channel_selection": "static",
|
63 |
+
"mask_feature_length": 10,
|
64 |
+
"mask_feature_prob": 0.0,
|
65 |
+
"mask_time_length": 10,
|
66 |
+
"mask_time_min_space": 1,
|
67 |
+
"mask_time_other": 0.0,
|
68 |
+
"mask_time_prob": 0.075,
|
69 |
+
"mask_time_selection": "static",
|
70 |
+
"model_type": "wav2vec2",
|
71 |
+
"num_attention_heads": 16,
|
72 |
+
"num_codevector_groups": 2,
|
73 |
+
"num_codevectors_per_group": 320,
|
74 |
+
"num_conv_pos_embedding_groups": 16,
|
75 |
+
"num_conv_pos_embeddings": 128,
|
76 |
+
"num_feat_extract_layers": 7,
|
77 |
+
"num_hidden_layers": 24,
|
78 |
+
"num_negatives": 100,
|
79 |
+
"pad_token_id": 0,
|
80 |
+
"proj_codevector_dim": 768,
|
81 |
+
"transformers_version": "4.7.0.dev0",
|
82 |
+
"vocab_size": 32
|
83 |
+
}
|
wav2vec2-large-xlsr-53/preprocessor_config.json
ADDED
@@ -0,0 +1,9 @@
|
1 |
+
{
|
2 |
+
"do_normalize": true,
|
3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
4 |
+
"feature_size": 1,
|
5 |
+
"padding_side": "right",
|
6 |
+
"padding_value": 0,
|
7 |
+
"return_attention_mask": true,
|
8 |
+
"sampling_rate": 16000
|
9 |
+
}
|
wav2vec2-large-xlsr-53/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:314340227371a608f71adcd5f0de5933824fe77e55822aa4b24dba9c1c364dcb
|
3 |
+
size 1269737156
|
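The three lines above are a Git LFS pointer: the actual weights are stored by object id (sha256) and size. An optional check, assuming the real `pytorch_model.bin` has been pulled via LFS, is to hash the file and compare it to the recorded oid:

```python
# Hypothetical integrity check for the LFS-tracked weight file.
import hashlib

h = hashlib.sha256()
with open("wav2vec2-large-xlsr-53/pytorch_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

expected = "314340227371a608f71adcd5f0de5933824fe77e55822aa4b24dba9c1c364dcb"
print(h.hexdigest() == expected)
```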