{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
    "execution": {
     "iopub.execute_input": "2025-06-15T14:21:25.974502Z",
     "iopub.status.busy": "2025-06-15T14:21:25.974227Z",
     "iopub.status.idle": "2025-06-15T14:21:31.475226Z",
     "shell.execute_reply": "2025-06-15T14:21:31.474663Z",
     "shell.execute_reply.started": "2025-06-15T14:21:25.974478Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Use the %pip magic so the package is installed into the environment of the\n",
    "# running kernel and pip's output (including errors) appears in the cell.\n",
    "%pip install -q wget"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T14:21:31.476734Z",
     "iopub.status.busy": "2025-06-15T14:21:31.476449Z",
     "iopub.status.idle": "2025-06-15T14:21:37.092039Z",
     "shell.execute_reply": "2025-06-15T14:21:37.091491Z",
     "shell.execute_reply.started": "2025-06-15T14:21:31.476715Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import wget\n",
    "import tarfile\n",
    "import torchaudio\n",
    "import pandas as pd\n",
    "from huggingface_hub import snapshot_download, login\n",
    "# Never hardcode the access token in the notebook: read it from the HF_TOKEN\n",
    "# environment variable. When the variable is unset, token=None makes `login`\n",
    "# ask for the token interactively instead.\n",
    "login(token=os.environ.get(\"HF_TOKEN\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T14:21:37.092984Z",
     "iopub.status.busy": "2025-06-15T14:21:37.092705Z",
     "iopub.status.idle": "2025-06-15T14:21:37.096562Z",
     "shell.execute_reply": "2025-06-15T14:21:37.096039Z",
     "shell.execute_reply.started": "2025-06-15T14:21:37.092967Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Run the following cells from /content. The directory is created first because\n",
    "# os.chdir raises FileNotFoundError when the target does not exist.\n",
    "os.makedirs(\"/content\", exist_ok=True)\n",
    "os.chdir(\"/content\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T13:59:06.772020Z",
     "iopub.status.busy": "2025-06-15T13:59:06.771694Z",
     "iopub.status.idle": "2025-06-15T14:00:28.043176Z",
     "shell.execute_reply": "2025-06-15T14:00:28.041603Z",
     "shell.execute_reply.started": "2025-06-15T13:59:06.771995Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Fetch a snapshot of the temporary model repository from the Hugging Face Hub\n",
    "# into the current working directory.\n",
    "from huggingface_hub import HfApi\n",
    "from huggingface_hub import snapshot_download\n",
    "import os\n",
    "# NOTE(review): `api` is not used in this cell - confirm whether it is still needed.\n",
    "api = HfApi()\n",
    "!git lfs install --force\n",
    "\n",
    "# Model repository to fetch and the local directory that receives it\n",
    "\n",
    "repo_id = \"heboya8/t5-tts-temp-model\"\n",
    "save_path = \".\"\n",
    "\n",
    "# Create the target directory if it doesn't exist (a no-op for \".\")\n",
    "os.makedirs(save_path, exist_ok=True)\n",
    "\n",
    "# Download the model repository snapshot (repo_type=\"model\")\n",
    "snapshot_download(repo_id=repo_id, repo_type=\"model\", local_dir=save_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T14:21:37.389642Z",
     "iopub.status.busy": "2025-06-15T14:21:37.389399Z",
     "iopub.status.idle": "2025-06-15T14:24:47.468892Z",
     "shell.execute_reply": "2025-06-15T14:24:47.468139Z",
     "shell.execute_reply.started": "2025-06-15T14:21:37.389623Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Step 1: Set up the environment.\n",
    "# Every install below discards its output and its exit status is not checked.\n",
    "# NOTE(review): this first editable install runs before F5-TTS is cloned or\n",
    "# entered, so \".\" is still the previous working directory - confirm it is needed.\n",
    "os.system(\"pip install -e . >/dev/null 2>&1\")\n",
    "# Pinned +cu124 builds of torch / torchaudio / torchvision, with the PyTorch\n",
    "# wheel index added as an extra index.\n",
    "os.system(\"pip install torch==2.4.0+cu124 torchaudio==2.4.0+cu124 torchvision==0.19.0+cu124 --extra-index-url https://download.pytorch.org/whl/cu124 >/dev/null 2>&1\")\n",
    "os.system(\"pip install accelerate==0.33.0 tensorboard >/dev/null 2>&1\")\n",
    "# Clone F5-TTS only when no F5-TTS entry exists in the current directory.\n",
    "if not os.path.exists(\"F5-TTS\"):\n",
    "    os.system(\"git clone https://github.com/SWivid/F5-TTS.git\")\n",
    "# NOTE(review): relative path - re-running this cell after the working directory\n",
    "# has already moved into F5-TTS would clone and enter a nested copy.\n",
    "os.chdir(\"F5-TTS\")\n",
    "# Editable install of the package found in the directory just entered.\n",
    "os.system(\"pip install -e . >/dev/null 2>&1\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T14:24:47.470454Z",
     "iopub.status.busy": "2025-06-15T14:24:47.470177Z",
     "iopub.status.idle": "2025-06-15T14:24:47.473922Z",
     "shell.execute_reply": "2025-06-15T14:24:47.473261Z",
     "shell.execute_reply.started": "2025-06-15T14:24:47.470429Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Switch to the F5-TTS directory by absolute path, independent of where the\n",
    "# previous cells left the working directory.\n",
    "os.chdir(\"/content/F5-TTS\")\n",
    "    # os.chdir(\"F5-TTS-Vietnamese\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T06:47:34.909957Z",
     "iopub.status.busy": "2025-06-15T06:47:34.909372Z",
     "iopub.status.idle": "2025-06-15T06:47:35.040348Z",
     "shell.execute_reply": "2025-06-15T06:47:35.039424Z",
     "shell.execute_reply.started": "2025-06-15T06:47:34.909927Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Print the current working directory as a sanity check.\n",
    "!pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T14:24:47.475053Z",
     "iopub.status.busy": "2025-06-15T14:24:47.474827Z",
     "iopub.status.idle": "2025-06-15T14:24:47.644337Z",
     "shell.execute_reply": "2025-06-15T14:24:47.643562Z",
     "shell.execute_reply.started": "2025-06-15T14:24:47.475031Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Create the checkpoint directory. `-p` keeps the cell re-runnable (no error when\n",
    "# the directory already exists) and creates ./ckpts as well if it is missing.\n",
    "!mkdir -p ./ckpts/vin100h-preprocessed-v2\n",
    "# !cp /kaggle/input/vi-fine-tuned-t5-tts/69/model_last.pt \\\n",
    "# ./ckpts/vin100h-preprocessed-v2\n",
    "# !cp -r /content/73/* ./ckpts/vin100h-preprocessed-v2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T14:24:47.646473Z",
     "iopub.status.busy": "2025-06-15T14:24:47.646278Z",
     "iopub.status.idle": "2025-06-15T14:25:20.275283Z",
     "shell.execute_reply": "2025-06-15T14:25:20.274453Z",
     "shell.execute_reply.started": "2025-06-15T14:24:47.646454Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Copy model_last.pt from the attached Kaggle input (folder 75) into the\n",
    "# checkpoint directory; the training cell passes this file via --pretrain.\n",
    "# !cp -r /kaggle/input/vi-fine-tuned-t5-tts/7/* ./ckpts\n",
    "!cp -r /kaggle/input/vi-fine-tuned-t5-tts/75/model_last.pt \\\n",
    "    ./ckpts/vin100h-preprocessed-v2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T14:25:20.276407Z",
     "iopub.status.busy": "2025-06-15T14:25:20.276159Z",
     "iopub.status.idle": "2025-06-15T14:25:20.413414Z",
     "shell.execute_reply": "2025-06-15T14:25:20.412180Z",
     "shell.execute_reply.started": "2025-06-15T14:25:20.276382Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# List the checkpoint directory (including hidden entries) to confirm the copy.\n",
    "!ls -a ./ckpts/vin100h-preprocessed-v2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-05-10T15:59:08.329794Z",
     "iopub.status.busy": "2025-05-10T15:59:08.329442Z",
     "iopub.status.idle": "2025-05-10T15:59:09.362207Z",
     "shell.execute_reply": "2025-05-10T15:59:09.361253Z",
     "shell.execute_reply.started": "2025-05-10T15:59:08.329757Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from pathlib import Path\n",
    "import shutil\n",
    "import torchaudio\n",
    "from datasets import load_dataset\n",
    "from datasets.arrow_writer import ArrowWriter\n",
    "from tqdm import tqdm\n",
    "import soundfile as sf\n",
    "import csv\n",
    "\n",
    "def save_dataset_to_local_disk(output_dir=\"./data/vin100h-preprocessed-v2\",\n",
    "                               base_model=\"htdung167/vin100h-preprocessed-v2\",\n",
    "                               audio_header='audio',\n",
    "                               text_header='transcription'):\n",
    "    \"\"\"Export the train split of a Hugging Face dataset as WAV files plus a metadata CSV.\n",
    "\n",
    "    Each sample's audio is written to ``<output_dir>/wavs/audio_<idx>.wav`` and a\n",
    "    pipe-delimited ``<output_dir>/metadata.csv`` maps every relative WAV path to\n",
    "    its text.\n",
    "\n",
    "    Args:\n",
    "        output_dir: Directory that receives ``wavs/`` and ``metadata.csv``.\n",
    "        base_model: Identifier passed to ``load_dataset`` (a dataset id, despite\n",
    "            the parameter name).\n",
    "        audio_header: Column whose value provides the ``array`` and\n",
    "            ``sampling_rate`` entries written to each WAV file.\n",
    "        text_header: Column whose value is written next to each file path.\n",
    "    \"\"\"\n",
    "    wavs_dir = os.path.join(output_dir, \"wavs\")\n",
    "    metadata_path = os.path.join(output_dir, \"metadata.csv\")\n",
    "    os.makedirs(wavs_dir, exist_ok=True)\n",
    "\n",
    "    ds = load_dataset(base_model)['train']\n",
    "    # Start with a header row: F5-TTS's prepare_csv_wavs.py skips the first line\n",
    "    # of metadata.csv, so without it the first sample would be silently dropped.\n",
    "    metadata = [[\"audio_file\", \"text\"]]\n",
    "\n",
    "    for idx, sample in tqdm(enumerate(ds), total=len(ds),\n",
    "                            desc=\"Saving samples to directory\"):\n",
    "        audio = sample[audio_header]\n",
    "        filename = f\"audio_{idx:06d}.wav\"\n",
    "        sf.write(os.path.join(wavs_dir, filename), audio['array'], audio['sampling_rate'])\n",
    "        # Paths are stored relative to output_dir, next to metadata.csv.\n",
    "        metadata.append([f\"wavs/{filename}\", sample[text_header]])\n",
    "\n",
    "    with open(metadata_path, 'w', newline='', encoding='utf-8') as f:\n",
    "        csv.writer(f, delimiter='|').writerows(metadata)\n",
    "\n",
    "    print(f\"Dataset saved to {output_dir}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-05-10T15:59:10.399030Z",
     "iopub.status.busy": "2025-05-10T15:59:10.397916Z",
     "iopub.status.idle": "2025-05-10T16:10:46.269067Z",
     "shell.execute_reply": "2025-05-10T16:10:46.267298Z",
     "shell.execute_reply.started": "2025-05-10T15:59:10.398995Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Export the dataset to ./data, using the `preprocessed_sentence_v2` column as text.\n",
    "output_dir = \"./data/vin100h-preprocessed-v2\"\n",
    "# NOTE(review): tokenizer_type is not read anywhere in this cell - confirm it is still needed.\n",
    "tokenizer_type = \"pinyin\"\n",
    "\n",
    "save_dataset_to_local_disk(output_dir=output_dir,\n",
    "                           base_model=\"htdung167/vin100h-preprocessed-v2\",\n",
    "                           text_header=\"preprocessed_sentence_v2\"\n",
    "                          )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_kg_hide-output": true,
    "execution": {
     "iopub.execute_input": "2025-05-10T16:10:46.273403Z",
     "iopub.status.busy": "2025-05-10T16:10:46.272176Z",
     "iopub.status.idle": "2025-05-10T17:15:19.405258Z",
     "shell.execute_reply": "2025-05-10T17:15:19.402002Z",
     "shell.execute_reply.started": "2025-05-10T16:10:46.273366Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Run F5-TTS's prepare_csv_wavs.py on the exported dataset.\n",
    "# Presumably the first path is the input directory and the second the output\n",
    "# directory - verify against the script's --help.\n",
    "!python ./src/f5_tts/train/datasets/prepare_csv_wavs.py \\\n",
    "    \"./data/vin100h-preprocessed-v2\" \\\n",
    "    \"./data/vin100h-preprocessed-v2_pinyin\" \\\n",
    "    --workers 4 # Sets the number of parallel processes for preprocessing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T14:25:20.414900Z",
     "iopub.status.busy": "2025-06-15T14:25:20.414621Z",
     "iopub.status.idle": "2025-06-15T14:25:21.649820Z",
     "shell.execute_reply": "2025-06-15T14:25:21.648942Z",
     "shell.execute_reply.started": "2025-06-15T14:25:20.414873Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "%%writefile ./src/f5_tts/configs/vi-fine-tuned-t5-tts.yaml\n",
    "hydra:\n",
    "  run:\n",
    "    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}\n",
    "\n",
    "datasets:\n",
    "  name: vin100h-preprocessed-v2  # dataset name\n",
    "  batch_size_per_gpu: 3200  # 1 GPU, 1 * 3200 = 3200\n",
    "  batch_size_type: frame  # frame | sample\n",
    "  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models\n",
    "  num_workers: 4\n",
    "\n",
    "optim:\n",
    "  epochs: 10\n",
    "  learning_rate: 1e-5\n",
    "  num_warmup_updates: 2761  # warmup updates\n",
    "  grad_accumulation_steps: 2  # note: updates = steps / grad_accumulation_steps\n",
    "  max_grad_norm: 1.0  # gradient clipping\n",
    "  bnb_optimizer: False  # use bnb 8bit AdamW optimizer or not\n",
    "\n",
    "model:\n",
    "  name: vi_fine_tuned_t5_tts  # model name\n",
    "  tokenizer: pinyin  # tokenizer type\n",
    "  tokenizer_path: null  # if tokenizer is 'custom', set the path of the vocab file to use (should be vocab.txt)\n",
    "  backbone: DiT\n",
    "  arch:\n",
    "    dim: 1024\n",
    "    depth: 22\n",
    "    heads: 16\n",
    "    ff_mult: 2\n",
    "    text_dim: 512\n",
    "    text_mask_padding: False\n",
    "    conv_layers: 4\n",
    "    pe_attn_head: 1\n",
    "    checkpoint_activations: False  # recompute activations and save memory for extra compute\n",
    "  mel_spec:\n",
    "    target_sample_rate: 24000\n",
    "    n_mel_channels: 100\n",
    "    hop_length: 256\n",
    "    win_length: 1024\n",
    "    n_fft: 1024\n",
    "    mel_spec_type: vocos  # vocos | bigvgan\n",
    "  vocoder:\n",
    "    is_local: False  # use local offline ckpt or not\n",
    "    local_path: null  # local vocoder path\n",
    "\n",
    "ckpts:\n",
    "  logger: null  # wandb | tensorboard | null\n",
    "  log_samples: True  # infer a random sample each time a checkpoint is saved (WIP; may fail on extra-long samples)\n",
    "  save_per_updates: 4000  # save checkpoint per updates\n",
    "  keep_last_n_checkpoints: 1  # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints\n",
    "  last_per_updates: 4000  # save last checkpoint per updates\n",
    "  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T14:25:21.651011Z",
     "iopub.status.busy": "2025-06-15T14:25:21.650749Z",
     "iopub.status.idle": "2025-06-15T14:25:22.958480Z",
     "shell.execute_reply": "2025-06-15T14:25:22.957781Z",
     "shell.execute_reply.started": "2025-06-15T14:25:21.650992Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# NOTE(review): only prints a marker string - looks like a leftover progress check.\n",
    "!echo hello"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T14:25:22.959726Z",
     "iopub.status.busy": "2025-06-15T14:25:22.959476Z",
     "iopub.status.idle": "2025-06-15T14:25:38.131765Z",
     "shell.execute_reply": "2025-06-15T14:25:38.130931Z",
     "shell.execute_reply.started": "2025-06-15T14:25:22.959692Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Write Accelerate's default configuration non-interactively before the\n",
    "# `accelerate launch` training cell is run.\n",
    "!accelerate config default"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T14:28:31.671797Z",
     "iopub.status.busy": "2025-06-15T14:28:31.671483Z",
     "iopub.status.idle": "2025-06-15T14:28:31.803519Z",
     "shell.execute_reply": "2025-06-15T14:28:31.802848Z",
     "shell.execute_reply.started": "2025-06-15T14:28:31.671770Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# NOTE(review): only prints a marker string - looks like a leftover progress check.\n",
    "!echo go"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T14:28:31.804624Z",
     "iopub.status.busy": "2025-06-15T14:28:31.804419Z",
     "iopub.status.idle": "2025-06-15T17:59:02.693078Z",
     "shell.execute_reply": "2025-06-15T17:59:02.692025Z",
     "shell.execute_reply.started": "2025-06-15T14:28:31.804591Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Fine-tune with F5-TTS's finetune_cli.py through `accelerate launch`, starting\n",
    "# from the checkpoint passed via --pretrain. All hyperparameters are given on\n",
    "# the command line.\n",
    "# NOTE(review): the YAML config written earlier in this notebook sets epochs: 10\n",
    "# while this command passes --epochs 76 - confirm which value is intended.\n",
    "!accelerate launch ./src/f5_tts/train/finetune_cli.py \\\n",
    "                    --exp_name F5TTS_Base \\\n",
    "                    --dataset_name vin100h-preprocessed-v2 \\\n",
    "                    --finetune \\\n",
    "                    --tokenizer pinyin \\\n",
    "                    --learning_rate 1e-05 \\\n",
    "                    --batch_size_type frame \\\n",
    "                    --batch_size_per_gpu 3200 \\\n",
    "                    --max_samples 64 \\\n",
    "                    --grad_accumulation_steps 2 \\\n",
    "                    --max_grad_norm 1 \\\n",
    "                    --epochs 76 \\\n",
    "                    --num_warmup_updates 2761 \\\n",
    "                    --save_per_updates 4000 \\\n",
    "                    --keep_last_n_checkpoints 1 \\\n",
    "                    --last_per_updates 4000 \\\n",
    "                    --log_samples \\\n",
    "                    --pretrain ./ckpts/vin100h-preprocessed-v2/model_last.pt\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T18:05:50.705629Z",
     "iopub.status.busy": "2025-06-15T18:05:50.704903Z",
     "iopub.status.idle": "2025-06-15T18:05:50.891227Z",
     "shell.execute_reply": "2025-06-15T18:05:50.890434Z",
     "shell.execute_reply.started": "2025-06-15T18:05:50.705578Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# NOTE(review): only prints a marker string - looks like a leftover progress check.\n",
    "!echo abc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Copy and save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-14T10:18:46.384990Z",
     "iopub.status.busy": "2025-06-14T10:18:46.384685Z",
     "iopub.status.idle": "2025-06-14T10:18:46.518166Z",
     "shell.execute_reply": "2025-06-14T10:18:46.517174Z",
     "shell.execute_reply.started": "2025-06-14T10:18:46.384965Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Recursively delete /kaggle/working/.cache (no error if it does not exist).\n",
    "!rm -rf /kaggle/working/.cache"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-07T16:58:20.250613Z",
     "iopub.status.busy": "2025-06-07T16:58:20.250305Z",
     "iopub.status.idle": "2025-06-07T16:58:20.446725Z",
     "shell.execute_reply": "2025-06-07T16:58:20.445927Z",
     "shell.execute_reply.started": "2025-06-07T16:58:20.250588Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# List the checkpoint directory, including hidden entries.\n",
    "!ls -a ckpts/vin100h-preprocessed-v2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T18:06:00.980687Z",
     "iopub.status.busy": "2025-06-15T18:06:00.979884Z",
     "iopub.status.idle": "2025-06-15T18:06:07.418545Z",
     "shell.execute_reply": "2025-06-15T18:06:07.417240Z",
     "shell.execute_reply.started": "2025-06-15T18:06:00.980649Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Important: copy the latest checkpoint (model_last.pt) into /kaggle/working/76.\n",
    "# `$model_dir` in the shell line is expanded by IPython from the Python variable.\n",
    "model_dir = \"/kaggle/working/76\"\n",
    "os.makedirs(model_dir, exist_ok=True)\n",
    "!cp -r ./ckpts/vin100h-preprocessed-v2/model_last.pt $model_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.status.busy": "2025-06-14T10:34:21.049620Z",
     "iopub.status.idle": "2025-06-14T10:34:21.049856Z",
     "shell.execute_reply": "2025-06-14T10:34:21.049753Z",
     "shell.execute_reply.started": "2025-06-14T10:34:21.049740Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Upload the whole /kaggle/working folder to the temporary model repository\n",
    "# on the Hugging Face Hub.\n",
    "from huggingface_hub import HfApi\n",
    "# NOTE(review): snapshot_download is imported but not used in this cell.\n",
    "from huggingface_hub import snapshot_download\n",
    "# Initialize the API client\n",
    "api = HfApi()\n",
    "\n",
    "# Upload the folder contents to the root of the model repository\n",
    "api.upload_large_folder(\n",
    "    folder_path=\"/kaggle/working\",  # Local folder path\n",
    "    repo_id=\"heboya8/t5-tts-temp-model\",\n",
    "    repo_type=\"model\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prune Checkpoint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-05-11T14:11:57.837831Z",
     "iopub.status.busy": "2025-05-11T14:11:57.837476Z",
     "iopub.status.idle": "2025-05-11T14:11:57.844498Z",
     "shell.execute_reply": "2025-05-11T14:11:57.843701Z",
     "shell.execute_reply.started": "2025-05-11T14:11:57.837803Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "def prune_checkpoint(checkpoint_path: str, new_checkpoint_path: str, save_ema: bool, safetensors: bool) -> str:\n",
    "    \"\"\"Write a copy of a checkpoint that keeps a single state dict.\n",
    "\n",
    "    Args:\n",
    "        checkpoint_path: Checkpoint file to read with ``torch.load``.\n",
    "        new_checkpoint_path: Destination path. A trailing ``.pt`` or\n",
    "            ``.safetensors`` extension is swapped to match ``safetensors``.\n",
    "        save_ema: Keep ``ema_model_state_dict`` when true, otherwise\n",
    "            ``model_state_dict``.\n",
    "        safetensors: When true, save the retained state dict with\n",
    "            ``safetensors.torch.save_file``; otherwise save it with\n",
    "            ``torch.save`` under the ``ema_model_state_dict`` key.\n",
    "\n",
    "    Returns:\n",
    "        A status message. Failures are reported through the message instead\n",
    "        of being raised.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        # Pruning only re-serialises tensors, so load them onto the CPU; this\n",
    "        # also lets a checkpoint saved on a GPU be pruned on a CPU-only machine.\n",
    "        checkpoint = torch.load(checkpoint_path, map_location=\"cpu\", weights_only=True)\n",
    "        print(\"Original Checkpoint Keys:\", checkpoint.keys())\n",
    "\n",
    "        to_retain = \"ema_model_state_dict\" if save_ema else \"model_state_dict\"\n",
    "        if to_retain not in checkpoint:\n",
    "            return f\"{to_retain} not found in the checkpoint.\"\n",
    "        model_state_dict_to_retain = checkpoint[to_retain]\n",
    "\n",
    "        if safetensors:\n",
    "            # Imported here so the torch-only path does not need the package.\n",
    "            from safetensors.torch import save_file\n",
    "\n",
    "            # Swap only a trailing extension, never a match elsewhere in the path.\n",
    "            if new_checkpoint_path.endswith(\".pt\"):\n",
    "                new_checkpoint_path = new_checkpoint_path[: -len(\".pt\")] + \".safetensors\"\n",
    "            save_file(model_state_dict_to_retain, new_checkpoint_path)\n",
    "        else:\n",
    "            if new_checkpoint_path.endswith(\".safetensors\"):\n",
    "                new_checkpoint_path = new_checkpoint_path[: -len(\".safetensors\")] + \".pt\"\n",
    "            # The weights are stored under \"ema_model_state_dict\" whichever state\n",
    "            # dict was retained.\n",
    "            new_checkpoint = {\"ema_model_state_dict\": model_state_dict_to_retain}\n",
    "            torch.save(new_checkpoint, new_checkpoint_path)\n",
    "\n",
    "        return f\"New checkpoint saved at: {new_checkpoint_path}\"\n",
    "\n",
    "    except Exception as e:\n",
    "        return f\"An error occurred: {e}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-05-11T14:22:24.624318Z",
     "iopub.status.busy": "2025-05-11T14:22:24.623974Z",
     "iopub.status.idle": "2025-05-11T14:22:30.316195Z",
     "shell.execute_reply": "2025-05-11T14:22:30.315529Z",
     "shell.execute_reply.started": "2025-05-11T14:22:24.624292Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Prune the fine-tuned checkpoint: retain `model_state_dict` (save_ema=False)\n",
    "# and write the result as a .pt file (safetensors=False).\n",
    "result = prune_checkpoint(\n",
    "    checkpoint_path=\"/kaggle/working/F5-TTS/ckpts/vin100h-preprocessed-v2/model_last.pt\",\n",
    "    new_checkpoint_path=\"/root/.cache/abc.pt\",\n",
    "    save_ema=False,\n",
    "    safetensors=False\n",
    ")\n",
    "# prune_checkpoint reports success or failure through its return message.\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Inference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-05-20T17:08:02.683953Z",
     "iopub.status.busy": "2025-05-20T17:08:02.683595Z",
     "iopub.status.idle": "2025-05-20T17:08:02.753448Z",
     "shell.execute_reply": "2025-05-20T17:08:02.752714Z",
     "shell.execute_reply.started": "2025-05-20T17:08:02.683922Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "from IPython.display import Audio\n",
    "\n",
    "# One clip from the exported dataset's wavs/ directory\n",
    "audio_path = './data/vin100h-preprocessed-v2/wavs/audio_000010.wav'\n",
    "\n",
    "# Last expression of the cell, so the audio player is rendered as its output\n",
    "Audio(audio_path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-14T10:24:03.249295Z",
     "iopub.status.busy": "2025-06-14T10:24:03.248968Z",
     "iopub.status.idle": "2025-06-14T10:24:41.393133Z",
     "shell.execute_reply": "2025-06-14T10:24:41.391987Z",
     "shell.execute_reply.started": "2025-06-14T10:24:03.249273Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Synthesize speech with F5-TTS's infer_cli.py, passing the fine-tuned\n",
    "# checkpoint, the prepared vocab file and a reference clip with its transcript;\n",
    "# output is directed to /kaggle/working/.\n",
    "# NOTE(review): --model_cfg points at F5TTS_Base.yaml, not the\n",
    "# vi-fine-tuned-t5-tts.yaml written earlier in this notebook - confirm.\n",
    "!python ./src/f5_tts/infer/infer_cli.py \\\n",
    "        --model \"vin100h-preprocessed-v2\" \\\n",
    "        --model_cfg \"./src/f5_tts/configs/F5TTS_Base.yaml\" \\\n",
    "        --ckpt_file \"./ckpts/vin100h-preprocessed-v2/model_last.pt\" \\\n",
    "        --vocab_file \"./data/vin100h-preprocessed-v2_pinyin/vocab.txt\" \\\n",
    "        --ref_audio \"./data/vin100h-preprocessed-v2/wavs/audio_000010.wav\" \\\n",
    "        --ref_text \"Về giá cả so với giá bán ngoài các siêu thị thì dâu trái ở đây rẻ hơn khá nhiều. Giả sử như bó rau ở siêu thị bán khoảng 2 đô la một bó thì ở đây chỉ có một đô la một bó. Có khi mua 50 bó được tặng thêm một bó nữa.\" \\\n",
    "        --gen_text \"Về giá cả so với giá bán ngoài các siêu thị\" \\\n",
    "        --output_dir \"/kaggle/working/\"\n",
    "        # --output_file \"/content/abc.wav\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-14T10:24:41.395230Z",
     "iopub.status.busy": "2025-06-14T10:24:41.394917Z",
     "iopub.status.idle": "2025-06-14T10:24:41.404325Z",
     "shell.execute_reply": "2025-06-14T10:24:41.403321Z",
     "shell.execute_reply.started": "2025-06-14T10:24:41.395199Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "from IPython.display import Audio\n",
    "\n",
    "# WAV file expected in the inference output directory\n",
    "# (presumably infer_cli.py's default output name - verify).\n",
    "audio_path = '/kaggle/working/infer_cli_basic.wav'\n",
    "\n",
    "# Last expression of the cell, so the audio player is rendered as its output\n",
    "Audio(audio_path)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T14:25:38.133173Z",
     "iopub.status.busy": "2025-06-15T14:25:38.132898Z",
     "iopub.status.idle": "2025-06-15T14:26:12.006111Z",
     "shell.execute_reply": "2025-06-15T14:26:12.005444Z",
     "shell.execute_reply.started": "2025-06-15T14:25:38.133137Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Fetch a snapshot of the dataset repository from the Hugging Face Hub into /root/.cache.\n",
    "from huggingface_hub import HfApi\n",
    "from huggingface_hub import snapshot_download\n",
    "import os\n",
    "# NOTE(review): `api` is not used in this cell - confirm whether it is still needed.\n",
    "api = HfApi()\n",
    "!git lfs install --force\n",
    "\n",
    "# Dataset repository to fetch and the local directory that receives it\n",
    "repo_id = \"heboya8/f5-tts-dataset\"\n",
    "save_path = \"/root/.cache\"\n",
    "\n",
    "# Create the target directory if it doesn't exist\n",
    "os.makedirs(save_path, exist_ok=True)\n",
    "\n",
    "# Download the dataset repository snapshot (repo_type=\"dataset\")\n",
    "snapshot_download(repo_id=repo_id, repo_type=\"dataset\", local_dir=save_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-06-15T14:26:12.009357Z",
     "iopub.status.busy": "2025-06-15T14:26:12.009122Z",
     "iopub.status.idle": "2025-06-15T14:28:31.670192Z",
     "shell.execute_reply": "2025-06-15T14:28:31.669158Z",
     "shell.execute_reply.started": "2025-06-15T14:26:12.009338Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Extract the downloaded archive into the current directory, quietly (-q) and\n",
    "# overwriting existing files without prompting (-o).\n",
    "!unzip -q -o /root/.cache/data_compress.zip -d \".\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Upload"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-05-10T20:06:26.721683Z",
     "iopub.status.busy": "2025-05-10T20:06:26.720825Z",
     "iopub.status.idle": "2025-05-10T20:11:36.850624Z",
     "shell.execute_reply": "2025-05-10T20:11:36.849599Z",
     "shell.execute_reply.started": "2025-05-10T20:06:26.721632Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Upload the local dataset folder to the dataset repository on the Hugging Face Hub.\n",
    "from huggingface_hub import HfApi\n",
    "# NOTE(review): snapshot_download is imported but not used in this cell.\n",
    "from huggingface_hub import snapshot_download\n",
    "# Initialize the API client\n",
    "api = HfApi()\n",
    "\n",
    "# Upload the folder contents to the root of the dataset repository\n",
    "api.upload_large_folder(\n",
    "    folder_path=\"/root/.cache/dataset\",  # Local folder path\n",
    "    repo_id=\"heboya8/f5-tts-dataset\",\n",
    "    repo_type=\"dataset\",\n",
    "    # multi_commits=True,  # Enable resumable uploads\n",
    "    # multi_commits_verbose=True  # Show progress\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-05-10T20:16:38.191744Z",
     "iopub.status.busy": "2025-05-10T20:16:38.191338Z",
     "iopub.status.idle": "2025-05-10T20:16:56.134770Z",
     "shell.execute_reply": "2025-05-10T20:16:56.133810Z",
     "shell.execute_reply.started": "2025-05-10T20:16:38.191712Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-05-10T20:19:28.100798Z",
     "iopub.status.busy": "2025-05-10T20:19:28.099915Z",
     "iopub.status.idle": "2025-05-10T20:19:28.249902Z",
     "shell.execute_reply": "2025-05-10T20:19:28.248723Z",
     "shell.execute_reply.started": "2025-05-10T20:19:28.100762Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# `-p` keeps the cell re-runnable: no error when the directory already exists.\n",
    "!mkdir -p dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-05-10T20:20:05.322822Z",
     "iopub.status.busy": "2025-05-10T20:20:05.322019Z",
     "iopub.status.idle": "2025-05-10T20:20:05.567705Z",
     "shell.execute_reply": "2025-05-10T20:20:05.566624Z",
     "shell.execute_reply.started": "2025-05-10T20:20:05.322785Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Remove the previously extracted dataset directory; only this path is targeted.\n",
    "!rm -rf /root/.cache/dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-05-10T20:20:07.132689Z",
     "iopub.status.busy": "2025-05-10T20:20:07.132287Z",
     "iopub.status.idle": "2025-05-10T20:22:58.875583Z",
     "shell.execute_reply": "2025-05-10T20:22:58.874368Z",
     "shell.execute_reply.started": "2025-05-10T20:20:07.132656Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Extract the archive into /root/.cache/dataset.\n",
    "# NOTE(review): the archive path contains a literal `~` directory under\n",
    "# /kaggle/working/F5-TTS - confirm this is the intended location.\n",
    "!unzip -q /kaggle/working/F5-TTS/~/.cache/data_compress.zip -d /root/.cache/dataset"
   ]
  }
 ],
 "metadata": {
  "kaggle": {
   "accelerator": "none",
   "dataSources": [
    {
     "sourceId": 245622735,
     "sourceType": "kernelVersion"
    }
   ],
   "dockerImageVersionId": 31012,
   "isGpuEnabled": false,
   "isInternetEnabled": true,
   "language": "python",
   "sourceType": "notebook"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}