{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "execution": { "iopub.execute_input": "2025-06-17T15:01:35.597327Z", "iopub.status.busy": "2025-06-17T15:01:35.596909Z", "iopub.status.idle": "2025-06-17T15:01:41.413712Z", "shell.execute_reply": "2025-06-17T15:01:41.413097Z", "shell.execute_reply.started": "2025-06-17T15:01:35.597299Z" }, "trusted": true }, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "os.system(\"pip install -q wget\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2025-06-17T15:01:41.415249Z", "iopub.status.busy": "2025-06-17T15:01:41.415003Z", "iopub.status.idle": "2025-06-17T15:01:47.137659Z", "shell.execute_reply": "2025-06-17T15:01:47.137095Z", "shell.execute_reply.started": "2025-06-17T15:01:41.415231Z" }, "trusted": true }, "outputs": [], "source": [
"import wget\n",
"import tarfile\n",
"import torchaudio\n",
"import pandas as pd\n",
"from huggingface_hub import snapshot_download, login\n",
"\n",
"# Never hard-code the access token in the notebook: read it from the environment.\n",
"# When HF_TOKEN is unset, token=None makes login() ask for the token interactively.\n",
"login(token=os.environ.get(\"HF_TOKEN\"))" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2025-06-17T15:01:47.138696Z", "iopub.status.busy": "2025-06-17T15:01:47.138320Z", "iopub.status.idle": "2025-06-17T15:01:47.142640Z", "shell.execute_reply": "2025-06-17T15:01:47.141872Z", "shell.execute_reply.started": "2025-06-17T15:01:47.138677Z" }, "trusted": true }, "outputs": [], "source": [
"# Create the working directory before entering it; the download cell below only\n",
"# creates it afterwards, so a fresh kernel would otherwise fail here.\n",
"os.makedirs(\"/content\", exist_ok=True)\n",
"os.chdir(\"/content\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [
"from huggingface_hub import HfApi\n",
"from huggingface_hub import snapshot_download\n",
"import os\n",
"api = HfApi()\n",
"!git lfs install --force\n",
"\n",
"# Define the model repository id and the local download directory\n",
"\n",
"repo_id = \"heboya8/t5-tts-temp-model\"\n",
"save_path = \"/content\"\n",
"\n",
"# Create the directory if it doesn't exist\n",
"os.makedirs(save_path, exist_ok=True)\n",
"\n",
"# Download a snapshot of the model repository into save_path\n",
"snapshot_download(repo_id=repo_id, repo_type=\"model\", local_dir=save_path)" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2025-06-17T01:51:26.479981Z", "iopub.status.busy": "2025-06-17T01:51:26.477420Z", "iopub.status.idle": "2025-06-17T01:51:26.676233Z", "shell.execute_reply": "2025-06-17T01:51:26.674985Z", "shell.execute_reply.started": "2025-06-17T01:51:26.479923Z" }, "trusted": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ". .. 71 73 75 78 .cache .config\t.gitattributes\tsample_data\n" ] } ], "source": [ "!ls -a" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2025-06-17T15:01:47.144207Z", "iopub.status.busy": "2025-06-17T15:01:47.143938Z", "iopub.status.idle": "2025-06-17T15:05:03.276239Z", "shell.execute_reply": "2025-06-17T15:05:03.275559Z", "shell.execute_reply.started": "2025-06-17T15:01:47.144181Z" }, "trusted": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Cloning into 'F5-TTS'...\n" ] }, { "data": { "text/plain": [ "0" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Step 1: Set Up the Environment\n",
"def run_shell(cmd, quiet=True):\n",
"    \"\"\"Run a shell command and return its exit status.\n",
"\n",
"    With quiet=True the command's stdout/stderr are discarded. A non-zero status\n",
"    is reported with a warning instead of being silently ignored; nothing is raised,\n",
"    so the cell keeps its best-effort behaviour.\n",
"    \"\"\"\n",
"    status = os.system(f\"{cmd} >/dev/null 2>&1\" if quiet else cmd)\n",
"    if status != 0:\n",
"        print(f\"WARNING: exit status {status} from: {cmd}\")\n",
"    return status\n",
"\n",
"run_shell(\"pip install torch==2.4.0+cu124 torchaudio==2.4.0+cu124 torchvision==0.19.0+cu124 --extra-index-url https://download.pytorch.org/whl/cu124\")\n",
"run_shell(\"pip install accelerate==0.33.0 tensorboard\")\n",
"if not os.path.exists(\"F5-TTS\"):\n",
"    # Alternative source used previously: https://github.com/SWivid/F5-TTS.git\n",
"    run_shell(\"git clone https://github.com/danhtran2mind/F5-TTS.git\", quiet=False)\n",
"os.chdir(\"F5-TTS\")\n",
"# The editable install must run inside the checkout, i.e. only after the chdir above\n",
"# (the earlier copy of this command ran before the clone and had nothing to install).\n",
"run_shell(\"pip install -e .\")\n" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2025-06-17T15:05:03.277361Z", "iopub.status.busy": "2025-06-17T15:05:03.277007Z", "iopub.status.idle": "2025-06-17T15:05:03.280866Z", "shell.execute_reply": "2025-06-17T15:05:03.280113Z", "shell.execute_reply.started": "2025-06-17T15:05:03.277341Z" }, "trusted": true }, "outputs": [], "source": [ "os.chdir(\"/content/F5-TTS\")\n", "# os.chdir(\"F5-TTS-Vietnamese\")" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2025-06-17T15:05:03.283201Z", "iopub.status.busy": "2025-06-17T15:05:03.282849Z", "iopub.status.idle": "2025-06-17T15:05:03.431616Z", "shell.execute_reply": "2025-06-17T15:05:03.430672Z", "shell.execute_reply.started": "2025-06-17T15:05:03.283176Z" }, "trusted": true }, "outputs": [], "source": [
"# -p: create missing parents and do not fail when the directory already exists\n",
"!mkdir -p ./ckpts/vin100h-preprocessed-v2" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2025-05-29T14:55:41.394312Z", "iopub.status.busy": "2025-05-29T14:55:41.394058Z", "iopub.status.idle": "2025-05-29T14:56:35.002821Z", "shell.execute_reply": "2025-05-29T14:56:35.001574Z", "shell.execute_reply.started": "2025-05-29T14:55:41.394290Z" }, "trusted": true }, "outputs": [], "source": [ "# !cp -r /kaggle/input/vi-fine-tuned-t5-tts/29/model_last.pt ./ckpts/vin100h-preprocessed-v2" ] },
{ "cell_type": "code", "execution_count":
null, "metadata": { "execution": { "execution_failed": "2025-05-29T13:44:32.926Z" }, "trusted": true }, "outputs": [], "source": [
"# -p keeps this cell idempotent: a plain mkdir fails once the directory exists.\n",
"!mkdir -p ./ckpts/vin100h-preprocessed-v2" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2025-06-17T15:05:03.433154Z", "iopub.status.busy": "2025-06-17T15:05:03.432814Z", "iopub.status.idle": "2025-06-17T15:05:53.201849Z", "shell.execute_reply": "2025-06-17T15:05:53.200797Z", "shell.execute_reply.started": "2025-06-17T15:05:03.433120Z" }, "trusted": true }, "outputs": [], "source": [ "!cp -r /kaggle/input/vi-fine-tuned-t5-tts/80/model_last.pt \\\n", "./ckpts/vin100h-preprocessed-v2" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2025-06-17T15:05:53.203394Z", "iopub.status.busy": "2025-06-17T15:05:53.203095Z", "iopub.status.idle": "2025-06-17T15:05:53.337400Z", "shell.execute_reply": "2025-06-17T15:05:53.336629Z", "shell.execute_reply.started": "2025-06-17T15:05:53.203359Z" }, "trusted": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ". ..
model_last.pt\n" ] } ], "source": [ "!ls -a ./ckpts/vin100h-preprocessed-v2" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2025-05-21T16:31:43.230974Z", "iopub.status.busy": "2025-05-21T16:31:43.230651Z", "iopub.status.idle": "2025-05-21T16:31:57.026928Z", "shell.execute_reply": "2025-05-21T16:31:57.025871Z", "shell.execute_reply.started": "2025-05-21T16:31:43.230950Z" }, "trusted": true }, "outputs": [], "source": [ "# !cp -r ./ckpts/vin100h-preprocessed-v2/model_last.pt /kaggle/working/" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2025-05-21T16:33:02.071467Z", "iopub.status.busy": "2025-05-21T16:33:02.071064Z", "iopub.status.idle": "2025-05-21T16:33:02.193401Z", "shell.execute_reply": "2025-05-21T16:33:02.192650Z", "shell.execute_reply.started": "2025-05-21T16:33:02.071435Z" }, "trusted": true }, "outputs": [], "source": [ "!mv /kaggle/working/model_last.pt /kaggle/working/12/model_last.pt" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2025-05-10T15:59:08.329794Z", "iopub.status.busy": "2025-05-10T15:59:08.329442Z", "iopub.status.idle": "2025-05-10T15:59:09.362207Z", "shell.execute_reply": "2025-05-10T15:59:09.361253Z", "shell.execute_reply.started": "2025-05-10T15:59:08.329757Z" }, "trusted": true }, "outputs": [], "source": [
"import json\n",
"import os\n",
"from pathlib import Path\n",
"import shutil\n",
"import torchaudio\n",
"from datasets import load_dataset\n",
"from datasets.arrow_writer import ArrowWriter\n",
"from tqdm import tqdm\n",
"import soundfile as sf\n",
"import csv\n",
"\n",
"def save_dataset_to_local_disk(output_dir=\"./data/vin100h-preprocessed-v2\",\n",
"                               base_model=\"htdung167/vin100h-preprocessed-v2\",\n",
"                               audio_header='audio', text_header='transcription',\n",
"                               split='train', header=None):\n",
"    \"\"\"Export one split of a Hugging Face dataset as WAV files plus a metadata.csv.\n",
"\n",
"    Sample number idx is written to {output_dir}/wavs/audio_{idx:06d}.wav and listed\n",
"    in {output_dir}/metadata.csv as a pipe-delimited, UTF-8 row: wavs/FILE|TEXT.\n",
"\n",
"    Args:\n",
"        output_dir: Destination directory; created together with its wavs/ subfolder.\n",
"        base_model: Dataset identifier handed to load_dataset (historical name, kept\n",
"            so existing keyword callers continue to work).\n",
"        audio_header: Column whose value provides 'array' and 'sampling_rate'.\n",
"        text_header: Column whose value is written as the transcript.\n",
"        split: Name of the dataset split to export.\n",
"        header: Optional row written first in metadata.csv; None writes no header.\n",
"\n",
"    Returns:\n",
"        None. Progress is shown with tqdm and a summary line is printed at the end.\n",
"\n",
"    NOTE(review): upstream F5-TTS prepare_csv_wavs.py appears to skip the first row of\n",
"    metadata.csv as a header. If the fork cloned in this notebook does the same, pass\n",
"    header=(\"audio_file\", \"text\") so the first sample is not dropped - confirm.\n",
"    \"\"\"\n",
"    wavs_dir = os.path.join(output_dir, \"wavs\")\n",
"    metadata_path = os.path.join(output_dir, \"metadata.csv\")\n",
"    os.makedirs(wavs_dir, exist_ok=True)\n",
"\n",
"    ds = load_dataset(base_model)[split]\n",
"\n",
"    # Write each row right after its WAV file: no transcript list is buffered in\n",
"    # memory, and if the export raises part-way the CSV still matches the files on disk.\n",
"    with open(metadata_path, 'w', newline='', encoding='utf-8') as f:\n",
"        writer = csv.writer(f, delimiter='|')\n",
"        if header is not None:\n",
"            writer.writerow(header)\n",
"        for idx, sample in tqdm(enumerate(ds), total=len(ds),\n",
"                                desc=\"Saving samples to directory\"):\n",
"            audio = sample[audio_header]\n",
"            filename = f\"audio_{idx:06d}.wav\"\n",
"            sf.write(os.path.join(wavs_dir, filename), audio['array'], audio['sampling_rate'])\n",
"            writer.writerow([f\"wavs/{filename}\", sample[text_header]])\n",
"\n",
"    print(f\"Dataset saved to {output_dir}\")\n" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2025-05-10T15:59:10.399030Z", "iopub.status.busy": "2025-05-10T15:59:10.397916Z", "iopub.status.idle": "2025-05-10T16:10:46.269067Z", "shell.execute_reply": "2025-05-10T16:10:46.267298Z", "shell.execute_reply.started": "2025-05-10T15:59:10.398995Z" }, "trusted": true }, "outputs": [], "source": [ "output_dir = \"./data/vin100h-preprocessed-v2\"\n", "tokenizer_type = \"pinyin\"\n", "\n", "save_dataset_to_local_disk(output_dir=output_dir,\n", " base_model=\"htdung167/vin100h-preprocessed-v2\",\n", " text_header=\"preprocessed_sentence_v2\"\n", " )" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2025-05-10T16:10:46.273403Z", "iopub.status.busy": "2025-05-10T16:10:46.272176Z", "iopub.status.idle": "2025-05-10T17:15:19.405258Z", "shell.execute_reply": "2025-05-10T17:15:19.402002Z", "shell.execute_reply.started": "2025-05-10T16:10:46.273366Z" }, "trusted": true }, "outputs": [], "source": [ "!python
./src/f5_tts/train/datasets/prepare_csv_wavs.py \\\n", " \"./data/vin100h-preprocessed-v2\" \\\n", " \"./data/vin100h-preprocessed-v2_pinyin\" \\\n", " --workers 4 # Sets the number of parallel processes for preprocessing." ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "execution": { "iopub.execute_input": "2025-06-17T15:20:02.239561Z", "iopub.status.busy": "2025-06-17T15:20:02.238766Z", "iopub.status.idle": "2025-06-17T15:20:02.245371Z", "shell.execute_reply": "2025-06-17T15:20:02.244794Z", "shell.execute_reply.started": "2025-06-17T15:20:02.239531Z" }, "trusted": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing ./src/f5_tts/configs/vi-fine-tuned-f5-tts.yaml\n" ] } ], "source": [ "%%writefile ./src/f5_tts/configs/vi-fine-tuned-f5-tts.yaml\n", "hydra:\n", " run:\n", " dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}\n", "\n", "datasets:\n", " name: vin100h-preprocessed-v2 # dataset name\n", " batch_size_per_gpu: 3200 # 1 GPUs, 1 * 3200 = 3200\n", " batch_size_type: frame # frame | sample\n", " max_samples: 64 # max sequences per batch if use frame-wise batch_size. 
we set 32 for small models, 64 for base models\n", " num_workers: 4\n", "\n", "optim:\n", " epochs: 80\n", " learning_rate: 1e-5\n", " num_warmup_updates: 2761 # warmup updates\n", " grad_accumulation_steps: 2 # note: updates = steps / grad_accumulation_steps\n", " max_grad_norm: 1.0 # gradient clipping\n", " bnb_optimizer: False # use bnb 8bit AdamW optimizer or not\n", "\n", "model:\n", " name: vi_fine_tuned_t5_tts # model name\n", " tokenizer: pinyin # tokenizer type\n", " tokenizer_path: null # if 'custom' tokenizer, define the path want to use (should be vocab.txt)\n", " backbone: DiT\n", " arch:\n", " dim: 1024\n", " depth: 22\n", " heads: 16\n", " ff_mult: 2\n", " text_dim: 512\n", " text_mask_padding: False\n", " conv_layers: 4\n", " pe_attn_head: 1\n", " checkpoint_activations: False # recompute activations and save memory for extra compute\n", " mel_spec:\n", " target_sample_rate: 24000\n", " n_mel_channels: 100\n", " hop_length: 256\n", " win_length: 1024\n", " n_fft: 1024\n", " mel_spec_type: vocos # vocos | bigvgan\n", " vocoder:\n", " is_local: False # use local offline ckpt or not\n", " local_path: null # local vocoder path\n", "\n", "ckpts:\n", " logger: null # wandb | tensorboard | null\n", " log_samples: True # infer random sample per save checkpoint. 
wip, normal to fail with extra long samples\n", " save_per_updates: 4000 # save checkpoint per updates\n", " keep_last_n_checkpoints: 1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints\n", " last_per_updates: 4000 # save last checkpoint per updates\n", " save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2025-06-16T08:41:51.536675Z", "iopub.status.busy": "2025-06-16T08:41:51.536402Z", "iopub.status.idle": "2025-06-16T08:41:51.666812Z", "shell.execute_reply": "2025-06-16T08:41:51.665931Z", "shell.execute_reply.started": "2025-06-16T08:41:51.536657Z" }, "trusted": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "hello\n" ] } ], "source": [ "!echo hello" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2025-06-17T15:05:54.147828Z", "iopub.status.busy": "2025-06-17T15:05:54.147535Z", "iopub.status.idle": "2025-06-17T15:06:09.542218Z", "shell.execute_reply": "2025-06-17T15:06:09.541348Z", "shell.execute_reply.started": "2025-06-17T15:05:54.147805Z" }, "trusted": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "accelerate configuration saved at /root/.cache/huggingface/accelerate/default_config.yaml\n" ] } ], "source": [ "!accelerate config default" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "execution": { "iopub.execute_input": "2025-06-17T02:33:01.507167Z", "iopub.status.busy": "2025-06-17T02:33:01.506782Z", "iopub.status.idle": "2025-06-17T02:33:01.644738Z", "shell.execute_reply": "2025-06-17T02:33:01.643748Z", "shell.execute_reply.started": "2025-06-17T02:33:01.507086Z" }, "trusted": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "go\n" ] } ], "source": [ "!echo go" ] }, { "cell_type": "code", "execution_count": null, "metadata": { 
"execution": { "iopub.execute_input": "2025-06-17T15:16:30.232115Z", "iopub.status.busy": "2025-06-17T15:16:30.231283Z", "iopub.status.idle": "2025-06-17T15:18:25.550165Z", "shell.execute_reply": "2025-06-17T15:18:25.548630Z", "shell.execute_reply.started": "2025-06-17T15:16:30.232085Z" }, "trusted": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "copy checkpoint for finetune\n", "\n", "vocab : 2545\n", "\n", "vocoder : vocos\n", "Using logger: None\n", "Gradient accumulation checkpointing with per_updates now, old logic per_steps used with before f992c4e\n", "Loading dataset ...\n", "2025-06-17 15:17:40.763073: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", "E0000 00:00:1750173460.969428 249 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", "E0000 00:00:1750173461.025851 249 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", "Download Vocos from huggingface charactr/vocos-mel-24khz\n", "config.yaml: 100%|█████████████████████████████| 461/461 [00:00<00:00, 3.57MB/s]\n", "Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\n", "pytorch_model.bin: 100%|████████████████████| 54.4M/54.4M [00:00<00:00, 261MB/s]\n", "Sorting with sampler... 
if slow, check whether dataset is provided with duration\n", "Creating dynamic batches with 3200 audio frames per gpu: 100%|█| 56400/56400 [00\n", "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py:557: UserWarning: This DataLoader will create 16 worker processes in total. Our suggested max number of worker in current system is 4, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n", " warnings.warn(_create_warning_msg(\n", "Epoch 79/80: 0%| | 5/6182 [00:13<5:10:52, 3.02s/update, loss=0.843, update=49^C\n" ] } ], "source": [ "# ************\n", "!accelerate launch ./src/f5_tts/train/finetune_cli.py \\\n", " --exp_name F5TTS_Base \\\n", " --dataset_name vin100h-preprocessed-v2 \\\n", " --finetune \\\n", " --tokenizer pinyin \\\n", " --learning_rate 1e-05 \\\n", " --batch_size_type frame \\\n", " --batch_size_per_gpu 3200 \\\n", " --max_samples 64 \\\n", " --grad_accumulation_steps 2 \\\n", " --max_grad_norm 1 \\\n", " --epochs 80 \\\n", " --num_warmup_updates 2761 \\\n", " --save_per_updates 4000 \\\n", " --keep_last_n_checkpoints 1 \\\n", " --last_per_updates 4000 \\\n", " --log_samples \\\n", " --pretrain ./ckpts/vin100h-preprocessed-v2/model_last.pt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Copy and save" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "execution": { "iopub.execute_input": "2025-06-17T10:12:47.949751Z", "iopub.status.busy": "2025-06-17T10:12:47.949452Z", "iopub.status.idle": "2025-06-17T10:13:01.658980Z", "shell.execute_reply": "2025-06-17T10:13:01.657915Z", "shell.execute_reply.started": "2025-06-17T10:12:47.949726Z" }, "trusted": true }, "outputs": [], "source": [ "# *******************Importance\n", "save_path = \"/kaggle/working/80\"\n", "os.makedirs(save_path, exist_ok=True)\n", "!cp -r 
./ckpts/vin100h-preprocessed-v2/model_last.pt $save_path" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "execution": { "iopub.execute_input": "2025-06-17T10:16:44.769769Z", "iopub.status.busy": "2025-06-17T10:16:44.769490Z", "iopub.status.idle": "2025-06-17T10:18:44.924685Z", "shell.execute_reply": "2025-06-17T10:18:44.924158Z", "shell.execute_reply.started": "2025-06-17T10:16:44.769742Z" }, "trusted": true }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "db02312a61864bbda76e0436a3c30d59", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Recovering from metadata files: 0%| | 0/1 [00:00 str:\n", " try:\n", " checkpoint = torch.load(checkpoint_path, weights_only=True)\n", " print(\"Original Checkpoint Keys:\", checkpoint.keys())\n", "\n", " to_retain = \"ema_model_state_dict\" if save_ema else \"model_state_dict\"\n", " try:\n", " model_state_dict_to_retain = checkpoint[to_retain]\n", " except KeyError:\n", " return f\"{to_retain} not found in the checkpoint.\"\n", "\n", " if safetensors:\n", " new_checkpoint_path = new_checkpoint_path.replace(\".pt\", \".safetensors\")\n", " save_file(model_state_dict_to_retain, new_checkpoint_path)\n", " else:\n", " new_checkpoint_path = new_checkpoint_path.replace(\".safetensors\", \".pt\")\n", " new_checkpoint = {\"ema_model_state_dict\": model_state_dict_to_retain}\n", " torch.save(new_checkpoint, new_checkpoint_path)\n", "\n", " return f\"New checkpoint saved at: {new_checkpoint_path}\"\n", "\n", " except Exception as e:\n", " return f\"An error occurred: {e}\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2025-05-11T14:22:24.624318Z", "iopub.status.busy": "2025-05-11T14:22:24.623974Z", "iopub.status.idle": "2025-05-11T14:22:30.316195Z", "shell.execute_reply": "2025-05-11T14:22:30.315529Z", "shell.execute_reply.started": "2025-05-11T14:22:24.624292Z" }, "trusted": true }, "outputs": [], "source": [ "# 
Prune a checkpoint after training resize model\n", "result = prune_checkpoint(\n", " checkpoint_path=\"/kaggle/working/F5-TTS/ckpts/vin100h-preprocessed-v2/model_last.pt\",\n", " new_checkpoint_path=\"/root/.cache/abc.pt\",\n", " save_ema=False,\n", " safetensors=False\n", ")\n", "print(result)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Inference" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "execution": { "iopub.execute_input": "2025-06-04T09:45:21.012950Z", "iopub.status.busy": "2025-06-04T09:45:21.012568Z", "iopub.status.idle": "2025-06-04T09:45:21.032225Z", "shell.execute_reply": "2025-06-04T09:45:21.031171Z", "shell.execute_reply.started": "2025-06-04T09:45:21.012924Z" }, "trusted": true }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from IPython.display import Audio\n", "\n", "# Path to your audio file\n", "audio_path = './data/vin100h-preprocessed-v2/wavs/audio_000010.wav'\n", "\n", "# Display and play the audio\n", "Audio(audio_path)\n", "\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "execution": { "iopub.execute_input": "2025-06-17T15:27:26.150679Z", "iopub.status.busy": "2025-06-17T15:27:26.150330Z", "iopub.status.idle": "2025-06-17T15:28:18.529875Z", "shell.execute_reply": "2025-06-17T15:28:18.528858Z", "shell.execute_reply.started": "2025-06-17T15:27:26.150650Z" }, "trusted": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2025-06-17 15:27:38.164110: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", "E0000 00:00:1750174058.189595 391 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register 
factory for plugin cuDNN when one has already been registered\n", "E0000 00:00:1750174058.196516 391 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", "Download Vocos from huggingface charactr/vocos-mel-24khz\n", "Using vin100h-preprocessed-v2...\n", "\n", "vocab : ./data/vin100h-preprocessed-v2_pinyin/vocab.txt\n", "token : custom\n", "model : ./ckpts/vin100h-preprocessed-v2/model_last.pt \n", "\n", "Voice: main\n", "ref_audio ./data/vin100h-preprocessed-v2/wavs/audio_000010.wav\n", "Converting audio...\n", "Audio is over 12s, clipping short. (2)\n", "Using custom reference text...\n", "\n", "ref_text Về giá cả so với giá bán ngoài các siêu thị thì dâu trái ở đây rẻ hơn khá nhiều. Giả sử như bó rau ở siêu thị bán khoảng 2 đô la một bó thì ở đây chỉ có một đô la một bó. Có khi mua 50 bó được tặng thêm một bó nữa. \n", "ref_audio_ /tmp/tmpjucisns9.wav \n", "\n", "\n", "No voice tag found, using main.\n", "Voice: main\n", "gen_text 0 Tuy nhiên đôi khi vẫn có những trường hợp trục lợi trợ cấp khi không khai báo đầy đủ về người có nghĩa vụ chu cấp, cũng như những thay đổi về thu nhập và tài sản của mình.\n", "\n", "\n", "Generating audio in 1 batches...\n", "100%|█████████████████████████████████████████████| 1/1 [00:16<00:00, 16.82s/it]\n", "/kaggle/working/infer_cli_basic.wav\n", "52.37339425086975\n" ] } ], "source": [ "import time\n", "\n", "t1 = time.time()\n", "!python ./src/f5_tts/infer/infer_cli.py \\\n", " --model \"vin100h-preprocessed-v2\" \\\n", " --model_cfg \"./src/f5_tts/configs/vi-fine-tuned-f5-tts.yaml\" \\\n", " --ckpt_file \"./ckpts/vin100h-preprocessed-v2/model_last.pt\" \\\n", " --vocab_file \"./data/vin100h-preprocessed-v2_pinyin/vocab.txt\" \\\n", " --ref_audio \"./data/vin100h-preprocessed-v2/wavs/audio_000010.wav\" \\\n", " --ref_text \"Về giá cả so với giá bán ngoài các siêu thị thì dâu trái ở đây rẻ hơn khá nhiều. 
Giả sử như bó rau ở siêu thị bán khoảng 2 đô la một bó thì ở đây chỉ có một đô la một bó. Có khi mua 50 bó được tặng thêm một bó nữa.\" \\\n", " --gen_text \"Tuy nhiên đôi khi vẫn có những trường hợp trục lợi trợ cấp khi không khai báo đầy đủ về người có nghĩa vụ chu cấp, cũng như những thay đổi về thu nhập và tài sản của mình.\" \\\n", " --output_dir \"/kaggle/working/\"\n", " # --output_file \"/content/abc.wav\"\n", "\n", "print(time.time() - t1)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "execution": { "iopub.execute_input": "2025-06-17T10:23:52.564882Z", "iopub.status.busy": "2025-06-17T10:23:52.564411Z", "iopub.status.idle": "2025-06-17T10:24:36.841824Z", "shell.execute_reply": "2025-06-17T10:24:36.840934Z", "shell.execute_reply.started": "2025-06-17T10:23:52.564858Z" }, "trusted": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2025-06-17 10:24:02.873808: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", "E0000 00:00:1750155842.897993 500 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", "E0000 00:00:1750155842.905125 500 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", "Download Vocos from huggingface charactr/vocos-mel-24khz\n", "Using vin100h-preprocessed-v2...\n", "\n", "vocab : ./data/vin100h-preprocessed-v2_pinyin/vocab.txt\n", "token : custom\n", "model : ./ckpts/vin100h-preprocessed-v2/model_last.pt \n", "\n", "Voice: main\n", "ref_audio ./data/vin100h-preprocessed-v2/wavs/audio_000010.wav\n", "Converting audio...\n", "Audio is over 12s, clipping short. 
(2)\n", "Using custom reference text...\n", "\n", "ref_text Về giá cả so với giá bán ngoài các siêu thị thì dâu trái ở đây rẻ hơn khá nhiều. Giả sử như bó rau ở siêu thị bán khoảng 2 đô la một bó thì ở đây chỉ có một đô la một bó. Có khi mua 50 bó được tặng thêm một bó nữa. \n", "ref_audio_ /tmp/tmp6_z8vr7d.wav \n", "\n", "\n", "No voice tag found, using main.\n", "Voice: main\n", "gen_text 0 Tuy nhiên đôi khi vẫn có những trường hợp trục lợi trợ cấp khi không khai báo đầy đủ về người có nghĩa vụ chu cấp, cũng như những thay đổi về thu nhập và tài sản của mình.\n", "\n", "\n", "Generating audio in 1 batches...\n", "100%|█████████████████████████████████████████████| 1/1 [00:14<00:00, 14.86s/it]\n", "/kaggle/working/infer_cli_basic.wav\n", "44.271546602249146\n" ] } ], "source": [ "import time\n", "\n", "t1 = time.time()\n", "!python ./src/f5_tts/infer/infer_cli.py \\\n", " --model \"vin100h-preprocessed-v2\" \\\n", " --model_cfg \"./src/f5_tts/configs/F5TTS_Base.yaml\" \\\n", " --ckpt_file \"./ckpts/vin100h-preprocessed-v2/model_last.pt\" \\\n", " --vocab_file \"./data/vin100h-preprocessed-v2_pinyin/vocab.txt\" \\\n", " --ref_audio \"./data/vin100h-preprocessed-v2/wavs/audio_000010.wav\" \\\n", " --ref_text \"Về giá cả so với giá bán ngoài các siêu thị thì dâu trái ở đây rẻ hơn khá nhiều. Giả sử như bó rau ở siêu thị bán khoảng 2 đô la một bó thì ở đây chỉ có một đô la một bó. 
Có khi mua 50 bó được tặng thêm một bó nữa.\" \\\n", " --gen_text \"Tuy nhiên đôi khi vẫn có những trường hợp trục lợi trợ cấp khi không khai báo đầy đủ về người có nghĩa vụ chu cấp, cũng như những thay đổi về thu nhập và tài sản của mình.\" \\\n", " --output_dir \"/kaggle/working/\"\n", " # --output_file \"/content/abc.wav\"\n", "\n", "print(time.time() - t1)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "execution": { "iopub.execute_input": "2025-06-17T15:28:18.532293Z", "iopub.status.busy": "2025-06-17T15:28:18.531632Z", "iopub.status.idle": "2025-06-17T15:28:18.575767Z", "shell.execute_reply": "2025-06-17T15:28:18.574975Z", "shell.execute_reply.started": "2025-06-17T15:28:18.532267Z" }, "trusted": true }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from IPython.display import Audio\n", "\n", "# Path to your audio file\n", "audio_path = '/kaggle/working/infer_cli_basic.wav'\n", "\n", "# Display and play the audio\n", "Audio(audio_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Download" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2025-06-17T15:06:09.545133Z", "iopub.status.busy": "2025-06-17T15:06:09.544801Z", "iopub.status.idle": "2025-06-17T15:14:10.627410Z", "shell.execute_reply": "2025-06-17T15:14:10.626697Z", "shell.execute_reply.started": "2025-06-17T15:06:09.545102Z" }, "trusted": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Updated git hooks.\n", "Git LFS initialized.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fec7707540b24cdc9dce3d34fb063e04", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Fetching 2 files: 0%| | 0/2 [00:00\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "# To 
temporary Model hub\n", "from huggingface_hub import HfApi\n", "from huggingface_hub import snapshot_download\n", "# Initialize API\n", "api = HfApi()\n", "\n", "# Upload the folder to the repository root\n", "api.upload_large_folder(\n", " folder_path=\"/kaggle/working/save-to-huggingface\", # Local folder path\n", " repo_id=\"heboya8/t5-tts-temp-model\",\n", " repo_type=\"model\"\n", ")" ] } ], "metadata": { "kaggle": { "accelerator": "gpu", "dataSources": [ { "sourceId": 245908236, "sourceType": "kernelVersion" } ], "dockerImageVersionId": 31012, "isGpuEnabled": true, "isInternetEnabled": true, "language": "python", "sourceType": "notebook" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.11" } }, "nbformat": 4, "nbformat_minor": 4 }