{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/ubuntu/vocos\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/miniconda3/envs/respair/lib/python3.11/site-packages/IPython/core/magics/osm.py:417: UserWarning: This is now an optional IPython functionality, setting dhist requires you to install the `pickleshare` library.\n",
" self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n"
]
}
],
"source": [
"%cd /home/ubuntu/vocos/"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_2705444/1667309830.py:12: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
" raw_model = torch.load(checkpoint_path, map_location=device)\n"
]
}
],
"source": [
"from IPython.display import Audio\n",
"import torch\n",
"import librosa\n",
"import torchaudio\n",
"from vocos.pretrained import Vocos\n",
"\n",
"\n",
"\n",
"def load_vocos(checkpoint_path, config_path, device):\n",
" model = Vocos.from_hparams(config_path).to(device)\n",
"\n",
" raw_model = torch.load(checkpoint_path, map_location=device)\n",
" raw_model = raw_model if 'state_dict' not in raw_model else raw_model['state_dict']\n",
" model.load_state_dict(raw_model, strict=False)\n",
" model.eval()\n",
" return model\n",
"\n",
"\n",
"checkpoint_path = \"/home/ubuntu/vocos/logs/lightning_logs/version_25/checkpoints/last.ckpt\"\n",
"config_path = \"/home/ubuntu/vocos/logs/lightning_logs/version_25/config.yaml\"\n",
"device = \"cpu\"\n",
"\n",
"vocos = load_vocos(checkpoint_path, config_path, device)"
]
},
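{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional, hedged sketch (not part of the original run): reload the weights with the\n",
"# `weights_only=True` load that the FutureWarning above recommends. This assumes the\n",
"# Lightning checkpoint only pickles tensors and plain containers; otherwise it falls\n",
"# back to the permissive load.\n",
"import torch\n",
"\n",
"try:\n",
"    state = torch.load(checkpoint_path, map_location=device, weights_only=True)\n",
"except Exception:\n",
"    state = torch.load(checkpoint_path, map_location=device, weights_only=False)\n",
"state = state.get('state_dict', state)\n",
"missing, unexpected = vocos.load_state_dict(state, strict=False)\n",
"print(f'missing keys: {len(missing)}, unexpected keys: {len(unexpected)}')"
]
},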
{
"cell_type": "code",
"execution_count": 201,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1848695/874175190.py:15: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
" mel_tensor = torch.load(\"/home/ubuntu/respair/Darya_AuxiliaryASR/HiFTNet/test.pt\")\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 201,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def safe_log(x: torch.Tensor, clip_val: float = 1e-7) -> torch.Tensor:\n",
" \"\"\"\n",
" Computes the element-wise logarithm of the input tensor with clipping to avoid near-zero values.\n",
"\n",
" Args:\n",
" x (Tensor): Input tensor.\n",
" clip_val (float, optional): Minimum value to clip the input tensor. Defaults to 1e-7.\n",
"\n",
" Returns:\n",
" Tensor: Element-wise logarithm of the input tensor with clipping applied.\n",
" \"\"\"\n",
" return torch.log(torch.clip(x, min=clip_val))\n",
"\n",
"\n",
"mel_tensor = torch.load(\"/home/ubuntu/respair/Darya_AuxiliaryASR/HiFTNet/test.pt\")\n",
"# mel = safe_log(mel)\n",
"\n",
"mean = -4\n",
"std = 4\n",
"\n",
"# Reverse normalization and logarithmic transform\n",
"denormalized = mel_tensor * std + mean\n",
"mel_plus_epsilon = torch.exp(denormalized)\n",
"original_mel = mel_plus_epsilon - 1e-5\n",
"\n",
"# Ensure non-negative values (mel spectrograms can't be negative)\n",
"mel_tensor = torch.clamp(original_mel, min=0)\n",
"\n",
"mel_tensor = safe_log(mel_tensor)\n",
"\n",
"# Original mel: [n_mels, time]\n",
"mel_tensor = F.interpolate(\n",
" mel_tensor, # Add batch and channel dims\n",
" scale_factor=.9, # Halve the time dimension\n",
" mode=\"area\" # Linear interpolation\n",
")\n",
"\n",
"audio = vocos.decode(mel_tensor.to('cuda'))\n",
"Audio(audio.cpu().numpy(), rate=24000)\n",
"\n",
"# mel_tensor = (torch.log(1e-5 + original_mel) - -1) / 1"
]
},
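{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sanity check (a sketch, not part of the original run): the inverse\n",
"# transform used above should undo the (log(1e-5 + mel) - mean) / std normalisation\n",
"# with mean = -4, std = 4, up to floating-point error.\n",
"import torch\n",
"\n",
"dummy_mel = torch.rand(1, 128, 50)                   # fake non-negative mel magnitudes\n",
"normalised = (torch.log(1e-5 + dummy_mel) + 4) / 4   # forward: (log(1e-5 + x) - mean) / std\n",
"recovered = torch.exp(normalised * 4 - 4) - 1e-5     # inverse used in the cell above\n",
"print(torch.allclose(recovered, dummy_mel, atol=1e-6))"
]
},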
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_2705444/713572534.py:3: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
" x = torch.load(\"/home/ubuntu/ASR_128bin/bigvgan_v2_44khz_128band_512x/gt.pt\")[:1,:,:].to('cpu')\n"
]
}
],
"source": [
"import torch.nn.functional as F\n",
"\n",
"x = torch.load(\"/home/ubuntu/ASR_128bin/bigvgan_v2_44khz_128band_512x/gt.pt\")[:1,:,:].to('cpu')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"to_mel = torchaudio.transforms.MelSpectrogram(sample_rate=44_100,\n",
" n_mels=128, n_fft=2048, win_length=2048, hop_length=512)\n",
"mean, std = -4, 4\n",
"\n",
"def preprocess(wave):\n",
" \n",
" wave_tensor = torch.from_numpy(wave).float()\n",
" mel_tensor = to_mel(wave_tensor)\n",
" mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
" return mel_tensor.to('cpu')\n",
"\n",
"\n",
"wav = librosa.load(\"/home/ubuntu/respair/jpn/miside/voices_combined/LocationDialogue_Location1_25_28.wav\", sr=44_100)[0]\n",
"\n",
"\n",
"mel = preprocess(wav)\n",
"\n",
"\n",
"\n",
"audio = vocos.decode(x)\n",
"Audio(audio.cpu().numpy(), rate=44_100)\n"
]
},
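{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hedged follow-up sketch: decode the mel produced by preprocess() above instead of the\n",
"# pre-saved ground-truth tensor `x`. Mirroring the inverse transform in the HiFTNet cell,\n",
"# the normalised mel is mapped back to a log-mel before decoding; this assumes the\n",
"# checkpoint expects log-mel features at 44.1 kHz with 128 bins.\n",
"log_mel = torch.log(torch.clamp(torch.exp(mel * std + mean) - 1e-5, min=1e-7))\n",
"\n",
"with torch.no_grad():\n",
"    recon = vocos.decode(log_mel.to(device))\n",
"\n",
"Audio(recon.squeeze().cpu().numpy(), rate=44_100)"
]
},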
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([18, 72192])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"audio.shape"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"ename": "LibsndfileError",
"evalue": "Error opening '/home/ubuntu/ASR_128bin/bigvgan_v2_44khz_128band_512x/decoded.wav': Format not recognised.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mLibsndfileError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[11], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msoundfile\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01msf\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m sf\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/home/ubuntu/ASR_128bin/bigvgan_v2_44khz_128band_512x/decoded.wav\u001b[39m\u001b[38;5;124m\"\u001b[39m, audio, \u001b[38;5;241m44_100\u001b[39m)\n",
"File \u001b[0;32m~/miniconda3/envs/respair/lib/python3.11/site-packages/soundfile.py:363\u001b[0m, in \u001b[0;36mwrite\u001b[0;34m(file, data, samplerate, subtype, endian, format, closefd, compression_level, bitrate_mode)\u001b[0m\n\u001b[1;32m 361\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 362\u001b[0m channels \u001b[38;5;241m=\u001b[39m data\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m--> 363\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m SoundFile(file, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m'\u001b[39m, samplerate, channels,\n\u001b[1;32m 364\u001b[0m subtype, endian, \u001b[38;5;28mformat\u001b[39m, closefd,\n\u001b[1;32m 365\u001b[0m compression_level, bitrate_mode) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 366\u001b[0m f\u001b[38;5;241m.\u001b[39mwrite(data)\n",
"File \u001b[0;32m~/miniconda3/envs/respair/lib/python3.11/site-packages/soundfile.py:690\u001b[0m, in \u001b[0;36mSoundFile.__init__\u001b[0;34m(self, file, mode, samplerate, channels, subtype, endian, format, closefd, compression_level, bitrate_mode)\u001b[0m\n\u001b[1;32m 687\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_bitrate_mode \u001b[38;5;241m=\u001b[39m bitrate_mode\n\u001b[1;32m 688\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_info \u001b[38;5;241m=\u001b[39m _create_info_struct(file, mode, samplerate, channels,\n\u001b[1;32m 689\u001b[0m \u001b[38;5;28mformat\u001b[39m, subtype, endian)\n\u001b[0;32m--> 690\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_open(file, mode_int, closefd)\n\u001b[1;32m 691\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mset\u001b[39m(mode)\u001b[38;5;241m.\u001b[39missuperset(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr+\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mseekable():\n\u001b[1;32m 692\u001b[0m \u001b[38;5;66;03m# Move write position to 0 (like in Python file objects)\u001b[39;00m\n\u001b[1;32m 693\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mseek(\u001b[38;5;241m0\u001b[39m)\n",
"File \u001b[0;32m~/miniconda3/envs/respair/lib/python3.11/site-packages/soundfile.py:1265\u001b[0m, in \u001b[0;36mSoundFile._open\u001b[0;34m(self, file, mode_int, closefd)\u001b[0m\n\u001b[1;32m 1262\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file_ptr \u001b[38;5;241m==\u001b[39m _ffi\u001b[38;5;241m.\u001b[39mNULL:\n\u001b[1;32m 1263\u001b[0m \u001b[38;5;66;03m# get the actual error code\u001b[39;00m\n\u001b[1;32m 1264\u001b[0m err \u001b[38;5;241m=\u001b[39m _snd\u001b[38;5;241m.\u001b[39msf_error(file_ptr)\n\u001b[0;32m-> 1265\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m LibsndfileError(err, prefix\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError opening \u001b[39m\u001b[38;5;132;01m{0!r}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mname))\n\u001b[1;32m 1266\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m mode_int \u001b[38;5;241m==\u001b[39m _snd\u001b[38;5;241m.\u001b[39mSFM_WRITE:\n\u001b[1;32m 1267\u001b[0m \u001b[38;5;66;03m# Due to a bug in libsndfile version <= 1.0.25, frames != 0\u001b[39;00m\n\u001b[1;32m 1268\u001b[0m \u001b[38;5;66;03m# when opening a named pipe in SFM_WRITE mode.\u001b[39;00m\n\u001b[1;32m 1269\u001b[0m \u001b[38;5;66;03m# See http://github.com/erikd/libsndfile/issues/77.\u001b[39;00m\n\u001b[1;32m 1270\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_info\u001b[38;5;241m.\u001b[39mframes \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n",
"\u001b[0;31mLibsndfileError\u001b[0m: Error opening '/home/ubuntu/ASR_128bin/bigvgan_v2_44khz_128band_512x/decoded.wav': Format not recognised."
]
}
],
"source": [
"import soundfile as sf\n",
"\n",
"sf.write(\"/home/ubuntu/ASR_128bin/bigvgan_v2_44khz_128band_512x/decoded.wav\", audio, 44_100)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Audio(wav, rate=24_000)"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[[-2.0122, -1.7062, -0.6355, ..., -1.2065, -0.7076, -1.2003],\n",
" [-2.0825, -1.8459, -0.6099, ..., -1.1121, -1.1247, -1.2791],\n",
" [-1.8935, -1.7773, -0.6214, ..., -1.2913, -0.8913, -0.9125],\n",
" ...,\n",
" [-1.8168, -1.8461, -1.7281, ..., -1.5361, -1.5047, -1.5924],\n",
" [-1.8209, -1.8349, -1.7215, ..., -1.6026, -1.5498, -1.5136],\n",
" [-1.7768, -1.8042, -1.7079, ..., -1.7335, -1.5213, -1.5378]]])"
]
},
"execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mel"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "respair",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}