# %%
# Load the Valentini noisy/clean speech corpus and listen to one sample.
import sys
from pathlib import Path

from IPython.display import Audio

from datasets import Valentini

# Root directory of the extracted corpus on the shared data volume.
DATASET_ROOT = '/media/public/datasets/denoising/DS_10283_2791/'

# Playback rate for raw waveforms returned by the dataset.
# NOTE(review): assumes the dataset yields waveforms at 16 kHz (the rate this
# notebook's own load_wav resamples to) -- TODO confirm against datasets.py.
SAMPLE_RATE = 16000

dataset = Valentini(DATASET_ROOT, valid=False)

# %%
# First (noisy, clean) pair of the dataset.
noisy, clean = dataset[0]

# %%
# Interpreter the kernel is actually running. `!which python` only reports the
# first `python` on PATH, which can differ from the kernel's environment and is
# misleading when chasing ModuleNotFoundError.
sys.executable

# %%
# IPython requires `rate` whenever `data` is raw samples rather than a file
# path, URL or encoded bytes; without it Audio raises ValueError.
Audio(noisy, rate=SAMPLE_RATE)
# %%
# NOTE(review): this repeats the dataset construction from the first cell and
# rebinds `dataset` using the constructor's default arguments. Kept as-is;
# drop it if the first cell is the intended configuration.
from datasets import Valentini

dataset = Valentini()

# %%
# Directories holding the paired clean / noisy test utterances.
dataset_path = Path('/media/public/datasets/denoising/DS_10283_2791')
clean_path = dataset_path / 'clean_testset_wav'
noisy_path = dataset_path / 'noisy_testset_wav'

# %%
# Path.glob yields entries in an unspecified order, so both listings are
# sorted: later cells pair clean_wavs[i] with noisy_wavs[i] purely by index.
clean_wavs = sorted(clean_path.glob("*"))
noisy_wavs = sorted(noisy_path.glob("*"))

# Guard the index-based pairing.
# NOTE(review): assumes each noisy file shares its file name with its clean
# counterpart -- confirm against the corpus layout.
assert [p.name for p in clean_wavs] == [p.name for p in noisy_wavs], \
    "clean and noisy file listings do not line up"

# %%
len(clean_wavs), len(noisy_wavs)

# %%
from IPython.display import Audio, display
# %%
def display_pair(i):
    """Render audio players for the i-th noisy file, then the i-th clean file.

    Reads the notebook-level `noisy_wavs` and `clean_wavs` lists; `i` is used
    as a plain list index, so negative values count from the end.
    """
    for wav_file in (noisy_wavs[i], clean_wavs[i]):
        display(Audio(wav_file, rate=48000))


display_pair(-1)

# %% [markdown]
# - SDR https://torchmetrics.readthedocs.io/en/stable/audio/signal_distortion_ratio.html
# - SI-SNR https://torchmetrics.readthedocs.io/en/stable/audio/scale_invariant_signal_noise_ratio.html?highlight=Si-SNR
# %%
import torch
import torchaudio
import torchmetrics
from torchmetrics.audio.pesq import PerceptualEvaluationSpeechQuality
from torchmetrics.audio.stoi import ShortTimeObjectiveIntelligibility
from tqdm import tqdm

from denoisers.SpectralGating import SpectralGating


class Metrics:
    """Bundle of the speech-quality metrics used to score a denoiser.

    Args:
        rate: Sampling rate (Hz) handed to both metrics.
        mode: PESQ mode passed to torchmetrics; the default 'wb' keeps the
            original behaviour.
    """

    def __init__(self, rate=16000, mode='wb'):
        self.pesq = PerceptualEvaluationSpeechQuality(rate, mode)
        # Backward-compatible alias: the attribute used to be called `nb_pesq`
        # even though it was always built with the 'wb' mode.
        self.nb_pesq = self.pesq
        self.stoi = ShortTimeObjectiveIntelligibility(rate, extended=False)

    def calculate(self, preds, target):
        """Score `preds` against the reference `target`.

        Returns:
            dict with keys 'PESQ' and 'STOI' holding the metric outputs.
        """
        return {'PESQ': self.pesq(preds, target),
                'STOI': self.stoi(preds, target)}


def load_wav(path, target_sr=16000):
    """Load an audio file and resample it to `target_sr` Hz.

    Args:
        path: Audio file to read with torchaudio.
        target_sr: Output sampling rate; defaults to the 16 kHz the metrics
            above are configured for.

    Returns:
        The resampled waveform as returned by torchaudio.
    """
    wav, org_sr = torchaudio.load(path)
    return torchaudio.functional.resample(wav, orig_freq=org_sr, new_freq=target_sr)


model = SpectralGating()
metrics = Metrics()

clean_wav = load_wav(clean_wavs[0])
noisy_wav = load_wav(noisy_wavs[0])
denoised = model(noisy_wav)

# %%
# Baseline: how far the untouched noisy signal is from the clean reference.
metrics.calculate(noisy_wav, clean_wav)

# %%
# Model: how far the denoised signal is from the clean reference.
metrics.calculate(denoised, clean_wav)

# %%
# Number of file pairs to score; raise it for a fuller (slower) evaluation.
N_EVAL = 10
eval_pairs = list(zip(clean_wavs[:N_EVAL], noisy_wavs[:N_EVAL]))

# `mean_scores_ideal` is the noisy-vs-clean baseline, `mean_scores_model` the
# denoised-vs-clean result. Names are kept because later cells display them.
mean_scores_ideal = {'PESQ': 0, 'STOI': 0}
mean_scores_model = {'PESQ': 0, 'STOI': 0}

# Loop variables are deliberately NOT named clean_path / noisy_path: those
# names hold the dataset directories defined earlier in the notebook.
for clean_file, noisy_file in tqdm(eval_pairs):
    clean_wav = load_wav(clean_file)
    noisy_wav = load_wav(noisy_file)
    denoised_wav = model(noisy_wav)

    scores_ideal = metrics.calculate(noisy_wav, clean_wav)
    # The model output must be scored against the clean reference; comparing
    # the noisy input with the denoised output says nothing about quality.
    scores_model = metrics.calculate(denoised_wav, clean_wav)

    for key in ('PESQ', 'STOI'):
        mean_scores_ideal[key] = mean_scores_ideal[key] + scores_ideal[key]
        mean_scores_model[key] = mean_scores_model[key] + scores_model[key]

# Average over the pairs actually scored, not over the whole file list.
n_scored = len(eval_pairs)
if n_scored:
    for key in ('PESQ', 'STOI'):
        mean_scores_ideal[key] = mean_scores_ideal[key] / n_scored
        mean_scores_model[key] = mean_scores_model[key] / n_scored

# %%
mean_scores_ideal

# %%
mean_scores_model
# %%
def inference(i, rate=16000):
    """Denoise the i-th noisy file and render noisy / clean / denoised players.

    Args:
        i: Index into the notebook-level `noisy_wavs` / `clean_wavs` lists.
        rate: Playback rate (Hz) for the model output. It must match the rate
            of the waveform fed to the model, which `load_wav` resamples to
            16 kHz by default.
    """
    # The denoiser is called on a loaded waveform everywhere else in this
    # notebook, so load the file here instead of passing the path itself.
    noisy_wav = load_wav(noisy_wavs[i])
    prediction = model(noisy_wav)

    # The reference files are played straight from disk; `rate` is only needed
    # for raw sample data, so none is passed here.
    display(Audio(noisy_wavs[i]))
    display(Audio(clean_wavs[i]))
    # NOTE(review): assumes the model returns something Audio can convert to a
    # numpy array (e.g. a CPU tensor without grad) -- confirm.
    display(Audio(prediction, rate=rate))


inference(5)

# %%
from huggingsound import SpeechRecognitionModel

# Bound to `asr_model` rather than `model` so the denoiser defined earlier is
# not silently replaced when cells above are re-run.
# NOTE(review): the checkpoint name says "spanish" -- confirm that this is the
# intended language for the audio being transcribed.
asr_model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-spanish")
# Placeholder paths: replace with real files before running.
audio_paths = ["/path/to/file.mp3", "/path/to/another_file.wav"]

transcriptions = asr_model.transcribe(audio_paths)
"file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.16" } }, "nbformat": 4, "nbformat_minor": 5 }