diff --git "a/notebooks/EDA.ipynb" "b/notebooks/EDA.ipynb" deleted file mode 100644--- "a/notebooks/EDA.ipynb" +++ /dev/null @@ -1,465 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "f800718e-c29f-44d8-bf41-e02d50d0f730", - "metadata": { - "ExecuteTime": { - "start_time": "2023-04-29T13:11:15.198687Z", - "end_time": "2023-04-29T13:11:15.245584Z" - }, - "pycharm": { - "is_executing": true - } - }, - "outputs": [], - "source": [ - "\n", - "from pathlib import Path\n", - "\n", - "from datasets import Valentini\n", - "\n", - "dataset = Valentini('/media/public/datasets/denoising/DS_10283_2791/', valid=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "noisy , clean = dataset[0]" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": 2, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/maksim/miniconda3/bin/python\r\n" - ] - } - ], - "source": [ - "!which python" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "start_time": "2023-04-29T13:19:44.813901Z", - "end_time": "2023-04-29T13:19:45.361947Z" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "from IPython.display import Audio\n", - "Audio(noisy,)" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": 12, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'matplotlib'", - "output_type": "error", - "traceback": [ - "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[0;31mModuleNotFoundError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[0;32mIn[12], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mdatasets\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Valentini\n\u001B[1;32m 3\u001B[0m dataset \u001B[38;5;241m=\u001B[39m Valentini()\n", - "File \u001B[0;32m~/PycharmProjects/denoising/datasets.py:4\u001B[0m\n\u001B[1;32m 2\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mtorch\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mutils\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdata\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Dataset\n\u001B[1;32m 3\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mpathlib\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Path\n\u001B[0;32m----> 4\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mutils\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m load_wav\n\u001B[1;32m 7\u001B[0m \u001B[38;5;28;01mclass\u001B[39;00m \u001B[38;5;21;01mValentini\u001B[39;00m(Dataset):\n\u001B[1;32m 8\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m__init__\u001B[39m(\u001B[38;5;28mself\u001B[39m, dataset_path\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m/media/public/datasets/denoising/DS_10283_2791/\u001B[39m\u001B[38;5;124m'\u001B[39m, transform\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m,\n\u001B[1;32m 9\u001B[0m target_transform\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m):\n", - "File \u001B[0;32m~/PycharmProjects/denoising/utils.py:3\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mtorchaudio\u001B[39;00m\n\u001B[1;32m 2\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mtorch\u001B[39;00m\n\u001B[0;32m----> 3\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mmatplotlib\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mpyplot\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m \u001B[38;5;21;01mplt\u001B[39;00m\n\u001B[1;32m 4\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mpathlib\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Path\n\u001B[1;32m 7\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mcollect_valentini_paths\u001B[39m(dataset_path):\n", - "\u001B[0;31mModuleNotFoundError\u001B[0m: No module named 'matplotlib'" - ] - } - ], - "source": [ - "from datasets import Valentini\n", - "\n", - "dataset = Valentini()\n" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "start_time": "2023-04-29T13:12:43.304369Z", - "end_time": "2023-04-29T13:12:43.377178Z" - } - } - }, - { - "cell_type": "code", - "execution_count": 2, - "outputs": [], - "source": [ - "dataset_path = Path('/media/public/datasets/denoising/DS_10283_2791')\n", - "clean_path = dataset_path / 'clean_testset_wav'\n", - "noisy_path = dataset_path / 'noisy_testset_wav'" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "start_time": "2023-04-29T09:40:20.255923Z", - "end_time": "2023-04-29T09:40:20.259910Z" - } - } - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "f236e6df-2e29-4100-9549-8566a1dc1307", - "metadata": { - "ExecuteTime": { - "start_time": "2023-04-29T09:40:20.259910Z", - "end_time": "2023-04-29T09:40:20.259910Z" - } - }, - "outputs": [], - "source": [ - "clean_wavs = list(clean_path.glob(\"*\"))\n", - "noisy_wavs = list(noisy_path.glob(\"*\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "023c655d-2515-4f29-ba87-1c17d87acf97", - "metadata": { - "ExecuteTime": { - "start_time": "2023-04-29T09:40:20.354536Z", - "end_time": "2023-04-29T09:40:20.383325Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": "(824, 824)" - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(clean_wavs), len(noisy_wavs)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "f45674a2-586e-49e0-85c4-2abdc9f27697", - "metadata": { - "ExecuteTime": { - "start_time": "2023-04-29T09:40:20.354536Z", - "end_time": "2023-04-29T09:40:20.383325Z" - } - }, - "outputs": [], - "source": [ - "from IPython.display import Audio, display" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "7303c87b-ffc2-4203-93e1-0d5ccde3d553", - "metadata": { - "ExecuteTime": { - "start_time": "2023-04-29T09:40:20.354536Z", - "end_time": "2023-04-29T09:40:21.319341Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": "", - "text/html": "\n \n " - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": "", - "text/html": "\n \n " - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "def display_pair(i):\n", - " display(Audio(noisy_wavs[i], rate=48000))\n", - " display(Audio(clean_wavs[i], rate=48000))\n", - "display_pair(-1)" - ] - }, - { - "cell_type": "markdown", - "id": "b3a93ce0-aa4a-416a-a8d7-398dbd19236b", - "metadata": {}, - "source": [ - "- SDR https://torchmetrics.readthedocs.io/en/stable/audio/signal_distortion_ratio.html\n", - "- SI-SNR https://torchmetrics.readthedocs.io/en/stable/audio/scale_invariant_signal_noise_ratio.html?highlight=Si-SNR" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/maksim/miniconda3/bin/python\r\n" - ] - } - ], - "source": [], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "start_time": "2023-04-29T09:40:46.742924Z", - "end_time": "2023-04-29T09:40:47.415784Z" - } - } - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "37404b32-dc25-4c70-8aca-6849c1a611bf", - "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'torchmetrics'", - "output_type": "error", - "traceback": [ - "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[0;31mModuleNotFoundError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[0;32mIn[8], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mtorchmetrics\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01maudio\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mpesq\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m PerceptualEvaluationSpeechQuality\n\u001B[1;32m 2\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mtorchmetrics\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01maudio\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mstoi\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m ShortTimeObjectiveIntelligibility\n\u001B[1;32m 3\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mtorch\u001B[39;00m\n", - "\u001B[0;31mModuleNotFoundError\u001B[0m: No module named 'torchmetrics'" - ] - } - ], - "source": [ - "from torchmetrics.audio.pesq import PerceptualEvaluationSpeechQuality\n", - "from torchmetrics.audio.stoi import ShortTimeObjectiveIntelligibility\n", - "import torch\n", - "import torchaudio\n", - "import torchmetrics\n", - "from denoisers.SpectralGating import SpectralGating\n", - "\n", - "\n", - "class Metrics:\n", - " def __init__(self, rate=16000):\n", - " self.nb_pesq = PerceptualEvaluationSpeechQuality(rate, 'wb')\n", - " self.stoi = ShortTimeObjectiveIntelligibility(rate, False)\n", - " def calculate(self, preds, target):\n", - " return {'PESQ': self.nb_pesq(preds, target), \n", - " 'STOI': self.stoi(preds, target)}\n", - "\n", - "def load_wav(path):\n", - " wav, org_sr = torchaudio.load(path)\n", - " wav = torchaudio.functional.resample(wav, orig_freq=org_sr, new_freq=16000)\n", - " return wav\n", - "\n", - "\n", - "\n", - "\n", - "model = SpectralGating()\n", - "metrics = Metrics()\n", - "\n", - "clean_wav = load_wav(clean_wavs[0])\n", - "noisy_wav = load_wav(noisy_wavs[0])\n", - "denoised = model(noisy_wav)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af9d9987-19dd-498e-8f83-6601bca17013", - "metadata": {}, - "outputs": [], - "source": [ - "metrics.calculate(noisy_wav, clean_wav)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15d7cb6e-951a-42dd-ae23-1838bcdcbd77", - "metadata": {}, - "outputs": [], - "source": [ - "metrics.calculate(denoised, clean_wav)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "abeea748-a9c4-4f1c-97f5-66b441136e52", - "metadata": {}, - "outputs": [], - "source": [ - "from tqdm import tqdm\n", - "mean_scores_ideal = {'PESQ': 0,'STOI': 0}\n", - "mean_scores_model = {'PESQ': 0, 'STOI': 0}\n", - "\n", - "for clean_path, noisy_path in tqdm(zip(clean_wavs[:10], noisy_wavs[:10])):\n", - " clean_wav = load_wav(clean_path)\n", - " noisy_wav = load_wav(noisy_path)\n", - " denoised_wav = model(noisy_wav)\n", - " \n", - " scores_ideal = metrics.calculate(noisy_wav, clean_wav)\n", - " scores_model = metrics.calculate(noisy_wav, denoised_wav)\n", - " \n", - " mean_scores_ideal['PESQ'] += scores_ideal['PESQ']\n", - " mean_scores_ideal['STOI'] += scores_ideal['STOI']\n", - " \n", - " mean_scores_model['PESQ'] += scores_model['PESQ']\n", - " mean_scores_model['STOI'] += scores_model['STOI']\n", - "\n", - "mean_scores_ideal['PESQ'] = mean_scores_ideal['PESQ'] / len(clean_wavs)\n", - "mean_scores_ideal['STOI'] = mean_scores_ideal['STOI'] / len(clean_wavs)\n", - "mean_scores_model['PESQ'] = mean_scores_model['PESQ'] / len(clean_wavs)\n", - "mean_scores_model['STOI'] = mean_scores_model['STOI'] / len(clean_wavs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f0eac478-9a2d-4820-a0ef-37a6d28025e0", - "metadata": {}, - "outputs": [], - "source": [ - "mean_scores_ideal" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "42651dba-fa5b-461f-acc3-5c226cdb355b", - "metadata": {}, - "outputs": [], - "source": [ - "mean_scores_model" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "b60c40dd-7244-4ef3-be6c-e16df51e2e17", - "metadata": { - "ExecuteTime": { - "start_time": "2023-04-29T09:23:03.151509Z", - "end_time": "2023-04-29T09:23:03.151509Z" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "17bf893d-6468-48d7-902b-c160426a6067", - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'model' is not defined", - "output_type": "error", - "traceback": [ - "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[0;32mIn[3], line 7\u001B[0m\n\u001B[1;32m 4\u001B[0m display(Audio(clean_wavs[i],rate\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m48000\u001B[39m))\n\u001B[1;32m 5\u001B[0m display(Audio(prediction,rate\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m48000\u001B[39m))\n\u001B[0;32m----> 7\u001B[0m \u001B[43minference\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m5\u001B[39;49m\u001B[43m)\u001B[49m\n", - "Cell \u001B[0;32mIn[3], line 2\u001B[0m, in \u001B[0;36minference\u001B[0;34m(i)\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21minference\u001B[39m(i):\n\u001B[0;32m----> 2\u001B[0m prediction \u001B[38;5;241m=\u001B[39m \u001B[43mmodel\u001B[49m(noisy_wavs[i])\n\u001B[1;32m 3\u001B[0m display(Audio(noisy_wavs[i],rate\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m48000\u001B[39m))\n\u001B[1;32m 4\u001B[0m display(Audio(clean_wavs[i],rate\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m48000\u001B[39m))\n", - "\u001B[0;31mNameError\u001B[0m: name 'model' is not defined" - ] - } - ], - "source": [ - "def inference(i):\n", - " prediction = model(noisy_wavs[i])\n", - " display(Audio(noisy_wavs[i],rate=48000))\n", - " display(Audio(clean_wavs[i],rate=48000))\n", - " display(Audio(prediction,rate=48000))\n", - " \n", - "inference(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "66c821ca-8c64-43d4-b1f0-6c01801ae6b1", - "metadata": {}, - "outputs": [], - "source": [ - "from huggingsound import SpeechRecognitionModel\n", - "\n", - "model = SpeechRecognitionModel(\"jonatasgrosman/wav2vec2-large-xlsr-53-spanish\")\n", - "audio_paths = [\"/path/to/file.mp3\", \"/path/to/another_file.wav\"]\n", - "\n", - "transcriptions = model.transcribe(audio_paths)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ac750ea-709a-4a2f-a76f-0940861ab099", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6fc4220f-6c18-4fa7-8c60-13a55a785a0c", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85ad6beb-0258-40b8-9e0e-3f6aaec6fdae", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.16" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}