github-actions[bot] committed
Commit 2d9b22b (0 parents)

Sync from https://github.com/JacobLinCool/zero-rvc

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
.gitattributes ADDED
@@ -0,0 +1,36 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
.github/workflows/sync.yml ADDED
@@ -0,0 +1,26 @@
1
+ name: Sync to Hugging Face Spaces
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ sync:
10
+ name: Sync
11
+ runs-on: ubuntu-latest
12
+
13
+ steps:
14
+ - name: Checkout Repository
15
+ uses: actions/checkout@v4
16
+ with:
17
+ lfs: true
18
+
19
+ - name: Sync to Hugging Face Spaces
20
+ uses: JacobLinCool/huggingface-sync@v1
21
+ with:
22
+ github: ${{ secrets.GITHUB_TOKEN }}
23
+ user: jacoblincool # Hugging Face username or organization name
24
+ space: ZeroRVC # Hugging Face space name
25
+ token: ${{ secrets.HF_TOKEN }} # Hugging Face token
26
+ configuration: headers.yaml
.gitignore ADDED
@@ -0,0 +1,6 @@
1
+ .DS_Store
2
+ *.pyc
3
+ __pycache__
4
+ dist/
5
+ logs/
6
+ separated/
LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2024 Jacob Lin <jacob@csie.cool>
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,57 @@
1
+ ---
2
+ title: ZeroRVC
3
+ emoji: 🎙️
4
+ colorFrom: gray
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 4.37.2
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # ZeroRVC
13
+
14
+ Run Retrieval-based Voice Conversion training and inference with ease.
15
+
16
+ ## Features
17
+
18
+ - [x] Dataset Preparation
19
+ - [x] Hugging Face Datasets Integration
20
+ - [x] Hugging Face Accelerate Integration
21
+ - [x] Trainer API
22
+ - [x] Inference API
23
+ - [ ] Index Support
24
+ - [x] Tensorboard Support
25
+ - [ ] FP16 Support
26
+
27
+ ## Dataset Preparation
28
+
29
+ ZeroRVC provides a simple API to prepare your dataset for training. You only need to provide the path to your audio files. The feature extraction models will be downloaded automatically, or you can provide your own with the `hubert` and `rmvpe` arguments.
30
+
31
+ ```py
32
+ from datasets import load_dataset
33
+ from zerorvc import prepare, RVCTrainer
34
+
35
+ dataset = load_dataset("my-audio-dataset")
36
+ dataset = prepare(dataset)
37
+
38
+ trainer = RVCTrainer(
39
+ "my-rvc-model",
40
+ dataset_train=dataset["train"],
41
+ dataset_test=dataset["test"],
42
+ )
43
+ trainer.train(epochs=100, batch_size=8, upload="someone/rvc-test-1")
44
+ ```
45
+
46
+ ## Inference
47
+
48
+ ZeroRVC provides an easy API to convert your voice with the trained model.
49
+
50
+ ```py
51
+ from zerorvc import RVC
52
+ import soundfile as sf
53
+
54
+ rvc = RVC.from_pretrained("someone/rvc-test-1")
55
+ samples = rvc.convert("test.mp3")
56
+ sf.write("output.wav", samples, rvc.sr)
57
+ ```
app.py ADDED
@@ -0,0 +1,49 @@
1
+ import gradio as gr
2
+ from app.settings import SettingsTab
3
+ from app.tutorial import TutotialTab
4
+ from app.dataset import DatasetTab
5
+ from app.train import TrainTab
6
+ from app.infer import InferenceTab
7
+ from app.zero import zero_is_available
8
+
9
+ if zero_is_available:
10
+ import torch
11
+
12
+ torch.backends.cuda.matmul.allow_tf32 = True
13
+
14
+
15
+ with gr.Blocks() as app:
16
+ gr.Markdown("# ZeroRVC")
17
+ gr.Markdown(
18
+ "Run Retrieval-based Voice Conversion training and inference on Hugging Face ZeroGPU or locally."
19
+ )
20
+
21
+ settings = SettingsTab()
22
+ tutorial = TutotialTab()
23
+ dataset = DatasetTab()
24
+ training = TrainTab()
25
+ inference = InferenceTab()
26
+
27
+ with gr.Accordion(label="Environment Settings"):
28
+ settings.ui()
29
+
30
+ with gr.Tabs():
31
+ with gr.Tab(label="Tutorial", id=0):
32
+ tutorial.ui()
33
+
34
+ with gr.Tab(label="Dataset", id=1):
35
+ dataset.ui()
36
+
37
+ with gr.Tab(label="Training", id=2):
38
+ training.ui()
39
+
40
+ with gr.Tab(label="Inference", id=3):
41
+ inference.ui()
42
+
43
+ settings.build()
44
+ tutorial.build()
45
+ dataset.build(settings.exp_dir, settings.hf_token)
46
+ training.build(settings.exp_dir, settings.hf_token)
47
+ inference.build(settings.exp_dir)
48
+
49
+ app.launch()
app/__init__.py ADDED
File without changes
app/constants.py ADDED
@@ -0,0 +1,13 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ HF_TOKEN = os.environ.get("HF_TOKEN")
5
+
6
+ ROOT_EXP_DIR = Path(
7
+ os.environ.get("ROOT_EXP_DIR")
8
+ or os.path.join(os.path.dirname(os.path.abspath(__file__)), "../logs")
9
+ ).resolve()
10
+ ROOT_EXP_DIR.mkdir(exist_ok=True, parents=True)
11
+
12
+ BATCH_SIZE = int(os.environ.get("BATCH_SIZE") or 8)
13
+ TRAINING_EPOCHS = int(os.environ.get("TRAINING_EPOCHS") or 10)
app/dataset.py ADDED
@@ -0,0 +1,225 @@
1
+ import os
2
+ import gradio as gr
3
+ import zipfile
4
+ import tempfile
5
+ from zerorvc import prepare
6
+ from datasets import load_dataset, load_from_disk
7
+ from .constants import ROOT_EXP_DIR, BATCH_SIZE
8
+ from .zero import zero
9
+ from .model import accelerator
10
+
11
+
12
+ def extract_audio_files(zip_file: str, target_dir: str) -> list[str]:
13
+ with zipfile.ZipFile(zip_file, "r") as zip_ref:
14
+ zip_ref.extractall(target_dir)
15
+
16
+ audio_files = [
17
+ os.path.join(target_dir, f)
18
+ for f in os.listdir(target_dir)
19
+ if f.endswith((".wav", ".mp3", ".ogg"))
20
+ ]
21
+ if not audio_files:
22
+ raise gr.Error("No audio files found at the top level of the zip file")
23
+
24
+ return audio_files
25
+
26
+
27
+ def make_dataset_from_zip(exp_dir: str, zip_file: str):
28
+ if not exp_dir:
29
+ exp_dir = tempfile.mkdtemp(dir=ROOT_EXP_DIR)
30
+ print(f"Using exp dir: {exp_dir}")
31
+
32
+ data_dir = os.path.join(exp_dir, "raw_data")
33
+ if not os.path.exists(data_dir):
34
+ os.makedirs(data_dir)
35
+ extract_audio_files(zip_file, data_dir)
36
+
37
+ ds = prepare(
38
+ data_dir,
39
+ accelerator=accelerator,
40
+ batch_size=BATCH_SIZE,
41
+ stage=1,
42
+ )
43
+
44
+ return exp_dir, str(ds)
45
+
46
+
47
+ @zero(duration=120)
48
+ def make_dataset_from_zip_stage_2(exp_dir: str):
49
+ data_dir = os.path.join(exp_dir, "raw_data")
50
+ ds = prepare(
51
+ data_dir,
52
+ accelerator=accelerator,
53
+ batch_size=BATCH_SIZE,
54
+ stage=2,
55
+ )
56
+ return exp_dir, str(ds)
57
+
58
+
59
+ def make_dataset_from_zip_stage_3(exp_dir: str):
60
+ data_dir = os.path.join(exp_dir, "raw_data")
61
+ ds = prepare(
62
+ data_dir,
63
+ accelerator=accelerator,
64
+ batch_size=BATCH_SIZE,
65
+ stage=3,
66
+ )
67
+
68
+ dataset = os.path.join(exp_dir, "dataset")
69
+ ds.save_to_disk(dataset)
70
+ return exp_dir, str(ds)
71
+
72
+
73
+ def make_dataset_from_repo(repo: str, hf_token: str):
74
+ ds = load_dataset(repo, token=hf_token)
75
+ ds = prepare(
76
+ ds,
77
+ accelerator=accelerator,
78
+ batch_size=BATCH_SIZE,
79
+ stage=1,
80
+ )
81
+ return str(ds)
82
+
83
+
84
+ @zero(duration=120)
85
+ def make_dataset_from_repo_stage_2(repo: str, hf_token: str):
86
+ ds = load_dataset(repo, token=hf_token)
87
+ ds = prepare(
88
+ ds,
89
+ accelerator=accelerator,
90
+ batch_size=BATCH_SIZE,
91
+ stage=2,
92
+ )
93
+ return str(ds)
94
+
95
+
96
+ def make_dataset_from_repo_stage_3(exp_dir: str, repo: str, hf_token: str):
97
+ ds = load_dataset(repo, token=hf_token)
98
+ ds = prepare(
99
+ ds,
100
+ accelerator=accelerator,
101
+ batch_size=BATCH_SIZE,
102
+ stage=3,
103
+ )
104
+
105
+ if not exp_dir:
106
+ exp_dir = tempfile.mkdtemp(dir=ROOT_EXP_DIR)
107
+ print(f"Using exp dir: {exp_dir}")
108
+
109
+ dataset = os.path.join(exp_dir, "dataset")
110
+ ds.save_to_disk(dataset)
111
+ return exp_dir, str(ds)
112
+
113
+
114
+ def use_dataset(exp_dir: str, repo: str, hf_token: str):
115
+ gr.Info("Fetching dataset")
116
+ ds = load_dataset(repo, token=hf_token)
117
+
118
+ if not exp_dir:
119
+ exp_dir = tempfile.mkdtemp(dir=ROOT_EXP_DIR)
120
+ print(f"Using exp dir: {exp_dir}")
121
+
122
+ dataset = os.path.join(exp_dir, "dataset")
123
+ ds.save_to_disk(dataset)
124
+ return exp_dir, str(ds)
125
+
126
+
127
+ def upload_dataset(exp_dir: str, repo: str, hf_token: str):
128
+ dataset = os.path.join(exp_dir, "dataset")
129
+ if not os.path.exists(dataset):
130
+ raise gr.Error("Dataset not found")
131
+
132
+ gr.Info("Uploading dataset")
133
+ ds = load_from_disk(dataset)
134
+ ds.push_to_hub(repo, token=hf_token, private=True)
135
+ gr.Info("Dataset uploaded successfully")
136
+
137
+
138
+ class DatasetTab:
139
+ def __init__(self):
140
+ pass
141
+
142
+ def ui(self):
143
+ gr.Markdown("# Dataset")
144
+ gr.Markdown("The suggested dataset size is > 5 minutes of audio.")
145
+
146
+ gr.Markdown("## Create Dataset from ZIP")
147
+ gr.Markdown(
148
+ "Create a dataset by simply uploading a zip file containing audio files. The audio files should be at the top level of the zip file."
149
+ )
150
+ with gr.Row():
151
+ self.zip_file = gr.File(
152
+ label="Upload a zip file containing audio files",
153
+ file_types=["zip"],
154
+ )
155
+ self.make_ds_from_dir = gr.Button(
156
+ value="Create Dataset from ZIP", variant="primary"
157
+ )
158
+
159
+ gr.Markdown("## Create Dataset from Dataset Repository")
160
+ gr.Markdown(
161
+ "You can also create a dataset from any Hugging Face dataset repository that has an 'audio' column."
162
+ )
163
+ with gr.Row():
164
+ self.repo = gr.Textbox(
165
+ label="Hugging Face Dataset Repository",
166
+ placeholder="username/dataset-name",
167
+ )
168
+ self.make_ds_from_repo = gr.Button(
169
+ value="Create Dataset from Repo", variant="primary"
170
+ )
171
+
172
+ gr.Markdown("## Sync Preprocessed Dataset")
173
+ gr.Markdown(
174
+ "After you have preprocessed the dataset, you can upload it to Hugging Face and fetch it back directly later."
175
+ )
176
+ with gr.Row():
177
+ self.preprocessed_repo = gr.Textbox(
178
+ label="Hugging Face Dataset Repository",
179
+ placeholder="username/dataset-name",
180
+ )
181
+ self.fetch_ds = gr.Button(value="Fetch Dataset", variant="primary")
182
+ self.upload_ds = gr.Button(value="Upload Dataset", variant="primary")
183
+
184
+ self.ds_state = gr.Textbox(label="Dataset Info", lines=5)
185
+
186
+ def build(self, exp_dir: gr.Textbox, hf_token: gr.Textbox):
187
+ self.make_ds_from_dir.click(
188
+ fn=make_dataset_from_zip,
189
+ inputs=[exp_dir, self.zip_file],
190
+ outputs=[exp_dir, self.ds_state],
191
+ ).success(
192
+ fn=make_dataset_from_zip_stage_2,
193
+ inputs=[exp_dir],
194
+ outputs=[exp_dir, self.ds_state],
195
+ ).success(
196
+ fn=make_dataset_from_zip_stage_3,
197
+ inputs=[exp_dir],
198
+ outputs=[exp_dir, self.ds_state],
199
+ )
200
+
201
+ self.make_ds_from_repo.click(
202
+ fn=make_dataset_from_repo,
203
+ inputs=[self.repo, hf_token],
204
+ outputs=[self.ds_state],
205
+ ).success(
206
+ fn=make_dataset_from_repo_stage_2,
207
+ inputs=[self.repo, hf_token],
208
+ outputs=[self.ds_state],
209
+ ).success(
210
+ fn=make_dataset_from_repo_stage_3,
211
+ inputs=[exp_dir, self.repo, hf_token],
212
+ outputs=[exp_dir, self.ds_state],
213
+ )
214
+
215
+ self.fetch_ds.click(
216
+ fn=use_dataset,
217
+ inputs=[exp_dir, self.preprocessed_repo, hf_token],
218
+ outputs=[exp_dir, self.ds_state],
219
+ )
220
+
221
+ self.upload_ds.click(
222
+ fn=upload_dataset,
223
+ inputs=[exp_dir, self.preprocessed_repo, hf_token],
224
+ outputs=[],
225
+ )
app/dataset_maker.py ADDED
@@ -0,0 +1,225 @@
1
+ import yt_dlp
2
+ import numpy as np
3
+ import librosa
4
+ import soundfile as sf
5
+ import os
6
+ import zipfile
7
+
8
+
9
+ # Function to download audio from YouTube and save it as a WAV file
10
+ def download_youtube_audio(url, audio_name):
11
+ ydl_opts = {
12
+ "format": "bestaudio/best",
13
+ "postprocessors": [
14
+ {
15
+ "key": "FFmpegExtractAudio",
16
+ "preferredcodec": "wav",
17
+ }
18
+ ],
19
+ "outtmpl": f"youtubeaudio/{audio_name}", # Output template
20
+ }
21
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
22
+ ydl.download([url])
23
+ return f"youtubeaudio/{audio_name}.wav"
24
+
25
+
26
+ # Function to calculate RMS
27
+ def get_rms(y, frame_length=2048, hop_length=512, pad_mode="constant"):
28
+ padding = (int(frame_length // 2), int(frame_length // 2))
29
+ y = np.pad(y, padding, mode=pad_mode)
30
+
31
+ axis = -1
32
+ out_strides = y.strides + tuple([y.strides[axis]])
33
+ x_shape_trimmed = list(y.shape)
34
+ x_shape_trimmed[axis] -= frame_length - 1
35
+ out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
36
+ xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
37
+ if axis < 0:
38
+ target_axis = axis - 1
39
+ else:
40
+ target_axis = axis + 1
41
+ xw = np.moveaxis(xw, -1, target_axis)
42
+ slices = [slice(None)] * xw.ndim
43
+ slices[axis] = slice(0, None, hop_length)
44
+ x = xw[tuple(slices)]
45
+
46
+ power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
47
+ return np.sqrt(power)
48
+
49
+
50
+ # Slicer class
51
+ class Slicer:
52
+ def __init__(
53
+ self,
54
+ sr,
55
+ threshold=-40.0,
56
+ min_length=5000,
57
+ min_interval=300,
58
+ hop_size=20,
59
+ max_sil_kept=5000,
60
+ ):
61
+ if not min_length >= min_interval >= hop_size:
62
+ raise ValueError(
63
+ "The following condition must be satisfied: min_length >= min_interval >= hop_size"
64
+ )
65
+ if not max_sil_kept >= hop_size:
66
+ raise ValueError(
67
+ "The following condition must be satisfied: max_sil_kept >= hop_size"
68
+ )
69
+ min_interval = sr * min_interval / 1000
70
+ self.threshold = 10 ** (threshold / 20.0)
71
+ self.hop_size = round(sr * hop_size / 1000)
72
+ self.win_size = min(round(min_interval), 4 * self.hop_size)
73
+ self.min_length = round(sr * min_length / 1000 / self.hop_size)
74
+ self.min_interval = round(min_interval / self.hop_size)
75
+ self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
76
+
77
+ def _apply_slice(self, waveform, begin, end):
78
+ if len(waveform.shape) > 1:
79
+ return waveform[
80
+ :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
81
+ ]
82
+ else:
83
+ return waveform[
84
+ begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
85
+ ]
86
+
87
+ def slice(self, waveform):
88
+ if len(waveform.shape) > 1:
89
+ samples = waveform.mean(axis=0)
90
+ else:
91
+ samples = waveform
92
+ if samples.shape[0] <= self.min_length:
93
+ return [waveform]
94
+ rms_list = get_rms(
95
+ y=samples, frame_length=self.win_size, hop_length=self.hop_size
96
+ ).squeeze(0)
97
+ sil_tags = []
98
+ silence_start = None
99
+ clip_start = 0
100
+ for i, rms in enumerate(rms_list):
101
+ if rms < self.threshold:
102
+ if silence_start is None:
103
+ silence_start = i
104
+ continue
105
+ if silence_start is None:
106
+ continue
107
+ is_leading_silence = silence_start == 0 and i > self.max_sil_kept
108
+ need_slice_middle = (
109
+ i - silence_start >= self.min_interval
110
+ and i - clip_start >= self.min_length
111
+ )
112
+ if not is_leading_silence and not need_slice_middle:
113
+ silence_start = None
114
+ continue
115
+ if i - silence_start <= self.max_sil_kept:
116
+ pos = rms_list[silence_start : i + 1].argmin() + silence_start
117
+ if silence_start == 0:
118
+ sil_tags.append((0, pos))
119
+ else:
120
+ sil_tags.append((pos, pos))
121
+ clip_start = pos
122
+ elif i - silence_start <= self.max_sil_kept * 2:
123
+ pos = rms_list[
124
+ i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
125
+ ].argmin()
126
+ pos += i - self.max_sil_kept
127
+ pos_l = (
128
+ rms_list[
129
+ silence_start : silence_start + self.max_sil_kept + 1
130
+ ].argmin()
131
+ + silence_start
132
+ )
133
+ pos_r = (
134
+ rms_list[i - self.max_sil_kept : i + 1].argmin()
135
+ + i
136
+ - self.max_sil_kept
137
+ )
138
+ if silence_start == 0:
139
+ sil_tags.append((0, pos_r))
140
+ clip_start = pos_r
141
+ else:
142
+ sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
143
+ clip_start = max(pos_r, pos)
144
+ else:
145
+ pos_l = (
146
+ rms_list[
147
+ silence_start : silence_start + self.max_sil_kept + 1
148
+ ].argmin()
149
+ + silence_start
150
+ )
151
+ pos_r = (
152
+ rms_list[i - self.max_sil_kept : i + 1].argmin()
153
+ + i
154
+ - self.max_sil_kept
155
+ )
156
+ if silence_start == 0:
157
+ sil_tags.append((0, pos_r))
158
+ else:
159
+ sil_tags.append((pos_l, pos_r))
160
+ clip_start = pos_r
161
+ silence_start = None
162
+ total_frames = rms_list.shape[0]
163
+ if (
164
+ silence_start is not None
165
+ and total_frames - silence_start >= self.min_interval
166
+ ):
167
+ silence_end = min(total_frames, silence_start + self.max_sil_kept)
168
+ pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
169
+ sil_tags.append((pos, total_frames + 1))
170
+ if len(sil_tags) == 0:
171
+ return [waveform]
172
+ else:
173
+ chunks = []
174
+ if sil_tags[0][0] > 0:
175
+ chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
176
+ for i in range(len(sil_tags) - 1):
177
+ chunks.append(
178
+ self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])
179
+ )
180
+ if sil_tags[-1][1] < total_frames:
181
+ chunks.append(
182
+ self._apply_slice(waveform, sil_tags[-1][1], total_frames)
183
+ )
184
+ return chunks
185
+
186
+
187
+ # Function to slice and save audio chunks
188
+ def slice_audio(file_path, audio_name):
189
+ audio, sr = librosa.load(file_path, sr=None, mono=False)
190
+ os.makedirs(f"dataset/{audio_name}", exist_ok=True)
191
+ slicer = Slicer(
192
+ sr=sr,
193
+ threshold=-40,
194
+ min_length=5000,
195
+ min_interval=500,
196
+ hop_size=10,
197
+ max_sil_kept=500,
198
+ )
199
+ chunks = slicer.slice(audio)
200
+ for i, chunk in enumerate(chunks):
201
+ if len(chunk.shape) > 1:
202
+ chunk = chunk.T
203
+ sf.write(f"dataset/{audio_name}/split_{i}.wav", chunk, sr)
204
+ return f"dataset/{audio_name}"
205
+
206
+
207
+ # Function to zip the dataset directory
208
+ def zip_directory(directory_path, audio_name):
209
+ zip_file = f"dataset/{audio_name}.zip"
210
+ os.makedirs(os.path.dirname(zip_file), exist_ok=True) # Ensure the directory exists
211
+ with zipfile.ZipFile(zip_file, "w", zipfile.ZIP_DEFLATED) as zipf:
212
+ for root, dirs, files in os.walk(directory_path):
213
+ for file in files:
214
+ file_path = os.path.join(root, file)
215
+ arcname = os.path.relpath(file_path, start=directory_path)
216
+ zipf.write(file_path, arcname)
217
+ return zip_file
218
+
219
+
220
+ # Gradio interface
221
+ def process_audio(url, audio_name):
222
+ file_path = download_youtube_audio(url, audio_name)
223
+ dataset_path = slice_audio(file_path, audio_name)
224
+ zip_file = zip_directory(dataset_path, audio_name)
225
+ return zip_file, f"{zip_file} successfully processed"  # return the message string instead of print()'s None
app/infer.py ADDED
@@ -0,0 +1,164 @@
1
+ import os
2
+ import shutil
3
+ import hashlib
4
+ from pathlib import Path
5
+ from typing import Tuple
6
+ from demucs.separate import main as demucs
7
+ import gradio as gr
8
+ import numpy as np
9
+ import soundfile as sf
10
+ from zerorvc import RVC
11
+ from .zero import zero
12
+ from .model import device
13
+ import yt_dlp
14
+
15
+
16
+ def download_audio(url):
17
+ ydl_opts = {
18
+ "format": "bestaudio/best",
19
+ "outtmpl": "ytdl/%(title)s.%(ext)s",
20
+ "postprocessors": [
21
+ {
22
+ "key": "FFmpegExtractAudio",
23
+ "preferredcodec": "wav",
24
+ "preferredquality": "192",
25
+ }
26
+ ],
27
+ }
28
+
29
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
30
+ info_dict = ydl.extract_info(url, download=True)
31
+ file_path = ydl.prepare_filename(info_dict).rsplit(".", 1)[0] + ".wav"
32
+ audio_data, sample_rate = sf.read(file_path, dtype="int16")  # use soundfile (imported above); read() was undefined
33
+ audio_array = np.asarray(audio_data, dtype=np.int16)
34
+
35
+ return sample_rate, audio_array
36
+
37
+
38
+ @zero(duration=120)
39
+ def infer(
40
+ exp_dir: str, original_audio: str, pitch_mod: int, protect: float
41
+ ) -> Tuple[int, np.ndarray]:
42
+ checkpoint_dir = os.path.join(exp_dir, "checkpoints")
43
+ if not os.path.exists(checkpoint_dir):
44
+ raise gr.Error("Model not found")
45
+
46
+ # rename the original audio to the hash
47
+ with open(original_audio, "rb") as f:
48
+ original_audio_hash = hashlib.md5(f.read()).hexdigest()
49
+ ext = Path(original_audio).suffix
50
+ original_audio_hashed = os.path.join(exp_dir, f"{original_audio_hash}{ext}")
51
+ shutil.copy(original_audio, original_audio_hashed)
52
+
53
+ out = os.path.join("separated", "htdemucs", original_audio_hash, "vocals.wav")
54
+ if not os.path.exists(out):
55
+ demucs(
56
+ [
57
+ "--two-stems",
58
+ "vocals",
59
+ "-d",
60
+ str(device),
61
+ "-n",
62
+ "htdemucs",
63
+ original_audio_hashed,
64
+ ]
65
+ )
66
+
67
+ rvc = RVC.from_pretrained(checkpoint_dir)
68
+ samples = rvc.convert(out, pitch_modification=pitch_mod, protect=protect)
69
+ file = os.path.join(exp_dir, "infer.wav")
70
+ sf.write(file, samples, rvc.sr)
71
+
72
+ return file
73
+
74
+
75
+ def merge(exp_dir: str, original_audio: str, vocal: Tuple[int, np.ndarray]) -> str:
76
+ with open(original_audio, "rb") as f:
77
+ original_audio_hash = hashlib.md5(f.read()).hexdigest()
78
+ music = os.path.join("separated", "htdemucs", original_audio_hash, "no_vocals.wav")
79
+
80
+ tmp = os.path.join(exp_dir, "tmp.wav")
81
+ sf.write(tmp, vocal[1], vocal[0])
82
+
83
+ os.system(
84
+ f"ffmpeg -i {music} -i {tmp} -filter_complex '[1]volume=2[a];[0][a]amix=inputs=2:duration=first:dropout_transition=2' -ac 2 -y {tmp}.merged.mp3"
85
+ )
86
+
87
+ return f"{tmp}.merged.mp3"
88
+
89
+
90
+ class InferenceTab:
91
+ def __init__(self):
92
+ pass
93
+
94
+ def ui(self):
95
+ gr.Markdown("# Inference")
96
+ gr.Markdown(
97
+ "After the trained model is pruned, you can use it to run inference on new music. \n"
98
+ "Upload the original audio and adjust the pitch modification (in semitones) to generate the converted audio."
99
+ )
100
+
101
+ with gr.Row():
102
+ self.original_audio = gr.Audio(
103
+ label="Upload original audio",
104
+ type="filepath",
105
+ show_download_button=True,
106
+ )
107
+ with gr.Accordion("Inference by Link", open=False):
108
+ with gr.Row():
109
+ youtube_link = gr.Textbox(
110
+ label="Link",
111
+ placeholder="Paste the link here",
112
+ interactive=True,
113
+ )
114
+ with gr.Row():
115
+ gr.Markdown(
116
+ "You can paste a link to the video/audio from many sites; check the complete list [here](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)"
117
+ )
118
+ with gr.Row():
119
+ download_button = gr.Button("Download!", variant="primary")
120
+ download_button.click(
121
+ download_audio, [youtube_link], [self.original_audio]
122
+ )
123
+
124
+ with gr.Column():
125
+ self.pitch_mod = gr.Slider(
126
+ label="Pitch Modification +/-",
127
+ minimum=-16,
128
+ maximum=16,
129
+ step=1,
130
+ value=0,
131
+ )
132
+ self.protect = gr.Slider(
133
+ label="Protect",
134
+ minimum=0,
135
+ maximum=0.5,
136
+ step=0.01,
137
+ value=0.33,
138
+ )
139
+
140
+ self.infer_btn = gr.Button(value="Infer", variant="primary")
141
+ with gr.Row():
142
+ self.infer_output = gr.Audio(
143
+ label="Inferred audio", show_download_button=True, format="mp3"
144
+ )
145
+ with gr.Row():
146
+ self.merge_output = gr.Audio(
147
+ label="Merged audio", show_download_button=True, format="mp3"
148
+ )
149
+
150
+ def build(self, exp_dir: gr.Textbox):
151
+ self.infer_btn.click(
152
+ fn=infer,
153
+ inputs=[
154
+ exp_dir,
155
+ self.original_audio,
156
+ self.pitch_mod,
157
+ self.protect,
158
+ ],
159
+ outputs=[self.infer_output],
160
+ ).success(
161
+ fn=merge,
162
+ inputs=[exp_dir, self.original_audio, self.infer_output],
163
+ outputs=[self.merge_output],
164
+ )
app/model.py ADDED
@@ -0,0 +1,17 @@
1
+ import logging
2
+ from accelerate import Accelerator
3
+ from zerorvc import load_hubert, load_rmvpe
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+ accelerator = Accelerator()
8
+ device = accelerator.device
9
+
10
+ logger.info(f"device: {device}")
11
+ logger.info(f"mixed_precision: {accelerator.mixed_precision}")
12
+
13
+ rmvpe = load_rmvpe(device=device)
14
+ logger.info("RMVPE model loaded.")
15
+
16
+ hubert = load_hubert(device=device)
17
+ logger.info("HuBERT model loaded.")
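
The models loaded above can also be reused when preparing a dataset: the README notes that `prepare` accepts `hubert` and `rmvpe` arguments, so passing pre-loaded models avoids reloading them on every call. A minimal sketch under that assumption, with `./my-voices` as an illustrative audio folder:

```py
import torch
from zerorvc import prepare, load_hubert, load_rmvpe

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hubert = load_hubert(device=device)  # HuBERT feature extractor
rmvpe = load_rmvpe(device=device)    # RMVPE pitch (f0) estimator

# Reuse the already-loaded models instead of letting prepare() load them again.
dataset = prepare("./my-voices", hubert=hubert, rmvpe=rmvpe)
```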
app/settings.py ADDED
@@ -0,0 +1,26 @@
1
+ import gradio as gr
2
+ from .constants import HF_TOKEN
3
+
4
+
5
+ class SettingsTab:
6
+ def __init__(self):
7
+ pass
8
+
9
+ def ui(self):
10
+ self.exp_dir = gr.Textbox(
11
+ label="Temporary Experiment Directory (auto-managed)",
12
+ placeholder="It will be auto-generated after setup",
13
+ interactive=True,
14
+ )
15
+ gr.Markdown(
16
+ "### Sync with Hugging Face 🤗\n\nThe access token will be used to upload/download the dataset and model."
17
+ )
18
+ self.hf_token = gr.Textbox(
19
+ label="Hugging Face Access Token",
20
+ placeholder="Paste your Hugging Face access token here (hf_...)",
21
+ value=HF_TOKEN,
22
+ interactive=True,
23
+ )
24
+
25
+ def build(self):
26
+ pass
app/train.py ADDED
@@ -0,0 +1,169 @@
1
+ import os
2
+ import tempfile
3
+ import gradio as gr
4
+ import torch
5
+ from zerorvc import RVCTrainer, pretrained_checkpoints, SynthesizerTrnMs768NSFsid
6
+ from zerorvc.trainer import TrainingCheckpoint
7
+ from datasets import load_from_disk
8
+ from huggingface_hub import snapshot_download
9
+ from .zero import zero
10
+ from .model import accelerator, device
11
+ from .constants import BATCH_SIZE, ROOT_EXP_DIR, TRAINING_EPOCHS
12
+
13
+
14
+ @zero(duration=240)
15
+ def train_model(exp_dir: str, progress=gr.Progress()):
16
+ dataset = os.path.join(exp_dir, "dataset")
17
+ if not os.path.exists(dataset):
18
+ raise gr.Error("Dataset not found. Please prepare the dataset first.")
19
+
20
+ ds = load_from_disk(dataset)
21
+ checkpoint_dir = os.path.join(exp_dir, "checkpoints")
22
+ trainer = RVCTrainer(checkpoint_dir)
23
+
24
+ resume_from = trainer.latest_checkpoint()
25
+ if resume_from is None:
26
+ resume_from = pretrained_checkpoints()
27
+ gr.Info("Starting training from pretrained checkpoints.")
28
+ else:
29
+ gr.Info(f"Resuming training from {resume_from}")
30
+
31
+ tqdm = progress.tqdm(
32
+ trainer.train(
33
+ dataset=ds["train"],
34
+ resume_from=resume_from,
35
+ batch_size=BATCH_SIZE,
36
+ epochs=TRAINING_EPOCHS,
37
+ accelerator=accelerator,
38
+ ),
39
+ total=TRAINING_EPOCHS,
40
+ unit="epochs",
41
+ desc="Training",
42
+ )
43
+
44
+ for ckpt in tqdm:
45
+ info = f"Epoch: {ckpt.epoch} loss: (gen: {ckpt.loss_gen:.4f}, fm: {ckpt.loss_fm:.4f}, mel: {ckpt.loss_mel:.4f}, kl: {ckpt.loss_kl:.4f}, disc: {ckpt.loss_disc:.4f})"
46
+ print(info)
47
+ latest: TrainingCheckpoint = ckpt
48
+
49
+ latest.save(trainer.checkpoint_dir)
50
+ latest.G.save_pretrained(trainer.checkpoint_dir)
51
+
52
+ result = f"{TRAINING_EPOCHS} epochs trained. Latest loss: (gen: {latest.loss_gen:.4f}, fm: {latest.loss_fm:.4f}, mel: {latest.loss_mel:.4f}, kl: {latest.loss_kl:.4f}, disc: {latest.loss_disc:.4f})"
53
+
54
+ del trainer
55
+ if device.type == "cuda":
56
+ torch.cuda.empty_cache()
57
+
58
+ return result
59
+
60
+
61
+ def upload_model(exp_dir: str, repo: str, hf_token: str):
62
+ checkpoint_dir = os.path.join(exp_dir, "checkpoints")
63
+ if not os.path.exists(checkpoint_dir):
64
+ raise gr.Error("Model not found")
65
+
66
+ gr.Info("Uploading model")
67
+ model = SynthesizerTrnMs768NSFsid.from_pretrained(checkpoint_dir)
68
+ model.push_to_hub(repo, token=hf_token, private=True)
69
+ gr.Info("Model uploaded successfully")
70
+
71
+
72
+ def upload_checkpoints(exp_dir: str, repo: str, hf_token: str):
73
+ checkpoint_dir = os.path.join(exp_dir, "checkpoints")
74
+ if not os.path.exists(checkpoint_dir):
75
+ raise gr.Error("Checkpoints not found")
76
+
77
+ gr.Info("Uploading checkpoints")
78
+ trainer = RVCTrainer(checkpoint_dir)
79
+ trainer.push_to_hub(repo, token=hf_token, private=True)
80
+ gr.Info("Checkpoints uploaded successfully")
81
+
82
+
83
+ def fetch_model(exp_dir: str, repo: str, hf_token: str):
84
+ if not exp_dir:
85
+ exp_dir = tempfile.mkdtemp(dir=ROOT_EXP_DIR)
86
+ checkpoint_dir = os.path.join(exp_dir, "checkpoints")
87
+
88
+ gr.Info("Fetching model")
89
+ files = ["README.md", "config.json", "model.safetensors"]
90
+ snapshot_download(
91
+ repo, token=hf_token, local_dir=checkpoint_dir, allow_patterns=files
92
+ )
93
+ gr.Info("Model fetched successfully")
94
+
95
+ return exp_dir
96
+
97
+
98
+ def fetch_checkpoints(exp_dir: str, repo: str, hf_token: str):
99
+ if not exp_dir:
100
+ exp_dir = tempfile.mkdtemp(dir=ROOT_EXP_DIR)
101
+ checkpoint_dir = os.path.join(exp_dir, "checkpoints")
102
+
103
+ gr.Info("Fetching checkpoints")
104
+ snapshot_download(repo, token=hf_token, local_dir=checkpoint_dir)
105
+ gr.Info("Checkpoints fetched successfully")
106
+
107
+ return exp_dir
108
+
109
+
110
+ class TrainTab:
111
+ def __init__(self):
112
+ pass
113
+
114
+ def ui(self):
115
+ gr.Markdown("# Training")
116
+ gr.Markdown(
117
+ "You can start training the model by clicking the button below. "
118
+ f"Each time you click the button, the model will train for {TRAINING_EPOCHS} epochs, which takes about 3 minutes on ZeroGPU (A100). "
119
+ )
120
+
121
+ with gr.Row():
122
+ self.train_btn = gr.Button(value="Train", variant="primary")
123
+ self.result = gr.Textbox(label="Training Result", lines=3)
124
+
125
+ gr.Markdown("## Sync Model and Checkpoints with Hugging Face")
126
+ gr.Markdown(
127
+ "You can upload the trained model and checkpoints to Hugging Face for sharing or further training."
128
+ )
129
+
130
+ self.repo = gr.Textbox(label="Repository ID", placeholder="username/repo")
131
+ with gr.Row():
132
+ self.upload_model_btn = gr.Button(value="Upload Model", variant="primary")
133
+ self.upload_checkpoints_btn = gr.Button(
134
+ value="Upload Checkpoints", variant="primary"
135
+ )
136
+ with gr.Row():
137
+ self.fetch_mode_btn = gr.Button(value="Fetch Model", variant="primary")
138
+ self.fetch_checkpoints_btn = gr.Button(
139
+ value="Fetch Checkpoints", variant="primary"
140
+ )
141
+
142
+ def build(self, exp_dir: gr.Textbox, hf_token: gr.Textbox):
143
+ self.train_btn.click(
144
+ fn=train_model,
145
+ inputs=[exp_dir],
146
+ outputs=[self.result],
147
+ )
148
+
149
+ self.upload_model_btn.click(
150
+ fn=upload_model,
151
+ inputs=[exp_dir, self.repo, hf_token],
152
+ )
153
+
154
+ self.upload_checkpoints_btn.click(
155
+ fn=upload_checkpoints,
156
+ inputs=[exp_dir, self.repo, hf_token],
157
+ )
158
+
159
+ self.fetch_mode_btn.click(
160
+ fn=fetch_model,
161
+ inputs=[exp_dir, self.repo, hf_token],
162
+ outputs=[exp_dir],
163
+ )
164
+
165
+ self.fetch_checkpoints_btn.click(
166
+ fn=fetch_checkpoints,
167
+ inputs=[exp_dir, self.repo, hf_token],
168
+ outputs=[exp_dir],
169
+ )
app/tutorial.py ADDED
@@ -0,0 +1,30 @@
1
+ import gradio as gr
2
+
3
+
4
+ class TutotialTab:
5
+ def __init__(self):
6
+ pass
7
+
8
+ def ui(self):
9
+ gr.Markdown(
10
+ """
11
+ # Welcome to ZeroRVC!
12
+
13
+ > If you are more comfortable with Python code, you can also [use the Python API to run ZeroRVC](https://pypi.org/project/zerorvc/).
14
+
15
+ ZeroRVC is a toolkit for training and inference of retrieval-based voice conversion models.
16
+
17
+ By leveraging the power of Hugging Face ZeroGPU, you can train your model in minutes without setting up the environment.
18
+
19
+ ## How to Use
20
+
21
+ There are 3 main steps to use ZeroRVC:
22
+
23
+ - **Make Dataset**: Prepare your dataset for training. You can upload a zip file containing audio files.
24
+ - **Model Training**: Train your model using the prepared dataset.
25
+ - **Model Inference**: Try your model.
26
+ """
27
+ )
28
+
29
+ def build(self):
30
+ pass
app/zero.py ADDED
@@ -0,0 +1,24 @@
1
+ import os
2
+ import logging
3
+
4
+ logger = logging.getLogger(__name__)
5
+
6
+ zero_is_available = "SPACES_ZERO_GPU" in os.environ
7
+
8
+ if zero_is_available:
9
+ import spaces # type: ignore
10
+
11
+ logger.info("ZeroGPU is available")
12
+ else:
13
+ logger.info("ZeroGPU is not available")
14
+
15
+
16
+ # a decorator that applies the spaces.GPU decorator if zero is available
17
+ def zero(duration=60):
18
+ def wrapper(func):
19
+ if zero_is_available:
20
+ return spaces.GPU(func, duration=duration)
21
+ else:
22
+ return func
23
+
24
+ return wrapper
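
For reference, this decorator is how the app modules request a ZeroGPU slot only when one is available (see `app/dataset.py`, `app/infer.py`, and `app/train.py`); a minimal usage sketch, with a hypothetical function name:

```py
from app.zero import zero

@zero(duration=120)  # request a ZeroGPU slot for up to 120 s; a no-op when ZeroGPU is unavailable
def heavy_gpu_task(exp_dir: str):
    ...  # hypothetical GPU-bound work
```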
example-dataset.py ADDED
@@ -0,0 +1,9 @@
1
+ import os
2
+ from zerorvc import prepare
3
+
4
+ HF_TOKEN = os.environ.get("HF_TOKEN")
5
+
6
+ dataset = prepare("./my-voices")
7
+ print(dataset)
8
+
9
+ dataset.push_to_hub("my-rvc-dataset", token=HF_TOKEN, private=True)
example-infer.py ADDED
@@ -0,0 +1,15 @@
1
+ import os
2
+ from zerorvc import RVC
3
+ import soundfile as sf
4
+
5
+ HF_TOKEN = os.environ.get("HF_TOKEN")
6
+ MODEL = "JacobLinCool/my-rvc-model3"
7
+
8
+ rvc = RVC.from_pretrained(MODEL, token=HF_TOKEN)
9
+ samples = rvc.convert("test.mp3")
10
+ sf.write("output.wav", samples, rvc.sr)
11
+
12
+ pitch_modifications = [-12, -8, -4, 4, 8, 12]
13
+ for pitch_modification in pitch_modifications:
14
+ samples = rvc.convert("test.mp3", pitch_modification=pitch_modification)
15
+ sf.write(f"output-{pitch_modification}.wav", samples, rvc.sr)
example-train.py ADDED
@@ -0,0 +1,38 @@
1
+ import os
2
+ from datasets import load_dataset
3
+ from tqdm import tqdm
4
+ from zerorvc import RVCTrainer, pretrained_checkpoints
5
+
6
+ HF_TOKEN = os.environ.get("HF_TOKEN")
7
+ EPOCHS = 100
8
+ BATCH_SIZE = 8
9
+ DATASET = "JacobLinCool/my-rvc-dataset"
10
+ MODEL = "JacobLinCool/my-rvc-model"
11
+
12
+ dataset = load_dataset(DATASET, token=HF_TOKEN)
13
+ print(dataset)
14
+
15
+ trainer = RVCTrainer(checkpoint_dir="./checkpoints")
16
+ training = tqdm(
17
+ trainer.train(
18
+ dataset=dataset["train"],
19
+ resume_from=pretrained_checkpoints(), # resume training from the pretrained VCTK checkpoint
20
+ epochs=EPOCHS,
21
+ batch_size=BATCH_SIZE,
22
+ ),
23
+ total=EPOCHS,
24
+ )
25
+
26
+ # Training loop: iterate over epochs
27
+ for checkpoint in training:
28
+ training.set_description(
29
+ f"Epoch {checkpoint.epoch}/{EPOCHS} loss: (gen: {checkpoint.loss_gen:.4f}, fm: {checkpoint.loss_fm:.4f}, mel: {checkpoint.loss_mel:.4f}, kl: {checkpoint.loss_kl:.4f}, disc: {checkpoint.loss_disc:.4f})"
30
+ )
31
+
32
+ # Save checkpoint every 10 epochs
33
+ if checkpoint.epoch % 10 == 0:
34
+ checkpoint.save(checkpoint_dir=trainer.checkpoint_dir)
35
+ # Directly push the synthesizer to the Hugging Face Hub
36
+ checkpoint.G.push_to_hub(MODEL, token=HF_TOKEN, private=True)
37
+
38
+ print("Training completed.")
headers.yaml ADDED
@@ -0,0 +1,8 @@
1
+ title: ZeroRVC
2
+ emoji: 🎙️
3
+ colorFrom: gray
4
+ colorTo: gray
5
+ sdk: gradio
6
+ sdk_version: 4.37.2
7
+ app_file: app.py
8
+ pinned: false
my-voices/.gitignore ADDED
@@ -0,0 +1 @@
1
+ *.wav
pyproject.toml ADDED
@@ -0,0 +1,37 @@
1
+ [project]
2
+ name = "zerorvc"
3
+ version = "0.0.19"
4
+ authors = [{ name = "Jacob Lin", email = "jacob@csie.cool" }]
5
+ description = "Run Retrieval-based Voice Conversion training and inference with ease."
6
+ readme = "README.md"
7
+ requires-python = ">=3.8"
8
+ classifiers = [
9
+ "Programming Language :: Python :: 3",
10
+ "License :: OSI Approved :: MIT License",
11
+ "Operating System :: OS Independent",
12
+ ]
13
+ dependencies = [
14
+ "numpy>=1.0.0",
15
+ "torch>=2.0.0",
16
+ "datasets",
17
+ "accelerate",
18
+ "transformers",
19
+ "huggingface_hub",
20
+ "tqdm",
21
+ "librosa",
22
+ "scipy",
23
+ "tensorboard",
24
+ ]
25
+
26
+ [project.urls]
27
+ Homepage = "https://github.com/jacoblincool/zero-rvc"
28
+ Issues = "https://github.com/jacoblincool/zero-rvc/issues"
29
+
30
+ [build-system]
31
+ requires = ["hatchling"]
32
+ build-backend = "hatchling.build"
33
+
34
+ [tool.hatch.build.targets.sdist]
35
+ include = ["zerorvc/**/*", "pyproject.toml", "README.md", "LICENSE"]
36
+ [tool.hatch.build.targets.wheel]
37
+ packages = ["zerorvc"]
requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ zerorvc>=0.0.10
2
+
3
+ # gradio app deps
4
+ gradio
5
+ demucs==4.0.1
6
+ yt_dlp
7
+ tensorboard
zerorvc/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ from .rvc import RVC
2
+ from .trainer import RVCTrainer
3
+ from .dataset import prepare
4
+ from .synthesizer import SynthesizerTrnMs768NSFsid
5
+ from .pretrained import pretrained_checkpoints
6
+ from .f0 import load_rmvpe, RMVPE, F0Extractor
7
+ from .hubert import load_hubert, HubertModel, HubertFeatureExtractor
8
+ from .auto_loader import auto_loaded_model
zerorvc/assets/mute/mute48k.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f2bb4daaa106e351aebb001e5a25de985c0b472f22e8d60676bc924a79056ee
3
+ size 288078
zerorvc/auto_loader.py ADDED
@@ -0,0 +1 @@
1
+ auto_loaded_model = {}
zerorvc/constants.py ADDED
@@ -0,0 +1,7 @@
1
+ SR_16K = 16000
2
+ SR_48K = 48000
3
+
4
+ N_FFT = 2048
5
+ HOP_LENGTH = 480
6
+ WIN_LENGTH = 2048
7
+ N_MELS = 128
zerorvc/dataset.py ADDED
@@ -0,0 +1,253 @@
1
+ import os
2
+ import numpy as np
3
+ import torch
4
+ import librosa
5
+ import logging
6
+ import shutil
7
+ from pkg_resources import resource_filename
8
+ from accelerate import Accelerator
9
+ from datasets import load_dataset, DatasetDict, Dataset, Audio
10
+ from .preprocess import Preprocessor, crop_feats_length
11
+ from .hubert import HubertFeatureExtractor, HubertModel, load_hubert
12
+ from .f0 import F0Extractor, RMVPE, load_rmvpe
13
+ from .constants import *
14
+
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ def extract_hubert_features(
20
+ rows,
21
+ hfe: HubertFeatureExtractor,
22
+ hubert: str | HubertModel | None,
23
+ device: torch.device,
24
+ ):
25
+ if not hfe.is_loaded():
26
+ model = load_hubert(hubert, device)
27
+ hfe.load(model)
28
+ feats = []
29
+ for row in rows["wav_16k"]:
30
+ feat = hfe.extract_feature_from(row["array"].astype("float32"))
31
+ feats.append(feat)
32
+ return {"hubert_feats": feats}
33
+
34
+
35
+ def extract_f0_features(
36
+ rows, f0e: F0Extractor, rmvpe: str | RMVPE | None, device: torch.device
37
+ ):
38
+ if not f0e.is_loaded():
39
+ model = load_rmvpe(rmvpe, device)
40
+ f0e.load(model)
41
+ f0s = []
42
+ f0nsfs = []
43
+ for row in rows["wav_16k"]:
44
+ f0nsf, f0 = f0e.extract_f0_from(row["array"].astype("float32"))
45
+ f0s.append(f0)
46
+ f0nsfs.append(f0nsf)
47
+ return {"f0": f0s, "f0nsf": f0nsfs}
48
+
49
+
50
+ def feature_postprocess(rows):
51
+ phones = rows["hubert_feats"]
52
+ for i, phone in enumerate(phones):
53
+ phone = np.repeat(phone, 2, axis=0)
54
+ n_num = min(phone.shape[0], 900)
55
+ phone = phone[:n_num, :]
56
+ phones[i] = phone
57
+
58
+ if "f0" in rows:
59
+ pitch = rows["f0"][i]
60
+ pitch = pitch[:n_num]
61
+ pitch = np.array(pitch, dtype=np.float32)
62
+ rows["f0"][i] = pitch
63
+ if "f0nsf" in rows:
64
+ pitchf = rows["f0nsf"][i]
65
+ pitchf = pitchf[:n_num]
66
+ rows["f0nsf"][i] = pitchf
67
+ return rows
68
+
69
+
70
+ def calculate_spectrogram(
71
+ rows, n_fft=N_FFT, hop_length=HOP_LENGTH, win_length=WIN_LENGTH
72
+ ):
73
+ specs = []
74
+ hann_window = np.hanning(win_length)
75
+ pad_amount = int((win_length - hop_length) / 2)
76
+ for row in rows["wav_gt"]:
77
+ stft = librosa.stft(
78
+ np.pad(row["array"], (pad_amount, pad_amount), mode="reflect"),
79
+ n_fft=n_fft,
80
+ hop_length=hop_length,
81
+ win_length=win_length,
82
+ window=hann_window,
83
+ center=False,
84
+ )
85
+ specs.append(np.abs(stft) + 1e-6)
86
+
87
+ return {"spec": specs}
88
+
89
+
90
+ def fix_length(rows, hop_length=HOP_LENGTH):
91
+ for i, row in enumerate(rows["spec"]):
92
+ spec = np.array(row)
93
+ phone = np.array(rows["hubert_feats"][i])
94
+ pitch = np.array(rows["f0"][i])
95
+ pitchf = np.array(rows["f0nsf"][i])
96
+ wav_gt = np.array(rows["wav_gt"][i]["array"])
97
+
98
+ spec, phone, pitch, pitchf = crop_feats_length(spec, phone, pitch, pitchf)
99
+
100
+ phone_len = phone.shape[0]
101
+ wav_gt = wav_gt[: phone_len * hop_length]
102
+
103
+ rows["hubert_feats"][i] = phone
104
+ rows["f0"][i] = pitch
105
+ rows["f0nsf"][i] = pitchf
106
+ rows["spec"][i] = spec
107
+ rows["wav_gt"][i]["array"] = wav_gt
108
+ return rows
109
+
110
+
111
+ def prepare(
112
+ dir: str | DatasetDict,
113
+ sr=SR_48K,
114
+ hubert: str | HubertModel | None = None,
115
+ rmvpe: str | RMVPE | None = None,
116
+ batch_size=1,
117
+ max_slice_length: float | None = 3.0,
118
+ accelerator: Accelerator = None,
119
+ include_mute=True,
120
+ stage=3,
121
+ ):
122
+ """
123
+ Prepare the dataset for training or evaluation.
124
+
125
+ Args:
126
+ dir (str | DatasetDict): The directory path or DatasetDict object containing the dataset.
127
+ sr (int, optional): The target sampling rate. Defaults to SR_48K.
128
+ hubert (str | HubertModel | None, optional): The Hubert model or its name to use for feature extraction. Defaults to None.
129
+ rmvpe (str | RMVPE | None, optional): The RMVPE model or its name to use for feature extraction. Defaults to None.
130
+ batch_size (int, optional): The batch size for processing the dataset. Defaults to 1.
131
+ accelerator (Accelerator, optional): The accelerator object for distributed training. Defaults to None.
132
+ include_mute (bool, optional): Whether to include a mute audio file in the directory dataset. Defaults to True.
133
+ stage (int, optional): The dataset preparation level to perform. Defaults to 3. (Stage 1 and 3 are CPU intensive, Stage 2 is GPU intensive.)
134
+
135
+ Returns:
136
+ DatasetDict: The prepared dataset.
137
+ """
138
+ if accelerator is None:
139
+ accelerator = Accelerator()
140
+
141
+ if isinstance(dir, (DatasetDict, Dataset)):
142
+ ds = dir
143
+ else:
144
+ mute_source = resource_filename("zerorvc", "assets/mute/mute48k.wav")
145
+ mute_dest = os.path.join(dir, "mute.wav")
146
+ if include_mute and not os.path.exists(mute_dest):
147
+ logger.info(f"Copying {mute_source} to {mute_dest}")
148
+ shutil.copy(mute_source, mute_dest)
149
+
150
+ ds: DatasetDict | Dataset = load_dataset("audiofolder", data_dir=dir)
151
+
152
+ for key in ds:
153
+ ds[key] = ds[key].remove_columns(
154
+ [col for col in ds[key].column_names if col != "audio"]
155
+ )
156
+ ds = ds.cast_column("audio", Audio(sampling_rate=sr))
157
+
158
+ if stage <= 0:
159
+ return ds
160
+
161
+ # Stage 1, CPU intensive
162
+
163
+ pp = Preprocessor(sr, max_slice_length) if max_slice_length is not None else None
164
+
165
+ def preprocess(rows):
166
+ wav_gt = []
167
+ wav_16k = []
168
+ for row in rows["audio"]:
169
+ if pp is not None:
170
+ slices = pp.preprocess_audio(row["array"])
171
+ for slice in slices:
172
+ wav_gt.append({"path": "", "array": slice, "sampling_rate": sr})
173
+ slice16k = librosa.resample(slice, orig_sr=sr, target_sr=SR_16K)
174
+ wav_16k.append(
175
+ {"path": "", "array": slice16k, "sampling_rate": SR_16K}
176
+ )
177
+ else:
178
+ slice = row["array"]
179
+ wav_gt.append({"path": "", "array": slice, "sampling_rate": sr})
180
+ slice16k = librosa.resample(slice, orig_sr=sr, target_sr=SR_16K)
181
+ wav_16k.append({"path": "", "array": slice16k, "sampling_rate": SR_16K})
182
+ return {"wav_gt": wav_gt, "wav_16k": wav_16k}
183
+
184
+ ds = ds.map(
185
+ preprocess, batched=True, batch_size=batch_size, remove_columns=["audio"]
186
+ )
187
+ ds = ds.cast_column("wav_gt", Audio(sampling_rate=sr))
188
+ ds = ds.cast_column("wav_16k", Audio(sampling_rate=SR_16K))
189
+
190
+ if stage <= 1:
191
+ return ds
192
+
193
+ # Stage 2, GPU intensive
194
+
195
+ hfe = HubertFeatureExtractor()
196
+ ds = ds.map(
197
+ extract_hubert_features,
198
+ batched=True,
199
+ batch_size=batch_size,
200
+ fn_kwargs={"hfe": hfe, "hubert": hubert, "device": accelerator.device},
201
+ )
202
+
203
+ f0e = F0Extractor()
204
+ ds = ds.map(
205
+ extract_f0_features,
206
+ batched=True,
207
+ batch_size=batch_size,
208
+ fn_kwargs={"f0e": f0e, "rmvpe": rmvpe, "device": accelerator.device},
209
+ )
210
+
211
+ if stage <= 2:
212
+ return ds
213
+
214
+ # Stage 3, CPU intensive
215
+
216
+ ds = ds.map(feature_postprocess, batched=True, batch_size=batch_size)
217
+ ds = ds.map(calculate_spectrogram, batched=True, batch_size=batch_size)
218
+ ds = ds.map(fix_length, batched=True, batch_size=batch_size)
219
+
220
+ return ds
221
+
222
+
223
+ def show_dataset_pitch_distribution(dataset):
224
+ import matplotlib.pyplot as plt
225
+ import seaborn as sns
226
+ import numpy as np
227
+
228
+ sns.set_theme()
229
+ pitches = []
230
+ for row in dataset["f0"]:
231
+ pitches.extend([p for p in row if p != 1])
232
+
233
+ pitches = np.array(pitches)
234
+ stats = {
235
+ "mean": np.mean(pitches),
236
+ "std": np.std(pitches),
237
+ "min": np.min(pitches),
238
+ "max": np.max(pitches),
239
+ "median": np.median(pitches),
240
+ "q1": np.percentile(pitches, 25),
241
+ "q3": np.percentile(pitches, 75),
242
+ }
243
+
244
+ plt.figure(figsize=(10, 6))
245
+ sns.histplot(pitches, bins=100)
246
+ plt.title(
247
+ f"Pitch Distribution\nMean: {stats['mean']:.1f} ± {stats['std']:.1f}\n"
248
+ f"Range: [{stats['min']:.1f}, {stats['max']:.1f}]\n"
249
+ f"Quartiles: [{stats['q1']:.1f}, {stats['median']:.1f}, {stats['q3']:.1f}]"
250
+ )
251
+ plt.xlabel("Frequency (Note)")
252
+ plt.ylabel("Count")
253
+ plt.show()
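
The `stage` argument above is what the Space uses to split preparation between CPU- and GPU-bound work (`app/dataset.py` calls `prepare` with `stage=1`, `2`, and `3` in turn, with the middle call wrapped in the ZeroGPU decorator). A minimal sketch of that staged flow, assuming a local audio folder `./my-voices`; repeated calls rely on the `datasets` map cache to skip work already done:

```py
from accelerate import Accelerator
from zerorvc import prepare

accelerator = Accelerator()

# Stage 1 (CPU): slice/resample audio into wav_gt (48 kHz) and wav_16k columns.
ds = prepare("./my-voices", accelerator=accelerator, stage=1)

# Stage 2 (GPU): extract HuBERT features and RMVPE f0 curves.
ds = prepare("./my-voices", accelerator=accelerator, stage=2)

# Stage 3 (CPU): post-process features, compute spectrograms, and align lengths.
ds = prepare("./my-voices", accelerator=accelerator, stage=3)
ds.save_to_disk("./dataset")
```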
zerorvc/f0/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .extractor import F0Extractor
2
+ from .rmvpe import RMVPE
3
+ from .load import load_rmvpe
zerorvc/f0/extractor.py ADDED
@@ -0,0 +1,65 @@
1
+ import logging
2
+ import numpy as np
3
+ import librosa
4
+ from .rmvpe import RMVPE
5
+ from ..constants import SR_16K
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ class F0Extractor:
11
+ def __init__(
12
+ self,
13
+ rmvpe: RMVPE = None,
14
+ sr=SR_16K,
15
+ f0_bin=256,
16
+ f0_max=1100.0,
17
+ f0_min=50.0,
18
+ ):
19
+ self.sr = sr
20
+ self.f0_bin = f0_bin
21
+ self.f0_max = f0_max
22
+ self.f0_min = f0_min
23
+ self.f0_mel_min = 1127 * np.log(1 + f0_min / 700)
24
+ self.f0_mel_max = 1127 * np.log(1 + f0_max / 700)
25
+
26
+ if rmvpe is not None:
27
+ self.load(rmvpe)
28
+
29
+ def load(self, rmvpe: RMVPE):
30
+ self.rmvpe = rmvpe
31
+ self.device = next(rmvpe.parameters()).device
32
+ logger.info(f"RMVPE model is on {self.device}")
33
+
34
+ def is_loaded(self) -> bool:
35
+ return hasattr(self, "rmvpe")
36
+
37
+ def calculate_f0_from_f0nsf(self, f0nsf: np.ndarray):
38
+ f0_mel = 1127 * np.log(1 + f0nsf / 700)
39
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
40
+ self.f0_bin - 2
41
+ ) / (self.f0_mel_max - self.f0_mel_min) + 1
42
+
43
+ # use 0 or 1
44
+ f0_mel[f0_mel <= 1] = 1
45
+ f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
46
+ f0 = np.rint(f0_mel).astype(int)
47
+ assert f0.max() <= 255 and f0.min() >= 1, (
48
+ f0.max(),
49
+ f0.min(),
50
+ )
51
+
52
+ return f0
53
+
54
+ def extract_f0_from(self, y: np.ndarray, modification=0.0):
55
+ f0nsf = self.rmvpe.infer_from_audio(y, thred=0.03)
56
+
57
+ f0nsf *= pow(2, modification / 12)
58
+
59
+ f0 = self.calculate_f0_from_f0nsf(f0nsf)
60
+
61
+ return f0nsf, f0
62
+
63
+ def extract_f0(self, wav_file: str):
64
+ y, _ = librosa.load(wav_file, sr=self.sr)
65
+ return self.extract_f0_from(y)
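
As a quick sanity check of `calculate_f0_from_f0nsf` above (it maps f0 in Hz onto mel-spaced coarse bins 1–255, with bin 1 reserved for unvoiced frames), here is a small sketch that exercises the mapping without loading the RMVPE model:

```py
import numpy as np
from zerorvc import F0Extractor

f0e = F0Extractor()  # no RMVPE model needed just for the coarse-bin mapping

f0nsf = np.array([0.0, 110.0, 220.0, 440.0, 880.0])  # Hz; 0 marks unvoiced frames
coarse = f0e.calculate_f0_from_f0nsf(f0nsf)
print(coarse)  # unvoiced -> 1, rising pitch -> higher bins, all within 1..255
```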
zerorvc/f0/load.py ADDED
@@ -0,0 +1,27 @@
1
+ import torch
2
+ from .rmvpe import RMVPE
3
+
4
+
5
+ def load_rmvpe(
6
+ rmvpe: str | RMVPE | None = None, device: torch.device = torch.device("cpu")
7
+ ) -> RMVPE:
8
+ """
9
+ Load the RMVPE model from a file or download it if necessary.
10
+ If a loaded model is provided, it will be returned as is.
11
+
12
+ Args:
13
+ rmvpe (str | RMVPE | None): The path to the RMVPE model file or the pre-loaded RMVPE model. If None, the default model will be downloaded.
14
+ device (torch.device): The device to load the model on.
15
+
16
+ Returns:
17
+ RMVPE: The loaded RMVPE model.
18
+
19
+ Raises:
20
+ An error is raised if the model file does not exist.
21
+ """
22
+ if isinstance(rmvpe, RMVPE):
23
+ return rmvpe.to(device)
24
+ if isinstance(rmvpe, str):
25
+ model = RMVPE.from_pretrained(rmvpe).to(device)
26
+ return model
27
+ return RMVPE.from_pretrained("safe-models/RMVPE").to(device)
zerorvc/f0/rmvpe/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # The RMVPE model is from https://github.com/Dream-High/RMVPE
2
+ # Apache License 2.0: https://github.com/Dream-High/RMVPE/blob/main/LICENSE
3
+ # With modifications from https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/infer/lib/rmvpe.py
4
+ # MIT License: https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/LICENSE
5
+
6
+ from .model import RMVPE
zerorvc/f0/rmvpe/constants.py ADDED
@@ -0,0 +1,8 @@
1
+ N_CLASS = 360
2
+ N_MELS = 128
3
+ MAGIC_CONST = 1997.3794084376191
4
+ SAMPLE_RATE = 16000
5
+ WINDOW_LENGTH = 1024
6
+ HOP_LENGTH = 160
7
+ MEL_FMIN = 30
8
+ MEL_FMAX = SAMPLE_RATE // 2
zerorvc/f0/rmvpe/deepunet.py ADDED
@@ -0,0 +1,227 @@
1
+ from typing import List, Tuple
2
+ import torch
3
+ from torch import nn
4
+ from .constants import *
5
+
6
+
7
+ class ConvBlockRes(nn.Module):
8
+ def __init__(self, in_channels: int, out_channels: int, momentum=0.01):
9
+ super().__init__()
10
+ self.conv = nn.Sequential(
11
+ nn.Conv2d(
12
+ in_channels=in_channels,
13
+ out_channels=out_channels,
14
+ kernel_size=(3, 3),
15
+ stride=(1, 1),
16
+ padding=(1, 1),
17
+ bias=False,
18
+ ),
19
+ nn.BatchNorm2d(out_channels, momentum=momentum),
20
+ nn.ReLU(),
21
+ nn.Conv2d(
22
+ in_channels=out_channels,
23
+ out_channels=out_channels,
24
+ kernel_size=(3, 3),
25
+ stride=(1, 1),
26
+ padding=(1, 1),
27
+ bias=False,
28
+ ),
29
+ nn.BatchNorm2d(out_channels, momentum=momentum),
30
+ nn.ReLU(),
31
+ )
32
+ # self.shortcut:Optional[nn.Module] = None
33
+ if in_channels != out_channels:
34
+ self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
35
+
36
+ def forward(self, x: torch.Tensor):
37
+ if not hasattr(self, "shortcut"):
38
+ return self.conv(x) + x
39
+ else:
40
+ return self.conv(x) + self.shortcut(x)
41
+
42
+
43
+ class Encoder(nn.Module):
44
+ def __init__(
45
+ self,
46
+ in_channels: int,
47
+ in_size: int,
48
+ n_encoders: int,
49
+ kernel_size: int,
50
+ n_blocks: int,
51
+ out_channels=16,
52
+ momentum=0.01,
53
+ ):
54
+ super().__init__()
55
+ self.n_encoders = n_encoders
56
+ self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
57
+ self.layers = nn.ModuleList()
58
+ self.latent_channels = []
59
+ for i in range(self.n_encoders):
60
+ self.layers.append(
61
+ ResEncoderBlock(
62
+ in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
63
+ )
64
+ )
65
+ self.latent_channels.append([out_channels, in_size])
66
+ in_channels = out_channels
67
+ out_channels *= 2
68
+ in_size //= 2
69
+ self.out_size = in_size
70
+ self.out_channel = out_channels
71
+
72
+ def forward(self, x: torch.Tensor):
73
+ concat_tensors: List[torch.Tensor] = []
74
+ x = self.bn(x)
75
+ for i, layer in enumerate(self.layers):
76
+ t, x = layer(x)
77
+ concat_tensors.append(t)
78
+ return x, concat_tensors
79
+
80
+
81
+ class ResEncoderBlock(nn.Module):
82
+ def __init__(
83
+ self,
84
+ in_channels: int,
85
+ out_channels: int,
86
+ kernel_size: int | None = None,
87
+ n_blocks=1,
88
+ momentum=0.01,
89
+ ):
90
+ super().__init__()
91
+ self.n_blocks = n_blocks
92
+ self.conv = nn.ModuleList()
93
+ self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
94
+ for _ in range(n_blocks - 1):
95
+ self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
96
+ self.kernel_size = kernel_size
97
+ if kernel_size is not None:
98
+ self.pool = nn.AvgPool2d(kernel_size=kernel_size)
99
+
100
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
101
+ for conv in self.conv:
102
+ x = conv(x)
103
+ if self.kernel_size is None:
104
+ return x, x
105
+ return x, self.pool(x)
106
+
107
+
108
+ class Intermediate(nn.Module):
109
+ def __init__(
110
+ self,
111
+ in_channels: int,
112
+ out_channels: int,
113
+ n_inters: int,
114
+ n_blocks: int,
115
+ momentum=0.01,
116
+ ):
117
+ super().__init__()
118
+ self.n_inters = n_inters
119
+ self.layers = nn.ModuleList()
120
+ self.layers.append(
121
+ ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
122
+ )
123
+ for _ in range(self.n_inters - 1):
124
+ self.layers.append(
125
+ ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
126
+ )
127
+
128
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
129
+ for layer in self.layers:
130
+ x, _ = layer(x)
131
+ return x
132
+
133
+
134
+ class ResDecoderBlock(nn.Module):
135
+ def __init__(
136
+ self,
137
+ in_channels: int,
138
+ out_channels: int,
139
+ stride: int,
140
+ n_blocks=1,
141
+ momentum=0.01,
142
+ ):
143
+ super().__init__()
144
+ out_padding = (0, 1) if stride == (1, 2) else (1, 1)
145
+ self.n_blocks = n_blocks
146
+ self.conv1 = nn.Sequential(
147
+ nn.ConvTranspose2d(
148
+ in_channels=in_channels,
149
+ out_channels=out_channels,
150
+ kernel_size=(3, 3),
151
+ stride=stride,
152
+ padding=(1, 1),
153
+ output_padding=out_padding,
154
+ bias=False,
155
+ ),
156
+ nn.BatchNorm2d(out_channels, momentum=momentum),
157
+ nn.ReLU(),
158
+ )
159
+ self.conv2 = nn.ModuleList()
160
+ self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
161
+ for _ in range(n_blocks - 1):
162
+ self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
163
+
164
+ def forward(self, x: torch.Tensor, concat_tensor: torch.Tensor) -> torch.Tensor:
165
+ x = self.conv1(x)
166
+ x = torch.cat((x, concat_tensor), dim=1)
167
+ for conv2 in self.conv2:
168
+ x = conv2(x)
169
+ return x
170
+
171
+
172
+ class Decoder(nn.Module):
173
+ def __init__(
174
+ self,
175
+ in_channels: int,
176
+ n_decoders: int,
177
+ stride: int,
178
+ n_blocks: int,
179
+ momentum=0.01,
180
+ ):
181
+ super().__init__()
182
+ self.layers = nn.ModuleList()
183
+ self.n_decoders = n_decoders
184
+ for _ in range(self.n_decoders):
185
+ out_channels = in_channels // 2
186
+ self.layers.append(
187
+ ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
188
+ )
189
+ in_channels = out_channels
190
+
191
+ def forward(
192
+ self, x: torch.Tensor, concat_tensors: List[torch.Tensor]
193
+ ) -> torch.Tensor:
194
+ for i, layer in enumerate(self.layers):
195
+ x = layer(x, concat_tensors[-1 - i])
196
+ return x
197
+
198
+
199
+ class DeepUnet(nn.Module):
200
+ def __init__(
201
+ self,
202
+ kernel_size: int,
203
+ n_blocks: int,
204
+ en_de_layers=5,
205
+ inter_layers=4,
206
+ in_channels=1,
207
+ en_out_channels=16,
208
+ ):
209
+ super().__init__()
210
+ self.encoder = Encoder(
211
+ in_channels, N_MELS, en_de_layers, kernel_size, n_blocks, en_out_channels
212
+ )
213
+ self.intermediate = Intermediate(
214
+ self.encoder.out_channel // 2,
215
+ self.encoder.out_channel,
216
+ inter_layers,
217
+ n_blocks,
218
+ )
219
+ self.decoder = Decoder(
220
+ self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
221
+ )
222
+
223
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
224
+ x, concat_tensors = self.encoder(x)
225
+ x = self.intermediate(x)
226
+ x = self.decoder(x, concat_tensors)
227
+ return x
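DeepUnet keeps the (batch, channels, frames, mel-bins) layout end to end: the encoder halves both spatial axes en_de_layers times, the decoder mirrors that with transposed convolutions and skip connections, so the output has the input's spatial size with en_out_channels channels. A shape sketch with illustrative hyperparameters (the tuple kernel/stride values are an assumption, not fixed by this file):

    import torch
    from zerorvc.f0.rmvpe.deepunet import DeepUnet

    net = DeepUnet(kernel_size=(2, 2), n_blocks=4)  # illustrative values
    mel = torch.randn(1, 1, 64, 128)  # (batch, channel, frames, N_MELS); frames divisible by 2**5
    out = net(mel)
    print(out.shape)                  # torch.Size([1, 16, 64, 128]) -- spatial size restored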
zerorvc/f0/rmvpe/mel.py ADDED
@@ -0,0 +1,68 @@
1
+ import os
2
+ import torch
3
+ import torch.nn as nn
4
+ import numpy as np
5
+ import librosa
6
+ from .stft import STFT, TorchSTFT
7
+
8
+ USING_TORCH_STFT = os.getenv("USING_TORCH_STFT") is not None
9
+
10
+
11
+ class MelSpectrogram(nn.Module):
12
+ def __init__(
13
+ self,
14
+ n_mel_channels: int,
15
+ sampling_rate: int,
16
+ win_length: int,
17
+ hop_length: int,
18
+ n_fft: int = None,
19
+ mel_fmin: int = 0,
20
+ mel_fmax: int = None,
21
+ clamp: float = 1e-5,
22
+ ):
23
+ super().__init__()
24
+ n_fft = win_length if n_fft is None else n_fft
25
+ mel_basis = librosa.filters.mel(
26
+ sr=sampling_rate,
27
+ n_fft=n_fft,
28
+ n_mels=n_mel_channels,
29
+ fmin=mel_fmin,
30
+ fmax=mel_fmax,
31
+ htk=True,
32
+ )
33
+ mel_basis = torch.from_numpy(mel_basis).float()
34
+ self.register_buffer("mel_basis", mel_basis, persistent=False)
35
+ self.n_fft = n_fft
36
+ self.hop_length = hop_length
37
+ self.win_length = win_length
38
+ self.sampling_rate = sampling_rate
39
+ self.n_mel_channels = n_mel_channels
40
+ self.clamp = clamp
41
+
42
+ self.keyshift = 0
43
+ self.speed = 1
44
+ self.factor = 2 ** (self.keyshift / 12)
45
+ self.n_fft_new = int(np.round(self.n_fft * self.factor))
46
+ self.win_length_new = int(np.round(self.win_length * self.factor))
47
+ self.hop_length_new = int(np.round(self.hop_length * self.speed))
48
+
49
+ if USING_TORCH_STFT:
50
+ self.stft = TorchSTFT(
51
+ filter_length=self.n_fft_new,
52
+ hop_length=self.hop_length_new,
53
+ win_length=self.win_length_new,
54
+ window="hann",
55
+ )
56
+ else:
57
+ self.stft = STFT(
58
+ filter_length=self.n_fft_new,
59
+ hop_length=self.hop_length_new,
60
+ win_length=self.win_length_new,
61
+ window="hann",
62
+ )
63
+
64
+ def forward(self, audio: torch.Tensor):
65
+ magnitude = self.stft(audio)
66
+ mel_output = torch.matmul(self.mel_basis, magnitude)
67
+ log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
68
+ return log_mel_spec
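MelSpectrogram precomputes an HTK mel filterbank, runs either the convolution-based STFT or torch.stft (selected by the USING_TORCH_STFT environment variable), and returns a log-compressed mel spectrogram. A quick shape check using the RMVPE constants:

    import torch
    from zerorvc.f0.rmvpe.mel import MelSpectrogram
    from zerorvc.f0.rmvpe.constants import (
        N_MELS, SAMPLE_RATE, WINDOW_LENGTH, HOP_LENGTH, MEL_FMIN, MEL_FMAX,
    )

    mel_extractor = MelSpectrogram(N_MELS, SAMPLE_RATE, WINDOW_LENGTH, HOP_LENGTH, None, MEL_FMIN, MEL_FMAX)
    audio = torch.randn(1, SAMPLE_RATE)  # one second of 16 kHz audio, batch of 1
    log_mel = mel_extractor(audio)
    print(log_mel.shape)                 # torch.Size([1, 128, 101]) with centered frames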
zerorvc/f0/rmvpe/model.py ADDED
@@ -0,0 +1,118 @@
1
+ import logging
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import numpy as np
6
+ from huggingface_hub import PyTorchModelHubMixin
7
+ from .seq import BiGRU
8
+ from .deepunet import DeepUnet
9
+ from .mel import MelSpectrogram
10
+ from .constants import *
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class RMVPE(nn.Module, PyTorchModelHubMixin):
16
+ def __init__(
17
+ self,
18
+ n_blocks: int,
19
+ n_gru: int,
20
+ kernel_size: int,
21
+ en_de_layers=5,
22
+ inter_layers=4,
23
+ in_channels=1,
24
+ en_out_channels=16,
25
+ ):
26
+ super().__init__()
27
+ self.mel_extractor = MelSpectrogram(
28
+ N_MELS, SAMPLE_RATE, WINDOW_LENGTH, HOP_LENGTH, None, MEL_FMIN, MEL_FMAX
29
+ )
30
+ self.unet = DeepUnet(
31
+ kernel_size,
32
+ n_blocks,
33
+ en_de_layers,
34
+ inter_layers,
35
+ in_channels,
36
+ en_out_channels,
37
+ )
38
+ self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
39
+ if n_gru:
40
+ self.fc = nn.Sequential(
41
+ BiGRU(3 * N_MELS, 256, n_gru),
42
+ nn.Linear(512, N_CLASS),
43
+ nn.Dropout(0.25),
44
+ nn.Sigmoid(),
45
+ )
46
+ else:
47
+ self.fc = nn.Sequential(
48
+ nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()
49
+ )
50
+
51
+ cents_mapping = 20 * np.arange(360) + MAGIC_CONST
52
+ self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368
53
+ self.cents_mapping_torch = torch.from_numpy(self.cents_mapping).to(
54
+ dtype=torch.float32
55
+ )
56
+
57
+ def to(self, device):
58
+ self.cents_mapping_torch = self.cents_mapping_torch.to(device)
59
+ return super().to(device)
60
+
61
+ def forward(self, mel: torch.Tensor) -> torch.Tensor:
62
+ mel = mel.transpose(-1, -2).unsqueeze(1)
63
+ x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
64
+ x = self.fc(x)
65
+ return x
66
+
67
+ def mel2hidden(self, mel: torch.Tensor):
68
+ with torch.no_grad():
69
+ n_frames = mel.shape[2]
70
+ n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
71
+ mel = F.pad(mel, (0, n_pad), mode="constant")
72
+ hidden = self(mel)
73
+ return hidden[:, :n_frames]
74
+
75
+ def decode(self, hidden: torch.Tensor, thred=0.03):
76
+ cents_pred = self.to_local_average_cents(hidden, thred=thred)
77
+ f0 = 10 * (2 ** (cents_pred / 1200))
78
+ f0[f0 == 10] = 0
79
+ return f0
80
+
81
+ def infer(self, audio: torch.Tensor, thred=0.03, return_tensor=False):
82
+ mel = self.mel_extractor(audio.unsqueeze(0))
83
+ hidden = self.mel2hidden(mel)
84
+ hidden = hidden[0].float()
85
+ f0 = self.decode(hidden, thred=thred)
86
+ if return_tensor:
87
+ return f0
88
+ return f0.cpu().numpy()
89
+
90
+ def infer_from_audio(self, audio: np.ndarray, thred=0.03):
91
+ audio = torch.from_numpy(audio).to(next(self.parameters()).device)
92
+ return self.infer(audio, thred=thred)
93
+
94
+ def to_local_average_cents(
95
+ self, salience: torch.Tensor, thred=0.05
96
+ ) -> torch.Tensor:
97
+ center = torch.argmax(salience, dim=1)
98
+ salience = F.pad(salience, (4, 4))
99
+
100
+ center += 4
101
+ batch_indices = torch.arange(salience.shape[0], device=salience.device)
102
+
103
+ # Create indices for the 9-point window around each center
104
+ offsets = torch.arange(-4, 5, device=salience.device)
105
+ indices = center.unsqueeze(1) + offsets.unsqueeze(0)
106
+
107
+ # Extract values using advanced indexing
108
+ todo_salience = salience[batch_indices.unsqueeze(1), indices]
109
+ todo_cents_mapping = self.cents_mapping_torch[indices]
110
+
111
+ product_sum = torch.sum(todo_salience * todo_cents_mapping, 1)
112
+ weight_sum = torch.sum(todo_salience, 1)
113
+ divided = product_sum / weight_sum
114
+
115
+ maxx = torch.max(salience, 1).values
116
+ divided[maxx <= thred] = 0
117
+
118
+ return divided
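RMVPE outputs a 360-bin salience per 10 ms frame; decode() takes a salience-weighted average of the cents values in a 9-bin window around the argmax and converts to Hz, zeroing frames whose peak salience falls below the threshold. A minimal inference sketch, assuming the same "safe-models/RMVPE" weights that load_rmvpe defaults to:

    import numpy as np
    import torch
    from zerorvc.f0.rmvpe import RMVPE

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    rmvpe = RMVPE.from_pretrained("safe-models/RMVPE").to(device).eval()

    audio = np.random.randn(16000).astype(np.float32)  # stand-in for 1 s of 16 kHz speech
    f0 = rmvpe.infer_from_audio(audio, thred=0.03)      # one value per frame, 0.0 where unvoiced
    print(f0.shape)                                     # (101,) for one second of input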
zerorvc/f0/rmvpe/seq.py ADDED
@@ -0,0 +1,18 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
+ class BiGRU(nn.Module):
6
+ def __init__(self, input_features: int, hidden_features: int, num_layers: int):
7
+ super().__init__()
8
+ self.gru = nn.GRU(
9
+ input_features,
10
+ hidden_features,
11
+ num_layers=num_layers,
12
+ batch_first=True,
13
+ bidirectional=True,
14
+ )
15
+ self.gru.flatten_parameters()
16
+
17
+ def forward(self, x: torch.Tensor):
18
+ return self.gru(x)[0]
zerorvc/f0/rmvpe/stft.py ADDED
@@ -0,0 +1,119 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ from librosa.util import pad_center
6
+ from scipy.signal import get_window
7
+
8
+
9
+ class TorchSTFT(nn.Module):
10
+ def __init__(
11
+ self, filter_length=1024, hop_length=512, win_length=None, window="hann"
12
+ ):
13
+ """
14
+ This module implements an STFT using PyTorch's stft function.
15
+
16
+ Keyword Arguments:
17
+ filter_length {int} -- Length of filters used (default: {1024})
18
+ hop_length {int} -- Hop length of STFT (default: {512})
19
+ win_length {[type]} -- Length of the window function applied to each frame (if not specified, it
20
+ equals the filter length). (default: {None})
21
+ window {str} -- Type of window to use (this implementation always applies a periodic Hann window)
22
+ (default: {'hann'})
23
+ """
24
+ super(TorchSTFT, self).__init__()
25
+ self.n_fft_new = filter_length
26
+ self.hop_length_new = hop_length
27
+ self.win_length_new = win_length if win_length else filter_length
28
+ self.center = True
29
+ hann_window_0 = torch.hann_window(self.win_length_new)
30
+ self.register_buffer("hann_window_0", hann_window_0, persistent=False)
31
+
32
+ def forward(self, input_data):
33
+ fft = torch.stft(
34
+ input_data,
35
+ n_fft=self.n_fft_new,
36
+ hop_length=self.hop_length_new,
37
+ win_length=self.win_length_new,
38
+ window=self.hann_window_0,
39
+ center=self.center,
40
+ return_complex=True,
41
+ )
42
+ magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
43
+ return magnitude
44
+
45
+
46
+ class STFT(nn.Module):
47
+ def __init__(
48
+ self, filter_length=1024, hop_length=512, win_length=None, window="hann"
49
+ ):
50
+ """
51
+ This module implements an STFT using 1D convolution and 1D transpose convolutions.
52
+ This is a bit tricky so there are some cases that probably won't work as working
53
+ out the same sizes before and after in all overlap add setups is tough. Right now,
54
+ this code should work with hop lengths that are half the filter length (50% overlap
55
+ between frames).
56
+
57
+ Keyword Arguments:
58
+ filter_length {int} -- Length of filters used (default: {1024})
59
+ hop_length {int} -- Hop length of STFT (restrict to 50% overlap between frames) (default: {512})
60
+ win_length {[type]} -- Length of the window function applied to each frame (if not specified, it
61
+ equals the filter length). (default: {None})
62
+ window {str} -- Type of window to use (options are bartlett, hann, hamming, blackman, blackmanharris)
63
+ (default: {'hann'})
64
+ """
65
+ super(STFT, self).__init__()
66
+ self.filter_length = filter_length
67
+ self.hop_length = hop_length
68
+ self.win_length = win_length if win_length else filter_length
69
+ self.window = window
70
+ self.forward_transform = None
71
+ self.pad_amount = int(self.filter_length / 2)
72
+ fourier_basis = np.fft.fft(np.eye(self.filter_length))
73
+
74
+ cutoff = int((self.filter_length / 2 + 1))
75
+ fourier_basis = np.vstack(
76
+ [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
77
+ )
78
+ forward_basis = torch.FloatTensor(fourier_basis)
79
+ inverse_basis = torch.FloatTensor(np.linalg.pinv(fourier_basis))
80
+
81
+ assert filter_length >= self.win_length
82
+ # get window and zero center pad it to filter_length
83
+ fft_window = get_window(window, self.win_length, fftbins=True)
84
+ fft_window = pad_center(fft_window, size=filter_length)
85
+ fft_window = torch.from_numpy(fft_window).float()
86
+
87
+ # window the bases
88
+ forward_basis *= fft_window
89
+ inverse_basis = (inverse_basis.T * fft_window).T
90
+
91
+ self.register_buffer("forward_basis", forward_basis.float(), persistent=False)
92
+ self.register_buffer("inverse_basis", inverse_basis.float(), persistent=False)
93
+ self.register_buffer("fft_window", fft_window.float(), persistent=False)
94
+
95
+ def forward(self, input_data):
96
+ """Take input data (audio) to STFT domain using convolution."""
97
+ input_data = F.pad(
98
+ input_data,
99
+ (self.pad_amount, self.pad_amount),
100
+ mode="reflect",
101
+ )
102
+
103
+ # Reshape input for convolution
104
+ input_data = input_data.unsqueeze(1)
105
+
106
+ # Create windowed basis as convolution weights
107
+ forward_transform = F.conv1d(
108
+ input_data,
109
+ self.forward_basis.unsqueeze(1),
110
+ stride=self.hop_length,
111
+ groups=1,
112
+ )
113
+
114
+ cutoff = int((self.filter_length / 2) + 1)
115
+ real_part = forward_transform[:, :cutoff, :]
116
+ imag_part = forward_transform[:, cutoff:, :]
117
+ magnitude = torch.sqrt(real_part**2 + imag_part**2)
118
+
119
+ return magnitude
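The convolution-based STFT is a drop-in for torch.stft (useful for export paths where torch.stft is awkward); both classes return the linear magnitude spectrogram with reflect-padded, centered frames. A quick parity sketch under the RMVPE window settings:

    import torch
    from zerorvc.f0.rmvpe.stft import STFT, TorchSTFT

    audio = torch.randn(1, 16000)
    conv_stft = STFT(filter_length=1024, hop_length=160, win_length=1024, window="hann")
    fast_stft = TorchSTFT(filter_length=1024, hop_length=160, win_length=1024, window="hann")

    mag_conv = conv_stft(audio)  # (1, 513, 101)
    mag_fast = fast_stft(audio)  # (1, 513, 101)
    print(torch.allclose(mag_conv, mag_fast, atol=1e-3))  # expected to agree up to float rounding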
zerorvc/hubert/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .extractor import HubertFeatureExtractor, HubertModel
2
+ from .load import load_hubert
zerorvc/hubert/extractor.py ADDED
@@ -0,0 +1,40 @@
1
+ import logging
2
+ import librosa
3
+ import numpy as np
4
+ from transformers import AutoProcessor, HubertModel
5
+ from ..constants import SR_16K
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ class HubertFeatureExtractor:
11
+ def __init__(self, hubert: HubertModel = None, sr=SR_16K):
12
+ self.sr = sr
13
+ if hubert is not None:
14
+ self.load(hubert)
15
+
16
+ def load(self, hubert: HubertModel):
17
+ self.hubert = hubert
18
+ self.device = next(hubert.parameters()).device
19
+ self.processor = AutoProcessor.from_pretrained("safe-models/ContentVec")
20
+ logger.info(f"HuBERT model is on {self.device}")
21
+
22
+ def is_loaded(self) -> bool:
23
+ return hasattr(self, "hubert")
24
+
25
+ def extract_feature_from(self, y: np.ndarray) -> np.ndarray:
26
+ input_values = self.processor(
27
+ y, sampling_rate=self.sr, return_tensors="pt"
28
+ ).input_values
29
+ input_values = input_values.to(self.device)
30
+ feats = self.hubert(input_values, output_hidden_states=True)["hidden_states"][
31
+ 12
32
+ ]
33
+ feats = feats.squeeze(0).float().cpu().detach().numpy()
34
+ if np.isnan(feats).sum() > 0:
35
+ feats = np.nan_to_num(feats)
36
+ return feats
37
+
38
+ def extract_feature(self, wav_file: str) -> np.ndarray:
39
+ y, _ = librosa.load(wav_file, sr=self.sr)
40
+ return self.extract_feature_from(y)
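HubertFeatureExtractor feeds 16 kHz audio through the ContentVec-style HuBERT and keeps hidden_states[12] (the last layer of the base model) as per-frame content features. A usage sketch; the wav path is a hypothetical placeholder:

    import torch
    from zerorvc.hubert import HubertFeatureExtractor, load_hubert

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    hubert = load_hubert(device=device)               # defaults to "safe-models/ContentVec"
    extractor = HubertFeatureExtractor(hubert)

    feats = extractor.extract_feature("speaker.wav")  # hypothetical file, resampled to 16 kHz by librosa
    print(feats.shape)                                # (n_frames, 768), roughly one frame per 20 ms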
zerorvc/hubert/load.py ADDED
@@ -0,0 +1,28 @@
1
+ import torch
2
+ from transformers import HubertModel
3
+
4
+
5
+ def load_hubert(
6
+ hubert: str | HubertModel | None = None,
7
+ device: torch.device = torch.device("cpu"),
8
+ ) -> HubertModel:
9
+ """
10
+ Load the Hubert model from a file or download it if necessary.
11
+ If a loaded model is provided, it will be returned as is.
12
+
13
+ Args:
14
+ hubert (str | HubertModel | None): The path to the Hubert model file or the pre-loaded Hubert model. If None, the default model will be downloaded.
15
+ device (torch.device): The device to load the model on.
16
+
17
+ Returns:
18
+ HubertModel: The loaded Hubert model.
19
+
20
+ Raises:
21
+ If the model file does not exist.
22
+ """
23
+ if isinstance(hubert, HubertModel):
24
+ return hubert.to(device)
25
+ if isinstance(hubert, str):
26
+ model = HubertModel.from_pretrained(hubert).to(device)
27
+ return model
28
+ return HubertModel.from_pretrained("safe-models/ContentVec").to(device)
zerorvc/preprocess/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .preprocess import Preprocessor
2
+ from .crop import crop_feats_length
zerorvc/preprocess/crop.py ADDED
@@ -0,0 +1,16 @@
1
+ from typing import Tuple
2
+ import numpy as np
3
+
4
+
5
+ def crop_feats_length(
6
+ spec: np.ndarray, phone: np.ndarray, pitch: np.ndarray, pitchf: np.ndarray
7
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
8
+ phone_len = phone.shape[0]
9
+ spec_len = spec.shape[1]
10
+ if phone_len != spec_len:
11
+ len_min = min(phone_len, spec_len)
12
+ phone = phone[:len_min, :]
13
+ pitch = pitch[:len_min]
14
+ pitchf = pitchf[:len_min]
15
+ spec = spec[:, :len_min]
16
+ return spec, phone, pitch, pitchf
zerorvc/preprocess/preprocess.py ADDED
@@ -0,0 +1,54 @@
1
+ import numpy as np
2
+ import librosa
3
+ from scipy import signal
4
+ from .slicer2 import Slicer
5
+
6
+
7
+ class Preprocessor:
8
+ def __init__(
9
+ self, sr: int, max_slice_length: float = 3.0, min_slice_length: float = 0.5
10
+ ):
11
+ self.slicer = Slicer(
12
+ sr=sr,
13
+ threshold=-42,
14
+ min_length=1500,
15
+ min_interval=400,
16
+ hop_size=15,
17
+ max_sil_kept=500,
18
+ )
19
+ self.sr = sr
20
+ self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr)
21
+ self.max_slice_length = max_slice_length
22
+ self.min_slice_length = min_slice_length
23
+ self.overlap = 0.3
24
+ self.tail = self.max_slice_length + self.overlap
25
+ self.max = 0.9
26
+ self.alpha = 0.75
27
+
28
+ def norm(self, samples: np.ndarray) -> np.ndarray:
29
+ sample_max = np.abs(samples).max()
30
+ normalized = samples / sample_max * self.max
31
+ normalized = (normalized * self.alpha) + (samples * (1 - self.alpha))
32
+ return normalized
33
+
34
+ def preprocess_audio(self, y: np.ndarray) -> list[np.ndarray]:
35
+ y = signal.filtfilt(self.bh, self.ah, y)
36
+ audios = []
37
+ for audio in self.slicer.slice(y):
38
+ i = 0
39
+ while True:
40
+ start = int(self.sr * (self.max_slice_length - self.overlap) * i)
41
+ i += 1
42
+ if len(audio[start:]) > self.tail * self.sr:
43
+ slice = audio[start : start + int(self.max_slice_length * self.sr)]
44
+ audios.append(self.norm(slice))
45
+ else:
46
+ slice = audio[start:]
47
+ if len(slice) > self.min_slice_length * self.sr:
48
+ audios.append(self.norm(slice))
49
+ break
50
+ return audios
51
+
52
+ def preprocess_file(self, file_path: str) -> list[np.ndarray]:
53
+ y, _ = librosa.load(file_path, sr=self.sr)
54
+ return self.preprocess_audio(y)
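Preprocessor high-passes the signal at 48 Hz, removes long silences with the Slicer, then windows each voiced region into overlapping pieces of at most max_slice_length seconds (the final piece may include the 0.3 s overlap) and peak-normalizes each one. A sketch of turning a recording into training segments; the input path is hypothetical:

    from zerorvc.preprocess import Preprocessor

    pre = Preprocessor(sr=48000, max_slice_length=3.0, min_slice_length=0.5)
    segments = pre.preprocess_file("raw/recording.wav")  # hypothetical path
    print(len(segments), [round(len(s) / 48000, 2) for s in segments[:3]])  # each piece <= ~3.3 s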
zerorvc/preprocess/slicer2.py ADDED
@@ -0,0 +1,147 @@
1
+ # From https://github.com/openvpi/audio-slicer
2
+ # MIT License: https://github.com/openvpi/audio-slicer/blob/main/LICENSE
3
+ from librosa.feature import rms as get_rms
4
+
5
+
6
+ class Slicer:
7
+ def __init__(
8
+ self,
9
+ sr: int,
10
+ threshold: float = -40.0,
11
+ min_length: int = 5000,
12
+ min_interval: int = 300,
13
+ hop_size: int = 20,
14
+ max_sil_kept: int = 5000,
15
+ ):
16
+ if not min_length >= min_interval >= hop_size:
17
+ raise ValueError(
18
+ "The following condition must be satisfied: min_length >= min_interval >= hop_size"
19
+ )
20
+ if not max_sil_kept >= hop_size:
21
+ raise ValueError(
22
+ "The following condition must be satisfied: max_sil_kept >= hop_size"
23
+ )
24
+ min_interval = sr * min_interval / 1000
25
+ self.threshold = 10 ** (threshold / 20.0)
26
+ self.hop_size = round(sr * hop_size / 1000)
27
+ self.win_size = min(round(min_interval), 4 * self.hop_size)
28
+ self.min_length = round(sr * min_length / 1000 / self.hop_size)
29
+ self.min_interval = round(min_interval / self.hop_size)
30
+ self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
31
+
32
+ def _apply_slice(self, waveform, begin, end):
33
+ if len(waveform.shape) > 1:
34
+ return waveform[
35
+ :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
36
+ ]
37
+ else:
38
+ return waveform[
39
+ begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
40
+ ]
41
+
42
+ # @timeit
43
+ def slice(self, waveform):
44
+ if len(waveform.shape) > 1:
45
+ samples = waveform.mean(axis=0)
46
+ else:
47
+ samples = waveform
48
+ if samples.shape[0] <= self.min_length:
49
+ return [waveform]
50
+ rms_list = get_rms(
51
+ y=samples, frame_length=self.win_size, hop_length=self.hop_size
52
+ ).squeeze(0)
53
+ sil_tags = []
54
+ silence_start = None
55
+ clip_start = 0
56
+ for i, rms in enumerate(rms_list):
57
+ # Keep looping while frame is silent.
58
+ if rms < self.threshold:
59
+ # Record start of silent frames.
60
+ if silence_start is None:
61
+ silence_start = i
62
+ continue
63
+ # Keep looping while frame is not silent and silence start has not been recorded.
64
+ if silence_start is None:
65
+ continue
66
+ # Clear recorded silence start if interval is not enough or clip is too short
67
+ is_leading_silence = silence_start == 0 and i > self.max_sil_kept
68
+ need_slice_middle = (
69
+ i - silence_start >= self.min_interval
70
+ and i - clip_start >= self.min_length
71
+ )
72
+ if not is_leading_silence and not need_slice_middle:
73
+ silence_start = None
74
+ continue
75
+ # Need slicing. Record the range of silent frames to be removed.
76
+ if i - silence_start <= self.max_sil_kept:
77
+ pos = rms_list[silence_start : i + 1].argmin() + silence_start
78
+ if silence_start == 0:
79
+ sil_tags.append((0, pos))
80
+ else:
81
+ sil_tags.append((pos, pos))
82
+ clip_start = pos
83
+ elif i - silence_start <= self.max_sil_kept * 2:
84
+ pos = rms_list[
85
+ i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
86
+ ].argmin()
87
+ pos += i - self.max_sil_kept
88
+ pos_l = (
89
+ rms_list[
90
+ silence_start : silence_start + self.max_sil_kept + 1
91
+ ].argmin()
92
+ + silence_start
93
+ )
94
+ pos_r = (
95
+ rms_list[i - self.max_sil_kept : i + 1].argmin()
96
+ + i
97
+ - self.max_sil_kept
98
+ )
99
+ if silence_start == 0:
100
+ sil_tags.append((0, pos_r))
101
+ clip_start = pos_r
102
+ else:
103
+ sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
104
+ clip_start = max(pos_r, pos)
105
+ else:
106
+ pos_l = (
107
+ rms_list[
108
+ silence_start : silence_start + self.max_sil_kept + 1
109
+ ].argmin()
110
+ + silence_start
111
+ )
112
+ pos_r = (
113
+ rms_list[i - self.max_sil_kept : i + 1].argmin()
114
+ + i
115
+ - self.max_sil_kept
116
+ )
117
+ if silence_start == 0:
118
+ sil_tags.append((0, pos_r))
119
+ else:
120
+ sil_tags.append((pos_l, pos_r))
121
+ clip_start = pos_r
122
+ silence_start = None
123
+ # Deal with trailing silence.
124
+ total_frames = rms_list.shape[0]
125
+ if (
126
+ silence_start is not None
127
+ and total_frames - silence_start >= self.min_interval
128
+ ):
129
+ silence_end = min(total_frames, silence_start + self.max_sil_kept)
130
+ pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
131
+ sil_tags.append((pos, total_frames + 1))
132
+ # Apply and return slices.
133
+ if len(sil_tags) == 0:
134
+ return [waveform]
135
+ else:
136
+ chunks = []
137
+ if sil_tags[0][0] > 0:
138
+ chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
139
+ for i in range(len(sil_tags) - 1):
140
+ chunks.append(
141
+ self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])
142
+ )
143
+ if sil_tags[-1][1] < total_frames:
144
+ chunks.append(
145
+ self._apply_slice(waveform, sil_tags[-1][1], total_frames)
146
+ )
147
+ return chunks
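The Slicer works on frame-level RMS: frames below the dB threshold count as silence, and cuts are placed at the quietest frame inside each silent stretch that is long enough, keeping at most max_sil_kept frames of silence on either side. A standalone sketch with the same settings Preprocessor uses, on a synthetic signal:

    import numpy as np
    from zerorvc.preprocess.slicer2 import Slicer

    sr = 48000
    slicer = Slicer(sr=sr, threshold=-42, min_length=1500, min_interval=400, hop_size=15, max_sil_kept=500)

    # 2 s of tone, 1 s of silence, 2 s of tone: the gap should yield two chunks
    t = np.arange(sr * 2) / sr
    tone = (0.5 * np.sin(2 * np.pi * 220 * t)).astype(np.float32)
    audio = np.concatenate([tone, np.zeros(sr, dtype=np.float32), tone])
    chunks = slicer.slice(audio)
    print(len(chunks), [round(len(c) / sr, 2) for c in chunks])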
zerorvc/pretrained.py ADDED
@@ -0,0 +1,14 @@
1
+ from typing import Tuple
2
+ from huggingface_hub import hf_hub_download
3
+
4
+
5
+ def pretrained_checkpoints() -> Tuple[str, str]:
6
+ """
7
+ Download the pretrained generator and discriminator checkpoints from the Hugging Face Hub.
8
+
9
+ Returns:
10
+ A tuple containing the paths to the downloaded checkpoints for the generator (G) and discriminator (D).
11
+ """
12
+ G = hf_hub_download("lj1995/VoiceConversionWebUI", "pretrained_v2/f0G48k.pth")
13
+ D = hf_hub_download("lj1995/VoiceConversionWebUI", "pretrained_v2/f0D48k.pth")
14
+ return G, D
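Training warm-starts from the community RVC v2 48 kHz checkpoints; hf_hub_download caches the two files locally and returns their paths:

    from zerorvc.pretrained import pretrained_checkpoints

    G_path, D_path = pretrained_checkpoints()
    print(G_path)  # local cache path ending in pretrained_v2/f0G48k.pth
    print(D_path)  # local cache path ending in pretrained_v2/f0D48k.pth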
zerorvc/rvc.py ADDED
@@ -0,0 +1,366 @@
1
+ from logging import getLogger
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ import librosa
8
+ from accelerate import Accelerator
9
+ from datasets import Dataset
10
+
11
+ from .f0 import F0Extractor, RMVPE, load_rmvpe
12
+ from .hubert import HubertFeatureExtractor, HubertModel, load_hubert
13
+ from .synthesizer import SynthesizerTrnMs768NSFsid
14
+ from .constants import *
15
+
16
+ logger = getLogger(__name__)
17
+
18
+
19
+ class Synthesizer(SynthesizerTrnMs768NSFsid):
20
+ def forward(self, phone, pitch, pitchf, sid):
21
+ if type(phone.shape[1]) == int:
22
+ phone_lengths = torch.tensor(
23
+ [phone.shape[1]], device=phone.device, dtype=torch.int32
24
+ )
25
+ else:
26
+ phone_lengths = phone.shape[1]
27
+ g = self.emb_g(sid).unsqueeze(-1)
28
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
29
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
30
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
31
+ o = self.dec(z * x_mask, pitchf, g=g, n_res=None)
32
+ return o
33
+
34
+
35
+ class FeatureExtractor(nn.Module):
36
+ def __init__(self, hubert: HubertModel, rmvpe: RMVPE):
37
+ super().__init__()
38
+ self.hubert = hubert
39
+ self.rmvpe = rmvpe
40
+
41
+ def to(self, device):
42
+ self.hubert = self.hubert.to(device)
43
+ self.rmvpe = self.rmvpe.to(device)
44
+ return super().to(device)
45
+
46
+ def forward(self, audio16k, pitch_modification):
47
+ phone = self.hubert(audio16k, output_hidden_states=True)["hidden_states"][12]
48
+ phone = phone.squeeze(0).float()
49
+ phone_lengths = phone.shape[0]
50
+ if type(phone_lengths) == int:
51
+ phone_lengths = torch.tensor(
52
+ [phone_lengths], device=phone.device, dtype=torch.int32
53
+ )
54
+
55
+ pitchf = self.rmvpe.infer(audio16k.squeeze(0), thred=0.03, return_tensor=True)
56
+ pitchf *= torch.pow(
57
+ 2,
58
+ torch.tensor(
59
+ pitch_modification / 12.0, dtype=torch.float32, device=pitchf.device
60
+ ),
61
+ )
62
+ pitch = self.calculate_f0_from_f0nsf_torch(pitchf)
63
+
64
+ pitch = pitch.unsqueeze(0)
65
+ pitchf = pitchf.unsqueeze(0)
66
+ phone = phone.unsqueeze(0)
67
+ logger.info(
68
+ f"{phone.shape=}, {phone_lengths=}, {pitch.shape=}, {pitchf.shape=}"
69
+ )
70
+
71
+ feats0 = phone.clone()
72
+ feats: torch.Tensor = F.interpolate(
73
+ phone.permute(0, 2, 1), scale_factor=2
74
+ ).permute(0, 2, 1)
75
+ feats0: torch.Tensor = F.interpolate(
76
+ feats0.permute(0, 2, 1), scale_factor=2
77
+ ).permute(0, 2, 1)
78
+
79
+ phone_len = feats.shape[1]
80
+ pitch = pitch[:, :phone_len]
81
+ pitchf = pitchf[:, :phone_len]
82
+
83
+ pitchff = pitchf.clone()
84
+ pitchff[pitchf > 0] = 1
85
+ pitchff[pitchf < 1] = 0.33
86
+ pitchff = pitchff.unsqueeze(-1)
87
+ feats = feats * pitchff + feats0 * (1 - pitchff)
88
+ feats = feats.to(feats0.dtype)
89
+
90
+ if type(phone_len) == int:
91
+ phone_len = torch.tensor(
92
+ [phone_len], device=feats.device, dtype=torch.int32
93
+ )
94
+ else:
95
+ phone_len = phone_len.unsqueeze(0)
96
+
97
+ logger.info(f"{feats.shape=}, {pitch.shape=}, {pitchf.shape=}, {phone_len=}")
98
+ return feats, phone_len, pitch, pitchf
99
+
100
+ def calculate_f0_from_f0nsf_torch(self, f0nsf: torch.Tensor):
101
+ f0_mel = 1127 * torch.log(1 + f0nsf / 700)
102
+ f0_max = torch.tensor(1100.0)
103
+ f0_min = torch.tensor(50.0)
104
+ f0_bin = torch.tensor(256)
105
+ f0_mel_max = 1127 * torch.log(1 + f0_max / 700)
106
+ f0_mel_min = 1127 * torch.log(1 + f0_min / 700)
107
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (
108
+ f0_mel_max - f0_mel_min
109
+ ) + 1
110
+
111
+ # use 0 or 1
112
+ f0_mel[f0_mel <= 1] = 1
113
+ f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
114
+ f0 = torch.round(f0_mel).long()
115
+ f0 = torch.clamp(f0, 1, 255)
116
+
117
+ return f0
118
+
119
+
120
+ class RVC:
121
+ """
122
+ RVC (Retrieval-based Voice Conversion) class for converting speech using a pre-trained model.
123
+
124
+ Args:
125
+ synthesizer (str | Synthesizer): The name of the pre-trained model or the model instance itself.
126
+ sr (int, optional): The sample rate of the input audio. Defaults to SR_48K.
127
+ segment_size (float, optional): The segment size for splitting the input audio. Defaults to 30.0 seconds.
128
+ hubert (str | HubertModel | None, optional): The name of the pre-trained Hubert model or the model instance itself. Defaults to None.
129
+ rmvpe (str | RMVPE | None, optional): The name of the pre-trained RMVPE model or the model instance itself. Defaults to None.
130
+ accelerator (Accelerator, optional): The accelerator device for model inference. Defaults to Accelerator().
131
+ from_pretrained_kwargs (dict, optional): Additional keyword arguments for loading the pre-trained model. Defaults to {}.
132
+
133
+ Methods:
134
+ from_pretrained(name, sr=SR_48K, hubert=None, rmvpe=None, accelerator=Accelerator(), **from_pretrained_kwargs):
135
+ Creates an instance of RVC using the from_pretrained method.
136
+
137
+ convert(audio, protect=0.33):
138
+ Converts the input audio to the target voice using the pre-trained model.
139
+
140
+ convert_dataset(dataset, protect=0.33):
141
+ Converts a dataset of audio samples to the target voice using the pre-trained model.
142
+
143
+ convert_file(audio, protect=0.33):
144
+ Converts a single audio file to the target voice using the pre-trained model.
145
+
146
+ convert_from_wav16k(wav16k, protect=0.33):
147
+ Converts a 16kHz waveform to the target voice using the pre-trained model.
148
+
149
+ convert_from_features(phone, pitchf, pitch, protect=0.33):
150
+ Converts audio features (phone, pitchf, pitch) to the target voice using the pre-trained model.
151
+ """
152
+
153
+ def __init__(
154
+ self,
155
+ synthesizer: str | Synthesizer,
156
+ hubert: HubertModel | None = None,
157
+ rmvpe: RMVPE | None = None,
158
+ sr=SR_48K,
159
+ segment_size=30.0,
160
+ accelerator: Accelerator | None = None,
161
+ from_pretrained_kwargs={},
162
+ ):
163
+ """
164
+ Initializes an instance of the RVC class.
165
+
166
+ Args:
167
+ synthesizer (str | Synthesizer): The name of the pre-trained model or the model instance itself.
168
+ hubert (str | HubertModel | None, optional): The name of the pre-trained Hubert model or the model instance itself. Defaults to None.
169
+ rmvpe (str | RMVPE | None, optional): The name of the pre-trained RMVPE model or the model instance itself. Defaults to None.
170
+ sr (int, optional): The sample rate of the input audio. Defaults to SR_48K.
171
+ segment_size (float, optional): The segment size for splitting the input audio. Defaults to 30.0 seconds.
172
+ accelerator (Accelerator, optional): The accelerator device for model inference. Defaults to Accelerator().
173
+ from_pretrained_kwargs (dict, optional): Additional keyword arguments for loading the pre-trained model. Defaults to {}.
174
+ """
175
+ accelerator = accelerator or Accelerator()
176
+ self.accelerator = accelerator
177
+
178
+ self.synthesizer = (
179
+ Synthesizer.from_pretrained(synthesizer, **from_pretrained_kwargs)
180
+ if isinstance(synthesizer, str)
181
+ else synthesizer
182
+ )
183
+ self.synthesizer = self.synthesizer.to(accelerator.device)
184
+
185
+ hubert = hubert or load_hubert()
186
+ rmvpe = rmvpe or load_rmvpe()
187
+ self.feature_extractor = FeatureExtractor(hubert, rmvpe)
188
+ self.feature_extractor = self.feature_extractor.to(accelerator.device)
189
+
190
+ self.sr = sr
191
+ self.segment_size = segment_size
192
+
193
+ @staticmethod
194
+ def from_pretrained(
195
+ name: str,
196
+ hubert: HubertModel | None = None,
197
+ rmvpe: RMVPE | None = None,
198
+ sr=SR_48K,
199
+ segment_size=30.0,
200
+ accelerator: Accelerator | None = None,
201
+ **from_pretrained_kwargs,
202
+ ):
203
+ """
204
+ Creates an instance of RVC using the from_pretrained method.
205
+
206
+ Args:
207
+ name (str): The name of the pre-trained model.
208
+ hubert (HubertModel | None, optional): The name of the pre-trained Hubert model or the model instance itself. Defaults to None.
209
+ rmvpe (RMVPE | None, optional): The name of the pre-trained RMVPE model or the model instance itself. Defaults to None.
210
+ sr (int, optional): The sample rate of the input audio. Defaults to SR_48K.
211
+ segment_size (float, optional): The segment size for splitting the input audio. Defaults to 30.0 seconds.
212
+ accelerator (Accelerator, optional): The accelerator device for model inference. Defaults to Accelerator().
213
+ from_pretrained_kwargs (dict): Additional keyword arguments for loading the pre-trained model.
214
+
215
+ Returns:
216
+ RVC: An instance of the RVC class.
217
+ """
218
+ return RVC(
219
+ name,
220
+ hubert=hubert,
221
+ rmvpe=rmvpe,
222
+ sr=sr,
223
+ segment_size=segment_size,
224
+ accelerator=accelerator,
225
+ from_pretrained_kwargs=from_pretrained_kwargs,
226
+ )
227
+
228
+ def convert(self, audio: str | Dataset | np.ndarray, pitch_modification=0.0):
229
+ """
230
+ Converts the input audio to the target voice using the pre-trained model.
231
+
232
+ Args:
233
+ audio (str | Dataset | np.ndarray): The input audio to be converted. It can be a file path, a dataset of audio samples, or a numpy array.
234
+ pitch_modification (float, optional): The pitch modification factor. Defaults to 0.0.
235
+
236
+ Returns:
237
+ np.ndarray: The converted audio in the target voice.
238
+ If the input is a dataset, it yields the converted audio samples one by one.
239
+ """
240
+ logger.info(f"audio: {audio}, pitch_modification: {pitch_modification}")
241
+ if isinstance(audio, str):
242
+ return self.convert_file(audio, pitch_modification=pitch_modification)
243
+ if isinstance(audio, Dataset):
244
+ return self.convert_dataset(audio, pitch_modification=pitch_modification)
245
+ return self.convert_from_wav16k(audio, pitch_modification=pitch_modification)
246
+
247
+ def convert_dataset(self, dataset: Dataset, pitch_modification=0.0):
248
+ """
249
+ Converts a dataset of audio samples to the target voice using the pre-trained model.
250
+
251
+ Args:
252
+ dataset (Dataset): The dataset of audio samples to be converted.
253
+ pitch_modification (float, optional): The pitch modification factor. Defaults to 0.0.
254
+
255
+ Yields:
256
+ np.ndarray: The converted audio samples in the target voice.
257
+ """
258
+ for i, data in enumerate(dataset):
259
+ logger.info(f"Converting data {i}")
260
+ phone = data["hubert_feats"]
261
+ pitchf = data["f0nsf"]
262
+ pitch = data["f0"]
263
+ yield self.convert_from_features(
264
+ phone=phone,
265
+ pitchf=pitchf,
266
+ pitch=pitch,
267
+ pitch_modification=pitch_modification,
268
+ )
269
+
270
+ def convert_file(self, audio: str, pitch_modification=0.0) -> np.ndarray:
271
+ """
272
+ Converts a single audio file to the target voice using the pre-trained model.
273
+
274
+ Args:
275
+ audio (str): The path to the audio file to be converted.
276
+ pitch_modification (float, optional): The pitch modification factor. Defaults to 0.0.
277
+
278
+ Returns:
279
+ np.ndarray: The converted audio in the target voice.
280
+ """
281
+ wav16k, _ = librosa.load(audio, sr=SR_16K)
282
+ logger.info(f"Loaded {audio} with shape {wav16k.shape}")
283
+ return self.convert_from_wav16k(wav16k, pitch_modification=pitch_modification)
284
+
285
+ @torch.no_grad()
286
+ def convert_from_wav16k(
287
+ self, wav16k: np.ndarray, pitch_modification=0.0
288
+ ) -> np.ndarray:
289
+ """
290
+ Converts a 16kHz waveform to the target voice using the pre-trained model.
291
+
292
+ Args:
293
+ wav16k (np.ndarray): The 16kHz waveform to be converted.
294
+ pitch_modification (float, optional): The pitch modification factor. Defaults to 0.0.
295
+
296
+ Returns:
297
+ np.ndarray: The converted audio in the target voice.
298
+ """
299
+ self.feature_extractor.eval()
300
+ feature_extractor_device = next(self.feature_extractor.parameters()).device
301
+
302
+ ret = []
303
+ segment_size = int(self.segment_size * SR_16K)
304
+ for i in range(0, len(wav16k), segment_size):
305
+ segment = wav16k[i : i + segment_size]
306
+ segment = np.pad(segment, (SR_16K, SR_16K), mode="reflect")
307
+ logger.info(f"Padded audio with shape {segment.shape}")
308
+
309
+ phone, phone_lengths, pitch, pitchf = self.feature_extractor(
310
+ torch.from_numpy(segment)
311
+ .unsqueeze(0)
312
+ .to(device=feature_extractor_device),
313
+ pitch_modification,
314
+ )
315
+ logger.info(f"{phone.shape=}, {phone_lengths=}, {pitch.shape=}, {pitchf.shape=}")
316
+
317
+ ret.append(
318
+ self.convert_from_features(phone, pitchf, pitch)[self.sr : -self.sr]
319
+ )
320
+
321
+ return np.concatenate(ret)
322
+
323
+ @torch.no_grad()
324
+ def convert_from_features(
325
+ self,
326
+ phone: np.ndarray | torch.Tensor,
327
+ pitchf: np.ndarray | torch.Tensor,
328
+ pitch: np.ndarray | torch.Tensor,
329
+ ) -> np.ndarray:
330
+ """
331
+ Converts audio features (phone, pitchf, pitch) to the target voice using the pre-trained model.
332
+
333
+ Args:
334
+ phone (np.ndarray): The phone features of the audio.
335
+ pitchf (np.ndarray): The pitch features of the audio.
336
+ pitch (np.ndarray): The pitch values of the audio.
337
+
338
+ Returns:
339
+ np.ndarray: The converted audio in the target voice.
340
+ """
341
+ self.synthesizer.eval()
342
+ synthesizer_device = next(self.synthesizer.parameters()).device
343
+
344
+ if isinstance(phone, np.ndarray):
345
+ phone = torch.from_numpy(phone).to(device=synthesizer_device)
346
+ if isinstance(pitchf, np.ndarray):
347
+ pitchf = torch.from_numpy(pitchf).to(device=synthesizer_device)
348
+ if isinstance(pitch, np.ndarray):
349
+ pitch = torch.from_numpy(pitch).to(device=synthesizer_device)
350
+
351
+ if phone.dim() == 2:
352
+ phone = phone.unsqueeze(0)
353
+ if pitchf.dim() == 1:
354
+ pitchf = pitchf.unsqueeze(0)
355
+ if pitch.dim() == 1:
356
+ pitch = pitch.unsqueeze(0)
357
+
358
+ sid = torch.tensor([0], device=synthesizer_device, dtype=torch.int32)
359
+
360
+ audio_segment = (
361
+ self.synthesizer(phone, pitch, pitchf, sid).squeeze().cpu().float().numpy()
362
+ )
363
+ logger.info(
364
+ f"Generated audio shape: {audio_segment.shape} {audio_segment.dtype}"
365
+ )
366
+ return audio_segment
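End to end, RVC loads the synthesizer plus HuBERT and RMVPE, cuts the input into 30 s segments with 1 s of reflect padding on each side, extracts features, and renders 48 kHz audio. A conversion sketch; the repository id and file names are hypothetical placeholders, and the top-level RVC export is assumed (otherwise import from zerorvc.rvc):

    import soundfile as sf
    from zerorvc import RVC  # assumed re-export; otherwise: from zerorvc.rvc import RVC

    rvc = RVC.from_pretrained("someone/my-rvc-voice")       # hypothetical Hub repo with Synthesizer weights
    out = rvc.convert("input.wav", pitch_modification=0.0)  # hypothetical input file
    sf.write("converted.wav", out, 48000)                   # SR_48K output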
zerorvc/synthesizer/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .models import SynthesizerTrnMs768NSFsid, MultiPeriodDiscriminator
zerorvc/synthesizer/attentions.py ADDED
@@ -0,0 +1,493 @@
1
+ import math
2
+ from typing import Optional
3
+
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ from . import commons
9
+ from .modules import LayerNorm
10
+
11
+
12
+ class Encoder(nn.Module):
13
+ def __init__(
14
+ self,
15
+ hidden_channels: int,
16
+ filter_channels: int,
17
+ n_heads: int,
18
+ n_layers: int,
19
+ kernel_size=1,
20
+ p_dropout=0.0,
21
+ window_size=10,
22
+ ):
23
+ super().__init__()
24
+ self.hidden_channels = hidden_channels
25
+ self.filter_channels = filter_channels
26
+ self.n_heads = n_heads
27
+ self.n_layers = int(n_layers)
28
+ self.kernel_size = kernel_size
29
+ self.p_dropout = p_dropout
30
+ self.window_size = window_size
31
+
32
+ self.drop = nn.Dropout(p_dropout)
33
+ self.attn_layers = nn.ModuleList()
34
+ self.norm_layers_1 = nn.ModuleList()
35
+ self.ffn_layers = nn.ModuleList()
36
+ self.norm_layers_2 = nn.ModuleList()
37
+ for i in range(self.n_layers):
38
+ self.attn_layers.append(
39
+ MultiHeadAttention(
40
+ hidden_channels,
41
+ hidden_channels,
42
+ n_heads,
43
+ p_dropout=p_dropout,
44
+ window_size=window_size,
45
+ )
46
+ )
47
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
48
+ self.ffn_layers.append(
49
+ FFN(
50
+ hidden_channels,
51
+ hidden_channels,
52
+ filter_channels,
53
+ kernel_size,
54
+ p_dropout=p_dropout,
55
+ )
56
+ )
57
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
58
+
59
+ def forward(self, x, x_mask):
60
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
61
+ x = x * x_mask
62
+ zippep = zip(
63
+ self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2
64
+ )
65
+ for attn_layers, norm_layers_1, ffn_layers, norm_layers_2 in zippep:
66
+ y = attn_layers(x, x, attn_mask)
67
+ y = self.drop(y)
68
+ x = norm_layers_1(x + y)
69
+
70
+ y = ffn_layers(x, x_mask)
71
+ y = self.drop(y)
72
+ x = norm_layers_2(x + y)
73
+ x = x * x_mask
74
+ return x
75
+
76
+
77
+ class Decoder(nn.Module):
78
+ def __init__(
79
+ self,
80
+ hidden_channels: int,
81
+ filter_channels: int,
82
+ n_heads: int,
83
+ n_layers: int,
84
+ kernel_size=1,
85
+ p_dropout=0.0,
86
+ proximal_bias=False,
87
+ proximal_init=True,
88
+ ):
89
+ super().__init__()
90
+ self.hidden_channels = hidden_channels
91
+ self.filter_channels = filter_channels
92
+ self.n_heads = n_heads
93
+ self.n_layers = n_layers
94
+ self.kernel_size = kernel_size
95
+ self.p_dropout = p_dropout
96
+ self.proximal_bias = proximal_bias
97
+ self.proximal_init = proximal_init
98
+
99
+ self.drop = nn.Dropout(p_dropout)
100
+ self.self_attn_layers = nn.ModuleList()
101
+ self.norm_layers_0 = nn.ModuleList()
102
+ self.encdec_attn_layers = nn.ModuleList()
103
+ self.norm_layers_1 = nn.ModuleList()
104
+ self.ffn_layers = nn.ModuleList()
105
+ self.norm_layers_2 = nn.ModuleList()
106
+ for i in range(self.n_layers):
107
+ self.self_attn_layers.append(
108
+ MultiHeadAttention(
109
+ hidden_channels,
110
+ hidden_channels,
111
+ n_heads,
112
+ p_dropout=p_dropout,
113
+ proximal_bias=proximal_bias,
114
+ proximal_init=proximal_init,
115
+ )
116
+ )
117
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
118
+ self.encdec_attn_layers.append(
119
+ MultiHeadAttention(
120
+ hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
121
+ )
122
+ )
123
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
124
+ self.ffn_layers.append(
125
+ FFN(
126
+ hidden_channels,
127
+ hidden_channels,
128
+ filter_channels,
129
+ kernel_size,
130
+ p_dropout=p_dropout,
131
+ causal=True,
132
+ )
133
+ )
134
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
135
+
136
+ def forward(
137
+ self,
138
+ x: torch.Tensor,
139
+ x_mask: torch.Tensor,
140
+ h: torch.Tensor,
141
+ h_mask: torch.Tensor,
142
+ ):
143
+ """
144
+ x: decoder input
145
+ h: encoder output
146
+ """
147
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
148
+ device=x.device, dtype=x.dtype
149
+ )
150
+ encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
151
+ x = x * x_mask
152
+ for i in range(self.n_layers):
153
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
154
+ y = self.drop(y)
155
+ x = self.norm_layers_0[i](x + y)
156
+
157
+ y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
158
+ y = self.drop(y)
159
+ x = self.norm_layers_1[i](x + y)
160
+
161
+ y = self.ffn_layers[i](x, x_mask)
162
+ y = self.drop(y)
163
+ x = self.norm_layers_2[i](x + y)
164
+ x = x * x_mask
165
+ return x
166
+
167
+
168
+ class MultiHeadAttention(nn.Module):
169
+ def __init__(
170
+ self,
171
+ channels: int,
172
+ out_channels: int,
173
+ n_heads: int,
174
+ p_dropout=0.0,
175
+ window_size: int = None,
176
+ heads_share=True,
177
+ block_length: int = None,
178
+ proximal_bias=False,
179
+ proximal_init=False,
180
+ ):
181
+ super().__init__()
182
+ assert channels % n_heads == 0
183
+
184
+ self.channels = channels
185
+ self.out_channels = out_channels
186
+ self.n_heads = n_heads
187
+ self.p_dropout = p_dropout
188
+ self.window_size = window_size
189
+ self.heads_share = heads_share
190
+ self.block_length = block_length
191
+ self.proximal_bias = proximal_bias
192
+ self.proximal_init = proximal_init
193
+ self.attn = None
194
+
195
+ self.k_channels = channels // n_heads
196
+ self.conv_q = nn.Conv1d(channels, channels, 1)
197
+ self.conv_k = nn.Conv1d(channels, channels, 1)
198
+ self.conv_v = nn.Conv1d(channels, channels, 1)
199
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
200
+ self.drop = nn.Dropout(p_dropout)
201
+
202
+ if window_size is not None:
203
+ n_heads_rel = 1 if heads_share else n_heads
204
+ rel_stddev = self.k_channels**-0.5
205
+ self.emb_rel_k = nn.Parameter(
206
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
207
+ * rel_stddev
208
+ )
209
+ self.emb_rel_v = nn.Parameter(
210
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
211
+ * rel_stddev
212
+ )
213
+
214
+ nn.init.xavier_uniform_(self.conv_q.weight)
215
+ nn.init.xavier_uniform_(self.conv_k.weight)
216
+ nn.init.xavier_uniform_(self.conv_v.weight)
217
+ if proximal_init:
218
+ with torch.no_grad():
219
+ self.conv_k.weight.copy_(self.conv_q.weight)
220
+ self.conv_k.bias.copy_(self.conv_q.bias)
221
+
222
+ def forward(
223
+ self, x: torch.Tensor, c: torch.Tensor, attn_mask: Optional[torch.Tensor] = None
224
+ ):
225
+ q = self.conv_q(x)
226
+ k = self.conv_k(c)
227
+ v = self.conv_v(c)
228
+
229
+ x, _ = self.attention(q, k, v, mask=attn_mask)
230
+
231
+ x = self.conv_o(x)
232
+ return x
233
+
234
+ def attention(
235
+ self,
236
+ query: torch.Tensor,
237
+ key: torch.Tensor,
238
+ value: torch.Tensor,
239
+ mask: Optional[torch.Tensor] = None,
240
+ ):
241
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
242
+ b, d, t_s = key.shape
243
+
244
+ if type(t_s) == int:
245
+ t_s = torch.tensor(t_s, device=key.device, dtype=torch.int32)
246
+
247
+ t_t = query.size(2)
248
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
249
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
250
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
251
+
252
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
253
+ if self.window_size is not None:
254
+ assert (
255
+ t_s == t_t
256
+ ), "Relative attention is only available for self-attention."
257
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
258
+ rel_logits = self._matmul_with_relative_keys(
259
+ query / math.sqrt(self.k_channels), key_relative_embeddings
260
+ )
261
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
262
+ scores = scores + scores_local
263
+ if self.proximal_bias:
264
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
265
+ scores = scores + self._attention_bias_proximal(t_s).to(
266
+ device=scores.device, dtype=scores.dtype
267
+ )
268
+ if mask is not None:
269
+ scores = scores.masked_fill(mask == 0, -1e4)
270
+ if self.block_length is not None:
271
+ assert (
272
+ t_s == t_t
273
+ ), "Local attention is only available for self-attention."
274
+ block_mask = (
275
+ torch.ones_like(scores)
276
+ .triu(-self.block_length)
277
+ .tril(self.block_length)
278
+ )
279
+ scores = scores.masked_fill(block_mask == 0, -1e4)
280
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
281
+ p_attn = self.drop(p_attn)
282
+ output = torch.matmul(p_attn, value)
283
+ if self.window_size is not None:
284
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
285
+ value_relative_embeddings = self._get_relative_embeddings(
286
+ self.emb_rel_v, t_s
287
+ )
288
+ output = output + self._matmul_with_relative_values(
289
+ relative_weights, value_relative_embeddings
290
+ )
291
+ output = (
292
+ output.transpose(2, 3).contiguous().view(b, d, t_t)
293
+ ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
294
+ return output, p_attn
295
+
296
+ def _matmul_with_relative_values(self, x: torch.Tensor, y: torch.Tensor):
297
+ """
298
+ x: [b, h, l, m]
299
+ y: [h or 1, m, d]
300
+ ret: [b, h, l, d]
301
+ """
302
+ ret = torch.matmul(x, y.unsqueeze(0))
303
+ return ret
304
+
305
+ def _matmul_with_relative_keys(self, x: torch.Tensor, y: torch.Tensor):
306
+ """
307
+ x: [b, h, l, d]
308
+ y: [h or 1, m, d]
309
+ ret: [b, h, l, m]
310
+ """
311
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
312
+ return ret
313
+
314
+ def _get_relative_embeddings(
315
+ self, relative_embeddings: torch.Tensor, length: torch.Tensor
316
+ ):
317
+ """
318
+ Get relative embeddings based on the input length.
319
+
320
+ Args:
321
+ relative_embeddings: Predefined relative embeddings [n_heads_rel, max_relative_position, d].
322
+ length: The length of the sequence as a tensor.
323
+
324
+ Returns:
325
+ Used relative embeddings [n_heads_rel, 2*length-1, d].
326
+ """
327
+ # Ensure `length` is a tensor
328
+ if not isinstance(length, torch.Tensor):
329
+ length = torch.as_tensor(
330
+ length, device=relative_embeddings.device, dtype=torch.int32
331
+ )
332
+
333
+ # Calculate padding dynamically using PyTorch operations
334
+ pad_length = torch.maximum(
335
+ length - (self.window_size + 1),
336
+ torch.zeros(1, device=length.device, dtype=length.dtype),
337
+ )
338
+ slice_start_position = torch.maximum(
339
+ (self.window_size + 1) - length,
340
+ torch.zeros(1, device=length.device, dtype=length.dtype),
341
+ )
342
+ slice_end_position = slice_start_position + 2 * length - 1
343
+
344
+ padded_relative_embeddings = F.pad(
345
+ relative_embeddings,
346
+ [
347
+ 0,
348
+ 0,
349
+ pad_length,
350
+ pad_length,
351
+ 0,
352
+ 0,
353
+ ],
354
+ )
355
+
356
+ used_relative_embeddings = padded_relative_embeddings[
357
+ :, slice_start_position:slice_end_position
358
+ ]
359
+ return used_relative_embeddings
360
+
361
+ def _relative_position_to_absolute_position(self, x: torch.Tensor):
362
+ """
363
+ x: [b, h, l, 2*l-1]
364
+ ret: [b, h, l, l]
365
+ """
366
+ batch, heads, length, _ = x.size()
367
+ # Concat columns of pad to shift from relative to absolute indexing.
368
+ x = F.pad(
369
+ x,
370
+ # commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])
371
+ [0, 1, 0, 0, 0, 0, 0, 0],
372
+ )
373
+
374
+ # Concat extra elements so to add up to shape (len+1, 2*len-1).
375
+ x_flat = x.view([batch, heads, length * 2 * length])
376
+ x_flat = F.pad(
377
+ x_flat,
378
+ # commons.convert_pad_shape([[0, 0], [0, 0], [0, int(length) - 1]])
379
+ [0, length - 1, 0, 0, 0, 0],
380
+ )
381
+
382
+ # Reshape and slice out the padded elements.
383
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
384
+ :, :, :length, length - 1 :
385
+ ]
386
+ return x_final
387
+
388
+ def _absolute_position_to_relative_position(self, x: torch.Tensor):
389
+ """
390
+ x: [b, h, l, l]
391
+ ret: [b, h, l, 2*l-1]
392
+ """
393
+ batch, heads, length, _ = x.size()
394
+ # pad along the column dimension
395
+ x = F.pad(
396
+ x,
397
+ # commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, int(length) - 1]])
398
+ [0, length - 1, 0, 0, 0, 0, 0, 0],
399
+ )
400
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
401
+ # add 0's in the beginning that will skew the elements after reshape
402
+ x_flat = F.pad(
403
+ x_flat,
404
+ # commons.convert_pad_shape([[0, 0], [0, 0], [int(length), 0]])
405
+ [length, 0, 0, 0, 0, 0],
406
+ )
407
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
408
+ return x_final
409
+
410
+ def _attention_bias_proximal(self, length: int):
411
+ """Bias for self-attention to encourage attention to close positions.
412
+ Args:
413
+ length: an integer scalar.
414
+ Returns:
415
+ a Tensor with shape [1, 1, length, length]
416
+ """
417
+ r = torch.arange(length, dtype=torch.float32)
418
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
419
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+ class FFN(nn.Module):
+     def __init__(
+         self,
+         in_channels: int,
+         out_channels: int,
+         filter_channels: int,
+         kernel_size: int,
+         p_dropout=0.0,
+         activation: str = None,
+         causal=False,
+     ):
+         super().__init__()
+         self.in_channels = in_channels
+         self.out_channels = out_channels
+         self.filter_channels = filter_channels
+         self.kernel_size = kernel_size
+         self.p_dropout = p_dropout
+         self.activation = activation
+         self.causal = causal
+         self.is_activation = True if activation == "gelu" else False
+         # if causal:
+         #     self.padding = self._causal_padding
+         # else:
+         #     self.padding = self._same_padding
+
+         self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
+         self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
+         self.drop = nn.Dropout(p_dropout)
+
+     def padding(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor:
+         if self.causal:
+             padding = self._causal_padding(x * x_mask)
+         else:
+             padding = self._same_padding(x * x_mask)
+         return padding
+
+     def forward(self, x: torch.Tensor, x_mask: torch.Tensor):
+         x = self.conv_1(self.padding(x, x_mask))
+         if self.is_activation:
+             x = x * torch.sigmoid(1.702 * x)
+         else:
+             x = torch.relu(x)
+         x = self.drop(x)
+
+         x = self.conv_2(self.padding(x, x_mask))
+         return x * x_mask
+
+     def _causal_padding(self, x: torch.Tensor):
+         if self.kernel_size == 1:
+             return x
+         pad_l: int = self.kernel_size - 1
+         pad_r: int = 0
+         # padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+         x = F.pad(
+             x,
+             # commons.convert_pad_shape(padding)
+             [pad_l, pad_r, 0, 0, 0, 0],
+         )
+         return x
+
+     def _same_padding(self, x: torch.Tensor):
+         if self.kernel_size == 1:
+             return x
+         pad_l: int = (self.kernel_size - 1) // 2
+         pad_r: int = self.kernel_size // 2
+         # padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+         x = F.pad(
+             x,
+             # commons.convert_pad_shape(padding)
+             [pad_l, pad_r, 0, 0, 0, 0],
+         )
+         return x
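FFN is the position-wise feed-forward block of the encoder: two Conv1d layers over the frame axis with either "same" or causal padding, an approximate GELU (x * sigmoid(1.702 * x)) or ReLU in between, and the mask re-applied to the output. A rough usage sketch, assuming the file above lives at zerorvc/synthesizer/attentions.py (the import path and channel sizes here are illustrative assumptions):

import torch
from zerorvc.synthesizer.attentions import FFN  # assumed module path

ffn = FFN(in_channels=192, out_channels=192, filter_channels=768,
          kernel_size=3, p_dropout=0.1, activation="gelu")
x = torch.randn(2, 192, 100)    # [batch, channels, frames]
x_mask = torch.ones(2, 1, 100)  # 1 = valid frame, 0 = padding
y = ffn(x, x_mask)
print(y.shape)                  # torch.Size([2, 192, 100])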
zerorvc/synthesizer/commons.py ADDED
@@ -0,0 +1,172 @@
+ from typing import List, Optional
+ import math
+
+ import torch
+ from torch.nn import functional as F
+
+
+ def init_weights(m, mean=0.0, std=0.01):
+     classname = m.__class__.__name__
+     if classname.find("Conv") != -1:
+         m.weight.data.normal_(mean, std)
+
+
+ def get_padding(kernel_size: int, dilation=1):
+     return int((kernel_size * dilation - dilation) / 2)
+
+
+ # def convert_pad_shape(pad_shape):
+ #     l = pad_shape[::-1]
+ #     pad_shape = [item for sublist in l for item in sublist]
+ #     return pad_shape
+
+
+ def kl_divergence(
+     m_p: torch.Tensor, logs_p: torch.Tensor, m_q: torch.Tensor, logs_q: torch.Tensor
+ ):
+     """KL(P||Q)"""
+     kl = (logs_q - logs_p) - 0.5
+     kl += (
+         0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
+     )
+     return kl
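kl_divergence above is the closed-form, element-wise KL between two diagonal Gaussians given their means and log standard deviations. A small sanity sketch (not part of the module) comparing it against torch.distributions:

import torch
from torch.distributions import Normal, kl_divergence as torch_kl

m_p, logs_p = torch.randn(4), torch.randn(4)
m_q, logs_q = torch.randn(4), torch.randn(4)
manual = (logs_q - logs_p) - 0.5 + 0.5 * (
    torch.exp(2.0 * logs_p) + (m_p - m_q) ** 2
) * torch.exp(-2.0 * logs_q)
reference = torch_kl(Normal(m_p, logs_p.exp()), Normal(m_q, logs_q.exp()))
assert torch.allclose(manual, reference, atol=1e-5)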
+
+
+ def rand_gumbel(shape):
+     """Sample from the Gumbel distribution, protect from overflows."""
+     uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
+     return -torch.log(-torch.log(uniform_samples))
+
+
+ def rand_gumbel_like(x: torch.Tensor):
+     g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
+     return g
+
+
+ def slice_segments(x: torch.Tensor, ids_str, segment_size=4):
+     ret = torch.zeros_like(x[:, :, :segment_size])
+     for i in range(x.size(0)):
+         idx_str = ids_str[i]
+         idx_end = idx_str + segment_size
+         ret[i] = x[i, :, idx_str:idx_end]
+     return ret
+
+
+ def slice_segments2(x: torch.Tensor, ids_str, segment_size=4):
+     ret = torch.zeros_like(x[:, :segment_size])
+     for i in range(x.size(0)):
+         idx_str = ids_str[i]
+         idx_end = idx_str + segment_size
+         ret[i] = x[i, idx_str:idx_end]
+     return ret
+
+
+ def rand_slice_segments(x: torch.Tensor, x_lengths=None, segment_size=4):
+     b, d, t = x.size()
+     if x_lengths is None:
+         x_lengths = t
+     ids_str_max = x_lengths - segment_size + 1
+     ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.int32)
+     ret = slice_segments(x, ids_str, segment_size)
+     return ret, ids_str
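rand_slice_segments picks a random window of segment_size frames from each item in the batch (respecting per-item lengths) and returns both the slices and their start indices, so a matching window can later be cut from another tensor with slice_segments or slice_segments2. An illustrative call with made-up sizes, assuming the functions above are in scope:

import torch

feats = torch.randn(2, 192, 400)        # [batch, channels, frames]
lengths = torch.tensor([400, 350])      # true number of frames per item
segments, ids_str = rand_slice_segments(feats, lengths, segment_size=32)
print(segments.shape)                   # torch.Size([2, 192, 32])
print(ids_str)                          # random start frame of each slice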
+
+
+ def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
+     position = torch.arange(length, dtype=torch.float)
+     num_timescales = channels // 2
+     log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
+         num_timescales - 1
+     )
+     inv_timescales = min_timescale * torch.exp(
+         torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
+     )
+     scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
+     signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
+     signal = F.pad(signal, [0, 0, 0, channels % 2])
+     signal = signal.view(1, channels, length)
+     return signal
+
+
+ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
+     b, channels, length = x.size()
+     signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+     return x + signal.to(dtype=x.dtype, device=x.device)
+
+
+ def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
+     b, channels, length = x.size()
+     signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+     return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
+
+
+ def subsequent_mask(length):
+     mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
+     return mask
+
+
+ @torch.jit.script
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+     n_channels_int = n_channels[0]
+     in_act = input_a + input_b
+     t_act = torch.tanh(in_act[:, :n_channels_int, :])
+     s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+     acts = t_act * s_act
+     return acts
+
+
+ # def convert_pad_shape(pad_shape):
+ #     l = pad_shape[::-1]
+ #     pad_shape = [item for sublist in l for item in sublist]
+ #     return pad_shape
+
+
+ def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]:
+     return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist()
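convert_pad_shape turns a per-dimension [before, after] padding spec into the flat, last-dimension-first list that F.pad expects. For example:

# "pad the last dim by 1 on the left, leave the first two dims alone"
convert_pad_shape([[0, 0], [0, 0], [1, 0]])  # -> [1, 0, 0, 0, 0, 0]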
+
+
+ def shift_1d(x):
+     x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
+     return x
+
+
+ def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None):
+     if max_length is None:
+         max_length = length.max()
+     x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+     return x.unsqueeze(0) < length.unsqueeze(1)
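sequence_mask expands per-item lengths into a boolean padding mask, True at valid positions. For example, with the function above in scope:

import torch

sequence_mask(torch.tensor([2, 4]), max_length=4)
# tensor([[ True,  True, False, False],
#         [ True,  True,  True,  True]])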
+
+
+ def generate_path(duration, mask):
+     """
+     duration: [b, 1, t_x]
+     mask: [b, 1, t_y, t_x]
+     """
+     device = duration.device
+
+     b, _, t_y, t_x = mask.shape
+     cum_duration = torch.cumsum(duration, -1)
+
+     cum_duration_flat = cum_duration.view(b * t_x)
+     path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+     path = path.view(b, t_x, t_y)
+     path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+     path = path.unsqueeze(1).transpose(2, 3) * mask
+     return path
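generate_path expands integer durations into a hard monotonic alignment between t_x input positions and t_y output frames: each input position is assigned a contiguous block of duration[i] frames. A small illustrative example, assuming the helpers above are in scope:

import torch

duration = torch.tensor([[[2.0, 1.0, 3.0]]])   # [b=1, 1, t_x=3]
mask = torch.ones(1, 1, 6, 3)                  # [b, 1, t_y=6, t_x=3]
path = generate_path(duration, mask)
print(path[0, 0].T)
# tensor([[1., 1., 0., 0., 0., 0.],
#         [0., 0., 1., 0., 0., 0.],
#         [0., 0., 0., 1., 1., 1.]])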
+
+
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
+     if isinstance(parameters, torch.Tensor):
+         parameters = [parameters]
+     parameters = list(filter(lambda p: p.grad is not None, parameters))
+     norm_type = float(norm_type)
+     if clip_value is not None:
+         clip_value = float(clip_value)
+
+     total_norm = 0
+     for p in parameters:
+         param_norm = p.grad.data.norm(norm_type)
+         total_norm += param_norm.item() ** norm_type
+         if clip_value is not None:
+             p.grad.data.clamp_(min=-clip_value, max=clip_value)
+     total_norm = total_norm ** (1.0 / norm_type)
+     return total_norm
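clip_grad_value_ clamps every gradient element to [-clip_value, clip_value] and, as a by-product, returns the pre-clipping gradient norm, which is convenient for logging. A minimal sketch with a throwaway model, assuming the function above is in scope:

import torch

model = torch.nn.Linear(4, 2)
loss = model(torch.randn(8, 4)).pow(2).mean()
loss.backward()
grad_norm = clip_grad_value_(model.parameters(), clip_value=1.0)
print(grad_norm)  # total L2 norm of the gradients before clamping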