Spaces:

r3gm
/

AICoverGen

Running on Zero

App Files Files Community

r3gm committed 3 days ago

Commit

f5db5c5

verified ·

1 Parent(s): a83ce13

Upload 10 files

Browse files

Files changed (10) hide show

README.md +12 -12
app.py +4 -7
pre-requirements.txt +2 -1
requirements.txt +7 -5
src/download_models.py +12 -5
src/infer_pack/models.py +4 -4
src/main.py +112 -27
src/mdx.py +7 -3
src/rvc.py +15 -4
src/webui.py +82 -27

README.md CHANGED Viewed

@@ -1,13 +1,13 @@
----
-title: AICoverGen
-emoji: 🚀
-colorFrom: red
-colorTo: pink
-sdk: gradio
-sdk_version: 5.35.0
-app_file: app.py
-pinned: false
-license: mit
----
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: AICoverGen
+emoji: 🚀
+colorFrom: red
+colorTo: pink
+sdk: gradio
+sdk_version: 5.44.0
+app_file: app.py
+pinned: false
+license: mit
+---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,12 +1,9 @@
 import os
-cmd = """
-pip install onnxruntime-gpu[cuda,cudnn]==1.22.0
-find / -name 'libcudnn.so*' 2>/dev/null
-python src/download_models.py
-python src/webui.py
-"""
 os.system(cmd)

 import os
+import sys
+os.system("python src/download_models.py")
+args = " ".join(sys.argv[1:])
+cmd = f"python src/webui.py {args}"
 os.system(cmd)

pre-requirements.txt CHANGED Viewed

	@@ -1 +1,2 @@
1	- pip<=23.1.2


1	+ pip==23.0.1
2	+ Setuptools<=80.6.0

requirements.txt CHANGED Viewed

@@ -1,4 +1,3 @@
---extra-index-url=https://download.pytorch.org/whl/cu121
 torch==2.5.1
 torchvision==0.20.1
 torchaudio==2.5.1
@@ -6,11 +5,11 @@ deemix
 fairseq==0.12.2
 faiss-cpu==1.7.3
 ffmpeg-python>=0.2.0
-# gradio==3.39.0
 lib==4.0.0
 librosa==0.9.1
 numpy==1.23.5
-# onnxruntime #onnxruntime_gpu
 praat-parselmouth>=0.4.2
 pedalboard==0.7.7
 pydub==0.25.1
@@ -20,5 +19,8 @@ scipy==1.11.1
 soundfile==0.12.1
 torchcrepe==0.0.20
 tqdm==4.65.0
-yt_dlp==2023.7.6
-sox==1.4.1

 torch==2.5.1
 torchvision==0.20.1
 torchaudio==2.5.1
 fairseq==0.12.2
 faiss-cpu==1.7.3
 ffmpeg-python>=0.2.0
+gradio==5.44.0
 lib==4.0.0
 librosa==0.9.1
 numpy==1.23.5
+onnxruntime-gpu==1.22.0 # onnxruntime #onnxruntime_gpu
 praat-parselmouth>=0.4.2
 pedalboard==0.7.7
 pydub==0.25.1
 soundfile==0.12.1
 torchcrepe==0.0.20
 tqdm==4.65.0
+yt_dlp
+sox==1.4.1
+noisereduce
+spaces
+matplotlib-inline

src/download_models.py CHANGED Viewed

@@ -8,11 +8,20 @@ BASE_DIR = Path(__file__).resolve().parent.parent
 mdxnet_models_dir = BASE_DIR / 'mdxnet_models'
 rvc_models_dir = BASE_DIR / 'rvc_models'
 def dl_model(link, model_name, dir_name):
-    with requests.get(f'{link}{model_name}') as r:
         r.raise_for_status()
-        with open(dir_name / model_name, 'wb') as f:
             for chunk in r.iter_content(chunk_size=8192):
                 f.write(chunk)
@@ -20,12 +29,10 @@ def dl_model(link, model_name, dir_name):
 if __name__ == '__main__':
     mdx_model_names = ['UVR-MDX-NET-Inst_HQ_4.onnx', 'UVR-MDX-NET-Voc_FT.onnx', 'UVR_MDXNET_KARA_2.onnx', 'Reverb_HQ_By_FoxJoy.onnx']
     for model in mdx_model_names:
-        print(f'Downloading {model}...')
         dl_model(MDX_DOWNLOAD_LINK, model, mdxnet_models_dir)
     rvc_model_names = ['hubert_base.pt', 'rmvpe.pt']
     for model in rvc_model_names:
-        print(f'Downloading {model}...')
         dl_model(RVC_DOWNLOAD_LINK, model, rvc_models_dir)
-    print('All models downloaded!')

 mdxnet_models_dir = BASE_DIR / 'mdxnet_models'
 rvc_models_dir = BASE_DIR / 'rvc_models'
+mdxnet_models_dir.mkdir(parents=True, exist_ok=True)
+rvc_models_dir.mkdir(parents=True, exist_ok=True)
 def dl_model(link, model_name, dir_name):
+    model_path = dir_name / model_name
+    if model_path.exists():
+        # print(f"{model_name} already exists, skipping download.")
+        return
+    print(f"Downloading {model_name}...")
+    with requests.get(f'{link}{model_name}', stream=True) as r:
         r.raise_for_status()
+        with open(model_path, 'wb') as f:
             for chunk in r.iter_content(chunk_size=8192):
                 f.write(chunk)
 if __name__ == '__main__':
     mdx_model_names = ['UVR-MDX-NET-Inst_HQ_4.onnx', 'UVR-MDX-NET-Voc_FT.onnx', 'UVR_MDXNET_KARA_2.onnx', 'Reverb_HQ_By_FoxJoy.onnx']
     for model in mdx_model_names:
         dl_model(MDX_DOWNLOAD_LINK, model, mdxnet_models_dir)
     rvc_model_names = ['hubert_base.pt', 'rmvpe.pt']
     for model in rvc_model_names:
         dl_model(RVC_DOWNLOAD_LINK, model, rvc_models_dir)
+    print('All models ready!')

src/infer_pack/models.py CHANGED Viewed

@@ -607,7 +607,7 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
         )
         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
     def remove_weight_norm(self):
         self.dec.remove_weight_norm()
@@ -718,7 +718,7 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
         )
         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
     def remove_weight_norm(self):
         self.dec.remove_weight_norm()
@@ -826,7 +826,7 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
         )
         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
     def remove_weight_norm(self):
         self.dec.remove_weight_norm()
@@ -928,7 +928,7 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
         )
         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
     def remove_weight_norm(self):
         self.dec.remove_weight_norm()

             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
         )
         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        # print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
     def remove_weight_norm(self):
         self.dec.remove_weight_norm()
             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
         )
         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        # print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
     def remove_weight_norm(self):
         self.dec.remove_weight_norm()
             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
         )
         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        # print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
     def remove_weight_norm(self):
         self.dec.remove_weight_norm()
             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
         )
         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        # print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
     def remove_weight_norm(self):
         self.dec.remove_weight_norm()

src/main.py CHANGED Viewed

@@ -9,6 +9,8 @@ import shlex
 import subprocess
 from contextlib import suppress
 from urllib.parse import urlparse, parse_qs
 import gradio as gr
 import librosa
@@ -19,6 +21,7 @@ import yt_dlp
 from pedalboard import Pedalboard, Reverb, Compressor, HighpassFilter
 from pedalboard.io import AudioFile
 from pydub import AudioSegment
 from mdx import run_mdx
 from rvc import Config, load_hubert, get_vc, rvc_infer
@@ -27,12 +30,29 @@ import logging
 logging.getLogger("httpx").setLevel(logging.WARNING)
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 mdxnet_models_dir = os.path.join(BASE_DIR, 'mdxnet_models')
 rvc_models_dir = os.path.join(BASE_DIR, 'rvc_models')
 output_dir = os.path.join(BASE_DIR, 'song_output')
 def get_youtube_video_id(url, ignore_playlist=True):
     """
     Examples:
@@ -68,6 +88,9 @@ def get_youtube_video_id(url, ignore_playlist=True):
 def yt_download(link):
     ydl_opts = {
         'format': 'bestaudio',
         'outtmpl': '%(title)s',
@@ -95,12 +118,12 @@ def raise_exception(error_msg, is_webui):
 def get_rvc_model(voice_model, is_webui):
     rvc_model_filename, rvc_index_filename = None, None
     model_dir = os.path.join(rvc_models_dir, voice_model)
-    print(model_dir)
     for file in os.listdir(model_dir):
-        print(file)
         if os.path.isdir(file):
             for ff in os.listdir(file):
-                print("subfile", ff)
                 ext = os.path.splitext(ff)[1]
                 if ext == '.pth':
                     rvc_model_filename = ff
@@ -136,9 +159,21 @@ def get_audio_paths(song_dir):
         elif file.endswith('_Vocals_Backup.wav'):
             backup_vocals_path = os.path.join(song_dir, file)
     return orig_song_path, instrumentals_path, main_vocals_dereverb_path, backup_vocals_path
 def convert_to_stereo(audio_path):
     wave, sr = librosa.load(audio_path, mono=False, sr=44100)
@@ -216,7 +251,7 @@ hubert_model = load_hubert("cuda", config.is_half, os.path.join(rvc_models_dir,
 print(device, "half>>", config.is_half)
 # @spaces.GPU(enable_queue=True)
-def voice_change(voice_model, vocals_path, output_path, pitch_change, f0_method, index_rate, filter_radius, rms_mix_rate, protect, crepe_hop_length, is_webui):
     rvc_model_path, rvc_index_path = get_rvc_model(voice_model, is_webui)
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -227,8 +262,8 @@ def voice_change(voice_model, vocals_path, output_path, pitch_change, f0_method,
     # convert main vocals
     global hubert_model
-    rvc_infer(rvc_index_path, index_rate, vocals_path, output_path, pitch_change, f0_method, cpt, version, net_g, filter_radius, tgt_sr, rms_mix_rate, protect, crepe_hop_length, vc, hubert_model)
-    del hubert_model, cpt
     gc.collect()
@@ -267,9 +302,9 @@ def combine_audio(audio_paths, output_path, main_gain, backup_gain, inst_gain, o
 def process_song(
     song_dir, song_input, mdx_model_params, song_id, is_webui, input_type, progress,
     keep_files, pitch_change, pitch_change_all, voice_model, index_rate, filter_radius,
-    rms_mix_rate, protect, f0_method, crepe_hop_length, output_format, keep_orig, orig_song_path
 ):
     if not os.path.exists(song_dir):
         os.makedirs(song_dir)
         orig_song_path, vocals_path, instrumentals_path, main_vocals_path, backup_vocals_path, main_vocals_dereverb_path = preprocess_song(song_input, mdx_model_params, song_id, is_webui, input_type, progress, keep_orig, orig_song_path)
@@ -278,29 +313,72 @@ def process_song(
         paths = get_audio_paths(song_dir)
         # if any of the audio files aren't available or keep intermediate files, rerun preprocess
-        if any(path is None for path in paths) or keep_files:
             orig_song_path, vocals_path, instrumentals_path, main_vocals_path, backup_vocals_path, main_vocals_dereverb_path = preprocess_song(song_input, mdx_model_params, song_id, is_webui, input_type, progress, keep_orig, orig_song_path)
         else:
             orig_song_path, instrumentals_path, main_vocals_dereverb_path, backup_vocals_path = paths
     pitch_change = pitch_change * 12 + pitch_change_all
-    ai_vocals_path = os.path.join(song_dir, f'{os.path.splitext(os.path.basename(orig_song_path))[0]}_{voice_model}_p{pitch_change}_i{index_rate}_fr{filter_radius}_rms{rms_mix_rate}_pro{protect}_{f0_method}{"" if f0_method != "mangio-crepe" else f"_{crepe_hop_length}"}.wav')
     ai_cover_path = os.path.join(song_dir, f'{os.path.splitext(os.path.basename(orig_song_path))[0]} ({voice_model} Ver).{output_format}')
     if not os.path.exists(ai_vocals_path):
         display_progress('[~] Converting voice using RVC...', 0.5, is_webui, progress)
-        voice_change(voice_model, main_vocals_dereverb_path, ai_vocals_path, pitch_change, f0_method, index_rate, filter_radius, rms_mix_rate, protect, crepe_hop_length, is_webui)
     return ai_vocals_path, ai_cover_path, instrumentals_path, backup_vocals_path, vocals_path, main_vocals_path
-# process_song.zerogpu = True
 # @spaces.GPU(duration=140)
 def song_cover_pipeline(song_input, voice_model, pitch_change, keep_files,
                         is_webui=0, main_gain=0, backup_gain=0, inst_gain=0, index_rate=0.5, filter_radius=3,
                         rms_mix_rate=0.25, f0_method='rmvpe', crepe_hop_length=128, protect=0.33, pitch_change_all=0,
                         reverb_rm_size=0.15, reverb_wet=0.2, reverb_dry=0.8, reverb_damping=0.7, output_format='mp3',
                         progress=gr.Progress()):
     try:
         if not song_input or not voice_model:
             raise_exception('Ensure that the song input field and voice model field is filled.', is_webui)
@@ -334,9 +412,8 @@ def song_cover_pipeline(song_input, voice_model, pitch_change, keep_files,
         keep_orig, orig_song_path = get_audio_file(song_input, is_webui, input_type, progress)
         orig_song_path = convert_to_stereo(orig_song_path)
-        import time
         start = time.time()
         (
             ai_vocals_path,
             ai_cover_path,
@@ -365,6 +442,7 @@ def song_cover_pipeline(song_input, voice_model, pitch_change, keep_files,
             output_format,
             keep_orig,
             orig_song_path,
         )
         end = time.time()
@@ -374,20 +452,27 @@ def song_cover_pipeline(song_input, voice_model, pitch_change, keep_files,
         print(f"Audio duration: {duration__:.2f} seconds")
         display_progress('[~] Applying audio effects to Vocals...', 0.8, is_webui, progress)
         ai_vocals_mixed_path = add_audio_effects(ai_vocals_path, reverb_rm_size, reverb_wet, reverb_dry, reverb_damping)
-        instrumentals_path, _ = run_mdx(
-            mdx_model_params,
-            os.path.join(output_dir, song_id),
-            os.path.join(mdxnet_models_dir, "UVR-MDX-NET-Inst_HQ_4.onnx"),
-            instrumentals_path,
-            # exclude_main=False,
-            exclude_inversion=True,
-            suffix="Voiceless",
-            denoise=False,
-            keep_orig=True,
-            base_device=""
-        )
         if pitch_change_all != 0:
             display_progress('[~] Applying overall pitch change', 0.85, is_webui, progress)
@@ -399,7 +484,7 @@ def song_cover_pipeline(song_input, voice_model, pitch_change, keep_files,
         if not keep_files:
             display_progress('[~] Removing intermediate audio files...', 0.95, is_webui, progress)
-            intermediate_files = [vocals_path, main_vocals_path, ai_vocals_mixed_path]
             if pitch_change_all != 0:
                 intermediate_files += [instrumentals_path, backup_vocals_path]
             for file in intermediate_files:

 import subprocess
 from contextlib import suppress
 from urllib.parse import urlparse, parse_qs
+import time
+import shutil
 import gradio as gr
 import librosa
 from pedalboard import Pedalboard, Reverb, Compressor, HighpassFilter
 from pedalboard.io import AudioFile
 from pydub import AudioSegment
+import noisereduce as nr
 from mdx import run_mdx
 from rvc import Config, load_hubert, get_vc, rvc_infer
 logging.getLogger("httpx").setLevel(logging.WARNING)
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+IS_ZERO_GPU = os.getenv("SPACES_ZERO_GPU")
 mdxnet_models_dir = os.path.join(BASE_DIR, 'mdxnet_models')
 rvc_models_dir = os.path.join(BASE_DIR, 'rvc_models')
 output_dir = os.path.join(BASE_DIR, 'song_output')
def clean_old_folders(base_path: str, max_age_seconds: int = 10800):
    """Delete immediate sub-folders of ``base_path`` that have gone stale.

    A sub-folder is removed (recursively) when its own modification time is
    more than ``max_age_seconds`` in the past. Plain files directly inside
    ``base_path`` are never touched.

    Args:
        base_path: Directory whose sub-folders are inspected.
        max_age_seconds: Age threshold in seconds.

    Returns:
        None. If ``base_path`` is not a directory a message is printed and
        nothing else happens.
    """
    if not os.path.isdir(base_path):
        print(f"Error: {base_path} is not a valid directory.")
        return

    now = time.time()
    for folder_name in os.listdir(base_path):
        folder_path = os.path.join(base_path, folder_name)
        # Housekeeping is best-effort: a folder may disappear between the
        # listing and the checks below (e.g. another request cleaning up at the
        # same time), and one undeletable folder must not stop the rest of the
        # sweep or abort the caller.
        try:
            if not os.path.isdir(folder_path):
                continue
            last_modified = os.path.getmtime(folder_path)
            if now - last_modified > max_age_seconds:
                shutil.rmtree(folder_path)
        except OSError as exc:
            print(f"Warning: could not clean {folder_path}: {exc}")
 def get_youtube_video_id(url, ignore_playlist=True):
     """
     Examples:
 def yt_download(link):
+    if not link.strip():
+        gr.Info("You need to provide a download link.")
+        return None
     ydl_opts = {
         'format': 'bestaudio',
         'outtmpl': '%(title)s',
 def get_rvc_model(voice_model, is_webui):
     rvc_model_filename, rvc_index_filename = None, None
     model_dir = os.path.join(rvc_models_dir, voice_model)
+    # print(model_dir)
     for file in os.listdir(model_dir):
+        # print(file)
         if os.path.isdir(file):
             for ff in os.listdir(file):
+                # print("subfile", ff)
                 ext = os.path.splitext(ff)[1]
                 if ext == '.pth':
                     rvc_model_filename = ff
         elif file.endswith('_Vocals_Backup.wav'):
             backup_vocals_path = os.path.join(song_dir, file)
+    # print(orig_song_path, instrumentals_path, main_vocals_dereverb_path, backup_vocals_path)
     return orig_song_path, instrumentals_path, main_vocals_dereverb_path, backup_vocals_path
def get_audio_with_suffix(song_dir, suffix="_mysuffix.wav"):
    """Return the path of a file in ``song_dir`` whose name ends with ``suffix``.

    The order of a directory listing depends on the OS and file system, so
    when several files match, the alphabetically first name is chosen to make
    the result the same on every call.

    Args:
        song_dir: Directory to search (not recursive).
        suffix: Required ending of the file name.

    Returns:
        ``os.path.join(song_dir, name)`` for the selected file, or ``None``
        when no file name ends with ``suffix``.
    """
    first_match = min(
        (name for name in os.listdir(song_dir) if name.endswith(suffix)),
        default=None,
    )
    if first_match is None:
        return None
    return os.path.join(song_dir, first_match)
 def convert_to_stereo(audio_path):
     wave, sr = librosa.load(audio_path, mono=False, sr=44100)
 print(device, "half>>", config.is_half)
 # @spaces.GPU(enable_queue=True)
+def voice_change(voice_model, vocals_path, output_path, pitch_change, f0_method, index_rate, filter_radius, rms_mix_rate, protect, crepe_hop_length, is_webui, steps):
     rvc_model_path, rvc_index_path = get_rvc_model(voice_model, is_webui)
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     # convert main vocals
     global hubert_model
+    rvc_infer(rvc_index_path, index_rate, vocals_path, output_path, pitch_change, f0_method, cpt, version, net_g, filter_radius, tgt_sr, rms_mix_rate, protect, crepe_hop_length, vc, hubert_model, steps)
+    del cpt
     gc.collect()
 def process_song(
     song_dir, song_input, mdx_model_params, song_id, is_webui, input_type, progress,
     keep_files, pitch_change, pitch_change_all, voice_model, index_rate, filter_radius,
+    rms_mix_rate, protect, f0_method, crepe_hop_length, output_format, keep_orig, orig_song_path, steps
 ):
     if not os.path.exists(song_dir):
         os.makedirs(song_dir)
         orig_song_path, vocals_path, instrumentals_path, main_vocals_path, backup_vocals_path, main_vocals_dereverb_path = preprocess_song(song_input, mdx_model_params, song_id, is_webui, input_type, progress, keep_orig, orig_song_path)
         paths = get_audio_paths(song_dir)
         # if any of the audio files aren't available or keep intermediate files, rerun preprocess
+        if any(path is None for path in paths):
             orig_song_path, vocals_path, instrumentals_path, main_vocals_path, backup_vocals_path, main_vocals_dereverb_path = preprocess_song(song_input, mdx_model_params, song_id, is_webui, input_type, progress, keep_orig, orig_song_path)
         else:
             orig_song_path, instrumentals_path, main_vocals_dereverb_path, backup_vocals_path = paths
     pitch_change = pitch_change * 12 + pitch_change_all
+    ai_vocals_path = os.path.join(song_dir, f'{os.path.splitext(os.path.basename(orig_song_path))[0]}_{voice_model}_p{pitch_change}_i{index_rate}_fr{filter_radius}_rms{rms_mix_rate}_pro{protect}_{f0_method}{"" if f0_method != "mangio-crepe" else f"_{crepe_hop_length}"}_s{steps}.wav')
     ai_cover_path = os.path.join(song_dir, f'{os.path.splitext(os.path.basename(orig_song_path))[0]} ({voice_model} Ver).{output_format}')
     if not os.path.exists(ai_vocals_path):
         display_progress('[~] Converting voice using RVC...', 0.5, is_webui, progress)
+        voice_change(voice_model, main_vocals_dereverb_path, ai_vocals_path, pitch_change, f0_method, index_rate, filter_radius, rms_mix_rate, protect, crepe_hop_length, is_webui, steps)
     return ai_vocals_path, ai_cover_path, instrumentals_path, backup_vocals_path, vocals_path, main_vocals_path
def apply_noisereduce(audio_list, type_output="wav"):
    """Denoise each audio file and return the paths of the results.

    For every input path a sibling file ``<stem>_nr.<type_output>`` is
    written. If processing a file fails for any reason, the error is printed
    and the original path is returned in its place, so the result always has
    one entry per input, in the same order.

    Args:
        audio_list: Iterable of audio file paths.
        type_output: Export format / extension of the denoised files.

    Returns:
        List of paths: the denoised file where processing succeeded, the
        untouched input path otherwise.
    """
    # Approach adapted from https://github.com/sa-if/Audio-Denoiser
    print("Noice reduce")
    result = []
    for audio_path in audio_list:
        out_path = f"{os.path.splitext(audio_path)[0]}_nr.{type_output}"
        try:
            audio = AudioSegment.from_file(audio_path)
            samples = np.array(audio.get_array_of_samples())
            multi_channel = audio.channels > 1
            if multi_channel:
                # The sample array interleaves channels (L R L R ...). Split it
                # into (channels, frames) so every channel is denoised on its
                # own instead of as one scrambled mono stream.
                samples = samples.reshape((-1, audio.channels)).T

            reduced_noise = nr.reduce_noise(samples, sr=audio.frame_rate, prop_decrease=0.6)

            if multi_channel:
                # Back to frame-major order so the bytes are interleaved again.
                reduced_noise = reduced_noise.T
            # Keep the input sample dtype so the byte width still matches
            # audio.sample_width when the raw bytes are wrapped below.
            reduced_noise = np.ascontiguousarray(reduced_noise, dtype=samples.dtype)

            reduced_audio = AudioSegment(
                reduced_noise.tobytes(),
                frame_rate=audio.frame_rate,
                sample_width=audio.sample_width,
                channels=audio.channels
            )
            reduced_audio.export(out_path, format=type_output)
            result.append(out_path)
        except Exception as e:
            # Deliberate best-effort: fall back to the unprocessed file.
            print(f"Error noisereduce: {str(e)}")
            result.append(audio_path)
    return result
 # @spaces.GPU(duration=140)
 def song_cover_pipeline(song_input, voice_model, pitch_change, keep_files,
                         is_webui=0, main_gain=0, backup_gain=0, inst_gain=0, index_rate=0.5, filter_radius=3,
                         rms_mix_rate=0.25, f0_method='rmvpe', crepe_hop_length=128, protect=0.33, pitch_change_all=0,
                         reverb_rm_size=0.15, reverb_wet=0.2, reverb_dry=0.8, reverb_damping=0.7, output_format='mp3',
+                        extra_denoise=False, steps=1,
                         progress=gr.Progress()):
+    if not keep_files or IS_ZERO_GPU:
+        clean_old_folders("./song_output", 14400)
+    if IS_ZERO_GPU:
+        clean_old_folders("./rvc_models", 10800)
     try:
         if not song_input or not voice_model:
             raise_exception('Ensure that the song input field and voice model field is filled.', is_webui)
         keep_orig, orig_song_path = get_audio_file(song_input, is_webui, input_type, progress)
         orig_song_path = convert_to_stereo(orig_song_path)
         start = time.time()
         (
             ai_vocals_path,
             ai_cover_path,
             output_format,
             keep_orig,
             orig_song_path,
+            steps,
         )
         end = time.time()
         print(f"Audio duration: {duration__:.2f} seconds")
         display_progress('[~] Applying audio effects to Vocals...', 0.8, is_webui, progress)
+        nr_path = ai_vocals_path  # get_audio_with_suffix(song_dir, "_nr.wav")
+        if extra_denoise:
+            ai_vocals_path = apply_noisereduce([ai_vocals_path])[0]
         ai_vocals_mixed_path = add_audio_effects(ai_vocals_path, reverb_rm_size, reverb_wet, reverb_dry, reverb_damping)
+        ins_path = get_audio_with_suffix(song_dir, "_Voiceless.wav")
+        if not ins_path:
+            instrumentals_path, _ = run_mdx(
+                mdx_model_params,
+                os.path.join(output_dir, song_id),
+                os.path.join(mdxnet_models_dir, "UVR-MDX-NET-Inst_HQ_4.onnx"),
+                instrumentals_path,
+                # exclude_main=False,
+                exclude_inversion=True,
+                suffix="Voiceless",
+                denoise=False,
+                keep_orig=True,
+                base_device=("" if IS_ZERO_GPU else "cuda")
+            )
         if pitch_change_all != 0:
             display_progress('[~] Applying overall pitch change', 0.85, is_webui, progress)
         if not keep_files:
             display_progress('[~] Removing intermediate audio files...', 0.95, is_webui, progress)
+            intermediate_files = [vocals_path, main_vocals_path, ai_vocals_mixed_path, ins_path, nr_path]
             if pitch_change_all != 0:
                 intermediate_files += [instrumentals_path, backup_vocals_path]
             for file in intermediate_files:

src/mdx.py CHANGED Viewed

@@ -246,20 +246,19 @@ class MDX:
 def run_mdx(model_params, output_dir, model_path, filename, exclude_main=False, exclude_inversion=False, suffix=None, invert_suffix=None, denoise=False, keep_orig=True, m_threads=2, base_device="cuda"):
     if base_device == "cuda" and torch.cuda.is_available():
         device = torch.device("cuda:0")
         device_properties = torch.cuda.get_device_properties(device)
         vram_gb = device_properties.total_memory / 1024**3
         m_threads = 1 if vram_gb < 8 else (8 if vram_gb > 32 else 2)
-        print(f"threads: {m_threads} vram: {vram_gb}")
         processor_num = 0
     else:
         device = torch.device("cpu")
         m_threads = 2
         if torch.cuda.is_available():
             m_threads = 8
-        print(f"threads: {m_threads}")
         processor_num = -1
     model_hash = MDX.get_hash(model_path)
@@ -275,6 +274,11 @@ def run_mdx(model_params, output_dir, model_path, filename, exclude_main=False,
     mdx_sess = MDX(model_path, model, processor=processor_num)
     wave, sr = librosa.load(filename, mono=False, sr=44100)
     # normalizing input wave gives better output
     peak = max(np.max(wave), abs(np.min(wave)))
     wave /= peak

 def run_mdx(model_params, output_dir, model_path, filename, exclude_main=False, exclude_inversion=False, suffix=None, invert_suffix=None, denoise=False, keep_orig=True, m_threads=2, base_device="cuda"):
+    vram_gb = 0
     if base_device == "cuda" and torch.cuda.is_available():
         device = torch.device("cuda:0")
         device_properties = torch.cuda.get_device_properties(device)
         vram_gb = device_properties.total_memory / 1024**3
         m_threads = 1 if vram_gb < 8 else (8 if vram_gb > 32 else 2)
         processor_num = 0
     else:
         device = torch.device("cpu")
         m_threads = 2
         if torch.cuda.is_available():
             m_threads = 8
         processor_num = -1
     model_hash = MDX.get_hash(model_path)
     mdx_sess = MDX(model_path, model, processor=processor_num)
     wave, sr = librosa.load(filename, mono=False, sr=44100)
+    duration = librosa.get_duration(y=wave, sr=sr)
+    if duration < 60:
+        m_threads = 1
+    print(f"threads: {m_threads} vram: {vram_gb}")
     # normalizing input wave gives better output
     peak = max(np.max(wave), abs(np.min(wave)))
     wave /= peak

src/rvc.py CHANGED Viewed

@@ -157,9 +157,20 @@ def get_vc(device, is_half, config, model_path):
     return cpt, version, net_g, tgt_sr, vc
-def rvc_infer(index_path, index_rate, input_path, output_path, pitch_change, f0_method, cpt, version, net_g, filter_radius, tgt_sr, rms_mix_rate, protect, crepe_hop_length, vc, hubert_model):
-    audio = load_audio(input_path, 16000)
     times = [0, 0, 0]
     if_f0 = cpt.get('f0', 1)
-    audio_opt = vc.pipeline(hubert_model, net_g, 0, audio, input_path, times, pitch_change, f0_method, index_path, index_rate, if_f0, filter_radius, tgt_sr, 0, rms_mix_rate, version, protect, crepe_hop_length)
-    wavfile.write(output_path, tgt_sr, audio_opt)

     return cpt, version, net_g, tgt_sr, vc
def rvc_infer(index_path, index_rate, input_path, output_path, pitch_change, f0_method, cpt, version, net_g, filter_radius, tgt_sr, rms_mix_rate, protect, crepe_hop_length, vc, hubert_model, steps=1):
    """Run voice conversion on ``input_path`` and write the result to ``output_path``.

    The conversion is applied ``steps`` times. The first pass reads
    ``input_path``; every later pass reads the file written by the previous
    pass, and each pass overwrites ``output_path``. The same ``pitch_change``
    and other settings are passed to the pipeline on every pass.

    Args:
        steps: Number of conversion passes. Values below 1 are treated as 1
            and non-integer numbers are truncated. Defaults to 1 so callers
            written against the single-pass signature keep working.
        (remaining arguments are forwarded to ``vc.pipeline`` unchanged)

    Returns:
        None. The converted audio is written to ``output_path`` at ``tgt_sr``.
    """
    # range() rejects floats, and a non-positive count would skip the loop and
    # never create output_path, which callers expect to exist afterwards.
    steps = max(1, int(steps))

    times = [0, 0, 0]
    if_f0 = cpt.get('f0', 1)
    working_path = input_path
    for step in range(steps):
        audio = load_audio(working_path, 16000)
        # NOTE(review): the third positional argument was the constant 0 before
        # multi-pass support was added; confirm the pipeline really expects the
        # pass index in this slot.
        audio_opt = vc.pipeline(
            hubert_model, net_g, step, audio, working_path, times, pitch_change,
            f0_method, index_path, index_rate, if_f0, filter_radius, tgt_sr,
            0, rms_mix_rate, version, protect, crepe_hop_length
        )
        wavfile.write(output_path, tgt_sr, audio_opt)
        # The next pass refines what was just written.
        working_path = output_path

src/webui.py CHANGED Viewed

@@ -6,10 +6,28 @@ import zipfile
 from argparse import ArgumentParser
 import spaces
 import gradio as gr
-from main import song_cover_pipeline
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 mdxnet_models_dir = os.path.join(BASE_DIR, 'mdxnet_models')
 rvc_models_dir = os.path.join(BASE_DIR, 'rvc_models')
@@ -78,10 +96,21 @@ def download_online_model(url, dir_name, progress=gr.Progress()):
         if 'pixeldrain.com' in url:
             url = f'https://pixeldrain.com/api/file/{zip_name}'
-        urllib.request.urlretrieve(url, zip_name)
-        progress(0.5, desc='[~] Extracting zip...')
-        extract_zip(extraction_folder, zip_name)
         return f'[+] {dir_name} Model successfully downloaded!'
     except Exception as e:
@@ -157,33 +186,37 @@ def show_hop_slider(pitch_detection_algo):
 if __name__ == '__main__':
     parser = ArgumentParser(description='Generate a AI cover song in the song_output/id directory.', add_help=True)
     parser.add_argument("--share", action="store_true", dest="share_enabled", default=False, help="Enable sharing")
     parser.add_argument("--listen", action="store_true", default=False, help="Make the WebUI reachable from your local network.")
     parser.add_argument('--listen-host', type=str, help='The hostname that the server will use.')
     parser.add_argument('--listen-port', type=int, help='The listening port that the server will use.')
     args = parser.parse_args()
     voice_models = get_current_models(rvc_models_dir)
     with open(os.path.join(rvc_models_dir, 'public_models.json'), encoding='utf8') as infile:
         public_models = json.load(infile)
-    with gr.Blocks(title='AICoverGenWebUI') as app:
-        gr.Label('AICoverGen WebUI ZeroGPU mode created with ❤️', show_label=False)
-        gr.Markdown(
-            """
-            <details>
-                <summary style="font-size: 1.5em;">⚠️ Important (click to expand)</summary>
-                <ul>
-                    <li>🚀 This demo use a Zero GPU, which is available only for a limited time. It's recommended to use audio files that are no longer than 5 minutes. If you want to use it without time restrictions, you can duplicate the 'old CPU space'. ⏳</li>
-                </ul>
-            </details>
-            """
-        )
-        gr.Markdown("Duplicate the old CPU space for use in private: [![Duplicate this Space](https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg)](https://huggingface.co/spaces/r3gm/AICoverGen_old_stable_cpu?duplicate=true)\n\n")
         # main tab
         with gr.Tab("Generate"):
             with gr.Accordion('Main Options'):
                 with gr.Row():
                     with gr.Column():
@@ -191,12 +224,19 @@ if __name__ == '__main__':
                         ref_btn = gr.Button('Refresh Models 🔁', variant='primary')
                     with gr.Column(visible=False) as yt_link_col:
-                        song_input = gr.Text(label='Song input', info='Link to a song on YouTube or full path to a local file. For file upload, click the button below. Example: https://www.youtube.com/watch?v=M-mtdN6R3bQ')
                         show_file_upload_button = gr.Button('Upload file instead')
                     with gr.Column(visible=True) as file_upload_col:
                         audio_extensions = ['.mp3', '.m4a', '.flac', '.wav', '.aac', '.ogg', '.wma', '.alac', '.aiff', '.opus', 'amr']
-                        local_file = gr.File(label='Audio file', interactive=True, type="filepath", file_types=audio_extensions)
                         song_input_file = gr.UploadButton('Upload 📂', file_types=['audio'], variant='primary', visible=False)
                         show_yt_link_button = gr.Button('Paste YouTube link/Path to local file instead', visible=False)
                         song_input_file.upload(process_file_upload, inputs=[song_input_file], outputs=[local_file, song_input])
@@ -217,7 +257,12 @@ if __name__ == '__main__':
                         f0_method = gr.Dropdown(['rmvpe+', 'rmvpe', 'mangio-crepe'], value='rmvpe+', label='Pitch detection algorithm', info='Best option is rmvpe (clarity in vocals), then mangio-crepe (smoother vocals), rmvpe+ use a minimum and maximum allowed pitch values.')
                         crepe_hop_length = gr.Slider(32, 320, value=128, step=1, visible=False, label='Crepe hop length', info='Lower values leads to longer conversions and higher risk of voice cracks, but better pitch accuracy.')
                         f0_method.change(show_hop_slider, inputs=f0_method, outputs=crepe_hop_length)
-                keep_files = gr.Checkbox(True, label='Keep intermediate files', info='Keep all audio files generated in the song_output/id directory, e.g. Isolated Vocals/Instrumentals. Leave unchecked to save space')
             with gr.Accordion('Audio mixing options', open=False):
                 gr.Markdown('### Volume Change (decibels)')
@@ -239,7 +284,12 @@ if __name__ == '__main__':
             with gr.Row():
                 clear_btn = gr.ClearButton(value='Clear', components=[song_input, rvc_model, keep_files, local_file])
                 generate_btn = gr.Button("Generate", variant='primary')
-                ai_cover = gr.Audio(label='AI Cover', show_share_button=False)
             ref_btn.click(update_models_list, None, outputs=rvc_model)
             is_webui = gr.Number(value=1, visible=False)
@@ -247,12 +297,12 @@ if __name__ == '__main__':
                                inputs=[local_file, rvc_model, pitch, keep_files, is_webui, main_gain, backup_gain,
                                        inst_gain, index_rate, filter_radius, rms_mix_rate, f0_method, crepe_hop_length,
                                        protect, pitch_all, reverb_rm_size, reverb_wet, reverb_dry, reverb_damping,
-                                       output_format],
                                outputs=[ai_cover])
-            clear_btn.click(lambda: [0, 0, 0, 0, 0.5, 3, 0.25, 0.33, 'rmvpe+', 128, 0, 0.15, 0.2, 0.8, 0.7, 'mp3', None],
                             outputs=[pitch, main_gain, backup_gain, inst_gain, index_rate, filter_radius, rms_mix_rate,
                                      protect, f0_method, crepe_hop_length, pitch_all, reverb_rm_size, reverb_wet,
-                                     reverb_dry, reverb_damping, output_format, ai_cover])
         # Download tab
         with gr.Tab('Download model'):
@@ -271,6 +321,8 @@ if __name__ == '__main__':
                 gr.Markdown('## Input Examples')
                 gr.Examples(
                     [
                         ['https://huggingface.co/phant0m4r/LiSA/resolve/main/LiSA.zip', 'Lisa'],
                         ['https://pixeldrain.com/u/3tJmABXA', 'Gura'],
                         ['https://huggingface.co/Kit-Lemonfoot/kitlemonfoot_rvc_models/resolve/main/AZKi%20(Hybrid).zip', 'Azki']
@@ -329,7 +381,10 @@ if __name__ == '__main__':
     app.launch(
         share=args.share_enabled,
         # enable_queue=True,
         server_name=None if not args.listen else (args.listen_host or '0.0.0.0'),
         server_port=args.listen_port,
     )

 from argparse import ArgumentParser
 import spaces
 import gradio as gr
+import logging
+def configure_logging_libs(debug=False):
+    """Raise the log threshold of several third-party loggers.
+
+    Sets each logger named in ``modules`` to ``logging.WARNING`` and writes
+    the ``TF_CPP_MIN_LOG_LEVEL`` environment variable: ``"1"`` when ``debug``
+    is truthy, ``"3"`` otherwise.
+
+    Best-effort: any ``Exception`` raised while doing so is swallowed, so
+    this function returns ``None`` without reporting failures.
+    """
+    # Logger names whose output is limited to WARNING and above.
+    modules = [
+      "numba",
+      "httpx",
+      "markdown_it",
+      "fairseq",
+      "faiss",
+    ]
+    try:
+        for module in modules:
+            logging.getLogger(module).setLevel(logging.WARNING)
+        # NOTE(review): presumably consumed by TensorFlow's native logging,
+        # where a higher value hides more messages -- confirm.
+        os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" if not debug else "1"
+    except Exception as error:
+        # Failures are ignored and nothing is logged; `error` is unused.
+        pass
+configure_logging_libs()
+from main import song_cover_pipeline, yt_download
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+IS_ZERO_GPU = os.getenv("SPACES_ZERO_GPU")
 mdxnet_models_dir = os.path.join(BASE_DIR, 'mdxnet_models')
 rvc_models_dir = os.path.join(BASE_DIR, 'rvc_models')
         if 'pixeldrain.com' in url:
             url = f'https://pixeldrain.com/api/file/{zip_name}'
+        if "," in url:
+            urls = [u.strip() for u in url.split(",") if u.strip()]
+            os.makedirs(extraction_folder, exist_ok=True)
+            for u in urls:
+                u = u.replace("?download=true", "")
+                file_name = u.split('/')[-1]
+                file_path = os.path.join(extraction_folder, file_name)
+                if not os.path.exists(file_path):  # avoid re-downloading
+                    urllib.request.urlretrieve(u, file_path)
+        else:
+            urllib.request.urlretrieve(url, zip_name)
+            progress(0.5, desc='[~] Extracting zip...')
+            extract_zip(extraction_folder, zip_name)
         return f'[+] {dir_name} Model successfully downloaded!'
     except Exception as e:
 if __name__ == '__main__':
     parser = ArgumentParser(description='Generate a AI cover song in the song_output/id directory.', add_help=True)
     parser.add_argument("--share", action="store_true", dest="share_enabled", default=False, help="Enable sharing")
+    parser.add_argument("--builtin-player",  action="store_true", default=False, help="Use the builtin audio player")
     parser.add_argument("--listen", action="store_true", default=False, help="Make the WebUI reachable from your local network.")
     parser.add_argument('--listen-host', type=str, help='The hostname that the server will use.')
     parser.add_argument('--listen-port', type=int, help='The listening port that the server will use.')
+    parser.add_argument('--theme', type=str, default="NoCrypt/miku", help='Set the theme (default: NoCrypt/miku)')
+    parser.add_argument("--ssr", action="store_true", help="Enable SSR (Server-Side Rendering)")
     args = parser.parse_args()
     voice_models = get_current_models(rvc_models_dir)
     with open(os.path.join(rvc_models_dir, 'public_models.json'), encoding='utf8') as infile:
         public_models = json.load(infile)
+    with gr.Blocks(title='AICoverGenWebUI', theme=args.theme, fill_width=True, fill_height=False) as app:
+        gr.Label(f'AICoverGen WebUI {"ZeroGPU mode" if IS_ZERO_GPU else ""} created with ❤️', show_label=False)
+        if IS_ZERO_GPU:
+            gr.Markdown(
+                """
+                <details>
+                    <summary style="font-size: 1.5em;">⚠️ Important (click to expand)</summary>
+                    <ul>
+                        <li>🚀 This demo use a Zero GPU, which is available only for a limited time. It's recommended to use audio files that are no longer than 5 minutes. If you want to use it without time restrictions, you can duplicate the 'old CPU space'. ⏳</li>
+                    </ul>
+                </details>
+                """
+            )
+            gr.Markdown("Duplicate the old CPU space for use in private: [![Duplicate this Space](https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg)](https://huggingface.co/spaces/r3gm/AICoverGen_old_stable_cpu?duplicate=true)\n\n")
         # main tab
         with gr.Tab("Generate"):
             with gr.Accordion('Main Options'):
                 with gr.Row():
                     with gr.Column():
                         ref_btn = gr.Button('Refresh Models 🔁', variant='primary')
                     with gr.Column(visible=False) as yt_link_col:
+                        song_input = gr.Text(label='Song input', info='Link to a song on YouTube or full path to a local file. For file upload, click the button below.')
                         show_file_upload_button = gr.Button('Upload file instead')
                     with gr.Column(visible=True) as file_upload_col:
                         audio_extensions = ['.mp3', '.m4a', '.flac', '.wav', '.aac', '.ogg', '.wma', '.alac', '.aiff', '.opus', 'amr']
+                        local_file = gr.File(label='Audio file', interactive=True, type="filepath", file_types=audio_extensions, height=150)
+                        if not IS_ZERO_GPU:
+                            with gr.Row():
+                                with gr.Row(scale=2):
+                                    url_media_gui = gr.Textbox(value="", label="Enter URL", placeholder="www.youtube.com/watch?v=g_9rPvbENUw", lines=1)
+                                with gr.Row(scale=1):
+                                    url_button_gui = gr.Button("Process URL", variant="secondary")
+                            url_button_gui.click(yt_download, [url_media_gui], [local_file])
                         song_input_file = gr.UploadButton('Upload 📂', file_types=['audio'], variant='primary', visible=False)
                         show_yt_link_button = gr.Button('Paste YouTube link/Path to local file instead', visible=False)
                         song_input_file.upload(process_file_upload, inputs=[song_input_file], outputs=[local_file, song_input])
                         f0_method = gr.Dropdown(['rmvpe+', 'rmvpe', 'mangio-crepe'], value='rmvpe+', label='Pitch detection algorithm', info='Best option is rmvpe (clarity in vocals), then mangio-crepe (smoother vocals), rmvpe+ use a minimum and maximum allowed pitch values.')
                         crepe_hop_length = gr.Slider(32, 320, value=128, step=1, visible=False, label='Crepe hop length', info='Lower values leads to longer conversions and higher risk of voice cracks, but better pitch accuracy.')
                         f0_method.change(show_hop_slider, inputs=f0_method, outputs=crepe_hop_length)
+                with gr.Row():
+                    with gr.Row():
+                        steps = gr.Slider(minimum=1, maximum=3, label="Steps", value=1, step=1, interactive=True)
+                    with gr.Row():
+                        extra_denoise = gr.Checkbox(True, label='Denoise', info='Apply an additional noise reduction step to clean up the audio further.')
+                        keep_files = gr.Checkbox((False if IS_ZERO_GPU else True), label='Keep intermediate files', info='Keep all audio files generated in the song_output/id directory, e.g. Isolated Vocals/Instrumentals. Leave unchecked to save space', interactive=(False if IS_ZERO_GPU else True))
             with gr.Accordion('Audio mixing options', open=False):
                 gr.Markdown('### Volume Change (decibels)')
             with gr.Row():
                 clear_btn = gr.ClearButton(value='Clear', components=[song_input, rvc_model, keep_files, local_file])
                 generate_btn = gr.Button("Generate", variant='primary')
+                ai_cover = (
+                    gr.Audio(label='AI Cover', show_share_button=True)
+                    if args.builtin_player else
+                    gr.File(label="AI Cover", interactive=False)
+                )
+            gr.Markdown("- You can also try `AICoverGen❤️` in Colab’s free tier, which provides free GPU [link](https://github.com/R3gm/AICoverGen?tab=readme-ov-file#aicovergen).")
             ref_btn.click(update_models_list, None, outputs=rvc_model)
             is_webui = gr.Number(value=1, visible=False)
                                inputs=[local_file, rvc_model, pitch, keep_files, is_webui, main_gain, backup_gain,
                                        inst_gain, index_rate, filter_radius, rms_mix_rate, f0_method, crepe_hop_length,
                                        protect, pitch_all, reverb_rm_size, reverb_wet, reverb_dry, reverb_damping,
+                                       output_format, extra_denoise, steps],
                                outputs=[ai_cover])
+            clear_btn.click(lambda: [0, 0, 0, 0, 0.5, 3, 0.25, 0.33, 'rmvpe+', 128, 0, 0.15, 0.2, 0.8, 0.7, 'mp3', None, True, 1],
                             outputs=[pitch, main_gain, backup_gain, inst_gain, index_rate, filter_radius, rms_mix_rate,
                                      protect, f0_method, crepe_hop_length, pitch_all, reverb_rm_size, reverb_wet,
+                                     reverb_dry, reverb_damping, output_format, ai_cover, extra_denoise, steps])
         # Download tab
         with gr.Tab('Download model'):
                 gr.Markdown('## Input Examples')
                 gr.Examples(
                     [
+                        ['https://huggingface.co/MrDawg/ToothBrushing/resolve/main/ToothBrushing.zip?download=true', 'ToothBrushing'],
+                        ['https://huggingface.co/sail-rvc/Aldeano_Minecraft__RVC_V2_-_500_Epochs_/resolve/main/model.pth?download=true, https://huggingface.co/sail-rvc/Aldeano_Minecraft__RVC_V2_-_500_Epochs_/resolve/main/model.index?download=true', 'Minecraft_Villager'],
                         ['https://huggingface.co/phant0m4r/LiSA/resolve/main/LiSA.zip', 'Lisa'],
                         ['https://pixeldrain.com/u/3tJmABXA', 'Gura'],
                         ['https://huggingface.co/Kit-Lemonfoot/kitlemonfoot_rvc_models/resolve/main/AZKi%20(Hybrid).zip', 'Azki']
     app.launch(
         share=args.share_enabled,
+        debug=args.share_enabled,
+        show_error=True,
         # enable_queue=True,
         server_name=None if not args.listen else (args.listen_host or '0.0.0.0'),
         server_port=args.listen_port,
+        ssr_mode=args.ssr
     )