|
import os |
|
import re |
|
import random |
|
from scipy.io.wavfile import write |
|
from scipy.io.wavfile import read |
|
import numpy as np |
|
import gradio as gr |
|
import yt_dlp |
|
import subprocess |
|
from pydub import AudioSegment |
|
from scipy.signal import convolve |
|
|
|
from audio_separator.separator import Separator |
|
from lib.infer import infer_audio |
|
import edge_tts |
|
import tempfile |
|
import anyio |
|
from pathlib import Path |
|
from lib.language_tts import language_dict |
|
import shutil |
|
import time |
|
from argparse import ArgumentParser |
|
from download_model import download_online_model |
|
|
|
# Resolve the project root to the current working directory and anchor all
# relative paths (model folders, audio input) to it.
main_dir = Path().resolve()
print(main_dir)
os.chdir(main_dir)
# Folder that holds one sub-directory per downloaded RVC model.
models_dir = main_dir / "rvc_models"
# NOTE(review): both names below point at the same 'audio_input' directory —
# audio_separat_dir is used as the separation output dir, AUDIO_DIR as the
# inference input dir; presumably intentional, confirm before consolidating.
audio_separat_dir = main_dir / "audio_input"
AUDIO_DIR = main_dir / 'audio_input'
|
|
|
|
|
|
|
def get_folders():
    """Return the names of the model sub-folders inside `models_dir`.

    Returns an empty list when the models directory is missing.
    """
    if not (models_dir.exists() and models_dir.is_dir()):
        return []
    return [entry.name for entry in models_dir.iterdir() if entry.is_dir()]
|
|
|
|
|
|
|
def refresh_folders():
    """Re-scan `models_dir` and return an update for the model dropdown.

    Uses `gr.update` instead of `gr.Dropdown.update`: the classmethod form is
    deprecated and removed in Gradio 4, while `gr.update(...)` works on both
    Gradio 3.x and 4.x.
    """
    return gr.update(choices=get_folders())
|
|
|
|
|
|
|
def get_audio_files():
    """List audio file names (not paths) found in `AUDIO_DIR`.

    Creates the directory if it does not exist yet; matching is by
    case-insensitive extension.
    """
    AUDIO_DIR.mkdir(parents=True, exist_ok=True)
    allowed = {'.mp3', '.wav', '.flac', '.ogg', '.aac'}
    return [entry.name for entry in AUDIO_DIR.iterdir() if entry.suffix.lower() in allowed]
|
|
|
|
|
|
|
def load_audio_files():
    """Return full paths (as strings) for every audio file in `AUDIO_DIR`."""
    return [os.path.join(AUDIO_DIR, name) for name in get_audio_files()]
|
|
|
|
|
def refresh_audio_list():
    """Re-scan `AUDIO_DIR` and return an update for the audio-file dropdown.

    Uses `gr.update` instead of `gr.Dropdown.update`: the classmethod form is
    deprecated and removed in Gradio 4, while `gr.update(...)` works on both
    Gradio 3.x and 4.x.
    """
    return gr.update(choices=load_audio_files())
|
|
|
|
|
def download_audio(url):
    """Download the best audio stream from `url` and convert it to WAV.

    Files land under ytdl/ named after the video title; returns the path of
    the extracted .wav file.
    """
    options = {
        'format': 'bestaudio/best',
        'outtmpl': 'ytdl/%(title)s.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
    }

    with yt_dlp.YoutubeDL(options) as downloader:
        info = downloader.extract_info(url, download=True)
        # prepare_filename reports the pre-postprocessing name; swap its
        # extension for the .wav the FFmpeg postprocessor produced.
        base = os.path.splitext(downloader.prepare_filename(info))[0]
        return base + '.wav'
|
|
|
|
|
async def text_to_speech_edge(text, language_code):
    """Synthesize `text` with edge-tts and return the temp-file path.

    The voice is looked up in `language_dict` by `language_code`, falling back
    to "default_voice" for unknown codes.
    NOTE(review): edge-tts emits MP3 by default, so the ".wav" suffix is
    presumably cosmetic — confirm downstream consumers only use the path.
    """
    selected_voice = language_dict.get(language_code, "default_voice")
    tts = edge_tts.Communicate(text, selected_voice)
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    tmp.close()  # keep the file around; edge-tts writes to the path itself
    await tts.save(tmp.name)
    return tmp.name
|
|
|
|
|
|
|
|
|
|
|
def add_simple_reverb(input_audio): |
|
|
|
sound = AudioSegment.from_file(input_audio) |
|
|
|
|
|
samples = np.array(sound.get_array_of_samples()) |
|
|
|
|
|
impulse_response = np.concatenate([np.zeros(5000), np.array([0.5**i for i in range(1000)])]) |
|
|
|
|
|
reverbed_samples = convolve(samples, impulse_response, mode='full') |
|
reverbed_samples = reverbed_samples[:len(samples)] |
|
|
|
|
|
reverbed_sound = sound._spawn(reverbed_samples.astype(np.int16).tobytes()) |
|
|
|
|
|
output_path = "vocals_with_reverb.wav" |
|
reverbed_sound.export(output_path, format='wav') |
|
|
|
return output_path |
|
|
|
|
|
|
|
|
|
def process_audio(MODEL_NAME, SOUND_PATH, F0_CHANGE, F0_METHOD, MIN_PITCH, MAX_PITCH, CREPE_HOP_LENGTH, INDEX_RATE,
                  FILTER_RADIUS, RMS_MIX_RATE, PROTECT, SPLIT_INFER, MIN_SILENCE, SILENCE_THRESHOLD, SEEK_STEP,
                  KEEP_SILENCE, FORMANT_SHIFT, QUEFRENCY, TIMBRE, F0_AUTOTUNE, OUTPUT_FORMAT, upload_audio=None):
    """Run RVC inference on `SOUND_PATH` (or an uploaded file) with `MODEL_NAME`.

    Returns the path of the inferred audio produced by `infer_audio`, or an
    error message string when no model name is provided.
    """
    # The UI declares upload_audio as gr.Audio(type='filepath'), so Gradio
    # hands us a plain path string.  The original code called
    # upload_audio.name / upload_audio.read(), which raises AttributeError on
    # a str (and wrote into a "uploaded_audio" dir it never created).
    if not SOUND_PATH and upload_audio is not None:
        SOUND_PATH = upload_audio

    if not MODEL_NAME:
        return "Please provide a model name."

    # stftpitchshift must be executable for formant shifting to work.
    os.system("chmod +x stftpitchshift")
    inferred_audio = infer_audio(
        MODEL_NAME,
        SOUND_PATH,
        F0_CHANGE,
        F0_METHOD,
        MIN_PITCH,
        MAX_PITCH,
        CREPE_HOP_LENGTH,
        INDEX_RATE,
        FILTER_RADIUS,
        RMS_MIX_RATE,
        PROTECT,
        SPLIT_INFER,
        MIN_SILENCE,
        SILENCE_THRESHOLD,
        SEEK_STEP,
        KEEP_SILENCE,
        FORMANT_SHIFT,
        QUEFRENCY,
        TIMBRE,
        F0_AUTOTUNE,
        OUTPUT_FORMAT
    )

    return inferred_audio
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # CLI: --share exposes a public Gradio link; --listen/--listen-host/
    # --listen-port control the server bind address and port.
    parser = ArgumentParser()
    parser.add_argument("--share", action="store_true", dest="share_enabled", default=False)
    parser.add_argument("--listen", action="store_true", default=False)
    parser.add_argument('--listen-host', type=str)
    parser.add_argument('--listen-port', type=int)
    args = parser.parse_args()

    def separate_audio(input_path, roformer_model, dereverb_model, karaoke_model):
        """Separate `input_path` into stems with audio-separator.

        The original UI wired a button to an undefined `separate_audio`,
        which crashed the app at startup with a NameError.  This
        implementation runs the primary (roformer) separation pass; the
        de-reverb and karaoke model names are accepted for interface
        compatibility.  TODO(review): chain the dereverb/karaoke passes as
        the three model names suggest was intended.
        """
        if not input_path:
            return "Please upload or download an audio file first."
        separator = Separator(output_dir=str(audio_separat_dir))
        separator.load_model(model_filename=roformer_model)
        # Separator.separate returns output filenames relative to output_dir.
        stems = separator.separate(input_path)
        return "\n".join(str(audio_separat_dir / stem) for stem in stems)

    with gr.Blocks(title="Hex RVC", theme=gr.themes.Base(primary_hue="red", secondary_hue="pink")) as app:
        gr.Markdown("# Hex RVC - AI Audio Inference")
        gr.Markdown("Join [AIHub](https://discord.gg/aihub) to get the RVC model!")

        with gr.Tab("Inference"):
            gr.Markdown("## Inference Settings")
            with gr.Row():
                MODEL_NAME = gr.Dropdown(
                    label="Select AI Model",
                    choices=get_folders(),
                    interactive=True,
                    info="Choose a pre-trained model for audio processing"
                )
                SOUND_PATH = gr.Dropdown(
                    choices=load_audio_files(),
                    label="Select Existing Audio File",
                    interactive=True,
                    info="Pick an audio file from the predefined directory"
                )
                # gr.Audio does not accept an `info` kwarg (TypeError in the
                # original); the hint is folded into the label instead.
                upload_audio = gr.Audio(
                    label="Upload Your Own Audio (used when no existing file is selected)",
                    type='filepath'
                )

            gr.Markdown("### Conversion Parameters")
            with gr.Accordion("Conversion Settings", open=True):
                with gr.Row():
                    F0_CHANGE = gr.Number(
                        label="Pitch Change (semitones)",
                        value=0,
                        info="Adjust the pitch of the output audio"
                    )
                    F0_METHOD = gr.Dropdown(
                        choices=["crepe", "harvest", "mangio-crepe", "rmvpe", "rmvpe_legacy", "fcpe", "fcpe_legacy", "hybrid[rmvpe+fcpe]"],
                        label="F0 Method",
                        value="fcpe",
                        info="Select the fundamental frequency extraction method"
                    )
                with gr.Row():
                    MIN_PITCH = gr.Number(label="Min Pitch", value=50, info="Minimum pitch detection threshold")
                    MAX_PITCH = gr.Number(label="Max Pitch", value=1100, info="Maximum pitch detection threshold")
                    CREPE_HOP_LENGTH = gr.Number(label="Crepe Hop Length", value=120, info="Hop length for Crepe method")
                    INDEX_RATE = gr.Slider(label="Index Rate", minimum=0, maximum=1, value=0.75)
                    FILTER_RADIUS = gr.Number(label="Filter Radius", value=3, info="Filter intensity for smoothing")
                    RMS_MIX_RATE = gr.Slider(label="RMS Mix Rate", minimum=0, maximum=1, value=0.25)
                    PROTECT = gr.Slider(label="Protect Factor", minimum=0, maximum=1, value=0.33)

            # process_audio() requires all of the components below; the
            # original wired their names into run_button.click without ever
            # creating them, so building the UI raised NameError.  Defaults
            # are conventional RVC values — TODO(review): confirm against
            # lib.infer.infer_audio's own defaults.
            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    SPLIT_INFER = gr.Checkbox(label="Split Inference", value=False, info="Split audio on silence before inference")
                    MIN_SILENCE = gr.Number(label="Min Silence (ms)", value=500)
                    SILENCE_THRESHOLD = gr.Number(label="Silence Threshold (dBFS)", value=-50)
                    SEEK_STEP = gr.Number(label="Seek Step (ms)", value=1)
                    KEEP_SILENCE = gr.Number(label="Keep Silence (ms)", value=200)
                with gr.Row():
                    FORMANT_SHIFT = gr.Checkbox(label="Formant Shift", value=False)
                    QUEFRENCY = gr.Number(label="Quefrency", value=8)
                    TIMBRE = gr.Number(label="Timbre", value=1.2)
                    F0_AUTOTUNE = gr.Checkbox(label="F0 Autotune", value=False)
                    OUTPUT_FORMAT = gr.Dropdown(label="Output Format", choices=["wav", "flac", "mp3"], value="wav")

            gr.Markdown("## Generate Audio")
            output_audio = gr.Audio(label="Generated Audio Output", type='filepath')

            with gr.Row():
                refresh_btn = gr.Button("Refresh Lists")
                run_button = gr.Button("Run Inference")

            refresh_btn.click(
                lambda: (refresh_audio_list(), refresh_folders()),
                outputs=[SOUND_PATH, MODEL_NAME]
            )

            # SPLIT_INFER was missing from the original inputs list, which
            # shifted every later argument one position left relative to
            # process_audio's signature.
            run_button.click(
                fn=process_audio,
                inputs=[MODEL_NAME, SOUND_PATH, F0_CHANGE, F0_METHOD, MIN_PITCH, MAX_PITCH, CREPE_HOP_LENGTH, INDEX_RATE,
                        FILTER_RADIUS, RMS_MIX_RATE, PROTECT, SPLIT_INFER, MIN_SILENCE, SILENCE_THRESHOLD, SEEK_STEP,
                        KEEP_SILENCE, FORMANT_SHIFT, QUEFRENCY, TIMBRE, F0_AUTOTUNE, OUTPUT_FORMAT, upload_audio],
                outputs=output_audio
            )

        with gr.Tab("Download RVC Model"):
            gr.Markdown("## Download RVC Model")
            url = gr.Textbox(label="Model URL")
            dirname = gr.Textbox(label="Model Directory Name")
            download_button = gr.Button("Download Model")
            download_output = gr.Textbox(label="Download Status")

            download_button.click(
                download_online_model,
                inputs=[url, dirname],
                outputs=download_output
            )

        with gr.Tab("Audio Effect (demo)"):
            # Distinct names: the original reused `input_audio`/`output_audio`
            # here, shadowing the inference tab's component variables.
            reverb_input = gr.Textbox(label="Path Audio File")
            reverb_output = gr.Audio(type="filepath", label="Processed Audio with Reverb")
            reverb_btn = gr.Button("Add Reverb")
            reverb_btn.click(add_simple_reverb, inputs=reverb_input, outputs=reverb_output)

        with gr.Tab("Audio Separation"):
            gr.Markdown("## Audio Separation")
            separation_input = gr.Audio(type="filepath", label="Upload Audio for Separation")
            with gr.Accordion("Separation by Link", open=False):
                with gr.Row():
                    roformer_link = gr.Textbox(
                        label="Link",
                        placeholder="Paste the link here",
                        interactive=True
                    )
                with gr.Row():
                    gr.Markdown("You can paste the link to the video/audio from many sites, check the complete list [here](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)")
                with gr.Row():
                    roformer_download_button = gr.Button(
                        "Download!",
                        variant="primary"
                    )
            separate_button = gr.Button("Separate Audio")
            separation_output = gr.Textbox(label="Separation Output Path")

            # Model filenames travel as gr.State values: Gradio `inputs` must
            # be components, not raw strings as in the original.
            roformer_model = gr.State("model_bs_roformer_ep_317_sdr_12.9755.ckpt")
            dereverb_model = gr.State("UVR-DeEcho-DeReverb.pth")
            karaoke_model = gr.State("mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt")

            roformer_download_button.click(download_audio, [roformer_link], [separation_input])
            separate_button.click(
                fn=separate_audio,
                inputs=[separation_input, roformer_model, dereverb_model, karaoke_model],
                outputs=[separation_output]
            )

    app.launch(
        share=args.share_enabled,
        server_name=None if not args.listen else (args.listen_host or '0.0.0.0'),
        server_port=args.listen_port
    )
|
|