import os
import tempfile
from argparse import ArgumentParser
from pathlib import Path

import numpy as np
import gradio as gr
import yt_dlp
import edge_tts
from pydub import AudioSegment
from scipy.signal import convolve
from audio_separator.separator import Separator

from lib.infer import infer_audio
from lib.language_tts import language_dict
from download_model import download_online_model

main_dir = Path().resolve()
print(main_dir)
os.chdir(main_dir)

models_dir = main_dir / "rvc_models"
audio_separation_dir = main_dir / "audio_input"
AUDIO_DIR = main_dir / "audio_input"


# List all model folders in the models directory
def get_folders():
    if models_dir.exists() and models_dir.is_dir():
        return [folder.name for folder in models_dir.iterdir() if folder.is_dir()]
    return []


# Refresh and return the list of model folders
def refresh_folders():
    return gr.update(choices=get_folders())


# Get the list of audio files in the audio input directory
def get_audio_files():
    if not os.path.exists(AUDIO_DIR):
        os.makedirs(AUDIO_DIR)
    return [f for f in os.listdir(AUDIO_DIR)
            if f.lower().endswith(('.mp3', '.wav', '.flac', '.ogg', '.aac'))]


# Return the full paths of audio files for playback
def load_audio_files():
    return [os.path.join(AUDIO_DIR, f) for f in get_audio_files()]


def refresh_audio_list():
    return gr.update(choices=load_audio_files())


def download_audio(url):
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'ytdl/%(title)s.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=True)
        file_path = ydl.prepare_filename(info_dict).rsplit('.', 1)[0] + '.wav'
        return file_path


async def text_to_speech_edge(text, language_code):
    voice = language_dict.get(language_code, "default_voice")
    communicate = edge_tts.Communicate(text, voice)
    # edge-tts streams MP3 data, so use a matching suffix for the temp file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path


# Apply a basic reverb effect using convolution
def add_simple_reverb(input_audio):
    # Load the uploaded audio file using pydub
    sound = AudioSegment.from_file(input_audio)

    # Convert the AudioSegment to a numpy array
    samples = np.array(sound.get_array_of_samples())

    # Define a simple impulse response for reverb (can be customized)
    impulse_response = np.concatenate([np.zeros(5000), np.array([0.5 ** i for i in range(1000)])])

    # Apply convolution (reverb effect) and trim to the original length
    reverbed_samples = convolve(samples, impulse_response, mode='full')[:len(samples)]

    # Normalize to the int16 range to avoid integer overflow after convolution
    peak = np.max(np.abs(reverbed_samples)) or 1
    reverbed_samples = (reverbed_samples / peak * 32767).astype(np.int16)

    # Convert the numpy array back to an AudioSegment
    reverbed_sound = sound._spawn(reverbed_samples.tobytes())

    # Export the reverbed sound to a new file
    output_path = "vocals_with_reverb.wav"
    reverbed_sound.export(output_path, format='wav')

    return output_path
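
# The "Audio Separation" tab below wires separate_button to separate_audio(),
# which the original file referenced but never defined. This is a minimal
# sketch built on the python-audio-separator API (Separator.load_model /
# Separator.separate). The three model filenames are taken from the original
# click() call; the staged pipeline (split the mix, isolate the lead vocal,
# then de-reverb it) and the assumption that the vocal stem is the last file
# returned by separate() are guesses at the intended behaviour, not code from
# the original project.
def separate_audio(input_audio,
                   roformer_model="model_bs_roformer_ep_317_sdr_12.9755.ckpt",
                   dereverb_model="UVR-DeEcho-DeReverb.pth",
                   karaoke_model="mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt"):
    if not input_audio:
        return "No input audio provided."

    separator = Separator(output_dir=str(audio_separation_dir))
    current = input_audio

    # Run the three models in sequence, feeding each stage's vocal stem
    # (assumed to be the last returned file) into the next stage
    for model in (roformer_model, karaoke_model, dereverb_model):
        separator.load_model(model_filename=model)
        outputs = separator.separate(current)
        current = os.path.join(str(audio_separation_dir), os.path.basename(outputs[-1]))

    return current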

# Ensure this function is defined before the Gradio Blocks UI
def process_audio(MODEL_NAME, SOUND_PATH, F0_CHANGE, F0_METHOD, MIN_PITCH, MAX_PITCH,
                  CREPE_HOP_LENGTH, INDEX_RATE, FILTER_RADIUS, RMS_MIX_RATE, PROTECT,
                  SPLIT_INFER, MIN_SILENCE, SILENCE_THRESHOLD, SEEK_STEP, KEEP_SILENCE,
                  FORMANT_SHIFT, QUEFRENCY, TIMBRE, F0_AUTOTUNE, OUTPUT_FORMAT,
                  upload_audio=None):
    # If no existing file is selected, fall back to the uploaded file.
    # The Audio component uses type='filepath', so upload_audio is already
    # a path on disk, not a file object to read from.
    if not SOUND_PATH and upload_audio is not None:
        SOUND_PATH = upload_audio

    # Check that a model name is provided
    if not MODEL_NAME:
        return "Please provide a model name."

    # Run the inference process
    os.system("chmod +x stftpitchshift")
    inferred_audio = infer_audio(
        MODEL_NAME, SOUND_PATH, F0_CHANGE, F0_METHOD, MIN_PITCH, MAX_PITCH,
        CREPE_HOP_LENGTH, INDEX_RATE, FILTER_RADIUS, RMS_MIX_RATE, PROTECT,
        SPLIT_INFER, MIN_SILENCE, SILENCE_THRESHOLD, SEEK_STEP, KEEP_SILENCE,
        FORMANT_SHIFT, QUEFRENCY, TIMBRE, F0_AUTOTUNE, OUTPUT_FORMAT
    )
    return inferred_audio


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--share", action="store_true", dest="share_enabled", default=False)
    parser.add_argument("--listen", action="store_true", default=False)
    parser.add_argument('--listen-host', type=str)
    parser.add_argument('--listen-port', type=int)
    args = parser.parse_args()

    # Gradio Interface
    with gr.Blocks(title="Hex RVC", theme=gr.themes.Base(primary_hue="red", secondary_hue="pink")) as app:
        gr.Markdown("# Hex RVC - AI Audio Inference")
        gr.Markdown("Join [AIHub](https://discord.gg/aihub) to get the RVC model!")

        # Inference tab, with settings given priority
        with gr.Tab("Inference"):
            gr.Markdown("## Inference Settings")
            with gr.Row():
                MODEL_NAME = gr.Dropdown(
                    label="Select AI Model",
                    choices=get_folders(),
                    interactive=True,
                    info="Choose a pre-trained model for audio processing"
                )
                SOUND_PATH = gr.Dropdown(
                    choices=load_audio_files(),
                    label="Select Existing Audio File",
                    interactive=True,
                    info="Pick an audio file from the predefined directory"
                )
                # gr.Audio does not accept an `info` argument, so the hint
                # is folded into the label instead
                upload_audio = gr.Audio(
                    label="Upload Your Own Audio (if not using an existing file)",
                    type='filepath'
                )

            gr.Markdown("### Conversion Parameters")
            with gr.Accordion("Conversion Settings", open=True):
                with gr.Row():
                    F0_CHANGE = gr.Number(
                        label="Pitch Change (semitones)",
                        value=0,
                        info="Adjust the pitch of the output audio"
                    )
                    F0_METHOD = gr.Dropdown(
                        choices=["crepe", "harvest", "mangio-crepe", "rmvpe", "rmvpe_legacy",
                                 "fcpe", "fcpe_legacy", "hybrid[rmvpe+fcpe]"],
                        label="F0 Method",
                        value="fcpe",
                        info="Select the fundamental frequency extraction method"
                    )
                with gr.Row():
                    MIN_PITCH = gr.Number(label="Min Pitch", value=50, info="Minimum pitch detection threshold")
                    MAX_PITCH = gr.Number(label="Max Pitch", value=1100, info="Maximum pitch detection threshold")
                    CREPE_HOP_LENGTH = gr.Number(label="Crepe Hop Length", value=120, info="Hop length for the Crepe method")
                    INDEX_RATE = gr.Slider(label="Index Rate", minimum=0, maximum=1, value=0.75)
                    FILTER_RADIUS = gr.Number(label="Filter Radius", value=3, info="Filter intensity for smoothing")
                    RMS_MIX_RATE = gr.Slider(label="RMS Mix Rate", minimum=0, maximum=1, value=0.25)
                    PROTECT = gr.Slider(label="Protect Factor", minimum=0, maximum=1, value=0.33)
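
            # The run handler below passes SPLIT_INFER, MIN_SILENCE, SILENCE_THRESHOLD,
            # SEEK_STEP, KEEP_SILENCE, FORMANT_SHIFT, QUEFRENCY, TIMBRE, F0_AUTOTUNE and
            # OUTPUT_FORMAT to process_audio(), but the original UI never created those
            # components, so the app raised a NameError at startup. This accordion adds
            # them; the default values are assumptions based on common RVC inference
            # settings, not values taken from the original code.
            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    SPLIT_INFER = gr.Checkbox(label="Split Inference", value=False,
                                              info="Split the input on silence before inference")
                    MIN_SILENCE = gr.Number(label="Min Silence (ms)", value=500)
                    SILENCE_THRESHOLD = gr.Number(label="Silence Threshold (dBFS)", value=-50)
                    SEEK_STEP = gr.Number(label="Seek Step (ms)", value=1)
                    KEEP_SILENCE = gr.Number(label="Keep Silence (ms)", value=200)
                with gr.Row():
                    FORMANT_SHIFT = gr.Checkbox(label="Formant Shift", value=False)
                    QUEFRENCY = gr.Number(label="Quefrency", value=0)
                    TIMBRE = gr.Number(label="Timbre", value=1)
                    F0_AUTOTUNE = gr.Checkbox(label="F0 Autotune", value=False)
                    OUTPUT_FORMAT = gr.Dropdown(label="Output Format",
                                                choices=["wav", "mp3", "flac"], value="wav")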

            gr.Markdown("## Generate Audio")
            output_audio = gr.Audio(label="Generated Audio Output", type='filepath')
            with gr.Row():
                refresh_btn = gr.Button("Refresh Lists")
                run_button = gr.Button("Run Inference")

            # Refresh button for updating model and audio choices
            refresh_btn.click(
                lambda: (refresh_audio_list(), refresh_folders()),
                outputs=[SOUND_PATH, MODEL_NAME]
            )

            # Run inference and display the result; the input order must match
            # the process_audio() signature (SPLIT_INFER was missing originally)
            run_button.click(
                fn=process_audio,
                inputs=[MODEL_NAME, SOUND_PATH, F0_CHANGE, F0_METHOD, MIN_PITCH, MAX_PITCH,
                        CREPE_HOP_LENGTH, INDEX_RATE, FILTER_RADIUS, RMS_MIX_RATE, PROTECT,
                        SPLIT_INFER, MIN_SILENCE, SILENCE_THRESHOLD, SEEK_STEP, KEEP_SILENCE,
                        FORMANT_SHIFT, QUEFRENCY, TIMBRE, F0_AUTOTUNE, OUTPUT_FORMAT,
                        upload_audio],
                outputs=output_audio
            )

        # Other tabs (model download, reverb demo, audio separation)
        with gr.Tab("Download RVC Model"):
            gr.Markdown("## Download RVC Model")
            url = gr.Textbox(label="Model URL")
            dirname = gr.Textbox(label="Model Directory Name")
            download_button = gr.Button("Download Model")
            download_output = gr.Textbox(label="Download Status")

            download_button.click(
                download_online_model,
                inputs=[url, dirname],
                outputs=download_output
            )

        with gr.Tab("Audio Effect (demo)"):
            # Renamed from input_audio/output_audio to avoid shadowing the
            # components of the same names in the other tabs
            reverb_input = gr.Textbox(label="Path to Audio File")
            reverb_output = gr.Audio(type="filepath", label="Processed Audio with Reverb")
            reverb_btn = gr.Button("Add Reverb")

            reverb_btn.click(add_simple_reverb, inputs=reverb_input, outputs=reverb_output)

        with gr.Tab("Audio Separation"):
            gr.Markdown("## Audio Separation")
            input_audio = gr.Audio(type="filepath", label="Upload Audio for Separation")

            with gr.Accordion("Separation by Link", open=False):
                with gr.Row():
                    roformer_link = gr.Textbox(
                        label="Link",
                        placeholder="Paste the link here",
                        interactive=True
                    )
                with gr.Row():
                    gr.Markdown("You can paste a link to video/audio from many sites; see the full list [here](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)")
                with gr.Row():
                    roformer_download_button = gr.Button("Download!", variant="primary")

            separate_button = gr.Button("Separate Audio")
            separation_output = gr.Textbox(label="Separation Output Path")

            roformer_download_button.click(download_audio, [roformer_link], [input_audio])

            # Gradio inputs must be components, not strings: the model filenames
            # are now defaults of separate_audio() instead of click() inputs
            separate_button.click(
                fn=separate_audio,
                inputs=[input_audio],
                outputs=[separation_output]
            )

    app.launch(
        share=args.share_enabled,
        server_name=None if not args.listen else (args.listen_host or '0.0.0.0'),
        server_port=args.listen_port
    )