"""Gradio demo that runs RVC voice conversion through ``infer_rvc_python``."""

import os
import random
import zipfile
from urllib.parse import urlparse
from urllib.request import urlretrieve

import gradio as gr
import soundfile as sf
from infer_rvc_python import BaseLoader

files_to_retrieve = [
    "https://replicate.delivery/pbxt/N97QM3XNFrooJhV6Fb0meBff0aAG1rEDfvuxcdLS6fTx1vmWC/test.zip",
    # Disabled downloads, kept for reference.
    # NOTE(review): presumably these weights come from test.zip or are fetched
    # by the library itself - confirm before removing.
    # "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt?download=true",
    # "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt?download=true"
]


def _filename_from_url(url):
    """Return the last path component of *url*, without query or fragment."""
    # Splitting the raw URL on "/" would keep a trailing "?download=true",
    # producing a file name that does not match the paths used below.
    return os.path.basename(urlparse(url).path)


for url in files_to_retrieve:
    print(f"Downloading {url}")
    urlretrieve(url, _filename_from_url(url))

# unzip test.zip
with zipfile.ZipFile("test.zip", "r") as zip_ref:
    zip_ref.extractall(".")

# Shared converter instance used by every request; configured for CPU only.
converter = BaseLoader(
    only_cpu=True, hubert_path="./hubert_base.pt", rmvpe_path="./rmvpe.pt"
)

# Model and index file names passed to the converter for each request.
model = "test.pth"
index = "added_IVF839_Flat_nprobe_1_test_v2.index"


def voice_conversion(
    audio,
    pitch_change,
    filter_radius,
    envelope_ratio,
    index_influence,
    consonant_breath_protection,
):
    """Convert one audio file with the module-level model and index.

    This is the callback wired to the "Convert" button.

    Args:
        audio: Path of the recorded or uploaded audio file, or ``None`` when
            the audio component is empty.
        pitch_change: Value of the "Pitch" slider.
        filter_radius: Value of the "Resp. Median Filtering" slider.
        envelope_ratio: Value of the "Envelope Ratio" slider.
        index_influence: Value of the "Index Influence" slider.
        consonant_breath_protection: Value of the "Consonant Breath
            Protection" slider.

    Returns:
        The ``(sample_rate, samples)`` tuple produced by :func:`run`.

    Raises:
        gr.Error: If no audio was provided.
    """
    if audio is None:
        # Without this guard str(None) would be handed to the converter as a
        # file path and fail far from the actual cause.
        raise gr.Error("Please record or upload an audio clip before converting.")

    audio_out = run(
        [str(audio)],
        model,
        "rmvpe+",
        pitch_change,
        index,
        index_influence,
        filter_radius,
        envelope_ratio,
        consonant_breath_protection,
    )
    print(audio_out)
    return audio_out


def convert_now(audio_files, random_tag):
    """Run the shared converter on *audio_files* under configuration *random_tag*."""
    return converter(audio_files, random_tag, overwrite=False, parallel_workers=8)


def run(
    audio_files,
    file_m,
    pitch_alg,
    pitch_lvl,
    file_index,
    index_inf,
    r_m_f,
    e_r,
    c_b_p,
):
    """Configure the converter under a fresh tag, convert, and load the result.

    Args:
        audio_files: List of input audio file paths; only the first output is
            read back.
        file_m: Model file passed as ``file_model``.
        pitch_alg: Pitch algorithm name passed as ``pitch_algo``.
        pitch_lvl: Pitch level passed as ``pitch_lvl``.
        file_index: Index file passed as ``file_index``.
        index_inf: Passed as ``index_influence``.
        r_m_f: Passed as ``respiration_median_filtering``.
        e_r: Passed as ``envelope_ratio``.
        c_b_p: Passed as ``consonant_breath_protection``.

    Returns:
        A ``(sample_rate, samples)`` tuple, with samples read as ``int32``.
    """
    # Each call registers its settings under its own random tag.
    random_tag = "USER_" + str(random.randint(10000000, 99999999))

    print("PITCH LVL: ", pitch_lvl)

    # MP3 inputs are resampled to 44100; any other input passes 0. The
    # extension check is case-insensitive so ".MP3" is handled the same way.
    is_mp3 = audio_files[0].lower().endswith(".mp3")

    converter.apply_conf(
        tag=random_tag,
        file_model=file_m,
        pitch_algo=pitch_alg,
        pitch_lvl=pitch_lvl,
        file_index=file_index,
        index_influence=index_inf,
        respiration_median_filtering=r_m_f,
        envelope_ratio=e_r,
        consonant_breath_protection=c_b_p,
        resample_sr=44100 if is_mp3 else 0,
    )

    output = convert_now(audio_files, random_tag)
    audio, sr = sf.read(output[0], dtype="int32")
    return (sr, audio)


def ui():
    """Build and return the Gradio Blocks interface for the demo."""
    with gr.Blocks() as demo:
        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")

        with gr.Row():
            pitch_slider = gr.Slider(
                minimum=-24,
                maximum=24,
                value=0,
                step=1,
                label="Pitch",
                interactive=True,
            )
            index_influence_slider = gr.Slider(
                minimum=0,
                maximum=1,
                value=0.75,
                step=0.01,
                label="Index Influence",
                interactive=True,
            )
            respiration_median_filtering = gr.Slider(
                minimum=0,
                maximum=10,
                value=3,
                step=1,
                label="Resp. Median Filtering",
                interactive=True,
            )
            envelope_ratio = gr.Slider(
                minimum=0,
                maximum=1,
                value=0.25,
                step=0.01,
                label="Envelope Ratio",
                interactive=True,
            )
            consonant_breath_protection = gr.Slider(
                minimum=0,
                maximum=1,
                value=0.5,
                step=0.01,
                label="Consonant Breath Protection",
                interactive=True,
            )

        button = gr.Button("Convert")
        audio_output = gr.Audio(interactive=False, type="numpy")

        # The input order must match the positional parameters of
        # voice_conversion: audio, pitch_change, filter_radius,
        # envelope_ratio, index_influence, consonant_breath_protection.
        button.click(
            voice_conversion,
            inputs=[
                audio_input,
                pitch_slider,
                respiration_median_filtering,
                envelope_ratio,
                index_influence_slider,
                consonant_breath_protection,
            ],
            outputs=[audio_output],
        )

    return demo


# NOTE(review): the login credentials are hard-coded in source; consider
# loading them from the environment or a secret store.
ui().launch(auth=("output", "becreative"))