import os
import re
import random
from scipy.io.wavfile import write
from scipy.io.wavfile import read
import numpy as np
import gradio as gr
import yt_dlp
import subprocess
from pydub import AudioSegment
from audio_separator.separator import Separator
from lib.infer import infer_audio
import edge_tts
import tempfile
import anyio
from pathlib import Path
import os
import zipfile
import shutil
import urllib.request
import gdown
import subprocess

# Resolve the project root (the directory the script is started from) and
# pin it as the working directory so every relative path below is stable.
main_dir = Path().resolve()
print(main_dir)

os.chdir(main_dir)
# Folder where downloaded RVC voice models get installed (see download_online_model).
models_dir = "models"



# Mapping from a human-readable dropdown label to a Microsoft Edge TTS short
# voice name. Keys are display-only (used as gr.Dropdown choices); values must
# be valid edge-tts voice identifiers.
language_dict = {
    'English-Jenny (Female)': 'en-US-JennyNeural',
    'English-Guy (Male)': 'en-US-GuyNeural',
    'English-Ana (Female)': 'en-US-AnaNeural',
    'English-Aria (Female)': 'en-US-AriaNeural',
    'English-Christopher (Male)': 'en-US-ChristopherNeural',
    'English-Eric (Male)': 'en-US-EricNeural',
    'English-Michelle (Female)': 'en-US-MichelleNeural',
    'English-Roger (Male)': 'en-US-RogerNeural',
    'Spanish (Mexican)-Dalia (Female)': 'es-MX-DaliaNeural',
    'Spanish (Mexican)-Jorge- (Male)': 'es-MX-JorgeNeural',
    'Korean-Sun-Hi- (Female)': 'ko-KR-SunHiNeural',
    'Korean-InJoon- (Male)': 'ko-KR-InJoonNeural',
    'Thai-Premwadee- (Female)': 'th-TH-PremwadeeNeural',
    'Thai-Niwat- (Male)': 'th-TH-NiwatNeural',
    'Vietnamese-HoaiMy- (Female)': 'vi-VN-HoaiMyNeural',
    'Vietnamese-NamMinh- (Male)': 'vi-VN-NamMinhNeural',
    'Japanese-Nanami- (Female)': 'ja-JP-NanamiNeural',
    'Japanese-Keita- (Male)': 'ja-JP-KeitaNeural',
    'French-Denise- (Female)': 'fr-FR-DeniseNeural',
    'French-Eloise- (Female)': 'fr-FR-EloiseNeural',
    'French-Henri- (Male)': 'fr-FR-HenriNeural',
    'Brazilian-Francisca- (Female)': 'pt-BR-FranciscaNeural',
    'Brazilian-Antonio- (Male)': 'pt-BR-AntonioNeural',
    'Indonesian-Ardi- (Male)': 'id-ID-ArdiNeural',
    'Indonesian-Gadis- (Female)': 'id-ID-GadisNeural',
    'Hebrew-Avri- (Male)': 'he-IL-AvriNeural',
    'Hebrew-Hila- (Female)': 'he-IL-HilaNeural',
    'Italian-Isabella- (Female)': 'it-IT-IsabellaNeural',
    'Italian-Diego- (Male)': 'it-IT-DiegoNeural',
    'Italian-Elsa- (Female)': 'it-IT-ElsaNeural',
    'Dutch-Colette- (Female)': 'nl-NL-ColetteNeural',
    'Dutch-Fenna- (Female)': 'nl-NL-FennaNeural',
    'Dutch-Maarten- (Male)': 'nl-NL-MaartenNeural',
    # was 'Malese' (typo)
    'Malay-Osman- (Male)': 'ms-MY-OsmanNeural',
    'Malay-Yasmin- (Female)': 'ms-MY-YasminNeural',
    'Norwegian-Pernille- (Female)': 'nb-NO-PernilleNeural',
    'Norwegian-Finn- (Male)': 'nb-NO-FinnNeural',
    'Swedish-Sofie- (Female)': 'sv-SE-SofieNeural',
    # was 'ArabicSwedish-Mattias' (copy/paste fusion with the Arabic entries below)
    'Swedish-Mattias- (Male)': 'sv-SE-MattiasNeural',
    'Arabic-Hamed- (Male)': 'ar-SA-HamedNeural',
    'Arabic-Zariyah- (Female)': 'ar-SA-ZariyahNeural',
    'Greek-Athina- (Female)': 'el-GR-AthinaNeural',
    'Greek-Nestoras- (Male)': 'el-GR-NestorasNeural',
    'German-Katja- (Female)': 'de-DE-KatjaNeural',
    'German-Amala- (Female)': 'de-DE-AmalaNeural',
    'German-Conrad- (Male)': 'de-DE-ConradNeural',
    'German-Killian- (Male)': 'de-DE-KillianNeural',
    'Afrikaans-Adri- (Female)': 'af-ZA-AdriNeural',
    'Afrikaans-Willem- (Male)': 'af-ZA-WillemNeural',
    'Ethiopian-Ameha- (Male)': 'am-ET-AmehaNeural',
    'Ethiopian-Mekdes- (Female)': 'am-ET-MekdesNeural',
    # was 'UAD' (typo for the United Arab Emirates)
    'Arabic (UAE)-Fatima- (Female)': 'ar-AE-FatimaNeural',
    'Arabic (UAE)-Hamdan- (Male)': 'ar-AE-HamdanNeural',
    'Arabic (Bahrain)-Ali- (Male)': 'ar-BH-AliNeural',
    'Arabic (Bahrain)-Laila- (Female)': 'ar-BH-LailaNeural',
    'Arabic (Algeria)-Ismael- (Male)': 'ar-DZ-IsmaelNeural',
    'Arabic (Egypt)-Salma- (Female)': 'ar-EG-SalmaNeural',
    'Arabic (Egypt)-Shakir- (Male)': 'ar-EG-ShakirNeural',
    'Arabic (Iraq)-Bassel- (Male)': 'ar-IQ-BasselNeural',
    'Arabic (Iraq)-Rana- (Female)': 'ar-IQ-RanaNeural',
    'Arabic (Jordan)-Sana- (Female)': 'ar-JO-SanaNeural',
    'Arabic (Jordan)-Taim- (Male)': 'ar-JO-TaimNeural',
    'Arabic (Kuwait)-Fahed- (Male)': 'ar-KW-FahedNeural',
    'Arabic (Kuwait)-Noura- (Female)': 'ar-KW-NouraNeural',
    'Arabic (Lebanon)-Layla- (Female)': 'ar-LB-LaylaNeural',
    'Arabic (Lebanon)-Rami- (Male)': 'ar-LB-RamiNeural',
    'Arabic (Libya)-Iman- (Female)': 'ar-LY-ImanNeural',
    'Arabic (Libya)-Omar- (Male)': 'ar-LY-OmarNeural',
    'Arabic (Morocco)-Jamal- (Male)': 'ar-MA-JamalNeural',
    'Arabic (Morocco)-Mouna- (Female)': 'ar-MA-MounaNeural',
    'Arabic (Oman)-Abdullah- (Male)': 'ar-OM-AbdullahNeural',
    'Arabic (Oman)-Aysha- (Female)': 'ar-OM-AyshaNeural',
    'Arabic (Qatar)-Amal- (Female)': 'ar-QA-AmalNeural',
    'Arabic (Qatar)-Moaz- (Male)': 'ar-QA-MoazNeural',
    'Arabic (Syrian Arab Republic)-Amany- (Female)': 'ar-SY-AmanyNeural',
    'Arabic (Syrian Arab Republic)-Laith- (Male)': 'ar-SY-LaithNeural',
    'Arabic (Tunisia)-Hedi- (Male)': 'ar-TN-HediNeural',
    'Arabic (Tunisia)-Reem- (Female)': 'ar-TN-ReemNeural',
    # stray tab removed from the 'Yemen' labels
    'Arabic (Yemen)-Maryam- (Female)': 'ar-YE-MaryamNeural',
    'Arabic (Yemen)-Saleh- (Male)': 'ar-YE-SalehNeural',
    'Azerbaijani-Babek- (Male)': 'az-AZ-BabekNeural',
    'Azerbaijani-Banu- (Female)': 'az-AZ-BanuNeural',
    'Bulgarian-Borislav- (Male)': 'bg-BG-BorislavNeural',
    'Bulgarian-Kalina- (Female)': 'bg-BG-KalinaNeural',
    'Bengali (Bangladesh)-Nabanita- (Female)': 'bn-BD-NabanitaNeural',
    'Bengali (Bangladesh)-Pradeep- (Male)': 'bn-BD-PradeepNeural',
    'Bengali (India)-Bashkar- (Male)': 'bn-IN-BashkarNeural',
    'Bengali (India)-Tanishaa- (Female)': 'bn-IN-TanishaaNeural',
    'Bosniak (Bosnia and Herzegovina)-Goran- (Male)': 'bs-BA-GoranNeural',
    'Bosniak (Bosnia and Herzegovina)-Vesna- (Female)': 'bs-BA-VesnaNeural',
    'Catalan (Spain)-Joana- (Female)': 'ca-ES-JoanaNeural',
    'Catalan (Spain)-Enric- (Male)': 'ca-ES-EnricNeural',
    'Czech (Czech Republic)-Antonin- (Male)': 'cs-CZ-AntoninNeural',
    'Czech (Czech Republic)-Vlasta- (Female)': 'cs-CZ-VlastaNeural',
    'Welsh (UK)-Aled- (Male)': 'cy-GB-AledNeural',
    'Welsh (UK)-Nia- (Female)': 'cy-GB-NiaNeural',
    'Danish (Denmark)-Christel- (Female)': 'da-DK-ChristelNeural',
    'Danish (Denmark)-Jeppe- (Male)': 'da-DK-JeppeNeural',
    'German (Austria)-Ingrid- (Female)': 'de-AT-IngridNeural',
    'German (Austria)-Jonas- (Male)': 'de-AT-JonasNeural',
    'German (Switzerland)-Jan- (Male)': 'de-CH-JanNeural',
    'German (Switzerland)-Leni- (Female)': 'de-CH-LeniNeural',
    'English (Australia)-Natasha- (Female)': 'en-AU-NatashaNeural',
    'English (Australia)-William- (Male)': 'en-AU-WilliamNeural',
    'English (Canada)-Clara- (Female)': 'en-CA-ClaraNeural',
    'English (Canada)-Liam- (Male)': 'en-CA-LiamNeural',
    'English (UK)-Libby- (Female)': 'en-GB-LibbyNeural',
    'English (UK)-Maisie- (Female)': 'en-GB-MaisieNeural',
    'English (UK)-Ryan- (Male)': 'en-GB-RyanNeural',
    'English (UK)-Sonia- (Female)': 'en-GB-SoniaNeural',
    'English (UK)-Thomas- (Male)': 'en-GB-ThomasNeural',
    'English (Hong Kong)-Sam- (Male)': 'en-HK-SamNeural',
    'English (Hong Kong)-Yan- (Female)': 'en-HK-YanNeural',
    'English (Ireland)-Connor- (Male)': 'en-IE-ConnorNeural',
    'English (Ireland)-Emily- (Female)': 'en-IE-EmilyNeural',
    'English (India)-Neerja- (Female)': 'en-IN-NeerjaNeural',
    'English (India)-Prabhat- (Male)': 'en-IN-PrabhatNeural',
    'English (Kenya)-Asilia- (Female)': 'en-KE-AsiliaNeural',
    'English (Kenya)-Chilemba- (Male)': 'en-KE-ChilembaNeural',
    'English (Nigeria)-Abeo- (Male)': 'en-NG-AbeoNeural',
    'English (Nigeria)-Ezinne- (Female)': 'en-NG-EzinneNeural',
    'English (New Zealand)-Mitchell- (Male)': 'en-NZ-MitchellNeural',
    'English (Philippines)-James- (Male)': 'en-PH-JamesNeural',
    'English (Philippines)-Rosa- (Female)': 'en-PH-RosaNeural',
    'English (Singapore)-Luna- (Female)': 'en-SG-LunaNeural',
    'English (Singapore)-Wayne- (Male)': 'en-SG-WayneNeural',
    'English (Tanzania)-Elimu- (Male)': 'en-TZ-ElimuNeural',
    'English (Tanzania)-Imani- (Female)': 'en-TZ-ImaniNeural',
    'English (South Africa)-Leah- (Female)': 'en-ZA-LeahNeural',
    'English (South Africa)-Luke- (Male)': 'en-ZA-LukeNeural',
    'Spanish (Argentina)-Elena- (Female)': 'es-AR-ElenaNeural',
    'Spanish (Argentina)-Tomas- (Male)': 'es-AR-TomasNeural',
    'Spanish (Bolivia)-Marcelo- (Male)': 'es-BO-MarceloNeural',
    'Spanish (Bolivia)-Sofia- (Female)': 'es-BO-SofiaNeural',
    'Spanish (Colombia)-Gonzalo- (Male)': 'es-CO-GonzaloNeural',
    'Spanish (Colombia)-Salome- (Female)': 'es-CO-SalomeNeural',
    'Spanish (Costa Rica)-Juan- (Male)': 'es-CR-JuanNeural',
    'Spanish (Costa Rica)-Maria- (Female)': 'es-CR-MariaNeural',
    'Spanish (Cuba)-Belkys- (Female)': 'es-CU-BelkysNeural',
    'Spanish (Dominican Republic)-Emilio- (Male)': 'es-DO-EmilioNeural',
    'Spanish (Dominican Republic)-Ramona- (Female)': 'es-DO-RamonaNeural',
    'Spanish (Ecuador)-Andrea- (Female)': 'es-EC-AndreaNeural',
    'Spanish (Ecuador)-Luis- (Male)': 'es-EC-LuisNeural',
    'Spanish (Spain)-Alvaro- (Male)': 'es-ES-AlvaroNeural',
    'Spanish (Spain)-Elvira- (Female)': 'es-ES-ElviraNeural',
    'Spanish (Equatorial Guinea)-Teresa- (Female)': 'es-GQ-TeresaNeural',
    'Spanish (Guatemala)-Andres- (Male)': 'es-GT-AndresNeural',
    'Spanish (Guatemala)-Marta- (Female)': 'es-GT-MartaNeural',
    'Spanish (Honduras)-Carlos- (Male)': 'es-HN-CarlosNeural',
    'Spanish (Honduras)-Karla- (Female)': 'es-HN-KarlaNeural',
    'Spanish (Nicaragua)-Federico- (Male)': 'es-NI-FedericoNeural',
    'Spanish (Nicaragua)-Yolanda- (Female)': 'es-NI-YolandaNeural',
    'Spanish (Panama)-Margarita- (Female)': 'es-PA-MargaritaNeural',
    'Spanish (Panama)-Roberto- (Male)': 'es-PA-RobertoNeural',
    'Spanish (Peru)-Alex- (Male)': 'es-PE-AlexNeural',
    'Spanish (Peru)-Camila- (Female)': 'es-PE-CamilaNeural',
    'Spanish (Puerto Rico)-Karina- (Female)': 'es-PR-KarinaNeural',
    'Spanish (Puerto Rico)-Victor- (Male)': 'es-PR-VictorNeural',
    'Spanish (Paraguay)-Mario- (Male)': 'es-PY-MarioNeural',
    'Spanish (Paraguay)-Tania- (Female)': 'es-PY-TaniaNeural',
    'Spanish (El Salvador)-Lorena- (Female)': 'es-SV-LorenaNeural',
    'Spanish (El Salvador)-Rodrigo- (Male)': 'es-SV-RodrigoNeural',
    'Spanish (United States)-Alonso- (Male)': 'es-US-AlonsoNeural',
    'Spanish (United States)-Paloma- (Female)': 'es-US-PalomaNeural',
    'Spanish (Uruguay)-Mateo- (Male)': 'es-UY-MateoNeural',
    'Spanish (Uruguay)-Valentina- (Female)': 'es-UY-ValentinaNeural',
    'Spanish (Venezuela)-Paola- (Female)': 'es-VE-PaolaNeural',
    'Spanish (Venezuela)-Sebastian- (Male)': 'es-VE-SebastianNeural',
    'Estonian (Estonia)-Anu- (Female)': 'et-EE-AnuNeural',
    'Estonian (Estonia)-Kert- (Male)': 'et-EE-KertNeural',
    'Persian (Iran)-Dilara- (Female)': 'fa-IR-DilaraNeural',
    'Persian (Iran)-Farid- (Male)': 'fa-IR-FaridNeural',
    'Finnish (Finland)-Harri- (Male)': 'fi-FI-HarriNeural',
    'Finnish (Finland)-Noora- (Female)': 'fi-FI-NooraNeural',
    'French (Belgium)-Charline- (Female)': 'fr-BE-CharlineNeural',
    'French (Belgium)-Gerard- (Male)': 'fr-BE-GerardNeural',
    'French (Canada)-Sylvie- (Female)': 'fr-CA-SylvieNeural',
    'French (Canada)-Antoine- (Male)': 'fr-CA-AntoineNeural',
    'French (Canada)-Jean- (Male)': 'fr-CA-JeanNeural',
    'French (Switzerland)-Ariane- (Female)': 'fr-CH-ArianeNeural',
    'French (Switzerland)-Fabrice- (Male)': 'fr-CH-FabriceNeural',
    'Irish (Ireland)-Colm- (Male)': 'ga-IE-ColmNeural',
    'Irish (Ireland)-Orla- (Female)': 'ga-IE-OrlaNeural',
    'Galician (Spain)-Roi- (Male)': 'gl-ES-RoiNeural',
    'Galician (Spain)-Sabela- (Female)': 'gl-ES-SabelaNeural',
    'Gujarati (India)-Dhwani- (Female)': 'gu-IN-DhwaniNeural',
    'Gujarati (India)-Niranjan- (Male)': 'gu-IN-NiranjanNeural',
    'Hindi (India)-Madhur- (Male)': 'hi-IN-MadhurNeural',
    'Hindi (India)-Swara- (Female)': 'hi-IN-SwaraNeural',
    'Croatian (Croatia)-Gabrijela- (Female)': 'hr-HR-GabrijelaNeural',
    'Croatian (Croatia)-Srecko- (Male)': 'hr-HR-SreckoNeural',
    'Hungarian (Hungary)-Noemi- (Female)': 'hu-HU-NoemiNeural',
    'Hungarian (Hungary)-Tamas- (Male)': 'hu-HU-TamasNeural',
    'Icelandic (Iceland)-Gudrun- (Female)': 'is-IS-GudrunNeural',
    'Icelandic (Iceland)-Gunnar- (Male)': 'is-IS-GunnarNeural',
    'Javanese (Indonesia)-Dimas- (Male)': 'jv-ID-DimasNeural',
    'Javanese (Indonesia)-Siti- (Female)': 'jv-ID-SitiNeural',
    'Georgian (Georgia)-Eka- (Female)': 'ka-GE-EkaNeural',
    'Georgian (Georgia)-Giorgi- (Male)': 'ka-GE-GiorgiNeural',
    'Kazakh (Kazakhstan)-Aigul- (Female)': 'kk-KZ-AigulNeural',
    'Kazakh (Kazakhstan)-Daulet- (Male)': 'kk-KZ-DauletNeural',
    'Khmer (Cambodia)-Piseth- (Male)': 'km-KH-PisethNeural',
    'Khmer (Cambodia)-Sreymom- (Female)': 'km-KH-SreymomNeural',
    'Kannada (India)-Gagan- (Male)': 'kn-IN-GaganNeural',
    'Kannada (India)-Sapna- (Female)': 'kn-IN-SapnaNeural',
    'Lao (Laos)-Chanthavong- (Male)': 'lo-LA-ChanthavongNeural',
    'Lao (Laos)-Keomany- (Female)': 'lo-LA-KeomanyNeural',
    'Lithuanian (Lithuania)-Leonas- (Male)': 'lt-LT-LeonasNeural',
    'Lithuanian (Lithuania)-Ona- (Female)': 'lt-LT-OnaNeural',
    'Latvian (Latvia)-Everita- (Female)': 'lv-LV-EveritaNeural',
    'Latvian (Latvia)-Nils- (Male)': 'lv-LV-NilsNeural',
    'Macedonian (North Macedonia)-Aleksandar- (Male)': 'mk-MK-AleksandarNeural',
    'Macedonian (North Macedonia)-Marija- (Female)': 'mk-MK-MarijaNeural',
    'Malayalam (India)-Midhun- (Male)': 'ml-IN-MidhunNeural',
    'Malayalam (India)-Sobhana- (Female)': 'ml-IN-SobhanaNeural',
    'Mongolian (Mongolia)-Bataa- (Male)': 'mn-MN-BataaNeural',
    'Mongolian (Mongolia)-Yesui- (Female)': 'mn-MN-YesuiNeural',
    'Marathi (India)-Aarohi- (Female)': 'mr-IN-AarohiNeural',
    'Marathi (India)-Manohar- (Male)': 'mr-IN-ManoharNeural',
    'Maltese (Malta)-Grace- (Female)': 'mt-MT-GraceNeural',
    'Maltese (Malta)-Joseph- (Male)': 'mt-MT-JosephNeural',
    'Burmese (Myanmar)-Nilar- (Female)': 'my-MM-NilarNeural',
    'Burmese (Myanmar)-Thiha- (Male)': 'my-MM-ThihaNeural',
    'Nepali (Nepal)-Hemkala- (Female)': 'ne-NP-HemkalaNeural',
    'Nepali (Nepal)-Sagar- (Male)': 'ne-NP-SagarNeural',
    'Dutch (Belgium)-Arnaud- (Male)': 'nl-BE-ArnaudNeural',
    'Dutch (Belgium)-Dena- (Female)': 'nl-BE-DenaNeural',
    'Polish (Poland)-Marek- (Male)': 'pl-PL-MarekNeural',
    'Polish (Poland)-Zofia- (Female)': 'pl-PL-ZofiaNeural',
    # was truncated to 'ps-AF-Gul', which is not a valid edge-tts voice id
    'Pashto (Afghanistan)-Gul Nawaz- (Male)': 'ps-AF-GulNawazNeural',
}



def download_audio(url):
    """Fetch the best audio stream from *url* via yt-dlp and return it as a
    ``(sample_rate, int16 ndarray)`` tuple suitable for a ``gr.Audio`` output.

    The download is converted to WAV by the FFmpegExtractAudio post-processor
    and written under the ``ytdl/`` directory.
    """
    options = {
        'format': 'bestaudio/best',
        'outtmpl': 'ytdl/%(title)s.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
    }

    with yt_dlp.YoutubeDL(options) as downloader:
        info = downloader.extract_info(url, download=True)
        # The post-processor swaps the original extension for .wav.
        wav_path = downloader.prepare_filename(info).rsplit('.', 1)[0] + '.wav'
        sample_rate, samples = read(wav_path)
        return sample_rate, np.asarray(samples, dtype=np.int16)




# Define a function to handle the entire separation process
def separate_audio(input_audio, output_dir, model_voc_inst, model_deecho, model_back_voc):
    """Run a three-stage source separation on *input_audio*.

    Stage 1 splits the track into instrumental and vocals, stage 2 removes
    echo/reverb from the vocals, and stage 3 splits the dry vocals into lead
    and backing parts.  All intermediate and final files are written into
    *output_dir*.

    Returns the six output paths:
    (instrumental, vocals, vocals_reverb, vocals_no_reverb, lead_vocals, backing_vocals)
    """
    # exist_ok avoids the check-then-create race of the old exists()+makedirs pair.
    os.makedirs(output_dir, exist_ok=True)

    separator = Separator(output_dir=output_dir)

    def _run_stage(model_filename, source, first_name, second_name):
        # Load a model, separate `source`, and rename the two produced stems.
        # NOTE(review): assumes Separator.separate returns the stem filenames
        # in a stable [first, second] order, as the original code did.
        separator.load_model(model_filename=model_filename)
        stems = separator.separate(source)
        first_path = os.path.join(output_dir, first_name)
        second_path = os.path.join(output_dir, second_name)
        os.rename(os.path.join(output_dir, stems[0]), first_path)
        os.rename(os.path.join(output_dir, stems[1]), second_path)
        return first_path, second_path

    # Stage 1: split the track into Instrumental and Vocals.
    instrumental, vocals = _run_stage(
        model_voc_inst, input_audio, 'Instrumental.wav', 'Vocals.wav')

    # Stage 2: apply DeEcho-DeReverb to the vocals.
    vocals_no_reverb, vocals_reverb = _run_stage(
        model_deecho, vocals, 'Vocals (No Reverb).wav', 'Vocals (Reverb).wav')

    # Stage 3: separate backing vocals from the lead vocals.
    backing_vocals, lead_vocals = _run_stage(
        model_back_voc, vocals_no_reverb, 'Backing Vocals.wav', 'Lead Vocals.wav')

    return instrumental, vocals, vocals_reverb, vocals_no_reverb, lead_vocals, backing_vocals


# Main function to process audio (Inference)
def process_audio(MODEL_NAME, SOUND_PATH, F0_CHANGE, F0_METHOD, MIN_PITCH, MAX_PITCH, CREPE_HOP_LENGTH, INDEX_RATE, 
                  FILTER_RADIUS, RMS_MIX_RATE, PROTECT, SPLIT_INFER, MIN_SILENCE, SILENCE_THRESHOLD, SEEK_STEP, 
                  KEEP_SILENCE, FORMANT_SHIFT, QUEFRENCY, TIMBRE, F0_AUTOTUNE, OUTPUT_FORMAT, upload_audio=None):
    """Run RVC inference on an audio file and return the generated audio path.

    If *SOUND_PATH* is empty, *upload_audio* is used instead.  With the UI's
    ``gr.File(type='filepath')`` component that argument arrives as a plain
    path string; a file-like object is also accepted and persisted to disk.
    Returns an error message string when no model name is given.
    """
    # If no sound path is given, fall back to the uploaded file.
    if not SOUND_PATH and upload_audio is not None:
        if isinstance(upload_audio, str):
            # gr.File(type='filepath') hands us the temp-file path directly;
            # the old code crashed here calling .name/.read() on a string.
            SOUND_PATH = upload_audio
        else:
            # File-like object: persist it so downstream code gets a real path.
            os.makedirs("uploaded_audio", exist_ok=True)
            SOUND_PATH = os.path.join("uploaded_audio", upload_audio.name)
            with open(SOUND_PATH, "wb") as f:
                f.write(upload_audio.read())

    # A model name is mandatory.
    if not MODEL_NAME:
        return "Please provide a model name."

    # stftpitchshift must be executable for formant shifting to work.
    os.system("chmod +x stftpitchshift")
    inferred_audio = infer_audio(
        MODEL_NAME,
        SOUND_PATH,
        F0_CHANGE,
        F0_METHOD,
        MIN_PITCH,
        MAX_PITCH,
        CREPE_HOP_LENGTH,
        INDEX_RATE,
        FILTER_RADIUS,
        RMS_MIX_RATE,
        PROTECT,
        SPLIT_INFER,
        MIN_SILENCE,
        SILENCE_THRESHOLD,
        SEEK_STEP,
        KEEP_SILENCE,
        FORMANT_SHIFT,
        QUEFRENCY,
        TIMBRE,
        F0_AUTOTUNE,
        OUTPUT_FORMAT
    )

    return inferred_audio


async def text_to_speech_edge(text, language_code):
    """Synthesize *text* with the edge-tts voice mapped to *language_code*
    in ``language_dict`` and return the path of the generated audio file.
    """
    voice = language_dict.get(language_code, "default_voice")
    communicate = edge_tts.Communicate(text, voice)
    # Reserve a unique temp path, then close the handle BEFORE edge-tts
    # writes to it: the old code saved while the handle was still open,
    # which fails on Windows where the file is exclusively locked.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path




def extract_zip(extraction_folder, zip_name):
    """Unpack a voice-model zip into *extraction_folder* and flatten it.

    The archive is deleted after extraction.  The largest-looking model file
    (a ``.pth`` over 40 MB) and feature index (an ``.index`` over 100 KB) are
    moved to the top of *extraction_folder*; any leftover sub-directories are
    removed.  Raises ``Exception`` when no suitable ``.pth`` file is found.
    """
    os.makedirs(extraction_folder)
    with zipfile.ZipFile(zip_name, 'r') as archive:
        archive.extractall(extraction_folder)
    os.remove(zip_name)

    index_filepath = None
    model_filepath = None
    for root, _dirs, files in os.walk(extraction_folder):
        for filename in files:
            candidate = os.path.join(root, filename)
            size = os.stat(candidate).st_size
            # Size thresholds filter out tiny placeholder/garbage files.
            if filename.endswith('.index') and size > 1024 * 100:
                index_filepath = candidate
            elif filename.endswith('.pth') and size > 1024 * 1024 * 40:
                model_filepath = candidate

    if not model_filepath:
        raise Exception(f'No .pth model file was found in the extracted zip. Please check {extraction_folder}.')

    # Move the model and index file to the top of the extraction folder.
    os.rename(model_filepath, os.path.join(extraction_folder, os.path.basename(model_filepath)))
    if index_filepath:
        os.rename(index_filepath, os.path.join(extraction_folder, os.path.basename(index_filepath)))

    # Drop any now-unnecessary nested folders.
    for entry in os.listdir(extraction_folder):
        entry_path = os.path.join(extraction_folder, entry)
        if os.path.isdir(entry_path):
            shutil.rmtree(entry_path)

def download_online_model(url, dir_name):
    """Download a voice-model zip from *url* and install it under
    ``models_dir/dir_name``.

    Supports direct links, pixeldrain share links (rewritten to the API
    endpoint) and Google Drive links (via gdown).  Raises ``Exception`` on
    any failure, including when the target model directory already exists.
    """
    try:
        print(f'[~] Downloading voice model with name {dir_name}...')
        zip_name = url.split('/')[-1]
        extraction_folder = os.path.join(models_dir, dir_name)
        if os.path.exists(extraction_folder):
            raise Exception(f'Voice model directory {dir_name} already exists! Choose a different name for your voice model.')

        if 'drive.google.com' in url:
            # gdown handles Drive's confirmation pages and fuzzy share links.
            zip_name = dir_name + ".zip"
            gdown.download(url, output=zip_name, use_cookies=True, quiet=True, fuzzy=True)
        else:
            if 'pixeldrain.com' in url:
                # Rewrite the share link into a direct-download API URL.
                url = f'https://pixeldrain.com/api/file/{zip_name}'
            urllib.request.urlretrieve(url, zip_name)

        print(f'[~] Extracting zip file...')
        extract_zip(extraction_folder, zip_name)
        print(f'[+] {dir_name} Model successfully downloaded!')

    except Exception as e:
        # Re-wrap so Gradio shows a clean message, but chain the original
        # exception (`from e`) so the real traceback is not lost in logs.
        raise Exception(str(e)) from e



# Gradio Blocks Interface with Tabs
with gr.Blocks(title="Hex RVC") as app:
    gr.Markdown("# Hex RVC")

    with gr.Tab("Inference"):
        with gr.Row():
            MODEL_NAME = gr.Textbox(label="Model Name", placeholder="Enter model name")
            SOUND_PATH = gr.Textbox(label="Audio Path (Optional)", placeholder="Leave blank to upload audio")
            upload_audio = gr.File(label="Upload Audio", type='filepath', file_types=["audio"])

        with gr.Row():
            F0_CHANGE = gr.Number(label="Pitch Change (semitones)", value=0)
            F0_METHOD = gr.Dropdown(choices=["crepe", "harvest", "mangio-crepe", "rmvpe", "rmvpe+", "fcpe", 
                                             "hybrid[mangio-crepe+rmvpe]", "hybrid[mangio-crepe+fcpe]", 
                                             "hybrid[rmvpe+fcpe]", "hybrid[mangio-crepe+rmvpe+fcpe]"], 
                                    label="F0 Method", value="fcpe")

        with gr.Row():
            MIN_PITCH = gr.Textbox(label="Min Pitch", value="50")
            MAX_PITCH = gr.Textbox(label="Max Pitch", value="1100")
            CREPE_HOP_LENGTH = gr.Number(label="Crepe Hop Length", value=120)
            INDEX_RATE = gr.Slider(label="Index Rate", minimum=0, maximum=1, value=0.75)
            FILTER_RADIUS = gr.Number(label="Filter Radius", value=3)
            RMS_MIX_RATE = gr.Slider(label="RMS Mix Rate", minimum=0, maximum=1, value=0.25)
            PROTECT = gr.Slider(label="Protect", minimum=0, maximum=1, value=0.33)

        with gr.Accordion("Hex TTS"):
            input_text = gr.Textbox(lines=5, label="Input Text")
            language = gr.Dropdown(choices=list(language_dict.keys()), label="Choose the Voice Model")
            tts_convert = gr.Button("Convert")
            # BUGFIX: the keyword is `outputs`, not `output` — the old call
            # raised a TypeError when the app was built.
            tts_convert.click(fn=text_to_speech_edge, inputs=[input_text, language], outputs=upload_audio)
        with gr.Accordion("Advanced Settings", open=False):
            SPLIT_INFER = gr.Checkbox(label="Enable Split Inference", value=False)
            MIN_SILENCE = gr.Number(label="Min Silence (ms)", value=500)
            SILENCE_THRESHOLD = gr.Number(label="Silence Threshold (dBFS)", value=-50)
            SEEK_STEP = gr.Slider(label="Seek Step (ms)", minimum=1, maximum=10, value=1)
            KEEP_SILENCE = gr.Number(label="Keep Silence (ms)", value=200)
            FORMANT_SHIFT = gr.Checkbox(label="Enable Formant Shift", value=False)
            QUEFRENCY = gr.Number(label="Quefrency", value=0)
            TIMBRE = gr.Number(label="Timbre", value=1)
            F0_AUTOTUNE = gr.Checkbox(label="Enable F0 Autotune", value=False)
            OUTPUT_FORMAT = gr.Dropdown(choices=["wav", "flac", "mp3"], label="Output Format", value="wav")

        run_button = gr.Button("Run Inference")
        output_audio = gr.Audio(label="Generated Audio", type='filepath')

        run_button.click(
            process_audio, 
            inputs=[MODEL_NAME, SOUND_PATH, F0_CHANGE, F0_METHOD, MIN_PITCH, MAX_PITCH, CREPE_HOP_LENGTH, INDEX_RATE, 
                    FILTER_RADIUS, RMS_MIX_RATE, PROTECT, SPLIT_INFER, MIN_SILENCE, SILENCE_THRESHOLD, SEEK_STEP, 
                    KEEP_SILENCE, FORMANT_SHIFT, QUEFRENCY, TIMBRE, F0_AUTOTUNE, OUTPUT_FORMAT, upload_audio], 
            outputs=output_audio
        )

    # BUGFIX: was `gr.Tqb`, which crashed the app at startup with AttributeError.
    with gr.Tab("Download RVC Model"):
        url = gr.Textbox(label="Your model URL")
        dirname = gr.Textbox(label="Your Model name")
        button_model = gr.Button("Download model")
        # BUGFIX: `outputs` keyword (was `output`).
        button_model.click(fn=download_online_model, inputs=[url, dirname], outputs=[dirname])
    with gr.Tab("Audio Separation"):
        with gr.Row():
            # NOTE(review): `source=` is Gradio 3.x API (4.x renamed it to
            # `sources`); kept as-is to match the version this app targets.
            input_audio = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
            output_dir = gr.Textbox(value="/content/output", label="Output Directory")

            with gr.Accordion("Separation by Link", open=False):
                with gr.Row():
                    roformer_link = gr.Textbox(
                        label="Link",
                        placeholder="Paste the link here",
                        interactive=True
                    )
                with gr.Row():
                    gr.Markdown("You can paste the link to the video/audio from many sites, check the complete list [here](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)")
                with gr.Row():
                    roformer_download_button = gr.Button(
                        "Download!",
                        variant="primary"
                    )

            roformer_download_button.click(download_audio, [roformer_link], [input_audio])

        with gr.Row():
            model_voc_inst = gr.Textbox(value='model_bs_roformer_ep_317_sdr_12.9755.ckpt', label="Vocal & Instrumental Model")
            model_deecho = gr.Textbox(value='UVR-DeEcho-DeReverb.pth', label="DeEcho-DeReverb Model")
            model_back_voc = gr.Textbox(value='mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt', label="Backing Vocals Model")

        separate_button = gr.Button("Separate Audio")

        with gr.Row():
            instrumental_out = gr.Audio(label="Instrumental")
            vocals_out = gr.Audio(label="Vocals")
            vocals_reverb_out = gr.Audio(label="Vocals (Reverb)")
            vocals_no_reverb_out = gr.Audio(label="Vocals (No Reverb)")
            lead_vocals_out = gr.Audio(label="Lead Vocals")
            backing_vocals_out = gr.Audio(label="Backing Vocals")

        separate_button.click(
            separate_audio,
            inputs=[input_audio, output_dir, model_voc_inst, model_deecho, model_back_voc],
            outputs=[instrumental_out, vocals_out, vocals_reverb_out, vocals_no_reverb_out, lead_vocals_out, backing_vocals_out]
        )


# Launch the Gradio app. share=True exposes a public tunnel URL (needed on
# Colab-style hosts); debug=True keeps the process attached and prints errors.
app.launch(share=True, debug=True)