"""Gradio front-end exposing four tabs in a single app.

The tabs are "Speech-to-text", "Text-to-speech", "Language Identification"
and "Chat with Llama".  Each tab is a ``gr.Interface`` wrapping a handler
imported from a sibling module (``transcribe``, ``synthesize``, ``identify``,
``generate``).  The interfaces are combined with ``gr.TabbedInterface`` and
rendered inside a ``gr.Blocks`` app bound to the module-level name ``demo``,
which is then queued and launched.
"""
import gradio as gr
import librosa
from asr import transcribe, ASR_EXAMPLES, ASR_LANGUAGES, ASR_NOTE
from tts import synthesize, TTS_EXAMPLES, TTS_LANGUAGES
from lid import identify, LID_EXAMPLES
from generate import generate, GenExamples

MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024

# Labels of the radio button that selects which audio widget is shown.
_MIC_CHOICE = "Record from Mic"
_UPLOAD_CHOICE = "Upload audio"


def _build_audio_source_inputs():
    """Create the (selector, mic, upload) widgets used by the audio tabs.

    Returns:
        A 3-tuple ``(selector, mic, upload)``: a ``gr.Radio`` defaulting to
        the microphone choice, a microphone ``gr.Audio`` and an upload
        ``gr.Audio`` that starts hidden.  Both audio widgets hand a file
        path to the handler (``type="filepath"``).
    """
    selector = gr.Radio(
        [_MIC_CHOICE, _UPLOAD_CHOICE],
        label="Audio input",
        value=_MIC_CHOICE,
    )
    mic = gr.Audio(source="microphone", type="filepath", label="Use mic")
    upload = gr.Audio(
        source="upload", type="filepath", label="Upload file", visible=False
    )
    return selector, mic, upload


def _source_visibility_updates(choice):
    """Return visibility updates for the (mic, upload) widgets.

    Args:
        choice: The currently selected radio label.

    Returns:
        A two-element list of ``gr.update`` objects; the first shows the mic
        widget only for the mic choice, the second shows the upload widget
        only for the upload choice.
    """
    return [
        gr.update(visible=choice == _MIC_CHOICE),
        gr.update(visible=choice == _UPLOAD_CHOICE),
    ]


def _wire_source_toggle(selector, mic, upload):
    """Register the change handler that swaps the mic/upload widgets.

    Must be called inside an active ``gr.Blocks`` context so the event is
    attached to that app.
    """
    selector.change(
        _source_visibility_updates,
        inputs=[selector],
        outputs=[mic, upload],
        queue=False,  # UI-only toggle; bypass the request queue.
    )


def _language_dropdown(languages):
    """Build a language dropdown whose choices read ``"<key> (<value>)"``.

    Args:
        languages: Mapping whose items are rendered as dropdown choices.
    """
    # NOTE(review): the default "eng English" has no parentheses, so it can
    # never equal one of the "<key> (<value>)" choices built below.  It is
    # kept unchanged because how the handlers parse this string is not
    # visible from this file -- confirm before normalising it.
    return gr.Dropdown(
        [f"{k} ({v})" for k, v in languages.items()],
        label="Language",
        value="eng English",
    )


# --- Speech-to-text tab ------------------------------------------------------
(
    mms_select_source_trans,
    mms_mic_source_trans,
    mms_upload_source_trans,
) = _build_audio_source_inputs()

mms_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        mms_select_source_trans,
        mms_mic_source_trans,
        mms_upload_source_trans,
        _language_dropdown(ASR_LANGUAGES),
    ],
    outputs="text",
    examples=ASR_EXAMPLES,
    title="Speech-to-text",
    description=(
        "Transcribe audio from a microphone or input file in your desired language."
    ),
    article=ASR_NOTE,
    allow_flagging="never",
)

# --- Text-to-speech tab ------------------------------------------------------
mms_synthesize = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Text(label="Input text"),
        _language_dropdown(TTS_LANGUAGES),
        gr.Slider(minimum=0.1, maximum=4.0, value=1.0, step=0.1, label="Speed"),
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="numpy"),
        gr.Text(label="Filtered text after removing OOVs"),
    ],
    examples=TTS_EXAMPLES,
    title="Text-to-speech",
    description=("Generate audio in your desired language from input text."),
    allow_flagging="never",
)

# --- Chat tab ----------------------------------------------------------------
chat_interface = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(label="Message", type="text"),
        gr.Textbox(label="Chat History", type="text"),
        gr.Textbox(label="System prompt", type="text"),
    ],
    outputs=gr.Textbox(),
    title="Chat Interface",
    description="Interactive chat interface using Hugging Face Transformers.",
)

# --- Language identification tab ---------------------------------------------
(
    mms_select_source_iden,
    mms_mic_source_iden,
    mms_upload_source_iden,
) = _build_audio_source_inputs()

mms_identify = gr.Interface(
    fn=identify,
    inputs=[
        mms_select_source_iden,
        mms_mic_source_iden,
        mms_upload_source_iden,
    ],
    outputs=gr.Label(num_top_classes=10),
    examples=LID_EXAMPLES,
    title="Language Identification",
    description=("Identify the language of input audio."),
    allow_flagging="never",
)

tabbed_interface = gr.TabbedInterface(
    [mms_transcribe, mms_synthesize, mms_identify, chat_interface],
    ["Speech-to-text", "Text-to-speech", "Language Identification", "Chat with Llama"],
)

with gr.Blocks() as demo:
    tabbed_interface.render()
    # The toggles are wired here, after render(), so the change events are
    # registered on this Blocks app.
    _wire_source_toggle(
        mms_select_source_trans, mms_mic_source_trans, mms_upload_source_trans
    )
    _wire_source_toggle(
        mms_select_source_iden, mms_mic_source_iden, mms_upload_source_iden
    )

demo.queue(concurrency_count=3)
demo.launch()