import gradio as gr
from transformers import pipeline
import soundfile as sf
from huggingface_hub import InferenceClient

# Initialize Facebook MMS ASR model
asr_model = pipeline("automatic-speech-recognition", model="facebook/mms-1b-all")

# Initialize Facebook MMS TTS model
# NOTE: MMS TTS is published as per-language checkpoints; the bare "facebook/mms-tts"
# repo is not directly loadable, so the English checkpoint ("eng") is assumed here.
tts_model = pipeline("text-to-speech", model="facebook/mms-tts-eng")

# Initialize the Chat Model (Gemma-2-9B or Futuresony.gguf)
chat_client = InferenceClient("Futuresony/future_ai_12_10_2024.gguf")  # Change if needed


def asr_chat_tts(audio):
    """
    1. Convert speech to text (ASR)
    2. Process the text through the chat model (LLM)
    3. Convert the response to speech (TTS)
    """
    # Step 1: Transcribe speech using Facebook MMS ASR
    transcription = asr_model(audio)["text"]

    # Step 2: Process text through the chat model
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": transcription},
    ]
    response = ""
    for msg in chat_client.chat_completion(messages, max_tokens=512, stream=True):
        token = msg.choices[0].delta.content
        response += token or ""  # delta.content can be None on the final stream chunk

    # Step 3: Convert response to speech using Facebook MMS TTS
    speech = tts_model(response)
    output_file = "generated_speech.wav"
    # The TTS pipeline returns a (1, num_samples) array; squeeze it so soundfile
    # writes a mono waveform instead of treating samples as channels
    sf.write(output_file, speech["audio"].squeeze(), samplerate=speech["sampling_rate"])

    return transcription, response, output_file


# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("## Speech-to-Speech Chat (MMS ASR + LLM + MMS TTS)")  # title text assumed; the original string was truncated
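    # --- Reconstructed from here on: the original file was cut off after gr.Markdown(.
    # The components below are a minimal sketch of the remaining interface, wiring
    # asr_chat_tts to one audio input and three outputs; the labels and layout are
    # assumptions, not the original author's code.
    audio_in = gr.Audio(type="filepath", label="Speak or upload audio")
    transcription_box = gr.Textbox(label="Transcription")
    response_box = gr.Textbox(label="Assistant Response")
    speech_out = gr.Audio(label="Spoken Response")
    run_btn = gr.Button("Run")
    run_btn.click(asr_chat_tts, inputs=audio_in, outputs=[transcription_box, response_box, speech_out])

if __name__ == "__main__":
    demo.launch()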