"""Voice chat demo: speech -> Whisper transcription -> GPT reply -> TTS audio.

Pipeline: Gradio records audio, Whisper transcribes it, the transcript is
sent to the OpenAI chat API, and the text reply is synthesized with gTTS.
"""

from gtts import gTTS
from openai import OpenAI
from transformers import pipeline

import gradio as gr

# Load the Whisper model for speech-to-text once at startup (downloads on
# first run, so this is an expensive module-level side effect).
pipe = pipeline(model="openai/whisper-small")


def generate_gpt_response(text, api_key):
    """Send *text* as a single user message to the OpenAI chat API.

    Args:
        text: The user's message (here, the Whisper transcript).
        api_key: OpenAI API key, supplied per-request by the UI.

    Returns:
        The assistant's reply as a plain string.
    """
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model='gpt-3.5-turbo-0125',
        messages=[{"role": "user", "content": text}],
    )
    return response.choices[0].message.content


def transcribe(audio, api_key):
    """Transcribe audio, generate a GPT reply, and return it as speech.

    Args:
        audio: Filepath to the recorded audio (Gradio `type="filepath"`).
        api_key: OpenAI API key forwarded to `generate_gpt_response`.

    Returns:
        Filepath of the synthesized MP3 reply.
    """
    # Transcribe the audio to text.
    text = pipe(audio)["text"]

    # Generate a response from the transcribed text.
    lm_response = generate_gpt_response(text, api_key)

    # Convert the response text to speech.
    # NOTE(review): lang='ko' (Korean) looks inconsistent with the
    # "Glaswegian" title/description below — confirm the intended language.
    tts = gTTS(lm_response, lang='ko')

    # Save the generated audio. NOTE: a fixed filename means concurrent
    # requests overwrite each other's output.
    out_audio = "output_audio.mp3"
    tts.save(out_audio)
    return out_audio


# Create the Gradio interface.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Textbox(label="OpenAI API Key", type="password"),  # Add a textbox for the API key
    ],
    outputs=gr.Audio(type="filepath"),
    title="Whisper Small Glaswegian",
    description="Realtime demo for Glaswegian speech recognition using a fine-tuned Whisper small model.",
)

if __name__ == "__main__":
    # Launch the interface (guarded so importing this module doesn't start
    # a public-sharing Gradio server as a side effect).
    iface.launch(share=True)