from transformers import pipeline
import gradio as gr
from gtts import gTTS
from openai import OpenAI



# Load the Whisper model for speech-to-text
pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Load the text generation model
# text_pipe = pipeline("text2text-generation", model="google/flan-t5-base")

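# Send the transcribed text to the OpenAI Chat Completions API and return the model's reply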
def generate_gpt_response(text, api_key):
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model='gpt-3.5-turbo-0125',
        messages=[{"role": "user", "content": text}]
    )
    
    return response.choices[0].message.content

def transcribe(audio, api_key):
    # Transcribe the audio to text
    text = pipe(audio)["text"]
    
    # Generate a response from the transcribed text
    # lm_response = text_pipe(text)[0]["generated_text"]
    lm_response = generate_gpt_response(text, api_key)
    # Convert the response text to speech (gTTS is configured with a Korean voice here)
    tts = gTTS(lm_response, lang='ko')
    
    # Save the generated audio
    out_audio = "output_audio.mp3"
    tts.save(out_audio)
    
    return out_audio

# Create the Gradio interface
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Textbox(label="OpenAI API Key", type="password")  # Add a textbox for the API key
    ],
    outputs=gr.Audio(type="filepath"),
    title="Whisper Small Glaswegian",
    description="Demo for Glaswegian speech recognition with the Whisper small model: the transcript is sent to GPT-3.5 and the reply is read aloud with gTTS."
)

# Launch the interface
iface.launch(share=True)