from transformers import pipeline
import gradio as gr
from gtts import gTTS
from openai import OpenAI



# Load the Whisper model for speech-to-text
pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Load the text generation model
# text_pipe = pipeline("text2text-generation", model="google/flan-t5-base")

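# Send the transcribed text to the OpenAI Chat Completions API and return the model's reply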
def generate_gpt_response(text, api_key):
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model='gpt-3.5-turbo-0125',
        messages=[{"role": "user", "content": text}]
    )
    
    return response.choices[0].message.content

def transcribe(audio, api_key):
    # Transcribe the audio to text
    text = pipe(audio)["text"]
    
    # Generate a response from the transcribed text
    # lm_response = text_pipe(text)[0]["generated_text"]
    lm_response = generate_gpt_response(text, api_key)
    # Convert the response text to speech (gTTS is configured with a Korean voice here)
    tts = gTTS(lm_response, lang='ko')
    
    # Save the generated audio
    out_audio = "output_audio.mp3"
    tts.save(out_audio)
    
    return out_audio

# Create the Gradio interface
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Textbox(label="OpenAI API Key", type="password")  # Add a textbox for the API key
    ],
    outputs=gr.Audio(type="filepath"),
    title="Whisper Small Glaswegian",
    description="Demo for Glaswegian speech recognition with the Whisper small model: the transcript is sent to GPT-3.5 and the reply is read aloud with gTTS."
)

# Launch the interface
iface.launch(share=True)