Spaces:

camanalo1
/

MyAlexa

Sleeping

File size: 1,488 Bytes

35f8a26
0f4f655
4ae34f2
 
 
0f4f655
 
9a710b0
 
 
 
15736b9
9a710b0
 
0e6d273
 
0f4f655
 
0e6d273
 
 
4ae34f2
 
0e6d273
4ae34f2
 
0e6d273
4ae34f2
0e6d273
 
4ae34f2
 
0e6d273
4ae34f2
 
 
 
 
0f4f655
 
 
 
 
3fdc3cc
0f4f655
 
 
 
 
0e6d273

import gradio as gr
from transformers import pipeline, VitsTokenizer, VitsModel, set_seed
import numpy as np
import torch
import io
import soundfile as sf

# Speech recognition stage (audio -> text).
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="facebook/s2t-small-librispeech-asr",
)

# Text generation stage (transcript -> generated text).
generator = pipeline(task="text-generation", model="gpt2")

# Speech synthesis stage (text -> waveform); the tokenizer and the model are
# both loaded from the same checkpoint.
_TTS_CHECKPOINT = "facebook/mms-tts-eng"
tokenizer = VitsTokenizer.from_pretrained(_TTS_CHECKPOINT)
model = VitsModel.from_pretrained(_TTS_CHECKPOINT)

def transcribe_and_generate_audio(audio):
    """Transcribe a recording, extend the text with the LLM, and speak it.

    The recording is normalised, transcribed with ``transcriber``, the
    transcript is passed to ``generator``, and the generated text is
    synthesised with ``tokenizer``/``model`` and written to a WAV file.

    Args:
        audio: ``(sample_rate, samples)`` tuple, or ``None`` when nothing was
            recorded. ``samples`` is a NumPy array; a 2-D array is treated as
            ``(samples, channels)`` and averaged down to mono (assumes the
            channel-last layout Gradio documents -- confirm if another caller
            is added).

    Returns:
        Path of the WAV file containing the synthesised speech, or ``None``
        when there is no audio to process.
    """
    # Submitting without a recording yields None; bail out instead of
    # crashing on the tuple unpacking below.
    if audio is None:
        return None

    sr, y = audio
    y = y.astype(np.float32)  # astype copies, so the caller's array is not mutated
    if y.size == 0:
        # np.max raises on an empty array; there is nothing to transcribe.
        return None
    if y.ndim > 1:
        # Downmix to a single channel before recognition.
        y = y.mean(axis=1)

    # Peak-normalise, but skip the division for an all-zero (silent) signal,
    # which would otherwise fill the array with NaNs.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # Transcribe audio
    asr_output = transcriber({"sampling_rate": sr, "raw": y})["text"]

    # Generate text based on ASR output
    generated_text = generator(asr_output)[0]['generated_text']

    # Generate audio from text; the seed is fixed before every synthesis call.
    inputs = tokenizer(text=generated_text, return_tensors="pt")
    set_seed(555)
    with torch.no_grad():
        outputs = model(**inputs)
    waveform = outputs.waveform[0]

    # Use the rate the TTS model was configured with; fall back to the
    # previously hard-coded 16 kHz if the config does not expose it.
    sampling_rate = getattr(model.config, "sampling_rate", 16000)

    # NOTE: fixed relative path -- every call overwrites the same file.
    waveform_path = "output.wav"
    sf.write(waveform_path, waveform.numpy(), sampling_rate, format='wav')

    return waveform_path

# Build the web UI: a microphone recording goes in, the audio file returned by
# transcribe_and_generate_audio comes out.
audio_input = gr.Interface(
    fn=transcribe_and_generate_audio,
    inputs=gr.Audio(sources=["microphone"], label="Speak Here"),
    outputs="audio",
    title="ASR -> LLM -> TTS",
    description="Speak into the microphone and hear the generated audio.",
)

# Start serving the app.
audio_input.launch()