Spaces:
Running
Running
import whisper as openai_whisper | |
from transformers import AutoModelForCausalLM, AutoTokenizer | |
from TTS.api import TTS | |
import gradio as gr | |
import torch | |
import os | |
# 1. Speech-to-Text (STT) Implementation
def setup_stt():
    """Load the speech-to-text model.

    Returns:
        The object returned by ``openai_whisper.load_model("base")``.
    """
    # Explicit OpenAI Whisper, "base" checkpoint.
    return openai_whisper.load_model("base")
def transcribe_audio(model, audio_file):
    """Transcribe *audio_file* with *model* and return the recognised text.

    Args:
        model: Object exposing ``transcribe(audio_file)`` whose result can be
            indexed with ``'text'``.
        audio_file: Value forwarded unchanged to ``model.transcribe``.

    Returns:
        The ``'text'`` entry of the transcription result. The same text is
        also printed to stdout.
    """
    transcript = model.transcribe(audio_file)['text']
    print("Transcription:", transcript)
    return transcript
# 2. Natural Language Processing (NLP) Implementation
def setup_nlp():
    """Load the tokenizer and causal language model used for replies.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded via ``from_pretrained``
        with the model name ``"gpt2"``.
    """
    checkpoint = "gpt2"
    # Tuple elements are evaluated left to right: tokenizer first, then model.
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForCausalLM.from_pretrained(checkpoint),
    )
def generate_response(tokenizer, model, input_text):
    """Generate the assistant's reply to *input_text*.

    The text is wrapped in a ``User: ...\\nAssistant:`` prompt, the model
    continues it, and only the continuation is returned.

    Args:
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and
            ``eos_token_id``.
        model: Causal language model exposing ``generate``. If it has a
            ``device`` attribute, the prompt ids are moved there first.
        input_text: The user's utterance.

    Returns:
        The generated reply as a string, without the prompt and with
        surrounding whitespace removed.
    """
    prompt = f"User: {input_text}\nAssistant:"
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # The caller may have moved the model to a GPU; the prompt ids must live
    # on the same device or generation fails with a device mismatch.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)

    prompt_len = len(input_ids[0])
    output_ids = model.generate(
        input_ids,
        # Budget applies to newly generated tokens only, so a long prompt
        # can no longer use up the whole length limit.
        max_new_tokens=150,
        num_return_sequences=1,
        # temperature / top_p only take effect when sampling is enabled.
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=2,
    )

    # The output sequence starts with the prompt; keep only the continuation
    # so the prompt is not echoed back (and spoken) as part of the reply.
    reply_ids = output_ids[0][prompt_len:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()
# 3. Text-to-Speech (TTS) Implementation
def setup_tts():
    """Create the text-to-speech engine.

    Returns:
        A ``TTS`` instance constructed with the model name
        ``"tts_models/en/ljspeech/tacotron2-DDC"``.
    """
    return TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
def generate_speech(tts, text, file_path="output.wav"):
    """Synthesize *text* to an audio file and return the file's path.

    Args:
        tts: Object exposing ``tts_to_file(text, file_path=...)``.
        text: The text to speak.
        file_path: Destination path passed to ``tts_to_file``.

    Returns:
        ``file_path``, unchanged.
    """
    destination = file_path
    tts.tts_to_file(text, file_path=destination)
    return destination
# 4. Voice AI System Class
class VoiceAISystem:
    """Voice pipeline: speech -> text -> generated reply -> speech.

    Instance attributes set by ``__init__``:
        stt_model: result of ``setup_stt()``.
        tokenizer, nlp_model: result of ``setup_nlp()``; the model is moved
            to ``device``.
        tts_model: result of ``setup_tts()``.
        device: ``torch.device`` — ``cuda`` when available, otherwise ``cpu``.
    """

    def __init__(self):
        """Load the three models and move the language model to ``device``."""
        print("Initializing Voice AI System...")
        print("Loading STT model...")
        self.stt_model = setup_stt()
        print("Loading NLP model...")
        self.tokenizer, self.nlp_model = setup_nlp()
        print("Loading TTS model...")
        self.tts_model = setup_tts()
        # GPU Optimization
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        self.nlp_model = self.nlp_model.to(self.device)
        print("System initialization complete!")

    def process_audio(self, audio_file):
        """Run one request through the full pipeline.

        Args:
            audio_file: Audio input forwarded to ``transcribe_audio``.

        Returns:
            A 3-tuple ``(audio_path, transcribed_text, response_text)``.
            ``audio_path`` is ``None`` when no speech was recognised or when
            any step failed; in the failure case the second element is
            ``"Error: <message>"`` and the third is
            ``"Error processing request"``. This method does not raise.
        """
        try:
            os.makedirs("tmp", exist_ok=True)

            print("Transcribing audio...")
            text = transcribe_audio(self.stt_model, audio_file)
            if not text or not text.strip():
                # Nothing was recognised: skip generation and synthesis
                # instead of prompting the language model with empty input.
                return None, text, "No speech detected"

            print("Generating response...")
            # torch.autocast replaces the deprecated torch.cuda.amp.autocast;
            # mixed precision is only enabled when running on CUDA.
            device_type = self.device.type
            with torch.autocast(device_type=device_type, enabled=device_type == "cuda"):
                response = generate_response(self.tokenizer, self.nlp_model, text)

            print("Converting response to speech...")
            output_path = os.path.join("tmp", "response.wav")
            audio_response = generate_speech(self.tts_model, response, output_path)
            return audio_response, text, response
        except Exception as e:
            # UI boundary: report the failure through the return value so
            # the caller always receives a 3-tuple.
            print(f"Error during processing: {e}")
            return None, f"Error: {e}", "Error processing request"
# 5. Gradio UI Integration
def create_voice_ai_interface():
    """Build the Gradio interface backed by a new ``VoiceAISystem``.

    Returns:
        A ``gr.Interface`` with one audio input (delivered as a file path)
        and three outputs: reply audio, transcribed text, reply text.
    """
    system = VoiceAISystem()

    def chat(audio):
        """Handle one submission; return (audio, transcription, reply)."""
        if audio is not None:
            return system.process_audio(audio)
        return None, "No audio provided", "No response generated"

    # Components are created in the same order as they appear in the UI:
    # the input first, then the three outputs.
    speech_input = gr.Audio(type="filepath", label="Speak here")
    reply_outputs = [
        gr.Audio(label="AI Response"),
        gr.Textbox(label="Transcribed Text"),
        gr.Textbox(label="AI Response Text"),
    ]

    return gr.Interface(
        fn=chat,
        inputs=[speech_input],
        outputs=reply_outputs,
        title="Voice AI System",
        description="Click to record your voice and interact with the AI",
    )
# Launch the interface
if __name__ == "__main__":
    # Build the UI only when run as a script, then launch it with share=True.
    create_voice_ai_interface().launch(share=True)