# Voice AI pipeline: Whisper (STT) -> GPT-2 (NLP) -> Coqui TTS (speech) -> Gradio UI.
# Dependencies (PyPI names, assumed): openai-whisper, transformers, TTS, gradio, torch.
import whisper as openai_whisper  # PyPI package "openai-whisper" imports as "whisper"
from transformers import AutoModelForCausalLM, AutoTokenizer
from TTS.api import TTS
import gradio as gr
import torch
import os


# 1. Speech-to-Text (STT) Implementation
def setup_stt():
    # Explicit OpenAI Whisper; "base" trades accuracy for loading speed
    model = openai_whisper.load_model("base")
    return model


def transcribe_audio(model, audio_file):
    result = model.transcribe(audio_file)
    print("Transcription:", result['text'])
    return result['text']


# 2. Natural Language Processing (NLP) Implementation
def setup_nlp():
    model_name = "gpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return tokenizer, model


def generate_response(tokenizer, model, input_text):
    prompt = f"User: {input_text}\nAssistant:"
    # Tokenize onto the same device as the model (it may have been moved to
    # CUDA) and pass the attention mask explicitly to avoid a device mismatch.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(
        **inputs,
        max_new_tokens=100,  # bounds only the reply (replaces max_length=150, which also counted prompt tokens)
        num_return_sequences=1,
        do_sample=True,  # required for temperature/top_p to take effect
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=2
    )
    # Decode only the newly generated tokens so the prompt is not echoed
    # back (and later spoken aloud) as part of the reply.
    new_tokens = output[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


# 3. Text-to-Speech (TTS) Implementation
def setup_tts():
    tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
    return tts


def generate_speech(tts, text, file_path="output.wav"):
    tts.tts_to_file(text=text, file_path=file_path)
    return file_path


# 4. Voice AI System Class
class VoiceAISystem:
    def __init__(self):
        print("Initializing Voice AI System...")
        print("Loading STT model...")
        self.stt_model = setup_stt()
        print("Loading NLP model...")
        self.tokenizer, self.nlp_model = setup_nlp()
        print("Loading TTS model...")
        self.tts_model = setup_tts()
        # GPU optimization: move the language model to CUDA when available
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        self.nlp_model = self.nlp_model.to(self.device)
        print("System initialization complete!")

    def process_audio(self, audio_file):
        try:
            os.makedirs("tmp", exist_ok=True)
            print("Transcribing audio...")
            text = transcribe_audio(self.stt_model, audio_file)
            print("Generating response...")
            # Mixed precision on GPU; a no-op on CPU (enabled=False)
            with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
                response = generate_response(self.tokenizer, self.nlp_model, text)
            print("Converting response to speech...")
            output_path = os.path.join("tmp", "response.wav")
            audio_response = generate_speech(self.tts_model, response, output_path)
            return audio_response, text, response
        except Exception as e:
            print(f"Error during processing: {str(e)}")
            return None, f"Error: {str(e)}", "Error processing request"


# 5. Gradio UI Integration
def create_voice_ai_interface():
    system = VoiceAISystem()

    def chat(audio):
        if audio is None:
            return None, "No audio provided", "No response generated"
        return system.process_audio(audio)

    interface = gr.Interface(
        fn=chat,
        inputs=[
            gr.Audio(
                type="filepath",
                label="Speak here"
            )
        ],
        outputs=[
            gr.Audio(label="AI Response"),
            gr.Textbox(label="Transcribed Text"),
            gr.Textbox(label="AI Response Text")
        ],
        title="Voice AI System",
        description="Click to record your voice and interact with the AI"
    )
    return interface


# Launch the interface
if __name__ == "__main__":
    iface = create_voice_ai_interface()
    iface.launch(share=True)
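
# ---------------------------------------------------------------------------
# Optional headless smoke test (a minimal sketch, not part of the Gradio
# flow). "sample.wav" is a hypothetical recording you would supply; uncomment
# to run a single STT -> NLP -> TTS turn without launching the web UI.
# ---------------------------------------------------------------------------
# system = VoiceAISystem()
# wav_path, transcript, reply = system.process_audio("sample.wav")
# print("Transcript:", transcript)
# print("Reply:", reply)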