File size: 5,240 Bytes
d7dfe8c
79a67ac
 
 
 
 
 
 
d7dfe8c
79a67ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import streamlit as st
from openai import OpenAI
import sounddevice as sd
import scipy.io.wavfile
import io
import base64
import os
import time


st.set_page_config(page_title="Voice Bot", layout="wide")

# Configuration
SAMPLE_RATE = 44100  # audio sample rate in Hz
RECORD_DURATION = 5  # seconds captured per recording
TEMP_AUDIO_FILE = "temp_audio.wav"  # scratch file used by transcribe_audio()

# Initialize OpenAI client
# NOTE(review): assumes st.secrets['openai'] holds the API key string
# directly (not a nested table) — confirm against .streamlit/secrets.toml
api_key = st.secrets['openai']
client = OpenAI(api_key=api_key)

# Initialize session state variables if they don't exist
# (Streamlit reruns this script on every interaction, so guards are required.)
if 'recorded_audio' not in st.session_state:
    st.session_state.recorded_audio = None  # BytesIO of last recording
if 'user_text' not in st.session_state:
    st.session_state.user_text = None  # last Whisper transcription
if 'ai_reply' not in st.session_state:
    st.session_state.ai_reply = None  # last chat-model reply

def load_context():
    """Load the bot's knowledge context from ``context.txt``.

    Returns:
        str: The file contents, or an empty string (after surfacing a
        Streamlit error) when the file does not exist.
    """
    try:
        # Explicit encoding avoids platform-dependent decoding
        # (e.g. cp1252 on Windows) for non-ASCII context text.
        with open("context.txt", "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        st.error("Context file not found!")
        return ""

def record_audio(duration=None):
    """Record mono audio from the default input device.

    Args:
        duration: Recording length in seconds. Defaults to the
            module-level ``RECORD_DURATION`` (backward compatible with
            the original zero-argument call).

    Returns:
        io.BytesIO: A seekable buffer containing the recording as WAV.
    """
    if duration is None:
        duration = RECORD_DURATION

    progress_bar = st.progress(0)
    # sd.rec() is non-blocking; capture proceeds while we animate the bar.
    recording = sd.rec(int(duration * SAMPLE_RATE),
                       samplerate=SAMPLE_RATE,
                       channels=1)

    # Drive the progress bar in 100 ms ticks for the recording window.
    ticks = int(duration * 10)
    for i in range(ticks):
        progress_bar.progress((i + 1) / ticks)
        time.sleep(0.1)

    sd.wait()  # block until the device has actually finished capturing
    progress_bar.empty()  # remove progress bar after recording

    buf = io.BytesIO()
    scipy.io.wavfile.write(buf, SAMPLE_RATE, recording)
    buf.seek(0)  # rewind so callers can read from the start
    return buf

def transcribe_audio(audio_buffer):
    """Transcribe a WAV audio buffer to text via the Whisper API.

    Args:
        audio_buffer: File-like object holding WAV bytes (as returned
            by ``record_audio``).

    Returns:
        str: The transcribed text.
    """
    # The API wants a real file handle with a recognizable extension,
    # so spill the in-memory buffer to a scratch file first.
    with open(TEMP_AUDIO_FILE, "wb") as f:
        f.write(audio_buffer.getvalue())

    try:
        with open(TEMP_AUDIO_FILE, "rb") as audio_file:
            transcript = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file
            )
    finally:
        # The original leaked the scratch file; remove it even on API errors.
        try:
            os.remove(TEMP_AUDIO_FILE)
        except OSError:
            pass
    return transcript.text

def get_ai_response(user_text, context):
    """Ask the chat model to answer *user_text* using only *context*.

    Returns:
        str: The assistant's reply text.
    """
    system_prompt = f"""
    You are Prakhar.
    You must respond **only using the following context**:

    {context}

    If the user's question cannot be answered using this context, respond with:
    "I'm not sure about that based on what I know."
    """

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_text},
    ]
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=conversation,
    )
    return completion.choices[0].message.content

def text_to_speech(text):
    """Synthesize *text* as speech and return the audio base64-encoded.

    Returns:
        str: Base64-encoded audio bytes from the TTS endpoint.
    """
    audio = client.audio.speech.create(model="tts-1", voice="onyx", input=text)
    encoded = base64.b64encode(audio.content)
    return encoded.decode()

def handle_record_button():
    """Button callback: record audio and stash it in session state.

    Sets ``processing`` so the next rerun of ``main`` transcribes and
    answers the new recording.
    """
    st.session_state.processing = True
    status = st.empty()
    status.info("Recording...")
    st.session_state.recorded_audio = record_audio()
    status.empty()

def main():
    """Render the two-column Voice Bot UI and run the processing pipeline."""
    st.title("Voice Bot")
    
    # Lazy one-time initialization; Streamlit reruns main() on every interaction.
    if 'context' not in st.session_state:
        st.session_state.context = load_context()
    if 'processing' not in st.session_state:
        st.session_state.processing = False

    with st.container():

        audio, script = st.columns(2, border=True)

        with audio:
            st.subheader("Audio Input")
            # Callback records synchronously and sets processing=True,
            # so the pipeline below runs on the rerun it triggers.
            st.button("๐ŸŽ™๏ธ Record Voice", on_click=handle_record_button)
            
            # Create placeholder for processing status
            process_placeholder = st.empty()
            
            # Handle processing if recording just completed:
            # transcribe -> chat completion -> TTS, then clear the flag.
            if st.session_state.processing:
                with process_placeholder.container():
                    with st.spinner("Processing..."):
                        st.session_state.user_text = transcribe_audio(st.session_state.recorded_audio)
                        st.session_state.ai_reply = get_ai_response(st.session_state.user_text, st.session_state.context)
                        audio_b64 = text_to_speech(st.session_state.ai_reply)
                        st.session_state.ai_audio = audio_b64
                        st.session_state.processing = False
            
            # Display recorded audio if exists; the AI reply audio is a
            # base64 data URI produced by text_to_speech().
            if st.session_state.recorded_audio is not None:
                st.audio(st.session_state.recorded_audio, format="audio/wav")
                if hasattr(st.session_state, 'ai_audio'):
                    st.audio(f"data:audio/mp3;base64,{st.session_state.ai_audio}", format="audio/mp3")

        with script:
            st.subheader("Conversation")
            if st.session_state.user_text is not None:
                st.markdown("**You said:**")
                st.markdown(f"{st.session_state.user_text}")
                st.markdown("**AI Response:**")
                st.markdown(f"{st.session_state.ai_reply}")
            
            st.divider()

    # NOTE(review): the text area is editable (disabled=False) but edits are
    # never written back — only context.txt changes take effect. Confirm intent.
    with st.container(border=True):
        st.text_area("Context", value=st.session_state.context, height=270, disabled=False)
        st.markdown("You can update the context in the `context.txt` file.")

# Script entry point: run the Streamlit app.
if __name__ == "__main__":
    main()