import os
import io
import streamlit as st
from groq import Groq
import soundfile as sf
import google.generativeai as genai

# Streamlit page configuration (must be the first Streamlit call in the script)
st.set_page_config(
    page_title="Gemini Chatbot mit Spracheingabe",
    page_icon="🤖"
)

# Load the API keys from environment variables
groq_api_key = os.getenv('groqwhisper')
gemini_api_key = os.getenv('geminiapi')

if not groq_api_key or not gemini_api_key:
    st.error("Bitte setze die Umgebungsvariablen 'groqwhisper' und 'geminiapi'")
    st.stop()

# Initialize the Groq client (used only for Whisper speech-to-text)
groq_client = Groq(api_key=groq_api_key)

# Configure Gemini
genai.configure(api_key=gemini_api_key)

# Model configuration for Gemini
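# temperature 0.4 keeps answers fairly deterministic; top_p/top_k restrict sampling
# to the most likely tokens; max_output_tokens caps the length of a single reply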
generation_config = {
    "temperature": 0.4,
    "top_p": 0.95,
    "top_k": 40,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
    model_name="gemini-2.0-flash-exp",
    generation_config=generation_config,
)
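# Note: "gemini-2.0-flash-exp" is an experimental model name; if it becomes
# unavailable, substitute a stable Gemini model here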
# Session state: chat_session holds the Gemini conversation context,
# display_history only mirrors it for rendering the chat transcript
if "chat_session" not in st.session_state:
    st.session_state.chat_session = model.start_chat(history=[])
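# The ChatSession keeps the running conversation, so each send_message call
# passes the previous turns to Gemini as context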

if "display_history" not in st.session_state:
    st.session_state.display_history = []

def process_audio(audio_data):
    """Verarbeitet Audiodaten und gibt Transkript zurück."""
    try:
        sample_rate, samples = audio_data
        
        # Build the WAV file in memory
        with io.BytesIO() as wav_buffer:
            sf.write(wav_buffer, samples, sample_rate, format='WAV')
            wav_buffer.seek(0)
            
            # Transcribe with Groq Whisper; the file argument is a
            # (filename, bytes, mimetype) tuple
            transcription = groq_client.audio.transcriptions.create(
                file=("recording.wav", wav_buffer.read(), "audio/wav"),
                model="whisper-large-v3-turbo",
                prompt="transcribe",
                language="de",
                response_format="json",
                temperature=0.0
            )
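        # With response_format="json", the result object exposes the transcript as .text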
        return transcription.text
    except Exception as e:
        return f"Fehler: {str(e)}"

# Streamlit UI (page config is set once at the top of the script)
st.title("Gemini Chatbot 🎤+📝")

# Render the chat history
for role, text in st.session_state.display_history:
    with st.chat_message(role):
        st.markdown(text)

# Handle voice input; remember the last processed recording so that
# st.rerun() does not transcribe and answer the same audio again in a loop
audio_bytes = st.audio_input("Sprachnachricht aufnehmen")
if audio_bytes:
    audio_content = audio_bytes.getvalue()
    audio_id = hash(audio_content)
    if st.session_state.get("last_audio_id") != audio_id:
        st.session_state.last_audio_id = audio_id
        try:
            with io.BytesIO(audio_content) as wav_io:
                samples, sample_rate = sf.read(wav_io)
                # Downmix stereo recordings to mono
                if len(samples.shape) > 1 and samples.shape[1] == 2:
                    samples = samples.mean(axis=1)

            with st.spinner("Transkription..."):
                transcription = process_audio((sample_rate, samples))

            if transcription:
                if transcription.startswith("Fehler:"):
                    st.error(transcription)
                else:
                    st.session_state.display_history.append(("user", transcription))
                    full_prompt = f"{transcription}\nAntworte immer auf Deutsch"
                    response = st.session_state.chat_session.send_message(full_prompt)
                    response_text = response.candidates[0].content.parts[0].text if response.candidates else "Keine Antwort"
                    st.session_state.display_history.append(("assistant", response_text))
                    st.rerun()
        except Exception as e:
            st.error(f"Audioprocessing fehlgeschlagen: {str(e)}")

# Handle text input (st.chat_input returns the text only once per submission,
# so the message is not re-sent after st.rerun)
user_input = st.chat_input("Schreibe deine Frage:")
if user_input:
    st.session_state.display_history.append(("user", user_input))
    full_prompt = f"{user_input}\nAntworte immer auf Deutsch"
    response = st.session_state.chat_session.send_message(full_prompt)
    response_text = response.candidates[0].content.parts[0].text if response.candidates else "Keine Antwort"
    st.session_state.display_history.append(("assistant", response_text))
    st.rerun()