import os
import tempfile
import time
import traceback
from pathlib import Path

import gradio as gr
from groq import Groq

# Read the secret from HF Spaces. Support both "groq_api_key" and "GROQ_API_KEY".
def _load_key() -> str:
    key = os.environ.get("GROQ_API_KEY") or os.environ.get("groq_api_key")
    if not key:
        raise RuntimeError(
            "Groq API key not found. In your Space settings -> Secrets, add 'groq_api_key'."
        )
    os.environ["GROQ_API_KEY"] = key
    return key

client = Groq(api_key=_load_key())
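
# Note: outside Spaces (e.g. local testing), the same code works if you export
# the key first, for example:
#   export GROQ_API_KEY=...   # in the shell, before `python app.py`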


def transcribe_audio(audio_path: str, model: str = "whisper-large-v3") -> str:
    """Transcribe an audio file with Groq's Whisper endpoint and return the text."""
    if not audio_path or not Path(audio_path).exists():
        raise FileNotFoundError("Audio file path is missing or not found.")
    with open(audio_path, "rb") as f:
        resp = client.audio.transcriptions.create(
            file=(Path(audio_path).name, f.read()),
            model=model,
            response_format="verbose_json",
        )
    return (getattr(resp, "text", "") or "").strip()
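
# Illustrative usage (assumes a local file "clip.wav"; not executed by the app):
#   text = transcribe_audio("clip.wav")
# The Groq SDK accepts the upload as a (filename, bytes) tuple, as above.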


def stream_answer(prompt_text: str,
                  model: str = "llama-3.1-8b-instant",
                  temperature: float = 0.3):
    """Stream an LLM answer, yielding the accumulated text after each chunk."""
    if not prompt_text.strip():
        raise ValueError("Empty prompt for the LLM.")
    stream = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "Answer clearly and concisely."},
            {"role": "user", "content": prompt_text},
        ],
        temperature=temperature,
        max_completion_tokens=1024,
        top_p=1,
        stream=True,
    )
    acc = []
    for chunk in stream:
        # Some chunks (e.g. the final one) may carry no choices or no content.
        delta = ""
        if chunk.choices and chunk.choices[0].delta.content:
            delta = chunk.choices[0].delta.content
        if delta:
            acc.append(delta)
            yield "".join(acc)
    yield "".join(acc)


def text_to_speech(text: str,
                   voice: str = "Calum-PlayAI",
                   model: str = "playai-tts",
                   fmt: str = "wav") -> str:
    """Synthesize speech for `text` and return the path of the audio file written."""
    if not text.strip():
        raise ValueError("Empty text for TTS.")
    # Truncate long answers so the synthesized clip stays short.
    tts_input = text[:1200]
    resp = client.audio.speech.create(
        model=model,
        voice=voice,
        response_format=fmt,
        input=tts_input,
    )
    out_path = os.path.join(tempfile.gettempdir(), f"answer_{int(time.time())}.{fmt}")
    # The Groq SDK returns a BinaryAPIResponse, which provides write_to_file.
    resp.write_to_file(out_path)
    return out_path
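
# Illustrative usage (voice and text are examples; not run by the app):
#   wav_path = text_to_speech("Hello from the demo.", voice="Calum-PlayAI")
#   # wav_path points at a temp .wav file that the UI can play back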


def run_pipeline(audio_file, typed_question, llm_model, voice_name):
    """Drive the full flow; each yield updates (transcript, answer, audio, status)."""
    transcript = ""
    answer = ""
    try:
        if typed_question and typed_question.strip():
            transcript = typed_question.strip()
            status = "Using typed question."
        else:
            if not audio_file:
                raise RuntimeError("Provide a recording or type a question.")
            status = "Transcribing audio..."
            yield transcript, answer, None, status
            transcript = transcribe_audio(audio_file)
            if not transcript:
                raise RuntimeError("No text returned by transcription.")
            status = "Transcription done."
            yield transcript, answer, None, status
        status = "Generating answer..."
        for partial in stream_answer(transcript, model=llm_model):
            answer = partial
            yield transcript, answer, None, status
        if not answer.strip():
            raise RuntimeError("No text returned by the LLM.")
        status = "Converting answer to speech..."
        yield transcript, answer, None, status
        audio_out = text_to_speech(answer, voice=voice_name)
        status = "Done."
        yield transcript, answer, audio_out, status
    except Exception as e:
        err = "Error: " + str(e)
        short_tb = "\n".join(traceback.format_exc().splitlines()[-6:])
        help_tip = (
            "\nTips:\n"
            "- Check the Space secret 'groq_api_key'.\n"
            "- Try a shorter audio clip.\n"
            "- Verify the model names.\n"
            "- Confirm the requirements are installed."
        )
        yield transcript, answer, None, err + "\n" + short_tb + help_tip
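
# Gradio treats generator functions as streaming handlers: each yield pushes an
# update to the bound output components, which is what makes the answer appear
# incrementally in the UI below.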


with gr.Blocks(title="Audio Q&A with Groq") as demo:
    gr.Markdown("# Audio Q&A with Groq")
    gr.Markdown("One audio or typed question in, one answer out, plus speech.")
    with gr.Row():
        audio_in = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Question audio",
        )
        typed_in = gr.Textbox(label="Or type your question", placeholder="Optional")
    with gr.Row():
        # Model IDs rotate over time; check the Groq console if one has been retired.
        llm_model = gr.Dropdown(
            choices=[
                "llama-3.1-8b-instant",
                "llama-3.1-70b-versatile",
                "llama3-8b-8192",
            ],
            value="llama-3.1-8b-instant",
            label="LLM model",
        )
        voice_name = gr.Textbox(value="Calum-PlayAI", label="TTS voice")
    ask_btn = gr.Button("Run")
    clear_btn = gr.Button("Clear")
    transcript_box = gr.Textbox(label="Transcription", interactive=False, lines=4)
    answer_box = gr.Textbox(label="Answer", interactive=False, lines=10)
    answer_audio = gr.Audio(label="Answer speech", interactive=False)
    status_md = gr.Markdown("")

    ask_btn.click(
        fn=run_pipeline,
        inputs=[audio_in, typed_in, llm_model, voice_name],
        outputs=[transcript_box, answer_box, answer_audio, status_md],
    )

    def clear_all():
        return "", "", None, ""

    clear_btn.click(
        fn=clear_all,
        inputs=None,
        outputs=[transcript_box, answer_box, answer_audio, status_md],
    )


if __name__ == "__main__":
    # On HF Spaces you can simply call demo.launch();
    # queue() enables generator streaming without extra args in Gradio v4.
    demo.queue().launch()
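
# A minimal requirements.txt for this Space would likely look like the following
# (package names from the imports above; pins are assumptions, adjust as needed):
#   gradio>=4.0
#   groq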