# Voice-to-Image Generator (Hugging Face Space)
# Pipeline: Whisper transcription -> Groq LLM prompt expansion -> Stable Diffusion.
import os

import torch
import whisper
import gradio as gr
from groq import Groq
from diffusers import StableDiffusionPipeline

# Load the Whisper speech-to-text model ("base" trades accuracy for speed).
whisper_model = whisper.load_model("base")

# SECURITY: never hard-code API keys in source (the previous revision leaked a
# live key — it must be rotated). Read the key from the environment instead,
# and fail fast with an actionable message when it is missing.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError("Set the GROQ_API_KEY environment variable before running this app.")
client = Groq(api_key=GROQ_API_KEY)

# Load the Stable Diffusion pipeline on the GPU when one is available,
# otherwise fall back to CPU (slow but functional).
device = "cuda" if torch.cuda.is_available() else "cpu"
stable_diffusion_model = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5"
).to(device)
# Voice-to-image pipeline: audio file -> transcript -> LLM prompt -> image.
def voice_to_image(audio):
    """Turn a recorded voice prompt into a generated image.

    Pipeline: Whisper transcribes the audio file, the transcript is sent to
    a Groq-hosted LLM whose response is used as the image prompt, and Stable
    Diffusion renders that prompt.

    Args:
        audio: Filesystem path to the recorded audio clip. Gradio's
            ``gr.Audio(type="filepath")`` input passes a path string, or
            ``None`` when the user submits without recording anything.

    Returns:
        The first image (``PIL.Image.Image``) produced by the Stable
        Diffusion pipeline.

    Raises:
        ValueError: If no audio was provided.
    """
    # Guard: Gradio hands us None on an empty submission; fail with a clear
    # message instead of letting Whisper crash on a missing file.
    if audio is None:
        raise ValueError("No audio provided — please record or upload a clip.")

    # Step 1: transcribe the audio to text using Whisper.
    transcription = whisper_model.transcribe(audio)
    input_text = transcription["text"]

    # Step 2: ask the Groq LLM to turn the transcript into a richer prompt.
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "user", "content": input_text},
        ],
        model="llama3-8b-8192",
        stream=False,
    )
    response_text = chat_completion.choices[0].message.content

    # Step 3: render the LLM's response as an image with Stable Diffusion.
    image = stable_diffusion_model(response_text).images[0]
    return image
# Gradio Interface: record/upload audio in, generated image out.
interface = gr.Interface(
    fn=voice_to_image,
    inputs=gr.Audio(type="filepath"),
    outputs="image",
    title="Voice-to-Image Generator",
    description="Transcribe voice input into an image using Whisper, Groq LLM, and Stable Diffusion.",
)

# Launch only when executed as a script, so the module can be imported
# (e.g. by tests or another app) without starting a web server.
if __name__ == "__main__":
    interface.launch()