# Voice-to-Image Generator (Hugging Face Space)
# Pipeline: Whisper transcription -> Groq LLM prompt expansion -> Stable Diffusion.
import os

import torch
import whisper
import gradio as gr
from groq import Groq
from diffusers import StableDiffusionPipeline

# Load the Whisper speech-to-text model ("base" trades accuracy for speed).
whisper_model = whisper.load_model("base")

# SECURITY: never hard-code API keys in source (the previous revision leaked a
# live key — it must be rotated). Read the key from the environment instead,
# and fail fast with an actionable message when it is missing.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError("Set the GROQ_API_KEY environment variable before running this app.")
client = Groq(api_key=GROQ_API_KEY)

# Load the Stable Diffusion pipeline on the GPU when one is available,
# otherwise fall back to CPU (slow but functional).
device = "cuda" if torch.cuda.is_available() else "cpu"
stable_diffusion_model = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5"
).to(device)
# Voice-to-image pipeline: audio file -> transcript -> LLM prompt -> image.
def voice_to_image(audio):
    """Turn a recorded voice prompt into a generated image.

    Pipeline: Whisper transcribes the audio file, the transcript is sent to
    a Groq-hosted LLM whose response is used as the image prompt, and Stable
    Diffusion renders that prompt.

    Args:
        audio: Filesystem path to the recorded audio clip. Gradio's
            ``gr.Audio(type="filepath")`` input passes a path string, or
            ``None`` when the user submits without recording anything.

    Returns:
        The first image (``PIL.Image.Image``) produced by the Stable
        Diffusion pipeline.

    Raises:
        ValueError: If no audio was provided.
    """
    # Guard: Gradio hands us None on an empty submission; fail with a clear
    # message instead of letting Whisper crash on a missing file.
    if audio is None:
        raise ValueError("No audio provided — please record or upload a clip.")

    # Step 1: transcribe the audio to text using Whisper.
    transcription = whisper_model.transcribe(audio)
    input_text = transcription["text"]

    # Step 2: ask the Groq LLM to turn the transcript into a richer prompt.
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "user", "content": input_text},
        ],
        model="llama3-8b-8192",
        stream=False,
    )
    response_text = chat_completion.choices[0].message.content

    # Step 3: render the LLM's response as an image with Stable Diffusion.
    image = stable_diffusion_model(response_text).images[0]
    return image
# Gradio Interface: record/upload audio in, generated image out.
interface = gr.Interface(
    fn=voice_to_image,
    inputs=gr.Audio(type="filepath"),
    outputs="image",
    title="Voice-to-Image Generator",
    description="Transcribe voice input into an image using Whisper, Groq LLM, and Stable Diffusion.",
)

# Launch only when executed as a script, so the module can be imported
# (e.g. by tests or another app) without starting a web server.
if __name__ == "__main__":
    interface.launch()