from pathlib import Path

import gradio as gr
import numpy as np
import open3d as o3d
import torch
from diffusers import StableDiffusionPipeline
from PIL import Image
# DPTImageProcessor is the current name for the deprecated DPTFeatureExtractor.
from transformers import DPTForDepthEstimation, DPTImageProcessor

# CPU-friendly settings: float32 is the safe dtype on CPU (float16 kernels
# are unreliable outside CUDA).
device = "cpu"
torch_dtype = torch.float32

# Text-to-image model: Stable Diffusion 2.1 base (512x512 outputs).
text_to_image_pipeline = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-base",
    torch_dtype=torch_dtype,
)
text_to_image_pipeline.to(device)

# Monocular depth estimation: DPT-Large predicts relative (inverse) depth.
image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")
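

# Depth map -> point cloud: a minimal sketch of the 3D step implied by the
# otherwise-unused open3d import. It back-projects the depth map into a
# colored point cloud and writes it as a .ply file. Assumptions to flag: the
# pinhole intrinsics below (focal length ~ image width, principal point at the
# center) are made up rather than calibrated, and DPT predicts *relative
# inverse* depth, so this is a rough visualization, not metric geometry.
def depth_to_point_cloud(image, depth_image, output_path="model.ply"):
    width, height = image.size
    color = o3d.geometry.Image(np.array(image))
    # Open3D wants float32 (or uint16) depth; the 0-255 values are rescaled
    # into the 0-1 range via depth_scale below.
    depth = o3d.geometry.Image(np.array(depth_image).astype(np.float32))
    rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth(
        color,
        depth,
        depth_scale=255.0,
        depth_trunc=2.0,
        convert_rgb_to_intensity=False,
    )
    intrinsic = o3d.camera.PinholeCameraIntrinsic(
        width, height, width, width, width / 2, height / 2  # fx, fy, cx, cy (assumed)
    )
    pcd = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, intrinsic)
    # A fixed output name keeps the demo simple; a temp file would be safer
    # for concurrent users.
    o3d.io.write_point_cloud(str(Path(output_path)), pcd)
    return output_path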


def generate_3d_from_text(prompt):
    # 1) Text -> image.
    generated_image = text_to_image_pipeline(prompt).images[0]

    # 2) Image -> relative depth.
    encoding = image_processor(images=generated_image, return_tensors="pt")
    with torch.no_grad():
        outputs = depth_model(**encoding)
    predicted_depth = outputs.predicted_depth

    # Upsample the coarse prediction to the image resolution. PIL's .size is
    # (width, height); interpolate expects (height, width), hence the [::-1].
    prediction = torch.nn.functional.interpolate(
        predicted_depth.unsqueeze(1),
        size=generated_image.size[::-1],
        mode="bicubic",
        align_corners=False,
    ).squeeze()

    # Normalize to 8-bit grayscale for display.
    depth = prediction.cpu().numpy()
    depth_image = (depth * 255 / depth.max()).astype("uint8")
    depth_image_pil = Image.fromarray(depth_image)

    # 3) Depth -> point cloud (see the sketch above).
    model_path = depth_to_point_cloud(generated_image, depth_image_pil)

    return generated_image, depth_image_pil, model_path


title = "3D Model Generation from Text (CPU-friendly)"
description = (
    "Generate an image, its estimated depth map, and an approximate point "
    "cloud from a text description, using a lightweight text-to-image model "
    "and monocular depth estimation."
)

iface = gr.Interface(
    fn=generate_3d_from_text,
    inputs=gr.Textbox(
        label="Enter text description",
        placeholder="Describe your scene (e.g., 'A Roman soldier in armor')",
    ),
    outputs=[
        gr.Image(label="Generated Image"),
        gr.Image(label="Depth Map"),
        # Model3D accepts .ply files in recent Gradio releases; on older
        # versions, swap this for gr.File to serve the point cloud as a download.
        gr.Model3D(label="Point Cloud (.ply)"),
    ],
    title=title,
    description=description,
)
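
# Usage note: `python app.py` serves the demo (by default at
# http://127.0.0.1:7860). The first run downloads several GB of model weights,
# and CPU-only Stable Diffusion typically takes minutes per image.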

iface.launch()