import gradio as gr
from transformers import DPTFeatureExtractor, DPTForDepthEstimation
from diffusers import StableDiffusionPipeline
import torch
import numpy as np
from PIL import Image
import open3d as o3d
# Initialize the models for a CPU-only environment
device = "cpu"
torch_dtype = torch.float32
# Use a lighter text-to-image model that is practical to run on CPU
text_to_image_pipeline = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-base",
    torch_dtype=torch_dtype,
)
text_to_image_pipeline.to(device)
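# Optional CPU-friendly tweak (an addition, not in the original commit):
# attention slicing trades a little speed for lower peak memory, which
# helps on small CPU-only hosts.
text_to_image_pipeline.enable_attention_slicing()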
# Load the DPT depth estimation model and its feature extractor
feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large")
depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")
def generate_3d_from_text(prompt):
    # Step 1: generate an image from the text prompt
    generated_image = text_to_image_pipeline(prompt).images[0]

    # Step 2: estimate depth from the generated image
    encoding = feature_extractor(generated_image, return_tensors="pt")
    with torch.no_grad():
        outputs = depth_model(**encoding)
        predicted_depth = outputs.predicted_depth

    # Resize the depth map to the generated image's size
    # (PIL's .size is (width, height); interpolate expects (height, width))
    prediction = torch.nn.functional.interpolate(
        predicted_depth.unsqueeze(1),
        size=generated_image.size[::-1],
        mode="bicubic",
        align_corners=False,
    ).squeeze()

    # Normalize the depth values to 0-255 and render as an 8-bit image
    depth_array = prediction.cpu().numpy()
    depth_image = (depth_array * 255 / np.max(depth_array)).astype("uint8")
    depth_image_pil = Image.fromarray(depth_image)

    return generated_image, depth_image_pil
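
# A minimal sketch (an addition, not from the original commit) of the 3D step
# that the otherwise-unused open3d import suggests: back-project the depth map
# into a colored point cloud. DPT predicts relative depth, so the geometry is
# only correct up to scale, and the pinhole intrinsics below are assumed
# placeholder values. This hypothetical helper is not wired into the UI.
def depth_to_point_cloud(rgb_image, depth_image, focal_length=500.0):
    # Wrap the PIL images as open3d images (depth as float32)
    color = o3d.geometry.Image(np.asarray(rgb_image.convert("RGB")))
    depth = o3d.geometry.Image(np.asarray(depth_image, dtype=np.float32))
    # Keep all depth values: they are normalized to 0-255, so open3d's
    # default 3.0 truncation threshold would discard nearly everything
    rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth(
        color, depth, depth_scale=1.0, depth_trunc=1000.0,
        convert_rgb_to_intensity=False,
    )
    width, height = rgb_image.size
    intrinsics = o3d.camera.PinholeCameraIntrinsic(
        width, height, focal_length, focal_length, width / 2, height / 2
    )
    pcd = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, intrinsics)
    # Usage: pcd = depth_to_point_cloud(image, depth_map)
    #        o3d.io.write_point_cloud("scene.ply", pcd)
    return pcd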
# Gradio interface
title = "3D Model Generation from Text (CPU-friendly)"
description = (
    "Generate an image and its depth map, the first steps toward a 3D model, "
    "from a text description using a lightweight text-to-image pipeline and "
    "DPT depth estimation."
)
iface = gr.Interface(
    fn=generate_3d_from_text,
    inputs=gr.Textbox(
        label="Enter text description",
        placeholder="Describe your scene (e.g., 'A Roman soldier in armor')",
    ),
    outputs=[
        gr.Image(label="Generated Image"),
        gr.Image(label="Depth Map"),
    ],
    title=title,
    description=description,
)
iface.launch()