Spaces:

danube2024
/

text-to-image-depth-map

Runtime error

File size: 2,091 Bytes

2b2a90e
dff845d
 
 
 
2b2a90e
dff845d
 
2b2a90e
dff845d
 
 
2b2a90e
dff845d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b2a90e
 
dff845d
 
 
 
 
 
 
 
2b2a90e

import gradio as gr
from transformers import DPTFeatureExtractor, DPTForDepthEstimation
from diffusers import StableDiffusionPipeline
import torch
import numpy as np
from PIL import Image
import open3d as o3d
from pathlib import Path

# Runtime configuration: everything in this app runs on the CPU in float32.
device = "cpu"
torch_dtype = torch.float32

# Hugging Face Hub checkpoints used by the app.
_TEXT_TO_IMAGE_CHECKPOINT = "stabilityai/stable-diffusion-2-1-base"
_DEPTH_CHECKPOINT = "Intel/dpt-large"

# Text-to-image stage: load the Stable Diffusion pipeline and place it on
# the configured device (``.to`` hands back the pipeline itself).
text_to_image_pipeline = StableDiffusionPipeline.from_pretrained(
    _TEXT_TO_IMAGE_CHECKPOINT,
    torch_dtype=torch_dtype,
).to(device)

# Depth stage: the feature extractor prepares the image for the DPT model,
# which then predicts a depth map from the prepared tensors.
feature_extractor = DPTFeatureExtractor.from_pretrained(_DEPTH_CHECKPOINT)
depth_model = DPTForDepthEstimation.from_pretrained(_DEPTH_CHECKPOINT)

def _depth_to_uint8(depth):
    """Scale a depth array to 8-bit grayscale with its maximum mapped to 255.

    Args:
        depth: Array-like of depth values (any shape).

    Returns:
        A ``uint8`` NumPy array of the same shape. Values are
        ``depth * 255 / max(depth)`` clipped to ``[0, 255]``. If the maximum
        is zero, negative, or non-finite (NaN/inf anywhere in the input makes
        it so), an all-zero array is returned instead of dividing by it.
    """
    depth = np.asarray(depth, dtype=np.float32)
    peak = float(depth.max()) if depth.size else 0.0
    if not np.isfinite(peak) or peak <= 0.0:
        # Nothing sensible to normalise against; avoid NaN/inf -> uint8 casts.
        return np.zeros(depth.shape, dtype=np.uint8)

    scaled = depth * 255 / peak
    # Bicubic resampling can overshoot below 0; without clipping, the uint8
    # cast would wrap those values around and show up as bright speckles.
    return np.clip(scaled, 0.0, 255.0).astype(np.uint8)


def generate_3d_from_text(prompt):
    """Generate an image from a text prompt and estimate its depth map.

    Args:
        prompt: Text description passed to the text-to-image pipeline.

    Returns:
        A ``(generated_image, depth_image_pil)`` tuple: the first image
        produced by the pipeline, and a PIL image built from the 8-bit depth
        map resized to the generated image's dimensions.
    """
    # Step 1: Generate Image from Text Prompt
    generated_image = text_to_image_pipeline(prompt).images[0]

    # Step 2: Estimate Depth from Generated Image (inference only, no grads)
    encoding = feature_extractor(generated_image, return_tensors="pt")
    with torch.no_grad():
        predicted_depth = depth_model(**encoding).predicted_depth

    # Resize depth map to original image size. PIL reports (width, height)
    # while interpolate expects (height, width), hence the reversed size.
    prediction = torch.nn.functional.interpolate(
        predicted_depth.unsqueeze(1),  # add a channel axis for interpolate
        size=generated_image.size[::-1],
        mode="bicubic",
        align_corners=False,
    ).squeeze()

    # Convert once to NumPy, then normalise safely to 8-bit grayscale.
    depth_image = _depth_to_uint8(prediction.cpu().numpy())
    depth_image_pil = Image.fromarray(depth_image)

    return generated_image, depth_image_pil

# Gradio user interface: static text shown at the top of the page.
title = "3D Model Generation from Text (CPU-friendly)"
description = "Generate a 3D model from a text description using a lightweight text-to-image and depth estimation."

# Single text input for the prompt.
_prompt_input = gr.Textbox(
    label="Enter text description",
    placeholder="Describe your scene (e.g., 'A Roman soldier in armor')",
)

# Two image outputs, in the same order as the values returned by
# generate_3d_from_text: the generated image, then its depth map.
_result_outputs = [
    gr.Image(label="Generated Image"),
    gr.Image(label="Depth Map"),
]

iface = gr.Interface(
    fn=generate_3d_from_text,
    inputs=_prompt_input,
    outputs=_result_outputs,
    title=title,
    description=description,
)

# Start the web app.
iface.launch()