from pathlib import Path

import gradio as gr
import numpy as np
import open3d as o3d
import torch
from diffusers import StableDiffusionPipeline
from PIL import Image
# DPTImageProcessor is the current name for the deprecated DPTFeatureExtractor.
from transformers import DPTForDepthEstimation, DPTImageProcessor

# CPU-friendly settings: float32 is the safe dtype on CPU (float16 kernels
# are unreliable outside CUDA).
device = "cpu"
torch_dtype = torch.float32

# Text-to-image model: Stable Diffusion 2.1 base (512x512 outputs).
text_to_image_pipeline = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-base",
    torch_dtype=torch_dtype,
)
text_to_image_pipeline.to(device)

# Monocular depth estimation: DPT-Large predicts relative (inverse) depth.
image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")
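

# Depth map -> point cloud: a minimal sketch of the 3D step implied by the
# otherwise-unused open3d import. It back-projects the depth map into a
# colored point cloud and writes it as a .ply file. Assumptions to flag: the
# pinhole intrinsics below (focal length ~ image width, principal point at the
# center) are made up rather than calibrated, and DPT predicts *relative
# inverse* depth, so this is a rough visualization, not metric geometry.
def depth_to_point_cloud(image, depth_image, output_path="model.ply"):
    width, height = image.size
    color = o3d.geometry.Image(np.array(image))
    # Open3D wants float32 (or uint16) depth; the 0-255 values are rescaled
    # into the 0-1 range via depth_scale below.
    depth = o3d.geometry.Image(np.array(depth_image).astype(np.float32))
    rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth(
        color,
        depth,
        depth_scale=255.0,
        depth_trunc=2.0,
        convert_rgb_to_intensity=False,
    )
    intrinsic = o3d.camera.PinholeCameraIntrinsic(
        width, height, width, width, width / 2, height / 2  # fx, fy, cx, cy (assumed)
    )
    pcd = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, intrinsic)
    # A fixed output name keeps the demo simple; a temp file would be safer
    # for concurrent users.
    o3d.io.write_point_cloud(str(Path(output_path)), pcd)
    return output_path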


def generate_3d_from_text(prompt):
    # 1) Text -> image.
    generated_image = text_to_image_pipeline(prompt).images[0]

    # 2) Image -> relative depth.
    encoding = image_processor(images=generated_image, return_tensors="pt")
    with torch.no_grad():
        outputs = depth_model(**encoding)
    predicted_depth = outputs.predicted_depth

    # Upsample the coarse prediction to the image resolution. PIL's .size is
    # (width, height); interpolate expects (height, width), hence the [::-1].
    prediction = torch.nn.functional.interpolate(
        predicted_depth.unsqueeze(1),
        size=generated_image.size[::-1],
        mode="bicubic",
        align_corners=False,
    ).squeeze()

    # Normalize to 8-bit grayscale for display.
    depth = prediction.cpu().numpy()
    depth_image = (depth * 255 / depth.max()).astype("uint8")
    depth_image_pil = Image.fromarray(depth_image)

    # 3) Depth -> point cloud (see the sketch above).
    model_path = depth_to_point_cloud(generated_image, depth_image_pil)

    return generated_image, depth_image_pil, model_path


title = "3D Model Generation from Text (CPU-friendly)"
description = (
    "Generate an image, its estimated depth map, and an approximate point "
    "cloud from a text description, using a lightweight text-to-image model "
    "and monocular depth estimation."
)

iface = gr.Interface(
    fn=generate_3d_from_text,
    inputs=gr.Textbox(
        label="Enter text description",
        placeholder="Describe your scene (e.g., 'A Roman soldier in armor')",
    ),
    outputs=[
        gr.Image(label="Generated Image"),
        gr.Image(label="Depth Map"),
        # Model3D accepts .ply files in recent Gradio releases; on older
        # versions, swap this for gr.File to serve the point cloud as a download.
        gr.Model3D(label="Point Cloud (.ply)"),
    ],
    title=title,
    description=description,
)
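
# Usage note: `python app.py` serves the demo (by default at
# http://127.0.0.1:7860). The first run downloads several GB of model weights,
# and CPU-only Stable Diffusion typically takes minutes per image.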

iface.launch()