# Import required libraries for image processing, deep learning, and visualization
import cv2 # OpenCV for image processing
import torch # PyTorch deep learning framework
import numpy as np # NumPy for numerical operations
from transformers import DPTImageProcessor # Hugging Face image processor for depth estimation
import gradio as gr # Gradio for creating web interfaces
import matplotlib
matplotlib.use("Agg") # Non-interactive backend so figures render off-screen (needed on headless servers)
import matplotlib.pyplot as plt # Matplotlib for plotting
from mpl_toolkits.mplot3d import Axes3D # 3D plotting tools
import torch.nn as nn # Neural network modules from PyTorch
# Set up device - will use GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Define my compressed student model architecture for depth estimation
class CompressedStudentModel(nn.Module):
    def __init__(self):
        # Initialize parent class
        super(CompressedStudentModel, self).__init__()
        # Define encoder network that extracts features from input image
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),  # First conv layer: RGB -> 64 channels
            nn.ReLU(),  # Activation function
            nn.Conv2d(64, 64, kernel_size=3, padding=1),  # Second conv: 64 -> 64 channels
            nn.ReLU(),
            nn.MaxPool2d(2),  # Reduce spatial dimensions by 2
            nn.Conv2d(64, 128, kernel_size=3, padding=1),  # Third conv: 64 -> 128 channels
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),  # Fourth conv: 128 -> 128 channels
            nn.ReLU(),
            nn.MaxPool2d(2),  # Further reduce spatial dimensions
            nn.Conv2d(128, 256, kernel_size=3, padding=1),  # Fifth conv: 128 -> 256 channels
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),  # Sixth conv: 256 -> 256 channels
            nn.ReLU(),
        )
        # Define decoder network that upsamples features back to original resolution
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(256, 128, kernel_size=3, stride=2, padding=1, output_padding=1),  # First upsample: 256 -> 128 channels
            nn.ReLU(),
            nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1),  # Second upsample: 128 -> 64 channels
            nn.ReLU(),
            nn.Conv2d(64, 1, kernel_size=3, padding=1),  # Final conv: 64 -> 1 channel depth map
        )

    def forward(self, x):
        # Pass input through encoder to get features
        features = self.encoder(x)
        # Pass features through decoder to get depth map
        depth = self.decoder(features)
        return depth
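# A minimal shape sanity check (a sketch, assuming the 200x200 input used below):
# the two MaxPool2d(2) layers halve the resolution twice (200 -> 100 -> 50) and the
# two stride-2 transposed convolutions restore it (50 -> 100 -> 200), so the
# predicted depth map matches the input resolution.
with torch.no_grad():
    assert CompressedStudentModel()(torch.zeros(1, 3, 200, 200)).shape == (1, 1, 200, 200)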
# Load my trained model and prepare it for inference
model = CompressedStudentModel().to(device) # Create model instance and move to device
model.load_state_dict(torch.load("huntrezz_depth_v2.pt", map_location=device)) # Load trained weights
model.eval() # Set model to evaluation mode
# Initialize the image processor from Hugging Face
# (loaded for reference; the manual preprocess_image below is what this app actually uses)
processor = DPTImageProcessor.from_pretrained("Intel/dpt-swinv2-tiny-256")
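# If you wanted the DPT-style preprocessing instead of the manual pipeline below
# (an alternative sketch, not what this app does), it would look roughly like:
#   inputs = processor(images=image, return_tensors="pt")
#   pixel_values = inputs["pixel_values"].to(device)  # resized + normalized batch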
def preprocess_image(image):
    # Resize image to 200x200 for consistent processing
    image = cv2.resize(image, (200, 200))
    # Convert image to PyTorch tensor and move to device
    image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(0).float().to(device)
    # Normalize pixel values to [0,1] range
    return image / 255.0
def plot_depth_map(depth_map, original_image):
    # Create new figure with specific size (original_image is currently unused)
    fig = plt.figure(figsize=(16, 9))
    # Add 3D subplot
    ax = fig.add_subplot(111, projection='3d')
    # Create coordinate grids for 3D plot
    x, y = np.meshgrid(range(depth_map.shape[1]), range(depth_map.shape[0]))
    # Normalize depth values for coloring
    norm = plt.Normalize(depth_map.min(), depth_map.max())
    colors = plt.cm.viridis(norm(depth_map))
    # Create 3D surface plot
    ax.plot_surface(x, y, depth_map, facecolors=colors, shade=False)
    ax.set_zlim(0, 1)  # Set z-axis limits
    # Set viewing angle for better visualization
    ax.view_init(elev=70, azim=90)
    plt.axis('off')  # Hide axes
    # Render the figure, then convert the canvas to a numpy array
    fig.canvas.draw()
    img = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    img = img.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close(fig)  # Close the figure only after reading its buffer, to free memory
    return img
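# Note: FigureCanvasAgg.tostring_rgb is deprecated since Matplotlib 3.8 and removed
# in later releases; on newer Matplotlib the equivalent conversion would be roughly:
#   img = np.asarray(fig.canvas.buffer_rgba())[..., :3].copy()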
@torch.inference_mode()  # Disable gradient computation for inference
def process_frame(image):
    # Check if image is valid
    if image is None:
        return None
    # Preprocess input image
    preprocessed = preprocess_image(image)
    # Get depth prediction from model
    predicted_depth = model(preprocessed).squeeze().cpu().numpy()
    # Normalize depth values to [0,1] range (epsilon guards against a constant depth map)
    depth_range = predicted_depth.max() - predicted_depth.min()
    depth_map = (predicted_depth - predicted_depth.min()) / (depth_range + 1e-8)
    # Convert BGR to RGB if needed
    if image.shape[2] == 3:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Create and return 3D visualization
    return plot_depth_map(depth_map, image)
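# Standalone usage sketch (assumes a hypothetical test image "frame.jpg" on disk):
#   frame = cv2.imread("frame.jpg")   # BGR uint8 array from OpenCV
#   rendering = process_frame(frame)  # (H, W, 3) uint8 rendering of the 3D surface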
# Create Gradio interface for webcam input
interface = gr.Interface(
    fn=process_frame,  # Processing function
    inputs=gr.Image(sources=["webcam"], streaming=True),  # Webcam input (sources expects a list)
    outputs="image",  # Image output
    live=True  # Enable live updates
)
# Launch the interface
interface.launch()