# Import required libraries for image processing, deep learning, and visualization
import cv2  # OpenCV for image processing
import torch  # PyTorch deep learning framework
import numpy as np  # NumPy for numerical operations
from transformers import DPTImageProcessor  # Hugging Face image processor for depth estimation
import gradio as gr  # Gradio for creating web interfaces
import matplotlib.pyplot as plt  # Matplotlib for plotting
from mpl_toolkits.mplot3d import Axes3D  # 3D plotting tools
import torch.nn as nn  # Neural network modules from PyTorch


# Set up device - will use GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define my compressed student model architecture for depth estimation
class CompressedStudentModel(nn.Module):
    def __init__(self):
        # Initialize parent class
        super(CompressedStudentModel, self).__init__()
        # Define encoder network that extracts features from input image
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),  # First conv layer: RGB -> 64 channels
            nn.ReLU(),  # Activation function
            nn.Conv2d(64, 64, kernel_size=3, padding=1),  # Second conv: 64 -> 64 channels
            nn.ReLU(),
            nn.MaxPool2d(2),  # Reduce spatial dimensions by 2
            nn.Conv2d(64, 128, kernel_size=3, padding=1),  # Third conv: 64 -> 128 channels
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),  # Fourth conv: 128 -> 128 channels
            nn.ReLU(),
            nn.MaxPool2d(2),  # Further reduce spatial dimensions
            nn.Conv2d(128, 256, kernel_size=3, padding=1),  # Fifth conv: 128 -> 256 channels
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),  # Sixth conv: 256 -> 256 channels
            nn.ReLU(),
        )
        # Define decoder network that upsamples features back to original resolution
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(256, 128, kernel_size=3, stride=2, padding=1, output_padding=1),  # First upsample: 256 -> 128
            nn.ReLU(),
            nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1),  # Second upsample: 128 -> 64
            nn.ReLU(),
            nn.Conv2d(64, 1, kernel_size=3, padding=1),  # Final conv: 64 -> 1 channel depth map
        )

    def forward(self, x):
        # Pass input through encoder to get features
        features = self.encoder(x)
        # Pass features through decoder to get depth map
        depth = self.decoder(features)
        return depth
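
# Illustrative shape check (my addition, not from the original script): the two
# MaxPool2d(2) stages halve 200 -> 100 -> 50, and the two stride-2 transposed
# convs upsample 50 -> 100 -> 200, so a 200x200 RGB frame maps to a 200x200
# single-channel depth map. Uncomment to verify locally:
# with torch.no_grad():
#     _probe = CompressedStudentModel()(torch.randn(1, 3, 200, 200))
#     assert _probe.shape == (1, 1, 200, 200)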

# Load my trained model and prepare it for inference
model = CompressedStudentModel().to(device)  # Create model instance and move to device
model.load_state_dict(torch.load("huntrezz_depth_v2.pt", map_location=device))  # Load trained weights
model.eval()  # Set model to evaluation mode

# Initialize the DPT image processor from Hugging Face (note: the inference
# path below does its own resizing and normalization, so this processor is
# loaded but not used by preprocess_image)
processor = DPTImageProcessor.from_pretrained("Intel/dpt-swinv2-tiny-256")

def preprocess_image(image):
    # Resize image to 200x200 for consistent processing
    image = cv2.resize(image, (200, 200))
    # Convert image to PyTorch tensor and move to device
    image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(0).float().to(device)
    # Normalize pixel values to [0,1] range
    return image / 255.0
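
# Usage sketch (my addition): preprocess_image expects an HxWx3 uint8 RGB array
# and returns a (1, 3, 200, 200) float tensor scaled to [0, 1] on `device`:
# _t = preprocess_image(np.zeros((480, 640, 3), dtype=np.uint8))
# print(_t.shape, _t.dtype)  # torch.Size([1, 3, 200, 200]) torch.float32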

def plot_depth_map(depth_map, original_image):
    # Render the depth map as a 3D surface; original_image is accepted but not
    # currently used in the visualization
    # Create a new figure with a fixed size
    fig = plt.figure(figsize=(16, 9))
    # Add 3D subplot
    ax = fig.add_subplot(111, projection='3d')
    # Create coordinate grids for 3D plot
    x, y = np.meshgrid(range(depth_map.shape[1]), range(depth_map.shape[0]))
    
    # Normalize depth values for coloring
    norm = plt.Normalize(depth_map.min(), depth_map.max())
    colors = plt.cm.viridis(norm(depth_map))
    
    # Create 3D surface plot
    ax.plot_surface(x, y, depth_map, facecolors=colors, shade=False)
    ax.set_zlim(0, 1)  # Set z-axis limits
    
    # Set viewing angle for better visualization
    ax.view_init(elev=70, azim=90) 
    plt.axis('off')  # Hide axes

    # Render the figure, then convert the canvas to a numpy array
    fig.canvas.draw()
    img = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    img = img.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close(fig)  # Close only after reading the pixels, to free memory
    
    return img
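
# Portability note (my addition): newer Matplotlib releases deprecate
# canvas.tostring_rgb() in favor of buffer_rgba(). If the call above breaks,
# this sketch is a drop-in way to rasterize a figure:
def fig_to_rgb_array(fig):
    # Render the figure, read the RGBA buffer, and drop the alpha channel
    fig.canvas.draw()
    rgba = np.asarray(fig.canvas.buffer_rgba())
    return rgba[..., :3].copy()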

@torch.inference_mode()  # Disable gradient computation for inference
def process_frame(image):
    # Check if image is valid
    if image is None:
        return None
    # Preprocess input image
    preprocessed = preprocess_image(image)
    # Get depth prediction from model
    predicted_depth = model(preprocessed).squeeze().cpu().numpy()
    
    # Normalize depth values to [0,1] range (epsilon guards against division
    # by zero on a constant depth map, e.g. a covered lens)
    depth_range = predicted_depth.max() - predicted_depth.min()
    depth_map = (predicted_depth - predicted_depth.min()) / (depth_range + 1e-8)
    
    # Swap channel order; note that a 3-channel shape check cannot actually
    # distinguish BGR from RGB (Gradio's webcam component supplies RGB)
    if image.shape[2] == 3:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
    # Create and return 3D visualization
    return plot_depth_map(depth_map, image)
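
# Offline smoke test (my addition, hypothetical input): runs the full pipeline
# on a random frame without a webcam. Uncomment to confirm preprocessing, the
# model, and the 3D rendering work end to end before launching:
# _test = process_frame(np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8))
# print(None if _test is None else _test.shape)  # expect an (H, W, 3) array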

# Create Gradio interface for webcam input
interface = gr.Interface(
    fn=process_frame,  # Processing function
    inputs=gr.Image(sources=["webcam"], streaming=True),  # Webcam input (Gradio 4 expects a list of sources)
    outputs="image",  # Image output
    live=True  # Enable live updates
)

# Launch the interface
interface.launch()
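
# launch() takes the usual Gradio options if the defaults don't fit your setup;
# illustrative values, adjust as needed:
# interface.launch(server_name="0.0.0.0", server_port=7860, share=False)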