"""
GPU monitoring tab for Video Model Studio UI.
Displays detailed GPU metrics and visualizations.
"""
import logging
import os
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple

import gradio as gr

from vms.utils.base_tab import BaseTab
from vms.ui.monitoring.utils import human_readable_size

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class GPUTab(BaseTab):
    """Tab for GPU-specific monitoring and statistics.

    Renders NVIDIA GPU metrics (utilization, memory, power), per-process
    memory usage, and static device information. Data comes from
    ``self.app.monitoring.gpu`` and is refreshed by a timer or on demand.
    """

    def __init__(self, app_state):
        """Initialize the tab.

        Args:
            app_state: Shared application state; must expose
                ``monitoring.gpu`` (see usages below).
        """
        super().__init__(app_state)
        self.id = "GPU_tab"
        self.title = "GPU Stats"
        self.refresh_interval = 5  # seconds between auto-refresh ticks
        self.selected_gpu = 0      # index of the GPU currently displayed

    def create(self, parent=None) -> gr.TabItem:
        """Create the GPU tab UI components.

        Args:
            parent: Unused; kept for interface compatibility with BaseTab.

        Returns:
            The constructed ``gr.TabItem``.
        """
        with gr.TabItem(self.title, id=self.id) as tab:
            with gr.Row():
                gr.Markdown("## 🖥️ GPU Monitoring")

            # No GPUs available message (hidden by default)
            with gr.Row(visible=not self.app.monitoring.gpu.has_nvidia_gpus):
                with gr.Column():
                    gr.Markdown("### No NVIDIA GPUs detected")
                    gr.Markdown("GPU monitoring is only available for NVIDIA GPUs. If you have NVIDIA GPUs installed, ensure the drivers are properly configured.")

            # GPU content (only visible if GPUs are available)
            with gr.Row(visible=self.app.monitoring.gpu.has_nvidia_gpus):
                # GPU selector, shown only when more than one GPU exists
                if self.app.monitoring.gpu.gpu_count > 1:
                    with gr.Column(scale=1):
                        gpu_options = [f"GPU {i}" for i in range(self.app.monitoring.gpu.gpu_count)]
                        self.components["gpu_selector"] = gr.Dropdown(
                            choices=gpu_options,
                            value=gpu_options[0] if gpu_options else None,
                            label="Select GPU",
                            interactive=True
                        )

                # Current metrics summary
                with gr.Column(scale=3):
                    self.components["current_metrics"] = gr.Markdown("Loading GPU metrics...")

            # Display GPU metrics in tabs (one plot per metric family)
            with gr.Tabs(visible=self.app.monitoring.gpu.has_nvidia_gpus):
                with gr.Tab(label="Utilization"):
                    self.components["utilization_plot"] = gr.Plot()
                with gr.Tab(label="Memory"):
                    self.components["memory_plot"] = gr.Plot()
                with gr.Tab(label="Power"):
                    self.components["power_plot"] = gr.Plot()

            # Process information
            with gr.Row(visible=self.app.monitoring.gpu.has_nvidia_gpus):
                with gr.Column():
                    gr.Markdown("### Active Processes")
                    self.components["process_info"] = gr.Markdown("Loading process information...")

            # GPU information summary
            with gr.Row(visible=self.app.monitoring.gpu.has_nvidia_gpus):
                with gr.Column():
                    gr.Markdown("### GPU Information")
                    self.components["gpu_info"] = gr.Markdown("Loading GPU information...")

            # Toggle for enabling/disabling auto-refresh
            with gr.Row():
                self.components["auto_refresh"] = gr.Checkbox(
                    label=f"Auto refresh (every {self.refresh_interval} seconds)",
                    value=True,
                    info="Automatically refresh GPU metrics"
                )
                self.components["refresh_btn"] = gr.Button("Refresh Now")

            # Timer driving the auto-refresh
            self.components["refresh_timer"] = gr.Timer(
                value=self.refresh_interval
            )
        return tab

    def connect_events(self) -> None:
        """Connect event handlers to UI components."""
        # All refresh paths update the same six outputs, in this order.
        refresh_outputs = [
            self.components["current_metrics"],
            self.components["utilization_plot"],
            self.components["memory_plot"],
            self.components["power_plot"],
            self.components["process_info"],
            self.components["gpu_info"]
        ]

        # GPU selector (only wired when multiple GPUs were detected)
        if self.app.monitoring.gpu.gpu_count > 1 and "gpu_selector" in self.components:
            self.components["gpu_selector"].change(
                fn=self.update_selected_gpu,
                inputs=[self.components["gpu_selector"]],
                outputs=refresh_outputs
            )

        # Manual refresh button
        self.components["refresh_btn"].click(
            fn=self.refresh_all,
            outputs=refresh_outputs
        )

        # Auto-refresh timer; the checkbox value gates the actual work
        self.components["refresh_timer"].tick(
            fn=self.conditional_refresh,
            inputs=[self.components["auto_refresh"]],
            outputs=refresh_outputs
        )

    def on_enter(self):
        """Called when the tab is selected; triggers an initial refresh."""
        return self.refresh_all()

    def update_selected_gpu(self, gpu_selector: str) -> Tuple:
        """Update the selected GPU and refresh data.

        Args:
            gpu_selector: Selected GPU string ("GPU X")

        Returns:
            Updated values for all six refresh outputs.
        """
        # Extract GPU index from the "GPU X" selector string; fall back to
        # GPU 0 on malformed or missing input.
        try:
            self.selected_gpu = int(gpu_selector.replace("GPU ", ""))
        except (ValueError, AttributeError):
            self.selected_gpu = 0
        return self.refresh_all()

    def conditional_refresh(self, auto_refresh: bool) -> Tuple:
        """Refresh only if auto-refresh is enabled.

        Args:
            auto_refresh: Whether auto-refresh is enabled

        Returns:
            Updated values for all outputs, or no-op updates when disabled.
        """
        if auto_refresh:
            return self.refresh_all()
        # Auto-refresh disabled: emit gr.update() no-ops so the displayed
        # values are left untouched. (Returning component.value here would
        # resend the components' *initial* values — e.g. "Loading GPU
        # metrics..." — wiping out the currently displayed data.)
        return tuple(gr.update() for _ in range(6))

    def refresh_all(self) -> Tuple:
        """Refresh all GPU monitoring components.

        Returns:
            Tuple of (metrics markdown, utilization plot, memory plot,
            power plot, process markdown, GPU info markdown).
        """
        try:
            if not self.app.monitoring.gpu.has_nvidia_gpus:
                return (
                    "No NVIDIA GPUs detected",
                    None,
                    None,
                    None,
                    "No process information available",
                    "No GPU information available"
                )

            # Current metrics for all GPUs; bail out if the selected index
            # is out of range (e.g. a GPU disappeared).
            all_metrics = self.app.monitoring.gpu.get_current_metrics()
            if not all_metrics or self.selected_gpu >= len(all_metrics):
                return (
                    "GPU metrics not available",
                    None,
                    None,
                    None,
                    "No process information available",
                    "No GPU information available"
                )

            gpu_metrics = all_metrics[self.selected_gpu]

            # Render the markdown sections
            metrics_html = self.format_current_metrics(gpu_metrics)
            process_info_html = self.format_process_info(gpu_metrics)

            gpu_info = self.app.monitoring.gpu.get_gpu_info()
            gpu_info_html = self.format_gpu_info(gpu_info[self.selected_gpu] if self.selected_gpu < len(gpu_info) else {})

            # Generate history plots for the selected GPU
            utilization_plot = self.app.monitoring.gpu.generate_utilization_plot(self.selected_gpu)
            memory_plot = self.app.monitoring.gpu.generate_memory_plot(self.selected_gpu)
            power_plot = self.app.monitoring.gpu.generate_power_plot(self.selected_gpu)

            return (
                metrics_html,
                utilization_plot,
                memory_plot,
                power_plot,
                process_info_html,
                gpu_info_html
            )
        except Exception as e:
            # Top-level UI boundary: log with traceback and surface the
            # error text in the markdown panes instead of crashing the tab.
            logger.error(f"Error refreshing GPU data: {str(e)}", exc_info=True)
            error_msg = f"Error retrieving GPU data: {str(e)}"
            return (
                error_msg,
                None,
                None,
                None,
                error_msg,
                error_msg
            )

    def format_current_metrics(self, metrics: Dict[str, Any]) -> str:
        """Format current GPU metrics as HTML/Markdown.

        Args:
            metrics: Current metrics dictionary

        Returns:
            Formatted HTML/Markdown string
        """
        if 'error' in metrics:
            return f"Error retrieving GPU metrics: {metrics['error']}"

        # Format timestamp
        if isinstance(metrics.get('timestamp'), datetime):
            timestamp_str = metrics['timestamp'].strftime('%Y-%m-%d %H:%M:%S')
        else:
            timestamp_str = "Unknown"

        # Color-code utilization: green < 70% < orange < 90% < red
        util_style = "color: green;"
        if metrics.get('utilization_gpu', 0) > 90:
            util_style = "color: red; font-weight: bold;"
        elif metrics.get('utilization_gpu', 0) > 70:
            util_style = "color: orange;"

        # Color-code memory usage with the same thresholds
        mem_style = "color: green;"
        if metrics.get('memory_percent', 0) > 90:
            mem_style = "color: red; font-weight: bold;"
        elif metrics.get('memory_percent', 0) > 70:
            mem_style = "color: orange;"

        # Color-code temperature: green < 75°C < orange < 85°C < red
        temp_style = "color: green;"
        temp = metrics.get('temperature', 0)
        if temp > 85:
            temp_style = "color: red; font-weight: bold;"
        elif temp > 75:
            temp_style = "color: orange;"

        # Memory usage in GB (raw values are bytes)
        memory_used_gb = metrics.get('memory_used', 0) / (1024**3)
        memory_total_gb = metrics.get('memory_total', 0) / (1024**3)

        # Power line is optional — some GPUs don't report power draw
        power_html = ""
        if metrics.get('power_usage') is not None:
            power_html = f"**Power Usage:** {metrics['power_usage']:.1f}W\n"

        html = f"""
### Current Status as of {timestamp_str}
**GPU Utilization:** <span style="{util_style}">{metrics.get('utilization_gpu', 0):.1f}%</span>
**Memory Usage:** <span style="{mem_style}">{metrics.get('memory_percent', 0):.1f}% ({memory_used_gb:.2f}/{memory_total_gb:.2f} GB)</span>
**Temperature:** <span style="{temp_style}">{metrics.get('temperature', 0)}°C</span>
{power_html}
"""
        return html

    def format_process_info(self, metrics: Dict[str, Any]) -> str:
        """Format GPU process information as HTML/Markdown.

        Args:
            metrics: Current metrics dictionary with process information

        Returns:
            Formatted HTML/Markdown string
        """
        if 'error' in metrics:
            return "Process information not available"

        processes = metrics.get('processes', [])
        if not processes:
            return "No active processes using this GPU"

        # Sort processes by memory usage (descending)
        sorted_processes = sorted(processes, key=lambda p: p.get('memory_used', 0), reverse=True)

        html = "| PID | Process Name | Memory Usage |\n"
        html += "|-----|-------------|-------------|\n"
        for proc in sorted_processes:
            pid = proc.get('pid', 'Unknown')
            name = proc.get('name', 'Unknown')
            mem_mb = proc.get('memory_used', 0) / (1024**2)  # bytes -> MB
            html += f"| {pid} | {name} | {mem_mb:.1f} MB |\n"
        return html

    def format_gpu_info(self, info: Dict[str, Any]) -> str:
        """Format GPU information as HTML/Markdown.

        Args:
            info: GPU information dictionary

        Returns:
            Formatted HTML/Markdown string
        """
        if 'error' in info:
            return f"GPU information not available: {info.get('error', 'Unknown error')}"

        # Format memory in GB (raw value is bytes)
        memory_total_gb = info.get('memory_total', 0) / (1024**3)

        html = f"""
**Name:** {info.get('name', 'Unknown')}
**Memory:** {memory_total_gb:.2f} GB
**UUID:** {info.get('uuid', 'N/A')}
**Compute Capability:** {info.get('compute_capability', 'N/A')}
"""
        # Power limit is optional — append only when reported
        if info.get('power_limit') is not None:
            html += f"**Power Limit:** {info['power_limit']:.1f}W\n"
        return html