"""
Analytics Charts Component

Interactive visualizations for leaderboard analytics.
"""

import traceback
from typing import List, Dict, Any, Optional

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Metrics where a smaller raw value is the better outcome; their heatmap score
# is inverted so that "good" is always close to 1 (green).
_LOWER_IS_BETTER = frozenset({'avg_duration_ms', 'total_cost_usd', 'co2_emissions_g'})

# Cell text formats for the heatmap, keyed by metric column.
_HEATMAP_VALUE_FORMATS = {
    'success_rate': '{:.1f}%',
    'avg_duration_ms': '{:.0f}ms',
    'total_cost_usd': '${:.4f}',
    'co2_emissions_g': '{:.2f}g',
    'gpu_utilization_avg': '{:.1f}%',
}
_DEFAULT_VALUE_FORMAT = '{:.0f}'

# Marker size used when the column that drives bubble size is absent or NaN.
_DEFAULT_MARKER_SIZE = 30

# Optional panels of the trends plot:
# (column, subplot title, trace name, line colour, hovertemplate, y-axis title)
_OPTIONAL_TREND_PANELS = (
    ('gpu_utilization_avg', "GPU Utilization Over Time", 'GPU Utilization',
     '#9B59B6', '%{x}<br>GPU Util: %{y:.1f}%', "GPU Utilization (%)"),
    ('gpu_memory_avg_mib', "GPU Memory Usage Over Time", 'GPU Memory',
     '#1ABC9C', '%{x}<br>GPU Memory: %{y:.0f} MiB', "GPU Memory (MiB)"),
    ('gpu_temperature_avg', "GPU Temperature Over Time", 'GPU Temperature',
     '#E74C3C', '%{x}<br>GPU Temp: %{y:.1f}°C', "GPU Temperature (°C)"),
    ('power_cost_total_usd', "Power Cost Over Time", 'Power Cost',
     '#F39C12', '%{x}<br>Power Cost: $%{y:.4f}', "Power Cost (USD)"),
)


def _short_model_name(model: Any) -> str:
    """Return the part of a model id after the last '/' (or the id itself)."""
    return str(model).split('/')[-1]


def _aggregate_by_model(df: pd.DataFrame, aggregations: Dict[str, str]) -> pd.DataFrame:
    """
    Group ``df`` by 'model', applying only the aggregations whose column exists.

    Passing a column that is not in the frame to ``groupby().agg()`` raises a
    KeyError, so optional columns have to be filtered out up front.
    """
    present = {col: how for col, how in aggregations.items() if col in df.columns}
    return df.groupby('model').agg(present).reset_index()


def _score_row(values: Any, lower_is_better: bool) -> np.ndarray:
    """
    Scale one heatmap row by its maximum, ignoring NaN entries.

    NaN inputs stay NaN (rendered as gaps). A row whose maximum is not
    positive scores 0 everywhere.
    """
    values = np.asarray(values, dtype=float)
    if values.size == 0 or np.isnan(values).all():
        return np.full(values.shape, np.nan)

    peak = np.nanmax(values)
    if not peak > 0:
        return np.zeros_like(values)

    scaled = values / peak
    return 1 - scaled if lower_is_better else scaled


def _normalize_unit(values: Any, invert: bool = False) -> List[float]:
    """Min-max normalize to 0-1, optionally inverted (lower is better)."""
    values = np.asarray(values, dtype=float)
    if values.size == 0 or np.isnan(values).all():
        return [0.5] * len(values)

    min_val, max_val = np.nanmin(values), np.nanmax(values)
    if max_val == min_val:
        # No spread between runs: place everyone in the middle of the scale.
        return [0.5] * len(values)

    normalized = (values - min_val) / (max_val - min_val)
    if invert:
        normalized = 1 - normalized
    return normalized.tolist()


def _run_metric(run: Dict[str, Any], key: str) -> float:
    """Read a numeric metric from a run dict, treating missing/None as 0."""
    value = run.get(key)
    return 0.0 if value is None else float(value)


def _add_log_x_label(fig: go.Figure, x_value: float, text: str,
                     xanchor: str, yanchor: str) -> None:
    """
    Put a text label at data position ``x_value`` on a log-scaled x axis.

    Annotation coordinates on a log axis are expressed as log10 of the data
    value, so the conversion is done here. The label is pinned to the top of
    the plotting area (paper y = 1).
    """
    fig.add_annotation(
        x=float(np.log10(x_value)),
        xref='x',
        y=1,
        yref='paper',
        text=text,
        showarrow=False,
        xanchor=xanchor,
        yanchor=yanchor
    )


def create_performance_heatmap(df: pd.DataFrame) -> go.Figure:
    """
    Create an interactive heatmap of models × metrics

    Args:
        df: Leaderboard DataFrame with metrics

    Returns:
        Plotly figure with heatmap visualization
    """
    if df.empty:
        return _create_empty_figure("No data available for heatmap")

    if 'model' not in df.columns:
        return _create_empty_figure("Missing required column: model")

    # Select metrics to display
    metrics = [
        'success_rate',
        'avg_duration_ms',
        'total_cost_usd',
        'co2_emissions_g',
        'gpu_utilization_avg',
        'total_tokens'
    ]

    # Filter to only available metrics
    available_metrics = [m for m in metrics if m in df.columns]
    if not available_metrics:
        return _create_empty_figure("No metrics available for analysis")

    # Aggregate by model (in case of multiple runs)
    model_stats = df.groupby('model')[available_metrics].mean()
    if model_stats.empty:
        return _create_empty_figure("No data available for heatmap")

    # Prepare data matrix (rows=metrics, columns=models)
    heatmap_data = []
    heatmap_text = []
    metric_labels = []

    for metric in available_metrics:
        values = model_stats[metric].values

        # Score is 0-1 with 1 = best; lower-is-better metrics are inverted.
        heatmap_data.append(_score_row(values, metric in _LOWER_IS_BETTER))

        # Cell / hover text shows the actual value; missing values read "N/A".
        value_format = _HEATMAP_VALUE_FORMATS.get(metric, _DEFAULT_VALUE_FORMAT)
        heatmap_text.append(
            ["N/A" if pd.isna(v) else value_format.format(v) for v in values]
        )

        # Title-case first, then restore the acronym (title() would turn
        # 'USD' back into 'Usd' if applied last).
        metric_labels.append(metric.replace('_', ' ').title().replace('Usd', 'USD'))

    # Shorten model names: drop the provider prefix, then cap the length
    model_labels = []
    for model in model_stats.index.tolist():
        short = _short_model_name(model)
        model_labels.append(short[:20] + '...' if len(short) > 20 else short)

    # Create heatmap
    fig = go.Figure(data=go.Heatmap(
        z=heatmap_data,
        x=model_labels,
        y=metric_labels,
        text=heatmap_text,
        texttemplate='%{text}',
        textfont={"size": 10},
        colorscale='RdYlGn',  # Red (bad) → Yellow → Green (good)
        hoverongaps=False,
        hovertemplate='%{y}<br>Model: %{x}<br>Value: %{text}<br>Score: %{z:.2f}',
        colorbar=dict(
            title=dict(
                text="Performance<br>Score",
                side="right"
            ),
            tickmode="linear",
            tick0=0,
            dtick=0.25
        )
    ))

    fig.update_layout(
        title={
            'text': '🔥 Model Performance Heatmap',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 20}
        },
        xaxis_title='Model',
        yaxis_title='Metric',
        height=500,
        plot_bgcolor='#f8f9fa',
        paper_bgcolor='white',
        xaxis=dict(tickangle=-45),
        margin=dict(l=150, r=100, t=100, b=150),
    )

    return fig


def create_speed_accuracy_scatter(df: pd.DataFrame) -> go.Figure:
    """
    Speed vs Accuracy trade-off scatter plot

    Args:
        df: Leaderboard DataFrame. Requires 'model', 'success_rate' and
            'avg_duration_ms'; 'total_cost_usd' (bubble size) and
            'agent_type' (colour / legend grouping) are optional.

    Returns:
        Plotly figure with scatter plot
    """
    if df.empty:
        return _create_empty_figure("No data available for scatter plot")

    # Check required columns
    required_cols = ['model', 'success_rate', 'avg_duration_ms']
    if not all(col in df.columns for col in required_cols):
        return _create_empty_figure(f"Missing required columns: {required_cols}")

    # Aggregate by model; optional columns are only aggregated when present
    model_stats = _aggregate_by_model(df, {
        'success_rate': 'mean',
        'avg_duration_ms': 'mean',
        'total_cost_usd': 'mean',
        'agent_type': 'first',
    })
    # Rows without both coordinates cannot be placed on the chart
    model_stats = model_stats.dropna(subset=['success_rate', 'avg_duration_ms'])
    if model_stats.empty:
        return _create_empty_figure("No data available for scatter plot")

    has_cost = 'total_cost_usd' in model_stats.columns
    has_agent_type = 'agent_type' in model_stats.columns

    if has_agent_type:
        # Models with no recorded agent type are grouped under 'unknown'
        model_stats['agent_type'] = model_stats['agent_type'].fillna('unknown')
        agent_types = model_stats['agent_type'].unique()
    else:
        agent_types = ['all']

    # Color scheme
    colors = {
        'tool': '#E67E22',     # Orange
        'code': '#3498DB',     # Blue
        'both': '#9B59B6',     # Purple
        'all': '#1ABC9C',      # Teal
        'unknown': '#95A5A6'   # Gray
    }

    fig = go.Figure()

    for agent_type in agent_types:
        if has_agent_type:
            subset = model_stats[model_stats['agent_type'] == agent_type]
        else:
            subset = model_stats

        # Prepare hover text
        hover_texts = []
        for _, row in subset.iterrows():
            hover = f"{_short_model_name(row['model'])}<br>"
            hover += f"Success Rate: {row['success_rate']:.1f}%<br>"
            hover += f"Avg Duration: {row['avg_duration_ms']:.0f}ms<br>"
            if has_cost and pd.notna(row['total_cost_usd']):
                hover += f"Cost: ${row['total_cost_usd']:.4f}"
            hover_texts.append(hover)

        # Bubble size based on cost (if available)
        if has_cost:
            sizes = subset['total_cost_usd'] * 5000  # Scale up for visibility
            sizes = sizes.clip(lower=10, upper=100)  # Reasonable range
            sizes = sizes.fillna(_DEFAULT_MARKER_SIZE)  # No cost recorded
        else:
            sizes = _DEFAULT_MARKER_SIZE

        fig.add_trace(go.Scatter(
            x=subset['avg_duration_ms'],
            y=subset['success_rate'],
            mode='markers+text',
            name=str(agent_type).title(),
            marker=dict(
                size=sizes,
                color=colors.get(str(agent_type).lower(), colors['unknown']),
                opacity=0.7,
                line=dict(width=2, color='white')
            ),
            text=[_short_model_name(m)[:15] for m in subset['model']],
            textposition='top center',
            textfont=dict(size=9),
            hovertext=hover_texts,
            hoverinfo='text'
        ))

    # Add quadrant lines (median split)
    if len(model_stats) > 1:
        median_speed = model_stats['avg_duration_ms'].median()
        median_accuracy = model_stats['success_rate'].median()

        fig.add_hline(
            y=median_accuracy,
            line_dash="dash",
            line_color="gray",
            opacity=0.4,
            annotation_text=f"Median Accuracy: {median_accuracy:.1f}%",
            annotation_position="right"
        )

        # The x axis is logarithmic, so only positive positions can be drawn.
        if median_speed > 0:
            fig.add_vline(
                x=median_speed,
                line_dash="dash",
                line_color="gray",
                opacity=0.4
            )
            # The label is added separately so its x coordinate can be given
            # in log10 units, as annotations on a log axis require.
            _add_log_x_label(
                fig,
                median_speed,
                f"Median Speed: {median_speed:.0f}ms",
                xanchor='center',
                yanchor='bottom'
            )

        # Add zone annotation, centred (on the log scale) between the fastest
        # model and the median
        max_accuracy = model_stats['success_rate'].max()
        min_speed = model_stats['avg_duration_ms'].min()

        if min_speed > 0 and median_speed > 0:
            fig.add_annotation(
                x=float((np.log10(min_speed) + np.log10(median_speed)) / 2),
                y=max_accuracy * 0.98,
                text="⭐ Fast & Accurate",
                showarrow=False,
                font=dict(size=14, color='green', family='Arial Black'),
                bgcolor='rgba(144, 238, 144, 0.2)',
                borderpad=5
            )

    fig.update_layout(
        title={
            'text': '⚡ Speed vs Accuracy Trade-off',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 20}
        },
        xaxis_title='Average Duration (ms)',
        yaxis_title='Success Rate (%)',
        xaxis_type='log',  # Log scale for duration
        height=600,
        plot_bgcolor='white',
        paper_bgcolor='#f8f9fa',
        showlegend=True,
        legend=dict(
            title=dict(text='Agent Type'),
            orientation="v",
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99
        ),
        hovermode='closest'
    )

    # Add grid for better readability
    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

    return fig


def create_cost_efficiency_scatter(df: pd.DataFrame) -> go.Figure:
    """
    Cost-Performance Efficiency scatter plot

    Args:
        df: Leaderboard DataFrame. Requires 'model', 'success_rate' and
            'total_cost_usd'; 'avg_duration_ms' (bubble size) and 'provider'
            (colour / legend grouping) are optional.

    Returns:
        Plotly figure with cost efficiency scatter
    """
    if df.empty:
        return _create_empty_figure("No data available for cost analysis")

    # Check required columns
    required_cols = ['model', 'success_rate', 'total_cost_usd']
    if not all(col in df.columns for col in required_cols):
        return _create_empty_figure(
            "Missing required columns: model, success_rate, total_cost_usd"
        )

    # Aggregate by model; optional columns are only aggregated when present
    model_stats = _aggregate_by_model(df, {
        'success_rate': 'mean',
        'total_cost_usd': 'mean',
        'avg_duration_ms': 'mean',
        'provider': 'first',
    })
    # Rows without both coordinates cannot be placed on the chart
    model_stats = model_stats.dropna(subset=['success_rate', 'total_cost_usd'])
    if model_stats.empty:
        return _create_empty_figure("No data available for cost analysis")

    has_duration = 'avg_duration_ms' in model_stats.columns
    has_provider = 'provider' in model_stats.columns

    # Handle zero costs for log scale visualization:
    # floor the plotted cost at a small epsilon so that zero-cost models stay
    # visible on the log axis. Hover text still reports the real cost.
    EPSILON = 0.00001
    model_stats['total_cost_usd_display'] = model_stats['total_cost_usd'].clip(lower=EPSILON)

    # Efficiency metric: success_rate / cost (offset avoids division by zero)
    model_stats['efficiency'] = model_stats['success_rate'] / (model_stats['total_cost_usd'] + 0.0001)

    if has_provider:
        # Models with no recorded provider are grouped under 'unknown'
        model_stats['provider'] = model_stats['provider'].fillna('unknown')
        providers = model_stats['provider'].unique()
    else:
        providers = ['all']

    # Color scheme
    provider_colors = {
        'litellm': '#3498DB',       # Blue (API)
        'transformers': '#2ECC71',  # Green (GPU/local)
        'all': '#9B59B6',           # Purple
        'unknown': '#95A5A6'        # Gray
    }

    fig = go.Figure()

    for provider in providers:
        if has_provider:
            subset = model_stats[model_stats['provider'] == provider]
        else:
            subset = model_stats

        # Prepare hover text
        hover_texts = []
        for _, row in subset.iterrows():
            hover = f"{_short_model_name(row['model'])}<br>"
            hover += f"Success Rate: {row['success_rate']:.1f}%<br>"

            # Show actual cost (even if zero) in hover text
            if row['total_cost_usd'] == 0:
                hover += "Total Cost: $0.0000 (No cost data)<br>"
            else:
                hover += f"Total Cost: ${row['total_cost_usd']:.4f}<br>"

            hover += f"Efficiency: {row['efficiency']:.0f} (points/$)<br>"
            if has_duration and pd.notna(row['avg_duration_ms']):
                hover += f"Duration: {row['avg_duration_ms']:.0f}ms"
            hover_texts.append(hover)

        # Bubble size based on duration (if available): longer runs get
        # larger bubbles
        if has_duration:
            sizes = subset['avg_duration_ms'] / 100  # Scale down
            sizes = sizes.clip(lower=10, upper=80)   # Reasonable range
            sizes = sizes.fillna(_DEFAULT_MARKER_SIZE)  # No duration recorded
        else:
            sizes = _DEFAULT_MARKER_SIZE

        fig.add_trace(go.Scatter(
            x=subset['total_cost_usd_display'],  # Use adjusted cost for log scale
            y=subset['success_rate'],
            mode='markers+text',
            name=str(provider).title(),
            marker=dict(
                size=sizes,
                color=provider_colors.get(str(provider).lower(), provider_colors['unknown']),
                opacity=0.7,
                line=dict(width=2, color='white')
            ),
            text=[_short_model_name(m)[:15] for m in subset['model']],
            textposition='top center',
            textfont=dict(size=9),
            hovertext=hover_texts,
            hoverinfo='text'
        ))

    # Visible x range (data units); also used as the left edge of the bands
    axis_min = model_stats['total_cost_usd_display'].min() * 0.5
    axis_max = model_stats['total_cost_usd_display'].max() * 2

    # Add cost bands. A log axis has no position for 0, so the lowest band
    # starts at the visible left edge instead.
    max_cost = model_stats['total_cost_usd'].max()
    bands = []
    if max_cost > 0.01:
        # Budget band: < $0.01
        bands.append((axis_min, 0.01, "lightgreen", "Budget"))
    if max_cost > 0.10:
        # Mid band: $0.01-$0.10
        bands.append((max(axis_min, 0.01), 0.10, "yellow", "Mid-Range"))
        # Premium band: > $0.10
        bands.append((max(axis_min, 0.10), max_cost * 1.1, "orange", "Premium"))

    for band_start, band_end, fill, label in bands:
        if band_start >= band_end:
            continue  # Band lies entirely left of the visible range
        fig.add_vrect(
            x0=band_start,
            x1=band_end,
            fillcolor=fill,
            opacity=0.1,
            layer="below",
            line_width=0
        )
        # Band label in the top-left corner of the band; added separately so
        # its x coordinate can be given in log10 units.
        _add_log_x_label(fig, band_start, label, xanchor='left', yanchor='top')

    # Highlight top 3 most efficient models
    top_efficient = model_stats.nlargest(3, 'efficiency')
    for _, row in top_efficient.iterrows():
        fig.add_annotation(
            # Annotation positions on a log axis are log10 of the data value
            x=float(np.log10(row['total_cost_usd_display'])),
            y=row['success_rate'],
            text="⭐",
            showarrow=False,
            font=dict(size=20)
        )

    fig.update_layout(
        title={
            'text': '💰 Cost-Performance Efficiency',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 20}
        },
        xaxis_title='Total Cost (USD)',
        yaxis_title='Success Rate (%)',
        xaxis_type='log',  # Log scale for cost
        xaxis=dict(
            range=[np.log10(axis_min), np.log10(axis_max)],  # Explicit log range
            showgrid=True,
            gridwidth=1,
            gridcolor='lightgray'
        ),
        height=600,
        plot_bgcolor='white',
        paper_bgcolor='#f8f9fa',
        showlegend=True,
        legend=dict(
            title=dict(text='Provider'),
            orientation="v",
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99
        ),
        hovermode='closest'
    )

    # Add grid for better readability
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

    return fig


def _create_empty_figure(message: str) -> go.Figure:
    """
    Create an empty figure with a message

    Args:
        message: Message to display

    Returns:
        Plotly figure with annotation
    """
    fig = go.Figure()

    fig.add_annotation(
        text=message,
        xref="paper",
        yref="paper",
        x=0.5,
        y=0.5,
        xanchor='center',
        yanchor='middle',
        showarrow=False,
        font=dict(size=16, color='gray')
    )

    fig.update_layout(
        height=500,
        plot_bgcolor='white',
        paper_bgcolor='#f8f9fa',
        xaxis=dict(showgrid=False, showticklabels=False, zeroline=False),
        yaxis=dict(showgrid=False, showticklabels=False, zeroline=False)
    )

    return fig


def create_comparison_radar(runs: List[Dict[str, Any]]) -> go.Figure:
    """
    Create a multi-dimensional radar chart comparing 2-3 runs

    Args:
        runs: List of run data dictionaries (2-3 models). Only the first
            three entries are plotted. Metrics that are missing or None are
            treated as 0.

    Returns:
        Plotly figure with radar chart comparison
    """
    if not runs or len(runs) < 2:
        return _create_empty_figure("Please select at least 2 runs to compare")

    runs = runs[:3]  # Limit to 3 runs for readability

    # Extract metrics from all runs
    success_rates = [_run_metric(run, 'success_rate') / 100 for run in runs]  # Percent → 0-1
    durations = [_run_metric(run, 'avg_duration_ms') for run in runs]
    costs = [_run_metric(run, 'total_cost_usd') for run in runs]
    co2 = [_run_metric(run, 'co2_emissions_g') for run in runs]
    gpu_util = [run.get('gpu_utilization_avg') for run in runs]

    # Token Efficiency: success per 1000 tokens (floored to avoid dividing by zero)
    token_efficiency = [
        (_run_metric(run, 'success_rate') / 100) / max(_run_metric(run, 'total_tokens') / 1000, 0.001)
        for run in runs
    ]

    # Build dimensions (each a list with one 0-1 score per run)
    dimension_names = ['Success Rate', 'Speed', 'Cost Efficiency',
                       'Token Efficiency', 'CO2 Efficiency']
    dimensions = [
        success_rates,                        # Already 0-1
        _normalize_unit(durations, invert=True),   # Faster is better
        _normalize_unit(costs, invert=True),       # Cheaper is better
        _normalize_unit(token_efficiency),         # Higher is better
        _normalize_unit(co2, invert=True),         # Lower CO2 is better
    ]

    # Add GPU Utilization if at least one run reports it
    if any(g is not None for g in gpu_util):
        dimensions.append([g / 100 if g is not None else 0 for g in gpu_util])
        dimension_names.append('GPU Utilization')

    # Create radar chart
    fig = go.Figure()

    colors = ['#667eea', '#f093fb', '#43e97b']  # Purple, Pink, Green

    # Close each polygon by repeating the first axis at the end
    theta_closed = dimension_names + [dimension_names[0]]

    for idx, run in enumerate(runs):
        model_name = run.get('model')
        if model_name is None:
            model_name = f'Run {idx+1}'
        model_name = _short_model_name(model_name)  # Show only model name, not provider

        # Extract values for this run across all dimensions
        values = [dim[idx] for dim in dimensions]
        values_closed = values + [values[0]]

        fig.add_trace(go.Scatterpolar(
            r=values_closed,
            theta=theta_closed,
            name=model_name,
            fill='toself',
            fillcolor=colors[idx],
            opacity=0.3,
            line=dict(color=colors[idx], width=2),
            marker=dict(size=8, color=colors[idx]),
            hovertemplate='%{theta}<br>' + 'Score: %{r:.2f}<br>' + f'{model_name}'
        ))

    fig.update_layout(
        polar=dict(
            bgcolor='#f8f9fa',
            radialaxis=dict(
                visible=True,
                range=[0, 1],
                showticklabels=True,
                ticks='',
                gridcolor='rgba(100, 100, 100, 0.2)',
                tickfont=dict(size=10)
            ),
            angularaxis=dict(
                gridcolor='rgba(100, 100, 100, 0.2)',
                linecolor='rgba(100, 100, 100, 0.4)',
                tickfont=dict(size=12, color='#0f172a')
            )
        ),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.2,
            xanchor="center",
            x=0.5,
            bgcolor='rgba(255, 255, 255, 0.8)',
            bordercolor='#ccc',
            borderwidth=1
        ),
        title=dict(
            text='Multi-Dimensional Model Comparison',
            x=0.5,
            xanchor='center',
            font=dict(size=18, color='#0f172a', family='Inter, sans-serif')
        ),
        height=600,
        paper_bgcolor='white',
        font=dict(family='Inter, sans-serif')
    )

    return fig


def create_trends_plot(df: pd.DataFrame) -> go.Figure:
    """
    Create trends visualization over time with enhanced GPU metrics

    Success rate and cost are always plotted; one extra panel is added for
    each GPU / power column that is present and has at least one value.
    The input DataFrame is not modified.

    Args:
        df: Leaderboard DataFrame with timestamp or evaluation_date column

    Returns:
        Plotly figure showing trends, or a placeholder figure carrying a
        message when the data is unusable or plotting fails
    """
    try:
        # Use evaluation_date or timestamp depending on what's available
        date_col = 'evaluation_date' if 'evaluation_date' in df.columns else 'timestamp'

        if df.empty or date_col not in df.columns:
            return _create_empty_figure("No trend data available")

        missing = [col for col in ('success_rate', 'total_cost_usd') if col not in df.columns]
        if missing:
            return _create_empty_figure(f"Missing required columns: {missing}")

        # Parse dates on a copy (assign returns a new frame) so the caller's
        # DataFrame keeps its original column; unparseable dates become NaT.
        parsed_dates = pd.to_datetime(df[date_col], errors='coerce')
        df_sorted = df.assign(**{date_col: parsed_dates}).sort_values(date_col)

        # Panels: (column, subplot title, trace name, colour, hovertemplate,
        #          y-axis title, drop rows where the column is NaN)
        panels = [
            ('success_rate', "Success Rate Over Time", 'Success Rate',
             '#3498DB', '%{x}<br>Success Rate: %{y:.1f}%', "Success Rate (%)", False),
            ('total_cost_usd', "Cost Over Time", 'Cost (USD)',
             '#E67E22', '%{x}<br>Cost: $%{y:.4f}', "Cost (USD)", False),
        ]
        for column, title, name, color, hover, axis_title in _OPTIONAL_TREND_PANELS:
            # Optional metrics get a panel only when they carry some data
            if column in df_sorted.columns and df_sorted[column].notna().any():
                panels.append((column, title, name, color, hover, axis_title, True))

        num_plots = len(panels)

        fig = make_subplots(
            rows=num_plots,
            cols=1,
            subplot_titles=[panel[1] for panel in panels],
            vertical_spacing=0.08
        )

        for row_number, panel in enumerate(panels, start=1):
            column, _, name, color, hover, axis_title, drop_missing = panel
            data = df_sorted[df_sorted[column].notna()] if drop_missing else df_sorted

            fig.add_trace(
                go.Scatter(
                    x=data[date_col],
                    y=data[column],
                    mode='lines+markers',
                    name=name,
                    line=dict(color=color, width=2),
                    marker=dict(size=6),
                    hovertemplate=hover
                ),
                row=row_number,
                col=1
            )
            fig.update_yaxes(title_text=axis_title, row=row_number, col=1)

        # Only the bottom panel carries the shared x-axis title
        fig.update_xaxes(title_text="Date", row=num_plots, col=1)

        # Calculate dynamic height based on number of plots
        plot_height = max(400, num_plots * 200)

        fig.update_layout(
            height=plot_height,
            showlegend=False,
            margin=dict(l=50, r=50, t=50, b=50)
        )

        return fig

    except Exception as e:
        # Deliberate catch-all: a chart failure should surface as a message
        # in the figure rather than propagate to the caller.
        print(f"[ERROR] Creating trends plot: {e}")
        traceback.print_exc()
        return _create_empty_figure(f"Error creating trends: {str(e)}")