Spaces:
Running
Running
"""
Leaderboard HTML Table Generator

Creates styled HTML tables for the leaderboard view.
"""
from datetime import datetime
from html import escape
from typing import Optional

import pandas as pd

from .metric_displays import (
    get_rank_badge,
    get_success_rate_bar,
    get_gpu_utilization_bar,
    get_provider_badge,
    get_agent_type_badge,
    get_hardware_badge,
    format_cost,
    format_duration,
    get_tooltip_icon,
)
def generate_leaderboard_html(
    df: pd.DataFrame,
    sort_by: str = "success_rate",
    ascending: bool = False
) -> str:
    """
    Generate the styled HTML table for the leaderboard view.

    The frame is sorted by ``sort_by`` and one ``<tr>`` is emitted per run.
    Every column of a row is also exposed on its ``<tr>`` as a ``data-*``
    attribute (underscores in the column name become hyphens, missing values
    become the string "None").

    Free-text values taken from the DataFrame (model, run id, submitter, the
    timestamp fallback text and all ``data-*`` values) are HTML-escaped before
    being interpolated, so submitted content cannot break out of the table
    markup.

    Args:
        df: Leaderboard DataFrame
        sort_by: Column to sort by
        ascending: Sort order (False = descending)

    Returns:
        HTML string with the embedded stylesheet and the complete table

    Raises:
        KeyError: If ``sort_by`` is not a column of ``df`` (raised by
            ``DataFrame.sort_values``).

    Columns read for display (absent columns fall back to a default):
        - run_id: Run identifier, shortened to 8 characters in the cell
        - model, agent_type, provider, submitted_by
        - success_rate, total_tests, successful_tests, failed_tests
        - avg_steps, avg_duration_ms, total_tokens, total_cost_usd
        - co2_emissions_g
        - gpu_utilization_avg, gpu_memory_avg_mib, gpu_memory_max_mib,
          gpu_temperature_avg, gpu_temperature_max, gpu_power_avg_w
        - timestamp
    """
    # Rank is derived from the position after sorting.
    df_sorted = df.sort_values(by=sort_by, ascending=ascending).reset_index(drop=True)

    # Embedded stylesheet plus the opening table markup and header row.
    table_head = """
    <style>
    /* Leaderboard Table Styles */
    .tm-leaderboard-container {
        background: #F8FAFC; /* Light background for better readability */
        border-radius: 16px;
        overflow-x: auto; /* Enable horizontal scrolling */
        overflow-y: visible;
        border: 1px solid rgba(203, 213, 225, 0.8);
        margin: 20px 0;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        max-width: 100%;
    }
    /* Custom scrollbar styling */
    .tm-leaderboard-container::-webkit-scrollbar {
        height: 8px;
    }
    .tm-leaderboard-container::-webkit-scrollbar-track {
        background: #E2E8F0;
        border-radius: 4px;
    }
    .tm-leaderboard-container::-webkit-scrollbar-thumb {
        background: #94A3B8;
        border-radius: 4px;
    }
    .tm-leaderboard-container::-webkit-scrollbar-thumb:hover {
        background: #64748B;
    }
    .tm-leaderboard-table {
        width: 100%;
        min-width: 1650px; /* Reduced from 1800px after combining columns */
        border-collapse: collapse;
        font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
        background: #FFFFFF; /* White background */
        color: #0F172A; /* Dark text for maximum contrast */
    }
    .tm-leaderboard-table thead {
        background: linear-gradient(135deg, #6366F1 0%, #4F46E5 100%); /* Vibrant indigo gradient */
        position: sticky;
        top: 0;
        z-index: 10;
        backdrop-filter: blur(10px);
    }
    .tm-leaderboard-table th {
        padding: 16px 12px;
        text-align: left;
        font-weight: 600;
        color: #FFFFFF; /* Pure white for headers - good contrast */
        border-bottom: 2px solid #4338CA;
        font-size: 12px;
        text-transform: uppercase;
        letter-spacing: 0.05em;
        white-space: nowrap;
    }
    .tm-leaderboard-table td {
        padding: 14px 12px;
        border-bottom: 1px solid rgba(226, 232, 240, 0.8);
        color: #1E293B; /* Dark text for cells */
        font-size: 14px;
        vertical-align: middle;
    }
    .tm-leaderboard-table tbody tr {
        transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
        cursor: pointer;
    }
    .tm-leaderboard-table tbody tr:hover {
        background: rgba(99, 102, 241, 0.08) !important;
        box-shadow: 0 0 15px rgba(99, 102, 241, 0.15),
        inset 0 0 15px rgba(99, 102, 241, 0.05);
        transform: scale(1.002);
    }
    .tm-leaderboard-table tbody tr:nth-child(even) {
        background: rgba(241, 245, 249, 0.6); /* Light stripe */
    }
    .tm-model-name {
        font-weight: 600;
        color: #000000 !important; /* Pure black - readable in all themes */
        font-size: 15px;
        transition: color 0.2s ease;
    }
    .tm-leaderboard-table tr:hover .tm-model-name {
        color: #4F46E5 !important; /* Indigo on hover */
    }
    .tm-numeric-cell {
        font-family: 'Monaco', 'Menlo', monospace;
        font-size: 13px;
        text-align: center;
        color: #000000 !important; /* Pure black for numbers */
    }
    .tm-badge-cell {
        text-align: center;
    }
    .tm-run-id {
        font-family: 'Monaco', 'Menlo', monospace;
        font-size: 12px;
        color: #000000 !important; /* Pure black - readable in all themes */
        cursor: pointer;
        text-decoration: none;
        font-weight: 500;
        transition: all 0.2s ease;
    }
    .tm-run-id:hover {
        color: #4F46E5 !important; /* Indigo on hover */
        text-decoration: underline;
    }
    .tm-text-cell {
        color: #000000 !important; /* Pure black for all text */
        font-size: 0.9em;
    }
    /* Responsive Design */
    @media (max-width: 1024px) {
        .tm-leaderboard-table th,
        .tm-leaderboard-table td {
            padding: 10px 8px;
            font-size: 12px;
        }
        /* Hide less important columns on smaller screens */
        .tm-hide-mobile {
            display: none !important;
        }
    }
    @media (max-width: 768px) {
        .tm-leaderboard-table th:nth-child(n+7),
        .tm-leaderboard-table td:nth-child(n+7) {
            display: none !important;
        }
        .tm-model-name {
            font-size: 13px;
        }
    }
    @media (max-width: 480px) {
        /* Ultra-compact: Show only rank, model, and success rate */
        .tm-leaderboard-table th:nth-child(n+4),
        .tm-leaderboard-table td:nth-child(n+4) {
            display: none !important;
        }
        .tm-leaderboard-table th:nth-child(3),
        .tm-leaderboard-table td:nth-child(3) {
            display: table-cell !important;
        }
    }
    </style>
    <div class="tm-leaderboard-container">
    <table class="tm-leaderboard-table">
    <thead>
    <tr>
    <th style="width: 60px;">Rank</th>
    <th style="width: 110px;" title="Click to view detailed run information">Run ID</th>
    <th style="min-width: 160px;">Model</th>
    <th style="width: 80px;">Type</th>
    <th style="width: 90px;">Provider</th>
    <th style="width: 85px;" title="Hardware used for evaluation: GPU or CPU">Hardware</th>
    <th style="width: 150px;" title="Percentage of test cases that passed (0-100%). Higher is better.">
    Success Rate
    </th>
    <th style="width: 140px;" class="tm-numeric-cell" title="Tests: Total / Pass / Fail">
    Tests (P/F)
    </th>
    <th style="width: 70px;" class="tm-numeric-cell" title="Average number of steps per test case.">
    Steps
    </th>
    <th style="width: 100px;" class="tm-numeric-cell" title="Average time per test case. Lower is better.">
    Duration
    </th>
    <th style="width: 90px;" class="tm-numeric-cell" title="Total tokens used across all tests.">
    Tokens
    </th>
    <th style="width: 90px;" class="tm-numeric-cell" title="Total API + power costs in USD. Lower is better.">
    Cost
    </th>
    <th style="width: 80px;" class="tm-numeric-cell tm-hide-mobile" title="Carbon footprint in grams of CO2 equivalent.">
    CO2
    </th>
    <th style="width: 100px;" class="tm-hide-mobile" title="Average GPU usage during evaluation (0-100%). Only for GPU jobs.">
    GPU Util
    </th>
    <th style="width: 100px;" class="tm-numeric-cell tm-hide-mobile" title="GPU memory usage (avg/max in MiB). Only for GPU jobs.">
    GPU Mem
    </th>
    <th style="width: 100px;" class="tm-numeric-cell tm-hide-mobile" title="GPU temperature (avg/max in Celsius). Only for GPU jobs.">
    GPU Temp
    </th>
    <th style="width: 100px;" class="tm-numeric-cell tm-hide-mobile" title="Average GPU power consumption in Watts. Only for GPU jobs.">
    GPU Power
    </th>
    <th style="width: 140px;" class="tm-hide-mobile">Timestamp</th>
    <th style="width: 110px;" class="tm-hide-mobile">Submitted By</th>
    </tr>
    </thead>
    <tbody>
    """

    # Rows are collected and joined once instead of growing one string.
    row_chunks = []
    for rank, (_, row) in enumerate(df_sorted.iterrows(), start=1):
        data_attrs = _row_data_attributes(row)

        # Free text rendered directly by this function: default when missing,
        # then escape.
        model = escape(str(_value_or(row, 'model', 'Unknown')))
        submitted_by = escape(str(_value_or(row, 'submitted_by', 'Unknown')))
        run_id_text = str(_value_or(row, 'run_id', 'N/A'))
        # Show only the first 8 characters; the full id goes in the tooltip.
        run_id_short = escape(run_id_text[:8] + '...' if len(run_id_text) > 8 else run_id_text)
        run_id = escape(run_id_text)

        # Values handed to the metric_displays helpers are passed through
        # unchanged; how those helpers treat missing data is their contract.
        agent_type = row.get('agent_type', 'unknown')
        provider = row.get('provider', 'unknown')
        success_rate = row.get('success_rate', 0.0)
        avg_duration_ms = row.get('avg_duration_ms', 0.0)
        total_cost_usd = row.get('total_cost_usd', 0.0)

        # Numbers formatted inline below: a missing value falls back to the
        # same default used for an absent column, and whole-number floats
        # (pandas upcasts int columns that contain NaN) print as integers.
        total_tests = _as_count(row.get('total_tests', 0))
        successful_tests = _as_count(row.get('successful_tests', 0))
        failed_tests = _as_count(row.get('failed_tests', 0))
        total_tokens = _as_count(row.get('total_tokens', 0))
        avg_steps = _value_or(row, 'avg_steps', 0.0)

        co2_emissions_g = row.get('co2_emissions_g', 0.0)
        gpu_utilization_avg = row.get('gpu_utilization_avg', None)
        gpu_memory_avg_mib = row.get('gpu_memory_avg_mib', None)
        gpu_memory_max_mib = row.get('gpu_memory_max_mib', None)
        gpu_temperature_avg = row.get('gpu_temperature_avg', None)
        gpu_temperature_max = row.get('gpu_temperature_max', None)
        gpu_power_avg_w = row.get('gpu_power_avg_w', None)

        # A run counts as a GPU job only with a positive average utilization.
        has_gpu = bool(pd.notna(gpu_utilization_avg) and gpu_utilization_avg > 0)
        gpu_display = get_gpu_utilization_bar(gpu_utilization_avg) if has_gpu else _NA_HTML

        if pd.notna(co2_emissions_g) and co2_emissions_g > 0:
            co2_display = _metric_span(f'{co2_emissions_g:.2f}g', font_size='0.9em')
        else:
            co2_display = _NA_HTML

        if pd.notna(gpu_memory_avg_mib) and pd.notna(gpu_memory_max_mib):
            gpu_mem_display = _metric_span(f'{gpu_memory_avg_mib:.0f}/{gpu_memory_max_mib:.0f}')
        else:
            gpu_mem_display = _NA_HTML

        if pd.notna(gpu_temperature_avg) and pd.notna(gpu_temperature_max):
            gpu_temp_display = _metric_span(f'{gpu_temperature_avg:.0f}/{gpu_temperature_max:.0f}°C')
        else:
            gpu_temp_display = _NA_HTML

        if pd.notna(gpu_power_avg_w):
            gpu_power_display = _metric_span(f'{gpu_power_avg_w:.1f}W')
        else:
            gpu_power_display = _NA_HTML

        timestamp_display = _format_timestamp(row.get('timestamp', ''))

        row_chunks.append(f"""
    <tr {data_attrs} class="tm-clickable-row">
    <td>{get_rank_badge(rank)}</td>
    <td class="tm-run-id" title="{run_id}">{run_id_short}</td>
    <td class="tm-model-name">{model}</td>
    <td class="tm-badge-cell">{get_agent_type_badge(agent_type)}</td>
    <td class="tm-badge-cell">{get_provider_badge(provider)}</td>
    <td class="tm-badge-cell">{get_hardware_badge(has_gpu)}</td>
    <td>{get_success_rate_bar(success_rate)}</td>
    <td class="tm-numeric-cell">
    <strong>{total_tests}</strong>
    <span style="color: #CBD5E1; margin: 0 4px;">/</span>
    <span style="color: #10B981; font-weight: 600;">{successful_tests}</span>
    <span style="color: #CBD5E1; margin: 0 4px;">/</span>
    <span style="color: #EF4444; font-weight: 600;">{failed_tests}</span>
    </td>
    <td class="tm-numeric-cell">{avg_steps:.1f}</td>
    <td class="tm-numeric-cell">{format_duration(avg_duration_ms)}</td>
    <td class="tm-numeric-cell">{total_tokens:,}</td>
    <td class="tm-numeric-cell">{format_cost(total_cost_usd)}</td>
    <td class="tm-numeric-cell tm-hide-mobile">{co2_display}</td>
    <td class="tm-hide-mobile">{gpu_display}</td>
    <td class="tm-numeric-cell tm-hide-mobile">{gpu_mem_display}</td>
    <td class="tm-numeric-cell tm-hide-mobile">{gpu_temp_display}</td>
    <td class="tm-numeric-cell tm-hide-mobile">{gpu_power_display}</td>
    <td class="tm-hide-mobile tm-text-cell">{timestamp_display}</td>
    <td class="tm-hide-mobile tm-text-cell">
    {submitted_by}
    </td>
    </tr>
    """)

    table_tail = """
    </tbody>
    </table>
    </div>
    """
    return table_head + "".join(row_chunks) + table_tail


# Grey placeholder shown in any metric cell that has no value.
_NA_HTML = '<span style="color: #94A3B8; font-size: 0.85em;">N/A</span>'


def _is_missing(value) -> bool:
    """Return True for None/NaN/NaT; list-like cell values count as present."""
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # pd.isna() returns an array for list-like input, which has no single
        # truth value; such a cell clearly holds data.
        return False


def _value_or(row, key, default):
    """Return row[key], or ``default`` when the column is absent or the value is missing."""
    value = row.get(key, default)
    return default if _is_missing(value) else value


def _as_count(value, default=0):
    """Return ``default`` for a missing value and an int for a whole-number float."""
    if _is_missing(value):
        return default
    if isinstance(value, float) and value.is_integer():
        return int(value)
    return value


def _row_data_attributes(row) -> str:
    """Build the ``data-*`` attribute string that mirrors every column of ``row``."""
    attrs = {}
    for key, value in row.to_dict().items():
        # Convert underscores to hyphens for HTML data attributes.
        attr_name = f"data-{str(key).replace('_', '-')}"
        attrs[attr_name] = "None" if _is_missing(value) else str(value)
    # Escape the values (quotes included) so cell content cannot terminate
    # the attribute or inject markup.
    return " ".join(f'{name}="{escape(text, quote=True)}"' for name, text in attrs.items())


def _format_timestamp(timestamp) -> str:
    """Format a timestamp cell as 'YYYY-MM-DD HH:MM', or 'N/A' when missing."""
    if _is_missing(timestamp):
        return 'N/A'
    try:
        # Handle both Timestamp objects and ISO-8601 strings.
        if isinstance(timestamp, pd.Timestamp):
            return timestamp.strftime('%Y-%m-%d %H:%M')
        parsed = datetime.fromisoformat(str(timestamp).replace('Z', '+00:00'))
        return parsed.strftime('%Y-%m-%d %H:%M')
    except (ValueError, TypeError):
        # Unparseable text: show its first 16 characters, escaped.
        return escape(str(timestamp)[:16]) if timestamp else 'N/A'


def _metric_span(text: str, font_size: str = '0.85em') -> str:
    """Wrap an already-formatted metric value in the monospace cell styling."""
    return (
        f'<span style="font-family: monospace; font-size: {font_size}; '
        f'color: #334155;">{text}</span>'
    )
def generate_empty_state_html() -> str:
    """
    Build the placeholder panel shown when the leaderboard has no runs.

    The markup is static: a dashed card containing an icon, a heading, a
    one-line hint and a "Start New Evaluation" button. Colours are read from
    the ``--tm-*`` CSS custom properties, each with a hard-coded fallback.

    Returns:
        HTML string for the empty state
    """
    empty_state_markup = """
    <div style="
        text-align: center;
        padding: 60px 20px;
        background: var(--tm-bg-card, #1E293B);
        border-radius: 16px;
        border: 2px dashed var(--tm-border-default, rgba(148, 163, 184, 0.2));
        margin: 20px 0;
    ">
    <div style="font-size: 48px; margin-bottom: 16px;">📊</div>
    <h3 style="
        color: var(--tm-text-primary, #F1F5F9);
        margin: 0 0 12px 0;
        font-size: 1.5rem;
    ">
    No Evaluation Results Yet
    </h3>
    <p style="
        color: var(--tm-text-secondary, #94A3B8);
        margin: 0 0 24px 0;
        font-size: 1rem;
    ">
    Run your first evaluation to see results appear here.
    </p>
    <button style="
        padding: 12px 24px;
        background: var(--tm-primary, #4F46E5);
        color: white;
        border: none;
        border-radius: 8px;
        font-weight: 600;
        cursor: pointer;
        font-size: 1rem;
    ">
    Start New Evaluation
    </button>
    </div>
    """
    return empty_state_markup
def generate_filter_summary_html(
    total_runs: int,
    filtered_runs: int,
    active_filters: dict
) -> str:
    """
    Generate the summary banner describing the active leaderboard filters.

    When nothing is filtered out (``filtered_runs == total_runs``) the banner
    reads "Showing all N evaluation runs". Otherwise it shows
    "Showing X of N runs" followed by one chip per active filter. A filter is
    active when its value is truthy and not the string "All".

    Filter names and values are HTML-escaped before being interpolated,
    because they may originate from user-entered text.

    Args:
        total_runs: Total number of runs
        filtered_runs: Number of runs after filtering
        active_filters: Dict of active filter values; ``None`` is treated as
            an empty dict

    Returns:
        HTML string with filter summary
    """
    if filtered_runs == total_runs:
        return f"""
    <div style="
        padding: 12px 16px;
        background: var(--tm-bg-secondary, #334155);
        border-radius: 8px;
        margin-bottom: 16px;
        color: var(--tm-text-secondary, #94A3B8);
        font-size: 0.9em;
    ">
    Showing all <strong style="color: var(--tm-text-primary, #F1F5F9);">{total_runs}</strong> evaluation runs
    </div>
    """

    # One chip per filter that actually narrows the view.
    filter_chips = [
        f"""
    <span style="
        display: inline-flex;
        align-items: center;
        padding: 4px 10px;
        background: var(--tm-primary, #4F46E5);
        color: white;
        border-radius: 6px;
        font-size: 0.85em;
        margin-right: 8px;
        font-weight: 500;
    ">
    {escape(str(key))}: {escape(str(value))}
    </span>
    """
        for key, value in (active_filters or {}).items()
        if value and value != "All"
    ]
    filters_html = "".join(filter_chips)

    return f"""
    <div style="
        padding: 12px 16px;
        background: var(--tm-bg-secondary, #334155);
        border-radius: 8px;
        margin-bottom: 16px;
        color: var(--tm-text-secondary, #94A3B8);
        font-size: 0.9em;
    ">
    <div style="margin-bottom: 8px;">
    Showing <strong style="color: var(--tm-text-primary, #F1F5F9);">{filtered_runs}</strong> of
    <strong style="color: var(--tm-text-primary, #F1F5F9);">{total_runs}</strong> runs
    </div>
    {filters_html}
    </div>
    """