Spaces:
Running
Running
Mandark-droid
committed on
Commit
·
3138502
1
Parent(s):
baaa457
Add dashboard screen with aggregate statistics and recent runs
Browse files
- Implement dashboard screen module with stats cards display
- Add 6 metric cards: Total Runs, Avg Accuracy, Avg Latency, Total Tokens, Total Cost, Total CO2
- Include recent evaluations table showing latest 5 runs
- Integrate dashboard navigation with sidebar buttons
- Set dashboard as default landing screen
- Fix console encoding issues for Windows compatibility
- app.py +80 -10
- screens/dashboard.py +291 -0
app.py
CHANGED
|
@@ -27,6 +27,10 @@ from screens.trace_detail import (
|
|
| 27 |
create_gpu_metrics_dashboard,
|
| 28 |
create_gpu_summary_cards
|
| 29 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
from utils.navigation import Navigator, Screen
|
| 31 |
|
| 32 |
|
|
@@ -388,9 +392,9 @@ data_loader = create_data_loader_from_env()
|
|
| 388 |
navigator = Navigator()
|
| 389 |
|
| 390 |
# Pre-load and cache the leaderboard data before building UI
|
| 391 |
-
print("
|
| 392 |
leaderboard_df_cache = data_loader.load_leaderboard()
|
| 393 |
-
print(f"
|
| 394 |
|
| 395 |
# Global state (already populated)
|
| 396 |
# leaderboard_df_cache is now set
|
|
@@ -895,7 +899,7 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
|
|
| 895 |
Agent Evaluation Platform
|
| 896 |
</p>
|
| 897 |
<p style="color: rgba(255,255,255,0.8); margin: 10px 0 0 0; font-size: 0.9em;">
|
| 898 |
-
Powered by Gradio
|
| 899 |
</p>
|
| 900 |
</div>
|
| 901 |
""")
|
|
@@ -913,9 +917,10 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
|
|
| 913 |
|
| 914 |
# Navigation section
|
| 915 |
gr.Markdown("### π§ Navigation")
|
| 916 |
-
|
| 917 |
# Navigation buttons
|
| 918 |
-
|
|
|
|
| 919 |
compare_nav_btn = gr.Button("βοΈ Compare", variant="secondary", size="lg")
|
| 920 |
docs_nav_btn = gr.Button("π Documentation", variant="secondary", size="lg")
|
| 921 |
|
|
@@ -944,10 +949,13 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
|
|
| 944 |
label="Agent Type",
|
| 945 |
info="Tool: Function calling | Code: Code execution | Both: Hybrid"
|
| 946 |
)
|
| 947 |
-
|
| 948 |
# Main content area
|
|
|
|
|
|
|
|
|
|
| 949 |
# Screen 1: Main Leaderboard
|
| 950 |
-
with gr.Column(visible=
|
| 951 |
gr.Markdown("## π Agent Evaluation Leaderboard")
|
| 952 |
with gr.Tabs():
|
| 953 |
with gr.TabItem("π Leaderboard"):
|
|
@@ -1106,7 +1114,53 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
|
|
| 1106 |
trace_ask_btn = gr.Button("Ask", variant="primary")
|
| 1107 |
trace_answer = gr.Markdown("*Ask a question to get AI-powered insights*")
|
| 1108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1109 |
# Event handlers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1110 |
app.load(
|
| 1111 |
fn=load_leaderboard,
|
| 1112 |
outputs=[leaderboard_by_model, model_filter, sidebar_model_filter]
|
|
@@ -1191,6 +1245,22 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
|
|
| 1191 |
outputs=[mcp_insights]
|
| 1192 |
)
|
| 1193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1194 |
|
| 1195 |
leaderboard_table.select(
|
| 1196 |
fn=on_drilldown_select,
|
|
@@ -1238,9 +1308,9 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
|
|
| 1238 |
|
| 1239 |
|
| 1240 |
if __name__ == "__main__":
|
| 1241 |
-
print("
|
| 1242 |
-
print(f"
|
| 1243 |
-
print(f"
|
| 1244 |
|
| 1245 |
app.launch(
|
| 1246 |
server_name="0.0.0.0",
|
|
|
|
| 27 |
create_gpu_metrics_dashboard,
|
| 28 |
create_gpu_summary_cards
|
| 29 |
)
|
| 30 |
+
from screens.dashboard import (
|
| 31 |
+
create_dashboard_ui,
|
| 32 |
+
update_dashboard_data
|
| 33 |
+
)
|
| 34 |
from utils.navigation import Navigator, Screen
|
| 35 |
|
| 36 |
|
|
|
|
| 392 |
navigator = Navigator()
|
| 393 |
|
| 394 |
# Pre-load and cache the leaderboard data before building UI
|
| 395 |
+
print("Pre-loading leaderboard data from HuggingFace...")
|
| 396 |
leaderboard_df_cache = data_loader.load_leaderboard()
|
| 397 |
+
print(f"Loaded {len(leaderboard_df_cache)} evaluation runs")
|
| 398 |
|
| 399 |
# Global state (already populated)
|
| 400 |
# leaderboard_df_cache is now set
|
|
|
|
| 899 |
Agent Evaluation Platform
|
| 900 |
</p>
|
| 901 |
<p style="color: rgba(255,255,255,0.8); margin: 10px 0 0 0; font-size: 0.9em;">
|
| 902 |
+
Powered by Gradio π | HuggingFace Jobs | TraceVerde | SmolTrace | MCP | Gemini | Modal
|
| 903 |
</p>
|
| 904 |
</div>
|
| 905 |
""")
|
|
|
|
| 917 |
|
| 918 |
# Navigation section
|
| 919 |
gr.Markdown("### π§ Navigation")
|
| 920 |
+
|
| 921 |
# Navigation buttons
|
| 922 |
+
dashboard_nav_btn = gr.Button("π Dashboard", variant="primary", size="lg")
|
| 923 |
+
leaderboard_nav_btn = gr.Button("π Leaderboard", variant="secondary", size="lg")
|
| 924 |
compare_nav_btn = gr.Button("βοΈ Compare", variant="secondary", size="lg")
|
| 925 |
docs_nav_btn = gr.Button("π Documentation", variant="secondary", size="lg")
|
| 926 |
|
|
|
|
| 949 |
label="Agent Type",
|
| 950 |
info="Tool: Function calling | Code: Code execution | Both: Hybrid"
|
| 951 |
)
|
| 952 |
+
|
| 953 |
# Main content area
|
| 954 |
+
# Screen 0: Dashboard
|
| 955 |
+
dashboard_screen, dashboard_components = create_dashboard_ui()
|
| 956 |
+
|
| 957 |
# Screen 1: Main Leaderboard
|
| 958 |
+
with gr.Column(visible=False) as leaderboard_screen:
|
| 959 |
gr.Markdown("## π Agent Evaluation Leaderboard")
|
| 960 |
with gr.Tabs():
|
| 961 |
with gr.TabItem("π Leaderboard"):
|
|
|
|
| 1114 |
trace_ask_btn = gr.Button("Ask", variant="primary")
|
| 1115 |
trace_answer = gr.Markdown("*Ask a question to get AI-powered insights*")
|
| 1116 |
|
| 1117 |
+
# Navigation handlers (define before use)
|
| 1118 |
+
def navigate_to_dashboard():
    """Show the dashboard screen, refreshing its stats from the leaderboard.

    Returns a dict of Gradio component updates: screen visibility, nav-button
    variants, and (when loading succeeds) the dashboard card/table contents.
    """
    try:
        dashboard_updates = update_dashboard_data(
            data_loader.load_leaderboard(), dashboard_components
        )
    except Exception as e:
        # Best-effort refresh: still navigate even if the data load fails.
        print(f"[ERROR] Loading dashboard data: {e}")
        dashboard_updates = {}

    # Screen visibility and nav-button highlighting for this destination.
    visibility = {
        dashboard_screen: True,
        leaderboard_screen: False,
        run_detail_screen: False,
        trace_detail_screen: False,
    }
    variants = {
        dashboard_nav_btn: "primary",
        leaderboard_nav_btn: "secondary",
        compare_nav_btn: "secondary",
        docs_nav_btn: "secondary",
    }

    result = {screen: gr.update(visible=shown) for screen, shown in visibility.items()}
    result.update({btn: gr.update(variant=v) for btn, v in variants.items()})
    # Merge in the refreshed card/table values (empty on load failure, in
    # which case those components keep their current values).
    result.update(dashboard_updates)
    return result
|
| 1140 |
+
|
| 1141 |
+
def navigate_to_leaderboard():
    """Show the leaderboard screen and highlight its nav button."""
    visibility = {
        dashboard_screen: False,
        leaderboard_screen: True,
        run_detail_screen: False,
        trace_detail_screen: False,
    }
    variants = {
        dashboard_nav_btn: "secondary",
        leaderboard_nav_btn: "primary",
        compare_nav_btn: "secondary",
        docs_nav_btn: "secondary",
    }
    updates = {screen: gr.update(visible=shown) for screen, shown in visibility.items()}
    updates.update({btn: gr.update(variant=v) for btn, v in variants.items()})
    return updates
|
| 1153 |
+
|
| 1154 |
# Event handlers
|
| 1155 |
+
# Load dashboard on app start
|
| 1156 |
+
app.load(
|
| 1157 |
+
fn=navigate_to_dashboard,
|
| 1158 |
+
outputs=[
|
| 1159 |
+
dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen,
|
| 1160 |
+
dashboard_nav_btn, leaderboard_nav_btn, compare_nav_btn, docs_nav_btn
|
| 1161 |
+
] + list(dashboard_components.values())
|
| 1162 |
+
)
|
| 1163 |
+
|
| 1164 |
app.load(
|
| 1165 |
fn=load_leaderboard,
|
| 1166 |
outputs=[leaderboard_by_model, model_filter, sidebar_model_filter]
|
|
|
|
| 1245 |
outputs=[mcp_insights]
|
| 1246 |
)
|
| 1247 |
|
| 1248 |
+
# Wire up navigation buttons
|
| 1249 |
+
dashboard_nav_btn.click(
|
| 1250 |
+
fn=navigate_to_dashboard,
|
| 1251 |
+
outputs=[
|
| 1252 |
+
dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen,
|
| 1253 |
+
dashboard_nav_btn, leaderboard_nav_btn, compare_nav_btn, docs_nav_btn
|
| 1254 |
+
] + list(dashboard_components.values())
|
| 1255 |
+
)
|
| 1256 |
+
|
| 1257 |
+
leaderboard_nav_btn.click(
|
| 1258 |
+
fn=navigate_to_leaderboard,
|
| 1259 |
+
outputs=[
|
| 1260 |
+
dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen,
|
| 1261 |
+
dashboard_nav_btn, leaderboard_nav_btn, compare_nav_btn, docs_nav_btn
|
| 1262 |
+
]
|
| 1263 |
+
)
|
| 1264 |
|
| 1265 |
leaderboard_table.select(
|
| 1266 |
fn=on_drilldown_select,
|
|
|
|
| 1308 |
|
| 1309 |
|
| 1310 |
if __name__ == "__main__":
|
| 1311 |
+
print("Starting TraceMind-AI...")
|
| 1312 |
+
print(f"Data Source: {os.getenv('DATA_SOURCE', 'both')}")
|
| 1313 |
+
print(f"JSON Path: {os.getenv('JSON_DATA_PATH', './sample_data')}")
|
| 1314 |
|
| 1315 |
app.launch(
|
| 1316 |
server_name="0.0.0.0",
|
screens/dashboard.py
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Dashboard Screen for TraceMind-AI
|
| 3 |
+
Displays aggregate statistics and recent evaluation runs
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
import pandas as pd
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def calculate_aggregate_stats(leaderboard_df):
    """Compute aggregate dashboard metrics from the leaderboard dataframe.

    Args:
        leaderboard_df: Leaderboard dataframe; may be empty or missing columns.

    Returns:
        Dict with keys total_runs, avg_accuracy, avg_latency, total_tokens,
        total_cost, avg_cost, total_co2. Missing columns (or an empty
        dataframe) yield zero values.
    """
    if leaderboard_df.empty:
        return {
            'total_runs': 0,
            'avg_accuracy': 0.0,
            'avg_latency': 0.0,
            'total_tokens': 0,
            'total_cost': 0.0,
            'avg_cost': 0.0,
            'total_co2': 0.0,
        }

    def col_mean(name):
        # Column mean, or 0.0 when the column is absent.
        return leaderboard_df[name].mean() if name in leaderboard_df.columns else 0.0

    def col_sum(name):
        # Column sum, or 0.0 when the column is absent.
        return leaderboard_df[name].sum() if name in leaderboard_df.columns else 0.0

    return {
        'total_runs': len(leaderboard_df),
        'avg_accuracy': col_mean('success_rate'),
        # Stored as milliseconds; reported in seconds.
        'avg_latency': col_mean('avg_duration_ms') / 1000,
        'total_tokens': int(col_sum('total_tokens')),
        'total_cost': col_sum('total_cost_usd'),
        'avg_cost': col_mean('total_cost_usd'),
        'total_co2': col_sum('co2_emissions_g'),
    }
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def generate_stats_card(title, value, emoji, gradient_colors, description):
    """Render one dashboard metric card as an HTML snippet.

    Args:
        title: Metric label shown next to the emoji.
        value: Pre-formatted metric value (string or number).
        emoji: Emoji icon prefixed to the title.
        gradient_colors: (start, end) CSS colors for the background gradient.
        description: Short caption shown in the footer badge.

    Returns:
        An HTML string suitable for a gr.HTML component.
    """
    start_color, end_color = gradient_colors
    return f"""
    <div style="background: linear-gradient(135deg, {start_color} 0%, {end_color} 100%);
                padding: 25px;
                border-radius: 12px;
                box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3);
                color: white;
                min-height: 150px;">
        <div style="display: flex; align-items: center; justify-content: space-between;">
            <div>
                <div style="font-size: 3em; font-weight: bold; margin: 10px 0;">{value}</div>
                <div style="font-size: 1.1em; opacity: 0.9;">{emoji} {title}</div>
            </div>
        </div>
        <div style="margin-top: 15px; font-size: 0.9em; opacity: 0.8;">
            <span style="background: rgba(255,255,255,0.2); padding: 4px 8px; border-radius: 4px;">
                {description}
            </span>
        </div>
    </div>
    """
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def create_dashboard_cards(stats):
    """Build the HTML for every dashboard stat card.

    Replaces six near-identical call blocks with a single data-driven spec
    table, and restores the emoji that were mojibake-corrupted in the
    previous revision.

    Args:
        stats: Aggregate statistics dict as produced by
            calculate_aggregate_stats().

    Returns:
        Dict mapping card key (e.g. 'total_runs') to its HTML string.
    """
    # One spec per card: (title, formatted value, emoji, gradient, caption).
    specs = {
        'total_runs': ("Total Runs", stats['total_runs'], "📊",
                       ("#667eea", "#764ba2"), "All evaluations"),
        'avg_accuracy': ("Avg Accuracy", f"{stats['avg_accuracy']:.1f}%", "🎯",
                         ("#f093fb", "#f5576c"), "Success rate"),
        'avg_latency': ("Avg Latency", f"{stats['avg_latency']:.2f}s", "⚡",
                        ("#4facfe", "#00f2fe"), "Response time"),
        'total_tokens': ("Total Tokens", f"{stats['total_tokens']:,}", "💬",
                         ("#43e97b", "#38f9d7"), "Across all runs"),
        'total_cost': ("Total Cost", f"${stats['total_cost']:.4f}", "💰",
                       ("#fa709a", "#fee140"), "All evaluations"),
        'total_co2': ("Total CO2", f"{stats['total_co2']:.2f}g", "🌱",
                      ("#30cfd0", "#330867"), "Carbon emissions"),
    }
    return {
        key: generate_stats_card(
            title=title,
            value=value,
            emoji=emoji,
            gradient_colors=gradient,
            description=caption,
        )
        for key, (title, value, emoji, gradient, caption) in specs.items()
    }
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def prepare_recent_runs_data(leaderboard_df, n=5):
    """Prepare rows for the "Recent Evaluations" table.

    Fix: the previous version assigned the parsed timestamps back into the
    caller's dataframe (`leaderboard_df['timestamp'] = ...`), mutating shared
    state as a side effect. Sorting is now done on a copy.

    Args:
        leaderboard_df: Leaderboard dataframe (not modified).
        n: Number of most recent runs to include.

    Returns:
        List of [model, success rate, cost, duration, timestamp] rows for a
        Gradio DataFrame; empty list when the dataframe is empty.
    """
    rows = []
    if leaderboard_df.empty:
        return rows

    if 'timestamp' in leaderboard_df.columns:
        # Parse timestamps on a copy so mixed/string values sort correctly
        # without mutating the caller's dataframe; unparseable values become
        # NaT and sink to the bottom of the descending sort.
        recent_df = leaderboard_df.copy()
        recent_df['timestamp'] = pd.to_datetime(recent_df['timestamp'], errors='coerce')
        recent_df = recent_df.sort_values('timestamp', ascending=False).head(n)
    else:
        recent_df = leaderboard_df.head(n)

    for _, row in recent_df.iterrows():
        duration_ms = row.get('avg_duration_ms', 0)
        # Show seconds once the duration crosses 1s, otherwise milliseconds.
        if duration_ms >= 1000:
            duration_str = f"{duration_ms / 1000:.2f}s"
        else:
            duration_str = f"{duration_ms:.0f}ms"

        rows.append([
            row.get('model', 'N/A'),
            f"{row.get('success_rate', 0):.1f}%",
            f"${row.get('total_cost_usd', 0):.4f}",
            duration_str,
            row.get('timestamp', 'N/A'),
        ])

    return rows
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def create_dashboard_ui():
    """Create the dashboard screen UI components.

    Collapses six copy-pasted card blocks into a spec-driven loop and
    restores the mojibake-corrupted emoji from the previous revision.

    Returns:
        Tuple of (screen_column, component_dict) where component_dict maps
        card/table names ('total_runs_card', ..., 'recent_runs_table') to the
        Gradio components that update_dashboard_data() targets.
    """
    components = {}

    # Placeholder specs rendered before any data is loaded; real values are
    # filled in by update_dashboard_data(). Layout: 3 rows x 2 cards.
    # Spec per card: (title, placeholder value, emoji, gradient, caption).
    card_rows = [
        [('total_runs_card', ("Total Runs", "0", "📊", ("#667eea", "#764ba2"), "All evaluations")),
         ('avg_accuracy_card', ("Avg Accuracy", "0%", "🎯", ("#f093fb", "#f5576c"), "Success rate"))],
        [('avg_latency_card', ("Avg Latency", "0.0s", "⚡", ("#4facfe", "#00f2fe"), "Response time")),
         ('total_tokens_card', ("Total Tokens", "0", "💬", ("#43e97b", "#38f9d7"), "Across all runs"))],
        [('total_cost_card', ("Total Cost", "$0.00", "💰", ("#fa709a", "#fee140"), "All evaluations")),
         ('total_co2_card', ("Total CO2", "0g", "🌱", ("#30cfd0", "#330867"), "Carbon emissions"))],
    ]

    with gr.Column(visible=True) as dashboard_screen:
        gr.Markdown("## 📊 Dashboard")
        gr.Markdown("*Overview of agent evaluation metrics*")

        # Stats cards in a draggable grid layout.
        for row_specs in card_rows:
            with gr.Row():
                for key, spec in row_specs:
                    # NOTE(review): gr.Draggable requires a recent Gradio
                    # release -- confirm the pinned Gradio version supports it.
                    with gr.Draggable():
                        components[key] = gr.HTML(generate_stats_card(*spec))

        gr.Markdown("---")

        # Recent runs preview table, populated by update_dashboard_data().
        gr.Markdown("### 📋 Recent Evaluations")
        components['recent_runs_table'] = gr.Dataframe(
            headers=["Model", "Success Rate", "Cost", "Duration", "Timestamp"],
            interactive=False,
            wrap=True,
            row_count=5,
            label="Latest 5 runs",
        )

    return dashboard_screen, components
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def update_dashboard_data(leaderboard_df, components):
    """Refresh the dashboard stat cards and recent-runs table.

    Args:
        leaderboard_df: Leaderboard dataframe to aggregate.
        components: Dict of Gradio components from create_dashboard_ui().

    Returns:
        Dict mapping each dashboard component to its gr.update() value.
    """
    stats = calculate_aggregate_stats(leaderboard_df)
    cards = create_dashboard_cards(stats)

    # Each card key ('total_runs', ...) maps to a '<key>_card' component.
    updates = {
        components[f"{key}_card"]: gr.update(value=html)
        for key, html in cards.items()
    }
    updates[components['recent_runs_table']] = gr.update(
        value=prepare_recent_runs_data(leaderboard_df)
    )
    return updates
|