diff --git "a/tabs/leaderboard_v2.py" "b/tabs/leaderboard_v2.py" new file mode 100644--- /dev/null +++ "b/tabs/leaderboard_v2.py" @@ -0,0 +1,3154 @@ +import gradio as gr +import pandas as pd +import numpy as np +import plotly.graph_objects as go +import plotly.express as px + +# Utility functions (moved from utils.py) +def get_chart_colors(): + return { + "Private": "#E35454", # Vanguard Red + "Open source": "#1098F7", # Airglow Blue + "performance_bands": ["#DCFCE7", "#FEF9C3", "#FEE2E2"], + "text": "#F5F6F7", + "background": "#01091A", + "grid": (0, 0, 0, 0.1), # RGBA tuple for grid + } + + +def get_rank_badge(rank): + """Generate HTML for rank badge with appropriate styling""" + badge_styles = { + 1: ("1st", "linear-gradient(145deg, #ffd700, #ffc400)", "#000"), + 2: ("2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff"), + 3: ("3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff"), + } + + if rank in badge_styles: + label, gradient, text_color = badge_styles[rank] + return f""" +
+ {label} +
+ """ + return f""" +
+ {rank} +
+ """ + + +def get_type_badge(model_type): + """Generate HTML for model type badge""" + colors = get_chart_colors() + colors = {"Private": colors["Private"], "Open source": colors["Open source"]} + bg_color = colors.get(model_type, "#4F46E5") + return f""" +
+ {model_type} +
+ """ + + +def get_score_bar(score): + """Generate HTML for score bar with gradient styling""" + width = score * 100 + return f""" +
+
+
+
+ {score:.3f} +
+ """ + + +# Define column mapping once for reuse across all functions +SORT_COLUMN_MAP = { + "Avg Action Completion": "Avg AC", + "Avg Tool Selection Quality": "Avg TSQ", + "Avg Session Cost": "Avg Total Cost", +} + + +def create_leaderboard_v2_tab(): + """Create the main leaderboard v2 tab with interactive table""" + + def load_leaderboard_data(): + """Load and prepare the leaderboard data""" + df = pd.read_csv('results_v2.csv') + + # Clean and prepare data + df = df.copy() + + # Round numeric columns for better display + numeric_cols = ['Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Avg Session Duration', 'Avg Turns'] + for col in numeric_cols: + if col in df.columns: + df[col] = pd.to_numeric(df[col], errors='coerce').round(3) + + # Fill NaN values appropriately + df = df.fillna('') + + return df + + def generate_html_table(filtered_df, domain_filter): + """Generate styled HTML table with rank badges and score bars""" + table_html = """ + + +
+ + + + + + + + + + + + + + + + """ + + # Generate table rows + for idx, (_, row) in enumerate(filtered_df.iterrows()): + rank = idx + 1 + table_html += f""" + + + + + + """ + + # Get appropriate values based on domain filter + if domain_filter != "All": + # For specific domain, show domain-specific scores + ac_col = f'{domain_filter} AC' + tsq_col = f'{domain_filter} TSQ' + + # AC score + if ac_col in row and row[ac_col] != '': + ac_value = float(row[ac_col]) + table_html += f'' + else: + table_html += '' + + # TSQ score + if tsq_col in row and row[tsq_col] != '': + tsq_value = float(row[tsq_col]) + table_html += f'' + else: + table_html += '' + else: + # For "All", show overall averages + table_html += f""" + + + """ + + # Add appropriate cost, duration, and turns based on domain filter + if domain_filter != "All": + # Use domain-specific values + cost_col = f'{domain_filter} Cost' + duration_col = f'{domain_filter} Duration' + turns_col = f'{domain_filter} Turns' + + cost = row.get(cost_col, '') + duration = row.get(duration_col, '') + turns = row.get(turns_col, '') + + # Convert to float if not empty + if cost != '': + cost = float(cost) + if duration != '': + duration = float(duration) + if turns != '': + turns = float(turns) + else: + # Use overall averages for "All" domain + cost = row.get('Avg Total Cost', row.get('Cost ($)', '')) + duration = row.get('Avg Session Duration', row.get('Duration (s)', '')) + turns = row.get('Avg Turns', row.get('Turns', '')) + + # Format the values for display + if cost != '': + cost_display = f'{cost:.3f}' + else: + cost_display = '-' + + if duration != '': + duration_display = f'{duration:.1f}' + else: + duration_display = '-' + + if turns != '': + turns_display = f'{turns:.1f}' + else: + turns_display = '-' + + table_html += f""" + + + + + """ + + table_html += """ + +
RankModelTypeVendorAvg Action CompletionAvg Tool Selection QualityAvg Cost ($)Avg Duration (s)Avg Turns
{get_rank_badge(rank)}{row['Model']}{get_type_badge(row['Model Type'])}{row['Vendor']}{get_score_bar(ac_value)}-{get_score_bar(tsq_value)}-{get_score_bar(row['Avg AC'])}{get_score_bar(row['Avg TSQ'])}${cost_display}{duration_display}{turns_display}
+
+ """ + + return table_html + + def update_leaderboard_title(domain_filter): + """Update the leaderboard title based on selected domain""" + # Strip emoji prefix from domain filter + domain_filter_clean = domain_filter + if domain_filter.startswith('🌐'): + domain_filter_clean = "All" + elif domain_filter.startswith('🏦'): + domain_filter_clean = "Banking" + elif domain_filter.startswith('πŸ₯'): + domain_filter_clean = "Healthcare" + elif domain_filter.startswith('πŸ›‘οΈ'): + domain_filter_clean = "Insurance" + elif domain_filter.startswith('πŸ’°'): + domain_filter_clean = "Investment" + elif domain_filter.startswith('πŸ“±'): + domain_filter_clean = "Telecom" + + return f""" +
+
+ πŸ“ˆ +

+ Agent Leaderboard for {domain_filter_clean} +

+
+
+ """ + + def filter_and_sort_data(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order): + """Filter and sort the leaderboard data""" + df = load_leaderboard_data() + + # Apply filters + filtered_df = df.copy() + + # Strip emoji prefix from domain filter + domain_filter_clean = domain_filter + if domain_filter.startswith('🌐'): + domain_filter_clean = "All" + elif domain_filter.startswith('🏦'): + domain_filter_clean = "Banking" + elif domain_filter.startswith('πŸ₯'): + domain_filter_clean = "Healthcare" + elif domain_filter.startswith('πŸ›‘οΈ'): + domain_filter_clean = "Insurance" + elif domain_filter.startswith('πŸ’°'): + domain_filter_clean = "Investment" + elif domain_filter.startswith('πŸ“±'): + domain_filter_clean = "Telecom" + + # Domain filtering (check if model performs well in specific domain) + if domain_filter_clean != "All": + domain_col_map = { + "Banking": "Banking AC", + "Healthcare": "Healthcare AC", + "Insurance": "Insurance AC", + "Investment": "Investment AC", + "Telecom": "Telecom AC" + } + if domain_filter_clean in domain_col_map: + domain_col = domain_col_map[domain_filter_clean] + # Only show models that have data for this domain + filtered_df = filtered_df[filtered_df[domain_col] != ''] + + # Model type filtering + if model_type_filter != "All": + if model_type_filter == "Open Source": + filtered_df = filtered_df[filtered_df['Model Type'] == 'Open source'] + elif model_type_filter == "Proprietary": + filtered_df = filtered_df[filtered_df['Model Type'] == 'Proprietary'] + + # Reasoning filtering + if reasoning_filter != "All": + if reasoning_filter == "Reasoning": + filtered_df = filtered_df[filtered_df['Output Type'] == 'Reasoning'] + elif reasoning_filter == "Non-Reasoning": + filtered_df = filtered_df[filtered_df['Output Type'] == 'Normal'] + + # Map display name to actual column name using shared mapping + actual_sort_column = SORT_COLUMN_MAP.get(sort_by, sort_by) + + # If domain is selected and sorting by AC or TSQ, 
use domain-specific column + if domain_filter_clean != "All": + if actual_sort_column == "Avg AC": + actual_sort_column = f"{domain_filter_clean} AC" + elif actual_sort_column == "Avg TSQ": + actual_sort_column = f"{domain_filter_clean} TSQ" + elif actual_sort_column == "Avg Total Cost": + actual_sort_column = f"{domain_filter_clean} Cost" + elif actual_sort_column == "Avg Session Duration": + actual_sort_column = f"{domain_filter_clean} Duration" + elif actual_sort_column == "Avg Turns": + actual_sort_column = f"{domain_filter_clean} Turns" + + if actual_sort_column and actual_sort_column in filtered_df.columns: + ascending = (sort_order == "Ascending") + filtered_df = filtered_df.sort_values(by=actual_sort_column, ascending=ascending, na_position='last') + + # Generate HTML table + return generate_html_table(filtered_df, domain_filter_clean) + + # Load initial data + initial_table = filter_and_sort_data("🌐 All", "All", "All", "Avg AC", "Descending") + initial_df = load_leaderboard_data() # Load raw data for model selector + + # Custom CSS for Galileo dark theme + custom_css = """ + + + + """ + + gr.HTML(custom_css) + + gr.HTML(""" +
+

+ πŸš€ Galileo Agent Leaderboard V2 +

+

+ Comprehensive performance metrics for LLM agents across business domains +

+
+ """) + + # Metrics overview cards with insights + gr.HTML(""" +
+ + + + +
+
+
+
+ 🎯 +
+
+
Task Completion
+
+ Compare models based on their ability to complete real-world business tasks accurately and efficiently +
+
+ +
+
+
+ πŸ’‘ +
+
+
Tool Selection
+
+ Analyze how precisely models choose the right tools for each task and make optimal decisions +
+
+ +
+
+
+ πŸ’° +
+
+
Cost Efficiency
+
+ Find models that deliver the best performance per dollar spent and optimize your ROI +
+
+ +
+
+
+ πŸ›οΈ +
+
+
Domain Coverage
+
+ Banking, Healthcare, Insurance, Investment, and Telecom industries analyzed for specialized performance +
+
+ +
+
+
+ πŸš€ +
+
+
Speed vs Accuracy
+
+ Understand the trade-offs between response time and accuracy to find the right balance +
+
+
+ + +
+
+
Model Capabilities
+
+
+ πŸ”“ + Open Source +
+
+ πŸ”’ + Proprietary +
+
+ 🧠 + Reasoning +
+
+
+ +
+
Interactive Visualizations
+
+
+ πŸ•ΈοΈ + Radar Charts +
+
+ πŸ“Š + Heatmaps +
+
+ πŸ“ˆ + Scatter Plots +
+
+
+ +
+
Real-World Performance
+
+
+ πŸ’Ό + Business Tasks +
+
+ πŸ”„ + Multi-Turn +
+
+ πŸ“‹ + Benchmarks +
+
+
+
+
+ """) + + # Domain filter section with enhanced styling + gr.HTML(""" + + +
+
+

πŸ›οΈ Select Business Domain

+

Choose a domain to see specialized agent performance

+
+ """) + + # Creating a custom radio with better visual design + domain_choices = [ + ("All", "🌐", "All Domains"), + ("Banking", "🏦", "Banking"), + ("Healthcare", "πŸ₯", "Healthcare"), + ("Insurance", "πŸ›‘οΈ", "Insurance"), + ("Investment", "πŸ’°", "Investment"), + ("Telecom", "πŸ“±", "Telecom") + ] + + with gr.Row(): + domain_filter = gr.Radio( + choices=["🌐 All", "🏦 Banking", "πŸ₯ Healthcare", "πŸ›‘οΈ Insurance", "πŸ’° Investment", "πŸ“± Telecom"], + value="🌐 All", + label="", + interactive=True, + elem_classes=["domain-radio"] + ) + + gr.HTML(""" +
+

+ πŸ’‘ Tip: Domain selection filters the leaderboard to show only models tested in that specific business context +

+
+
+ """) + + # Filter controls with enhanced styling + gr.HTML(""" +
+
+ πŸ” +

+ Filters & Sorting +

+
+ """) + + # First row: Model filters and sort order + with gr.Row(elem_classes=["compact-filter-row"]): + with gr.Column(scale=1): + model_type_filter = gr.Radio( + choices=["All", "Open Source", "Proprietary"], + value="All", + label="πŸ”“ Model Access", + elem_classes=["compact-radio"] + ) + + with gr.Column(scale=1): + reasoning_filter = gr.Radio( + choices=["All", "Reasoning", "Non-Reasoning"], + value="All", + label="🧠 Model Type", + elem_classes=["compact-radio"] + ) + + with gr.Column(scale=1): + sort_order = gr.Radio( + choices=["Descending", "Ascending"], + value="Descending", + label="πŸ”„ Sort Order", + elem_classes=["compact-radio"] + ) + + # Second row: Sort by options + gr.HTML("""
+

πŸ“Š Sort By

+
""") + + gr.HTML('
') + sort_by = gr.Radio( + choices=["Avg Action Completion", "Avg Tool Selection Quality", "Avg Session Cost", "Avg Session Duration", "Avg Turns"], + value="Avg Action Completion", + label="", + elem_classes=["domain-radio"] + ) + gr.HTML('
') + + gr.HTML("
") + + # Main leaderboard table with dynamic title + leaderboard_title = gr.HTML(""" +
+
+ πŸ“ˆ +

+ Agent Leaderboard for All +

+
+
+ """) + + leaderboard_table = gr.HTML(initial_table) + + gr.HTML(""" +
+
+

+ πŸ’‘ Tips: Hover over model names for highlighting β€’ Score bars show performance metrics visually +

+
+
""") + + # Radar Chart Section + gr.HTML(""" +
+
+ πŸ•ΈοΈ +

+ Domain Performance Analysis +

+
+

Compare model performance across different business domains

+ """) + + with gr.Row(): + with gr.Column(scale=1): + model_selector = gr.Dropdown( + choices=initial_df['Model'].tolist()[:10], + value=initial_df['Model'].tolist()[:5], + multiselect=True, + label="🎯 Select Models for Comparison", + info="Choose up to 5 models to visualize", + elem_classes=["dropdown"] + ) + + # Radar chart plot - wrapped in centered container + gr.HTML('
') + radar_chart = gr.Plot( + label="", + value=create_domain_radar_chart( + load_leaderboard_data(), + "Avg AC", + initial_df['Model'].tolist()[:5] + ), + elem_classes=["radar-chart", "plot-container"] + ) + gr.HTML('
') + + gr.HTML("
") + + # Update functions + def update_table(*args): + title_html = update_leaderboard_title(args[0]) # domain_filter is first arg + table_html = filter_and_sort_data(*args) + return title_html, table_html + + def update_radar_chart(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order, selected_models): + # Get filtered dataframe + df = load_leaderboard_data() + filtered_df = df.copy() + + # Strip emoji prefix from domain filter + domain_filter_clean = domain_filter + if domain_filter.startswith('🌐'): + domain_filter_clean = "All" + elif domain_filter.startswith('🏦'): + domain_filter_clean = "Banking" + elif domain_filter.startswith('πŸ₯'): + domain_filter_clean = "Healthcare" + elif domain_filter.startswith('πŸ›‘οΈ'): + domain_filter_clean = "Insurance" + elif domain_filter.startswith('πŸ’°'): + domain_filter_clean = "Investment" + elif domain_filter.startswith('πŸ“±'): + domain_filter_clean = "Telecom" + + # Apply filters (same logic as filter_and_sort_data) + if domain_filter_clean != "All": + domain_col_map = { + "Banking": "Banking AC", + "Healthcare": "Healthcare AC", + "Insurance": "Insurance AC", + "Investment": "Investment AC", + "Telecom": "Telecom AC" + } + if domain_filter_clean in domain_col_map: + domain_col = domain_col_map[domain_filter_clean] + # Only show models that have data for this domain + filtered_df = filtered_df[filtered_df[domain_col] != ''] + + if model_type_filter != "All": + if model_type_filter == "Open Source": + filtered_df = filtered_df[filtered_df['Model Type'] == 'Open source'] + elif model_type_filter == "Proprietary": + filtered_df = filtered_df[filtered_df['Model Type'] == 'Proprietary'] + + if reasoning_filter != "All": + if reasoning_filter == "Reasoning": + filtered_df = filtered_df[filtered_df['Output Type'] == 'Reasoning'] + elif reasoning_filter == "Non-Reasoning": + filtered_df = filtered_df[filtered_df['Output Type'] == 'Normal'] + + # Map display name to actual column name using shared mapping + 
actual_sort_column = SORT_COLUMN_MAP.get(sort_by, sort_by) + + # If domain is selected and sorting by AC or TSQ, use domain-specific column + if domain_filter_clean != "All": + if actual_sort_column == "Avg AC": + actual_sort_column = f"{domain_filter_clean} AC" + elif actual_sort_column == "Avg TSQ": + actual_sort_column = f"{domain_filter_clean} TSQ" + elif actual_sort_column == "Avg Total Cost": + actual_sort_column = f"{domain_filter_clean} Cost" + elif actual_sort_column == "Avg Session Duration": + actual_sort_column = f"{domain_filter_clean} Duration" + elif actual_sort_column == "Avg Turns": + actual_sort_column = f"{domain_filter_clean} Turns" + + if actual_sort_column and actual_sort_column in filtered_df.columns: + ascending = (sort_order == "Ascending") + filtered_df = filtered_df.sort_values(by=actual_sort_column, ascending=ascending, na_position='last') + + # Update model selector choices based on filtered data + available_models = filtered_df['Model'].tolist()[:15] # Top 15 from filtered results + + # If selected models are not in available models, reset to top 5 + if selected_models: + valid_selected = [m for m in selected_models if m in available_models] + if not valid_selected: + valid_selected = available_models[:5] + else: + valid_selected = available_models[:5] + + # Create radar chart + chart = create_domain_radar_chart(load_leaderboard_data(), sort_by, valid_selected) + + return gr.Dropdown(choices=available_models, value=valid_selected), chart + + def update_radar_only(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order, selected_models): + # Get filtered dataframe + df = load_leaderboard_data() + filtered_df = df.copy() + + # Strip emoji prefix from domain filter + domain_filter_clean = domain_filter + if domain_filter.startswith('🌐'): + domain_filter_clean = "All" + elif domain_filter.startswith('🏦'): + domain_filter_clean = "Banking" + elif domain_filter.startswith('πŸ₯'): + domain_filter_clean = "Healthcare" + elif 
domain_filter.startswith('πŸ›‘οΈ'): + domain_filter_clean = "Insurance" + elif domain_filter.startswith('πŸ’°'): + domain_filter_clean = "Investment" + elif domain_filter.startswith('πŸ“±'): + domain_filter_clean = "Telecom" + + # Apply filters (same logic as filter_and_sort_data) + if domain_filter_clean != "All": + domain_col_map = { + "Banking": "Banking AC", + "Healthcare": "Healthcare AC", + "Insurance": "Insurance AC", + "Investment": "Investment AC", + "Telecom": "Telecom AC" + } + if domain_filter_clean in domain_col_map: + domain_col = domain_col_map[domain_filter_clean] + filtered_df = filtered_df[filtered_df[domain_col] != ''] + + if model_type_filter != "All": + if model_type_filter == "Open Source": + filtered_df = filtered_df[filtered_df['Model Type'] == 'Open source'] + elif model_type_filter == "Proprietary": + filtered_df = filtered_df[filtered_df['Model Type'] == 'Proprietary'] + + if reasoning_filter != "All": + if reasoning_filter == "Reasoning": + filtered_df = filtered_df[filtered_df['Output Type'] == 'Reasoning'] + elif reasoning_filter == "Non-Reasoning": + filtered_df = filtered_df[filtered_df['Output Type'] == 'Normal'] + + # Map display name to actual column name using shared mapping + actual_sort_column = SORT_COLUMN_MAP.get(sort_by, sort_by) + + if actual_sort_column and actual_sort_column in filtered_df.columns: + ascending = (sort_order == "Ascending") + filtered_df = filtered_df.sort_values(by=actual_sort_column, ascending=ascending, na_position='last') + + if selected_models: + valid_selected = [m for m in selected_models if m in filtered_df['Model'].tolist()] + if not valid_selected: + valid_selected = filtered_df['Model'].tolist()[:5] + else: + valid_selected = filtered_df['Model'].tolist()[:5] + + return create_domain_radar_chart(load_leaderboard_data(), sort_by, valid_selected) + + # Update table when filters change + filter_inputs = [domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order] + + for 
input_component in filter_inputs: + input_component.change( + fn=update_table, + inputs=filter_inputs, + outputs=[leaderboard_title, leaderboard_table] + ) + + # Also update radar chart when filters change + input_component.change( + fn=update_radar_chart, + inputs=filter_inputs + [model_selector], + outputs=[model_selector, radar_chart] + ) + + # Update radar chart when model selection changes + model_selector.change( + fn=update_radar_only, + inputs=filter_inputs + [model_selector], + outputs=[radar_chart] + ) + + # Performance insights section + gr.HTML(""" +
+
+ πŸ“Š +

+ Key Insights +

+
+ +
+
+

πŸ† Top Performers

+
    +
  • Highest AC scores indicate best action completion
  • +
  • Superior TSQ shows optimal tool selection
  • +
  • Balance cost-effectiveness with performance
  • +
+
+ +
+

πŸ” Filter Features

+
    +
  • Domain-specific performance analysis
  • +
  • Compare open source vs private models
  • +
  • Reasoning vs standard model comparison
  • +
+
+ +
+

πŸ“ˆ Visualization

+
    +
  • Interactive radar charts for domain breakdown
  • +
  • Compare up to 5 models simultaneously
  • +
  • Hover for detailed performance metrics
  • +
+
+
+
+ """) + + # NEW VISUALIZATIONS START HERE + + # 1. Cost-Performance Efficiency Scatter Plot + gr.HTML(""" +
+
+ πŸ’‘ +

+ Cost-Performance Efficiency Analysis +

+
+

+ Identify models that deliver the best performance per dollar spent +

+ """) + + with gr.Row(): + with gr.Column(scale=1): + efficiency_metric = gr.Dropdown( + choices=["Avg Action Completion", "Avg Tool Selection Quality"], + value="Avg Action Completion", + label="πŸ“Š Performance Metric", + info="Select which performance metric to analyze against cost", + elem_classes=["dropdown"] + ) + + gr.HTML('
') + cost_performance_plot = gr.Plot( + label="", + value=create_cost_performance_scatter(load_leaderboard_data(), "Avg AC"), + elem_classes=["efficiency-chart", "plot-container"] + ) + gr.HTML('
') + + gr.HTML("
") + + # 2. Speed vs Accuracy Trade-off Chart + gr.HTML(""" +
+
+ ⚑ +

+ Speed vs Accuracy Trade-off +

+
+

+ Find the sweet spot between response time and accuracy +

+ """) + + gr.HTML('
') + speed_accuracy_plot = gr.Plot( + label="", + value=create_speed_accuracy_plot(load_leaderboard_data(), "Avg AC"), + elem_classes=["speed-accuracy-chart", "plot-container"] + ) + gr.HTML('
') + + gr.HTML("
") + + # 3. Performance Heatmap + gr.HTML(""" +
+
+ πŸ”₯ +

+ Comprehensive Performance Heatmap +

+
+

+ All metrics at a glance - darker colors indicate better performance +

+ """) + + gr.HTML('
') + performance_heatmap = gr.Plot( + label="", + value=create_performance_heatmap(load_leaderboard_data()), + elem_classes=["heatmap-chart", "plot-container"] + ) + gr.HTML('
') + + gr.HTML("
") + + # 4. Domain Specialization Matrix + gr.HTML(""" +
+
+ 🎯 +

+ Domain Specialization Matrix +

+
+

+ Bubble size shows performance level, color intensity shows specialization strength +

+ """) + + with gr.Row(): + with gr.Column(scale=1): + specialization_metric = gr.Dropdown( + choices=["AC (Action Completion)", "TSQ (Tool Selection Quality)"], + value="AC (Action Completion)", + label="πŸ“Š Metric Type", + info="Choose which metric to analyze for domain specialization", + elem_classes=["dropdown"] + ) + + gr.HTML('
') + domain_specialization_plot = gr.Plot( + label="", + value=create_domain_specialization_matrix(load_leaderboard_data(), "AC"), + elem_classes=["specialization-chart", "plot-container"] + ) + gr.HTML('
') + + gr.HTML("
") + + # 5. Performance Gap Analysis + gr.HTML(""" +
+
+ πŸ“ˆ +

+ Performance Gap Analysis by Domain +

+
+

+ Visualize the performance range across models for each domain +

+ """) + + gr.HTML('
') + performance_gap_plot = gr.Plot( + label="", + value=create_performance_gap_analysis(load_leaderboard_data(), "AC"), + elem_classes=["gap-analysis-chart", "plot-container"] + ) + gr.HTML('
') + + gr.HTML("
") + + # Update functions for new visualizations + def update_cost_performance(efficiency_metric): + actual_metric = SORT_COLUMN_MAP.get(efficiency_metric, efficiency_metric) + return create_cost_performance_scatter(load_leaderboard_data(), actual_metric) + + def update_speed_accuracy(efficiency_metric): + actual_metric = SORT_COLUMN_MAP.get(efficiency_metric, efficiency_metric) + return create_speed_accuracy_plot(load_leaderboard_data(), actual_metric) + + def update_domain_specialization(specialization_metric): + metric_type = "AC" if "AC" in specialization_metric else "TSQ" + return create_domain_specialization_matrix(load_leaderboard_data(), metric_type) + + def update_performance_gap(specialization_metric): + metric_type = "AC" if "AC" in specialization_metric else "TSQ" + return create_performance_gap_analysis(load_leaderboard_data(), metric_type) + + def update_all_visualizations(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order): + """Update all new visualizations when filters change""" + df = load_leaderboard_data() + filtered_df = apply_filters(df, domain_filter, model_type_filter, reasoning_filter) + + # Update efficiency metric based on current sort + actual_metric = SORT_COLUMN_MAP.get(sort_by, sort_by) if sort_by in ["Avg Action Completion", "Avg Tool Selection Quality"] else "Avg AC" + + # Update all plots + cost_perf = create_cost_performance_scatter(filtered_df, actual_metric) + speed_acc = create_speed_accuracy_plot(filtered_df, actual_metric) + heatmap = create_performance_heatmap(filtered_df) + + return cost_perf, speed_acc, heatmap + + def apply_filters(df, domain_filter, model_type_filter, reasoning_filter): + """Apply filters to dataframe""" + filtered_df = df.copy() + + # Strip emoji prefix from domain filter + domain_filter_clean = domain_filter + if domain_filter.startswith('🌐'): + domain_filter_clean = "All" + elif domain_filter.startswith('🏦'): + domain_filter_clean = "Banking" + elif 
domain_filter.startswith('πŸ₯'): + domain_filter_clean = "Healthcare" + elif domain_filter.startswith('πŸ›‘οΈ'): + domain_filter_clean = "Insurance" + elif domain_filter.startswith('πŸ’°'): + domain_filter_clean = "Investment" + elif domain_filter.startswith('πŸ“±'): + domain_filter_clean = "Telecom" + + # Domain filtering + if domain_filter_clean != "All": + domain_col_map = { + "Banking": "Banking AC", + "Healthcare": "Healthcare AC", + "Insurance": "Insurance AC", + "Investment": "Investment AC", + "Telecom": "Telecom AC" + } + if domain_filter_clean in domain_col_map: + domain_col = domain_col_map[domain_filter_clean] + filtered_df = filtered_df[filtered_df[domain_col] != ''] + + # Model type filtering + if model_type_filter != "All": + if model_type_filter == "Open Source": + filtered_df = filtered_df[filtered_df['Model Type'] == 'Open source'] + elif model_type_filter == "Proprietary": + filtered_df = filtered_df[filtered_df['Model Type'] == 'Proprietary'] + + # Reasoning filtering + if reasoning_filter != "All": + if reasoning_filter == "Reasoning": + filtered_df = filtered_df[filtered_df['Output Type'] == 'Reasoning'] + elif reasoning_filter == "Non-Reasoning": + filtered_df = filtered_df[filtered_df['Output Type'] == 'Normal'] + + return filtered_df + + # Connect update functions to components + efficiency_metric.change( + fn=update_cost_performance, + inputs=[efficiency_metric], + outputs=[cost_performance_plot] + ) + + efficiency_metric.change( + fn=update_speed_accuracy, + inputs=[efficiency_metric], + outputs=[speed_accuracy_plot] + ) + + specialization_metric.change( + fn=update_domain_specialization, + inputs=[specialization_metric], + outputs=[domain_specialization_plot] + ) + + specialization_metric.change( + fn=update_performance_gap, + inputs=[specialization_metric], + outputs=[performance_gap_plot] + ) + + # Update new visualizations when main filters change + for input_component in filter_inputs: + input_component.change( + 
fn=update_all_visualizations, + inputs=filter_inputs, + outputs=[cost_performance_plot, speed_accuracy_plot, performance_heatmap] + ) + + return leaderboard_table + + +def create_leaderboard_v2_interface(): + """Create the complete leaderboard v2 interface""" + return create_leaderboard_v2_tab() + + +def create_domain_radar_chart(df, metric_type, selected_models=None, max_models=5): + """Create a radar chart showing model performance across domains for the selected metric""" + + # Map the metric_type to actual column name using shared mapping + actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type) + + if selected_models is None or len(selected_models) == 0: + # Default to top 5 models by the selected metric if available + if actual_metric_type in df.columns: + selected_models = df.nlargest(max_models, actual_metric_type)['Model'].tolist() + else: + selected_models = df.head(max_models)['Model'].tolist() + + # Limit to max_models for readability + selected_models = selected_models[:max_models] + + # Define domain mapping based on metric type + domain_mapping = { + 'Avg AC': { + 'Banking': 'Banking AC', + 'Healthcare': 'Healthcare AC', + 'Insurance': 'Insurance AC', + 'Investment': 'Investment AC', + 'Telecom': 'Telecom AC' + }, + 'Avg TSQ': { + 'Banking': 'Banking TSQ', + 'Healthcare': 'Healthcare TSQ', + 'Insurance': 'Insurance TSQ', + 'Investment': 'Investment TSQ', + 'Telecom': 'Telecom TSQ' + }, + 'Avg Total Cost': { + 'Banking': 'Banking Cost', + 'Healthcare': 'Healthcare Cost', + 'Insurance': 'Insurance Cost', + 'Investment': 'Investment Cost', + 'Telecom': 'Telecom Cost' + }, + 'Avg Session Duration': { + 'Banking': 'Banking Duration', + 'Healthcare': 'Healthcare Duration', + 'Insurance': 'Insurance Duration', + 'Investment': 'Investment Duration', + 'Telecom': 'Telecom Duration' + }, + 'Avg Turns': { + 'Banking': 'Banking Turns', + 'Healthcare': 'Healthcare Turns', + 'Insurance': 'Insurance Turns', + 'Investment': 'Investment Turns', + 
'Telecom': 'Telecom Turns' + } + } + + # Only show radar chart for AC and TSQ metrics that have domain breakdowns + if actual_metric_type not in domain_mapping: + return create_empty_radar_chart(f"Domain breakdown not available for {metric_type}") + + fig = go.Figure() + domains = list(domain_mapping[actual_metric_type].keys()) + domain_columns = list(domain_mapping[actual_metric_type].values()) + + # Galileo dark theme color scheme + galileo_dark_colors = [ + {'fill': 'rgba(227, 84, 84, 0.25)', 'line': '#E35454', 'name': 'Vanguard'}, # Vanguard Red + {'fill': 'rgba(16, 152, 247, 0.15)', 'line': '#1098F7', 'name': 'Airglow'}, # Airglow Blue + {'fill': 'rgba(245, 246, 247, 0.15)', 'line': '#F5F6F7', 'name': 'Mercury'}, # Light Mercury + {'fill': 'rgba(227, 84, 84, 0.35)', 'line': '#B94545', 'name': 'Deep Red'}, # Darker Vanguard + {'fill': 'rgba(16, 152, 247, 0.25)', 'line': '#0A6BC4', 'name': 'Deep Blue'} # Darker Airglow + ] + + for idx, model_name in enumerate(selected_models): + model_data = df[df['Model'] == model_name] + if model_data.empty: + continue + + model_row = model_data.iloc[0] + values = [] + + # Get values for each domain + for col in domain_columns: + if col in df.columns and col in model_row: + val = model_row[col] + if pd.isna(val) or val == '': + val = 0 + else: + val = float(val) + values.append(val) + else: + values.append(0) + + # Close the radar chart by repeating first value + values_plot = values + [values[0]] + domains_plot = domains + [domains[0]] + + colors = galileo_dark_colors[idx % len(galileo_dark_colors)] + + fig.add_trace( + go.Scatterpolar( + r=values_plot, + theta=domains_plot, + fill='toself', + fillcolor=colors['fill'], + line=dict( + color=colors['line'], + width=3, + shape='spline', + smoothing=0.8 + ), + marker=dict( + size=10, + color=colors['line'], + symbol='circle', + line=dict(width=2, color='#01091A') + ), + name=model_name, + mode="lines+markers", + hovertemplate="%{fullData.name}
" + + "%{theta}
" + + "%{r:.3f}
" + + "", + hoverlabel=dict( + bgcolor="rgba(1, 9, 26, 0.95)", + bordercolor=colors['line'], + font=dict(color="#F5F6F7", size=12, family="'Geist', sans-serif") + ) + ) + ) + + # Determine appropriate range based on metric type + if actual_metric_type in ['Avg AC', 'Avg TSQ']: + max_range = 1.0 + else: + # Calculate max from data for other metrics (Cost, Duration, Turns) + all_values = [] + for model_name in selected_models: + model_data = df[df['Model'] == model_name] + if not model_data.empty: + model_row = model_data.iloc[0] + for col in domain_columns: + if col in df.columns and col in model_row: + val = model_row[col] + if pd.notna(val) and val != '': + all_values.append(float(val)) + max_range = max(all_values) * 1.1 if all_values else 1.0 + + # Create custom tick values for better readability + tick_vals = [i * max_range / 5 for i in range(6)] + tick_text = [f"{val:.2f}" for val in tick_vals] + + fig.update_layout( + polar=dict( + bgcolor='rgba(245, 246, 247, 0.03)', + radialaxis=dict( + visible=True, + range=[0, max_range], + showline=True, + linewidth=2, + linecolor='rgba(245, 246, 247, 0.2)', + gridcolor='rgba(245, 246, 247, 0.1)', + gridwidth=1, + tickvals=tick_vals, + ticktext=tick_text, + tickfont=dict( + size=11, + color='#94A3B8', + family="'Geist Mono', monospace" + ), + tickangle=0 + ), + angularaxis=dict( + showline=True, + linewidth=2, + linecolor='rgba(245, 246, 247, 0.2)', + gridcolor='rgba(245, 246, 247, 0.08)', + tickfont=dict( + size=14, + family="'Geist', sans-serif", + color='#F5F6F7', + weight=600 + ), + rotation=90, + direction="clockwise", + ), + ), + showlegend=True, + legend=dict( + orientation="v", + yanchor="middle", + y=0.5, + xanchor="left", + x=1.05, + font=dict( + size=12, + family="'Geist', sans-serif", + color='#F5F6F7' + ), + bgcolor='rgba(1, 9, 26, 0.8)', + bordercolor='rgba(245, 246, 247, 0.2)', + borderwidth=1, + itemsizing='constant', + itemwidth=30 + ), + title=dict( + text=f"Domain Performance: {metric_type}", + x=0.5, 
def _viz_title(text):
    """Return the title styling shared by every chart in this section."""
    return dict(
        text=text,
        x=0.5,
        y=0.97,
        font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700),
    )


def _viz_axis(title_text, tick_size=12, tick_color="#94A3B8", **extra):
    """Return a titled axis dict with the shared grid styling; ``extra`` keys are merged in."""
    axis = dict(
        title=dict(text=title_text, font=dict(size=16, color="#F5F6F7")),
        tickfont=dict(size=tick_size, color=tick_color),
        gridcolor="rgba(245, 246, 247, 0.1)",
    )
    axis.update(extra)
    return axis


def _viz_colorbar(title_text, **extra):
    """Return the shared colorbar styling; ``extra`` keys are merged in."""
    bar = dict(
        title=dict(text=title_text, font=dict(color="#F5F6F7")),
        tickfont=dict(color="#94A3B8"),
        bgcolor="rgba(1, 9, 26, 0.8)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
    )
    bar.update(extra)
    return bar


def _viz_message_figure(message, height, margin):
    """Return a dark-themed figure that shows only a centred, boxed message."""
    fig = go.Figure()
    fig.add_annotation(
        text=f"📊 {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        xanchor='center', yanchor='middle',
        font=dict(size=18, color="#94A3B8", family="'Geist', sans-serif"),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20,
    )
    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=height,
        width=1200,
        margin=margin,
    )
    return fig


def _viz_min_max(series, invert=False):
    """Scale a numeric Series to 0-1; ``invert`` flips it so lower raw values score higher.

    A column with no spread (or no numeric values) cannot be ranked, so its
    non-missing entries get a neutral 0.5 instead of the NaN a 0/0 would give.
    """
    low = series.min()
    span = series.max() - low
    if pd.isna(span) or span == 0:
        return series.where(series.isna(), 0.5)
    scaled = (series - low) / span
    return 1 - scaled if invert else scaled


def create_empty_radar_chart(message):
    """Create an empty radar chart placeholder that displays ``message``.

    Args:
        message: Text shown in the centre of the figure.

    Returns:
        A ``go.Figure`` with the message, the chart title and the watermark.
    """
    fig = _viz_message_figure(message, height=800, margin=dict(t=100, b=80, l=80, r=200))
    fig.update_layout(title=_viz_title("Domain Performance Chart"))

    # The watermark is added as its own annotation. Passing annotations=[...]
    # to update_layout would merge into the existing message annotation and
    # overwrite its text and position.
    fig.add_annotation(
        text="Galileo Agent Leaderboard",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=10, color='#64748B'),
        showarrow=False,
    )
    return fig


# NEW VISUALIZATION FUNCTIONS

def create_cost_performance_scatter(df, metric="Avg AC"):
    """Create a scatter plot of performance (x) against session cost (y).

    Marker size scales with 'Avg Turns'; dashed lines mark the medians of both
    axes.

    Args:
        df: Leaderboard frame with 'Model', 'Model Type', 'Avg Total Cost',
            'Avg Turns' and the ``metric`` column; '' marks a missing value.
        metric: Performance column plotted on the x axis.

    Returns:
        A ``go.Figure``, or the empty-chart placeholder when no row has both
        a cost and a metric value.
    """
    has_values = (df['Avg Total Cost'] != '') & (df[metric] != '')
    points = df[has_values].copy()
    for col in ('Avg Total Cost', metric, 'Avg Turns'):
        points[col] = pd.to_numeric(points[col], errors='coerce')
    # Values that are present but not numeric cannot be positioned on the axes.
    points = points.dropna(subset=['Avg Total Cost', metric])

    if points.empty:
        return create_empty_chart("No data available for cost-performance analysis")

    # 'Private' is the label used by the badge colours elsewhere in this file;
    # 'Proprietary' is kept so either spelling gets the red marker.
    color_map = {
        'Proprietary': '#E35454',  # Vanguard Red
        'Private': '#E35454',
        'Open source': '#1098F7',  # Airglow Blue
    }
    fallback_size = 10  # used when a model has no turn count

    fig = go.Figure()

    # One trace per model type so each type gets its own legend entry.
    for model_type in points['Model Type'].unique():
        group = points[points['Model Type'] == model_type]

        fig.add_trace(go.Scatter(
            x=group[metric],
            y=group['Avg Total Cost'],
            mode='markers+text',
            name=model_type,
            text=group['Model'],
            textposition="top center",
            textfont=dict(size=10, color='#94A3B8'),
            # The real turn count travels in customdata; marker.size is the
            # scaled value and must not be shown to the user as "Turns".
            customdata=group[['Avg Turns']].to_numpy(),
            marker=dict(
                size=(group['Avg Turns'] * 3).fillna(fallback_size),
                color=color_map.get(model_type, '#F5F6F7'),
                opacity=0.8,
                line=dict(width=2, color='#01091A'),
            ),
            hovertemplate=(
                "%{text}<br>"
                f"{metric}: %{{x:.3f}}<br>"
                "Cost: $%{y:.3f}<br>"
                "Turns: %{customdata[0]:.1f}<br>"
                "<extra></extra>"
            ),
        ))

    # Quadrant lines at the medians
    fig.add_hline(y=points['Avg Total Cost'].median(),
                  line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=points[metric].median(),
                  line_dash="dash", line_color="#64748B", opacity=0.5)

    # Quadrant labels (paper coordinates: right/bottom = high score, low cost)
    fig.add_annotation(x=0.95, y=0.05, text="💎 High Performance<br>Low Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#F5F6F7"), bgcolor="rgba(245, 246, 247, 0.1)")
    fig.add_annotation(x=0.05, y=0.95, text="⚠️ Low Performance<br>High Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#E35454"), bgcolor="rgba(227, 84, 84, 0.1)")

    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"

    fig.update_layout(
        title=_viz_title(f"Cost-Performance Efficiency: {metric_display}"),
        xaxis=_viz_axis(f"{metric_display}", zerolinecolor="rgba(245, 246, 247, 0.2)"),
        yaxis=_viz_axis("Average Session Cost ($)", zerolinecolor="rgba(245, 246, 247, 0.2)"),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=1200,
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            font=dict(size=12, family="'Geist', sans-serif", color='#F5F6F7'),
            bgcolor='rgba(1, 9, 26, 0.8)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
        ),
        margin=dict(t=100, b=80, l=80, r=80),
    )

    return fig


def create_speed_accuracy_plot(df, metric="Avg AC"):
    """Create a scatter plot of session duration (x) against accuracy (y).

    Markers are coloured by 'Avg Total Cost'; dashed lines mark the medians of
    both axes.

    Args:
        df: Leaderboard frame with 'Model', 'Avg Session Duration',
            'Avg Total Cost' and the ``metric`` column; '' marks a missing value.
        metric: Performance column plotted on the y axis.

    Returns:
        A ``go.Figure``, or the empty-chart placeholder when no row has both
        a duration and a metric value.
    """
    has_values = (df['Avg Session Duration'] != '') & (df[metric] != '')
    points = df[has_values].copy()
    for col in ('Avg Session Duration', metric, 'Avg Total Cost'):
        points[col] = pd.to_numeric(points[col], errors='coerce')
    # Values that are present but not numeric cannot be positioned on the axes.
    points = points.dropna(subset=['Avg Session Duration', metric])

    if points.empty:
        return create_empty_chart("No data available for speed-accuracy analysis")

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=points['Avg Session Duration'],
        y=points[metric],
        mode='markers+text',
        text=points['Model'],
        textposition="top center",
        textfont=dict(size=9, color='#94A3B8'),
        marker=dict(
            size=12,
            color=points['Avg Total Cost'],
            colorscale=[[0, '#01091A'], [0.5, '#1098F7'], [1, '#E35454']],
            showscale=True,
            colorbar=_viz_colorbar("Cost ($)", x=1.02),
            line=dict(width=2, color='#01091A'),
        ),
        hovertemplate=(
            "%{text}<br>"
            "Duration: %{x:.1f}s<br>"
            f"{metric}: %{{y:.3f}}<br>"
            "Cost: $%{marker.color:.3f}<br>"
            "<extra></extra>"
        ),
    ))

    # Quadrant lines at the medians
    fig.add_hline(y=points[metric].median(),
                  line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=points['Avg Session Duration'].median(),
                  line_dash="dash", line_color="#64748B", opacity=0.5)

    # Quadrant labels (paper coordinates: left/top = short duration, high score)
    fig.add_annotation(x=0.05, y=0.95, text="⚡ Fast & Accurate",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#F5F6F7", weight=600))
    fig.add_annotation(x=0.95, y=0.05, text="🐌 Slow & Inaccurate",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#E35454", weight=600))

    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"

    fig.update_layout(
        title=_viz_title(f"Speed vs Accuracy Trade-off: {metric_display}"),
        xaxis=_viz_axis("Average Session Duration (seconds)",
                        zerolinecolor="rgba(245, 246, 247, 0.2)"),
        yaxis=_viz_axis(f"{metric_display}", zerolinecolor="rgba(245, 246, 247, 0.2)"),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=1200,
        margin=dict(t=100, b=80, l=80, r=120),
    )

    return fig


def create_performance_heatmap(df):
    """Create a heatmap of normalised scores for every metric and model.

    Each metric is min-max scaled to 0-1 across models. Cost, duration and
    turns are inverted so that a higher score always means "better".

    Args:
        df: Leaderboard frame with 'Model' and the five 'Avg ...' metric
            columns; '' marks a missing value.

    Returns:
        A ``go.Figure``, or the empty-chart placeholder when no row has an
        'Avg AC' value.
    """
    metrics = ['Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Avg Session Duration', 'Avg Turns']
    lower_is_better = {'Avg Total Cost', 'Avg Session Duration', 'Avg Turns'}
    label_map = {
        'Avg AC': 'Action Completion',
        'Avg TSQ': 'Tool Selection',
        'Avg Total Cost': 'Cost Efficiency',
        'Avg Session Duration': 'Speed',
        'Avg Turns': 'Conversation Efficiency',
    }

    scored = df[df['Avg AC'] != ''].copy()

    if scored.empty:
        return create_empty_chart("No data available for performance heatmap")

    normalized_data = []
    metric_labels = []
    for col in metrics:
        scored[col] = pd.to_numeric(scored[col], errors='coerce')
        normalized_data.append(_viz_min_max(scored[col], invert=col in lower_is_better).values)
        metric_labels.append(label_map.get(col, col))

    # Missing cells get a blank label instead of the literal text "nan".
    cell_text = [["" if pd.isna(val) else f"{val:.2f}" for val in row]
                 for row in normalized_data]

    fig = go.Figure(data=go.Heatmap(
        z=normalized_data,
        x=scored['Model'].tolist(),
        y=metric_labels,
        colorscale=[[0, '#01091A'], [0.5, '#1098F7'], [1, '#E35454']],
        hovertemplate=(
            "%{x}<br>"
            "%{y}: %{z:.2f}<br>"
            "<extra></extra>"
        ),
        text=cell_text,
        texttemplate="%{text}",
        textfont={"size": 10, "color": "#F5F6F7"},
        showscale=True,
        colorbar=_viz_colorbar("Performance<br>Score"),
    ))

    fig.update_layout(
        title=_viz_title("Comprehensive Performance Heatmap"),
        xaxis=dict(
            side="bottom",
            tickfont=dict(size=11, color="#94A3B8"),
            tickangle=-45,
        ),
        yaxis=dict(
            tickfont=dict(size=13, color="#F5F6F7", weight=600),
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=600,
        width=1400,
        margin=dict(t=100, b=120, l=170, r=120),
    )

    return fig


def create_domain_specialization_matrix(df, metric_type="AC"):
    """Create a bubble chart of per-domain scores for every model.

    Bubble size follows the domain score; bubble colour follows the domain
    score minus the model's own 'Avg <metric_type>' (its specialization).

    Args:
        df: Leaderboard frame with 'Model', 'Model Type', 'Avg <metric_type>'
            and '<Domain> <metric_type>' columns; '' marks a missing value.
        metric_type: Metric suffix used to build the column names.

    Returns:
        A ``go.Figure``, or the empty-chart placeholder when no domain score
        could be read.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']

    records = []
    for _, model in df.iterrows():
        if model['Model'] == '':
            continue

        model_avg = pd.to_numeric(model[f'Avg {metric_type}'], errors='coerce')
        if pd.isna(model_avg):
            continue

        for domain in domains:
            domain_col = f'{domain} {metric_type}'
            if domain_col not in model or model[domain_col] == '':
                continue
            domain_val = pd.to_numeric(model[domain_col], errors='coerce')
            if pd.isna(domain_val):
                continue
            records.append({
                'Model': model['Model'],
                'Domain': domain,
                'Performance': domain_val,
                # Specialization strength = deviation from the model's average
                'Specialization': domain_val - model_avg,
                'Model Type': model['Model Type'],
            })

    if not records:
        return create_empty_chart("No domain specialization data available")

    df_plot = pd.DataFrame(records)

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=df_plot['Domain'],
        y=df_plot['Model'],
        mode='markers',
        marker=dict(
            size=df_plot['Performance'] * 30,  # size based on absolute performance
            color=df_plot['Specialization'],
            colorscale=[[0, '#1098F7'], [0.5, '#F5F6F7'], [1, '#E35454']],
            # Centre the diverging scale so the neutral colour means
            # "no deviation from the model's average".
            cmid=0,
            showscale=True,
            colorbar=_viz_colorbar("Specialization<br>Strength"),
            line=dict(width=2, color='#01091A'),
            opacity=0.8,
        ),
        text=[f"Performance: {p:.3f}<br>Specialization: {s:+.3f}"
              for p, s in zip(df_plot['Performance'], df_plot['Specialization'])],
        hovertemplate=(
            "%{y}<br>"
            "Domain: %{x}<br>"
            "%{text}<br>"
            "<extra></extra>"
        ),
    ))

    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"

    fig.update_layout(
        title=_viz_title(f"Domain Specialization Matrix: {metric_display}"),
        xaxis=_viz_axis("Business Domains", tick_size=13, tick_color="#F5F6F7"),
        yaxis=_viz_axis("Models", tick_size=11),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=1000,
        width=1200,
        margin=dict(t=100, b=80, l=220, r=120),
    )

    return fig


def create_performance_gap_analysis(df, metric_type="AC"):
    """Create a range plot of the score spread across models for each domain.

    For every domain the plot shows the full min-max range, the interquartile
    range and the median; domains are ordered by their min-max gap.

    Args:
        df: Leaderboard frame with '<Domain> <metric_type>' columns.
        metric_type: Metric suffix used to build the column names.

    Returns:
        A ``go.Figure``, or the empty-chart placeholder when no domain column
        holds a numeric value.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']

    gap_data = []
    for domain in domains:
        domain_col = f'{domain} {metric_type}'
        if domain_col not in df.columns:
            continue
        domain_values = pd.to_numeric(df[domain_col], errors='coerce').dropna()
        if len(domain_values) == 0:
            continue
        gap_data.append({
            'Domain': domain,
            'Min': domain_values.min(),
            'Max': domain_values.max(),
            'Median': domain_values.median(),
            'Q1': domain_values.quantile(0.25),
            'Q3': domain_values.quantile(0.75),
            'Gap': domain_values.max() - domain_values.min(),
        })

    if not gap_data:
        return create_empty_chart("No data available for gap analysis")

    df_gap = pd.DataFrame(gap_data).sort_values('Gap', ascending=True)

    fig = go.Figure()

    for _, row in df_gap.iterrows():
        # Full range line
        fig.add_trace(go.Scatter(
            x=[row['Min'], row['Max']],
            y=[row['Domain'], row['Domain']],
            mode='lines',
            line=dict(color='#64748B', width=2),
            showlegend=False,
            hoverinfo='skip',
        ))

        # IQR box
        fig.add_trace(go.Scatter(
            x=[row['Q1'], row['Q3'], row['Q3'], row['Q1'], row['Q1']],
            y=[row['Domain']] * 5,
            fill='toself',
            fillcolor='rgba(227, 84, 84, 0.3)',
            line=dict(color='#E35454', width=2),
            showlegend=False,
            hoverinfo='skip',
            mode='lines',
        ))

        # Median marker; it carries the hover text for the whole domain row
        fig.add_trace(go.Scatter(
            x=[row['Median']],
            y=[row['Domain']],
            mode='markers',
            marker=dict(
                size=12,
                color='#E35454',
                symbol='diamond',
                line=dict(width=2, color='#01091A'),
            ),
            showlegend=False,
            hovertemplate=(
                f"{row['Domain']}<br>"
                f"Min: {row['Min']:.3f}<br>"
                f"Q1: {row['Q1']:.3f}<br>"
                f"Median: {row['Median']:.3f}<br>"
                f"Q3: {row['Q3']:.3f}<br>"
                f"Max: {row['Max']:.3f}<br>"
                f"Gap: {row['Gap']:.3f}<br>"
                "<extra></extra>"
            ),
        ))

    # Min/max points go in a second pass so they are drawn above every line.
    for _, row in df_gap.iterrows():
        fig.add_trace(go.Scatter(
            x=[row['Min'], row['Max']],
            y=[row['Domain'], row['Domain']],
            mode='markers',
            marker=dict(size=8, color='#F5F6F7', line=dict(width=2, color='#01091A')),
            showlegend=False,
            hoverinfo='skip',
        ))

    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"

    fig.update_layout(
        title=_viz_title(f"Performance Gap Analysis by Domain: {metric_display}"),
        xaxis=_viz_axis(f"{metric_display} Score",
                        range=[0, 1] if metric_type in ['AC', 'TSQ'] else None),
        yaxis=_viz_axis("Business Domain", tick_size=13, tick_color="#F5F6F7"),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=700,
        width=1200,
        margin=dict(t=100, b=80, l=140, r=80),
        showlegend=False,
    )

    # Manual legend (the traces themselves are hidden from the legend)
    fig.add_annotation(
        text="◆ Median ━ IQR ─ Full Range",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=12, color='#94A3B8'),
        showarrow=False,
    )

    return fig


def create_empty_chart(message):
    """Create an empty chart placeholder that displays ``message``.

    Args:
        message: Text shown in the centre of the figure.

    Returns:
        A ``go.Figure`` containing only the boxed message.
    """
    # The figure must be returned: every caller does
    # `return create_empty_chart(...)` and expects a figure, not None.
    return _viz_message_figure(message, height=600, margin=dict(t=80, b=80, l=80, r=80))