+ """
+
def filter_and_sort_data(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order):
    """Filter and sort the leaderboard data, then render it as an HTML table.

    Args:
        domain_filter: Domain radio label such as "🏦 Banking" or a bare
            "Banking"/"All". Only the trailing word is used, so the emoji
            prefix (and its encoding) does not matter.
        model_type_filter: "Open Source" or "Proprietary" to restrict the
            'Model Type' column; any other value (e.g. "All") keeps all rows.
        reasoning_filter: "Reasoning" or "Non-Reasoning" to restrict the
            'Output Type' column; any other value keeps all rows.
        sort_by: Display name of the sort metric, resolved through
            SORT_COLUMN_MAP (unknown names are used as column names as-is).
        sort_order: "Ascending" sorts ascending; anything else descending.

    Returns:
        The result of generate_html_table for the filtered/sorted frame and
        the cleaned domain name (the caller feeds it to gr.HTML).
    """
    known_domains = ("Banking", "Healthcare", "Insurance", "Investment", "Telecom")
    model_type_values = {"Open Source": "Open source", "Proprietary": "Proprietary"}
    output_type_values = {"Reasoning": "Reasoning", "Non-Reasoning": "Normal"}
    # Suffix of the per-domain column that replaces each average column.
    domain_suffix = {
        "Avg AC": "AC",
        "Avg TSQ": "TSQ",
        "Avg Total Cost": "Cost",
        "Avg Session Duration": "Duration",
        "Avg Turns": "Turns",
    }

    filtered_df = load_leaderboard_data().copy()

    # Resolve the label by its trailing word instead of matching emoji
    # prefixes, which break as soon as the file or the label is re-encoded.
    tokens = (domain_filter or "All").split()
    trailing = tokens[-1] if tokens else "All"
    if trailing == "All" or trailing in known_domains:
        domain_filter_clean = trailing
    else:
        # Unknown labels are passed through untouched, as before.
        domain_filter_clean = domain_filter

    # Domain filtering: keep only models that have data for this domain.
    # Missing data shows up as '' or NaN, so both are excluded.
    if domain_filter_clean in known_domains:
        domain_scores = filtered_df[f"{domain_filter_clean} AC"]
        filtered_df = filtered_df[domain_scores.notna() & (domain_scores != "")]

    # Model type filtering
    if model_type_filter in model_type_values:
        filtered_df = filtered_df[filtered_df["Model Type"] == model_type_values[model_type_filter]]

    # Reasoning filtering
    if reasoning_filter in output_type_values:
        filtered_df = filtered_df[filtered_df["Output Type"] == output_type_values[reasoning_filter]]

    # Map display name to actual column name using shared mapping
    actual_sort_column = SORT_COLUMN_MAP.get(sort_by, sort_by)

    # With a domain selected, every average metric (AC, TSQ, cost, duration,
    # turns) is swapped for that domain's own column.
    if domain_filter_clean != "All" and actual_sort_column in domain_suffix:
        actual_sort_column = f"{domain_filter_clean} {domain_suffix[actual_sort_column]}"

    if actual_sort_column and actual_sort_column in filtered_df.columns:
        filtered_df = filtered_df.sort_values(
            by=actual_sort_column,
            ascending=(sort_order == "Ascending"),
            na_position="last",
        )

    # Generate HTML table
    return generate_html_table(filtered_df, domain_filter_clean)
+
+ # Load initial data
+ initial_table = filter_and_sort_data("π All", "All", "All", "Avg AC", "Descending")
+ initial_df = load_leaderboard_data() # Load raw data for model selector
+
+ # Custom CSS for Galileo dark theme
+ custom_css = """
+
+
+
+ """
+
+ gr.HTML(custom_css)
+
+ gr.HTML("""
+
+
+ π Galileo Agent Leaderboard V2
+
+
+ Comprehensive performance metrics for LLM agents across business domains
+
+
+ """)
+
+ # Metrics overview cards with insights
+ gr.HTML("""
+
+
+
+
+
+
+
+
+
Task Completion
+
+ Compare models based on their ability to complete real-world business tasks accurately and efficiently
+
+
+
+
+
+
Tool Selection
+
+ Analyze how precisely models choose the right tools for each task and make optimal decisions
+
+
+
+
+
+
Cost Efficiency
+
+ Find models that deliver the best performance per dollar spent and optimize your ROI
+
+
+
+
+
+
Domain Coverage
+
+ Banking, Healthcare, Insurance, Investment, and Telecom industries analyzed for specialized performance
+
+
+
+
+
+
Speed vs Accuracy
+
+ Understand the trade-offs between response time and accuracy to find the right balance
+
+
+
+
+
+
+
+
Model Capabilities
+
+
+ π
+ Open Source
+
+
+ π
+ Proprietary
+
+
+ π§
+ Reasoning
+
+
+
+
+
+
Interactive Visualizations
+
+
+ πΈοΈ
+ Radar Charts
+
+
+ π
+ Heatmaps
+
+
+ π
+ Scatter Plots
+
+
+
+
+
+
Real-World Performance
+
+
+ πΌ
+ Business Tasks
+
+
+ π
+ Multi-Turn
+
+
+ π
+ Benchmarks
+
+
+
+
+
+ """)
+
+ # Domain filter section with enhanced styling
+ gr.HTML("""
+
+
+
+
+
ποΈ Select Business Domain
+
Choose a domain to see specialized agent performance
+
+ """)
+
+ # Creating a custom radio with better visual design
+ domain_choices = [
+ ("All", "π", "All Domains"),
+ ("Banking", "π¦", "Banking"),
+ ("Healthcare", "π₯", "Healthcare"),
+ ("Insurance", "π‘οΈ", "Insurance"),
+ ("Investment", "π°", "Investment"),
+ ("Telecom", "π±", "Telecom")
+ ]
+
+ with gr.Row():
+ domain_filter = gr.Radio(
+ choices=["π All", "π¦ Banking", "π₯ Healthcare", "π‘οΈ Insurance", "π° Investment", "π± Telecom"],
+ value="π All",
+ label="",
+ interactive=True,
+ elem_classes=["domain-radio"]
+ )
+
+ gr.HTML("""
+
+
+ π‘ Tip: Domain selection filters the leaderboard to show only models tested in that specific business context
+
+
+
+ """)
+
+ # Filter controls with enhanced styling
+ gr.HTML("""
+
+
+ """)
+
+ # First row: Model filters and sort order
+ with gr.Row(elem_classes=["compact-filter-row"]):
+ with gr.Column(scale=1):
+ model_type_filter = gr.Radio(
+ choices=["All", "Open Source", "Proprietary"],
+ value="All",
+ label="π Model Access",
+ elem_classes=["compact-radio"]
+ )
+
+ with gr.Column(scale=1):
+ reasoning_filter = gr.Radio(
+ choices=["All", "Reasoning", "Non-Reasoning"],
+ value="All",
+ label="π§ Model Type",
+ elem_classes=["compact-radio"]
+ )
+
+ with gr.Column(scale=1):
+ sort_order = gr.Radio(
+ choices=["Descending", "Ascending"],
+ value="Descending",
+ label="π Sort Order",
+ elem_classes=["compact-radio"]
+ )
+
+ # Second row: Sort by options
+ gr.HTML("""
+
π Sort By
+ """)
+
+ gr.HTML('
')
+ sort_by = gr.Radio(
+ choices=["Avg Action Completion", "Avg Tool Selection Quality", "Avg Session Cost", "Avg Session Duration", "Avg Turns"],
+ value="Avg Action Completion",
+ label="",
+ elem_classes=["domain-radio"]
+ )
+ gr.HTML('
')
+
+ gr.HTML("
")
+
+ # Main leaderboard table with dynamic title
+ leaderboard_title = gr.HTML("""
+
+
+
+ """)
+
+ leaderboard_table = gr.HTML(initial_table)
+
+ gr.HTML("""
+
+
+
+ π‘ Tips: Hover over model names for highlighting β’ Score bars show performance metrics visually
+
+
+
""")
+
+ # Radar Chart Section
+ gr.HTML("""
+
+
+
Compare model performance across different business domains
+ """)
+
+ with gr.Row():
+ with gr.Column(scale=1):
+ model_selector = gr.Dropdown(
+ choices=initial_df['Model'].tolist()[:10],
+ value=initial_df['Model'].tolist()[:5],
+ multiselect=True,
+ label="π― Select Models for Comparison",
+ info="Choose up to 5 models to visualize",
+ elem_classes=["dropdown"]
+ )
+
+ # Radar chart plot - wrapped in centered container
+ gr.HTML('
')
+ radar_chart = gr.Plot(
+ label="",
+ value=create_domain_radar_chart(
+ load_leaderboard_data(),
+ "Avg AC",
+ initial_df['Model'].tolist()[:5]
+ ),
+ elem_classes=["radar-chart", "plot-container"]
+ )
+ gr.HTML('
')
+
+ gr.HTML("
")
+
+ # Update functions
def update_table(*args):
    """Recompute the leaderboard heading and table for the current filters.

    Args:
        *args: The filter values in the order expected by
            filter_and_sort_data; the first one is the domain label.

    Returns:
        Tuple of (title HTML, table HTML).
    """
    domain_label = args[0]
    return update_leaderboard_title(domain_label), filter_and_sort_data(*args)
+
def update_radar_chart(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order, selected_models):
    """Refresh the model selector and radar chart after a filter change.

    Args:
        domain_filter: Domain radio label (e.g. "🏦 Banking"); only the
            trailing word is used to pick the domain.
        model_type_filter: Model access filter, forwarded to apply_filters.
        reasoning_filter: Reasoning filter, forwarded to apply_filters.
        sort_by: Display name of the sort metric (resolved via SORT_COLUMN_MAP).
        sort_order: "Ascending" sorts ascending; anything else descending.
        selected_models: Models currently chosen in the dropdown (may be
            empty or None).

    Returns:
        Tuple of (gr.Dropdown update with the new choices/value, radar figure).
    """
    known_domains = ("Banking", "Healthcare", "Insurance", "Investment", "Telecom")
    domain_suffix = {
        "Avg AC": "AC",
        "Avg TSQ": "TSQ",
        "Avg Total Cost": "Cost",
        "Avg Session Duration": "Duration",
        "Avg Turns": "Turns",
    }

    # Load once; the unfiltered frame is reused for the chart below.
    df = load_leaderboard_data()
    # Row filtering is delegated to the shared helper so the chart-side
    # filters cannot drift from the other visualizations.
    filtered_df = apply_filters(df, domain_filter, model_type_filter, reasoning_filter)

    # Resolve the domain from the label's trailing word rather than from an
    # emoji prefix, which is sensitive to encoding.
    tokens = (domain_filter or "All").split()
    trailing = tokens[-1] if tokens else "All"
    if trailing == "All" or trailing in known_domains:
        domain_filter_clean = trailing
    else:
        domain_filter_clean = domain_filter

    # Map display name to actual column name using shared mapping
    actual_sort_column = SORT_COLUMN_MAP.get(sort_by, sort_by)

    # With a domain selected, average metrics are swapped for that domain's column.
    if domain_filter_clean != "All" and actual_sort_column in domain_suffix:
        actual_sort_column = f"{domain_filter_clean} {domain_suffix[actual_sort_column]}"

    if actual_sort_column and actual_sort_column in filtered_df.columns:
        filtered_df = filtered_df.sort_values(
            by=actual_sort_column,
            ascending=(sort_order == "Ascending"),
            na_position='last',
        )

    # Offer the top 15 models of the filtered, sorted result.
    available_models = filtered_df['Model'].tolist()[:15]

    # Keep the still-available part of the current selection; if nothing
    # survives (or nothing was selected), fall back to the top 5.
    valid_selected = [m for m in (selected_models or []) if m in available_models]
    if not valid_selected:
        valid_selected = available_models[:5]

    # The chart reads from the unfiltered frame; only the model list is filtered.
    chart = create_domain_radar_chart(df, sort_by, valid_selected)

    return gr.Dropdown(choices=available_models, value=valid_selected), chart
+
def update_radar_only(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order, selected_models):
    """Redraw the radar chart after the model selection changes.

    Args:
        domain_filter: Domain radio label (e.g. "🏦 Banking"); only the
            trailing word is used to pick the domain.
        model_type_filter: Model access filter, forwarded to apply_filters.
        reasoning_filter: Reasoning filter, forwarded to apply_filters.
        sort_by: Display name of the sort metric (resolved via SORT_COLUMN_MAP).
        sort_order: "Ascending" sorts ascending; anything else descending.
        selected_models: Models currently chosen in the dropdown (may be
            empty or None).

    Returns:
        The radar figure for the valid part of the selection, or for the
        top 5 filtered models when none of the selection is valid.
    """
    known_domains = ("Banking", "Healthcare", "Insurance", "Investment", "Telecom")
    domain_suffix = {
        "Avg AC": "AC",
        "Avg TSQ": "TSQ",
        "Avg Total Cost": "Cost",
        "Avg Session Duration": "Duration",
        "Avg Turns": "Turns",
    }

    # Load once; the unfiltered frame is reused for the chart below.
    df = load_leaderboard_data()
    filtered_df = apply_filters(df, domain_filter, model_type_filter, reasoning_filter)

    # Resolve the domain from the label's trailing word rather than from an
    # emoji prefix, which is sensitive to encoding.
    tokens = (domain_filter or "All").split()
    trailing = tokens[-1] if tokens else "All"
    if trailing == "All" or trailing in known_domains:
        domain_filter_clean = trailing
    else:
        domain_filter_clean = domain_filter

    # Map display name to actual column name using shared mapping
    actual_sort_column = SORT_COLUMN_MAP.get(sort_by, sort_by)

    # Use the domain-specific column when a domain is selected, so the
    # fallback "top 5" below matches the ordering used by update_radar_chart.
    if domain_filter_clean != "All" and actual_sort_column in domain_suffix:
        actual_sort_column = f"{domain_filter_clean} {domain_suffix[actual_sort_column]}"

    if actual_sort_column and actual_sort_column in filtered_df.columns:
        filtered_df = filtered_df.sort_values(
            by=actual_sort_column,
            ascending=(sort_order == "Ascending"),
            na_position='last',
        )

    # Build the model list once instead of once per membership test.
    filtered_models = filtered_df['Model'].tolist()

    valid_selected = [m for m in (selected_models or []) if m in filtered_models]
    if not valid_selected:
        valid_selected = filtered_models[:5]

    return create_domain_radar_chart(df, sort_by, valid_selected)
+
# Every filter control refreshes the table and then the radar section.
filter_inputs = [domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order]
radar_inputs = filter_inputs + [model_selector]

for component in filter_inputs:
    # Registration order is kept: table first, radar chart second.
    for handler, handler_inputs, handler_outputs in (
        (update_table, filter_inputs, [leaderboard_title, leaderboard_table]),
        (update_radar_chart, radar_inputs, [model_selector, radar_chart]),
    ):
        component.change(fn=handler, inputs=handler_inputs, outputs=handler_outputs)

# Changing only the model selection redraws the chart without touching the selector.
model_selector.change(
    fn=update_radar_only,
    inputs=radar_inputs,
    outputs=[radar_chart],
)
+
+ # Performance insights section
+ gr.HTML("""
+
+
+
+
+
+
π Top Performers
+
+ - Highest AC scores indicate best action completion
+ - Superior TSQ shows optimal tool selection
+ - Balance cost-effectiveness with performance
+
+
+
+
+
π Filter Features
+
+ - Domain-specific performance analysis
+ - Compare open source vs private models
+ - Reasoning vs standard model comparison
+
+
+
+
+
π Visualization
+
+ - Interactive radar charts for domain breakdown
+ - Compare up to 5 models simultaneously
+ - Hover for detailed performance metrics
+
+
+
+
+ """)
+
+ # NEW VISUALIZATIONS START HERE
+
+ # 1. Cost-Performance Efficiency Scatter Plot
+ gr.HTML("""
+
+
+
+ Identify models that deliver the best performance per dollar spent
+
+ """)
+
+ with gr.Row():
+ with gr.Column(scale=1):
+ efficiency_metric = gr.Dropdown(
+ choices=["Avg Action Completion", "Avg Tool Selection Quality"],
+ value="Avg Action Completion",
+ label="π Performance Metric",
+ info="Select which performance metric to analyze against cost",
+ elem_classes=["dropdown"]
+ )
+
+ gr.HTML('
')
+ cost_performance_plot = gr.Plot(
+ label="",
+ value=create_cost_performance_scatter(load_leaderboard_data(), "Avg AC"),
+ elem_classes=["efficiency-chart", "plot-container"]
+ )
+ gr.HTML('
')
+
+ gr.HTML("
")
+
+ # 2. Speed vs Accuracy Trade-off Chart
+ gr.HTML("""
+
+
+
+ Find the sweet spot between response time and accuracy
+
+ """)
+
+ gr.HTML('
')
+ speed_accuracy_plot = gr.Plot(
+ label="",
+ value=create_speed_accuracy_plot(load_leaderboard_data(), "Avg AC"),
+ elem_classes=["speed-accuracy-chart", "plot-container"]
+ )
+ gr.HTML('
')
+
+ gr.HTML("
")
+
+ # 3. Performance Heatmap
+ gr.HTML("""
+
+
+
+ All metrics at a glance - darker colors indicate better performance
+
+ """)
+
+ gr.HTML('
')
+ performance_heatmap = gr.Plot(
+ label="",
+ value=create_performance_heatmap(load_leaderboard_data()),
+ elem_classes=["heatmap-chart", "plot-container"]
+ )
+ gr.HTML('
')
+
+ gr.HTML("
")
+
+ # 4. Domain Specialization Matrix
+ gr.HTML("""
+
+
+
+ Bubble size shows performance level, color intensity shows specialization strength
+
+ """)
+
+ with gr.Row():
+ with gr.Column(scale=1):
+ specialization_metric = gr.Dropdown(
+ choices=["AC (Action Completion)", "TSQ (Tool Selection Quality)"],
+ value="AC (Action Completion)",
+ label="π Metric Type",
+ info="Choose which metric to analyze for domain specialization",
+ elem_classes=["dropdown"]
+ )
+
+ gr.HTML('
')
+ domain_specialization_plot = gr.Plot(
+ label="",
+ value=create_domain_specialization_matrix(load_leaderboard_data(), "AC"),
+ elem_classes=["specialization-chart", "plot-container"]
+ )
+ gr.HTML('
')
+
+ gr.HTML("
")
+
+ # 5. Performance Gap Analysis
+ gr.HTML("""
+
+
+
+ Visualize the performance range across models for each domain
+
+ """)
+
+ gr.HTML('
')
+ performance_gap_plot = gr.Plot(
+ label="",
+ value=create_performance_gap_analysis(load_leaderboard_data(), "AC"),
+ elem_classes=["gap-analysis-chart", "plot-container"]
+ )
+ gr.HTML('
')
+
+ gr.HTML("
")
+
+ # Update functions for new visualizations
def update_cost_performance(efficiency_metric):
    """Rebuild the cost-performance scatter for the chosen performance metric.

    Args:
        efficiency_metric: Display name from the metric dropdown; resolved to
            a column name through SORT_COLUMN_MAP (unknown names pass through).

    Returns:
        The figure produced by create_cost_performance_scatter.
    """
    metric_column = SORT_COLUMN_MAP.get(efficiency_metric, efficiency_metric)
    leaderboard = load_leaderboard_data()
    return create_cost_performance_scatter(leaderboard, metric_column)
+
def update_speed_accuracy(efficiency_metric):
    """Rebuild the speed-vs-accuracy plot for the chosen performance metric.

    Args:
        efficiency_metric: Display name from the metric dropdown; resolved to
            a column name through SORT_COLUMN_MAP (unknown names pass through).

    Returns:
        The figure produced by create_speed_accuracy_plot.
    """
    metric_column = SORT_COLUMN_MAP.get(efficiency_metric, efficiency_metric)
    leaderboard = load_leaderboard_data()
    return create_speed_accuracy_plot(leaderboard, metric_column)
+
def update_domain_specialization(specialization_metric):
    """Rebuild the domain specialization matrix for the chosen metric family.

    Args:
        specialization_metric: Dropdown label; it selects "AC" when it
            contains the text "AC", otherwise "TSQ".

    Returns:
        The figure produced by create_domain_specialization_matrix.
    """
    if "AC" in specialization_metric:
        metric_type = "AC"
    else:
        metric_type = "TSQ"
    leaderboard = load_leaderboard_data()
    return create_domain_specialization_matrix(leaderboard, metric_type)
+
def update_performance_gap(specialization_metric):
    """Rebuild the performance gap analysis for the chosen metric family.

    Args:
        specialization_metric: Dropdown label; it selects "AC" when it
            contains the text "AC", otherwise "TSQ".

    Returns:
        The figure produced by create_performance_gap_analysis.
    """
    if "AC" in specialization_metric:
        metric_type = "AC"
    else:
        metric_type = "TSQ"
    leaderboard = load_leaderboard_data()
    return create_performance_gap_analysis(leaderboard, metric_type)
+
def update_all_visualizations(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order):
    """Rebuild the scatter, speed/accuracy and heatmap figures for the current filters.

    Args:
        domain_filter: Domain radio label, forwarded to apply_filters.
        model_type_filter: Model access filter, forwarded to apply_filters.
        reasoning_filter: Reasoning filter, forwarded to apply_filters.
        sort_by: Current sort metric display name. Only the two performance
            metrics drive the plots; any other choice falls back to "Avg AC".
        sort_order: Accepted so the handler matches the filter inputs; not used.

    Returns:
        Tuple of (cost-performance figure, speed-accuracy figure, heatmap figure).
    """
    filtered_df = apply_filters(
        load_leaderboard_data(), domain_filter, model_type_filter, reasoning_filter
    )

    performance_sorts = ("Avg Action Completion", "Avg Tool Selection Quality")
    if sort_by in performance_sorts:
        actual_metric = SORT_COLUMN_MAP.get(sort_by, sort_by)
    else:
        actual_metric = "Avg AC"

    return (
        create_cost_performance_scatter(filtered_df, actual_metric),
        create_speed_accuracy_plot(filtered_df, actual_metric),
        create_performance_heatmap(filtered_df),
    )
+
def apply_filters(df, domain_filter, model_type_filter, reasoning_filter):
    """Return a filtered copy of the leaderboard frame.

    Args:
        df: Leaderboard DataFrame. Reads 'Model Type' and 'Output Type' when
            the matching filter is active and "<Domain> AC" when a domain is
            selected.
        domain_filter: Domain radio label such as "🏦 Banking" or a bare
            "Banking"/"All". Only the trailing word is used, so the emoji
            prefix (and its encoding) does not matter. Unknown labels apply
            no domain filter.
        model_type_filter: "Open Source" or "Proprietary"; any other value
            (e.g. "All") keeps all rows.
        reasoning_filter: "Reasoning" or "Non-Reasoning"; any other value
            keeps all rows.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    known_domains = ("Banking", "Healthcare", "Insurance", "Investment", "Telecom")
    model_type_values = {"Open Source": "Open source", "Proprietary": "Proprietary"}
    output_type_values = {"Reasoning": "Reasoning", "Non-Reasoning": "Normal"}

    filtered_df = df.copy()

    # Resolve the label by its trailing word instead of matching emoji
    # prefixes, which break as soon as the file or the label is re-encoded.
    tokens = (domain_filter or "All").split()
    domain_name = tokens[-1] if tokens else "All"

    # Domain filtering: keep only models that have data for this domain.
    # Missing data shows up as '' or NaN, so both are excluded.
    if domain_name in known_domains:
        domain_scores = filtered_df[f"{domain_name} AC"]
        filtered_df = filtered_df[domain_scores.notna() & (domain_scores != "")]

    # Model type filtering
    if model_type_filter in model_type_values:
        filtered_df = filtered_df[filtered_df['Model Type'] == model_type_values[model_type_filter]]

    # Reasoning filtering
    if reasoning_filter in output_type_values:
        filtered_df = filtered_df[filtered_df['Output Type'] == output_type_values[reasoning_filter]]

    return filtered_df
+
# Each metric dropdown drives two plots; handlers are registered in the
# same order as before (cost/performance, speed/accuracy, specialization, gap).
for handler, target_plot in (
    (update_cost_performance, cost_performance_plot),
    (update_speed_accuracy, speed_accuracy_plot),
):
    efficiency_metric.change(fn=handler, inputs=[efficiency_metric], outputs=[target_plot])

for handler, target_plot in (
    (update_domain_specialization, domain_specialization_plot),
    (update_performance_gap, performance_gap_plot),
):
    specialization_metric.change(fn=handler, inputs=[specialization_metric], outputs=[target_plot])

# The main filters also refresh the scatter, speed/accuracy and heatmap plots.
filter_driven_plots = [cost_performance_plot, speed_accuracy_plot, performance_heatmap]
for component in filter_inputs:
    component.change(
        fn=update_all_visualizations,
        inputs=filter_inputs,
        outputs=filter_driven_plots,
    )
+
+ return leaderboard_table
+
+
def create_leaderboard_v2_interface():
    """Build the complete leaderboard v2 interface.

    Returns:
        Whatever create_leaderboard_v2_tab returns; this is a thin alias for it.
    """
    interface = create_leaderboard_v2_tab()
    return interface
+
+
def create_domain_radar_chart(df, metric_type, selected_models=None, max_models=5):
    """Create a radar chart comparing models across business domains for one metric.

    Args:
        df: Leaderboard DataFrame with a 'Model' column and per-domain
            columns named "<Domain> <suffix>" (e.g. "Banking AC").
        metric_type: Metric display name or column name. It is resolved
            through SORT_COLUMN_MAP and shown verbatim in the chart title.
        selected_models: Model names to plot. When empty or None, the top
            ``max_models`` rows by the resolved metric are used (or the first
            ``max_models`` rows when that column is absent).
        max_models: Maximum number of models drawn.

    Returns:
        A plotly Figure, or the placeholder from create_empty_radar_chart
        when the metric has no per-domain columns.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']
    # Column suffix used by the per-domain breakdown of each average metric.
    metric_suffixes = {
        'Avg AC': 'AC',
        'Avg TSQ': 'TSQ',
        'Avg Total Cost': 'Cost',
        'Avg Session Duration': 'Duration',
        'Avg Turns': 'Turns',
    }

    def _to_float(raw):
        """Return raw as a float, or None when it is missing or not numeric."""
        if raw is None:
            return None
        number = pd.to_numeric(raw, errors='coerce')
        return None if pd.isna(number) else float(number)

    # Map the metric_type to actual column name using shared mapping
    actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type)

    if selected_models is None or len(selected_models) == 0:
        # Default to the top models by the selected metric if available.
        if actual_metric_type in df.columns:
            # Coerce first: nlargest rejects object-dtype columns, which is
            # what a metric column becomes once it holds '' placeholders.
            ranked = df.assign(
                **{actual_metric_type: pd.to_numeric(df[actual_metric_type], errors='coerce')}
            )
            selected_models = ranked.nlargest(max_models, actual_metric_type)['Model'].tolist()
        else:
            selected_models = df.head(max_models)['Model'].tolist()

    # Limit to max_models for readability
    selected_models = list(selected_models)[:max_models]

    # Metrics without a per-domain breakdown get a placeholder figure.
    if actual_metric_type not in metric_suffixes:
        return create_empty_radar_chart(f"Domain breakdown not available for {metric_type}")

    suffix = metric_suffixes[actual_metric_type]
    domain_columns = [f"{domain} {suffix}" for domain in domains]

    fig = go.Figure()

    # Galileo dark theme color scheme
    galileo_dark_colors = [
        {'fill': 'rgba(227, 84, 84, 0.25)', 'line': '#E35454', 'name': 'Vanguard'},   # Vanguard Red
        {'fill': 'rgba(16, 152, 247, 0.15)', 'line': '#1098F7', 'name': 'Airglow'},   # Airglow Blue
        {'fill': 'rgba(245, 246, 247, 0.15)', 'line': '#F5F6F7', 'name': 'Mercury'},  # Light Mercury
        {'fill': 'rgba(227, 84, 84, 0.35)', 'line': '#B94545', 'name': 'Deep Red'},   # Darker Vanguard
        {'fill': 'rgba(16, 152, 247, 0.25)', 'line': '#0A6BC4', 'name': 'Deep Blue'}  # Darker Airglow
    ]

    # Real (non-missing) values seen across all plotted models; used to
    # size the radial axis for metrics that are not 0-1 scores.
    observed_values = []

    for idx, model_name in enumerate(selected_models):
        model_data = df[df['Model'] == model_name]
        if model_data.empty:
            continue

        model_row = model_data.iloc[0]
        values = []

        # Get values for each domain; missing or unparseable cells plot as 0.
        for col in domain_columns:
            number = _to_float(model_row[col]) if col in df.columns else None
            if number is None:
                values.append(0)
            else:
                values.append(number)
                observed_values.append(number)

        # Close the radar chart by repeating first value
        values_plot = values + [values[0]]
        domains_plot = domains + [domains[0]]

        colors = galileo_dark_colors[idx % len(galileo_dark_colors)]

        fig.add_trace(
            go.Scatterpolar(
                r=values_plot,
                theta=domains_plot,
                fill='toself',
                fillcolor=colors['fill'],
                line=dict(
                    color=colors['line'],
                    width=3,
                    shape='spline',
                    smoothing=0.8
                ),
                marker=dict(
                    size=10,
                    color=colors['line'],
                    symbol='circle',
                    line=dict(width=2, color='#01091A')
                ),
                name=model_name,
                mode="lines+markers",
                # NOTE(review): the HTML tags of this template were not
                # recoverable from the reviewed copy; confirm the markup.
                hovertemplate="<b>%{fullData.name}</b><br>" +
                              "%{theta}<br>" +
                              "%{r:.3f}" +
                              "<extra></extra>",
                hoverlabel=dict(
                    bgcolor="rgba(1, 9, 26, 0.95)",
                    bordercolor=colors['line'],
                    font=dict(color="#F5F6F7", size=12, family="'Geist', sans-serif")
                )
            )
        )

    # Determine appropriate range based on metric type
    if actual_metric_type in ['Avg AC', 'Avg TSQ']:
        max_range = 1.0
    else:
        # Cost, Duration and Turns have no fixed scale: use 110% of the data max.
        max_range = max(observed_values) * 1.1 if observed_values else 1.0
        if max_range <= 0:
            # All-zero data would otherwise collapse the axis to [0, 0].
            max_range = 1.0

    # Create custom tick values for better readability
    tick_vals = [i * max_range / 5 for i in range(6)]
    tick_text = [f"{val:.2f}" for val in tick_vals]

    fig.update_layout(
        polar=dict(
            bgcolor='rgba(245, 246, 247, 0.03)',
            radialaxis=dict(
                visible=True,
                range=[0, max_range],
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.1)',
                gridwidth=1,
                tickvals=tick_vals,
                ticktext=tick_text,
                tickfont=dict(
                    size=11,
                    color='#94A3B8',
                    family="'Geist Mono', monospace"
                ),
                tickangle=0
            ),
            angularaxis=dict(
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.08)',
                tickfont=dict(
                    size=14,
                    family="'Geist', sans-serif",
                    color='#F5F6F7',
                    weight=600
                ),
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="v",
            yanchor="middle",
            y=0.5,
            xanchor="left",
            x=1.05,
            font=dict(
                size=12,
                family="'Geist', sans-serif",
                color='#F5F6F7'
            ),
            bgcolor='rgba(1, 9, 26, 0.8)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            itemsizing='constant',
            itemwidth=30
        ),
        title=dict(
            # NOTE(review): bold markup reconstructed; confirm against the original.
            text=f"<b>Domain Performance: {metric_type}</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="'Geist', sans-serif",
                color="#F5F6F7",
                weight=700
            ),
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=1200,
        margin=dict(t=100, b=80, l=80, r=200),
        annotations=[
            dict(
                text="Galileo Agent Leaderboard",
                xref="paper", yref="paper",
                x=0.98, y=0.02,
                xanchor='right', yanchor='bottom',
                font=dict(size=10, color='#64748B'),
                showarrow=False
            )
        ]
    )

    return fig
+
+
def create_empty_radar_chart(message):
    """Create a chart-sized placeholder figure that shows ``message``.

    Args:
        message: Text displayed in the middle of the empty figure.

    Returns:
        A plotly Figure with the dark theme layout, the centered message and
        the leaderboard watermark in the bottom-right corner.
    """
    fig = go.Figure()

    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=1200,
        margin=dict(t=100, b=80, l=80, r=200),
        title=dict(
            # NOTE(review): bold markup reconstructed; confirm against the original.
            text="<b>Domain Performance Chart</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="'Geist', sans-serif",
                color="#F5F6F7",
                weight=700
            ),
        ),
    )

    # Both annotations are added with add_annotation. Passing the watermark
    # through update_layout(annotations=[...]) after the message was added
    # merges it into the existing first annotation, replacing the message
    # text and position with the watermark's.
    fig.add_annotation(
        # NOTE(review): leading emoji restored from a mis-encoded literal; confirm.
        text=f"📊 {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        xanchor='center', yanchor='middle',
        font=dict(
            size=18,
            color="#94A3B8",
            family="'Geist', sans-serif"
        ),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20
    )

    fig.add_annotation(
        text="Galileo Agent Leaderboard",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=10, color='#64748B'),
        showarrow=False
    )

    return fig
+
+
+# NEW VISUALIZATION FUNCTIONS
+
def create_cost_performance_scatter(df, metric="Avg AC"):
    """Create a scatter plot of session cost against a performance metric.

    One trace is drawn per 'Model Type'. Marker size scales with 'Avg Turns'
    and dashed lines mark the medians of both axes.

    Args:
        df: Leaderboard DataFrame. Reads 'Model', 'Model Type',
            'Avg Total Cost', 'Avg Turns' and the ``metric`` column.
        metric: Column plotted on the x axis. "Avg AC" is labelled
            "Action Completion"; any other value is labelled
            "Tool Selection Quality".

    Returns:
        A plotly Figure, or the placeholder from create_empty_chart when no
        row has both a numeric cost and a numeric metric value.
    """
    cost_col = 'Avg Total Cost'
    turns_col = 'Avg Turns'
    empty_message = "No data available for cost-performance analysis"

    if cost_col not in df.columns or metric not in df.columns:
        return create_empty_chart(empty_message)

    # Convert to numeric first, then drop rows without a usable cost or
    # metric. This removes '' placeholders, NaN and unparseable text alike,
    # so none of them reach the medians or the plot.
    df_filtered = df.copy()
    for col in (cost_col, metric, turns_col):
        df_filtered[col] = pd.to_numeric(df_filtered[col], errors='coerce')
    df_filtered = df_filtered.dropna(subset=[cost_col, metric])

    if df_filtered.empty:
        return create_empty_chart(empty_message)

    # Create color mapping for model type
    color_map = {
        'Proprietary': '#E35454',  # Vanguard Red
        'Open source': '#1098F7'   # Airglow Blue
    }

    fig = go.Figure()

    # Add scatter points
    for model_type in df_filtered['Model Type'].unique():
        df_type = df_filtered[df_filtered['Model Type'] == model_type]

        fig.add_trace(go.Scatter(
            x=df_type[metric],
            y=df_type[cost_col],
            mode='markers+text',
            name=model_type,
            text=df_type['Model'],
            textposition="top center",
            textfont=dict(size=10, color='#94A3B8'),
            # Real turn counts for the hover text; marker.size is scaled x3.
            customdata=df_type[turns_col],
            marker=dict(
                size=df_type[turns_col] * 3,  # Size based on number of turns
                color=color_map.get(model_type, '#F5F6F7'),
                opacity=0.8,
                line=dict(width=2, color='#01091A')
            ),
            # NOTE(review): the HTML tags of this template were not
            # recoverable from the reviewed copy; confirm the markup.
            hovertemplate="<b>%{text}</b><br>" +
                          f"{metric}: %{{x:.3f}}<br>" +
                          "Cost: $%{y:.3f}<br>" +
                          "Turns: %{customdata:.1f}<br>" +
                          "<extra></extra>"
        ))

    # Add quadrant lines
    median_x = df_filtered[metric].median()
    median_y = df_filtered[cost_col].median()

    fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)

    # Add quadrant labels
    # NOTE(review): the first label's emoji was mis-encoded and could not be
    # decoded; confirm the glyph.
    fig.add_annotation(x=0.95, y=0.05, text="🚀 High Performance<br>Low Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#F5F6F7"), bgcolor="rgba(245, 246, 247, 0.1)")
    fig.add_annotation(x=0.05, y=0.95, text="⚠️ Low Performance<br>High Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#E35454"), bgcolor="rgba(227, 84, 84, 0.1)")

    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Cost-Performance Efficiency: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display}</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        yaxis=dict(
            title=dict(
                text="<b>Average Session Cost ($)</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=1200,
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            font=dict(size=12, family="'Geist', sans-serif", color='#F5F6F7'),
            bgcolor='rgba(1, 9, 26, 0.8)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1
        ),
        margin=dict(t=100, b=80, l=80, r=80)
    )

    return fig
+
+
def create_speed_accuracy_plot(df, metric="Avg AC"):
    """Create a scatter plot of session duration against a performance metric.

    Each model is one marker: x is 'Avg Session Duration', y is ``metric`` and
    the marker colour encodes 'Avg Total Cost'. Dashed lines mark the median
    of each axis.

    Args:
        df: Leaderboard DataFrame with the columns 'Model',
            'Avg Session Duration', 'Avg Total Cost' and ``metric``.
            Empty strings are treated as missing values.
        metric: Column plotted on the y-axis. "Avg AC" is titled
            "Action Completion"; any other value is titled
            "Tool Selection Quality".

    Returns:
        The Plotly figure, or the result of ``create_empty_chart`` when no
        row has both a duration and a metric value.
    """
    duration_col = 'Avg Session Duration'
    cost_col = 'Avg Total Cost'
    empty_message = "No data available for speed-accuracy analysis"

    # Filter out models without duration or performance data
    df_filtered = df[(df[duration_col] != '') & (df[metric] != '')].copy()
    if df_filtered.empty:
        return create_empty_chart(empty_message)

    # Convert to numeric; anything unparsable becomes NaN
    for col in (duration_col, metric, cost_col):
        df_filtered[col] = pd.to_numeric(df_filtered[col], errors='coerce')

    # Rows that only turned out to be non-numeric after coercion cannot be
    # placed on the chart and would otherwise yield NaN median lines.
    df_filtered = df_filtered.dropna(subset=[duration_col, metric])
    if df_filtered.empty:
        return create_empty_chart(empty_message)

    fig = go.Figure()

    # Add scatter trace, coloured by cost
    fig.add_trace(go.Scatter(
        x=df_filtered[duration_col],
        y=df_filtered[metric],
        mode='markers+text',
        text=df_filtered['Model'],
        textposition="top center",
        textfont=dict(size=9, color='#94A3B8'),
        marker=dict(
            size=12,
            color=df_filtered[cost_col],
            colorscale=[[0, '#01091A'], [0.5, '#1098F7'], [1, '#E35454']],
            showscale=True,
            colorbar=dict(
                title=dict(
                    text="Cost ($)",
                    font=dict(color="#F5F6F7")
                ),
                tickfont=dict(color="#94A3B8"),
                bgcolor="rgba(1, 9, 26, 0.8)",
                bordercolor="rgba(245, 246, 247, 0.2)",
                borderwidth=1,
                x=1.02
            ),
            line=dict(width=2, color='#01091A')
        ),
        # Plotly pseudo-HTML: <br> breaks lines, <extra></extra> hides the
        # secondary hover box.
        hovertemplate=(
            "<b>%{text}</b><br>"
            "Duration: %{x:.1f}s<br>"
            f"{metric}: %{{y:.3f}}<br>"
            "Cost: $%{marker.color:.3f}<br>"
            "<extra></extra>"
        )
    ))

    # Add quadrant lines at the medians
    median_x = df_filtered[duration_col].median()
    median_y = df_filtered[metric].median()

    fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)

    # Add quadrant labels (top-left: short duration, high score)
    fig.add_annotation(x=0.05, y=0.95, text="⚡ Fast & Accurate",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#F5F6F7", weight=600))
    # NOTE(review): the leading glyph below looks mis-encoded; confirm the
    # intended emoji against the original file.
    fig.add_annotation(x=0.95, y=0.05, text="π Slow & Inaccurate",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#E35454", weight=600))

    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Speed vs Accuracy Trade-off: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text="<b>Average Session Duration (seconds)</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        yaxis=dict(
            title=dict(
                text=f"<b>{metric_display}</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=1200,
        margin=dict(t=100, b=80, l=80, r=120)
    )

    return fig
+
+
def _normalize_metric(values, lower_is_better=False):
    """Min-max scale a numeric Series to the 0-1 range.

    Args:
        values: Numeric pandas Series; NaN entries stay NaN.
        lower_is_better: When True the scale is inverted so the smallest
            raw value maps to 1.

    Returns:
        A Series of the same length. When the column has no spread
        (max == min, e.g. a single model) every present value maps to a
        neutral 0.5 instead of the NaN a 0/0 division would produce.
    """
    low = values.min()
    span = values.max() - low
    if pd.isna(span) or span == 0:
        # Keep NaN where data is missing, 0.5 everywhere else.
        return values.where(values.isna(), 0.5)

    scaled = (values - low) / span
    return 1 - scaled if lower_is_better else scaled


def create_performance_heatmap(df):
    """Create a heatmap of normalised metrics (rows) for every model (columns).

    Args:
        df: Leaderboard DataFrame with the columns 'Model', 'Avg AC',
            'Avg TSQ', 'Avg Total Cost', 'Avg Session Duration' and
            'Avg Turns'. Rows whose 'Avg AC' is an empty string are dropped.

    Returns:
        The Plotly figure, or the result of ``create_empty_chart`` when no
        row has an 'Avg AC' value.
    """
    # Raw column -> row label shown on the heatmap
    label_map = {
        'Avg AC': 'Action Completion',
        'Avg TSQ': 'Tool Selection',
        'Avg Total Cost': 'Cost Efficiency',
        'Avg Session Duration': 'Speed',
        'Avg Turns': 'Conversation Efficiency'
    }
    metrics = list(label_map)
    # For cost, duration and turns lower is better, so the scale is inverted
    lower_is_better = {'Avg Total Cost', 'Avg Session Duration', 'Avg Turns'}

    # Filter models with data
    df_filtered = df[df['Avg AC'] != ''].copy()
    if df_filtered.empty:
        return create_empty_chart("No data available for performance heatmap")

    normalized_data = []
    metric_labels = []
    for col in metrics:
        numeric = pd.to_numeric(df_filtered[col], errors='coerce')
        df_filtered[col] = numeric
        normalized_data.append(
            _normalize_metric(numeric, lower_is_better=col in lower_is_better).values
        )
        metric_labels.append(label_map[col])

    # Leave cells without data blank rather than printing "nan"
    cell_text = [
        ["" if pd.isna(val) else f"{val:.2f}" for val in row]
        for row in normalized_data
    ]

    # Create heatmap
    fig = go.Figure(data=go.Heatmap(
        z=normalized_data,
        x=df_filtered['Model'].tolist(),
        y=metric_labels,
        colorscale=[[0, '#01091A'], [0.5, '#1098F7'], [1, '#E35454']],
        # Plotly pseudo-HTML: <br> breaks lines, <extra></extra> hides the
        # secondary hover box.
        hovertemplate=(
            "<b>%{x}</b><br>"
            "%{y}: %{z:.2f}<br>"
            "<extra></extra>"
        ),
        text=cell_text,
        texttemplate="%{text}",
        textfont={"size": 10, "color": "#F5F6F7"},
        showscale=True,
        colorbar=dict(
            title=dict(
                text="Performance<br>Score",
                font=dict(color="#F5F6F7")
            ),
            tickfont=dict(color="#94A3B8"),
            bgcolor="rgba(1, 9, 26, 0.8)",
            bordercolor="rgba(245, 246, 247, 0.2)",
            borderwidth=1
        )
    ))

    fig.update_layout(
        title=dict(
            text="<b>Comprehensive Performance Heatmap</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        ),
        xaxis=dict(
            side="bottom",
            tickfont=dict(size=11, color="#94A3B8"),
            tickangle=-45
        ),
        yaxis=dict(
            tickfont=dict(size=13, color="#F5F6F7", weight=600)
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=600,
        width=1400,
        margin=dict(t=100, b=120, l=170, r=120)
    )

    return fig
+
+
def create_domain_specialization_matrix(df, metric_type="AC"):
    """Create a bubble chart of per-domain scores for every model.

    One bubble is drawn per (model, domain) pair. Bubble size follows the
    domain score; colour follows the "specialization", i.e. the domain score
    minus the model's own 'Avg <metric_type>' value.

    Args:
        df: Leaderboard DataFrame with the columns 'Model', 'Model Type',
            'Avg <metric_type>' and, optionally, '<Domain> <metric_type>'
            for Banking, Healthcare, Insurance, Investment and Telecom.
        metric_type: Column suffix to plot. "AC" is titled
            "Action Completion"; any other value is titled
            "Tool Selection Quality".

    Returns:
        The Plotly figure, or the result of ``create_empty_chart`` when no
        (model, domain) pair has numeric data.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']
    avg_col = f'Avg {metric_type}'

    # Prepare data: one record per model/domain pair with a numeric score
    data = []
    for _, model in df.iterrows():
        if model['Model'] == '':
            continue

        model_avg = pd.to_numeric(model[avg_col], errors='coerce')
        if pd.isna(model_avg):
            continue

        for domain in domains:
            domain_col = f'{domain} {metric_type}'
            if domain_col not in model or model[domain_col] == '':
                continue

            domain_val = pd.to_numeric(model[domain_col], errors='coerce')
            if pd.isna(domain_val):
                continue

            data.append({
                'Model': model['Model'],
                'Domain': domain,
                'Performance': domain_val,
                # Specialization strength: deviation from the model average
                'Specialization': domain_val - model_avg,
                'Model Type': model['Model Type']
            })

    if not data:
        return create_empty_chart("No domain specialization data available")

    df_plot = pd.DataFrame(data)

    hover_text = [
        f"Performance: {p:.3f}<br>Specialization: {s:+.3f}"
        for p, s in zip(df_plot['Performance'], df_plot['Specialization'])
    ]

    # Create bubble chart
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=df_plot['Domain'],
        y=df_plot['Model'],
        mode='markers',
        marker=dict(
            size=df_plot['Performance'] * 30,  # Size based on absolute performance
            color=df_plot['Specialization'],
            colorscale=[[0, '#1098F7'], [0.5, '#F5F6F7'], [1, '#E35454']],
            # Centre the diverging scale on zero so the neutral mid colour
            # means "equal to the model's own average".
            cmid=0,
            showscale=True,
            colorbar=dict(
                title=dict(
                    text="Specialization<br>Strength",
                    font=dict(color="#F5F6F7")
                ),
                tickfont=dict(color="#94A3B8"),
                bgcolor="rgba(1, 9, 26, 0.8)",
                bordercolor="rgba(245, 246, 247, 0.2)",
                borderwidth=1
            ),
            line=dict(width=2, color='#01091A'),
            opacity=0.8
        ),
        text=hover_text,
        # Plotly pseudo-HTML: <br> breaks lines, <extra></extra> hides the
        # secondary hover box.
        hovertemplate=(
            "<b>%{y}</b><br>"
            "Domain: %{x}<br>"
            "%{text}<br>"
            "<extra></extra>"
        )
    ))

    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Domain Specialization Matrix: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text="<b>Business Domains</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=13, color="#F5F6F7"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        yaxis=dict(
            title=dict(
                text="<b>Models</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=11, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=1000,
        width=1200,
        margin=dict(t=100, b=80, l=220, r=120)
    )

    return fig
+
+
def create_performance_gap_analysis(df, metric_type="AC"):
    """Create a range plot of the score spread across models for each domain.

    For every domain column '<Domain> <metric_type>' present in ``df`` the
    chart shows the full min-max range, the interquartile range and the
    median. Domains are ordered by gap (max - min), ascending.

    Args:
        df: Leaderboard DataFrame; non-numeric cells are ignored.
        metric_type: Column suffix to analyse. "AC" is titled
            "Action Completion"; any other value is titled
            "Tool Selection Quality". The x-axis is fixed to [0, 1] for
            "AC" and "TSQ".

    Returns:
        The Plotly figure, or the result of ``create_empty_chart`` when no
        domain column holds numeric data.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']

    # Calculate min, max, median and quartiles for each domain
    gap_data = []
    for domain in domains:
        domain_col = f'{domain} {metric_type}'
        if domain_col not in df.columns:
            continue

        domain_values = pd.to_numeric(df[domain_col], errors='coerce').dropna()
        if domain_values.empty:
            continue

        lowest = domain_values.min()
        highest = domain_values.max()
        gap_data.append({
            'Domain': domain,
            'Min': lowest,
            'Max': highest,
            'Median': domain_values.median(),
            'Q1': domain_values.quantile(0.25),
            'Q3': domain_values.quantile(0.75),
            'Gap': highest - lowest
        })

    if not gap_data:
        return create_empty_chart("No data available for gap analysis")

    df_gap = pd.DataFrame(gap_data).sort_values('Gap', ascending=True)

    fig = go.Figure()

    # Add range bars
    for _, row in df_gap.iterrows():
        domain = row['Domain']

        # Add full range line
        fig.add_trace(go.Scatter(
            x=[row['Min'], row['Max']],
            y=[domain, domain],
            mode='lines',
            line=dict(color='#64748B', width=2),
            showlegend=False,
            hoverinfo='skip'
        ))

        # Add IQR box
        fig.add_trace(go.Scatter(
            x=[row['Q1'], row['Q3'], row['Q3'], row['Q1'], row['Q1']],
            y=[domain] * 5,
            fill='toself',
            fillcolor='rgba(227, 84, 84, 0.3)',
            line=dict(color='#E35454', width=2),
            showlegend=False,
            hoverinfo='skip',
            mode='lines'
        ))

        # Add median marker; it carries the hover box for the whole domain
        fig.add_trace(go.Scatter(
            x=[row['Median']],
            y=[domain],
            mode='markers',
            marker=dict(
                size=12,
                color='#E35454',
                symbol='diamond',
                line=dict(width=2, color='#01091A')
            ),
            showlegend=False,
            # Plotly pseudo-HTML: <br> breaks lines, <extra></extra> hides
            # the secondary hover box.
            hovertemplate=(
                f"<b>{domain}</b><br>"
                f"Min: {row['Min']:.3f}<br>"
                f"Q1: {row['Q1']:.3f}<br>"
                f"Median: {row['Median']:.3f}<br>"
                f"Q3: {row['Q3']:.3f}<br>"
                f"Max: {row['Max']:.3f}<br>"
                f"Gap: {row['Gap']:.3f}<br>"
                "<extra></extra>"
            )
        ))

    # Add min/max points in a second pass so they are drawn on top of the
    # range bars of every domain.
    for _, row in df_gap.iterrows():
        fig.add_trace(go.Scatter(
            x=[row['Min'], row['Max']],
            y=[row['Domain'], row['Domain']],
            mode='markers',
            marker=dict(size=8, color='#F5F6F7', line=dict(width=2, color='#01091A')),
            showlegend=False,
            hoverinfo='skip'
        ))

    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Performance Gap Analysis by Domain: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display} Score</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            range=[0, 1] if metric_type in ['AC', 'TSQ'] else None
        ),
        yaxis=dict(
            title=dict(
                text="<b>Business Domain</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=13, color="#F5F6F7"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=700,
        width=1200,
        margin=dict(t=100, b=80, l=140, r=80),
        showlegend=False
    )

    # Add legend manually
    # NOTE(review): the three marker glyphs in this text look mis-encoded;
    # confirm the intended symbols against the original file.
    fig.add_annotation(
        text="β Median β IQR β Full Range",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=12, color='#94A3B8'),
        showarrow=False
    )

    return fig
+
+
def create_empty_chart(message):
    """Create a placeholder figure that shows only a centred message.

    Args:
        message: Text displayed in the middle of the otherwise empty chart.

    Returns:
        The Plotly figure. The chart builders in this file return this value
        directly when they have nothing to plot, so it must not be None.
    """
    fig = go.Figure()

    # NOTE(review): the leading glyph in the text looks mis-encoded; confirm
    # the intended emoji against the original file.
    fig.add_annotation(
        text=f"π {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        xanchor='center', yanchor='middle',
        font=dict(
            size=18,
            color="#94A3B8",
            family="'Geist', sans-serif"
        ),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20
    )

    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=600,
        width=1200,
        margin=dict(t=80, b=80, l=80, r=80)
    )

    return fig
\ No newline at end of file