""" Compare Screen for TraceMind-AI Side-by-side comparison of two evaluation runs """ import gradio as gr import plotly.graph_objects as go from plotly.subplots import make_subplots from typing import Dict, Any from components.report_cards import generate_comparison_report_card def create_run_comparison_card(run_data: Dict[str, Any], label: str) -> str: """ Create HTML card for a run in comparison view Args: run_data: Dict with run information label: "A" or "B" Returns: HTML string for the card """ model = run_data.get('model', 'Unknown') success_rate = run_data.get('success_rate', 0) total_cost = run_data.get('total_cost_usd', 0) duration = run_data.get('total_duration_ms', 0) / 1000 # Convert to seconds tokens = run_data.get('total_tokens', 0) co2 = run_data.get('co2_emissions_g', 0) return f"""

    <div style="border: 1px solid #ccc; border-radius: 8px; padding: 16px; margin: 8px 0;">
        <h3 style="margin-top: 0;">Run {label}: {model}</h3>
        <p><strong>Success Rate:</strong> {success_rate:.1f}%</p>
        <p><strong>Total Cost:</strong> ${total_cost:.4f}</p>
        <p><strong>Duration:</strong> {duration:.2f}s</p>
        <p><strong>Tokens:</strong> {tokens:,}</p>
        <p><strong>CO2:</strong> {co2:.2f}g</p>
    </div>
""" def create_comparison_charts(run_a: Dict[str, Any], run_b: Dict[str, Any]) -> go.Figure: """ Create comparison charts for two runs Args: run_a: First run data dict run_b: Second run data dict Returns: Plotly figure with comparison charts """ try: # Extract metrics metrics = { 'Success Rate (%)': [run_a.get('success_rate', 0), run_b.get('success_rate', 0)], 'Cost ($)': [run_a.get('total_cost_usd', 0), run_b.get('total_cost_usd', 0)], 'Duration (s)': [run_a.get('total_duration_ms', 0) / 1000, run_b.get('total_duration_ms', 0) / 1000], 'Tokens': [run_a.get('total_tokens', 0), run_b.get('total_tokens', 0)], 'CO2 (g)': [run_a.get('co2_emissions_g', 0), run_b.get('co2_emissions_g', 0)] } # Create subplots fig = make_subplots( rows=2, cols=3, subplot_titles=list(metrics.keys()), specs=[[{"type": "bar"}, {"type": "bar"}, {"type": "bar"}], [{"type": "bar"}, {"type": "bar"}, {"type": "indicator"}]], vertical_spacing=0.15, horizontal_spacing=0.1 ) model_a = run_a.get('model', 'Run A') model_b = run_b.get('model', 'Run B') # Add bar charts for each metric positions = [(1, 1), (1, 2), (1, 3), (2, 1), (2, 2)] colors_a = ['#667eea', '#667eea', '#667eea', '#667eea', '#667eea'] colors_b = ['#764ba2', '#764ba2', '#764ba2', '#764ba2', '#764ba2'] for idx, (metric_name, values) in enumerate(metrics.items()): if idx < 5: # First 5 metrics row, col = positions[idx] fig.add_trace( go.Bar( name=model_a, x=[model_a], y=[values[0]], marker_color=colors_a[idx], text=[f"{values[0]:.2f}"], textposition='auto', showlegend=(idx == 0) ), row=row, col=col ) fig.add_trace( go.Bar( name=model_b, x=[model_b], y=[values[1]], marker_color=colors_b[idx], text=[f"{values[1]:.2f}"], textposition='auto', showlegend=(idx == 0) ), row=row, col=col ) fig.update_layout( height=600, showlegend=True, legend=dict( orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1 ), margin=dict(l=50, r=50, t=80, b=50) ) return fig except Exception as e: print(f"[ERROR] Creating comparison charts: {e}") fig = go.Figure() fig.add_annotation(text=f"Error creating charts: {str(e)}", showarrow=False) return fig def generate_winner_summary(run_a: Dict[str, Any], run_b: Dict[str, Any]) -> str: """ Generate winner summary comparing two runs Args: run_a: First run data dict run_b: Second run data dict Returns: Markdown string with winner analysis """ model_a = run_a.get('model', 'Run A') model_b = run_b.get('model', 'Run B') # Compare metrics winners = { 'accuracy': model_a if run_a.get('success_rate', 0) > run_b.get('success_rate', 0) else model_b, 'cost': model_a if run_a.get('total_cost_usd', 999) < run_b.get('total_cost_usd', 999) else model_b, 'speed': model_a if run_a.get('total_duration_ms', 999999) < run_b.get('total_duration_ms', 999999) else model_b, 'eco': model_a if run_a.get('co2_emissions_g', 999) < run_b.get('co2_emissions_g', 999) else model_b } # Count wins a_wins = sum(1 for w in winners.values() if w == model_a) b_wins = sum(1 for w in winners.values() if w == model_b) overall_winner = model_a if a_wins > b_wins else model_b if b_wins > a_wins else "Tie" return f""" ### Category Winners | Category | Winner | Metric | |----------|--------|--------| | **Accuracy** | **{winners['accuracy']}** | {run_a.get('success_rate', 0):.1f}% vs {run_b.get('success_rate', 0):.1f}% | | **Cost** | **{winners['cost']}** | ${run_a.get('total_cost_usd', 0):.4f} vs ${run_b.get('total_cost_usd', 0):.4f} | | **Speed** | **{winners['speed']}** | {run_a.get('total_duration_ms', 0)/1000:.2f}s vs {run_b.get('total_duration_ms', 0)/1000:.2f}s | | 
| **Eco-Friendly** | **{winners['eco']}** | {run_a.get('co2_emissions_g', 0):.2f}g vs {run_b.get('co2_emissions_g', 0):.2f}g |

---

### Overall Winner: **{overall_winner}**

**{model_a}** wins {a_wins} categories

**{model_b}** wins {b_wins} categories

### Recommendation

{f"**{model_a}** is the better choice for most use cases" if a_wins > b_wins else f"**{model_b}** is the better choice for most use cases" if b_wins > a_wins else "Both runs are evenly matched - choose based on your specific priorities"}
"""


def create_compare_ui():
    """
    Create the compare screen UI components

    Returns:
        Tuple of (screen_column, component_dict)
    """
    components = {}

    with gr.Column(visible=False) as compare_screen:
        gr.Markdown("# Compare Runs")
        gr.Markdown("*Side-by-side comparison of two evaluation runs*")

        components['back_to_leaderboard_btn'] = gr.Button(
            "⬅️ Back to Leaderboard",
            variant="secondary",
            size="sm"
        )

        gr.Markdown("## Select Runs to Compare")

        with gr.Row():
            with gr.Column():
                components['compare_run_a_dropdown'] = gr.Dropdown(
                    label="Run A",
                    choices=[],
                    interactive=True,
                    info="Select the first evaluation run for comparison"
                )
            with gr.Column():
                components['compare_run_b_dropdown'] = gr.Dropdown(
                    label="Run B",
                    choices=[],
                    interactive=True,
                    info="Select the second evaluation run for comparison"
                )

        components['compare_button'] = gr.Button(
            "Compare Selected Runs",
            variant="primary",
            size="lg"
        )

        # Comparison results
        with gr.Column(visible=False) as comparison_output:
            gr.Markdown("## Comparison Results")

            with gr.Tabs():
                with gr.TabItem("Side-by-Side"):
                    # Side-by-side metrics
                    with gr.Row():
                        with gr.Column():
                            gr.Markdown("### Run A")
                            components['run_a_card'] = gr.HTML()
                        with gr.Column():
                            gr.Markdown("### Run B")
                            components['run_b_card'] = gr.HTML()

                    # Comparison charts
                    gr.Markdown("## Metric Comparisons")
                    components['comparison_charts'] = gr.Plot(
                        label="Comparison Charts",
                        show_label=False
                    )

                    # Winner summary
                    gr.Markdown("## Winner Summary")
                    components['winner_summary'] = gr.Markdown()

                with gr.TabItem("Radar Comparison"):
                    gr.Markdown("""
                    ### Multi-Dimensional Comparison

                    Compare runs across **6 normalized dimensions**:
                    - **Success Rate**: Percentage of successful test cases
                    - **Speed**: Execution time (faster is better)
                    - **Cost Efficiency**: Dollar cost per test (cheaper is better)
                    - **Token Efficiency**: Success per 1000 tokens
                    - **CO2 Efficiency**: Environmental impact (lower is better)
                    - **GPU Utilization**: Resource usage (if applicable)
                    """)

                    components['radar_comparison_chart'] = gr.Plot(
                        label="Multi-Dimensional Radar Chart",
                        show_label=False
                    )

                with gr.TabItem("📄 Report Card"):
                    gr.Markdown("### 📥 Downloadable Comparison Report Card")
                    gr.Markdown("*Side-by-side comparison card with winner analysis*")

                    with gr.Row():
                        with gr.Column(scale=1):
                            components['download_comparison_card_btn'] = gr.Button(
                                "📥 Download as PNG",
                                variant="primary",
                                size="lg"
                            )
                        with gr.Column(scale=2):
                            components['comparison_card_html'] = gr.HTML(
                                label="Comparison Report Card",
                                elem_id="comparison-card-html"
                            )

                with gr.TabItem("🤖 AI Insights"):
                    gr.Markdown("### AI-Powered Comparison Analysis")
                    gr.Markdown("*Get intelligent insights about the differences between these runs using the MCP server*")

                    with gr.Row():
                        components['comparison_focus'] = gr.Dropdown(
                            label="Analysis Focus",
                            choices=["comprehensive", "cost", "performance", "eco_friendly"],
                            value="comprehensive",
                            info="Choose what aspect to focus on in the AI analysis"
                        )
                        components['generate_ai_comparison_btn'] = gr.Button(
                            "🤖 Generate AI Insights",
                            variant="primary",
                            size="lg"
                        )

                    components['ai_comparison_insights'] = gr.Markdown(
                        "*Click 'Generate AI Insights' to get intelligent analysis powered by the MCP server*"
                    )

    components['comparison_output'] = comparison_output

    return compare_screen, components
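

# The Run A / Run B dropdowns above are created with empty choices; the caller is
# expected to fill them with composite "run_id|timestamp" values, which
# on_compare_runs() below splits back apart. A minimal sketch of such a helper
# (hypothetical name; the real app may build these choices elsewhere):
def build_run_choices(leaderboard_df) -> list:
    """Build dropdown choices as 'run_id|timestamp' composite keys."""
    if leaderboard_df is None or leaderboard_df.empty:
        return []
    return [f"{row.run_id}|{row.timestamp}" for row in leaderboard_df.itertuples()]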

def on_compare_runs(run_a_id: str, run_b_id: str, leaderboard_df, components: Dict):
    """
    Handle comparison of two runs

    Args:
        run_a_id: ID of first run
        run_b_id: ID of second run
        leaderboard_df: Full leaderboard dataframe
        components: Dictionary of Gradio components

    Returns:
        Dictionary of component updates
    """
    try:
        if not run_a_id or not run_b_id:
            gr.Warning("Please select two runs to compare")
            return {
                components['comparison_output']: gr.update(visible=False)
            }

        if run_a_id == run_b_id:
            gr.Warning("Please select two different runs")
            return {
                components['comparison_output']: gr.update(visible=False)
            }

        if leaderboard_df is None or leaderboard_df.empty:
            gr.Warning("Leaderboard data not loaded")
            return {
                components['comparison_output']: gr.update(visible=False)
            }

        # Parse composite keys (run_id|timestamp)
        run_a_parts = run_a_id.split('|')
        run_b_parts = run_b_id.split('|')

        if len(run_a_parts) != 2 or len(run_b_parts) != 2:
            gr.Warning("Invalid run selection")
            return {
                components['comparison_output']: gr.update(visible=False)
            }

        run_a_id_parsed, run_a_timestamp = run_a_parts
        run_b_id_parsed, run_b_timestamp = run_b_parts

        # Debug logging
        print(f"[COMPARE DEBUG] Looking for Run A:")
        print(f"  run_id: {run_a_id_parsed} (type: {type(run_a_id_parsed)})")
        print(f"  timestamp: {run_a_timestamp} (type: {type(run_a_timestamp)})")
        print(f"[COMPARE DEBUG] Looking for Run B:")
        print(f"  run_id: {run_b_id_parsed} (type: {type(run_b_id_parsed)})")
        print(f"  timestamp: {run_b_timestamp} (type: {type(run_b_timestamp)})")
        print(f"[COMPARE DEBUG] Leaderboard dataframe timestamp column type: {leaderboard_df['timestamp'].dtype}")
        print(f"[COMPARE DEBUG] Sample timestamps from leaderboard:")
        for idx, ts in enumerate(leaderboard_df['timestamp'].head(3)):
            print(f"  [{idx}] {ts} (type: {type(ts)})")

        # Check if run_ids exist first
        run_a_by_id = leaderboard_df[leaderboard_df['run_id'] == run_a_id_parsed]
        run_b_by_id = leaderboard_df[leaderboard_df['run_id'] == run_b_id_parsed]

        print(f"[COMPARE DEBUG] Runs matching run_id only:")
        print(f"  Run A matches: {len(run_a_by_id)}")
        if len(run_a_by_id) > 0:
            print(f"  Timestamps: {run_a_by_id['timestamp'].tolist()}")
        print(f"  Run B matches: {len(run_b_by_id)}")
        if len(run_b_by_id) > 0:
            print(f"  Timestamps: {run_b_by_id['timestamp'].tolist()}")

        # Find the runs in the dataframe using both run_id and timestamp
        run_a_match = leaderboard_df[
            (leaderboard_df['run_id'] == run_a_id_parsed) &
            (leaderboard_df['timestamp'] == run_a_timestamp)
        ]
        run_b_match = leaderboard_df[
            (leaderboard_df['run_id'] == run_b_id_parsed) &
            (leaderboard_df['timestamp'] == run_b_timestamp)
        ]

        print(f"[COMPARE DEBUG] Final matches: Run A={len(run_a_match)}, Run B={len(run_b_match)}")

        if run_a_match.empty or run_b_match.empty:
            gr.Warning("Could not find selected runs in leaderboard data")
            return {
                components['comparison_output']: gr.update(visible=False)
            }

        run_a = run_a_match.iloc[0].to_dict()
        run_b = run_b_match.iloc[0].to_dict()

        # Create comparison visualizations
        card_a = create_run_comparison_card(run_a, "A")
        card_b = create_run_comparison_card(run_b, "B")
        charts = create_comparison_charts(run_a, run_b)
        summary = generate_winner_summary(run_a, run_b)

        # Create radar chart for multi-dimensional comparison
        from components.analytics_charts import create_comparison_radar
        radar_chart = create_comparison_radar([run_a, run_b])

        # Generate comparison report card
        comparison_card = generate_comparison_report_card(run_a, run_b)

        return {
            components['comparison_output']: gr.update(visible=True),
            components['run_a_card']: gr.update(value=card_a),
            components['run_b_card']: gr.update(value=card_b),
            components['comparison_charts']: gr.update(value=charts),
            components['winner_summary']: gr.update(value=summary),
            components['radar_comparison_chart']: gr.update(value=radar_chart),
            components['comparison_card_html']: gr.update(value=comparison_card)
        }

    except Exception as e:
        print(f"[ERROR] Comparing runs: {e}")
        import traceback
        traceback.print_exc()
        gr.Warning(f"Error comparing runs: {str(e)}")
        return {
            components['comparison_output']: gr.update(visible=False)
        }
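

# ---------------------------------------------------------------------------
# Standalone usage sketch (illustrative only). It assumes the repo's
# `components` package is importable and that the real app wires this screen
# together elsewhere alongside the leaderboard; the sample rows and values
# below are made up to show the expected column names.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import pandas as pd

    # Hypothetical leaderboard rows with the columns this module reads
    sample_df = pd.DataFrame([
        {
            "run_id": "run_001", "timestamp": "2024-01-01T10:00:00",
            "model": "model-a", "success_rate": 92.5, "total_cost_usd": 0.0421,
            "total_duration_ms": 12840, "total_tokens": 15872, "co2_emissions_g": 0.84,
        },
        {
            "run_id": "run_002", "timestamp": "2024-01-02T11:30:00",
            "model": "model-b", "success_rate": 88.0, "total_cost_usd": 0.0187,
            "total_duration_ms": 9210, "total_tokens": 11034, "co2_emissions_g": 0.51,
        },
    ])

    choices = build_run_choices(sample_df)

    with gr.Blocks() as demo:
        screen, comps = create_compare_ui()

        # Show the (normally hidden) screen and populate both dropdowns on load
        demo.load(
            lambda: (
                gr.update(visible=True),
                gr.update(choices=choices),
                gr.update(choices=choices),
            ),
            outputs=[screen, comps['compare_run_a_dropdown'], comps['compare_run_b_dropdown']],
        )

        # on_compare_runs returns a dict keyed by components, so list those
        # components as the event outputs
        comps['compare_button'].click(
            fn=lambda a, b: on_compare_runs(a, b, sample_df, comps),
            inputs=[comps['compare_run_a_dropdown'], comps['compare_run_b_dropdown']],
            outputs=[
                comps['comparison_output'],
                comps['run_a_card'],
                comps['run_b_card'],
                comps['comparison_charts'],
                comps['winner_summary'],
                comps['radar_comparison_chart'],
                comps['comparison_card_html'],
            ],
        )

    demo.launch()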