# TraceMind/screens/compare.py
"""
Compare Screen for TraceMind-AI
Side-by-side comparison of two evaluation runs
"""
import gradio as gr
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from typing import Dict, Any
from components.report_cards import generate_comparison_report_card
def create_run_comparison_card(run_data: Dict[str, Any], label: str) -> str:
"""
Create HTML card for a run in comparison view
Args:
run_data: Dict with run information
label: "A" or "B"
Returns:
HTML string for the card
"""
model = run_data.get('model', 'Unknown')
success_rate = run_data.get('success_rate', 0)
total_cost = run_data.get('total_cost_usd', 0)
duration = run_data.get('total_duration_ms', 0) / 1000 # Convert to seconds
tokens = run_data.get('total_tokens', 0)
co2 = run_data.get('co2_emissions_g', 0)
return f"""
<div style="background: linear-gradient(135deg, {'#667eea' if label == 'A' else '#764ba2'} 0%, {'#764ba2' if label == 'A' else '#f093fb'} 100%);
padding: 25px;
border-radius: 12px;
box-shadow: 0 4px 12px rgba(0,0,0,0.2);
color: white;">
<h3 style="margin-top: 0;">Run {label}: {model}</h3>
<div style="margin: 20px 0;">
<div style="display: flex; justify-content: space-between; margin: 10px 0;">
<span>Success Rate:</span>
<strong>{success_rate:.1f}%</strong>
</div>
<div style="display: flex; justify-content: space-between; margin: 10px 0;">
<span>Total Cost:</span>
<strong>${total_cost:.4f}</strong>
</div>
<div style="display: flex; justify-content: space-between; margin: 10px 0;">
<span>Duration:</span>
<strong>{duration:.2f}s</strong>
</div>
<div style="display: flex; justify-content: space-between; margin: 10px 0;">
<span>Tokens:</span>
<strong>{tokens:,}</strong>
</div>
<div style="display: flex; justify-content: space-between; margin: 10px 0;">
<span>CO2:</span>
<strong>{co2:.2f}g</strong>
</div>
</div>
</div>
"""
def create_comparison_charts(run_a: Dict[str, Any], run_b: Dict[str, Any]) -> go.Figure:
"""
Create comparison charts for two runs
Args:
run_a: First run data dict
run_b: Second run data dict
Returns:
Plotly figure with comparison charts
"""
try:
# Extract metrics
metrics = {
'Success Rate (%)': [run_a.get('success_rate', 0), run_b.get('success_rate', 0)],
'Cost ($)': [run_a.get('total_cost_usd', 0), run_b.get('total_cost_usd', 0)],
'Duration (s)': [run_a.get('total_duration_ms', 0) / 1000, run_b.get('total_duration_ms', 0) / 1000],
'Tokens': [run_a.get('total_tokens', 0), run_b.get('total_tokens', 0)],
'CO2 (g)': [run_a.get('co2_emissions_g', 0), run_b.get('co2_emissions_g', 0)]
}
# Create subplots
fig = make_subplots(
rows=2, cols=3,
subplot_titles=list(metrics.keys()),
specs=[[{"type": "bar"}, {"type": "bar"}, {"type": "bar"}],
[{"type": "bar"}, {"type": "bar"}, {"type": "indicator"}]],
vertical_spacing=0.15,
horizontal_spacing=0.1
)
model_a = run_a.get('model', 'Run A')
model_b = run_b.get('model', 'Run B')
# Add bar charts for each metric
positions = [(1, 1), (1, 2), (1, 3), (2, 1), (2, 2)]
        colors_a = ['#667eea'] * 5  # Run A bars all use the same brand color
        colors_b = ['#764ba2'] * 5  # Run B bars all use the same brand color
for idx, (metric_name, values) in enumerate(metrics.items()):
if idx < 5: # First 5 metrics
row, col = positions[idx]
fig.add_trace(
go.Bar(
name=model_a,
x=[model_a],
y=[values[0]],
marker_color=colors_a[idx],
text=[f"{values[0]:.2f}"],
textposition='auto',
showlegend=(idx == 0)
),
row=row, col=col
)
fig.add_trace(
go.Bar(
name=model_b,
x=[model_b],
y=[values[1]],
marker_color=colors_b[idx],
text=[f"{values[1]:.2f}"],
textposition='auto',
showlegend=(idx == 0)
),
row=row, col=col
)
fig.update_layout(
height=600,
showlegend=True,
legend=dict(
orientation="h",
yanchor="bottom",
y=1.02,
xanchor="right",
x=1
),
margin=dict(l=50, r=50, t=80, b=50)
)
return fig
except Exception as e:
print(f"[ERROR] Creating comparison charts: {e}")
fig = go.Figure()
fig.add_annotation(text=f"Error creating charts: {str(e)}", showarrow=False)
return fig
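# Usage sketch (illustrative): the comparison figure can be previewed outside Gradio during
# development, assuming two run dicts shaped like the ones the leaderboard provides.
#
#     fig = create_comparison_charts(run_a_dict, run_b_dict)
#     fig.show()  # standard Plotly rendering in a browser / notebook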
def generate_winner_summary(run_a: Dict[str, Any], run_b: Dict[str, Any]) -> str:
"""
Generate winner summary comparing two runs
Args:
run_a: First run data dict
run_b: Second run data dict
Returns:
Markdown string with winner analysis
"""
model_a = run_a.get('model', 'Run A')
model_b = run_b.get('model', 'Run B')
# Compare metrics
winners = {
'accuracy': model_a if run_a.get('success_rate', 0) > run_b.get('success_rate', 0) else model_b,
'cost': model_a if run_a.get('total_cost_usd', 999) < run_b.get('total_cost_usd', 999) else model_b,
'speed': model_a if run_a.get('total_duration_ms', 999999) < run_b.get('total_duration_ms', 999999) else model_b,
'eco': model_a if run_a.get('co2_emissions_g', 999) < run_b.get('co2_emissions_g', 999) else model_b
}
# Count wins
a_wins = sum(1 for w in winners.values() if w == model_a)
b_wins = sum(1 for w in winners.values() if w == model_b)
    overall_winner = model_a if a_wins > b_wins else model_b if b_wins > a_wins else "Tie"
    # Build the recommendation up front so the returned Markdown stays easy to read
    if a_wins > b_wins:
        recommendation = f"**{model_a}** is the better choice for most use cases"
    elif b_wins > a_wins:
        recommendation = f"**{model_b}** is the better choice for most use cases"
    else:
        recommendation = "Both runs are evenly matched - choose based on your specific priorities"
    return f"""
### Category Winners

| Category | Winner | Metric |
|----------|--------|--------|
| **Accuracy** | **{winners['accuracy']}** | {run_a.get('success_rate', 0):.1f}% vs {run_b.get('success_rate', 0):.1f}% |
| **Cost** | **{winners['cost']}** | ${run_a.get('total_cost_usd', 0):.4f} vs ${run_b.get('total_cost_usd', 0):.4f} |
| **Speed** | **{winners['speed']}** | {run_a.get('total_duration_ms', 0)/1000:.2f}s vs {run_b.get('total_duration_ms', 0)/1000:.2f}s |
| **Eco-Friendly** | **{winners['eco']}** | {run_a.get('co2_emissions_g', 0):.2f}g vs {run_b.get('co2_emissions_g', 0):.2f}g |

---

### Overall Winner: **{overall_winner}**

**{model_a}** wins {a_wins} of 4 categories

**{model_b}** wins {b_wins} of 4 categories

### Recommendation

{recommendation}
"""
def create_compare_ui():
"""
Create the compare screen UI components
Returns:
Tuple of (screen_column, component_dict)
"""
components = {}
with gr.Column(visible=False) as compare_screen:
gr.Markdown("# Compare Runs")
gr.Markdown("*Side-by-side comparison of two evaluation runs*")
components['back_to_leaderboard_btn'] = gr.Button(
"⬅️ Back to Leaderboard",
variant="secondary",
size="sm"
)
gr.Markdown("## Select Runs to Compare")
with gr.Row():
with gr.Column():
components['compare_run_a_dropdown'] = gr.Dropdown(
label="Run A",
choices=[],
interactive=True,
info="Select the first evaluation run for comparison"
)
with gr.Column():
components['compare_run_b_dropdown'] = gr.Dropdown(
label="Run B",
choices=[],
interactive=True,
info="Select the second evaluation run for comparison"
)
components['compare_button'] = gr.Button(
"Compare Selected Runs",
variant="primary",
size="lg"
)
# Comparison results
with gr.Column(visible=False) as comparison_output:
gr.Markdown("## Comparison Results")
with gr.Tabs():
with gr.TabItem("Side-by-Side"):
# Side-by-side metrics
with gr.Row():
with gr.Column():
gr.Markdown("### Run A")
components['run_a_card'] = gr.HTML()
with gr.Column():
gr.Markdown("### Run B")
components['run_b_card'] = gr.HTML()
# Comparison charts
gr.Markdown("## Metric Comparisons")
components['comparison_charts'] = gr.Plot(
label="Comparison Charts",
show_label=False
)
# Winner summary
gr.Markdown("## Winner Summary")
components['winner_summary'] = gr.Markdown()
with gr.TabItem("Radar Comparison"):
gr.Markdown("""
### Multi-Dimensional Comparison
Compare runs across **6 normalized dimensions**:
- **Success Rate**: Percentage of successful test cases
- **Speed**: Execution time (faster is better)
- **Cost Efficiency**: Dollar cost per test (cheaper is better)
- **Token Efficiency**: Success per 1000 tokens
- **CO2 Efficiency**: Environmental impact (lower is better)
- **GPU Utilization**: Resource usage (if applicable)
""")
components['radar_comparison_chart'] = gr.Plot(
label="Multi-Dimensional Radar Chart",
show_label=False
)
with gr.TabItem("📄 Report Card"):
gr.Markdown("### 📥 Downloadable Comparison Report Card")
gr.Markdown("*Side-by-side comparison card with winner analysis*")
with gr.Row():
with gr.Column(scale=1):
components['download_comparison_card_btn'] = gr.Button(
"📥 Download as PNG",
variant="primary",
size="lg"
)
with gr.Column(scale=2):
components['comparison_card_html'] = gr.HTML(
label="Comparison Report Card",
elem_id="comparison-card-html"
)
components['comparison_output'] = comparison_output
return compare_screen, components
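# Note (illustrative): the dropdown choices are assumed to be populated elsewhere (e.g. when
# the leaderboard loads) with composite "run_id|timestamp" values, which is the format that
# on_compare_runs() parses below:
#
#     choices = [
#         (f"{row['model']} ({row['timestamp']})", f"{row['run_id']}|{row['timestamp']}")
#         for _, row in leaderboard_df.iterrows()
#     ]
#     gr.update(choices=choices)  # applied to compare_run_a_dropdown / compare_run_b_dropdown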
def on_compare_runs(run_a_id: str, run_b_id: str, leaderboard_df, components: Dict):
"""
Handle comparison of two runs
Args:
run_a_id: ID of first run
run_b_id: ID of second run
leaderboard_df: Full leaderboard dataframe
components: Dictionary of Gradio components
Returns:
Dictionary of component updates
"""
try:
if not run_a_id or not run_b_id:
gr.Warning("Please select two runs to compare")
return {
components['comparison_output']: gr.update(visible=False)
}
if run_a_id == run_b_id:
gr.Warning("Please select two different runs")
return {
components['comparison_output']: gr.update(visible=False)
}
if leaderboard_df is None or leaderboard_df.empty:
gr.Warning("Leaderboard data not loaded")
return {
components['comparison_output']: gr.update(visible=False)
}
# Parse composite keys (run_id|timestamp)
run_a_parts = run_a_id.split('|')
run_b_parts = run_b_id.split('|')
if len(run_a_parts) != 2 or len(run_b_parts) != 2:
gr.Warning("Invalid run selection")
return {
components['comparison_output']: gr.update(visible=False)
}
run_a_id_parsed, run_a_timestamp = run_a_parts
run_b_id_parsed, run_b_timestamp = run_b_parts
# Find the runs in the dataframe using both run_id and timestamp
run_a_match = leaderboard_df[
(leaderboard_df['run_id'] == run_a_id_parsed) &
(leaderboard_df['timestamp'] == run_a_timestamp)
]
run_b_match = leaderboard_df[
(leaderboard_df['run_id'] == run_b_id_parsed) &
(leaderboard_df['timestamp'] == run_b_timestamp)
]
if run_a_match.empty or run_b_match.empty:
gr.Warning("Could not find selected runs in leaderboard data")
return {
components['comparison_output']: gr.update(visible=False)
}
run_a = run_a_match.iloc[0].to_dict()
run_b = run_b_match.iloc[0].to_dict()
# Create comparison visualizations
card_a = create_run_comparison_card(run_a, "A")
card_b = create_run_comparison_card(run_b, "B")
charts = create_comparison_charts(run_a, run_b)
summary = generate_winner_summary(run_a, run_b)
# Create radar chart for multi-dimensional comparison
from components.analytics_charts import create_comparison_radar
radar_chart = create_comparison_radar([run_a, run_b])
# Generate comparison report card
comparison_card = generate_comparison_report_card(run_a, run_b)
return {
components['comparison_output']: gr.update(visible=True),
components['run_a_card']: gr.update(value=card_a),
components['run_b_card']: gr.update(value=card_b),
components['comparison_charts']: gr.update(value=charts),
components['winner_summary']: gr.update(value=summary),
components['radar_comparison_chart']: gr.update(value=radar_chart),
components['comparison_card_html']: gr.update(value=comparison_card)
}
except Exception as e:
print(f"[ERROR] Comparing runs: {e}")
import traceback
traceback.print_exc()
gr.Warning(f"Error comparing runs: {str(e)}")
return {
components['comparison_output']: gr.update(visible=False)
}
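# Wiring sketch (illustrative): in the parent Gradio app, these pieces are assumed to be
# connected roughly like this, with `leaderboard_state` being a gr.State holding the full
# leaderboard DataFrame:
#
#     compare_screen, cmp = create_compare_ui()
#     cmp['compare_button'].click(
#         fn=lambda a, b, df: on_compare_runs(a, b, df, cmp),
#         inputs=[cmp['compare_run_a_dropdown'],
#                 cmp['compare_run_b_dropdown'],
#                 leaderboard_state],
#         outputs=list(cmp.values()),  # dict-style returns update only the listed components
#     )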