feat: refactor Device Duel
Browse files- src/components/device_comparison.py +611 -103
- src/components/header.py +1 -1
src/components/device_comparison.py
CHANGED
@@ -1,11 +1,392 @@
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
|
|
3 |
from typing import List, Optional
|
4 |
|
5 |
from ..core.glicko2_ranking import analyze_device_glicko2_matches
|
6 |
from ..components.visualizations import clean_device_id
|
7 |
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
def render_device_comparison(df: pd.DataFrame, normalized_device_ids: List[str]):
|
10 |
"""
|
11 |
Render a component for comparing two devices and analyzing their matches.
|
@@ -16,41 +397,122 @@ def render_device_comparison(df: pd.DataFrame, normalized_device_ids: List[str])
|
|
16 |
"""
|
17 |
st.title("⚔️ Device Duel Arena")
|
18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
# Create mapping of normalized IDs to display names
|
20 |
device_display_names = {
|
21 |
device_id: clean_device_id(device_id) for device_id in normalized_device_ids
|
22 |
}
|
23 |
|
24 |
-
# Create two columns for device selection
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
with col1:
|
|
|
|
|
|
|
|
|
28 |
device1 = st.selectbox(
|
29 |
-
"
|
30 |
options=normalized_device_ids,
|
31 |
format_func=lambda x: device_display_names[x],
|
32 |
key="device_compare_1",
|
|
|
|
|
33 |
)
|
34 |
|
35 |
with col2:
|
|
|
|
|
|
|
|
|
36 |
device2 = st.selectbox(
|
37 |
-
"
|
38 |
options=normalized_device_ids,
|
39 |
format_func=lambda x: device_display_names[x],
|
40 |
key="device_compare_2",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
)
|
42 |
|
43 |
-
|
44 |
-
if st.button("Start Duel", key="analyze_matches_btn"):
|
45 |
# Validate device selection
|
46 |
-
if device1
|
|
|
|
|
|
|
47 |
st.error("Please select two different devices to compare.")
|
48 |
return
|
49 |
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
with st.spinner(
|
53 |
-
f"
|
54 |
):
|
55 |
try:
|
56 |
# Analyze matches using Glicko-2
|
@@ -60,117 +522,163 @@ def render_device_comparison(df: pd.DataFrame, normalized_device_ids: List[str])
|
|
60 |
# Show summary statistics
|
61 |
total_matches = len(matches_df)
|
62 |
|
63 |
-
# Set up metrics
|
64 |
-
col1, col2, col3 = st.columns(3)
|
65 |
-
|
66 |
-
with col1:
|
67 |
-
st.metric("Total Matches", total_matches)
|
68 |
-
|
69 |
# Check for required columns before calculating metrics
|
70 |
if (
|
71 |
"Token Winner" in matches_df.columns
|
72 |
and "Prompt Winner" in matches_df.columns
|
|
|
73 |
):
|
74 |
token_wins_1 = sum(matches_df["Token Winner"] == device1)
|
75 |
prompt_wins_1 = sum(matches_df["Prompt Winner"] == device1)
|
|
|
76 |
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
|
88 |
-
#
|
89 |
-
|
90 |
-
combined_wins_1
|
91 |
-
|
92 |
-
|
93 |
-
st.metric(
|
94 |
-
f"{device_display_names[device1]}'s Combined Wins",
|
95 |
-
f"{combined_wins_1} ({combined_wins_1/total_matches*100:.1f}%)",
|
96 |
-
)
|
97 |
-
else:
|
98 |
-
st.warning("Winner information is missing from the match data.")
|
99 |
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
"Token Win Prob",
|
110 |
-
"Prompt Processing 1",
|
111 |
-
"Prompt Processing 2",
|
112 |
-
"Prompt Winner",
|
113 |
-
"Prompt Win Prob",
|
114 |
-
"Combined Winner",
|
115 |
-
"Combined Win Prob",
|
116 |
-
"Platform 1",
|
117 |
-
"Platform 2",
|
118 |
-
]
|
119 |
-
|
120 |
-
# Ensure all columns exist in the dataframe
|
121 |
-
valid_cols = [
|
122 |
-
col for col in display_cols if col in matches_df.columns
|
123 |
-
]
|
124 |
-
|
125 |
-
if valid_cols:
|
126 |
-
# Rename some columns for better display
|
127 |
-
matches_display = matches_df[valid_cols].copy()
|
128 |
-
|
129 |
-
# Define a rename mapping but only apply for columns that exist
|
130 |
-
rename_mapping = {
|
131 |
-
"Token Generation 1": f"{device_display_names[device1]} Token Gen",
|
132 |
-
"Token Generation 2": f"{device_display_names[device2]} Token Gen",
|
133 |
-
"Prompt Processing 1": f"{device_display_names[device1]} Prompt Proc",
|
134 |
-
"Prompt Processing 2": f"{device_display_names[device2]} Prompt Proc",
|
135 |
-
"Platform 1": f"{device_display_names[device1]} Platform",
|
136 |
-
"Platform 2": f"{device_display_names[device2]} Platform",
|
137 |
-
"Token Win Prob": "Device 1 Token Win Prob",
|
138 |
-
"Prompt Win Prob": "Device 1 Prompt Win Prob",
|
139 |
-
"Combined Win Prob": "Device 1 Combined Win Prob",
|
140 |
-
}
|
141 |
-
|
142 |
-
# Only rename columns that exist in the dataframe
|
143 |
-
rename_filtered = {
|
144 |
-
k: v
|
145 |
-
for k, v in rename_mapping.items()
|
146 |
-
if k in matches_display.columns
|
147 |
-
}
|
148 |
-
matches_display = matches_display.rename(
|
149 |
-
columns=rename_filtered
|
150 |
)
|
151 |
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
|
|
|
|
156 |
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
)
|
162 |
-
|
163 |
-
st.
|
164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
)
|
166 |
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
else:
|
173 |
-
st.
|
174 |
f"No matches found between {device_display_names[device1]} and {device_display_names[device2]}."
|
175 |
)
|
176 |
st.info(
|
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
+
import plotly.graph_objects as go
|
4 |
from typing import List, Optional
|
5 |
|
6 |
from ..core.glicko2_ranking import analyze_device_glicko2_matches
|
7 |
from ..components.visualizations import clean_device_id
|
8 |
|
9 |
|
10 |
+
def create_head_to_head_battle_chart(
    device1: str,
    device2: str,
    device1_display: str,
    device2_display: str,
    token_wins_1: int,
    prompt_wins_1: int,
    combined_wins_1: int,
    total_matches: int,
):
    """Build a mirrored horizontal bar chart of head-to-head win rates.

    Device 1's bars extend to the right of a central zero line and device 2's
    bars extend to the left, one bar per category (token generation, prompt
    processing, combined).

    Device 2's numbers are derived, not passed in: its win count is
    ``total_matches - <device 1 wins>`` and its percentage is the complement
    of device 1's. Any match that device 1 did not win is therefore shown as a
    device 2 win.

    Args:
        device1: Identifier of the first device (not used by the chart itself;
            kept for interface compatibility).
        device2: Identifier of the second device (not used by the chart itself;
            kept for interface compatibility).
        device1_display: Label shown for the first device.
        device2_display: Label shown for the second device.
        token_wins_1: Token-generation matches won by device 1.
        prompt_wins_1: Prompt-processing matches won by device 1.
        combined_wins_1: Combined matches won by device 1.
        total_matches: Number of matches played. When this is zero or negative
            every percentage is reported as 0.0 instead of raising
            ``ZeroDivisionError``.

    Returns:
        A ``plotly.graph_objects.Figure`` containing both bar traces.
    """
    has_matches = total_matches > 0

    wins_1 = [token_wins_1, prompt_wins_1, combined_wins_1]
    wins_2 = [total_matches - wins for wins in wins_1]

    pcts_1 = [_win_percentage(wins, total_matches) for wins in wins_1]
    # Complement of device 1's share; with no matches neither side has a share.
    pcts_2 = [100 - pct if has_matches else 0.0 for pct in pcts_1]

    fig = go.Figure()
    fig.add_trace(_battle_bar(device1_display, wins_1, pcts_1, "58, 71, 180", 1))
    # Negative x values make device 2's bars grow leftwards from the centre.
    fig.add_trace(_battle_bar(device2_display, wins_2, pcts_2, "231, 99, 99", -1))

    # Vertical centre line spanning the three category rows.
    fig.add_shape(
        type="line",
        x0=0,
        y0=-0.5,
        x1=0,
        y1=2.5,
        line=dict(color="black", width=2, dash="solid"),
    )

    fig.update_layout(
        title=dict(
            text=f"⚔️ {device1_display} vs {device2_display} ⚔️",
            font=dict(size=24, family="Arial Black"),
            x=0.5,
        ),
        barmode="overlay",
        bargap=0.15,
        bargroupgap=0.1,
        legend=dict(x=0.5, y=1.05, xanchor="center", orientation="h"),
        xaxis=dict(
            title="Win Rate (%)",
            range=[-100, 100],
            tickvals=[-100, -75, -50, -25, 0, 25, 50, 75, 100],
            # Tick labels are unsigned so the left-hand side also reads as a
            # positive win rate.
            ticktext=["100%", "75%", "50%", "25%", "0%", "25%", "50%", "75%", "100%"],
            zeroline=True,
            zerolinewidth=2,
            zerolinecolor="black",
        ),
        yaxis=dict(title="", autorange="reversed"),
        plot_bgcolor="rgba(240, 240, 240, 0.8)",
        height=400,
        margin=dict(l=20, r=20, t=80, b=20),
    )

    return fig


def _win_percentage(wins: int, total_matches: int) -> float:
    """Return ``wins`` as a percentage of ``total_matches`` (0.0 if no matches)."""
    if total_matches <= 0:
        return 0.0
    return wins / total_matches * 100


def _battle_bar(display_name, win_counts, win_pcts, rgb, direction):
    """Build one device's bar trace; ``direction`` is 1 (right) or -1 (left)."""
    hover_labels = ("Token", "Prompt", "Combined")
    return go.Bar(
        y=["Token Gen", "Prompt Proc", "Combined"],
        x=[direction * pct for pct in win_pcts],
        name=display_name,
        orientation="h",
        marker=dict(
            color=f"rgba({rgb}, 0.8)",
            line=dict(color=f"rgba({rgb}, 1.0)", width=2),
        ),
        # Bar text always shows the unsigned percentage, even for mirrored bars.
        text=[f"{pct:.1f}%" for pct in win_pcts],
        textposition="inside",
        insidetextanchor="middle",
        hoverinfo="text",
        hovertext=[
            f"{display_name}<br>{label} Wins: {count} ({pct:.1f}%)"
            for label, count, pct in zip(hover_labels, win_counts, win_pcts)
        ],
        width=0.5,
    )
|
171 |
+
|
172 |
+
|
173 |
+
def create_victory_badge(winner_device: str, loser_device: str, win_percentage: float):
    """Return an HTML snippet announcing the duel winner.

    The badge colour and headline depend on ``win_percentage``:

    * ``>= 75`` -> gold, "DOMINANT VICTORY"
    * ``>= 50`` -> silver, "CLEAR WINNER"
    * otherwise -> bronze, "NARROW VICTORY"

    Args:
        winner_device: Display name of the winning device.
        loser_device: Display name of the losing device.
        win_percentage: Winner's win rate in percent, shown with one decimal.

    Returns:
        The badge as an HTML string. Both device names are HTML-escaped
        because the markup is intended for raw-HTML rendering.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    if win_percentage >= 75:
        badge_color, badge_text = "#FFD700", "DOMINANT VICTORY"
    elif win_percentage >= 50:
        badge_color, badge_text = "#C0C0C0", "CLEAR WINNER"
    else:
        badge_color, badge_text = "#CD7F32", "NARROW VICTORY"

    # Escape the names so characters such as "<" or "&" cannot break the
    # markup or inject tags.
    winner_html = escape(str(winner_device))
    loser_html = escape(str(loser_device))

    badge_html = f"""
    <div style="display: flex; justify-content: center; margin: 20px 0;">
        <div style="
            background: linear-gradient(135deg, {badge_color} 0%, #FFFFFF 50%, {badge_color} 100%);
            border-radius: 16px;
            padding: 20px;
            box-shadow: 0 4px 8px rgba(0,0,0,0.2);
            text-align: center;
            border: 2px solid {badge_color};
            max-width: 90%;
        ">
            <div style="font-size: 24px; font-weight: bold; margin-bottom: 8px; font-family: 'Arial Black', sans-serif;">
                🏆 {badge_text} 🏆
            </div>
            <div style="font-size: 18px; font-weight: bold; color: #333;">
                {winner_html}
            </div>
            <div style="font-size: 14px; margin: 8px 0;">
                defeated
            </div>
            <div style="font-size: 16px; color: #555;">
                {loser_html}
            </div>
            <div style="font-size: 20px; font-weight: bold; margin-top: 8px; color: #333;">
                {win_percentage:.1f}% Win Rate
            </div>
        </div>
    </div>
    """
    return badge_html
|
216 |
+
|
217 |
+
|
218 |
+
def create_model_performance_chart(
    matches_df, device1, device2, device1_display, device2_display, top_n=8
):
    """Build a per-model comparison chart of token and prompt throughput.

    Matches are grouped by ``"Model"`` and the four metric columns are
    averaged. The ``top_n`` models with the largest absolute gap in mean token
    generation between the two devices are kept. The figure shows horizontal
    bars for token generation on the left x axis and prompt processing on the
    right x axis, with both halves using the same y axis of model names.

    Args:
        matches_df: DataFrame expected to contain ``"Model"``,
            ``"Token Generation 1"``, ``"Token Generation 2"``,
            ``"Prompt Processing 1"`` and ``"Prompt Processing 2"``.
        device1: Identifier of the first device (not used by the chart itself;
            kept for interface compatibility).
        device2: Identifier of the second device (not used by the chart itself;
            kept for interface compatibility).
        device1_display: Legend label prefix for the ``... 1`` columns.
        device2_display: Legend label prefix for the ``... 2`` columns.
        top_n: Maximum number of models to show.

    Returns:
        A ``plotly.graph_objects.Figure``, or ``None`` when a required column
        is missing or there is no model left to plot.
    """
    metric_cols = [
        "Token Generation 1",
        "Token Generation 2",
        "Prompt Processing 1",
        "Prompt Processing 2",
    ]
    if any(col not in matches_df.columns for col in ["Model", *metric_cols]):
        return None

    grouped = (
        matches_df.groupby("Model")
        .agg(dict.fromkeys(metric_cols, "mean"))
        .reset_index()
    )
    if grouped.empty:
        # Nothing to draw; callers already skip the chart on a falsy result.
        return None

    # Rank models by the absolute token-generation gap (largest first).
    grouped["token_diff"] = (
        grouped["Token Generation 1"] - grouped["Token Generation 2"]
    ).abs()
    grouped = grouped.sort_values("token_diff", ascending=False).head(top_n)

    models = grouped["Model"].tolist()
    fig = go.Figure()

    # (column, legend name, bar colour, legend group, offset group, x axis,
    #  extra kwargs). Prompt traces are hidden from the legend; they share a
    #  legend group with the matching token trace.
    trace_specs = [
        (
            "Token Generation 1",
            f"{device1_display} Token Gen",
            "rgba(58, 71, 180, 0.8)",
            "device1",
            1,
            "x",
            {},
        ),
        (
            "Token Generation 2",
            f"{device2_display} Token Gen",
            "rgba(231, 99, 99, 0.8)",
            "device2",
            2,
            "x",
            {},
        ),
        (
            "Prompt Processing 1",
            f"{device1_display} Prompt Proc",
            "rgba(58, 71, 180, 0.4)",
            "device1",
            1,
            "x2",
            {"showlegend": False},
        ),
        (
            "Prompt Processing 2",
            f"{device2_display} Prompt Proc",
            "rgba(231, 99, 99, 0.4)",
            "device2",
            2,
            "x2",
            {"showlegend": False},
        ),
    ]
    for column, trace_name, color, group, offset, axis, extra in trace_specs:
        fig.add_trace(
            go.Bar(
                x=grouped[column].tolist(),
                y=models,
                name=trace_name,
                orientation="h",
                marker=dict(color=color),
                hovertemplate="%{y}<br>%{x:.2f} tokens/sec<extra></extra>",
                legendgroup=group,
                offsetgroup=offset,
                xaxis=axis,
                **extra,
            )
        )

    # Two x axes laid out side by side via explicit domains.
    fig.update_layout(
        title_text="📊 Performance Breakdown by Model",
        grid=dict(rows=1, columns=2, pattern="independent"),
        legend=dict(orientation="h", yanchor="bottom", y=1.12, xanchor="right", x=1),
        # Height grows with the number of models, with a 350px floor.
        height=max(350, 50 * len(models) + 120),
        margin=dict(l=20, r=20, t=80, b=50),
        xaxis=dict(
            title="Token Generation (tokens/sec)", side="bottom", domain=[0, 0.48]
        ),
        xaxis2=dict(
            title="Prompt Processing (tokens/sec)", side="bottom", domain=[0.52, 1]
        ),
        yaxis=dict(title="", autorange="reversed"),
    )

    # Dashed divider between the two halves, in paper coordinates.
    fig.add_shape(
        type="line",
        x0=0.5,
        y0=0,
        x1=0.5,
        y1=1,
        xref="paper",
        yref="paper",
        line=dict(color="rgba(0,0,0,0.2)", width=1, dash="dash"),
    )

    # Section headers above each half.
    # NOTE(review): font `weight` is only accepted by recent Plotly releases -
    # confirm against the pinned plotly version.
    header_specs = [
        (0.4, "right", "Token Generation", "rgba(58, 71, 180, 1.0)"),
        (0.6, "left", "Prompt Processing", "rgba(231, 99, 99, 1.0)"),
    ]
    for x_pos, anchor, label, color in header_specs:
        fig.add_annotation(
            x=x_pos,
            y=1.08,
            xanchor=anchor,
            xref="paper",
            yref="paper",
            text=label,
            showarrow=False,
            font=dict(
                size=14,
                color=color,
                family="Arial, sans-serif",
                weight="bold",
            ),
        )

    # Styling for the model names on the y axis.
    fig.update_yaxes(
        tickfont=dict(size=12, family="Arial, sans-serif"), gridcolor="rgba(0,0,0,0.05)"
    )

    return fig
|
388 |
+
|
389 |
+
|
390 |
def render_device_comparison(df: pd.DataFrame, normalized_device_ids: List[str]):
|
391 |
"""
|
392 |
Render a component for comparing two devices and analyzing their matches.
|
|
|
397 |
"""
|
398 |
st.title("⚔️ Device Duel Arena")
|
399 |
|
400 |
+
# Add dramatic introduction with some CSS styling
|
401 |
+
st.markdown(
|
402 |
+
"""
|
403 |
+
<div style="text-align: center; padding: 10px; margin-bottom: 20px;
|
404 |
+
background: linear-gradient(135deg, #f6f8fa 0%, #e9ecef 100%);
|
405 |
+
border-radius: 10px; border: 1px solid #dee2e6;">
|
406 |
+
<p style="font-size: 16px; font-style: italic; color: #495057;">
|
407 |
+
Welcome to the arena where devices face off in direct comparison!
|
408 |
+
Choose any two and see how they stack up.
|
409 |
+
</p>
|
410 |
+
</div>
|
411 |
+
""",
|
412 |
+
unsafe_allow_html=True,
|
413 |
+
)
|
414 |
+
|
415 |
# Create mapping of normalized IDs to display names
|
416 |
device_display_names = {
|
417 |
device_id: clean_device_id(device_id) for device_id in normalized_device_ids
|
418 |
}
|
419 |
|
420 |
+
# Create two columns for device selection with battle-themed styling
|
421 |
+
st.markdown(
|
422 |
+
"""
|
423 |
+
<style>
|
424 |
+
.device-select-header {
|
425 |
+
font-weight: bold;
|
426 |
+
font-size: 18px;
|
427 |
+
margin-bottom: 10px;
|
428 |
+
text-align: center;
|
429 |
+
padding: 5px;
|
430 |
+
border-radius: 5px;
|
431 |
+
}
|
432 |
+
.device1-header {
|
433 |
+
background-color: rgba(58, 71, 180, 0.2);
|
434 |
+
border-left: 4px solid rgba(58, 71, 180, 1.0);
|
435 |
+
}
|
436 |
+
.device2-header {
|
437 |
+
background-color: rgba(231, 99, 99, 0.2);
|
438 |
+
border-left: 4px solid rgba(231, 99, 99, 1.0);
|
439 |
+
}
|
440 |
+
</style>
|
441 |
+
""",
|
442 |
+
unsafe_allow_html=True,
|
443 |
+
)
|
444 |
+
|
445 |
+
col1, vs_col, col2 = st.columns([0.45, 0.1, 0.45])
|
446 |
+
|
447 |
+
with vs_col:
|
448 |
+
st.markdown(
|
449 |
+
"""
|
450 |
+
<div style="display: flex; height: 100%; align-items: center; justify-content: center;">
|
451 |
+
<div style="font-size: 24px; font-weight: bold; color: #555;">VS</div>
|
452 |
+
</div>
|
453 |
+
""",
|
454 |
+
unsafe_allow_html=True,
|
455 |
+
)
|
456 |
|
457 |
with col1:
|
458 |
+
st.markdown(
|
459 |
+
'<div class="device-select-header device1-header">CHALLENGER</div>',
|
460 |
+
unsafe_allow_html=True,
|
461 |
+
)
|
462 |
device1 = st.selectbox(
|
463 |
+
"First Device",
|
464 |
options=normalized_device_ids,
|
465 |
format_func=lambda x: device_display_names[x],
|
466 |
key="device_compare_1",
|
467 |
+
index=None,
|
468 |
+
placeholder="Select a device ...",
|
469 |
)
|
470 |
|
471 |
with col2:
|
472 |
+
st.markdown(
|
473 |
+
'<div class="device-select-header device2-header">OPPONENT</div>',
|
474 |
+
unsafe_allow_html=True,
|
475 |
+
)
|
476 |
device2 = st.selectbox(
|
477 |
+
"Second Device",
|
478 |
options=normalized_device_ids,
|
479 |
format_func=lambda x: device_display_names[x],
|
480 |
key="device_compare_2",
|
481 |
+
index=None,
|
482 |
+
placeholder="Select a device ...",
|
483 |
+
)
|
484 |
+
|
485 |
+
# Button to analyze matches with a more exciting style
|
486 |
+
button_col1, button_col2, button_col3 = st.columns([0.3, 0.4, 0.3])
|
487 |
+
with button_col2:
|
488 |
+
duel_button = st.button(
|
489 |
+
"️Start",
|
490 |
+
key="analyze_matches_btn",
|
491 |
+
use_container_width=True,
|
492 |
)
|
493 |
|
494 |
+
if duel_button:
|
|
|
495 |
# Validate device selection
|
496 |
+
if not device1 or not device2:
|
497 |
+
st.error("Please select two devices to battle!")
|
498 |
+
return
|
499 |
+
elif device1 == device2:
|
500 |
st.error("Please select two different devices to compare.")
|
501 |
return
|
502 |
|
503 |
+
# Create dramatic divider
|
504 |
+
st.markdown(
|
505 |
+
"""
|
506 |
+
<div style="text-align: center; margin: 20px 0;">
|
507 |
+
<div style="font-size: 24px; font-weight: bold; color: #333;">⚔️ BATTLE RESULTS ⚔️</div>
|
508 |
+
<div style="height: 4px; background: linear-gradient(90deg, rgba(58,71,180,1) 0%, rgba(231,99,99,1) 100%); margin: 10px 0;"></div>
|
509 |
+
</div>
|
510 |
+
""",
|
511 |
+
unsafe_allow_html=True,
|
512 |
+
)
|
513 |
|
514 |
with st.spinner(
|
515 |
+
f"⚔️ Battle in progress between {device_display_names[device1]} and {device_display_names[device2]}..."
|
516 |
):
|
517 |
try:
|
518 |
# Analyze matches using Glicko-2
|
|
|
522 |
# Show summary statistics
|
523 |
total_matches = len(matches_df)
|
524 |
|
|
|
|
|
|
|
|
|
|
|
|
|
525 |
# Check for required columns before calculating metrics
|
526 |
if (
|
527 |
"Token Winner" in matches_df.columns
|
528 |
and "Prompt Winner" in matches_df.columns
|
529 |
+
and "Combined Winner" in matches_df.columns
|
530 |
):
|
531 |
token_wins_1 = sum(matches_df["Token Winner"] == device1)
|
532 |
prompt_wins_1 = sum(matches_df["Prompt Winner"] == device1)
|
533 |
+
combined_wins_1 = sum(matches_df["Combined Winner"] == device1)
|
534 |
|
535 |
+
# Display total matches info
|
536 |
+
st.markdown(
|
537 |
+
f"""
|
538 |
+
<div style="text-align: center; padding: 10px; background-color: #f8f9fa;
|
539 |
+
border-radius: 5px; margin: 10px 0; border: 1px solid #dee2e6;">
|
540 |
+
<span style="font-size: 16px; font-weight: bold;">Total Matches: {total_matches}</span>
|
541 |
+
</div>
|
542 |
+
""",
|
543 |
+
unsafe_allow_html=True,
|
544 |
+
)
|
545 |
|
546 |
+
# Show victory badge for the overall winner
|
547 |
+
winner_device = (
|
548 |
+
device1 if combined_wins_1 > total_matches / 2 else device2
|
549 |
+
)
|
550 |
+
loser_device = device2 if winner_device == device1 else device1
|
|
|
|
|
|
|
|
|
|
|
|
|
551 |
|
552 |
+
winner_display = device_display_names[winner_device]
|
553 |
+
loser_display = device_display_names[loser_device]
|
554 |
+
|
555 |
+
win_percentage = (
|
556 |
+
(combined_wins_1 / total_matches * 100)
|
557 |
+
if winner_device == device1
|
558 |
+
else (
|
559 |
+
(total_matches - combined_wins_1) / total_matches * 100
|
560 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
561 |
)
|
562 |
|
563 |
+
st.markdown(
|
564 |
+
create_victory_badge(
|
565 |
+
winner_display, loser_display, win_percentage
|
566 |
+
),
|
567 |
+
unsafe_allow_html=True,
|
568 |
+
)
|
569 |
|
570 |
+
# Create battle visualization
|
571 |
+
battle_fig = create_head_to_head_battle_chart(
|
572 |
+
device1,
|
573 |
+
device2,
|
574 |
+
device_display_names[device1],
|
575 |
+
device_display_names[device2],
|
576 |
+
token_wins_1,
|
577 |
+
prompt_wins_1,
|
578 |
+
combined_wins_1,
|
579 |
+
total_matches,
|
580 |
)
|
581 |
+
|
582 |
+
st.plotly_chart(battle_fig, use_container_width=True)
|
583 |
+
|
584 |
+
# Replace the model-specific charts with the new integrated version
|
585 |
+
model_performance_chart = create_model_performance_chart(
|
586 |
+
matches_df,
|
587 |
+
device1,
|
588 |
+
device2,
|
589 |
+
device_display_names[device1],
|
590 |
+
device_display_names[device2],
|
591 |
)
|
592 |
|
593 |
+
if model_performance_chart:
|
594 |
+
st.plotly_chart(
|
595 |
+
model_performance_chart, use_container_width=True
|
596 |
+
)
|
597 |
+
|
598 |
+
# Show the detailed match table
|
599 |
+
with st.expander("View Detailed Match Results", expanded=False):
|
600 |
+
st.markdown("#### All Match Data")
|
601 |
+
|
602 |
+
# Define display columns for Glicko-2
|
603 |
+
display_cols = [
|
604 |
+
"Model",
|
605 |
+
"Token Generation 1",
|
606 |
+
"Token Generation 2",
|
607 |
+
"Token Winner",
|
608 |
+
"Token Win Prob",
|
609 |
+
"Prompt Processing 1",
|
610 |
+
"Prompt Processing 2",
|
611 |
+
"Prompt Winner",
|
612 |
+
"Prompt Win Prob",
|
613 |
+
"Combined Winner",
|
614 |
+
"Combined Win Prob",
|
615 |
+
"Platform 1",
|
616 |
+
"Platform 2",
|
617 |
+
]
|
618 |
+
|
619 |
+
# Ensure all columns exist in the dataframe
|
620 |
+
valid_cols = [
|
621 |
+
col for col in display_cols if col in matches_df.columns
|
622 |
+
]
|
623 |
+
|
624 |
+
if valid_cols:
|
625 |
+
# Rename some columns for better display
|
626 |
+
matches_display = matches_df[valid_cols].copy()
|
627 |
+
|
628 |
+
# Define a rename mapping but only apply for columns that exist
|
629 |
+
rename_mapping = {
|
630 |
+
"Token Generation 1": f"{device_display_names[device1]} Token Gen",
|
631 |
+
"Token Generation 2": f"{device_display_names[device2]} Token Gen",
|
632 |
+
"Prompt Processing 1": f"{device_display_names[device1]} Prompt Proc",
|
633 |
+
"Prompt Processing 2": f"{device_display_names[device2]} Prompt Proc",
|
634 |
+
"Platform 1": f"{device_display_names[device1]} Platform",
|
635 |
+
"Platform 2": f"{device_display_names[device2]} Platform",
|
636 |
+
"Token Win Prob": "Device 1 Token Win Prob",
|
637 |
+
"Prompt Win Prob": "Device 1 Prompt Win Prob",
|
638 |
+
"Combined Win Prob": "Device 1 Combined Win Prob",
|
639 |
+
}
|
640 |
+
|
641 |
+
# Only rename columns that exist in the dataframe
|
642 |
+
rename_filtered = {
|
643 |
+
k: v
|
644 |
+
for k, v in rename_mapping.items()
|
645 |
+
if k in matches_display.columns
|
646 |
+
}
|
647 |
+
matches_display = matches_display.rename(
|
648 |
+
columns=rename_filtered
|
649 |
+
)
|
650 |
+
|
651 |
+
# Round any numeric columns for better display
|
652 |
+
for col in matches_display.columns:
|
653 |
+
if matches_display[col].dtype in [
|
654 |
+
"float64",
|
655 |
+
"float32",
|
656 |
+
]:
|
657 |
+
matches_display[col] = matches_display[
|
658 |
+
col
|
659 |
+
].round(2)
|
660 |
+
|
661 |
+
st.dataframe(
|
662 |
+
matches_display,
|
663 |
+
use_container_width=True,
|
664 |
+
height=400,
|
665 |
+
)
|
666 |
+
else:
|
667 |
+
st.warning(
|
668 |
+
"No valid columns found for display in the match data."
|
669 |
+
)
|
670 |
+
|
671 |
+
# # Platform breakdown if available
|
672 |
+
# if "Platform 2" in matches_df.columns:
|
673 |
+
# with st.expander("Platform Distribution", expanded=False):
|
674 |
+
# platform_counts = matches_df[
|
675 |
+
# "Platform 2"
|
676 |
+
# ].value_counts()
|
677 |
+
# st.bar_chart(platform_counts)
|
678 |
+
else:
|
679 |
+
st.warning("Winner information is missing from the match data.")
|
680 |
else:
|
681 |
+
st.error(
|
682 |
f"No matches found between {device_display_names[device1]} and {device_display_names[device2]}."
|
683 |
)
|
684 |
st.info(
|
src/components/header.py
CHANGED
@@ -114,7 +114,7 @@ def render_header():
|
|
114 |
<div class="logos-container">
|
115 |
<img src="data:image/png;base64,{get_image_base64(pocketpal_logo_path)}" class="logo pocketpal" alt="PocketPal AI Logo">
|
116 |
</div>
|
117 |
-
<h1 class="header-title">AI Phone
|
118 |
<p class="header-subtitle">Comparing Large Language Models performance across AI Phones. Powered by PocketPal AI.</p>
|
119 |
</div>
|
120 |
"""
|
|
|
114 |
<div class="logos-container">
|
115 |
<img src="data:image/png;base64,{get_image_base64(pocketpal_logo_path)}" class="logo pocketpal" alt="PocketPal AI Logo">
|
116 |
</div>
|
117 |
+
<h1 class="header-title">AI Phone Leaderboard</h1>
|
118 |
<p class="header-subtitle">Comparing Large Language Models performance across AI Phones. Powered by PocketPal AI.</p>
|
119 |
</div>
|
120 |
"""
|