agh123 committed on
Commit
4d24dca
·
1 Parent(s): 1ee32d1

feat: refactor Device Duel

Browse files
src/components/device_comparison.py CHANGED
@@ -1,11 +1,392 @@
1
  import streamlit as st
2
  import pandas as pd
 
3
  from typing import List, Optional
4
 
5
  from ..core.glicko2_ranking import analyze_device_glicko2_matches
6
  from ..components.visualizations import clean_device_id
7
 
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  def render_device_comparison(df: pd.DataFrame, normalized_device_ids: List[str]):
10
  """
11
  Render a component for comparing two devices and analyzing their matches.
@@ -16,41 +397,122 @@ def render_device_comparison(df: pd.DataFrame, normalized_device_ids: List[str])
16
  """
17
  st.title("⚔️ Device Duel Arena")
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  # Create mapping of normalized IDs to display names
20
  device_display_names = {
21
  device_id: clean_device_id(device_id) for device_id in normalized_device_ids
22
  }
23
 
24
- # Create two columns for device selection
25
- col1, col2 = st.columns(2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  with col1:
 
 
 
 
28
  device1 = st.selectbox(
29
- "Select First Device",
30
  options=normalized_device_ids,
31
  format_func=lambda x: device_display_names[x],
32
  key="device_compare_1",
 
 
33
  )
34
 
35
  with col2:
 
 
 
 
36
  device2 = st.selectbox(
37
- "Select Second Device",
38
  options=normalized_device_ids,
39
  format_func=lambda x: device_display_names[x],
40
  key="device_compare_2",
 
 
 
 
 
 
 
 
 
 
 
41
  )
42
 
43
- # Button to analyze matches
44
- if st.button("Start Duel", key="analyze_matches_btn"):
45
  # Validate device selection
46
- if device1 == device2:
 
 
 
47
  st.error("Please select two different devices to compare.")
48
  return
49
 
50
- st.markdown("### Match Analysis Results")
 
 
 
 
 
 
 
 
 
51
 
52
  with st.spinner(
53
- f"Analyzing matches between {device_display_names[device1]} and {device_display_names[device2]}..."
54
  ):
55
  try:
56
  # Analyze matches using Glicko-2
@@ -60,117 +522,163 @@ def render_device_comparison(df: pd.DataFrame, normalized_device_ids: List[str])
60
  # Show summary statistics
61
  total_matches = len(matches_df)
62
 
63
- # Set up metrics
64
- col1, col2, col3 = st.columns(3)
65
-
66
- with col1:
67
- st.metric("Total Matches", total_matches)
68
-
69
  # Check for required columns before calculating metrics
70
  if (
71
  "Token Winner" in matches_df.columns
72
  and "Prompt Winner" in matches_df.columns
 
73
  ):
74
  token_wins_1 = sum(matches_df["Token Winner"] == device1)
75
  prompt_wins_1 = sum(matches_df["Prompt Winner"] == device1)
 
76
 
77
- with col2:
78
- st.metric(
79
- f"{device_display_names[device1]}'s Token Wins",
80
- f"{token_wins_1} ({token_wins_1/total_matches*100:.1f}%)",
81
- )
82
- with col3:
83
- st.metric(
84
- f"{device_display_names[device1]}'s Prompt Wins",
85
- f"{prompt_wins_1} ({prompt_wins_1/total_matches*100:.1f}%)",
86
- )
87
 
88
- # Add Combined Winner metric if available
89
- if "Combined Winner" in matches_df.columns:
90
- combined_wins_1 = sum(
91
- matches_df["Combined Winner"] == device1
92
- )
93
- st.metric(
94
- f"{device_display_names[device1]}'s Combined Wins",
95
- f"{combined_wins_1} ({combined_wins_1/total_matches*100:.1f}%)",
96
- )
97
- else:
98
- st.warning("Winner information is missing from the match data.")
99
 
100
- # Show the detailed match table
101
- st.markdown("#### Detailed Match Results")
102
-
103
- # Define display columns for Glicko-2
104
- display_cols = [
105
- "Model",
106
- "Token Generation 1",
107
- "Token Generation 2",
108
- "Token Winner",
109
- "Token Win Prob",
110
- "Prompt Processing 1",
111
- "Prompt Processing 2",
112
- "Prompt Winner",
113
- "Prompt Win Prob",
114
- "Combined Winner",
115
- "Combined Win Prob",
116
- "Platform 1",
117
- "Platform 2",
118
- ]
119
-
120
- # Ensure all columns exist in the dataframe
121
- valid_cols = [
122
- col for col in display_cols if col in matches_df.columns
123
- ]
124
-
125
- if valid_cols:
126
- # Rename some columns for better display
127
- matches_display = matches_df[valid_cols].copy()
128
-
129
- # Define a rename mapping but only apply for columns that exist
130
- rename_mapping = {
131
- "Token Generation 1": f"{device_display_names[device1]} Token Gen",
132
- "Token Generation 2": f"{device_display_names[device2]} Token Gen",
133
- "Prompt Processing 1": f"{device_display_names[device1]} Prompt Proc",
134
- "Prompt Processing 2": f"{device_display_names[device2]} Prompt Proc",
135
- "Platform 1": f"{device_display_names[device1]} Platform",
136
- "Platform 2": f"{device_display_names[device2]} Platform",
137
- "Token Win Prob": "Device 1 Token Win Prob",
138
- "Prompt Win Prob": "Device 1 Prompt Win Prob",
139
- "Combined Win Prob": "Device 1 Combined Win Prob",
140
- }
141
-
142
- # Only rename columns that exist in the dataframe
143
- rename_filtered = {
144
- k: v
145
- for k, v in rename_mapping.items()
146
- if k in matches_display.columns
147
- }
148
- matches_display = matches_display.rename(
149
- columns=rename_filtered
150
  )
151
 
152
- # Round any numeric columns for better display
153
- for col in matches_display.columns:
154
- if matches_display[col].dtype in ["float64", "float32"]:
155
- matches_display[col] = matches_display[col].round(2)
 
 
156
 
157
- st.dataframe(
158
- matches_display,
159
- use_container_width=True,
160
- height=400,
 
 
 
 
 
 
161
  )
162
- else:
163
- st.warning(
164
- "No valid columns found for display in the match data."
 
 
 
 
 
 
 
165
  )
166
 
167
- # Platform breakdown if available
168
- if "Platform 2" in matches_df.columns:
169
- st.markdown("#### Platform Distribution")
170
- platform_counts = matches_df["Platform 2"].value_counts()
171
- st.bar_chart(platform_counts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  else:
173
- st.warning(
174
  f"No matches found between {device_display_names[device1]} and {device_display_names[device2]}."
175
  )
176
  st.info(
 
1
  import streamlit as st
2
  import pandas as pd
3
+ import plotly.graph_objects as go
4
  from typing import List, Optional
5
 
6
  from ..core.glicko2_ranking import analyze_device_glicko2_matches
7
  from ..components.visualizations import clean_device_id
8
 
9
 
10
def create_head_to_head_battle_chart(
    device1: str,
    device2: str,
    device1_display: str,
    device2_display: str,
    token_wins_1: int,
    prompt_wins_1: int,
    combined_wins_1: int,
    total_matches: int,
) -> "go.Figure":
    """Build a diverging horizontal bar chart of head-to-head win rates.

    Device 1's win percentages extend to the right of a central zero line and
    device 2's extend to the left, for three categories: token generation,
    prompt processing and the combined result.

    Args:
        device1: Identifier of the first device. Not used by the chart itself;
            kept so the signature stays stable for existing callers.
        device2: Identifier of the second device. Not used by the chart
            itself; kept for the same reason.
        device1_display: Label shown for the first device (legend, title,
            hover text).
        device2_display: Label shown for the second device.
        token_wins_1: Number of matches device 1 won on token generation.
        prompt_wins_1: Number of matches device 1 won on prompt processing.
        combined_wins_1: Number of matches device 1 won on the combined score.
        total_matches: Total number of matches played.

    Returns:
        A Plotly figure with one bar trace per device.

    Note:
        Every match not won by device 1 is credited to device 2; this
        function has no notion of a draw.
    """
    categories = ["Token Gen", "Prompt Proc", "Combined"]
    wins_1 = [token_wins_1, prompt_wins_1, combined_wins_1]

    if total_matches > 0:
        wins_2 = [total_matches - won for won in wins_1]
        pcts_1 = [won / total_matches * 100 for won in wins_1]
        pcts_2 = [100 - pct for pct in pcts_1]
    else:
        # No matches: avoid ZeroDivisionError and show empty bars for both
        # sides instead of crediting device 2 with a 100% win rate.
        wins_2 = [0, 0, 0]
        pcts_1 = [0.0, 0.0, 0.0]
        pcts_2 = [0.0, 0.0, 0.0]

    fig = go.Figure()

    # Device 1 grows to the right (positive x), device 2 to the left
    # (negative x), so the two bars meet at the zero line.
    fig.add_trace(
        _build_battle_bar(
            device1_display, categories, wins_1, pcts_1, "58, 71, 180", 1
        )
    )
    fig.add_trace(
        _build_battle_bar(
            device2_display, categories, wins_2, pcts_2, "231, 99, 99", -1
        )
    )

    # Solid centre line separating the two sides across all three rows.
    fig.add_shape(
        type="line",
        x0=0,
        y0=-0.5,
        x1=0,
        y1=2.5,
        line=dict(color="black", width=2, dash="solid"),
    )

    fig.update_layout(
        title=dict(
            text=f"⚔️ {device1_display} vs {device2_display} ⚔️",
            font=dict(size=24, family="Arial Black"),
            x=0.5,
        ),
        # Overlay lets the left and right bars share a row per category.
        barmode="overlay",
        bargap=0.15,
        bargroupgap=0.1,
        legend=dict(x=0.5, y=1.05, xanchor="center", orientation="h"),
        xaxis=dict(
            title="Win Rate (%)",
            range=[-100, 100],
            tickvals=[-100, -75, -50, -25, 0, 25, 50, 75, 100],
            # Tick labels are unsigned: the left side is a percentage too.
            ticktext=["100%", "75%", "50%", "25%", "0%", "25%", "50%", "75%", "100%"],
            zeroline=True,
            zerolinewidth=2,
            zerolinecolor="black",
        ),
        yaxis=dict(title="", autorange="reversed"),
        plot_bgcolor="rgba(240, 240, 240, 0.8)",
        height=400,
        margin=dict(l=20, r=20, t=80, b=20),
    )

    return fig


def _build_battle_bar(display_name, categories, win_counts, win_pcts, rgb, direction):
    """Return one device's horizontal bar trace for the battle chart.

    ``rgb`` is the "r, g, b" part of the colour; ``direction`` is ``1`` to
    draw the bars to the right of zero and ``-1`` to draw them to the left.
    """
    hover_labels = ["Token", "Prompt", "Combined"]
    return go.Bar(
        y=categories,
        x=[direction * pct for pct in win_pcts],
        name=display_name,
        orientation="h",
        marker=dict(
            color=f"rgba({rgb}, 0.8)",
            line=dict(color=f"rgba({rgb}, 1.0)", width=2),
        ),
        text=[f"{pct:.1f}%" for pct in win_pcts],
        textposition="inside",
        insidetextanchor="middle",
        hoverinfo="text",
        hovertext=[
            f"{display_name}<br>{label} Wins: {count} ({pct:.1f}%)"
            for label, count, pct in zip(hover_labels, win_counts, win_pcts)
        ],
        width=0.5,
    )
171
+
172
+
173
def create_victory_badge(winner_device: str, loser_device: str, win_percentage: float) -> str:
    """Return an HTML snippet announcing the winner of a device duel.

    The badge colour and headline depend on ``win_percentage``:

    * ``>= 75``: gold, "DOMINANT VICTORY"
    * ``>= 50``: silver, "CLEAR WINNER"
    * otherwise: bronze, "NARROW VICTORY"

    Args:
        winner_device: Display name of the winning device.
        loser_device: Display name of the losing device.
        win_percentage: Winner's share of matches, on a 0-100 scale.

    Returns:
        An HTML string. Both device names are HTML-escaped, so the result can
        be rendered as raw HTML even when a name contains markup characters.
    """
    # Imported locally so the block brings its own dependency into scope.
    from html import escape

    # NOTE(review): a winner chosen by majority always has >= 50%, so the
    # bronze tier looks unreachable from such callers - confirm whether the
    # thresholds are intended.
    if win_percentage >= 75:
        badge_color, badge_text = "#FFD700", "DOMINANT VICTORY"
    elif win_percentage >= 50:
        badge_color, badge_text = "#C0C0C0", "CLEAR WINNER"
    else:
        badge_color, badge_text = "#CD7F32", "NARROW VICTORY"

    # Device names originate from data, not from this module; escape them so
    # characters such as "<" or "&" cannot break or inject into the markup.
    winner_html = escape(str(winner_device))
    loser_html = escape(str(loser_device))

    badge_html = f"""
    <div style="display: flex; justify-content: center; margin: 20px 0;">
        <div style="
            background: linear-gradient(135deg, {badge_color} 0%, #FFFFFF 50%, {badge_color} 100%);
            border-radius: 16px;
            padding: 20px;
            box-shadow: 0 4px 8px rgba(0,0,0,0.2);
            text-align: center;
            border: 2px solid {badge_color};
            max-width: 90%;
        ">
            <div style="font-size: 24px; font-weight: bold; margin-bottom: 8px; font-family: 'Arial Black', sans-serif;">
                🏆 {badge_text} 🏆
            </div>
            <div style="font-size: 18px; font-weight: bold; color: #333;">
                {winner_html}
            </div>
            <div style="font-size: 14px; margin: 8px 0;">
                defeated
            </div>
            <div style="font-size: 16px; color: #555;">
                {loser_html}
            </div>
            <div style="font-size: 20px; font-weight: bold; margin-top: 8px; color: #333;">
                {win_percentage:.1f}% Win Rate
            </div>
        </div>
    </div>
    """
    return badge_html
216
+
217
+
218
def create_model_performance_chart(
    matches_df, device1, device2, device1_display, device2_display, top_n=8
):
    """Build a per-model comparison chart of both devices' throughput.

    The figure has two side-by-side panels sharing one list of models on the
    y-axis: token generation on the left and prompt processing on the right.
    Values are the per-model means of the match data. Only the ``top_n``
    models with the largest absolute token-generation gap between the two
    devices are shown, largest gap first.

    ``device1`` and ``device2`` are accepted but not used by the chart; only
    the display names appear in the figure.

    Returns ``None`` when ``matches_df`` lacks any of the required columns
    ("Model", "Token Generation 1/2", "Prompt Processing 1/2").
    """
    metric_columns = [
        "Token Generation 1",
        "Token Generation 2",
        "Prompt Processing 1",
        "Prompt Processing 2",
    ]

    # Bail out early if the match data cannot support the chart.
    if any(name not in matches_df.columns for name in ["Model", *metric_columns]):
        return None

    # Mean of every metric per model.
    per_model = (
        matches_df.groupby("Model")
        .agg({column: "mean" for column in metric_columns})
        .reset_index()
    )

    # Rank models by the absolute token-generation gap between the devices
    # and keep the top_n largest.
    per_model["token_diff"] = (
        per_model["Token Generation 1"] - per_model["Token Generation 2"]
    ).abs()
    per_model = per_model.sort_values("token_diff", ascending=False).head(top_n)

    model_names = per_model["Model"].tolist()

    # One entry per trace, in drawing order:
    # (source column, legend name, colour, legend group, offset group,
    #  x-axis, extra keyword arguments).
    # The prompt-processing traces are hidden from the legend because they
    # share a legend group with the matching token-generation trace.
    trace_specs = [
        ("Token Generation 1", f"{device1_display} Token Gen",
         "rgba(58, 71, 180, 0.8)", "device1", 1, "x", {}),
        ("Token Generation 2", f"{device2_display} Token Gen",
         "rgba(231, 99, 99, 0.8)", "device2", 2, "x", {}),
        ("Prompt Processing 1", f"{device1_display} Prompt Proc",
         "rgba(58, 71, 180, 0.4)", "device1", 1, "x2", {"showlegend": False}),
        ("Prompt Processing 2", f"{device2_display} Prompt Proc",
         "rgba(231, 99, 99, 0.4)", "device2", 2, "x2", {"showlegend": False}),
    ]

    fig = go.Figure()
    for column, label, colour, legend_group, offset_group, axis, extra in trace_specs:
        fig.add_trace(
            go.Bar(
                x=per_model[column].tolist(),
                y=model_names,
                name=label,
                orientation="h",
                marker={"color": colour},
                hovertemplate="%{y}<br>%{x:.2f} tokens/sec<extra></extra>",
                legendgroup=legend_group,
                offsetgroup=offset_group,
                xaxis=axis,
                **extra,
            )
        )

    # Grow the figure with the number of models, never below 350px.
    chart_height = max(350, 50 * len(model_names) + 120)

    fig.update_layout(
        title_text="📊 Performance Breakdown by Model",
        grid={"rows": 1, "columns": 2, "pattern": "independent"},
        legend={
            "orientation": "h",
            "yanchor": "bottom",
            "y": 1.12,
            "xanchor": "right",
            "x": 1,
        },
        height=chart_height,
        margin={"l": 20, "r": 20, "t": 80, "b": 50},
        xaxis={
            "title": "Token Generation (tokens/sec)",
            "side": "bottom",
            "domain": [0, 0.48],
        },
        xaxis2={
            "title": "Prompt Processing (tokens/sec)",
            "side": "bottom",
            "domain": [0.52, 1],
        },
        yaxis={"title": "", "autorange": "reversed"},
    )

    # Dashed divider between the two panels, in paper coordinates.
    fig.add_shape(
        type="line",
        x0=0.5,
        y0=0,
        x1=0.5,
        y1=1,
        xref="paper",
        yref="paper",
        line={"color": "rgba(0,0,0,0.2)", "width": 1, "dash": "dash"},
    )

    # Panel headers: (x position, anchor, text, colour).
    header_specs = [
        (0.4, "right", "Token Generation", "rgba(58, 71, 180, 1.0)"),
        (0.6, "left", "Prompt Processing", "rgba(231, 99, 99, 1.0)"),
    ]
    for x_pos, anchor, caption, colour in header_specs:
        fig.add_annotation(
            x=x_pos,
            y=1.08,
            xanchor=anchor,
            xref="paper",
            yref="paper",
            text=caption,
            showarrow=False,
            font={
                "size": 14,
                "color": colour,
                "family": "Arial, sans-serif",
                "weight": "bold",
            },
        )

    # Styling for the model names on the y-axis.
    fig.update_yaxes(
        tickfont={"size": 12, "family": "Arial, sans-serif"},
        gridcolor="rgba(0,0,0,0.05)",
    )

    return fig
388
+
389
+
390
  def render_device_comparison(df: pd.DataFrame, normalized_device_ids: List[str]):
391
  """
392
  Render a component for comparing two devices and analyzing their matches.
 
397
  """
398
  st.title("⚔️ Device Duel Arena")
399
 
400
+ # Add dramatic introduction with some CSS styling
401
+ st.markdown(
402
+ """
403
+ <div style="text-align: center; padding: 10px; margin-bottom: 20px;
404
+ background: linear-gradient(135deg, #f6f8fa 0%, #e9ecef 100%);
405
+ border-radius: 10px; border: 1px solid #dee2e6;">
406
+ <p style="font-size: 16px; font-style: italic; color: #495057;">
407
+ Welcome to the arena where devices face off in direct comparison!
408
+ Choose any two and see how they stack up.
409
+ </p>
410
+ </div>
411
+ """,
412
+ unsafe_allow_html=True,
413
+ )
414
+
415
  # Create mapping of normalized IDs to display names
416
  device_display_names = {
417
  device_id: clean_device_id(device_id) for device_id in normalized_device_ids
418
  }
419
 
420
+ # Create two columns for device selection with battle-themed styling
421
+ st.markdown(
422
+ """
423
+ <style>
424
+ .device-select-header {
425
+ font-weight: bold;
426
+ font-size: 18px;
427
+ margin-bottom: 10px;
428
+ text-align: center;
429
+ padding: 5px;
430
+ border-radius: 5px;
431
+ }
432
+ .device1-header {
433
+ background-color: rgba(58, 71, 180, 0.2);
434
+ border-left: 4px solid rgba(58, 71, 180, 1.0);
435
+ }
436
+ .device2-header {
437
+ background-color: rgba(231, 99, 99, 0.2);
438
+ border-left: 4px solid rgba(231, 99, 99, 1.0);
439
+ }
440
+ </style>
441
+ """,
442
+ unsafe_allow_html=True,
443
+ )
444
+
445
+ col1, vs_col, col2 = st.columns([0.45, 0.1, 0.45])
446
+
447
+ with vs_col:
448
+ st.markdown(
449
+ """
450
+ <div style="display: flex; height: 100%; align-items: center; justify-content: center;">
451
+ <div style="font-size: 24px; font-weight: bold; color: #555;">VS</div>
452
+ </div>
453
+ """,
454
+ unsafe_allow_html=True,
455
+ )
456
 
457
  with col1:
458
+ st.markdown(
459
+ '<div class="device-select-header device1-header">CHALLENGER</div>',
460
+ unsafe_allow_html=True,
461
+ )
462
  device1 = st.selectbox(
463
+ "First Device",
464
  options=normalized_device_ids,
465
  format_func=lambda x: device_display_names[x],
466
  key="device_compare_1",
467
+ index=None,
468
+ placeholder="Select a device ...",
469
  )
470
 
471
  with col2:
472
+ st.markdown(
473
+ '<div class="device-select-header device2-header">OPPONENT</div>',
474
+ unsafe_allow_html=True,
475
+ )
476
  device2 = st.selectbox(
477
+ "Second Device",
478
  options=normalized_device_ids,
479
  format_func=lambda x: device_display_names[x],
480
  key="device_compare_2",
481
+ index=None,
482
+ placeholder="Select a device ...",
483
+ )
484
+
485
+ # Button to analyze matches with a more exciting style
486
+ button_col1, button_col2, button_col3 = st.columns([0.3, 0.4, 0.3])
487
+ with button_col2:
488
+ duel_button = st.button(
489
+ "️Start",
490
+ key="analyze_matches_btn",
491
+ use_container_width=True,
492
  )
493
 
494
+ if duel_button:
 
495
  # Validate device selection
496
+ if not device1 or not device2:
497
+ st.error("Please select two devices to battle!")
498
+ return
499
+ elif device1 == device2:
500
  st.error("Please select two different devices to compare.")
501
  return
502
 
503
+ # Create dramatic divider
504
+ st.markdown(
505
+ """
506
+ <div style="text-align: center; margin: 20px 0;">
507
+ <div style="font-size: 24px; font-weight: bold; color: #333;">⚔️ BATTLE RESULTS ⚔️</div>
508
+ <div style="height: 4px; background: linear-gradient(90deg, rgba(58,71,180,1) 0%, rgba(231,99,99,1) 100%); margin: 10px 0;"></div>
509
+ </div>
510
+ """,
511
+ unsafe_allow_html=True,
512
+ )
513
 
514
  with st.spinner(
515
+ f"⚔️ Battle in progress between {device_display_names[device1]} and {device_display_names[device2]}..."
516
  ):
517
  try:
518
  # Analyze matches using Glicko-2
 
522
  # Show summary statistics
523
  total_matches = len(matches_df)
524
 
 
 
 
 
 
 
525
  # Check for required columns before calculating metrics
526
  if (
527
  "Token Winner" in matches_df.columns
528
  and "Prompt Winner" in matches_df.columns
529
+ and "Combined Winner" in matches_df.columns
530
  ):
531
  token_wins_1 = sum(matches_df["Token Winner"] == device1)
532
  prompt_wins_1 = sum(matches_df["Prompt Winner"] == device1)
533
+ combined_wins_1 = sum(matches_df["Combined Winner"] == device1)
534
 
535
+ # Display total matches info
536
+ st.markdown(
537
+ f"""
538
+ <div style="text-align: center; padding: 10px; background-color: #f8f9fa;
539
+ border-radius: 5px; margin: 10px 0; border: 1px solid #dee2e6;">
540
+ <span style="font-size: 16px; font-weight: bold;">Total Matches: {total_matches}</span>
541
+ </div>
542
+ """,
543
+ unsafe_allow_html=True,
544
+ )
545
 
546
+ # Show victory badge for the overall winner
547
+ winner_device = (
548
+ device1 if combined_wins_1 > total_matches / 2 else device2
549
+ )
550
+ loser_device = device2 if winner_device == device1 else device1
 
 
 
 
 
 
551
 
552
+ winner_display = device_display_names[winner_device]
553
+ loser_display = device_display_names[loser_device]
554
+
555
+ win_percentage = (
556
+ (combined_wins_1 / total_matches * 100)
557
+ if winner_device == device1
558
+ else (
559
+ (total_matches - combined_wins_1) / total_matches * 100
560
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
561
  )
562
 
563
+ st.markdown(
564
+ create_victory_badge(
565
+ winner_display, loser_display, win_percentage
566
+ ),
567
+ unsafe_allow_html=True,
568
+ )
569
 
570
+ # Create battle visualization
571
+ battle_fig = create_head_to_head_battle_chart(
572
+ device1,
573
+ device2,
574
+ device_display_names[device1],
575
+ device_display_names[device2],
576
+ token_wins_1,
577
+ prompt_wins_1,
578
+ combined_wins_1,
579
+ total_matches,
580
  )
581
+
582
+ st.plotly_chart(battle_fig, use_container_width=True)
583
+
584
+ # Replace the model-specific charts with the new integrated version
585
+ model_performance_chart = create_model_performance_chart(
586
+ matches_df,
587
+ device1,
588
+ device2,
589
+ device_display_names[device1],
590
+ device_display_names[device2],
591
  )
592
 
593
+ if model_performance_chart:
594
+ st.plotly_chart(
595
+ model_performance_chart, use_container_width=True
596
+ )
597
+
598
+ # Show the detailed match table
599
+ with st.expander("View Detailed Match Results", expanded=False):
600
+ st.markdown("#### All Match Data")
601
+
602
+ # Define display columns for Glicko-2
603
+ display_cols = [
604
+ "Model",
605
+ "Token Generation 1",
606
+ "Token Generation 2",
607
+ "Token Winner",
608
+ "Token Win Prob",
609
+ "Prompt Processing 1",
610
+ "Prompt Processing 2",
611
+ "Prompt Winner",
612
+ "Prompt Win Prob",
613
+ "Combined Winner",
614
+ "Combined Win Prob",
615
+ "Platform 1",
616
+ "Platform 2",
617
+ ]
618
+
619
+ # Ensure all columns exist in the dataframe
620
+ valid_cols = [
621
+ col for col in display_cols if col in matches_df.columns
622
+ ]
623
+
624
+ if valid_cols:
625
+ # Rename some columns for better display
626
+ matches_display = matches_df[valid_cols].copy()
627
+
628
+ # Define a rename mapping but only apply for columns that exist
629
+ rename_mapping = {
630
+ "Token Generation 1": f"{device_display_names[device1]} Token Gen",
631
+ "Token Generation 2": f"{device_display_names[device2]} Token Gen",
632
+ "Prompt Processing 1": f"{device_display_names[device1]} Prompt Proc",
633
+ "Prompt Processing 2": f"{device_display_names[device2]} Prompt Proc",
634
+ "Platform 1": f"{device_display_names[device1]} Platform",
635
+ "Platform 2": f"{device_display_names[device2]} Platform",
636
+ "Token Win Prob": "Device 1 Token Win Prob",
637
+ "Prompt Win Prob": "Device 1 Prompt Win Prob",
638
+ "Combined Win Prob": "Device 1 Combined Win Prob",
639
+ }
640
+
641
+ # Only rename columns that exist in the dataframe
642
+ rename_filtered = {
643
+ k: v
644
+ for k, v in rename_mapping.items()
645
+ if k in matches_display.columns
646
+ }
647
+ matches_display = matches_display.rename(
648
+ columns=rename_filtered
649
+ )
650
+
651
+ # Round any numeric columns for better display
652
+ for col in matches_display.columns:
653
+ if matches_display[col].dtype in [
654
+ "float64",
655
+ "float32",
656
+ ]:
657
+ matches_display[col] = matches_display[
658
+ col
659
+ ].round(2)
660
+
661
+ st.dataframe(
662
+ matches_display,
663
+ use_container_width=True,
664
+ height=400,
665
+ )
666
+ else:
667
+ st.warning(
668
+ "No valid columns found for display in the match data."
669
+ )
670
+
671
+ # # Platform breakdown if available
672
+ # if "Platform 2" in matches_df.columns:
673
+ # with st.expander("Platform Distribution", expanded=False):
674
+ # platform_counts = matches_df[
675
+ # "Platform 2"
676
+ # ].value_counts()
677
+ # st.bar_chart(platform_counts)
678
+ else:
679
+ st.warning("Winner information is missing from the match data.")
680
  else:
681
+ st.error(
682
  f"No matches found between {device_display_names[device1]} and {device_display_names[device2]}."
683
  )
684
  st.info(
src/components/header.py CHANGED
@@ -114,7 +114,7 @@ def render_header():
114
  <div class="logos-container">
115
  <img src="data:image/png;base64,{get_image_base64(pocketpal_logo_path)}" class="logo pocketpal" alt="PocketPal AI Logo">
116
  </div>
117
- <h1 class="header-title">AI Phone Leaderboard</h1>
118
  <p class="header-subtitle">Comparing Large Language Models performance across AI Phones. Powered by PocketPal AI.</p>
119
  </div>
120
  """
 
114
  <div class="logos-container">
115
  <img src="data:image/png;base64,{get_image_base64(pocketpal_logo_path)}" class="logo pocketpal" alt="PocketPal AI Logo">
116
  </div>
117
+ <h1 class="header-title">AI Phone Leaderboard</h1>
118
  <p class="header-subtitle">Comparing Large Language Models performance across AI Phones. Powered by PocketPal AI.</p>
119
  </div>
120
  """