Pratik Bhavsar committed on
Commit
c411387
·
1 Parent(s): 5f94245

cleaned up v1

Browse files
app.py CHANGED
@@ -14,8 +14,6 @@ from data_loader import (
14
  SCORES,
15
  )
16
  from tabs.leaderboard import create_leaderboard_tab, filter_leaderboard
17
- from tabs.model_comparison import create_model_comparison_tab, compare_models
18
- from tabs.data_exploration import create_exploration_tab, filter_and_display
19
 
20
 
21
  def create_app():
@@ -32,10 +30,6 @@ def create_app():
32
  df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS
33
  )
34
 
35
- mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT)
36
-
37
- exp_outputs = create_exploration_tab(df)
38
-
39
  # Initial loads
40
  app.load(
41
  fn=lambda: filter_leaderboard(
@@ -44,26 +38,6 @@ def create_app():
44
  outputs=[lb_output, lb_plot1, lb_plot2],
45
  )
46
 
47
- app.load(
48
- fn=lambda: compare_models(
49
- df, [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
50
- ),
51
- outputs=[mc_info, mc_plot],
52
- )
53
-
54
- app.load(
55
- fn=lambda: filter_and_display(
56
- MODELS[0],
57
- DATASETS[0],
58
- min(SCORES),
59
- max(SCORES),
60
- 0,
61
- 0,
62
- 0,
63
- ),
64
- outputs=exp_outputs[:-1],
65
- )
66
-
67
  return app
68
 
69
 
 
14
  SCORES,
15
  )
16
  from tabs.leaderboard import create_leaderboard_tab, filter_leaderboard
 
 
17
 
18
 
19
  def create_app():
 
30
  df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS
31
  )
32
 
 
 
 
 
33
  # Initial loads
34
  app.load(
35
  fn=lambda: filter_leaderboard(
 
38
  outputs=[lb_output, lb_plot1, lb_plot2],
39
  )
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  return app
42
 
43
 
data_loader.py CHANGED
@@ -23,7 +23,7 @@ SCORES = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
23
 
24
  def load_data():
25
  """Load and preprocess the data."""
26
- df = pd.read_csv("results.csv").dropna()
27
 
28
  # Add combined I/O cost column with 3:1 ratio
29
  df["IO Cost"] = (
@@ -596,7 +596,7 @@ HEADER_CONTENT = (
596
  <div class="header-content">
597
  <div class="title-section">
598
 
599
- <div class="title-gradient">Agent Leaderboard</div>
600
 
601
  <div class="description">
602
  GenAI is evolving rapidly with developers building high ROI agents. <br>
 
23
 
24
  def load_data():
25
  """Load and preprocess the data."""
26
+ df = pd.read_csv("results_v1.csv").dropna()
27
 
28
  # Add combined I/O cost column with 3:1 ratio
29
  df["IO Cost"] = (
 
596
  <div class="header-content">
597
  <div class="title-section">
598
 
599
+ <div class="title-gradient">Agent Leaderboard v1</div>
600
 
601
  <div class="description">
602
  GenAI is evolving rapidly with developers building high ROI agents. <br>
results.csv → results_v1.csv RENAMED
File without changes
tabs/data_exploration.py DELETED
@@ -1,810 +0,0 @@
1
- import gradio as gr
2
- import pandas as pd
3
- import numpy as np
4
- from data_loader import MODELS, DATASETS, SCORES, HEADER_CONTENT
5
- from chat import (
6
- format_chat_display,
7
- format_metrics_display,
8
- format_tool_info,
9
- )
10
-
11
-
12
- def get_updated_df(df, df_output):
13
- df = df.iloc[: len(df_output)].copy()
14
- df["response"] = df_output["response"].tolist()
15
- df["rationale"] = df_output["rationale"].tolist()
16
- df["explanation"] = df_output["explanation"].tolist()
17
- df["score"] = df_output["score"].tolist()
18
- cols = [
19
- "conversation",
20
- "tools_langchain",
21
- "n_turns",
22
- "len_query",
23
- "n_tools",
24
- "response",
25
- "rationale",
26
- "explanation",
27
- "score",
28
- ]
29
- return df[cols]
30
-
31
-
32
- def get_chat_and_score_df(model, dataset):
33
- df_output = pd.read_parquet(f"output/{model}/{dataset}.parquet")
34
- df = pd.read_parquet(f"datasets/{dataset}.parquet")
35
- df = get_updated_df(df, df_output)
36
- return df
37
-
38
-
39
- def on_filter_change(
40
- model,
41
- dataset,
42
- min_score,
43
- max_score,
44
- min_n_turns,
45
- min_len_query,
46
- min_n_tools,
47
- ):
48
- try:
49
- # Call filter_and_display with index 0 and unpack 4 values
50
- chat_html, metrics_html, tool_html, index_html = filter_and_display(
51
- model,
52
- dataset,
53
- min_score,
54
- max_score,
55
- min_n_turns,
56
- min_len_query,
57
- min_n_tools,
58
- 0,
59
- )
60
- # Return exactly 4 values
61
- return chat_html, metrics_html, tool_html, index_html
62
- except Exception as e:
63
- error_html = f"""
64
- <div style="padding: 1.5rem; color: var(--score-low);">
65
- <div style="font-weight: 600;">Filter Error</div>
66
- <div style="font-family: monospace; background-color: var(--surface-color-alt); padding: 0.5rem; margin-top: 0.5rem;">
67
- {str(e)}
68
- </div>
69
- </div>
70
- """
71
- return (
72
- error_html,
73
- "<div style='text-align: center;'>No metrics available</div>",
74
- "<div style='text-align: center;'>No tool information available</div>",
75
- "<div style='text-align: center;'>0/0</div>",
76
- )
77
-
78
-
79
- def navigate_prev(
80
- current_idx,
81
- model,
82
- dataset,
83
- min_score,
84
- max_score,
85
- min_n_turns,
86
- min_len_query,
87
- min_n_tools,
88
- ):
89
- try:
90
- # Handle current_idx as dictionary
91
- if isinstance(current_idx, dict) and "value" in current_idx:
92
- idx_val = int(current_idx["value"])
93
- else:
94
- idx_val = int(current_idx) if current_idx is not None else 0
95
-
96
- new_index = max(0, idx_val - 1)
97
-
98
- chat_html, metrics_html, tool_html, index_html = filter_and_display(
99
- model,
100
- dataset,
101
- min_score,
102
- max_score,
103
- min_n_turns,
104
- min_len_query,
105
- min_n_tools,
106
- new_index,
107
- )
108
- return chat_html, metrics_html, tool_html, index_html, new_index
109
- except Exception as e:
110
- error_html = f"""
111
- <div style="padding: 1.5rem; color: var(--score-low);">
112
- <div style="font-weight: 600;">Navigation Error</div>
113
- <div style="font-family: monospace; background-color: var(--surface-color-alt); padding: 0.5rem; margin-top: 0.5rem;">
114
- {str(e)}
115
- </div>
116
- </div>
117
- """
118
- return (
119
- error_html,
120
- "<div style='text-align: center;'>No metrics available</div>",
121
- "<div style='text-align: center;'>No tool information available</div>",
122
- "<div style='text-align: center;'>0/0</div>",
123
- current_idx or 0,
124
- )
125
-
126
-
127
- def navigate_next(
128
- current_idx,
129
- model,
130
- dataset,
131
- min_score,
132
- max_score,
133
- min_n_turns,
134
- min_len_query,
135
- min_n_tools,
136
- ):
137
- try:
138
- # Handle current_idx as dictionary
139
- if isinstance(current_idx, dict) and "value" in current_idx:
140
- idx_val = int(current_idx["value"])
141
- else:
142
- idx_val = int(current_idx) if current_idx is not None else 0
143
-
144
- new_index = idx_val + 1
145
-
146
- chat_html, metrics_html, tool_html, index_html = filter_and_display(
147
- model,
148
- dataset,
149
- min_score,
150
- max_score,
151
- min_n_turns,
152
- min_len_query,
153
- min_n_tools,
154
- new_index,
155
- )
156
- return chat_html, metrics_html, tool_html, index_html, new_index
157
- except Exception as e:
158
- error_html = f"""
159
- <div style="padding: 1.5rem; color: var(--score-low);">
160
- <div style="font-weight: 600;">Navigation Error</div>
161
- <div style="font-family: monospace; background-color: var(--surface-color-alt); padding: 0.5rem; margin-top: 0.5rem;">
162
- {str(e)}
163
- </div>
164
- </div>
165
- """
166
- return (
167
- error_html,
168
- "<div style='text-align: center;'>No metrics available</div>",
169
- "<div style='text-align: center;'>No tool information available</div>",
170
- "<div style='text-align: center;'>0/0</div>",
171
- current_idx or 0,
172
- )
173
-
174
-
175
- def filter_and_display(
176
- model,
177
- dataset,
178
- min_score,
179
- max_score,
180
- min_n_turns,
181
- min_len_query,
182
- min_n_tools,
183
- index=0,
184
- ):
185
- """Combined function to filter data and update display"""
186
- try:
187
- # Extract model
188
- if isinstance(model, dict):
189
- if "value" in model:
190
- model_str = str(model["value"])
191
- else:
192
- model_str = MODELS[0]
193
- else:
194
- model_str = str(model) if model is not None else MODELS[0]
195
-
196
- # Extract dataset
197
- if isinstance(dataset, dict):
198
- if "value" in dataset:
199
- dataset_str = str(dataset["value"])
200
- else:
201
- dataset_str = DATASETS[0]
202
- else:
203
- dataset_str = str(dataset) if dataset is not None else DATASETS[0]
204
-
205
- # Extract min_score
206
- if isinstance(min_score, dict):
207
- if "value" in min_score:
208
- min_score_val = float(min_score["value"])
209
- else:
210
- min_score_val = float(min(SCORES))
211
- else:
212
- min_score_val = (
213
- float(min_score) if min_score is not None else float(min(SCORES))
214
- )
215
-
216
- # Extract max_score
217
- if isinstance(max_score, dict):
218
- if "value" in max_score:
219
- max_score_val = float(max_score["value"])
220
- else:
221
- max_score_val = float(max(SCORES))
222
- else:
223
- max_score_val = (
224
- float(max_score) if max_score is not None else float(max(SCORES))
225
- )
226
-
227
- # Extract min_n_turns
228
- if isinstance(min_n_turns, dict):
229
- if "value" in min_n_turns:
230
- min_n_turns_val = int(min_n_turns["value"])
231
- else:
232
- min_n_turns_val = 0
233
- else:
234
- min_n_turns_val = int(min_n_turns) if min_n_turns is not None else 0
235
-
236
- # Extract min_len_query
237
- if isinstance(min_len_query, dict):
238
- if "value" in min_len_query:
239
- min_len_query_val = int(min_len_query["value"])
240
- else:
241
- min_len_query_val = 0
242
- else:
243
- min_len_query_val = int(min_len_query) if min_len_query is not None else 0
244
-
245
- # Extract min_n_tools
246
- if isinstance(min_n_tools, dict):
247
- if "value" in min_n_tools:
248
- min_n_tools_val = int(min_n_tools["value"])
249
- else:
250
- min_n_tools_val = 0
251
- else:
252
- min_n_tools_val = int(min_n_tools) if min_n_tools is not None else 0
253
-
254
- # Extract index
255
- if isinstance(index, dict):
256
- if "value" in index:
257
- try:
258
- index_val = int(index["value"])
259
- except (ValueError, TypeError):
260
- index_val = 0
261
- else:
262
- index_val = 0
263
- else:
264
- try:
265
- index_val = int(index) if index is not None else 0
266
- except (ValueError, TypeError):
267
- index_val = 0
268
-
269
- # Get the data
270
- df_chat = get_chat_and_score_df(model_str, dataset_str)
271
-
272
- # Ensure filter columns exist
273
- for col, default in [
274
- ("score", 0.0),
275
- ("n_turns", 0),
276
- ("len_query", 0),
277
- ("n_tools", 0),
278
- ]:
279
- if col not in df_chat.columns:
280
- df_chat[col] = default
281
- else:
282
- df_chat[col] = pd.to_numeric(df_chat[col], errors="coerce").fillna(
283
- default
284
- )
285
-
286
- # Apply all filters
287
- df_filtered = df_chat[
288
- (df_chat["score"] >= min_score_val)
289
- & (df_chat["score"] <= max_score_val)
290
- & (df_chat["n_turns"] >= min_n_turns_val)
291
- & (df_chat["len_query"] >= min_len_query_val)
292
- & (df_chat["n_tools"] >= min_n_tools_val)
293
- ].copy()
294
-
295
- # Check if dataframe is empty
296
- if len(df_filtered) == 0:
297
- empty_message = """
298
- <div style="
299
- padding: 1.5rem;
300
- text-align: center;
301
- color: var(--text-muted);
302
- background-color: var(--surface-color-alt);
303
- border-radius: 8px;
304
- border: 1px dashed var(--border-color);
305
- margin: 1rem 0;">
306
- <div style="font-size: 2rem; margin-bottom: 1rem;">📭</div>
307
- <div style="font-weight: 500; margin-bottom: 0.5rem;">No Results Found</div>
308
- <div style="font-style: italic; font-size: 0.9rem;">Try adjusting your filters to see more data</div>
309
- </div>
310
- """
311
- return (
312
- empty_message,
313
- empty_message,
314
- empty_message,
315
- "<div style='text-align: center; color: var(--text-muted);'>0/0</div>",
316
- )
317
-
318
- # Ensure index is valid
319
- max_index = len(df_filtered) - 1
320
- valid_index = max(0, min(index_val, max_index))
321
-
322
- # Get the row
323
- row = df_filtered.iloc[valid_index]
324
-
325
- # Format displays
326
- chat_html = format_chat_display(row)
327
- metrics_html = format_metrics_display(row)
328
-
329
- # Get tools info with error handling
330
- try:
331
- tool_html = format_tool_info(row["tools_langchain"])
332
- except Exception as e:
333
- tool_html = f"""
334
- <div style="padding: 1rem; background-color: var(--surface-color-alt); border-radius: 8px; color: var(--text-muted);">
335
- <div style="font-weight: 500; margin-bottom: 0.5rem;">Tool Information Unavailable</div>
336
- <div style="font-size: 0.9rem;">Error: {str(e)}</div>
337
- </div>
338
- """
339
-
340
- # Index display
341
- index_html = f"""
342
- <div style="
343
- display: flex;
344
- align-items: center;
345
- justify-content: center;
346
- font-weight: 500;
347
- color: var(--primary-text);
348
- background-color: var(--surface-color-alt);
349
- padding: 0.5rem 1rem;
350
- border-radius: 20px;
351
- font-size: 0.9rem;
352
- width: fit-content;
353
- margin: 0 auto;">
354
- <span style="margin-right: 0.5rem;">📄</span>{valid_index + 1}/{len(df_filtered)}
355
- </div>
356
- """
357
-
358
- return chat_html, metrics_html, tool_html, index_html
359
-
360
- except Exception as e:
361
- error_html = f"""
362
- <div style="
363
- padding: 1.5rem;
364
- color: var(--score-low);
365
- background-color: var(--surface-color);
366
- border: 1px solid var(--score-low);
367
- border-radius: 8px;
368
- margin: 1rem 0;
369
- display: flex;
370
- align-items: flex-start;">
371
- <div style="flex-shrink: 0; margin-right: 1rem; font-size: 1.5rem;">⚠️</div>
372
- <div>
373
- <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Occurred</div>
374
- <div style="
375
- font-family: monospace;
376
- background-color: var(--surface-color-alt);
377
- padding: 1rem;
378
- border-radius: 4px;
379
- white-space: pre-wrap;
380
- font-size: 0.9rem;">
381
- {str(e)}
382
- </div>
383
- </div>
384
- </div>
385
- """
386
- return (
387
- error_html,
388
- "<div style='padding: 1.5rem; color: var(--text-muted); text-align: center;'>No metrics available</div>",
389
- "<div style='padding: 1.5rem; color: var(--text-muted); text-align: center;'>No tool information available</div>",
390
- "<div style='text-align: center; color: var(--text-muted);'>0/0</div>",
391
- )
392
-
393
-
394
- def create_exploration_tab(df):
395
- """Create an enhanced data exploration tab with better UI and functionality."""
396
-
397
- # Main UI setup
398
- with gr.Tab("Data Exploration"):
399
- # CSS styling (unchanged)
400
- gr.HTML(
401
- """
402
- <style>
403
- /* Custom styling for the exploration tab */
404
- :root[data-theme="light"] {
405
- --surface-color: #f8f9fa;
406
- --surface-color-alt: #ffffff;
407
- --text-color: #202124;
408
- --text-muted: #666666;
409
- --primary-text: #1a73e8;
410
- --primary-text-light: rgba(26, 115, 232, 0.3);
411
- --border-color: #e9ecef;
412
- --border-color-light: #f1f3f5;
413
- --shadow-color: rgba(0,0,0,0.05);
414
- --message-bg-user: #E5F6FD;
415
- --message-bg-assistant: #F7F7F8;
416
- --message-bg-system: #FFF3E0;
417
- --response-bg: #F0F7FF;
418
- --score-high: #1a73e8;
419
- --score-med: #f4b400;
420
- --score-low: #ea4335;
421
- }
422
-
423
- :root[data-theme="dark"] {
424
- --surface-color: #1e1e1e;
425
- --surface-color-alt: #2d2d2d;
426
- --text-color: #ffffff;
427
- --text-muted: #a0a0a0;
428
- --primary-text: #60a5fa;
429
- --primary-text-light: rgba(96, 165, 250, 0.3);
430
- --border-color: #404040;
431
- --border-color-light: #333333;
432
- --shadow-color: rgba(0,0,0,0.2);
433
- --message-bg-user: #2d3748;
434
- --message-bg-assistant: #1a1a1a;
435
- --message-bg-system: #2c2516;
436
- --response-bg: #1e2a3a;
437
- --score-high: #60a5fa;
438
- --score-med: #fbbf24;
439
- --score-low: #ef4444;
440
- }
441
-
442
- #exploration-header {
443
- margin-bottom: 1.5rem;
444
- padding-bottom: 1rem;
445
- border-bottom: 1px solid var(--border-color);
446
- }
447
-
448
- .filter-container {
449
- background-color: var(--surface-color);
450
- border-radius: 10px;
451
- padding: 1rem;
452
- margin-bottom: 1.5rem;
453
- border: 1px solid var(--border-color);
454
- box-shadow: 0 2px 6px var(--shadow-color);
455
- }
456
-
457
- .navigation-buttons button {
458
- min-width: 120px;
459
- font-weight: 500;
460
- }
461
-
462
- .content-panel {
463
- margin-top: 1.5rem;
464
- }
465
-
466
- @media (max-width: 768px) {
467
- .filter-row {
468
- flex-direction: column;
469
- }
470
- }
471
- </style>
472
- """
473
- )
474
-
475
- # Header
476
- with gr.Row(elem_id="exploration-header"):
477
- gr.HTML(HEADER_CONTENT)
478
-
479
- # Filters section
480
- with gr.Column(elem_classes="filter-container"):
481
- gr.Markdown("### 🔍 Filter Options")
482
-
483
- with gr.Row(equal_height=True, elem_classes="filter-row"):
484
- explore_model = gr.Dropdown(
485
- choices=MODELS,
486
- value=MODELS[0],
487
- label="Model",
488
- container=True,
489
- scale=1,
490
- info="Select AI model",
491
- )
492
- explore_dataset = gr.Dropdown(
493
- choices=DATASETS,
494
- value=DATASETS[0],
495
- label="Dataset",
496
- container=True,
497
- scale=1,
498
- info="Select evaluation dataset",
499
- )
500
-
501
- with gr.Row(equal_height=True, elem_classes="filter-row"):
502
- min_score = gr.Slider(
503
- minimum=float(min(SCORES)),
504
- maximum=float(max(SCORES)),
505
- value=float(min(SCORES)),
506
- step=0.1,
507
- label="Minimum TSQ Score",
508
- container=True,
509
- scale=1,
510
- info="Filter responses with scores above this threshold",
511
- )
512
- max_score = gr.Slider(
513
- minimum=float(min(SCORES)),
514
- maximum=float(max(SCORES)),
515
- value=float(max(SCORES)),
516
- step=0.1,
517
- label="Maximum TSQ Score",
518
- container=True,
519
- scale=1,
520
- info="Filter responses with scores below this threshold",
521
- )
522
-
523
- # Get the data for initial ranges
524
- df_chat = get_chat_and_score_df(explore_model.value, explore_dataset.value)
525
-
526
- # Ensure columns exist and get ranges
527
- n_turns_max = int(df_chat["n_turns"].max())
528
- len_query_max = int(df_chat["len_query"].max())
529
- n_tools_max = int(df_chat["n_tools"].max())
530
-
531
- with gr.Row(equal_height=True, elem_classes="filter-row"):
532
- n_turns_filter = gr.Slider(
533
- minimum=0,
534
- maximum=n_turns_max,
535
- value=0,
536
- step=1,
537
- label="Minimum Turn Count",
538
- container=True,
539
- scale=1,
540
- info="Filter by minimum number of conversation turns",
541
- )
542
-
543
- len_query_filter = gr.Slider(
544
- minimum=0,
545
- maximum=len_query_max,
546
- value=0,
547
- step=10,
548
- label="Minimum Query Length",
549
- container=True,
550
- scale=1,
551
- info="Filter by minimum length of query in characters",
552
- )
553
-
554
- n_tools_filter = gr.Slider(
555
- minimum=0,
556
- maximum=n_tools_max,
557
- value=0,
558
- step=1,
559
- label="Minimum Tool Count",
560
- container=True,
561
- scale=1,
562
- info="Filter by minimum number of tools used",
563
- )
564
-
565
- with gr.Row():
566
- reset_btn = gr.Button("Reset Filters", size="sm", variant="secondary")
567
-
568
- # Navigation row
569
- with gr.Row(variant="panel"):
570
- with gr.Column(scale=1):
571
- prev_btn = gr.Button(
572
- "← Previous",
573
- size="lg",
574
- variant="secondary",
575
- elem_classes="navigation-buttons",
576
- )
577
-
578
- with gr.Column(scale=1, min_width=100):
579
- # Get initial count from default data
580
- df_initial = get_chat_and_score_df(MODELS[0], DATASETS[0])
581
- initial_count = len(df_initial)
582
-
583
- index_display = gr.HTML(
584
- value=f"""<div style="
585
- display: flex;
586
- align-items: center;
587
- justify-content: center;
588
- font-weight: 500;
589
- color: var(--primary-text);
590
- background-color: var(--surface-color-alt);
591
- padding: 0.5rem 1rem;
592
- border-radius: 20px;
593
- font-size: 0.9rem;
594
- width: fit-content;
595
- margin: 0 auto;">
596
- <span style="margin-right: 0.5rem;">📄</span>1/{initial_count}
597
- </div>""",
598
- elem_id="index-display",
599
- )
600
-
601
- with gr.Column(scale=1):
602
- next_btn = gr.Button(
603
- "Next →",
604
- size="lg",
605
- variant="secondary",
606
- elem_classes="navigation-buttons",
607
- )
608
-
609
- # Content areas
610
- with gr.Row(equal_height=True):
611
- with gr.Column(scale=1):
612
- chat_display = gr.HTML()
613
- with gr.Column(scale=1):
614
- metrics_display = gr.HTML()
615
-
616
- with gr.Row():
617
- tool_info_display = gr.HTML()
618
-
619
- # State for tracking current index (simple integer state)
620
- current_index = gr.State(value=0)
621
-
622
- def reset_index():
623
- """Reset the current index to 0"""
624
- return 0
625
-
626
- # Add these explicit event handlers for model and dataset changes
627
- explore_model.change(
628
- reset_index,
629
- inputs=[],
630
- outputs=[current_index],
631
- )
632
-
633
- explore_dataset.change(
634
- reset_index,
635
- inputs=[],
636
- outputs=[current_index],
637
- )
638
-
639
- min_score.change(
640
- reset_index,
641
- inputs=[],
642
- outputs=[current_index],
643
- )
644
-
645
- max_score.change(
646
- reset_index,
647
- inputs=[],
648
- outputs=[current_index],
649
- )
650
-
651
- n_turns_filter.change(
652
- reset_index,
653
- inputs=[],
654
- outputs=[current_index],
655
- )
656
-
657
- len_query_filter.change(
658
- reset_index,
659
- inputs=[],
660
- outputs=[current_index],
661
- )
662
-
663
- n_tools_filter.change(
664
- reset_index,
665
- inputs=[],
666
- outputs=[current_index],
667
- )
668
-
669
- # Reset filters
670
- def reset_filters():
671
- return (
672
- MODELS[0],
673
- DATASETS[0],
674
- float(min(SCORES)),
675
- float(max(SCORES)),
676
- 0, # n_turns
677
- 0, # len_query
678
- 0, # n_tools
679
- )
680
-
681
- reset_btn.click(
682
- reset_filters,
683
- outputs=[
684
- explore_model,
685
- explore_dataset,
686
- min_score,
687
- max_score,
688
- n_turns_filter,
689
- len_query_filter,
690
- n_tools_filter,
691
- ],
692
- )
693
-
694
- # Connect filter changes
695
- # Replace the existing filter connections with this:
696
- for control in [
697
- explore_model,
698
- explore_dataset,
699
- min_score,
700
- max_score,
701
- n_turns_filter,
702
- len_query_filter,
703
- n_tools_filter,
704
- ]:
705
- control.change(
706
- on_filter_change,
707
- inputs=[
708
- explore_model,
709
- explore_dataset,
710
- min_score,
711
- max_score,
712
- n_turns_filter,
713
- len_query_filter,
714
- n_tools_filter,
715
- ],
716
- outputs=[
717
- chat_display,
718
- metrics_display,
719
- tool_info_display,
720
- index_display,
721
- ],
722
- )
723
-
724
- # Connect navigation buttons with necessary filter parameters
725
- prev_btn.click(
726
- navigate_prev,
727
- inputs=[
728
- current_index,
729
- explore_model,
730
- explore_dataset,
731
- min_score,
732
- max_score,
733
- n_turns_filter,
734
- len_query_filter,
735
- n_tools_filter,
736
- ],
737
- outputs=[
738
- chat_display,
739
- metrics_display,
740
- tool_info_display,
741
- index_display,
742
- current_index,
743
- ],
744
- )
745
-
746
- next_btn.click(
747
- navigate_next,
748
- inputs=[
749
- current_index,
750
- explore_model,
751
- explore_dataset,
752
- min_score,
753
- max_score,
754
- n_turns_filter,
755
- len_query_filter,
756
- n_tools_filter,
757
- ],
758
- outputs=[
759
- chat_display,
760
- metrics_display,
761
- tool_info_display,
762
- index_display,
763
- current_index,
764
- ],
765
- )
766
-
767
- def update_slider_ranges(model, dataset):
768
- df_chat = get_chat_and_score_df(model, dataset)
769
-
770
- # Make sure columns are numeric first
771
- df_chat["n_turns"] = pd.to_numeric(
772
- df_chat["n_turns"], errors="coerce"
773
- ).fillna(0)
774
- df_chat["len_query"] = pd.to_numeric(
775
- df_chat["len_query"], errors="coerce"
776
- ).fillna(0)
777
- df_chat["n_tools"] = pd.to_numeric(
778
- df_chat["n_tools"], errors="coerce"
779
- ).fillna(0)
780
-
781
- # Calculate maximums with safety buffers
782
- n_turns_max = max(1, int(df_chat["n_turns"].max()))
783
- len_query_max = max(10, int(df_chat["len_query"].max()))
784
- n_tools_max = max(1, int(df_chat["n_tools"].max()))
785
-
786
- # Return updated sliders using gr.update()
787
- return (
788
- gr.update(maximum=n_turns_max, value=0),
789
- gr.update(maximum=len_query_max, value=0),
790
- gr.update(maximum=n_tools_max, value=0),
791
- )
792
-
793
- # Connect model and dataset changes to slider range updates
794
- explore_model.change(
795
- update_slider_ranges,
796
- inputs=[explore_model, explore_dataset],
797
- outputs=[n_turns_filter, len_query_filter, n_tools_filter],
798
- )
799
- explore_dataset.change(
800
- update_slider_ranges,
801
- inputs=[explore_model, explore_dataset],
802
- outputs=[n_turns_filter, len_query_filter, n_tools_filter],
803
- )
804
-
805
- return [
806
- chat_display,
807
- metrics_display,
808
- tool_info_display,
809
- index_display,
810
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tabs/leaderboard.py CHANGED
@@ -156,7 +156,7 @@ def filter_leaderboard(df, model_type, category, sort_by):
156
 
157
 
158
  def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
159
- with gr.Tab("Leaderboard"):
160
  gr.HTML(HEADER_CONTENT + CARDS)
161
  gr.HTML(DESCRIPTION_HTML)
162
 
 
156
 
157
 
158
  def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
159
+ with gr.Tab("Leaderboard v1"):
160
  gr.HTML(HEADER_CONTENT + CARDS)
161
  gr.HTML(DESCRIPTION_HTML)
162