Pratik Bhavsar committed on
Commit
d3c87a6
·
1 Parent(s): c411387

working v2

Browse files
app.py CHANGED
@@ -9,26 +9,30 @@ from data_loader import (
9
  CATEGORIES,
10
  METHODOLOGY,
11
  HEADER_CONTENT,
12
- CARDS,
13
- DATASETS,
14
- SCORES,
15
  )
16
- from tabs.leaderboard import create_leaderboard_tab, filter_leaderboard
 
17
 
18
 
19
  def create_app():
20
  df = load_data()
21
 
22
- MODELS = [x.strip() for x in df["Model"].unique().tolist()]
23
-
24
  with gr.Blocks(
25
- theme=gr.themes.Soft(font=[gr.themes.GoogleFont("sans-serif")])
26
  ) as app:
27
  with gr.Tabs():
28
- # Create tabs
29
- lb_output, lb_plot1, lb_plot2 = create_leaderboard_tab(
30
- df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS
31
- )
 
 
 
 
 
 
 
32
 
33
  # Initial loads
34
  app.load(
 
9
  CATEGORIES,
10
  METHODOLOGY,
11
  HEADER_CONTENT,
12
+ CARDS
 
 
13
  )
14
+ from tabs.leaderboard_v1 import create_leaderboard_tab, filter_leaderboard
15
+ from tabs.leaderboard_v2 import create_leaderboard_v2_interface
16
 
17
 
18
  def create_app():
19
  df = load_data()
20
 
 
 
21
  with gr.Blocks(
22
+ theme=gr.themes.Default(primary_hue=gr.themes.colors.red)
23
  ) as app:
24
  with gr.Tabs():
25
+
26
+ # Create v2 tab
27
+ with gr.Tab("Leaderboard v2"):
28
+ create_leaderboard_v2_interface()
29
+
30
+ # Create v1 tab
31
+ with gr.Tab("Leaderboard v1"):
32
+ lb_output, lb_plot1, lb_plot2 = create_leaderboard_tab(
33
+ df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS
34
+ )
35
+
36
 
37
  # Initial loads
38
  app.load(
results_v2.csv ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,Vendor,Avg AC,Avg TSQ,Avg Total Cost,Avg Session Duration,Avg Turns,Banking AC,Healthcare AC,Insurance AC,Investment AC,Telecom AC,Banking TSQ,Healthcare TSQ,Insurance TSQ,Investment TSQ,Telecom TSQ,Avg Input Cost ($),Avg Output Cost ($),Banking Cost,Healthcare Cost,Insurance Cost,Investment Cost,Telecom Cost,Banking Duration,Healthcare Duration,Insurance Duration,Investment Duration,Telecom Duration,Banking Turns,Healthcare Turns,Insurance Turns,Investment Turns,Telecom Turns,Model Type,$/M input token,$/M output token,Output Type
2
+ gpt-4.1-2025-04-14,OpenAI,0.62,0.8,0.0684,24.32,3.1,0.6,0.62,0.66,0.64,0.58,0.81,0.83,0.68,0.88,0.82,0.0577,0.0107,0.052,0.0711,0.0629,0.0777,0.0783,18.52,24.4,25.24,27.88,25.58,2.61,3.15,2.92,3.3,3.48,Proprietary,2.0,8.0,Normal
3
+ gpt-4.1-mini-2025-04-14,OpenAI,0.56,0.79,0.0141,26.0,3.43,0.56,0.6,0.46,0.5,0.64,0.8,0.85,0.63,0.84,0.83,0.0123,0.0018,0.0115,0.0143,0.0131,0.0164,0.0156,21.28,26.82,23.32,30.5,28.07,2.99,3.32,3.28,3.76,3.79,Proprietary,0.4,1.6,Normal
4
+ claude-sonnet-4-20250514,Anthropic,0.55,0.92,0.1537,66.6,2.89,0.58,0.62,0.53,0.49,0.53,0.9,0.95,0.93,0.92,0.9,0.1212,0.0325,0.1359,0.1542,0.1442,0.1669,0.1675,55.36,57.93,56.87,86.44,76.38,2.54,2.84,2.79,3.06,3.22,Proprietary,3.0,15.0,Normal
5
+ qwen2.5-72b-instruct,Alibaba,0.51,0.8,0.0361,34.68,2.65,0.48,0.61,0.52,0.42,0.52,0.78,0.84,0.77,0.82,0.79,0.0338,0.0023,0.0292,0.0348,0.0338,0.0417,0.0415,27.34,29.09,30.46,41.32,45.2,2.3,2.46,2.47,3.0,3.0,Open source,0.9,0.9,Normal
6
+ gemini-2.5-pro,Google,0.43,0.86,0.1447,125.85,3.57,0.45,0.4,0.54,0.31,0.44,0.88,0.87,0.87,0.85,0.83,0.0442,0.1005,0.1253,0.1475,0.1386,0.1464,0.1656,108.83,126.91,121.99,129.62,141.92,3.1,3.57,3.49,3.7,3.97,Proprietary,1.25,10.0,Reasoning
7
+ deepseek-v3,Deepseek,0.4,0.8,0.0141,59.97,3.71,0.38,0.32,0.48,0.36,0.47,0.8,0.74,0.76,0.87,0.81,0.0119,0.0022,0.0123,0.0158,0.0139,0.0151,0.0138,44.46,68.38,48.54,70.21,68.27,3.27,4.2,3.48,3.79,3.83,Open source,0.27,1.1,Normal
8
+ gemini-2.5-flash,Google,0.38,0.94,0.0271,39.84,3.9,0.48,0.38,0.44,0.22,0.36,0.94,0.94,0.94,0.94,0.95,0.0123,0.0148,0.0248,0.0283,0.0273,0.0308,0.0241,33.03,36.81,38.24,42.78,48.34,3.53,3.96,3.78,4.28,3.98,Proprietary,0.3,2.5,Reasoning
9
+ gpt-4.1-nano-2025-04-14,OpenAI,0.38,0.63,0.0038,12.36,3.56,0.4,0.4,0.41,0.29,0.38,0.64,0.54,0.54,0.77,0.65,0.0034,0.0004,0.0029,0.0038,0.004,0.0042,0.0041,14.16,10.9,12.23,12.68,11.83,2.88,3.24,3.78,4.01,3.91,Proprietary,0.1,0.4,Normal
10
+ qwen3-235b-a22b,Alibaba,0.37,0.86,0.0106,133.24,2.86,0.36,0.33,0.41,0.3,0.44,0.88,0.86,0.85,0.84,0.85,0.0076,0.003,0.0087,0.0114,0.0114,0.0111,0.0105,117.72,137.4,135.24,147.35,128.48,2.43,2.99,3.04,3.01,2.83,Open source,0.2,0.6,Reasoning
11
+ magistral-medium-2506,Mistral,0.32,0.59,0.1182,32.96,4.4,0.3,0.35,0.38,0.26,0.3,0.59,0.67,0.56,0.63,0.51,0.108,0.0102,0.1067,0.0994,0.1077,0.1476,0.1294,24.98,35.81,33.33,39.18,31.49,4.21,3.46,3.92,5.36,5.07,Proprietary,2.0,5.0,Reasoning
12
+ nova-pro-v1,Amazon,0.29,0.65,0.0359,27.96,3.04,0.33,0.29,0.39,0.17,0.29,0.6,0.57,0.64,0.83,0.6,0.0316,0.0043,0.0304,0.0353,0.0359,0.04,0.038,23.45,27.94,27.9,32.09,28.43,2.72,2.88,2.99,3.36,3.26,Proprietary,0.8,3.2,Normal
13
+ mistral-small-2506,Mistral,0.26,0.71,0.0053,35.69,4.37,0.37,0.28,0.22,0.2,0.21,0.73,0.71,0.65,0.76,0.69,0.0049,0.0004,0.0041,0.0057,0.0054,0.0058,0.0056,30.64,36.02,30.83,41.96,39.02,3.3,4.47,4.52,4.87,4.67,Open source,0.1,0.3,Normal
14
+ caller,Arcee,0.16,0.65,0.0297,25.66,4.2,0.23,0.14,0.22,0.09,0.12,0.69,0.6,0.68,0.61,0.67,0.0282,0.0015,0.0262,0.0303,0.0305,0.0331,0.0286,22.83,25.54,26.42,29.66,23.85,3.76,4.19,4.18,4.75,4.14,Open source,0.55,0.85,Normal
15
+ nova-lite-v1,Amazon,0.16,0.55,0.0031,20.26,3.73,0.12,0.18,0.19,0.15,0.18,0.48,0.49,0.58,0.72,0.49,0.0027,0.0004,0.0026,0.0033,0.0031,0.0034,0.0029,17.53,20.61,19.67,24.28,19.2,3.31,4.13,3.57,4.04,3.62,Proprietary,0.06,0.24,Normal
16
+ magistral-small-2506,Mistral,0.16,0.53,0.0301,17.42,5.68,0.23,0.18,0.13,0.16,0.12,0.57,0.46,0.42,0.62,0.6,0.0275,0.0026,0.0245,0.0335,0.0302,0.034,0.0281,14.53,21.36,14.65,19.67,16.87,4.74,6.28,6.14,6.06,5.19,Open source,0.5,1.5,Reasoning
17
+ mistral-medium-2505,Mistral,0.16,0.52,0.0293,34.17,6.27,0.2,0.16,0.18,0.13,0.13,0.45,0.5,0.46,0.63,0.56,0.0256,0.0037,0.025,0.037,0.0328,0.0269,0.0251,30.07,39.7,36.76,31.84,32.49,5.61,7.75,7.08,5.68,5.23,Proprietary,0.4,2.0,Normal
tabs/{leaderboard.py → leaderboard_v1.py} RENAMED
@@ -156,51 +156,50 @@ def filter_leaderboard(df, model_type, category, sort_by):
156
 
157
 
158
  def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
159
- with gr.Tab("Leaderboard v1"):
160
- gr.HTML(HEADER_CONTENT + CARDS)
161
- gr.HTML(DESCRIPTION_HTML)
162
-
163
- # Filters row
164
- with gr.Row(equal_height=True):
165
- with gr.Column(scale=1):
166
- model_type = gr.Dropdown(
167
- choices=["All"] + df["Model Type"].unique().tolist(),
168
- value="All",
169
- label="Model Type",
170
- )
171
- with gr.Column(scale=1):
172
- category = gr.Dropdown(
173
- choices=list(CATEGORIES.keys()),
174
- value=list(CATEGORIES.keys())[0],
175
- label="Category",
176
- )
177
- with gr.Column(scale=1):
178
- sort_by = gr.Radio(
179
- choices=["Performance", "Cost"],
180
- value="Performance",
181
- label="Sort by",
182
- )
183
-
184
- # Content
185
- output = gr.HTML()
186
- plot1 = gr.Plot()
187
- plot2 = gr.Plot()
188
-
189
- gr.HTML(
190
- """<div class="note-box">
191
- <p style="margin: 0; font-size: 1em;">
192
- Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation. Pricing for open source models is either from Fireworks or Together.
193
- </p>
194
- </div>"""
195
- )
196
-
197
- gr.HTML(METHODOLOGY)
198
-
199
- for input_comp in [model_type, category, sort_by]:
200
- input_comp.change(
201
- fn=lambda m, c, s: filter_leaderboard(df, m, c, s),
202
- inputs=[model_type, category, sort_by],
203
- outputs=[output, plot1, plot2],
204
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
- return output, plot1, plot2
 
156
 
157
 
158
  def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
159
+ gr.HTML(HEADER_CONTENT + CARDS)
160
+ gr.HTML(DESCRIPTION_HTML)
161
+
162
+ # Filters row
163
+ with gr.Row(equal_height=True):
164
+ with gr.Column(scale=1):
165
+ model_type = gr.Dropdown(
166
+ choices=["All"] + df["Model Type"].unique().tolist(),
167
+ value="All",
168
+ label="Model Type",
169
+ )
170
+ with gr.Column(scale=1):
171
+ category = gr.Dropdown(
172
+ choices=list(CATEGORIES.keys()),
173
+ value=list(CATEGORIES.keys())[0],
174
+ label="Category",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  )
176
+ with gr.Column(scale=1):
177
+ sort_by = gr.Radio(
178
+ choices=["Performance", "Cost"],
179
+ value="Performance",
180
+ label="Sort by",
181
+ )
182
+
183
+ # Content
184
+ output = gr.HTML()
185
+ plot1 = gr.Plot()
186
+ plot2 = gr.Plot()
187
+
188
+ gr.HTML(
189
+ """<div class="note-box">
190
+ <p style="margin: 0; font-size: 1em;">
191
+ Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation. Pricing for open source models is either from Fireworks or Together.
192
+ </p>
193
+ </div>"""
194
+ )
195
+
196
+ gr.HTML(METHODOLOGY)
197
+
198
+ for input_comp in [model_type, category, sort_by]:
199
+ input_comp.change(
200
+ fn=lambda m, c, s: filter_leaderboard(df, m, c, s),
201
+ inputs=[model_type, category, sort_by],
202
+ outputs=[output, plot1, plot2],
203
+ )
204
 
205
+ return output, plot1, plot2
tabs/leaderboard_v2.py ADDED
The diff for this file is too large to render. See raw diff
 
tabs/model_comparison.py DELETED
@@ -1,73 +0,0 @@
1
- import gradio as gr
2
- from visualization import create_radar_plot
3
-
4
-
5
- def compare_models(df, model_names=None):
6
- if model_names is None or len(model_names) == 0:
7
- model_names = [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
8
-
9
- filtered_df = df[df["Model"].isin(model_names)]
10
- radar_chart = create_radar_plot(df, model_names)
11
-
12
- # Create styled table for model info
13
- info_html = f"""
14
- <div class="dark-table-container">
15
- <table class="dark-styled-table">
16
- <thead>
17
- <tr>
18
- <th>Model</th>
19
- <th>Type</th>
20
- <th>Average</th>
21
- <th>I/O Cost</th>
22
- <th>Single Turn</th>
23
- <th>Multi Turn</th>
24
- </tr>
25
- </thead>
26
- <tbody>
27
- """
28
-
29
- for _, row in filtered_df.iterrows():
30
- info_html += f"""
31
- <tr>
32
- <td>{row['Model']}</td>
33
- <td>{row['Model Type']}</td>
34
- <td>{row['Model Avg']:.3f}</td>
35
- <td>${row['IO Cost']:.2f}</td>
36
- <td>{row['single turn perf']:.3f}</td>
37
- <td>{row['multi turn perf']:.3f}</td>
38
- </tr>
39
- """
40
-
41
- info_html += """
42
- </tbody>
43
- </table>
44
- </div>
45
- """
46
-
47
- return info_html, radar_chart
48
-
49
-
50
- def create_model_comparison_tab(df, HEADER_CONTENT):
51
- with gr.Tab("Model Comparison"):
52
- gr.HTML(HEADER_CONTENT)
53
- with gr.Column():
54
- # Filters row
55
- with gr.Row(equal_height=True):
56
- model_selector = gr.Dropdown(
57
- choices=df["Model"].unique().tolist(),
58
- value=df.sort_values("Model Avg", ascending=False).iloc[0]["Model"],
59
- multiselect=True,
60
- label="Select Models to Compare",
61
- )
62
-
63
- # Content
64
- model_info = gr.HTML()
65
- radar_plot = gr.Plot()
66
-
67
- model_selector.change(
68
- fn=lambda m: compare_models(df, m),
69
- inputs=[model_selector],
70
- outputs=[model_info, radar_plot],
71
- )
72
-
73
- return model_info, radar_plot