Spaces · Running on CPU Upgrade

Pratik Bhavsar committed
Commit d3c87a6 · 1 Parent(s): c411387

working v2

Browse files
- app.py  +15 -11
- results_v2.csv  +17 -0
- tabs/{leaderboard.py → leaderboard_v1.py}  +45 -46
- tabs/leaderboard_v2.py  +0 -0
- tabs/model_comparison.py  +0 -73
app.py  CHANGED

@@ -9,26 +9,30 @@ from data_loader import (
     CATEGORIES,
     METHODOLOGY,
     HEADER_CONTENT,
-    CARDS
-    DATASETS,
-    SCORES,
+    CARDS
 )
-from tabs.
+from tabs.leaderboard_v1 import create_leaderboard_tab, filter_leaderboard
+from tabs.leaderboard_v2 import create_leaderboard_v2_interface


 def create_app():
     df = load_data()

-    MODELS = [x.strip() for x in df["Model"].unique().tolist()]
-
     with gr.Blocks(
-        theme=gr.themes.
+        theme=gr.themes.Default(primary_hue=gr.themes.colors.red)
     ) as app:
         with gr.Tabs():
-
-
-
-
+
+            # Create v2 tab
+            with gr.Tab("Leaderboard v2"):
+                create_leaderboard_v2_interface()
+
+            # Create v1 tab
+            with gr.Tab("Leaderboard v1"):
+                lb_output, lb_plot1, lb_plot2 = create_leaderboard_tab(
+                    df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS
+                )
+

         # Initial loads
         app.load(
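For reference, the new tab and theme wiring in app.py can be exercised on its own. The sketch below mirrors the structure added in this commit but replaces the real leaderboard modules with placeholder stubs (the stub bodies are assumptions for illustration, not code from this repo):

# Minimal standalone sketch of the tab/theme wiring; stubs stand in for the
# real leaderboard_v1 / leaderboard_v2 modules.
import gradio as gr

def create_leaderboard_v2_interface():
    gr.HTML("<h3>Leaderboard v2 placeholder</h3>")  # stub, hypothetical

def create_leaderboard_tab():
    gr.HTML("<h3>Leaderboard v1 placeholder</h3>")  # stub, hypothetical

with gr.Blocks(
    theme=gr.themes.Default(primary_hue=gr.themes.colors.red)  # same theme as the commit
) as app:
    with gr.Tabs():
        with gr.Tab("Leaderboard v2"):
            create_leaderboard_v2_interface()
        with gr.Tab("Leaderboard v1"):
            create_leaderboard_tab()

if __name__ == "__main__":
    app.launch()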
results_v2.csv  ADDED

@@ -0,0 +1,17 @@
+Model,Vendor,Avg AC,Avg TSQ,Avg Total Cost,Avg Session Duration,Avg Turns,Banking AC,Healthcare AC,Insurance AC,Investment AC,Telecom AC,Banking TSQ,Healthcare TSQ,Insurance TSQ,Investment TSQ,Telecom TSQ,Avg Input Cost ($),Avg Output Cost ($),Banking Cost,Healthcare Cost,Insurance Cost,Investment Cost,Telecom Cost,Banking Duration,Healthcare Duration,Insurance Duration,Investment Duration,Telecom Duration,Banking Turns,Healthcare Turns,Insurance Turns,Investment Turns,Telecom Turns,Model Type,$/M input token,$/M output token,Output Type
+gpt-4.1-2025-04-14,OpenAI,0.62,0.8,0.0684,24.32,3.1,0.6,0.62,0.66,0.64,0.58,0.81,0.83,0.68,0.88,0.82,0.0577,0.0107,0.052,0.0711,0.0629,0.0777,0.0783,18.52,24.4,25.24,27.88,25.58,2.61,3.15,2.92,3.3,3.48,Proprietary,2.0,8.0,Normal
+gpt-4.1-mini-2025-04-14,OpenAI,0.56,0.79,0.0141,26.0,3.43,0.56,0.6,0.46,0.5,0.64,0.8,0.85,0.63,0.84,0.83,0.0123,0.0018,0.0115,0.0143,0.0131,0.0164,0.0156,21.28,26.82,23.32,30.5,28.07,2.99,3.32,3.28,3.76,3.79,Proprietary,0.4,1.6,Normal
+claude-sonnet-4-20250514,Anthropic,0.55,0.92,0.1537,66.6,2.89,0.58,0.62,0.53,0.49,0.53,0.9,0.95,0.93,0.92,0.9,0.1212,0.0325,0.1359,0.1542,0.1442,0.1669,0.1675,55.36,57.93,56.87,86.44,76.38,2.54,2.84,2.79,3.06,3.22,Proprietary,3.0,15.0,Normal
+qwen2.5-72b-instruct,Alibaba,0.51,0.8,0.0361,34.68,2.65,0.48,0.61,0.52,0.42,0.52,0.78,0.84,0.77,0.82,0.79,0.0338,0.0023,0.0292,0.0348,0.0338,0.0417,0.0415,27.34,29.09,30.46,41.32,45.2,2.3,2.46,2.47,3.0,3.0,Open source,0.9,0.9,Normal
+gemini-2.5-pro,Google,0.43,0.86,0.1447,125.85,3.57,0.45,0.4,0.54,0.31,0.44,0.88,0.87,0.87,0.85,0.83,0.0442,0.1005,0.1253,0.1475,0.1386,0.1464,0.1656,108.83,126.91,121.99,129.62,141.92,3.1,3.57,3.49,3.7,3.97,Proprietary,1.25,10.0,Reasoning
+deepseek-v3,Deepseek,0.4,0.8,0.0141,59.97,3.71,0.38,0.32,0.48,0.36,0.47,0.8,0.74,0.76,0.87,0.81,0.0119,0.0022,0.0123,0.0158,0.0139,0.0151,0.0138,44.46,68.38,48.54,70.21,68.27,3.27,4.2,3.48,3.79,3.83,Open source,0.27,1.1,Normal
+gemini-2.5-flash,Google,0.38,0.94,0.0271,39.84,3.9,0.48,0.38,0.44,0.22,0.36,0.94,0.94,0.94,0.94,0.95,0.0123,0.0148,0.0248,0.0283,0.0273,0.0308,0.0241,33.03,36.81,38.24,42.78,48.34,3.53,3.96,3.78,4.28,3.98,Proprietary,0.3,2.5,Reasoning
+gpt-4.1-nano-2025-04-14,OpenAI,0.38,0.63,0.0038,12.36,3.56,0.4,0.4,0.41,0.29,0.38,0.64,0.54,0.54,0.77,0.65,0.0034,0.0004,0.0029,0.0038,0.004,0.0042,0.0041,14.16,10.9,12.23,12.68,11.83,2.88,3.24,3.78,4.01,3.91,Proprietary,0.1,0.4,Normal
+qwen3-235b-a22b,Alibaba,0.37,0.86,0.0106,133.24,2.86,0.36,0.33,0.41,0.3,0.44,0.88,0.86,0.85,0.84,0.85,0.0076,0.003,0.0087,0.0114,0.0114,0.0111,0.0105,117.72,137.4,135.24,147.35,128.48,2.43,2.99,3.04,3.01,2.83,Open source,0.2,0.6,Reasoning
+magistral-medium-2506,Mistral,0.32,0.59,0.1182,32.96,4.4,0.3,0.35,0.38,0.26,0.3,0.59,0.67,0.56,0.63,0.51,0.108,0.0102,0.1067,0.0994,0.1077,0.1476,0.1294,24.98,35.81,33.33,39.18,31.49,4.21,3.46,3.92,5.36,5.07,Proprietary,2.0,5.0,Reasoning
+nova-pro-v1,Amazon,0.29,0.65,0.0359,27.96,3.04,0.33,0.29,0.39,0.17,0.29,0.6,0.57,0.64,0.83,0.6,0.0316,0.0043,0.0304,0.0353,0.0359,0.04,0.038,23.45,27.94,27.9,32.09,28.43,2.72,2.88,2.99,3.36,3.26,Proprietary,0.8,3.2,Normal
+mistral-small-2506,Mistral,0.26,0.71,0.0053,35.69,4.37,0.37,0.28,0.22,0.2,0.21,0.73,0.71,0.65,0.76,0.69,0.0049,0.0004,0.0041,0.0057,0.0054,0.0058,0.0056,30.64,36.02,30.83,41.96,39.02,3.3,4.47,4.52,4.87,4.67,Open source,0.1,0.3,Normal
+caller,Arcee,0.16,0.65,0.0297,25.66,4.2,0.23,0.14,0.22,0.09,0.12,0.69,0.6,0.68,0.61,0.67,0.0282,0.0015,0.0262,0.0303,0.0305,0.0331,0.0286,22.83,25.54,26.42,29.66,23.85,3.76,4.19,4.18,4.75,4.14,Open source,0.55,0.85,Normal
+nova-lite-v1,Amazon,0.16,0.55,0.0031,20.26,3.73,0.12,0.18,0.19,0.15,0.18,0.48,0.49,0.58,0.72,0.49,0.0027,0.0004,0.0026,0.0033,0.0031,0.0034,0.0029,17.53,20.61,19.67,24.28,19.2,3.31,4.13,3.57,4.04,3.62,Proprietary,0.06,0.24,Normal
+magistral-small-2506,Mistral,0.16,0.53,0.0301,17.42,5.68,0.23,0.18,0.13,0.16,0.12,0.57,0.46,0.42,0.62,0.6,0.0275,0.0026,0.0245,0.0335,0.0302,0.034,0.0281,14.53,21.36,14.65,19.67,16.87,4.74,6.28,6.14,6.06,5.19,Open source,0.5,1.5,Reasoning
+mistral-medium-2505,Mistral,0.16,0.52,0.0293,34.17,6.27,0.2,0.16,0.18,0.13,0.13,0.45,0.5,0.46,0.63,0.56,0.0256,0.0037,0.025,0.037,0.0328,0.0269,0.0251,30.07,39.7,36.76,31.84,32.49,5.61,7.75,7.08,5.68,5.23,Proprietary,0.4,2.0,Normal
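A quick way to sanity-check the new results file is to load it with pandas and rank models by agentic capability. A minimal sketch, assuming results_v2.csv sits in the repo root as added in this commit and using the column names shown above:

# Load the new results file and rank models by Avg AC (agentic capability).
import pandas as pd

df = pd.read_csv("results_v2.csv")
top = df.sort_values("Avg AC", ascending=False)[
    ["Model", "Vendor", "Avg AC", "Avg TSQ", "Avg Total Cost"]
]
print(top.head(5).to_string(index=False))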
tabs/{leaderboard.py → leaderboard_v1.py}  RENAMED

@@ -156,51 +156,50 @@ def filter_leaderboard(df, model_type, category, sort_by):


 def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
-
-
-
-
-
-    with gr.
-
-
-
-
-
-
-
-
-
-
-                    label="Category",
-                )
-            with gr.Column(scale=1):
-                sort_by = gr.Radio(
-                    choices=["Performance", "Cost"],
-                    value="Performance",
-                    label="Sort by",
-                )
-
-        # Content
-        output = gr.HTML()
-        plot1 = gr.Plot()
-        plot2 = gr.Plot()
-
-        gr.HTML(
-            """<div class="note-box">
-            <p style="margin: 0; font-size: 1em;">
-            Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation. Pricing for open source models is either from Fireworks or Together.
-            </p>
-            </div>"""
-        )
-
-        gr.HTML(METHODOLOGY)
-
-        for input_comp in [model_type, category, sort_by]:
-            input_comp.change(
-                fn=lambda m, c, s: filter_leaderboard(df, m, c, s),
-                inputs=[model_type, category, sort_by],
-                outputs=[output, plot1, plot2],
+    gr.HTML(HEADER_CONTENT + CARDS)
+    gr.HTML(DESCRIPTION_HTML)
+
+    # Filters row
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=1):
+            model_type = gr.Dropdown(
+                choices=["All"] + df["Model Type"].unique().tolist(),
+                value="All",
+                label="Model Type",
+            )
+        with gr.Column(scale=1):
+            category = gr.Dropdown(
+                choices=list(CATEGORIES.keys()),
+                value=list(CATEGORIES.keys())[0],
+                label="Category",
             )
+        with gr.Column(scale=1):
+            sort_by = gr.Radio(
+                choices=["Performance", "Cost"],
+                value="Performance",
+                label="Sort by",
+            )
+
+    # Content
+    output = gr.HTML()
+    plot1 = gr.Plot()
+    plot2 = gr.Plot()
+
+    gr.HTML(
+        """<div class="note-box">
+        <p style="margin: 0; font-size: 1em;">
+        Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation. Pricing for open source models is either from Fireworks or Together.
+        </p>
+        </div>"""
+    )
+
+    gr.HTML(METHODOLOGY)
+
+    for input_comp in [model_type, category, sort_by]:
+        input_comp.change(
+            fn=lambda m, c, s: filter_leaderboard(df, m, c, s),
+            inputs=[model_type, category, sort_by],
+            outputs=[output, plot1, plot2],
+        )

-
+    return output, plot1, plot2
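The note in the diff above mentions a "3-to-1 input/output ratio" for cost sorting but does not spell out the formula. One common reading is a weighted blend of the per-million-token prices; the helper below is an assumption for illustration only, not code from this repo:

# Hypothetical blend of input/output pricing at a 3-to-1 token ratio.
def blended_price_per_million(input_price: float, output_price: float) -> float:
    # 3 parts input tokens to 1 part output tokens
    return (3 * input_price + 1 * output_price) / 4

# Example with gpt-4.1 pricing from results_v2.csv ($2.0 input, $8.0 output per M tokens)
print(blended_price_per_million(2.0, 8.0))  # 3.5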
tabs/leaderboard_v2.py  ADDED

The diff for this file is too large to render. See raw diff.
tabs/model_comparison.py  DELETED

@@ -1,73 +0,0 @@
-import gradio as gr
-from visualization import create_radar_plot
-
-
-def compare_models(df, model_names=None):
-    if model_names is None or len(model_names) == 0:
-        model_names = [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
-
-    filtered_df = df[df["Model"].isin(model_names)]
-    radar_chart = create_radar_plot(df, model_names)
-
-    # Create styled table for model info
-    info_html = f"""
-    <div class="dark-table-container">
-        <table class="dark-styled-table">
-            <thead>
-                <tr>
-                    <th>Model</th>
-                    <th>Type</th>
-                    <th>Average</th>
-                    <th>I/O Cost</th>
-                    <th>Single Turn</th>
-                    <th>Multi Turn</th>
-                </tr>
-            </thead>
-            <tbody>
-    """
-
-    for _, row in filtered_df.iterrows():
-        info_html += f"""
-            <tr>
-                <td>{row['Model']}</td>
-                <td>{row['Model Type']}</td>
-                <td>{row['Model Avg']:.3f}</td>
-                <td>${row['IO Cost']:.2f}</td>
-                <td>{row['single turn perf']:.3f}</td>
-                <td>{row['multi turn perf']:.3f}</td>
-            </tr>
-        """
-
-    info_html += """
-            </tbody>
-        </table>
-    </div>
-    """
-
-    return info_html, radar_chart
-
-
-def create_model_comparison_tab(df, HEADER_CONTENT):
-    with gr.Tab("Model Comparison"):
-        gr.HTML(HEADER_CONTENT)
-        with gr.Column():
-            # Filters row
-            with gr.Row(equal_height=True):
-                model_selector = gr.Dropdown(
-                    choices=df["Model"].unique().tolist(),
-                    value=df.sort_values("Model Avg", ascending=False).iloc[0]["Model"],
-                    multiselect=True,
-                    label="Select Models to Compare",
-                )
-
-            # Content
-            model_info = gr.HTML()
-            radar_plot = gr.Plot()
-
-            model_selector.change(
-                fn=lambda m: compare_models(df, m),
-                inputs=[model_selector],
-                outputs=[model_info, radar_plot],
-            )
-
-        return model_info, radar_plot
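The deleted compare_models() built its comparison table by appending HTML rows from the filtered DataFrame. A self-contained sketch of that pattern is below; column names are copied from the deleted code, while the data values and the trimmed-down table markup are invented for illustration:

# Row-appending HTML-table pattern, as used by the removed compare_models().
import pandas as pd

df = pd.DataFrame(
    {
        "Model": ["model-a", "model-b"],          # toy data, not benchmark results
        "Model Type": ["Proprietary", "Open source"],
        "Model Avg": [0.71, 0.64],
    }
)

info_html = "<table><thead><tr><th>Model</th><th>Type</th><th>Average</th></tr></thead><tbody>"
for _, row in df.iterrows():
    info_html += (
        f"<tr><td>{row['Model']}</td>"
        f"<td>{row['Model Type']}</td>"
        f"<td>{row['Model Avg']:.3f}</td></tr>"
    )
info_html += "</tbody></table>"
print(info_html)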