ehagey committed · verified
Commit b5f5071 · 1 Parent(s): d166947

Create app.py

Files changed (1): app.py (+395, −0)
app.py ADDED
@@ -0,0 +1,395 @@
+ import streamlit as st
+ import pandas as pd
+ import plotly.express as px
+ import plotly.graph_objects as go
+ import json
+
+ st.set_page_config(
+     page_title="LLM Healthcare Benchmarking Budgeting",
+     page_icon="🩺",
+     layout="wide"
+ )
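+
+ # Requires streamlit, pandas, and plotly (e.g. `pip install streamlit pandas plotly`);
+ # run locally with `streamlit run app.py`.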
+
+ blue_to_gray_palette = ["#0077b6", "#4a98c9", "#7ba7c5", "#a6b5c1", "#d0d7dc"]
+
+ st.markdown("""
+ <style>
+ .main-header {
+     font-size: 2.5rem;
+     font-weight: bold;
+     margin-bottom: 1rem;
+ }
+ .section-header {
+     font-size: 1.5rem;
+     font-weight: bold;
+     margin-top: 2rem;
+     margin-bottom: 1rem;
+ }
+ .info-box {
+     background-color: #f0f2f6;
+     padding: 1rem;
+     border-radius: 0.5rem;
+     margin-bottom: 1rem;
+ }
+ .cost-highlight {
+     font-size: 1.2rem;
+     font-weight: bold;
+     color: #ff4b4b;
+ }
+ </style>
+ """, unsafe_allow_html=True)
+
+ st.markdown('<div class="main-header">LLM Healthcare Benchmarking for MedMCQA</div>', unsafe_allow_html=True)
+
+ # Default prices in USD per million tokens; editable in the sidebar
+ default_models_json = """{
+     "OpenAI gpt-4.5-preview": {"input_cost": 75, "output_cost": 150},
+     "OpenAI gpt-4o": {"input_cost": 2.5, "output_cost": 10},
+     "OpenAI gpt-4o-mini": {"input_cost": 0.15, "output_cost": 0.6},
+     "OpenAI o1": {"input_cost": 15, "output_cost": 60},
+     "OpenAI o1-mini": {"input_cost": 1.1, "output_cost": 4.4},
+     "OpenAI o3-mini": {"input_cost": 1.1, "output_cost": 4.4},
+     "Anthropic Claude 3.7 Sonnet": {"input_cost": 3, "output_cost": 15},
+     "Anthropic Claude 3.5 Haiku": {"input_cost": 0.8, "output_cost": 4},
+     "Anthropic Claude 3 Opus": {"input_cost": 15, "output_cost": 75},
+     "Anthropic Claude 3.5 Sonnet": {"input_cost": 3, "output_cost": 15},
+     "Anthropic Claude 3 Haiku": {"input_cost": 0.25, "output_cost": 1.25},
+     "TogetherAI DeepSeek-R1": {"input_cost": 3, "output_cost": 7},
+     "Llama 3.2 3B Instruct Turbo": {"input_cost": 0.06, "output_cost": 0.06},
+     "Gemini 2.0 Flash": {"input_cost": 0.1, "output_cost": 0.4},
+     "Gemini 2.0 Flash-Lite": {"input_cost": 0.075, "output_cost": 0.3},
+     "Gemini 1.5 Pro": {"input_cost": 1.25, "output_cost": 5},
+     "Gemini Pro": {"input_cost": 0.5, "output_cost": 1.5}
+ }"""
+
+ # Sidebar JSON editor for model pricing
+ st.sidebar.markdown('<div class="section-header">LLM Models Configuration</div>', unsafe_allow_html=True)
+ st.sidebar.markdown("Edit the JSON below to modify existing models or add new ones:")
+
+ models_json = st.sidebar.text_area("Models JSON", default_models_json, height=400)
+
+ # Parse the JSON input, falling back to the defaults if it is invalid
+ try:
+     llm_models = json.loads(models_json)
+ except json.JSONDecodeError as e:
+     st.sidebar.error(f"Invalid JSON: {e}")
+     llm_models = json.loads(default_models_json)
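+
+ # Provider pricing changes frequently, so treat the defaults above as
+ # point-in-time estimates rather than quotes.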
+
+ # MedMCQA dataset metadata (train split)
+ medmcqa_splits = {
+     "Single-Select Questions": {
+         "questions": 120765,
+         "avg_q_tokens": 12.77,  # train-split average question length; shown for reference, not used in the cost math
+         "description": "Single-select questions from the MedMCQA train dataset"
+     }
+ }
+
+ col1, col2 = st.columns([2, 1])
+
+ with col1:
+     st.markdown('<div class="section-header">Select LLM Models</div>', unsafe_allow_html=True)
+
+     selected_models = st.multiselect(
+         "Choose one or more LLM models:",
+         options=list(llm_models.keys()),
+         default=list(llm_models.keys())[:2]
+     )
+
+     with st.expander("View Model Details"):
+         models_df = pd.DataFrame([
+             {
+                 "Model": model,
+                 "Input Cost (per 1M tokens)": f"${llm_models[model]['input_cost']:.2f}",
+                 "Output Cost (per 1M tokens)": f"${llm_models[model]['output_cost']:.2f}"
+             }
+             for model in llm_models
+         ])
+         st.dataframe(models_df, use_container_width=True)
+
+ with col2:
+     st.markdown('<div class="section-header">MedMCQA Dataset</div>', unsafe_allow_html=True)
+
+     st.markdown(f"""
+     **Single-Select Questions:** {medmcqa_splits['Single-Select Questions']['questions']:,}
+
+     **Average Question Tokens:** {medmcqa_splits['Single-Select Questions']['avg_q_tokens']}
+
+     **Description:** {medmcqa_splits['Single-Select Questions']['description']}
+     """)
+
+ st.markdown('<div class="section-header">Cost Simulation Parameters</div>', unsafe_allow_html=True)
+
+ col1, col2 = st.columns(2)
+
+ with col1:
+     prompt_tokens = st.number_input(
+         "Number of Prompt Tokens per Question",
+         min_value=1,
+         max_value=1000,
+         value=200,
+         step=10,
+         help="Number of tokens in each prompt (including the question and any additional instructions)"
+     )
+
+ with col2:
+     output_tokens = st.number_input(
+         "Average Output Tokens per Question",
+         min_value=1,
+         max_value=1000,
+         value=100,
+         step=10,
+         help="Average number of tokens in the model's response"
+     )
+
+ col1, col2, col3 = st.columns(3)
+
+ with col1:
+     num_runs = st.number_input(
+         "Number of Evaluation Runs",
+         min_value=1,
+         max_value=1000,
+         value=1,
+         step=1,
+         help="How many times each dataset will be processed by each model"
+     )
+
+ with col2:
+     st.write("")  # spacer column
+
+ with col3:
+     sampling_percentage = st.slider(
+         "Dataset Sampling Percentage",
+         min_value=1,
+         max_value=100,
+         value=100,
+         step=1,
+         help="Percentage of questions to process from each split"
+     )
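+
+ # The estimate is linear in each knob above: doubling the runs, token counts,
+ # or sampling percentage doubles the corresponding cost term.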
+
+ def calculate_costs(models, prompt_token_count, output_token_count, runs, sampling_pct):
+     """Estimate per-model benchmarking costs for the sampled MedMCQA questions,
+     using the module-level llm_models prices and medmcqa_splits metadata."""
+     results = []
+
+     total_questions = medmcqa_splits["Single-Select Questions"]["questions"]
+     num_questions = int(total_questions * (sampling_pct / 100))
+
+     for model in models:
+         model_input_cost = llm_models[model]["input_cost"]
+         model_output_cost = llm_models[model]["output_cost"]
+
+         total_input_tokens = num_questions * prompt_token_count * runs
+         total_output_tokens = num_questions * output_token_count * runs
+
+         # Prices are quoted per million tokens
+         input_cost = (total_input_tokens / 1_000_000) * model_input_cost
+         output_cost = (total_output_tokens / 1_000_000) * model_output_cost
+         total_cost = input_cost + output_cost
+
+         results.append({
+             "Model": model,
+             "Questions": num_questions,
+             "Number of Prompt Tokens per Question": prompt_token_count,
+             "Number of Output Tokens per Question": output_token_count,
+             "Total Input Tokens": total_input_tokens,
+             "Total Output Tokens": total_output_tokens,
+             "Input Cost": input_cost,
+             "Output Cost": output_cost,
+             "Total Cost": total_cost,
+             "Split": "Single-Select Questions"
+         })
+
+     cost_df = pd.DataFrame(results)
+
+     model_summary = cost_df.groupby("Model").agg({
+         "Input Cost": "sum",
+         "Output Cost": "sum",
+         "Total Cost": "sum"
+     }).reset_index()
+
+     split_summary = cost_df.groupby("Split").agg({
+         "Questions": "sum",
+         "Total Input Tokens": "sum",
+         "Total Output Tokens": "sum",
+         "Total Cost": "sum"
+     }).reset_index()
+
+     return cost_df, model_summary, split_summary
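+
+ # Worked example with the defaults (120,765 questions, 200 prompt / 100 output
+ # tokens, 1 run, 100% sampling) and the gpt-4o prices above ($2.50 / $10.00 per 1M):
+ #   input  cost = 120,765 × 200 ÷ 1e6 × $2.50  ≈ $60.38
+ #   output cost = 120,765 × 100 ÷ 1e6 × $10.00 ≈ $120.77
+ #   total       ≈ $181.15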
+
+ if selected_models:
+     detailed_costs, model_summary, split_summary = calculate_costs(
+         selected_models,
+         prompt_tokens,
+         output_tokens,
+         num_runs,
+         sampling_percentage
+     )
+
+     total_cost = detailed_costs["Total Cost"].sum()
+     total_questions = detailed_costs["Questions"].iloc[0]  # same question count for every model
+     total_input_tokens = detailed_costs["Total Input Tokens"].sum()
+     total_output_tokens = detailed_costs["Total Output Tokens"].sum()
+
+     st.markdown('<div class="section-header">Cost Calculation Breakdown</div>', unsafe_allow_html=True)
+
+     with st.expander("View Detailed Cost Calculation Formula", expanded=False):
+         st.markdown("""
+         ### Cost Calculation Formula
+
+         For each model, the cost is calculated as:
+
+         ```
+         Input Cost = (Number of Questions × Prompt Tokens per Question × Number of Runs ÷ 1,000,000) × Input Cost per Million Tokens
+         Output Cost = (Number of Questions × Output Tokens per Question × Number of Runs ÷ 1,000,000) × Output Cost per Million Tokens
+         Total Cost = Input Cost + Output Cost
+         ```
+         """)
+
+     for model in selected_models:
+         model_data = detailed_costs[detailed_costs["Model"] == model].iloc[0]
+         model_input_cost = llm_models[model]["input_cost"]
+         model_output_cost = llm_models[model]["output_cost"]
+         model_input_cost_total = model_data["Input Cost"]
+         model_output_cost_total = model_data["Output Cost"]
+         model_total_cost = model_data["Total Cost"]
+
+         st.markdown(f"""
+         #### {model}:
+
+         **Input Cost Calculation:**
+         ({total_questions:,} questions × {prompt_tokens} tokens × {num_runs} runs ÷ 1,000,000) × ${model_input_cost:.2f} = ${model_input_cost_total:.2f}
+
+         **Output Cost Calculation:**
+         ({total_questions:,} questions × {output_tokens} tokens × {num_runs} runs ÷ 1,000,000) × ${model_output_cost:.2f} = ${model_output_cost_total:.2f}
+
+         **Total Cost for {model}:** ${model_total_cost:.2f}
+         """)
+
+     st.markdown(f"""
+     <div class="info-box">
+     <div class="section-header">Total Estimated Cost</div>
+     <div class="cost-highlight">${total_cost:.2f}</div>
+     <p>For processing {total_questions:,} questions ({sampling_percentage}% of the dataset)
+     with {len(selected_models)} models, {num_runs} time{'s' if num_runs > 1 else ''} each.</p>
+     <p>Using {prompt_tokens} prompt tokens and {output_tokens} output tokens per question.</p>
+     <p>Total tokens processed: {total_input_tokens:,} input tokens + {total_output_tokens:,} output tokens = {total_input_tokens + total_output_tokens:,} total tokens</p>
+     </div>
+     """, unsafe_allow_html=True)
+
+     tab1, tab2 = st.tabs(["Cost Breakdown", "Detailed Costs"])
+
+     with tab1:
+         col1, col2 = st.columns(2)
+
+         with col1:
+             cost_types = ["Input Cost", "Output Cost"]
+
+             fig1 = px.bar(
+                 model_summary,
+                 x="Model",
+                 y=cost_types,
+                 title="Cost Breakdown by Model",
+                 labels={"value": "Cost ($)", "variable": "Cost Type"},
+                 color_discrete_sequence=blue_to_gray_palette,
+             )
+             fig1.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1))
+             st.plotly_chart(fig1, use_container_width=True)
+
+         with col2:
+             fig2 = go.Figure(data=[
+                 go.Pie(
+                     labels=model_summary["Model"],
+                     values=model_summary["Total Cost"],
+                     hole=0.4,
+                     textinfo="label+percent",
+                     marker_colors=blue_to_gray_palette,
+                 )
+             ])
+             fig2.update_layout(title_text="Proportion of Total Cost by Model")
+             st.plotly_chart(fig2, use_container_width=True)
+
+         # Heatmap only renders when more than one dataset split is configured;
+         # the app currently defines a single split, so this is future-proofing.
+         if "Split" in detailed_costs.columns and len(detailed_costs["Split"].unique()) > 1:
+             pivot_df = detailed_costs.pivot(index="Split", columns="Model", values="Total Cost")
+             fig4 = px.imshow(
+                 pivot_df,
+                 labels=dict(x="Model", y="Split", color="Cost ($)"),
+                 x=pivot_df.columns,
+                 y=pivot_df.index,
+                 color_continuous_scale=blue_to_gray_palette,
+                 title="Cost Heatmap (Model vs Split)",
+                 text_auto='.2f',
+             )
+             fig4.update_layout(height=400)
+             st.plotly_chart(fig4, use_container_width=True)
+
+     with tab2:
+         display_cols = [
+             "Model", "Questions",
+             "Number of Prompt Tokens per Question", "Number of Output Tokens per Question",
+             "Total Input Tokens", "Total Output Tokens",
+             "Input Cost", "Output Cost", "Total Cost"
+         ]
+
+         formatted_df = detailed_costs[display_cols].copy()
+
+         # Format currency columns
+         for col in ["Input Cost", "Output Cost", "Total Cost"]:
+             if col in formatted_df.columns:
+                 formatted_df[col] = formatted_df[col].map("${:.2f}".format)
+
+         # Format count columns with thousands separators
+         for col in ["Questions", "Total Input Tokens", "Total Output Tokens"]:
+             if col in formatted_df.columns:
+                 formatted_df[col] = formatted_df[col].map("{:,}".format)
+
+         st.dataframe(formatted_df, use_container_width=True)
+
+     st.markdown('<div class="section-header">Export Results</div>', unsafe_allow_html=True)
+
+     col1, col2 = st.columns(2)
+
+     with col1:
+         csv = detailed_costs.to_csv(index=False)
+         st.download_button(
+             label="Download Full Results (CSV)",
+             data=csv,
+             file_name="medmcqa_llm_cost_analysis.csv",
+             mime="text/csv",
+         )
+
+     with col2:
+         export_json = {
+             "parameters": {
+                 "models": selected_models,
+                 "dataset": "MedMCQA Single-Select Questions",
+                 "total_questions": medmcqa_splits["Single-Select Questions"]["questions"],
+                 "prompt_tokens": prompt_tokens,
+                 "output_tokens": output_tokens,
+                 "sampling_percentage": sampling_percentage,
+                 "num_runs": num_runs
+             },
+             "results": {
+                 "total_cost": float(total_cost),
+                 "detailed_costs": detailed_costs.to_dict(orient="records"),
+                 "model_summary": model_summary.to_dict(orient="records")
+             }
+         }
+
+         st.download_button(
+             label="Download Full Results (JSON)",
+             data=json.dumps(export_json, indent=4),
+             file_name="medmcqa_llm_cost_analysis.json",
+             mime="application/json",
+         )
+
+ else:
+     st.info("Please select at least one model to calculate costs.")