Push main #1 by kaikaidai - opened
- app.py +458 -666
- common.py +39 -14
- data/models.jsonl +16 -27
- gen_api_answer.py +53 -409
- leaderboard.py +0 -116
- prompts.py +0 -210
- random_sample_generation.py +0 -183
- requirements.txt +0 -2
app.py
CHANGED
@@ -2,57 +2,36 @@ import json
 import re
 import random
 from collections import defaultdict
-from datetime import datetime
 import hashlib
-import gradio as gr

 from dotenv import load_dotenv
-load_dotenv()

-
-get_model_response,
-parse_model_response,
-prometheus_parse_model_response,
-atla_parse_model_response,
-flow_judge_parse_model_response
-)

-
-
-get_random_human_ai_ground_truth_pair,
-generate_ai_response
-)
 from db import add_vote, create_db_connection, get_votes
-
 from utils import Vote
-
 from common import (
 POLICY_CONTENT,
 ACKNOWLEDGEMENTS,
 CSS_STYLES,
 MAIN_TITLE,
 HOW_IT_WORKS,
 )
-from
-DEFAULT_EVAL_PROMPT,
-DEFAULT_EVAL_PROMPT_EDITABLE,
-FIXED_EVAL_SUFFIX,
-DEFAULT_EVAL_CRITERIA,
-DEFAULT_SCORE_1,
-DEFAULT_SCORE_2,
-DEFAULT_SCORE_3,
-DEFAULT_SCORE_4,
-DEFAULT_SCORE_5,
-)
-from leaderboard import (
-get_leaderboard,
-get_leaderboard_stats,
-get_model_rankings,
-DEFAULT_ELO,
-K_FACTOR
-)


 elo_scores = defaultdict(lambda: DEFAULT_ELO)
 vote_counts = defaultdict(int)

@@ -73,7 +52,6 @@ def load_model_data():
 "organization": model["organization"],
 "license": model["license"],
 "api_model": model["api_model"],
-"active": model["active"]
 }
 except FileNotFoundError:
 print("Warning: models.jsonl not found")
@@ -84,11 +62,9 @@ def load_model_data():
 model_data = load_model_data()

 def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
-prompt_value = prompt.value if hasattr(prompt, 'value') else prompt
-
 vote = Vote(
 timestamp=datetime.now().isoformat(),
-prompt=
 response_a=response_a,
 response_b=response_b,
 model_a=model_a,
@@ -117,6 +93,40 @@ def get_final_prompt(eval_prompt, variable_values):
 return eval_prompt



 def get_ip(request: gr.Request) -> str:
 """Get and hash the IP address from the request."""
@@ -133,26 +143,6 @@ def get_ip(request: gr.Request) -> str:
 return hashlib.sha256(ip.encode()).hexdigest()[:16]


-def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
-"""Generate appropriate message based on vote and model rankings.
-Returns (title, message) tuple."""
-# Get current rankings
-voting_data = get_current_votes()
-leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
-rankings = get_model_rankings(leaderboard)
-pos_a = rankings.get(model_a, 0)
-pos_b = rankings.get(model_b, 0)
-
-if choice == "Tie":
-return "It's a tie!", "Keep voting responsibly 🤗"
-
-# Check if vote aligns with leaderboard
-if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
-return "The favourite wins!", "Keep voting responsibly 🤗"
-else:
-return "The underdog wins!", "Keep voting responsibly 🤗"
-
-
 def vote(
 choice,
 model_a,
@@ -202,39 +192,16 @@ def vote(
 store_vote_data(
 final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
 )
-
-#
-voting_data = get_current_votes()
-leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
-rankings = get_model_rankings(leaderboard)
-pos_a = rankings.get(model_a, 0)
-pos_b = rankings.get(model_b, 0)
-
-# Format model names with positions and win/loss indicators
-if choice == "Tie":
-model_a_display = f"*Model: {model_a} (Position #{pos_a})*"
-model_b_display = f"*Model: {model_b} (Position #{pos_b})*"
-else:
-winner = model_a if choice == "A" else model_b
-loser = model_b if choice == "A" else model_a
-winner_pos = pos_a if choice == "A" else pos_b
-loser_pos = pos_b if choice == "A" else pos_a
-
-model_a_display = f"*Model: {model_a} {'✅' if choice == 'A' else '❌'} (Position #{pos_a})*"
-model_b_display = f"*Model: {model_b} {'✅' if choice == 'B' else '❌'} (Position #{pos_b})*"
-
-# Generate vote message
-title, message = get_vote_message(choice, model_a, model_b)
-
 return [
-gr.update(
-gr.update(
-gr.update(
-gr.update(value=
-gr.update(value=
-gr.update(interactive=True, value="
-gr.update(
-gr.Info(message, title=title), # success message
 ]


@@ -243,24 +210,150 @@ def get_current_votes():
 return get_votes(db)


-
-
-
 voting_data = get_current_votes()
-
-
-
-
-
-
-
-
-
-
-
-
-
-


 # Update the leaderboard table definition in the UI
@@ -270,30 +363,63 @@ leaderboard_table = gr.Dataframe(
 )


-def
-"""
-
-
-
-
-
-
-
-
 return [
 gr.update(value=human_msg),
-gr.update(value=ai_msg)
-gr.update(value="🎲", variant="secondary"), # Reset random button appearance
-gr.update(value=""), # Clear score A
-gr.update(value=""), # Clear critique A
-gr.update(value=""), # Clear score B
-gr.update(value=""), # Clear critique B
-gr.update(interactive=False, variant="primary"), # Reset vote A
-gr.update(interactive=False, variant="primary"), # Reset vote B
-gr.update(interactive=False, variant="primary"), # Reset vote tie
-gr.update(value="*Model: Hidden*"), # Reset model name A
-gr.update(value="*Model: Hidden*"), # Reset model name B
-gr.update(value=ground_truth_msg, visible=compatible_mode), # Set ground truth and visibility
 ]


@@ -309,43 +435,27 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:

 with gr.Tabs():
 with gr.TabItem("Judge Arena"):
 with gr.Row():
 # Left side - Input section
 with gr.Column(scale=1):
 with gr.Group():
 human_input = gr.TextArea(
-label="👩
-lines=
 placeholder="Enter the human message here..."
 )
-with gr.Row():
-generate_btn = gr.Button(
-"Generate AI Response",
-size="sm",
-interactive=False
-)

 ai_response = gr.TextArea(
 label="🤖 AI Response",
-lines=15,
-placeholder="Enter the AI response here..."
-)
-
-# Ground truth response (initially hidden)
-ground_truth = gr.TextArea(
-label="🎯 Ground truth response",
 lines=12,
-placeholder="Enter the
-visible=False
 )

-with gr.Row():
-random_btn = gr.Button("🎲", scale=2)
 send_btn = gr.Button(
-value="Run
 variant="primary",
-size="lg"
-scale=8
 )

 # Right side - Model outputs
@@ -355,15 +465,18 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
 model_name_a = gr.Markdown("*Model: Hidden*")
 with gr.Row():
 with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
-score_a = gr.Textbox(label="Score", lines=
-vote_a = gr.Button("Vote A", variant="primary",
 with gr.Column(scale=9, min_width=400): # Wider width for critique
-critique_a = gr.TextArea(label="Critique", lines=

 # Tie button row
-with gr.Row() as tie_button_row:
 with gr.Column():
-vote_tie = gr.Button("Tie", variant="


 gr.Markdown("### 🧑⚖️ Judge B")
@@ -371,90 +484,16 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
 model_name_b = gr.Markdown("*Model: Hidden*")
 with gr.Row():
 with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
-score_b = gr.Textbox(label="Score", lines=
-vote_b = gr.Button("Vote B", variant="primary",
 with gr.Column(scale=9, min_width=400): # Wider width for critique
-critique_b = gr.TextArea(label="Critique", lines=
-

 gr.Markdown("<br>")
-
-
-# Replace the "Edit Judge Prompt" Accordion section with:
-with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
-gr.Markdown("<br>")
-use_reference_toggle = gr.Checkbox(
-label="Use a reference response",
-value=False
-)
-
-# Hide the default prompt editor
-with gr.Column(visible=False) as default_prompt_editor:
-eval_prompt_editable = gr.TextArea(
-value=DEFAULT_EVAL_PROMPT_EDITABLE,
-label="Evaluation Criteria",
-lines=12
-)

-
-
-save_prompt_btn = gr.Button("Save", variant="primary")
-gr.Markdown("*The sample being evaluated is always appended as:*")
-gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")
-
-# Show the compatible mode editor
-with gr.Column(visible=True) as compatible_prompt_editor:
-with gr.Row():
-# Left column - Evaluation Criteria
-with gr.Column(scale=1):
-eval_criteria_text = gr.TextArea(
-label="Evaluation Criteria",
-lines=12,
-value=DEFAULT_EVAL_CRITERIA,
-placeholder="Enter the evaluation criteria..."
-)
-prometheus_reference = gr.Markdown(
-"<br> *By default, we use the Prometheus absolute grading prompt template - see [here](https://huggingface.co/prometheus-eval/prometheus-7b-v2.0).*",
-visible=True
-)
-
-# Right column - Score Descriptions
-with gr.Column(scale=1):
-score1_description = gr.TextArea(
-label="Score 1",
-value=DEFAULT_SCORE_1,
-placeholder="Description for score 1",
-lines=2
-)
-score2_description = gr.TextArea(
-label="Score 2",
-value=DEFAULT_SCORE_2,
-placeholder="Description for score 2",
-lines=2
-)
-score3_description = gr.TextArea(
-label="Score 3",
-value=DEFAULT_SCORE_3,
-placeholder="Description for score 3",
-lines=2
-)
-score4_description = gr.TextArea(
-label="Score 4",
-value=DEFAULT_SCORE_4,
-placeholder="Description for score 4",
-lines=2
-)
-score5_description = gr.TextArea(
-label="Score 5",
-value=DEFAULT_SCORE_5,
-placeholder="Description for score 5",
-lines=2
-)
-
-# Add save/cancel buttons for compatible mode
-with gr.Row(visible=False) as compatible_edit_buttons_row:
-compatible_cancel_btn = gr.Button("Cancel")
-compatible_save_btn = gr.Button("Save", variant="primary")

 with gr.TabItem("Leaderboard"):
 with gr.Row():
@@ -462,7 +501,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
 show_preliminary = gr.Checkbox(
 label="Reveal preliminary results",
 value=True, # Checked by default
-info="Show all models, including models with less human ratings (<
 interactive=True
 )
 stats_display = gr.Markdown()
@@ -470,13 +509,24 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
 headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
 datatype=["str", "number", "str", "number", "str", "str", "str"],
 )
-
-gr.Markdown("""<br>
-<br>
-Judge Arena uses Together AI for inference of open-source models. FP8 models are named as -- "Turbo" where the performance of the FP16 reference models is closely matched:

-
-

 # Add change handler for checkbox
 show_preliminary.change(
@@ -494,15 +544,67 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:

 with gr.TabItem("Policy"):
 gr.Markdown(POLICY_CONTENT)
-gr.Markdown(ACKNOWLEDGEMENTS)

 # Define state variables for model tracking
 model_a_state = gr.State()
 model_b_state = gr.State()
 final_prompt_state = gr.State()
-
-
-

 # Update model names after responses are generated
 def update_model_names(model_a, model_b):
@@ -517,7 +619,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
 vote_a.click(
 fn=vote,
 inputs=[
-gr.State("A"),
 model_a_state,
 model_b_state,
 final_prompt_state,
@@ -529,19 +631,18 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
 outputs=[
 vote_a,
 vote_b,
-
 model_name_a,
 model_name_b,
 send_btn,
-
-gr.State(), # placeholder for success message
 ],
 )

 vote_b.click(
 fn=vote,
 inputs=[
-gr.State("B"),
 model_a_state,
 model_b_state,
 final_prompt_state,
@@ -553,19 +654,18 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
 outputs=[
 vote_a,
 vote_b,
-
 model_name_a,
 model_name_b,
 send_btn,
-
-gr.State(), # placeholder for success message
 ],
 )

 vote_tie.click(
 fn=vote,
 inputs=[
-gr.State("Tie"),
 model_a_state,
 model_b_state,
 final_prompt_state,
@@ -577,250 +677,66 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
577 |
outputs=[
|
578 |
vote_a,
|
579 |
vote_b,
|
580 |
-
|
581 |
model_name_a,
|
582 |
model_name_b,
|
583 |
send_btn,
|
584 |
-
|
585 |
-
gr.State(), # placeholder for success message
|
586 |
],
|
587 |
)
|
588 |
|
589 |
-
#
|
590 |
-
def
|
591 |
-
|
592 |
-
|
593 |
-
|
594 |
-
|
595 |
-
|
596 |
-
|
597 |
-
|
598 |
-
|
599 |
-
|
600 |
-
|
601 |
-
|
602 |
-
|
|
|
603 |
|
604 |
-
|
605 |
-
|
606 |
-
|
607 |
|
608 |
-
|
609 |
-
|
610 |
-
|
611 |
-
inputs=[eval_prompt_editable, eval_prompt_previous],
|
612 |
-
outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
|
613 |
-
)
|
614 |
|
615 |
-
|
616 |
-
|
617 |
-
inputs=[eval_prompt_previous],
|
618 |
-
outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
|
619 |
-
)
|
620 |
-
|
621 |
-
eval_prompt_editable.change(
|
622 |
-
fn=show_edit_buttons,
|
623 |
-
inputs=[eval_prompt_editable, eval_prompt_previous],
|
624 |
-
outputs=edit_buttons_row
|
625 |
-
)
|
626 |
|
627 |
-
|
628 |
-
def toggle_use_reference(checked):
|
629 |
-
if checked:
|
630 |
-
# Get new random samples with ground truth when enabling reference mode
|
631 |
-
human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
|
632 |
-
return {
|
633 |
-
ground_truth: gr.update(visible=True, value=ground_truth_msg),
|
634 |
-
human_input: gr.update(value=human_msg),
|
635 |
-
ai_response: gr.update(value=ai_msg),
|
636 |
-
# Reset other UI elements
|
637 |
-
score_a: gr.update(value=""),
|
638 |
-
critique_a: gr.update(value=""),
|
639 |
-
score_b: gr.update(value=""),
|
640 |
-
critique_b: gr.update(value=""),
|
641 |
-
vote_a: gr.update(interactive=False, variant="primary"),
|
642 |
-
vote_b: gr.update(interactive=False, variant="primary"),
|
643 |
-
vote_tie: gr.update(interactive=False, variant="primary"),
|
644 |
-
model_name_a: gr.update(value="*Model: Hidden*"),
|
645 |
-
model_name_b: gr.update(value="*Model: Hidden*"),
|
646 |
-
random_btn: gr.update(value="🎲", variant="secondary"),
|
647 |
-
}
|
648 |
-
else:
|
649 |
-
# Just hide ground truth when disabling reference mode
|
650 |
-
return {
|
651 |
-
ground_truth: gr.update(visible=False)
|
652 |
-
}
|
653 |
-
|
654 |
-
# Update the change handler to include all necessary outputs
|
655 |
-
use_reference_toggle.change(
|
656 |
-
fn=toggle_use_reference,
|
657 |
-
inputs=[use_reference_toggle],
|
658 |
-
outputs=[
|
659 |
-
ground_truth,
|
660 |
-
human_input,
|
661 |
-
ai_response,
|
662 |
score_a,
|
663 |
critique_a,
|
664 |
score_b,
|
665 |
critique_b,
|
666 |
-
vote_a
|
667 |
-
vote_b
|
668 |
-
|
669 |
-
model_name_a,
|
670 |
-
model_name_b,
|
671 |
-
random_btn,
|
672 |
-
]
|
673 |
-
)
|
674 |
-
|
675 |
-
# Add a new state variable to track first game
|
676 |
-
first_game_state = gr.State(True) # Initialize as True
|
677 |
-
|
678 |
-
# Update the submit function to use the state variable
|
679 |
-
def submit_and_store(
|
680 |
-
use_reference,
|
681 |
-
eval_criteria_text_input,
|
682 |
-
human_input,
|
683 |
-
ai_response,
|
684 |
-
ground_truth_input,
|
685 |
-
score1_description,
|
686 |
-
score2_description,
|
687 |
-
score3_description,
|
688 |
-
score4_description,
|
689 |
-
score5_description,
|
690 |
-
):
|
691 |
-
# Build prompt data dictionary
|
692 |
-
prompt_data = {
|
693 |
-
'human_input': human_input,
|
694 |
-
'ai_response': ai_response,
|
695 |
-
'ground_truth_input': ground_truth_input,
|
696 |
-
'eval_criteria': eval_criteria_text_input,
|
697 |
-
'score1_desc': score1_description,
|
698 |
-
'score2_desc': score2_description,
|
699 |
-
'score3_desc': score3_description,
|
700 |
-
'score4_desc': score4_description,
|
701 |
-
'score5_desc': score5_description,
|
702 |
-
}
|
703 |
-
|
704 |
-
# Get list of active models only for matches
|
705 |
-
active_models = [name for name, info in model_data.items()
|
706 |
-
if info.get("active", True) is True] # Explicitly check for True
|
707 |
-
|
708 |
-
# Define new models list
|
709 |
-
new_models = ["Atla Selene 1 Mini", "SFR-LLaMA-3.1-70B-Judge"]
|
710 |
-
|
711 |
-
# New models appear 40% of the time
|
712 |
-
if random.random() < 0.4:
|
713 |
-
# Randomly choose between new models
|
714 |
-
new_model = random.choice(new_models)
|
715 |
-
other_models = [m for m in active_models if m not in new_models]
|
716 |
-
other_model = random.choice(other_models)
|
717 |
-
|
718 |
-
if random.random() < 0.5:
|
719 |
-
model_a, model_b = new_model, other_model
|
720 |
-
else:
|
721 |
-
model_a, model_b = other_model, new_model
|
722 |
-
else:
|
723 |
-
# For other cases, exclude new models
|
724 |
-
non_special_models = [m for m in active_models if m not in new_models]
|
725 |
-
model1, model2 = random.sample(non_special_models, 2)
|
726 |
-
model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
|
727 |
-
|
728 |
-
# Get responses from models
|
729 |
-
response_a = get_model_response(
|
730 |
model_a,
|
731 |
-
model_data.get(model_a),
|
732 |
-
prompt_data,
|
733 |
-
use_reference=use_reference
|
734 |
-
)
|
735 |
-
response_b = get_model_response(
|
736 |
model_b,
|
737 |
-
|
738 |
-
prompt_data,
|
739 |
-
use_reference=use_reference
|
740 |
-
)
|
741 |
-
|
742 |
-
|
743 |
-
is_prometheus_a = model_data.get(model_a, {}).get('organization') == 'Prometheus'
|
744 |
-
is_prometheus_b = model_data.get(model_b, {}).get('organization') == 'Prometheus'
|
745 |
-
is_atla_a = model_data.get(model_a, {}).get('organization') == 'Atla'
|
746 |
-
is_atla_b = model_data.get(model_b, {}).get('organization') == 'Atla'
|
747 |
-
is_flow_judge_a = model_data.get(model_a, {}).get('organization') == 'Flow AI'
|
748 |
-
is_flow_judge_b = model_data.get(model_b, {}).get('organization') == 'Flow AI'
|
749 |
-
is_salesforce_a = model_data.get(model_a, {}).get('organization') == 'Salesforce'
|
750 |
-
is_salesforce_b = model_data.get(model_b, {}).get('organization') == 'Salesforce'
|
751 |
-
|
752 |
-
# Parse the responses based on model, using appropriate parsing for different models
|
753 |
-
if is_prometheus_a:
|
754 |
-
score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
|
755 |
-
score_a_val = f"{score_a_val} / 5"
|
756 |
-
elif is_atla_a or is_salesforce_a: # Same parser for Atla and Salesforce
|
757 |
-
score_a_val, critique_a_val = atla_parse_model_response(response_a)
|
758 |
-
score_a_val = f"{score_a_val} / 5"
|
759 |
-
elif is_flow_judge_a:
|
760 |
-
score_a_val, critique_a_val = flow_judge_parse_model_response(response_a)
|
761 |
-
score_a_val = f"{score_a_val} / 5"
|
762 |
-
else:
|
763 |
-
score_a_val, critique_a_val = parse_model_response(response_a)
|
764 |
-
score_a_val = f"{score_a_val} / 5"
|
765 |
-
|
766 |
-
if is_prometheus_b:
|
767 |
-
score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
|
768 |
-
score_b_val = f"{score_b_val} / 5"
|
769 |
-
elif is_atla_b or is_salesforce_b: # Same parser for Atla and Salesforce
|
770 |
-
score_b_val, critique_b_val = atla_parse_model_response(response_b)
|
771 |
-
score_b_val = f"{score_b_val} / 5"
|
772 |
-
elif is_flow_judge_b:
|
773 |
-
score_b_val, critique_b_val = flow_judge_parse_model_response(response_b)
|
774 |
-
score_b_val = f"{score_b_val} / 5"
|
775 |
-
else:
|
776 |
-
score_b_val, critique_b_val = parse_model_response(response_b)
|
777 |
-
score_b_val = f"{score_b_val} / 5"
|
778 |
-
|
779 |
-
return (
|
780 |
-
score_a_val,
|
781 |
-
critique_a_val,
|
782 |
-
score_b_val,
|
783 |
-
critique_b_val,
|
784 |
-
gr.update(interactive=True, variant="primary"), # vote_a
|
785 |
-
gr.update(interactive=True, variant="primary"), # vote_b
|
786 |
-
gr.update(interactive=True, variant="primary"), # vote_tie
|
787 |
-
model_a,
|
788 |
-
model_b,
|
789 |
-
eval_prompt,
|
790 |
gr.update(value="*Model: Hidden*"),
|
791 |
gr.update(value="*Model: Hidden*"),
|
792 |
-
|
793 |
-
gr.update(
|
794 |
-
|
|
|
|
|
|
|
|
|
795 |
)
|
796 |
|
797 |
-
# Update the click handler to use False for is_first_game after first submission
|
798 |
-
def create_submit_handler():
|
799 |
-
first_game = True
|
800 |
-
|
801 |
-
def handler(*args):
|
802 |
-
nonlocal first_game
|
803 |
-
result = submit_and_store(*args)
|
804 |
-
first_game = False # Set to False after first submission
|
805 |
-
return result
|
806 |
-
|
807 |
-
return handler
|
808 |
-
|
809 |
-
# Update the send_btn click handler
|
810 |
send_btn.click(
|
811 |
fn=submit_and_store,
|
812 |
-
inputs=[
|
813 |
-
use_reference_toggle,
|
814 |
-
eval_criteria_text,
|
815 |
-
human_input,
|
816 |
-
ai_response,
|
817 |
-
ground_truth,
|
818 |
-
score1_description,
|
819 |
-
score2_description,
|
820 |
-
score3_description,
|
821 |
-
score4_description,
|
822 |
-
score5_description,
|
823 |
-
],
|
824 |
outputs=[
|
825 |
score_a,
|
826 |
critique_a,
|
@@ -828,225 +744,101 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
828 |
critique_b,
|
829 |
vote_a,
|
830 |
vote_b,
|
831 |
-
|
832 |
model_a_state,
|
833 |
model_b_state,
|
834 |
final_prompt_state,
|
835 |
model_name_a,
|
836 |
model_name_b,
|
837 |
send_btn,
|
838 |
-
|
839 |
],
|
840 |
)
|
841 |
|
|
|
|
|
|
|
|
842 |
# Add random button handler
|
843 |
random_btn.click(
|
844 |
fn=populate_random_example,
|
845 |
-
inputs=[
|
846 |
-
outputs=[
|
847 |
-
human_input,
|
848 |
-
ai_response,
|
849 |
-
random_btn,
|
850 |
-
score_a,
|
851 |
-
critique_a,
|
852 |
-
score_b,
|
853 |
-
critique_b,
|
854 |
-
vote_a,
|
855 |
-
vote_b,
|
856 |
-
vote_tie,
|
857 |
-
model_name_a,
|
858 |
-
model_name_b,
|
859 |
-
ground_truth, # Set ground truth
|
860 |
-
]
|
861 |
)
|
862 |
|
863 |
# Add new input change handlers
|
864 |
def handle_input_change():
|
865 |
-
"
|
866 |
-
return [
|
867 |
-
gr.update(interactive=False), # vote_a
|
868 |
-
gr.update(interactive=False), # vote_b
|
869 |
-
gr.update(interactive=False), # vote_tie
|
870 |
-
gr.update(value="Run judges", variant="primary"), # send_btn
|
871 |
-
gr.update(value="🎲", variant="secondary"), # random_btn
|
872 |
-
]
|
873 |
|
874 |
# Update the change handlers for inputs
|
875 |
human_input.change(
|
876 |
fn=handle_input_change,
|
877 |
inputs=[],
|
878 |
-
outputs=[
|
879 |
)
|
880 |
|
881 |
ai_response.change(
|
882 |
fn=handle_input_change,
|
883 |
inputs=[],
|
884 |
-
outputs=[
|
885 |
-
)
|
886 |
-
|
887 |
-
generate_btn.click(
|
888 |
-
fn=lambda msg: (
|
889 |
-
generate_ai_response(msg)[0], # Only take the response text
|
890 |
-
gr.update(
|
891 |
-
value="Generate AI Response", # Keep the label
|
892 |
-
interactive=False # Disable the button
|
893 |
-
)
|
894 |
-
),
|
895 |
-
inputs=[human_input],
|
896 |
-
outputs=[ai_response, generate_btn]
|
897 |
-
)
|
898 |
-
|
899 |
-
human_input.change(
|
900 |
-
fn=lambda x: gr.update(interactive=bool(x.strip())),
|
901 |
-
inputs=[human_input],
|
902 |
-
outputs=[generate_btn]
|
903 |
)
|
904 |
|
905 |
# Update the demo.load to include the random example population
|
906 |
demo.load(
|
907 |
-
fn=
|
908 |
inputs=[],
|
909 |
-
outputs=[
|
910 |
-
human_input,
|
911 |
-
ai_response,
|
912 |
-
random_btn,
|
913 |
-
score_a,
|
914 |
-
critique_a,
|
915 |
-
score_b,
|
916 |
-
critique_b,
|
917 |
-
vote_a,
|
918 |
-
vote_b,
|
919 |
-
vote_tie,
|
920 |
-
model_name_a,
|
921 |
-
model_name_b,
|
922 |
-
ground_truth,
|
923 |
-
]
|
924 |
)
|
925 |
|
926 |
-
# Add new state variables for compatible mode
|
927 |
-
eval_criteria_previous = gr.State(value=DEFAULT_EVAL_CRITERIA)
|
928 |
-
score1_previous = gr.State(value=DEFAULT_SCORE_1)
|
929 |
-
score2_previous = gr.State(value=DEFAULT_SCORE_2)
|
930 |
-
score3_previous = gr.State(value=DEFAULT_SCORE_3)
|
931 |
-
score4_previous = gr.State(value=DEFAULT_SCORE_4)
|
932 |
-
score5_previous = gr.State(value=DEFAULT_SCORE_5)
|
933 |
-
|
934 |
-
# Add new functions to handle compatible mode saves/cancels
|
935 |
-
def save_compatible_prompt(criteria, score1, score2, score3, score4, score5):
|
936 |
-
return [
|
937 |
-
gr.update(value=criteria), # Update criteria
|
938 |
-
criteria, # Update previous criteria state
|
939 |
-
gr.update(value=score1),
|
940 |
-
score1,
|
941 |
-
gr.update(value=score2),
|
942 |
-
score2,
|
943 |
-
gr.update(value=score3),
|
944 |
-
score3,
|
945 |
-
gr.update(value=score4),
|
946 |
-
score4,
|
947 |
-
gr.update(value=score5),
|
948 |
-
score5,
|
949 |
-
gr.update(visible=False) # Hide buttons
|
950 |
-
]
|
951 |
-
|
952 |
-
def cancel_compatible_prompt(prev_criteria, prev_score1, prev_score2, prev_score3, prev_score4, prev_score5):
|
953 |
-
return [
|
954 |
-
gr.update(value=prev_criteria),
|
955 |
-
prev_criteria,
|
956 |
-
gr.update(value=prev_score1),
|
957 |
-
prev_score1,
|
958 |
-
gr.update(value=prev_score2),
|
959 |
-
prev_score2,
|
960 |
-
gr.update(value=prev_score3),
|
961 |
-
prev_score3,
|
962 |
-
gr.update(value=prev_score4),
|
963 |
-
prev_score4,
|
964 |
-
gr.update(value=prev_score5),
|
965 |
-
prev_score5,
|
966 |
-
gr.update(visible=False)
|
967 |
-
]
|
968 |
-
|
969 |
-
def show_compatible_edit_buttons(*current_values):
|
970 |
-
previous_values = current_values[1::2] # Get previous values
|
971 |
-
current_values = current_values[::2] # Get current values
|
972 |
-
return gr.update(visible=any(curr != prev for curr, prev in zip(current_values, previous_values)))
|
973 |
-
|
974 |
-
# Add click handlers for compatible mode buttons
|
975 |
-
compatible_save_btn.click(
|
976 |
-
fn=save_compatible_prompt,
|
977 |
-
inputs=[
|
978 |
-
eval_criteria_text,
|
979 |
-
score1_description,
|
980 |
-
score2_description,
|
981 |
-
score3_description,
|
982 |
-
score4_description,
|
983 |
-
score5_description
|
984 |
-
],
|
985 |
-
outputs=[
|
986 |
-
eval_criteria_text,
|
987 |
-
eval_criteria_previous,
|
988 |
-
score1_description,
|
989 |
-
score1_previous,
|
990 |
-
score2_description,
|
991 |
-
score2_previous,
|
992 |
-
score3_description,
|
993 |
-
score3_previous,
|
994 |
-
score4_description,
|
995 |
-
score4_previous,
|
996 |
-
score5_description,
|
997 |
-
score5_previous,
|
998 |
-
compatible_edit_buttons_row
|
999 |
-
]
|
1000 |
-
)
|
1001 |
-
|
1002 |
-
compatible_cancel_btn.click(
|
1003 |
-
fn=cancel_compatible_prompt,
|
1004 |
-
inputs=[
|
1005 |
-
eval_criteria_previous,
|
1006 |
-
score1_previous,
|
1007 |
-
score2_previous,
|
1008 |
-
score3_previous,
|
1009 |
-
score4_previous,
|
1010 |
-
score5_previous
|
1011 |
-
],
|
1012 |
-
outputs=[
|
1013 |
-
eval_criteria_text,
|
1014 |
-
eval_criteria_previous,
|
1015 |
-
score1_description,
|
1016 |
-
score1_previous,
|
1017 |
-
score2_description,
|
1018 |
-
score2_previous,
|
1019 |
-
score3_description,
|
1020 |
-
score3_previous,
|
1021 |
-
score4_description,
|
1022 |
-
score4_previous,
|
1023 |
-
score5_description,
|
1024 |
-
score5_previous,
|
1025 |
-
compatible_edit_buttons_row
|
1026 |
-
]
|
1027 |
-
)
|
1028 |
-
|
1029 |
-
# Add change handlers for all compatible mode inputs
|
1030 |
-
for component in [eval_criteria_text, score1_description, score2_description,
|
1031 |
-
score3_description, score4_description, score5_description]:
|
1032 |
-
component.change(
|
1033 |
-
fn=show_compatible_edit_buttons,
|
1034 |
-
inputs=[
|
1035 |
-
eval_criteria_text,
|
1036 |
-
eval_criteria_previous,
|
1037 |
-
score1_description,
|
1038 |
-
score1_previous,
|
1039 |
-
score2_description,
|
1040 |
-
score2_previous,
|
1041 |
-
score3_description,
|
1042 |
-
score3_previous,
|
1043 |
-
score4_description,
|
1044 |
-
score4_previous,
|
1045 |
-
score5_description,
|
1046 |
-
score5_previous
|
1047 |
-
],
|
1048 |
-
outputs=compatible_edit_buttons_row
|
1049 |
-
)
|
1050 |
-
|
1051 |
if __name__ == "__main__":
|
1052 |
demo.launch()
|
|
|
2 |
import re
|
3 |
import random
|
4 |
from collections import defaultdict
|
5 |
+
from datetime import datetime, timezone
|
6 |
import hashlib
|
|
|
7 |
|
8 |
from dotenv import load_dotenv
|
|
|
9 |
|
10 |
+
load_dotenv()
|
|
|
|
11 |
|
12 |
+
import gradio as gr
|
13 |
+
from gen_api_answer import get_model_response, parse_model_response, get_random_human_ai_pair
|
|
|
|
|
|
|
14 |
from db import add_vote, create_db_connection, get_votes
|
|
|
15 |
from utils import Vote
|
|
|
16 |
from common import (
|
17 |
POLICY_CONTENT,
|
18 |
ACKNOWLEDGEMENTS,
|
19 |
+
DEFAULT_EVAL_PROMPT,
|
20 |
+
DEFAULT_INPUT,
|
21 |
+
DEFAULT_RESPONSE,
|
22 |
CSS_STYLES,
|
23 |
MAIN_TITLE,
|
24 |
HOW_IT_WORKS,
|
25 |
+
BATTLE_RULES,
|
26 |
+
EVAL_DESCRIPTION,
|
27 |
+
VOTING_HEADER,
|
28 |
)
|
29 |
+
from example_metrics import EXAMPLE_METRICS
|
|
|
|
|
30 |
|
31 |
|
32 |
+
# Model and ELO score data
|
33 |
+
DEFAULT_ELO = 1200 # Starting ELO for new models
|
34 |
+
K_FACTOR = 32 # Standard chess K-factor, adjust as needed
|
35 |
elo_scores = defaultdict(lambda: DEFAULT_ELO)
|
36 |
vote_counts = defaultdict(int)
|
37 |
|
|
|
52 |
"organization": model["organization"],
|
53 |
"license": model["license"],
|
54 |
"api_model": model["api_model"],
|
|
|
55 |
}
|
56 |
except FileNotFoundError:
|
57 |
print("Warning: models.jsonl not found")
|
|
|
62 |
model_data = load_model_data()
|
63 |
|
64 |
def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
|
|
|
|
|
65 |
vote = Vote(
|
66 |
timestamp=datetime.now().isoformat(),
|
67 |
+
prompt=prompt,
|
68 |
response_a=response_a,
|
69 |
response_b=response_b,
|
70 |
model_a=model_a,
|
|
|
93 |
return eval_prompt
|
94 |
|
95 |
|
96 |
+
def submit_prompt(eval_prompt, *variable_values):
|
97 |
+
try:
|
98 |
+
variables = parse_variables(eval_prompt)
|
99 |
+
variable_values_dict = {var: val for var, val in zip(variables, variable_values)}
|
100 |
+
final_prompt = get_final_prompt(eval_prompt, variable_values_dict)
|
101 |
+
|
102 |
+
models = list(model_data.keys())
|
103 |
+
model1, model2 = random.sample(models, 2)
|
104 |
+
model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
|
105 |
+
|
106 |
+
response_a = get_model_response(model_a, model_data.get(model_a), final_prompt)
|
107 |
+
response_b = get_model_response(model_b, model_data.get(model_b), final_prompt)
|
108 |
+
|
109 |
+
return (
|
110 |
+
response_a,
|
111 |
+
response_b,
|
112 |
+
gr.update(visible=True),
|
113 |
+
gr.update(visible=True),
|
114 |
+
model_a,
|
115 |
+
model_b,
|
116 |
+
final_prompt,
|
117 |
+
)
|
118 |
+
except Exception as e:
|
119 |
+
print(f"Error in submit_prompt: {str(e)}")
|
120 |
+
return (
|
121 |
+
"Error generating response",
|
122 |
+
"Error generating response",
|
123 |
+
gr.update(visible=False),
|
124 |
+
gr.update(visible=False),
|
125 |
+
None,
|
126 |
+
None,
|
127 |
+
None,
|
128 |
+
)
|
129 |
+
|
130 |
|
131 |
def get_ip(request: gr.Request) -> str:
|
132 |
"""Get and hash the IP address from the request."""
|
|
|
143 |
return hashlib.sha256(ip.encode()).hexdigest()[:16]
|
144 |
|
145 |
|
|
|
|
|
|
146 |
def vote(
|
147 |
choice,
|
148 |
model_a,
|
|
|
192 |
store_vote_data(
|
193 |
final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
|
194 |
)
|
195 |
+
|
196 |
+
# Return updates for UI components
|
|
|
|
|
|
|
197 |
return [
|
198 |
+
gr.update(visible=False), # vote_a
|
199 |
+
gr.update(visible=False), # vote_b
|
200 |
+
gr.update(visible=False), # tie_button_row
|
201 |
+
gr.update(value=f"*Model: {model_a}*"), # model_name_a
|
202 |
+
gr.update(value=f"*Model: {model_b}*"), # model_name_b
|
203 |
+
gr.update(interactive=True, value="Run the evaluators", variant="primary"), # send_btn
|
204 |
+
gr.update(visible=True), # spacing_div
|
|
|
205 |
]
|
206 |
|
207 |
|
|
|
210 |
return get_votes(db)
|
211 |
|
212 |
|
213 |
+
def get_leaderboard(show_preliminary=True):
|
214 |
+
"""Generate leaderboard data using fresh votes from MongoDB."""
|
215 |
+
# Get fresh voting data
|
216 |
voting_data = get_current_votes()
|
217 |
+
print(f"Fetched {len(voting_data)} votes from database") # Debug log
|
218 |
+
|
219 |
+
# Initialize dictionaries for tracking
|
220 |
+
ratings = defaultdict(lambda: DEFAULT_ELO)
|
221 |
+
matches = defaultdict(int)
|
222 |
+
|
223 |
+
# Process each vote
|
224 |
+
for vote in voting_data:
|
225 |
+
try:
|
226 |
+
model_a = vote.get("model_a")
|
227 |
+
model_b = vote.get("model_b")
|
228 |
+
winner = vote.get("winner")
|
229 |
+
|
230 |
+
# Skip if models aren't in current model_data
|
231 |
+
if (
|
232 |
+
not all([model_a, model_b, winner])
|
233 |
+
or model_a not in model_data
|
234 |
+
or model_b not in model_data
|
235 |
+
):
|
236 |
+
continue
|
237 |
+
|
238 |
+
# Update match counts
|
239 |
+
matches[model_a] += 1
|
240 |
+
matches[model_b] += 1
|
241 |
+
|
242 |
+
# Calculate ELO changes
|
243 |
+
elo_a = ratings[model_a]
|
244 |
+
elo_b = ratings[model_b]
|
245 |
+
|
246 |
+
# Expected scores
|
247 |
+
expected_a = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
|
248 |
+
expected_b = 1 - expected_a
|
249 |
+
|
250 |
+
# Actual scores
|
251 |
+
score_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
|
252 |
+
score_b = 1 - score_a
|
253 |
+
|
254 |
+
# Update ratings
|
255 |
+
ratings[model_a] += K_FACTOR * (score_a - expected_a)
|
256 |
+
ratings[model_b] += K_FACTOR * (score_b - expected_b)
|
257 |
+
|
258 |
+
except Exception as e:
|
259 |
+
print(f"Error processing vote: {e}")
|
260 |
+
continue
|
261 |
+
|
262 |
+
# Generate leaderboard data
|
263 |
+
leaderboard = []
|
264 |
+
for model in model_data.keys():
|
265 |
+
votes = matches[model]
|
266 |
+
# Skip models with < 500 votes if show_preliminary is False
|
267 |
+
if not show_preliminary and votes < 500:
|
268 |
+
continue
|
269 |
+
|
270 |
+
elo = ratings[model]
|
271 |
+
ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
|
272 |
+
data = {
|
273 |
+
"Model": model,
|
274 |
+
"ELO Score": f"{int(elo)}",
|
275 |
+
"95% CI": f"±{int(ci)}",
|
276 |
+
"# Votes": votes,
|
277 |
+
"Organization": model_data[model]["organization"],
|
278 |
+
"License": model_data[model]["license"],
|
279 |
+
}
|
280 |
+
leaderboard.append(data)
|
281 |
+
|
282 |
+
# Sort leaderboard by ELO score in descending order
|
283 |
+
leaderboard.sort(key=lambda x: float(x["ELO Score"]), reverse=True)
|
284 |
+
|
285 |
+
return leaderboard
|
286 |
+
|
287 |
+
|
288 |
+
def calculate_elo_change(rating_a, rating_b, winner):
|
289 |
+
"""Calculate ELO rating changes for both players."""
|
290 |
+
expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
|
291 |
+
expected_b = 1 - expected_a
|
292 |
+
|
293 |
+
if winner == "A":
|
294 |
+
score_a, score_b = 1, 0
|
295 |
+
elif winner == "B":
|
296 |
+
score_a, score_b = 0, 1
|
297 |
+
else: # Handle ties
|
298 |
+
score_a, score_b = 0.5, 0.5
|
299 |
+
|
300 |
+
change_a = K_FACTOR * (score_a - expected_a)
|
301 |
+
change_b = K_FACTOR * (score_b - expected_b)
|
302 |
+
|
303 |
+
return change_a, change_b
|
304 |
+
|
305 |
+
|
306 |
+
def update_leaderboard():
|
307 |
+
"""Generate leaderboard DataFrame using fresh votes from MongoDB."""
|
308 |
+
# Get fresh voting data
|
309 |
+
voting_data = get_current_votes()
|
310 |
+
print(f"Found {len(voting_data)} votes in database")
|
311 |
+
matches = defaultdict(int)
|
312 |
+
|
313 |
+
# Process each vote chronologically
|
314 |
+
for vote in voting_data:
|
315 |
+
# Extract model names from the vote document
|
316 |
+
try:
|
317 |
+
model_a = vote.get("model_a")
|
318 |
+
model_b = vote.get("model_b")
|
319 |
+
winner = vote.get("winner")
|
320 |
+
|
321 |
+
print(f"Processing vote: {model_a} vs {model_b}, winner: {winner}")
|
322 |
+
|
323 |
+
# Skip if any required field is missing or models aren't in current model_data
|
324 |
+
if not all([model_a, model_b, winner]):
|
325 |
+
print(f"Missing required fields in vote: {vote}")
|
326 |
+
continue
|
327 |
+
|
328 |
+
if model_a not in model_data:
|
329 |
+
print(f"Model A '{model_a}' not found in model_data")
|
330 |
+
continue
|
331 |
+
|
332 |
+
if model_b not in model_data:
|
333 |
+
print(f"Model B '{model_b}' not found in model_data")
|
334 |
+
continue
|
335 |
+
|
336 |
+
# Update match counts
|
337 |
+
matches[model_a] += 1
|
338 |
+
matches[model_b] += 1
|
339 |
+
print(
|
340 |
+
f"Updated matches - {model_a}: {matches[model_a]}, {model_b}: {matches[model_b]}"
|
341 |
+
)
|
342 |
+
except Exception as e:
|
343 |
+
print(f"Error processing vote: {e}")
|
344 |
+
print(f"Problematic vote data: {vote}")
|
345 |
+
continue
|
346 |
+
|
347 |
+
|
348 |
+
# Update the display_leaderboard function
|
349 |
+
def display_leaderboard():
|
350 |
+
df = update_leaderboard()
|
351 |
+
return gr.DataFrame(
|
352 |
+
value=df,
|
353 |
+
headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
|
354 |
+
datatype=["str", "number", "str", "number", "str", "str", "str"],
|
355 |
+
row_count=(len(df) + 1, "dynamic"),
|
356 |
+
)
|
357 |
|
358 |
|
359 |
# Update the leaderboard table definition in the UI
|
|
|
363 |
)
|
364 |
|
365 |
|
366 |
+
def get_leaderboard_stats():
|
367 |
+
"""Get summary statistics for the leaderboard."""
|
368 |
+
now = datetime.now(timezone.utc)
|
369 |
+
total_votes = len(get_current_votes())
|
370 |
+
total_models = len(model_data)
|
371 |
+
last_updated = now.replace(minute=0, second=0, microsecond=0).strftime(
|
372 |
+
"%B %d, %Y at %H:00 UTC"
|
373 |
+
)
|
374 |
+
|
375 |
+
return f"""
|
376 |
+
### Leaderboard Stats
|
377 |
+
- **Total Models**: {total_models}
|
378 |
+
- **Total Votes**: {total_votes}
|
379 |
+
- **Last Updated**: {last_updated}
|
380 |
+
"""
|
381 |
+
|
382 |
+
|
383 |
+
#def set_example_metric(metric_name):
|
384 |
+
# if metric_name == "Custom":
|
385 |
+
# variables = parse_variables(DEFAULT_EVAL_PROMPT)
|
386 |
+
# variable_values = []
|
387 |
+
# for var in variables:
|
388 |
+
# if var == "input":
|
389 |
+
# variable_values.append(DEFAULT_INPUT)
|
390 |
+
# elif var == "response":
|
391 |
+
# variable_values.append(DEFAULT_RESPONSE)
|
392 |
+
# else:
|
393 |
+
# variable_values.append("") # Default empty value
|
394 |
+
# Pad variable_values to match the length of variable_rows
|
395 |
+
# while len(variable_values) < len(variable_rows):
|
396 |
+
# variable_values.append("")
|
397 |
+
# return [DEFAULT_EVAL_PROMPT] + variable_values
|
398 |
+
|
399 |
+
# metric_data = EXAMPLE_METRICS[metric_name]
|
400 |
+
# variables = parse_variables(metric_data["prompt"])
|
401 |
+
# variable_values = []
|
402 |
+
# for var in variables:
|
403 |
+
# value = metric_data.get(var, "") # Default to empty string if not found
|
404 |
+
# variable_values.append(value)
|
405 |
+
# Pad variable_values to match the length of variable_rows
|
406 |
+
# while len(variable_values) < len(variable_rows):
|
407 |
+
# variable_values.append("")
|
408 |
+
# return [metric_data["prompt"]] + variable_values
|
409 |
+
|
410 |
+
|
411 |
+
# Select random metric at startup
|
412 |
+
# def get_random_metric():
|
413 |
+
# metrics = list(EXAMPLE_METRICS.keys())
|
414 |
+
# return set_example_metric(random.choice(metrics))
|
415 |
+
|
416 |
+
|
417 |
+
def populate_random_example(request: gr.Request):
|
418 |
+
"""Generate a random human-AI conversation example."""
|
419 |
+
human_msg, ai_msg = get_random_human_ai_pair()
|
420 |
return [
|
421 |
gr.update(value=human_msg),
|
422 |
+
gr.update(value=ai_msg)
|
|
|
|
|
|
|
|
423 |
]
|
424 |
|
425 |
|
|
|
435 |
|
436 |
with gr.Tabs():
|
437 |
with gr.TabItem("Judge Arena"):
|
438 |
+
random_btn = gr.Button("🎲", scale=0)
|
439 |
with gr.Row():
|
440 |
# Left side - Input section
|
441 |
with gr.Column(scale=1):
|
442 |
with gr.Group():
|
443 |
human_input = gr.TextArea(
|
444 |
+
label="👩 Human Input",
|
445 |
+
lines=12,
|
446 |
placeholder="Enter the human message here..."
|
447 |
)
|
|
|
|
|
448 |
|
449 |
ai_response = gr.TextArea(
|
450 |
label="🤖 AI Response",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
451 |
lines=12,
|
452 |
+
placeholder="Enter the AI response here..."
|
|
|
453 |
)
|
454 |
|
|
|
|
|
455 |
send_btn = gr.Button(
|
456 |
+
value="Run the evaluators",
|
457 |
variant="primary",
|
458 |
+
size="lg"
|
|
|
459 |
)
|
460 |
|
461 |
# Right side - Model outputs
|
|
|
465 |
model_name_a = gr.Markdown("*Model: Hidden*")
|
466 |
with gr.Row():
|
467 |
with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
|
468 |
+
score_a = gr.Textbox(label="Score", lines=5, interactive=False)
|
469 |
+
vote_a = gr.Button("Vote A", variant="primary", visible=False)
|
470 |
with gr.Column(scale=9, min_width=400): # Wider width for critique
|
471 |
+
critique_a = gr.TextArea(label="Critique", lines=7, interactive=False)
|
472 |
+
|
473 |
+
# Spacing div that's visible only when tie button is hidden
|
474 |
+
spacing_div = gr.HTML('<div style="height: 42px;"></div>', visible=True, elem_id="spacing-div")
|
475 |
|
476 |
# Tie button row
|
477 |
+
with gr.Row(visible=False) as tie_button_row:
|
478 |
with gr.Column():
|
479 |
+
vote_tie = gr.Button("Tie", variant="secondary")
|
480 |
|
481 |
|
482 |
gr.Markdown("### 🧑⚖️ Judge B")
|
|
|
484 |
model_name_b = gr.Markdown("*Model: Hidden*")
|
485 |
with gr.Row():
|
486 |
with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
|
487 |
+
score_b = gr.Textbox(label="Score", lines=5, interactive=False)
|
488 |
+
vote_b = gr.Button("Vote B", variant="primary", visible=False)
|
489 |
with gr.Column(scale=9, min_width=400): # Wider width for critique
|
490 |
+
critique_b = gr.TextArea(label="Critique", lines=7, interactive=False)
|
491 |
+
# Place Vote B button directly under Judge B
|
492 |
|
493 |
gr.Markdown("<br>")
|
|
|
|
|
494 |
|
495 |
+
# Add spacing and acknowledgements at the bottom
|
496 |
+
gr.Markdown(ACKNOWLEDGEMENTS)
|
|
|
|
|
|
497 |
|
498 |
with gr.TabItem("Leaderboard"):
|
499 |
with gr.Row():
|
|
|
501 |
show_preliminary = gr.Checkbox(
|
502 |
label="Reveal preliminary results",
|
503 |
value=True, # Checked by default
|
504 |
+
info="Show all models, including models with less few human ratings (< 500 votes)",
|
505 |
interactive=True
|
506 |
)
|
507 |
stats_display = gr.Markdown()
|
|
|
509 |
headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
|
510 |
datatype=["str", "number", "str", "number", "str", "str", "str"],
|
511 |
)
|
|
|
|
|
|
|
|
|
512 |
|
513 |
+
# Update refresh_leaderboard to use the checkbox value
|
514 |
+
def refresh_leaderboard(show_preliminary):
|
515 |
+
"""Refresh the leaderboard data and stats."""
|
516 |
+
leaderboard = get_leaderboard(show_preliminary)
|
517 |
+
data = [
|
518 |
+
[
|
519 |
+
entry["Model"],
|
520 |
+
float(entry["ELO Score"]),
|
521 |
+
entry["95% CI"],
|
522 |
+
entry["# Votes"],
|
523 |
+
entry["Organization"],
|
524 |
+
entry["License"],
|
525 |
+
]
|
526 |
+
for entry in leaderboard
|
527 |
+
]
|
528 |
+
stats = get_leaderboard_stats()
|
529 |
+
return [gr.update(value=data), gr.update(value=stats)]
|
530 |
|
531 |
# Add change handler for checkbox
|
532 |
show_preliminary.change(
|
|
|
544 |
|
545 |
with gr.TabItem("Policy"):
|
546 |
gr.Markdown(POLICY_CONTENT)
|
|
|
547 |
|
548 |
# Define state variables for model tracking
|
549 |
model_a_state = gr.State()
|
550 |
model_b_state = gr.State()
|
551 |
final_prompt_state = gr.State()
|
552 |
+
|
553 |
+
# Update variable inputs based on the eval prompt
|
554 |
+
def update_variables(eval_prompt):
|
555 |
+
variables = parse_variables(eval_prompt)
|
556 |
+
updates = []
|
557 |
+
|
558 |
+
for i in range(len(variable_rows)):
|
559 |
+
var_row, var_input = variable_rows[i]
|
560 |
+
if i < len(variables):
|
561 |
+
var_name = variables[i]
|
562 |
+
# Set the number of lines based on the variable name
|
563 |
+
if var_name == "response":
|
564 |
+
lines = 4 # Adjust this number as needed
|
565 |
+
else:
|
566 |
+
lines = 1 # Default to single line for other variables
|
567 |
+
updates.extend(
|
568 |
+
[
|
569 |
+
gr.update(visible=True), # Show the variable row
|
570 |
+
gr.update(
|
571 |
+
label=var_name, visible=True, lines=lines
|
572 |
+
), # Update label and lines
|
573 |
+
]
|
574 |
+
)
|
575 |
+
else:
|
576 |
+
updates.extend(
|
577 |
+
[
|
578 |
+
gr.update(visible=False), # Hide the variable row
|
579 |
+
gr.update(value="", visible=False), # Clear value when hidden
|
580 |
+
]
|
581 |
+
)
|
582 |
+
return updates
|
583 |
+
|
584 |
+
#eval_prompt.change(
|
585 |
+
# fn=update_variables,
|
586 |
+
# inputs=eval_prompt,
|
587 |
+
# outputs=[item for sublist in variable_rows for item in sublist],
|
588 |
+
#)
|
589 |
+
|
590 |
+
# Regenerate button functionality
|
591 |
+
#regenerate_button.click(
|
592 |
+
# fn=regenerate_prompt,
|
593 |
+
# inputs=[model_a_state, model_b_state, eval_prompt, human_input, ai_response],
|
594 |
+
# outputs=[
|
595 |
+
# score_a,
|
596 |
+
# critique_a,
|
597 |
+
# score_b,
|
598 |
+
# critique_b,
|
599 |
+
# vote_a,
|
600 |
+
# vote_b,
|
601 |
+
# tie_button_row,
|
602 |
+
# model_name_a,
|
603 |
+
# model_name_b,
|
604 |
+
# model_a_state,
|
605 |
+
# model_b_state,
|
606 |
+
# ],
|
607 |
+
#)
|
608 |
|
609 |
# Update model names after responses are generated
|
610 |
def update_model_names(model_a, model_b):
|
|
|
619 |
vote_a.click(
|
620 |
fn=vote,
|
621 |
inputs=[
|
622 |
+
gr.State("A"), # Choice
|
623 |
model_a_state,
|
624 |
model_b_state,
|
625 |
final_prompt_state,
|
|
|
631 |
outputs=[
|
632 |
vote_a,
|
633 |
vote_b,
|
634 |
+
tie_button_row,
|
635 |
model_name_a,
|
636 |
model_name_b,
|
637 |
send_btn,
|
638 |
+
spacing_div,
|
|
|
639 |
],
|
640 |
)
|
641 |
|
642 |
vote_b.click(
|
643 |
fn=vote,
|
644 |
inputs=[
|
645 |
+
gr.State("B"), # Choice
|
646 |
model_a_state,
|
647 |
model_b_state,
|
648 |
final_prompt_state,
|
|
|
654 |
outputs=[
|
655 |
vote_a,
|
656 |
vote_b,
|
657 |
+
tie_button_row,
|
658 |
model_name_a,
|
659 |
model_name_b,
|
660 |
send_btn,
|
661 |
+
spacing_div,
|
|
|
662 |
],
|
663 |
)
|
664 |
|
665 |
vote_tie.click(
|
666 |
fn=vote,
|
667 |
inputs=[
|
668 |
+
gr.State("Tie"), # Choice
|
669 |
model_a_state,
|
670 |
model_b_state,
|
671 |
final_prompt_state,
|
|
|
677 |
outputs=[
|
678 |
vote_a,
|
679 |
vote_b,
|
680 |
+
tie_button_row,
|
681 |
model_name_a,
|
682 |
model_name_b,
|
683 |
send_btn,
|
684 |
+
spacing_div,
|
|
|
685 |
],
|
686 |
)
|
687 |
|
688 |
+
# Update the send button handler to store the submitted inputs
|
689 |
+
def submit_and_store(prompt, *variables):
|
690 |
+
# Create a copy of the current submission
|
691 |
+
current_submission = {"prompt": prompt, "variables": variables}
|
692 |
+
|
693 |
+
# Get the responses
|
694 |
+
(
|
695 |
+
response_a,
|
696 |
+
response_b,
|
697 |
+
buttons_visible,
|
698 |
+
regen_visible,
|
699 |
+
model_a,
|
700 |
+
model_b,
|
701 |
+
final_prompt,
|
702 |
+
) = submit_prompt(prompt, *variables)
|
703 |
|
704 |
+
# Parse the responses
|
705 |
+
score_a, critique_a = parse_model_response(response_a)
|
706 |
+
score_b, critique_b = parse_model_response(response_b)
|
707 |
|
708 |
+
# Format scores with "/ 5"
|
709 |
+
score_a = f"{score_a} / 5"
|
710 |
+
score_b = f"{score_b} / 5"
|
|
|
|
|
|
|
711 |
|
712 |
+
# Update the last_submission state with the current values
|
713 |
+
last_submission.value = current_submission
|
|
|
|
|
|
714 |
|
715 |
+
return (
|
|
|
|
|
|
|
|
716 |
score_a,
|
717 |
critique_a,
|
718 |
score_b,
|
719 |
critique_b,
|
720 |
+
gr.update(visible=True), # vote_a
|
721 |
+
gr.update(visible=True), # vote_b
|
722 |
+
gr.update(visible=True), # tie_button_row
|
|
|
|
|
|
723 |
model_a,
|
|
|
|
|
|
|
|
|
|
|
724 |
model_b,
|
725 |
+
final_prompt, # Add final_prompt to state
|
|
|
|
|
|
726 |
gr.update(value="*Model: Hidden*"),
|
727 |
gr.update(value="*Model: Hidden*"),
|
728 |
+
# Change the button to "Regenerate" mode after evaluation
|
729 |
+
gr.update(
|
730 |
+
value="Regenerate with different models",
|
731 |
+
variant="secondary",
|
732 |
+
interactive=True
|
733 |
+
),
|
734 |
+
gr.update(visible=False), # spacing_div
|
735 |
)
|
736 |
|
|
|
|
|
|
737 |
send_btn.click(
|
738 |
fn=submit_and_store,
|
739 |
+
inputs=[eval_prompt, human_input, ai_response],
|
|
|
|
|
740 |
outputs=[
|
741 |
score_a,
|
742 |
critique_a,
|
|
|
744 |
critique_b,
|
745 |
vote_a,
|
746 |
vote_b,
|
747 |
+
tie_button_row,
|
748 |
model_a_state,
|
749 |
model_b_state,
|
750 |
final_prompt_state,
|
751 |
model_name_a,
|
752 |
model_name_b,
|
753 |
send_btn,
|
754 |
+
spacing_div,
|
755 |
],
|
756 |
)
|
757 |
|
758 |
+
# Update the input change handlers to also disable regenerate button
|
759 |
+
def handle_input_changes(prompt, *variables):
|
760 |
+
"""Enable send button and manage regenerate button based on input changes"""
|
761 |
+
last_inputs = last_submission.value
|
762 |
+
current_inputs = {"prompt": prompt, "variables": variables}
|
763 |
+
inputs_changed = last_inputs != current_inputs
|
764 |
+
return [
|
765 |
+
gr.update(interactive=True), # send button always enabled
|
766 |
+
gr.update(
|
767 |
+
interactive=not inputs_changed
|
768 |
+
), # regenerate button disabled if inputs changed
|
769 |
+
]
|
770 |
+
|
771 |
+
# Update the change handlers for prompt and variables
|
772 |
+
#eval_prompt.change(
|
773 |
+
# fn=handle_input_changes,
|
774 |
+
# inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
|
775 |
+
# outputs=[send_btn, regenerate_button],
|
776 |
+
#)
|
777 |
+
|
778 |
+
# for _, var_input in variable_rows:
|
779 |
+
# var_input.change(
|
780 |
+
# fn=handle_input_changes,
|
781 |
+
# inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
|
782 |
+
# outputs=[send_btn, regenerate_button],
|
783 |
+
# )
|
784 |
+
|
785 |
+
# Add click handlers for metric buttons
|
786 |
+
#outputs_list = [eval_prompt] + [var_input for _, var_input in variable_rows]
|
787 |
+
|
788 |
+
#custom_btn.click(fn=lambda: set_example_metric("Custom"), outputs=outputs_list)
|
789 |
+
|
790 |
+
#hallucination_btn.click(
|
791 |
+
# fn=lambda: set_example_metric("Hallucination"), outputs=outputs_list
|
792 |
+
#)
|
793 |
+
|
794 |
+
#precision_btn.click(fn=lambda: set_example_metric("Precision"), outputs=outputs_list)
|
795 |
+
|
796 |
+
#recall_btn.click(fn=lambda: set_example_metric("Recall"), outputs=outputs_list)
|
797 |
+
|
798 |
+
#coherence_btn.click(
|
799 |
+
# fn=lambda: set_example_metric("Logical_Coherence"), outputs=outputs_list
|
800 |
+
#)
|
801 |
+
|
802 |
+
#faithfulness_btn.click(
|
803 |
+
# fn=lambda: set_example_metric("Faithfulness"), outputs=outputs_list
|
804 |
+
#)
|
805 |
+
|
806 |
+
# Set default metric at startup
|
807 |
+
demo.load(
|
808 |
+
#fn=lambda: set_example_metric("Hallucination"),
|
809 |
+
#outputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
|
810 |
+
)
|
811 |
+
|
812 |
# Add random button handler
|
813 |
random_btn.click(
|
814 |
fn=populate_random_example,
|
815 |
+
inputs=[],
|
816 |
+
outputs=[human_input, ai_response]
|
|
|
|
|
817 |
)
|
818 |
|
819 |
# Add new input change handlers
|
820 |
def handle_input_change():
|
821 |
+
return gr.update(value="Run the evaluators", variant="primary")
|
|
|
|
|
822 |
|
823 |
# Update the change handlers for inputs
|
824 |
human_input.change(
|
825 |
fn=handle_input_change,
|
826 |
inputs=[],
|
827 |
+
outputs=[send_btn]
|
828 |
)
|
829 |
|
830 |
ai_response.change(
|
831 |
fn=handle_input_change,
|
832 |
inputs=[],
|
833 |
+
outputs=[send_btn]
|
|
|
|
|
|
834 |
)
|
835 |
|
836 |
# Update the demo.load to include the random example population
|
837 |
demo.load(
|
838 |
+
fn=populate_random_example,
|
839 |
inputs=[],
|
840 |
+
outputs=[human_input, ai_response]
|
|
|
|
|
|
|
841 |
)
|
842 |
|
|
|
|
843 |
if __name__ == "__main__":
|
844 |
demo.launch()
|
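The wiring above is standard Gradio event plumbing: each input's `.change()` event re-arms the send button, and `demo.load(...)` fills the example fields when the page opens. A minimal, self-contained sketch of the same pattern (the component labels and sample data here are illustrative, not the Arena's actual UI):

```python
import random
import gradio as gr

def populate_random_example():
    # Stand-in for the app's random example generator
    examples = [
        ("Which planet is known as the Red Planet?", "Mars."),
        ("Name one renewable energy source.", "Solar power."),
    ]
    return random.choice(examples)

def handle_input_change():
    # Re-arm the send button whenever the user edits an input
    return gr.update(value="Run the evaluators", variant="primary")

with gr.Blocks() as demo:
    human_input = gr.Textbox(label="Human input")
    ai_response = gr.Textbox(label="AI response")
    send_btn = gr.Button("Run the evaluators", variant="primary")

    human_input.change(fn=handle_input_change, inputs=[], outputs=[send_btn])
    ai_response.change(fn=handle_input_change, inputs=[], outputs=[send_btn])

    # Pre-populate a random example on page load, as app.py does above
    demo.load(fn=populate_random_example, inputs=[], outputs=[human_input, ai_response])

if __name__ == "__main__":
    demo.launch()
```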
common.py
CHANGED
@@ -37,7 +37,7 @@ CSS_STYLES = """
     gap: 8px;
 }
 """
-
+
 # Default Eval Prompt
 EVAL_DESCRIPTION = """
 ## 📝 Tips
@@ -47,6 +47,27 @@ EVAL_DESCRIPTION = """
 - Examples (Optional)
 """
 
+DEFAULT_EVAL_PROMPT = """You are assessing a chat bot response to a user's input based on how well it follows the user's instructions. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Do not allow the length of the response to influence your evaluation. Be objective as possible and give a brief explanation for your score.
+
+Score:
+Score 1: The response ignores or misinterprets instructions, providing irrelevant or inaccurate content that fails to address the request.
+Score 2: The response follows instructions partially but misses key elements, lacking depth or precision while containing minor inaccuracies.
+Score 3: The response follows main instructions adequately, providing correct and relevant information with reasonable depth.
+Score 4: The response follows instructions thoroughly with strong attention to detail, offering accurate, well-developed content that thoughtfully addresses needs.
+Score 5: The response demonstrates exceptional instruction following with precise, comprehensive content that shows both insight and perfect alignment with the request.
+
+[User Query]: {{input}}
+
+[Response]: {{response}}"""
+
+# Default Variable Values
+DEFAULT_INPUT = """Which of these animals is least likely to be found in a rainforest?"
+A) Jaguar
+B) Toucan
+C) Polar Bear
+D) Sloth"""
+DEFAULT_RESPONSE = "C) Polar Bear"
+
 # Voting Section Header
 VOTING_HEADER = """
 # Start Voting Now
@@ -68,50 +89,55 @@ POLICY_CONTENT = """
 
 Atla is an applied research organization that trains models as evaluators to capture human preferences. We're a team of researchers, engineers, and operational leaders, with experience spanning a variety of disciplines, all working together to build reliable and understandable AI systems. Our research is informed by our experiences conducting AI safety research at the UK AI Task Force, OpenAI and the Stanford Existential Risks Initiative.
 <br><br>
-#
+# Our Mission
 
-By creating advanced evaluation models, we enable AI developers to identify and fix risks, leading to safer, more reliable AI that can be trusted and widely used. Our aim is to surpass the current state-of-the-art evaluation methods by training models specifically for evaluation. AIs will probably become very powerful, and perform tasks that are difficult for us to verify. We want to enable humans to oversee AI systems that are solving tasks too difficult for humans to evaluate.
-Read more about [our approach to scalable oversight](https://www.atla-ai.com/post/scaling-alignment) on our blog.
+By creating advanced evaluation models, we enable AI developers to identify and fix risks, leading to safer, more reliable AI that can be trusted and widely used. Our aim is to surpass the current state-of-the-art evaluation methods by training models specifically for evaluation. AIs will probably become very powerful, and perform tasks that are difficult for us to verify. We want to enable humans to oversee AI systems that are solving tasks too difficult for humans to evaluate. We have written more about [our approach to scalable oversight](https://www.atla-ai.com/post/scaling-alignment) on our blog.
 <br><br>
 # Judge Arena Policy
 
 ## Overview
 
-Judge Arena is an open-source platform dedicated to
+Judge Arena is an open-source platform dedicated to improving the standard of evaluation of generative AI models in their role as judges. Users can run evals and assess anonymized responses from two competing model judges, choosing the better judgement or declaring a tie. This policy outlines our commitments to maintain a fair, open, and collaborative environment :)
 
 ## Transparency
 
 - **Open-Source**: Judge Arena's code is open-source and available on GitHub. We encourage contributions from the community and anyone can replicate or modify the platform to suit their needs. We use proprietary model provider APIs where provided and Together AI's API to serve leading open-source models.
-- **Methodology**: All processes related to model evaluation, rating calculations, and model selection are openly documented.
+- **Methodology**: All processes related to model evaluation, rating calculations, and model selection are openly documented. We'd like to ensure that our ranking system is understandable and reproducible by others!
 - **Data Sharing**: Periodically, we'll share 20% of the collected evaluation data with the community. The data collected from Judge Arena is restricted to an anonymized user ID, the final prompt sent, the model responses, the user vote, and the timestamp.
 
 ## Model Inclusion Criteria
 
 Judge Arena is specifically designed to assess AI models that function as evaluators (a.k.a judges). This includes but is not limited to powerful general-purpose models and the latest language models designed for evaluation tasks. Models are eligible for inclusion if they meet the following criteria:
 
-- **Judge Capability**: The model should possess the ability to score AND critique other models' outputs effectively.
-- **
+- **Judge Capability**: The model should possess the ability to score AND critique responses, content, or other models' outputs effectively.
+- **Adaptable:** The model must be prompt-able to be evaluate in different scoring formats, for different criteria.
 - **Accessibility**:
 - **Public API Access**: Models accessible through public APIs without restrictive barriers.
 - **Open-Source Models**: Models with publicly available weights that can be downloaded and run by the community.
 
 ## Leaderboard Management
 
-- **ELO Ranking System**: Models are ranked on a public leaderboard based on aggregated user evaluations. We use an ELO rating system to rank AI judges on the public leaderboard. Each model begins with an initial rating of
+- **ELO Ranking System**: Models are ranked on a public leaderboard based on aggregated user evaluations. We use an ELO rating system to rank AI judges on the public leaderboard. Each model begins with an initial rating of 1500 (as is used by the International Chess Federation), and we use a K-factor of 32 to determine the maximum rating adjustment after each evaluation.
 - **Minimum Period**: Listed models remain accessible on Judge Arena for a minimum period of two weeks so they can be comprehensively evaluated.
 - **Deprecation Policy**: Models may be removed from the leaderboard if they become inaccessible or are no longer publicly available.
 
-
-
+This policy might be updated to reflect changes in our practices or in response to community feedback.
+
 # FAQ
 
 **Isn't this the same as Chatbot Arena?**
 
 We are big fans of what the LMSYS team have done with Chatbot Arena and fully credit them for the inspiration to develop this. We were looking for a dynamic leaderboard that graded on AI judge capabilities and didn't manage to find one, so we created Judge Arena. This UI is designed especially for evals; to match the format of the model-based eval prompts that you would use in your LLM evaluation / monitoring tool.
 
+**What are the Evaluator Prompt Templates based on?**
+
+As a quick start, we've set up templates that cover the most popular evaluation metrics out there on LLM evaluation / monitoring tools, often known as 'base metrics'. The data samples used in these were randomly picked from popular datasets from academia - [ARC](https://huggingface.co/datasets/allenai/ai2_arc), [Preference Collection](https://huggingface.co/datasets/prometheus-eval/Preference-Collection), [RewardBench](https://huggingface.co/datasets/allenai/reward-bench), [RAGTruth](https://arxiv.org/abs/2401.00396).
+
+These templates are designed as a starting point to showcase how to interact with the Judge Arena, especially for those less familiar with using LLM judges.
+
 **Why should I trust this leaderboard?**
 
-We have listed out our efforts to be fully transparent in the policies above. All of the code for this leaderboard is open-source and can be found on our [Github](https://github.com/atla-ai/judge-arena).
+We have listed out our efforts to be fully transparent in the policies above. All of the code for this leaderboard is open-source and can be found on our [Github](https://github.com/atla-ai/judge-arena).
 
 **Who funds this effort?**
 
@@ -122,5 +148,4 @@ Atla currently funds this out of our own pocket. We are looking for API credits
 We are training a general-purpose evaluator that you will soon be able to run in this Judge Arena. Our next step will be to open-source a powerful model that the community can use to run fast and accurate evaluations.
 <br><br>
 # Get in touch
-
-\nPlease file any issues on our [Github](https://github.com/atla-ai/judge-arena)."""
+Feel free to email us at [support@atla-ai.com](mailto:support@atla-ai.com) or leave feedback on our [Github](https://github.com/atla-ai/judge-arena)!"""
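The ELO mechanics referenced in the policy above are easy to spell out. A small worked example, assuming the stated starting rating of 1500 and K-factor of 32 (the arena's own implementation may differ in details):

```python
def expected_score(rating_a: float, rating_b: float) -> float:
    # Probability that judge A beats judge B under the ELO model
    return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

K_FACTOR = 32

# Two new judges both start at 1500; judge A wins the vote
rating_a, rating_b = 1500.0, 1500.0
e_a = expected_score(rating_a, rating_b)   # 0.5 for equal ratings
rating_a += K_FACTOR * (1 - e_a)           # -> 1516.0
rating_b += K_FACTOR * (0 - (1 - e_a))     # -> 1484.0
print(rating_a, rating_b)
```

A tie scores each side 0.5, which leaves two equally rated judges unchanged.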
data/models.jsonl
CHANGED
@@ -1,27 +1,16 @@
-{"name": "Meta Llama 3.1 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"
-{"name": "Meta Llama 3.1 405B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"
-{"name": "Gemma 2 27B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-27b-it"
-{"name": "Gemma 2 9B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-9b-it"
-{"name": "Qwen 2 Instruct (72B)", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2-72B-Instruct"
-{"name": "Mistral (7B) Instruct v0.3", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.3"
-{"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o"
-{"name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo"
-{"name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo"
-{"name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307"
-{"name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229"
-{"name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-
-{"name": "Meta Llama 3.1 8B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
-{"name": "Qwen 2.5 72B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo"
-{"name": "Qwen 2.5 7B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo"
-{"name": "Mistral (7B) Instruct v0.1", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.1"
-{"name": "Claude 3.5 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-sonnet-latest", "active": true}
-{"name": "Claude 3.5 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-haiku-latest", "active": true}
-{"name": "Prometheus-7b v2", "organization": "Prometheus", "license": "Open Source", "api_model": "prometheus/prometheus-7b-v2", "active": false}
-{"name": "Command-R", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r", "active": true}
-{"name": "Command-R Plus", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r-plus", "active": true}
-{"name": "Atla-8B-preview", "organization": "Atla", "license": "Open Source", "api_model": "Atla-8B-preview", "active": false}
-{"name": "Meta Llama 3.3 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", "active": true}
-{"name": "QwQ 32B Preview", "organization": "Qwen", "license": "Open Source", "api_model": "Qwen/QwQ-32B-Preview", "active": true}
-{"name": "Flow-Judge-v0.1", "organization": "Flow AI", "license": "Open Source", "api_model": "Flow-Judge-v0.1-4.65bpw-exl2", "active": false}
-{"name": "SFR-LLaMA-3.1-70B-Judge", "organization": "Salesforce", "license": "Proprietary", "api_model": "sfr-llama-3.1-70b-judge", "active": true}
-{"name": "Atla Selene 1 Mini", "organization": "Atla", "license": "Open Source", "api_model": "Atla-Selene-Mini", "active": true}
+{"name": "Meta Llama 3.1 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"}
+{"name": "Meta Llama 3.1 405B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"}
+{"name": "Gemma 2 27B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-27b-it"}
+{"name": "Gemma 2 9B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-9b-it"}
+{"name": "Qwen 2 Instruct (72B)", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2-72B-Instruct"}
+{"name": "Mistral (7B) Instruct v0.3", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.3"}
+{"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o"}
+{"name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo"}
+{"name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo"}
+{"name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307"}
+{"name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229"}
+{"name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-20240229"}
+{"name": "Meta Llama 3.1 8B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"}
+{"name": "Qwen 2.5 72B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo"}
+{"name": "Qwen 2.5 7B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo"}
+{"name": "Mistral (7B) Instruct v0.1", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.1"}
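Since each line of `data/models.jsonl` is an independent JSON object (and the new entries above no longer carry an `"active"` flag), the file can be consumed with a simple line-by-line parse. An illustrative sketch, e.g. grouping the listed judges by organization:

```python
import json
from collections import Counter

# Assumes the one-JSON-object-per-line layout shown in the diff above
with open("data/models.jsonl") as f:
    models = [json.loads(line) for line in f if line.strip()]

judges_per_org = Counter(m["organization"] for m in models)
open_source = [m["name"] for m in models if m["license"] == "Open Source"]

print(judges_per_org)
print(open_source)
```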
gen_api_answer.py
CHANGED
@@ -1,238 +1,95 @@
|
|
1 |
from openai import OpenAI
|
2 |
import anthropic
|
3 |
from together import Together
|
4 |
-
import cohere
|
5 |
import json
|
6 |
import re
|
7 |
-
import os
|
8 |
-
import requests
|
9 |
-
from prompts import (
|
10 |
-
JUDGE_SYSTEM_PROMPT,
|
11 |
-
PROMETHEUS_PROMPT,
|
12 |
-
PROMETHEUS_PROMPT_WITH_REFERENCE,
|
13 |
-
ATLA_PROMPT,
|
14 |
-
ATLA_PROMPT_WITH_REFERENCE,
|
15 |
-
FLOW_JUDGE_PROMPT
|
16 |
-
)
|
17 |
-
from transformers import AutoTokenizer
|
18 |
|
19 |
# Initialize clients
|
20 |
anthropic_client = anthropic.Anthropic()
|
21 |
openai_client = OpenAI()
|
22 |
together_client = Together()
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
"""Get response from OpenAI API"""
|
29 |
try:
|
30 |
response = openai_client.chat.completions.create(
|
31 |
model=model_name,
|
32 |
messages=[
|
33 |
-
{"role": "system", "content":
|
34 |
{"role": "user", "content": prompt},
|
35 |
],
|
36 |
-
max_completion_tokens=max_tokens,
|
37 |
-
temperature=temperature,
|
38 |
)
|
39 |
return response.choices[0].message.content
|
40 |
except Exception as e:
|
41 |
return f"Error with OpenAI model {model_name}: {str(e)}"
|
42 |
|
43 |
-
|
|
|
44 |
"""Get response from Anthropic API"""
|
45 |
try:
|
46 |
response = anthropic_client.messages.create(
|
47 |
model=model_name,
|
48 |
-
max_tokens=
|
49 |
-
temperature=
|
50 |
-
system=
|
51 |
messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
|
52 |
)
|
53 |
return response.content[0].text
|
54 |
except Exception as e:
|
55 |
return f"Error with Anthropic model {model_name}: {str(e)}"
|
56 |
|
57 |
-
|
|
|
58 |
"""Get response from Together API"""
|
59 |
try:
|
60 |
response = together_client.chat.completions.create(
|
61 |
model=model_name,
|
62 |
messages=[
|
63 |
-
{"role": "system", "content":
|
64 |
{"role": "user", "content": prompt},
|
65 |
],
|
66 |
-
max_tokens=max_tokens,
|
67 |
-
temperature=temperature,
|
68 |
stream=False,
|
69 |
)
|
70 |
return response.choices[0].message.content
|
71 |
except Exception as e:
|
72 |
return f"Error with Together model {model_name}: {str(e)}"
|
73 |
|
74 |
-
def get_prometheus_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
|
75 |
-
"""Get response from Hugging Face model"""
|
76 |
-
try:
|
77 |
-
headers = {
|
78 |
-
"Accept": "application/json",
|
79 |
-
"Authorization": f"Bearer {hf_api_key}",
|
80 |
-
"Content-Type": "application/json"
|
81 |
-
}
|
82 |
-
|
83 |
-
# Create messages list for chat template
|
84 |
-
messages = []
|
85 |
-
if system_prompt:
|
86 |
-
messages.append({"role": "system", "content": system_prompt})
|
87 |
-
messages.append({"role": "user", "content": prompt})
|
88 |
-
|
89 |
-
# Apply chat template
|
90 |
-
model_id = "prometheus-eval/prometheus-7b-v2.0"
|
91 |
-
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_api_key)
|
92 |
-
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
93 |
-
|
94 |
-
payload = {
|
95 |
-
"inputs": formatted_prompt,
|
96 |
-
"parameters": {
|
97 |
-
"max_new_tokens": max_tokens,
|
98 |
-
"return_full_text": False,
|
99 |
-
"temperature": temperature
|
100 |
-
}
|
101 |
-
}
|
102 |
-
|
103 |
-
response = requests.post(
|
104 |
-
"https://otb7jglxy6r37af6.us-east-1.aws.endpoints.huggingface.cloud",
|
105 |
-
headers=headers,
|
106 |
-
json=payload
|
107 |
-
)
|
108 |
-
return response.json()[0]["generated_text"]
|
109 |
-
except Exception as e:
|
110 |
-
return f"Error with Hugging Face model {model_name}: {str(e)}"
|
111 |
-
|
112 |
-
def get_atla_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
|
113 |
-
"""Get response from HF endpoint for Atla model"""
|
114 |
-
try:
|
115 |
-
headers = {
|
116 |
-
"Accept": "application/json",
|
117 |
-
"Authorization": f"Bearer {hf_api_key}",
|
118 |
-
"Content-Type": "application/json"
|
119 |
-
}
|
120 |
-
|
121 |
-
# Create messages list for chat template
|
122 |
-
messages = []
|
123 |
-
if system_prompt:
|
124 |
-
messages.append({"role": "system", "content": system_prompt})
|
125 |
-
messages.append({"role": "user", "content": prompt})
|
126 |
-
|
127 |
-
# Apply chat template
|
128 |
-
model_id = "AtlaAI/Selene-1-Mini-Llama-3.1-8B"
|
129 |
-
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_api_key)
|
130 |
-
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
131 |
-
|
132 |
-
payload = {
|
133 |
-
"inputs": formatted_prompt,
|
134 |
-
"parameters": {
|
135 |
-
"max_new_tokens": max_tokens,
|
136 |
-
"return_full_text": False,
|
137 |
-
"temperature": temperature,
|
138 |
-
"seed": 42,
|
139 |
-
"add_generation_prompt": True
|
140 |
-
}
|
141 |
-
}
|
142 |
-
|
143 |
-
response = requests.post(
|
144 |
-
"https://bkp9p28gri93egqh.us-east-1.aws.endpoints.huggingface.cloud",
|
145 |
-
headers=headers,
|
146 |
-
json=payload
|
147 |
-
)
|
148 |
-
return response.json()[0]["generated_text"]
|
149 |
-
except Exception as e:
|
150 |
-
return f"Error with Atla model {model_name}: {str(e)}"
|
151 |
-
|
152 |
-
def get_flow_judge_response(model_name, prompt, max_tokens=2048, temperature=0.1, top_p=0.95) -> str:
|
153 |
-
"""Get response from Flow Judge"""
|
154 |
-
try:
|
155 |
-
response = requests.post(
|
156 |
-
"https://arena.flow-ai.io/v1/chat/completions",
|
157 |
-
headers={
|
158 |
-
"Content-Type": "application/json",
|
159 |
-
"Authorization": f"Bearer {flow_judge_api_key}"
|
160 |
-
},
|
161 |
-
json={
|
162 |
-
"model": model_name,
|
163 |
-
"messages": [
|
164 |
-
{"role": "user", "content": prompt}
|
165 |
-
],
|
166 |
-
"max_tokens": max_tokens,
|
167 |
-
"temperature": temperature,
|
168 |
-
"top_p": top_p,
|
169 |
-
"stop": None
|
170 |
-
}
|
171 |
-
)
|
172 |
-
response.raise_for_status()
|
173 |
-
return response.json()["choices"][0]['message']['content']
|
174 |
-
except Exception as e:
|
175 |
-
return f"Error with Flow Judge completions model {model_name}: {str(e)}"
|
176 |
-
|
177 |
-
def get_cohere_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
|
178 |
-
"""Get response from Cohere API"""
|
179 |
-
try:
|
180 |
-
response = cohere_client.chat(
|
181 |
-
model=model_name,
|
182 |
-
messages=[
|
183 |
-
{"role": "system", "content": system_prompt},
|
184 |
-
{"role": "user", "content": prompt}
|
185 |
-
],
|
186 |
-
max_tokens=max_tokens,
|
187 |
-
temperature=temperature
|
188 |
-
)
|
189 |
-
# Extract the text from the content items
|
190 |
-
content_items = response.message.content
|
191 |
-
if isinstance(content_items, list):
|
192 |
-
# Get the text from the first content item
|
193 |
-
return content_items[0].text
|
194 |
-
return str(content_items) # Fallback if it's not a list
|
195 |
-
except Exception as e:
|
196 |
-
return f"Error with Cohere model {model_name}: {str(e)}"
|
197 |
-
|
198 |
-
def get_salesforce_response(model_name, prompt, system_prompt=None, max_tokens=2048, temperature=0):
|
199 |
-
"""Get response from Salesforce Research API"""
|
200 |
-
try:
|
201 |
-
headers = {
|
202 |
-
'accept': 'application/json',
|
203 |
-
"content-type": "application/json",
|
204 |
-
"X-Api-Key": salesforce_api_key,
|
205 |
-
}
|
206 |
-
|
207 |
-
# Create messages list
|
208 |
-
messages = []
|
209 |
-
messages.append({"role": "user", "content": prompt})
|
210 |
-
|
211 |
-
json_data = {
|
212 |
-
"prompts": messages,
|
213 |
-
"temperature": temperature,
|
214 |
-
"top_p": 1,
|
215 |
-
"max_tokens": max_tokens,
|
216 |
-
}
|
217 |
-
|
218 |
-
response = requests.post(
|
219 |
-
'https://gateway.salesforceresearch.ai/sfr-judge/process',
|
220 |
-
headers=headers,
|
221 |
-
json=json_data
|
222 |
-
)
|
223 |
-
response.raise_for_status()
|
224 |
-
return response.json()['result'][0]
|
225 |
-
except Exception as e:
|
226 |
-
return f"Error with Salesforce model {model_name}: {str(e)}"
|
227 |
|
228 |
-
def get_model_response(
|
229 |
-
model_name,
|
230 |
-
model_info,
|
231 |
-
prompt_data,
|
232 |
-
use_reference=False,
|
233 |
-
max_tokens=500,
|
234 |
-
temperature=0
|
235 |
-
):
|
236 |
"""Get response from appropriate API based on model organization"""
|
237 |
if not model_info:
|
238 |
return "Model not found or unsupported."
|
@@ -240,250 +97,37 @@ def get_model_response(
|
|
240 |
api_model = model_info["api_model"]
|
241 |
organization = model_info["organization"]
|
242 |
|
243 |
-
# Determine if model is Prometheus, Atla, Flow Judge, or Salesforce
|
244 |
-
is_prometheus = (organization == "Prometheus")
|
245 |
-
is_atla = (organization == "Atla")
|
246 |
-
is_flow_judge = (organization == "Flow AI")
|
247 |
-
is_salesforce = (organization == "Salesforce")
|
248 |
-
|
249 |
-
# For non-Prometheus/Atla/Flow Judge/Salesforce models, use the Judge system prompt
|
250 |
-
system_prompt = None if (is_prometheus or is_atla or is_flow_judge or is_salesforce) else JUDGE_SYSTEM_PROMPT
|
251 |
-
|
252 |
-
# Select the appropriate base prompt
|
253 |
-
if is_atla or is_salesforce: # Use same prompt for Atla and Salesforce
|
254 |
-
base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
|
255 |
-
elif is_flow_judge:
|
256 |
-
base_prompt = FLOW_JUDGE_PROMPT
|
257 |
-
else:
|
258 |
-
base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE if use_reference else PROMETHEUS_PROMPT
|
259 |
-
|
260 |
-
# For non-Prometheus/non-Atla/non-Salesforce models, use Prometheus but replace the output format with JSON
|
261 |
-
if not (is_prometheus or is_atla or is_flow_judge or is_salesforce):
|
262 |
-
base_prompt = base_prompt.replace(
|
263 |
-
'3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"',
|
264 |
-
'3. Your output format should strictly adhere to JSON as follows: {{"feedback": "<write feedback>", "result": <numerical score>}}. Ensure the output is valid JSON, without additional formatting or explanations.'
|
265 |
-
)
|
266 |
-
|
267 |
-
try:
|
268 |
-
if not is_flow_judge:
|
269 |
-
# Format the prompt with the provided data, only using available keys
|
270 |
-
final_prompt = base_prompt.format(
|
271 |
-
human_input=prompt_data['human_input'],
|
272 |
-
ai_response=prompt_data['ai_response'],
|
273 |
-
ground_truth_input=prompt_data.get('ground_truth_input', ''),
|
274 |
-
eval_criteria=prompt_data['eval_criteria'],
|
275 |
-
score1_desc=prompt_data['score1_desc'],
|
276 |
-
score2_desc=prompt_data['score2_desc'],
|
277 |
-
score3_desc=prompt_data['score3_desc'],
|
278 |
-
score4_desc=prompt_data['score4_desc'],
|
279 |
-
score5_desc=prompt_data['score5_desc']
|
280 |
-
)
|
281 |
-
else:
|
282 |
-
human_input = f"<user_input>\n{prompt_data['human_input']}\n</user_input>"
|
283 |
-
ai_response = f"<response>\n{prompt_data['ai_response']}\n</response>"
|
284 |
-
ground_truth=prompt_data.get('ground_truth_input', '')
|
285 |
-
if ground_truth:
|
286 |
-
response_reference = f"<response_reference>\n{ground_truth}\n</response_reference>"
|
287 |
-
else:
|
288 |
-
response_reference = ""
|
289 |
-
eval_criteria = prompt_data['eval_criteria']
|
290 |
-
score1_desc = f"- Score 1: {prompt_data['score1_desc']}\n"
|
291 |
-
score2_desc = f"- Score 2: {prompt_data['score2_desc']}\n"
|
292 |
-
score3_desc = f"- Score 3: {prompt_data['score3_desc']}\n"
|
293 |
-
score4_desc = f"- Score 4: {prompt_data['score4_desc']}\n"
|
294 |
-
score5_desc = f"- Score 5: {prompt_data['score5_desc']}"
|
295 |
-
rubric = score1_desc + score2_desc + score3_desc + score4_desc + score5_desc
|
296 |
-
if response_reference:
|
297 |
-
inputs = human_input + "\n"+ response_reference
|
298 |
-
else:
|
299 |
-
inputs = human_input
|
300 |
-
final_prompt = base_prompt.format(
|
301 |
-
INPUTS=inputs,
|
302 |
-
OUTPUT=ai_response,
|
303 |
-
EVALUATION_CRITERIA=eval_criteria,
|
304 |
-
RUBRIC=rubric
|
305 |
-
)
|
306 |
-
|
307 |
-
except KeyError as e:
|
308 |
-
return f"Error formatting prompt: Missing required field {str(e)}"
|
309 |
-
|
310 |
try:
|
311 |
if organization == "OpenAI":
|
312 |
-
return get_openai_response(
|
313 |
-
api_model, final_prompt, system_prompt, max_tokens, temperature
|
314 |
-
)
|
315 |
elif organization == "Anthropic":
|
316 |
-
return get_anthropic_response(
|
317 |
-
api_model, final_prompt, system_prompt, max_tokens, temperature
|
318 |
-
)
|
319 |
-
elif organization == "Prometheus":
|
320 |
-
return get_prometheus_response(
|
321 |
-
api_model, final_prompt, system_prompt, max_tokens, temperature = 0.01
|
322 |
-
)
|
323 |
-
elif organization == "Atla":
|
324 |
-
return get_atla_response(
|
325 |
-
api_model, final_prompt, system_prompt, max_tokens, temperature = 0.01
|
326 |
-
)
|
327 |
-
elif organization == "Cohere":
|
328 |
-
return get_cohere_response(
|
329 |
-
api_model, final_prompt, system_prompt, max_tokens, temperature
|
330 |
-
)
|
331 |
-
elif organization == "Flow AI":
|
332 |
-
return get_flow_judge_response(
|
333 |
-
api_model, final_prompt
|
334 |
-
)
|
335 |
-
elif organization == "Salesforce":
|
336 |
-
response = get_salesforce_response(
|
337 |
-
api_model, final_prompt, system_prompt, max_tokens, temperature
|
338 |
-
)
|
339 |
-
return response
|
340 |
else:
|
341 |
# All other organizations use Together API
|
342 |
-
return get_together_response(
|
343 |
-
api_model, final_prompt, system_prompt, max_tokens, temperature
|
344 |
-
)
|
345 |
except Exception as e:
|
346 |
return f"Error with {organization} model {model_name}: {str(e)}"
|
347 |
|
|
|
348 |
def parse_model_response(response):
|
349 |
try:
|
350 |
# Debug print
|
351 |
print(f"Raw model response: {response}")
|
352 |
|
353 |
-
# If response is already a dictionary, use it directly
|
354 |
-
if isinstance(response, dict):
|
355 |
-
return str(response.get("result", "N/A")), response.get("feedback", "N/A")
|
356 |
-
|
357 |
# First try to parse the entire response as JSON
|
358 |
try:
|
359 |
data = json.loads(response)
|
360 |
return str(data.get("result", "N/A")), data.get("feedback", "N/A")
|
361 |
except json.JSONDecodeError:
|
362 |
-
# If that fails
|
363 |
-
|
364 |
-
# Use ATLA parser for Salesforce responses
|
365 |
-
return atla_parse_model_response(response)
|
366 |
-
|
367 |
-
# Otherwise try to find JSON within the response
|
368 |
-
json_match = re.search(r"{.*}", response, re.DOTALL)
|
369 |
if json_match:
|
370 |
data = json.loads(json_match.group(0))
|
371 |
return str(data.get("result", "N/A")), data.get("feedback", "N/A")
|
372 |
else:
|
373 |
-
return "Error", f"
|
374 |
|
375 |
except Exception as e:
|
376 |
# Debug print for error case
|
377 |
print(f"Failed to parse response: {str(e)}")
|
378 |
-
|
379 |
-
# If the error message itself contains valid JSON, try to parse that
|
380 |
-
try:
|
381 |
-
error_json_match = re.search(r"{.*}", str(e), re.DOTALL)
|
382 |
-
if error_json_match:
|
383 |
-
data = json.loads(error_json_match.group(0))
|
384 |
-
return str(data.get("result", "N/A")), data.get("feedback", "N/A")
|
385 |
-
except:
|
386 |
-
pass
|
387 |
-
|
388 |
return "Error", f"Failed to parse response: {response}"
|
389 |
-
|
390 |
-
def prometheus_parse_model_response(output):
|
391 |
-
try:
|
392 |
-
print(f"Raw model response: {output}")
|
393 |
-
output = output.strip()
|
394 |
-
|
395 |
-
# Remove "Feedback:" prefix if present (case insensitive)
|
396 |
-
output = re.sub(r'^feedback:\s*', '', output, flags=re.IGNORECASE)
|
397 |
-
|
398 |
-
# New pattern to match [RESULT] X at the beginning
|
399 |
-
begin_result_pattern = r'^\[RESULT\]\s*(\d+)\s*\n*(.*?)$'
|
400 |
-
begin_match = re.search(begin_result_pattern, output, re.DOTALL | re.IGNORECASE)
|
401 |
-
if begin_match:
|
402 |
-
score = int(begin_match.group(1))
|
403 |
-
feedback = begin_match.group(2).strip()
|
404 |
-
return str(score), feedback
|
405 |
-
|
406 |
-
# Existing patterns for end-of-string results...
|
407 |
-
pattern = r"(.*?)\s*\[RESULT\]\s*[\(\[]?(\d+)[\)\]]?"
|
408 |
-
match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
|
409 |
-
if match:
|
410 |
-
feedback = match.group(1).strip()
|
411 |
-
score = int(match.group(2))
|
412 |
-
return str(score), feedback
|
413 |
-
|
414 |
-
# If no match, try to match "... Score: X"
|
415 |
-
pattern = r"(.*?)\s*(?:Score|Result)\s*:\s*[\(\[]?(\d+)[\)\]]?"
|
416 |
-
match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
|
417 |
-
if match:
|
418 |
-
feedback = match.group(1).strip()
|
419 |
-
score = int(match.group(2))
|
420 |
-
return str(score), feedback
|
421 |
-
|
422 |
-
# Pattern to handle [Score X] at the end
|
423 |
-
pattern = r"(.*?)\s*\[(?:Score|Result)\s*[\(\[]?(\d+)[\)\]]?\]$"
|
424 |
-
match = re.search(pattern, output, re.DOTALL)
|
425 |
-
if match:
|
426 |
-
feedback = match.group(1).strip()
|
427 |
-
score = int(match.group(2))
|
428 |
-
return str(score), feedback
|
429 |
-
|
430 |
-
# Final fallback attempt
|
431 |
-
pattern = r"[\(\[]?(\d+)[\)\]]?\s*\]?$"
|
432 |
-
match = re.search(pattern, output)
|
433 |
-
if match:
|
434 |
-
score = int(match.group(1))
|
435 |
-
feedback = output[:match.start()].rstrip()
|
436 |
-
# Remove any trailing brackets from feedback
|
437 |
-
feedback = re.sub(r'\s*\[[^\]]*$', '', feedback).strip()
|
438 |
-
return str(score), feedback
|
439 |
-
|
440 |
-
return "Error", f"Failed to parse response: {output}"
|
441 |
-
|
442 |
-
except Exception as e:
|
443 |
-
print(f"Failed to parse response: {str(e)}")
|
444 |
-
return "Error", f"Exception during parsing: {str(e)}"
|
445 |
-
|
446 |
-
def atla_parse_model_response(output):
|
447 |
-
"""Parse response from ATLA model"""
|
448 |
-
try:
|
449 |
-
print(f"Raw Atla model response: {output}")
|
450 |
-
output = output.strip()
|
451 |
-
|
452 |
-
# Look for the Reasoning and Result sections
|
453 |
-
reasoning_match = re.search(r'\*\*Reasoning:\*\*(.*?)(?=\*\*Result:|$)', output, re.DOTALL)
|
454 |
-
result_match = re.search(r'\*\*Result:\*\*\s*(\d+)', output)
|
455 |
-
|
456 |
-
if reasoning_match and result_match:
|
457 |
-
feedback = reasoning_match.group(1).strip()
|
458 |
-
score = result_match.group(1)
|
459 |
-
return str(score), feedback
|
460 |
-
|
461 |
-
return "Error", f"Failed to parse ATLA response format: {output}"
|
462 |
-
|
463 |
-
except Exception as e:
|
464 |
-
print(f"Failed to parse ATLA response: {str(e)}")
|
465 |
-
return "Error", f"Exception during parsing: {str(e)}"
|
466 |
-
|
467 |
-
def flow_judge_parse_model_response(output):
|
468 |
-
try:
|
469 |
-
print(f"Raw model response: {output}")
|
470 |
-
# Convert multiple line breaks to single ones and strip whitespace
|
471 |
-
output = re.sub(r'\n{2,}', '\n', output.strip())
|
472 |
-
|
473 |
-
# Compile regex patterns
|
474 |
-
feedback_pattern = re.compile(r"<feedback>\s*(.*?)\s*</feedback>", re.DOTALL)
|
475 |
-
score_pattern = re.compile(r"<score>\s*(\d+)\s*</score>", re.DOTALL)
|
476 |
-
|
477 |
-
feedback_match = feedback_pattern.search(output)
|
478 |
-
score_match = score_pattern.search(output)
|
479 |
-
|
480 |
-
if feedback_match or not score_match:
|
481 |
-
feedback = feedback_match.group(1).strip()
|
482 |
-
score = int(score_match.group(1).strip())
|
483 |
-
return str(score), feedback
|
484 |
-
|
485 |
-
return "Error", f"Failed to parse response: {output}"
|
486 |
-
|
487 |
-
except Exception as e:
|
488 |
-
print(f"Failed to parse response: {str(e)}")
|
489 |
-
return "Error", f"Exception during parsing: {str(e)}"
|
 from openai import OpenAI
 import anthropic
 from together import Together
 import json
 import re
 
 # Initialize clients
 anthropic_client = anthropic.Anthropic()
 openai_client = OpenAI()
 together_client = Together()
+
+# Initialize OpenAI client
+
+EXAMPLE_GENERATION_PROMPT_SYSTEM = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes."""
+EXAMPLE_GENERATION_PROMPT_USER = """Please provide a random human message and an appropriate AI response in the format of an academic benchmark dataset e.g.,. User: "Hi, I'm trying to solve a crossword puzzle, but I've never done one of these before. Can you help me out?" / AI Response: "Absolutely! I'd be delighted to help you with your crossword puzzle. Just tell me the clues and the number of letters needed for each answer (and any letters you may have already filled in), and I'll do my best to help you find the solutions. If you have any specific questions about how to approach solving crossword puzzles in general, feel free to ask those as well!". Format the output as JSON:\n\n{\"human\": \"<human message>\", \"ai\": \"<AI assistant response>\"}"""
+
+def get_random_human_ai_pair():
+    # Use GPT-3.5 to generate a random conversation
+    completion = openai_client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": EXAMPLE_GENERATION_PROMPT_SYSTEM},
+            {"role": "user", "content": EXAMPLE_GENERATION_PROMPT_USER},
+        ],
+        max_completion_tokens=300,
+        temperature=1,
+    )
+
+    # Parse the response to get the human input and AI response
+    raw_response = completion.choices[0].message.content.strip()
+
+    try:
+        data = json.loads(raw_response)
+        human_message = data.get("human", "Hello, how are you?")
+        ai_message = data.get("ai", "I'm doing well, thank you!")
+    except json.JSONDecodeError:
+        # If parsing fails, set default messages
+        human_message = "Hello, how are you?"
+        ai_message = "I'm doing well, thank you!"
+
+    return human_message, ai_message
+
+SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
+
+
+def get_openai_response(model_name, prompt):
     """Get response from OpenAI API"""
     try:
         response = openai_client.chat.completions.create(
             model=model_name,
             messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
                 {"role": "user", "content": prompt},
             ],
         )
         return response.choices[0].message.content
     except Exception as e:
         return f"Error with OpenAI model {model_name}: {str(e)}"
 
+
+def get_anthropic_response(model_name, prompt):
     """Get response from Anthropic API"""
     try:
         response = anthropic_client.messages.create(
             model=model_name,
+            max_tokens=1000,
+            temperature=0,
+            system=SYSTEM_PROMPT,
             messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
         )
         return response.content[0].text
     except Exception as e:
         return f"Error with Anthropic model {model_name}: {str(e)}"
 
+
+def get_together_response(model_name, prompt):
     """Get response from Together API"""
     try:
         response = together_client.chat.completions.create(
             model=model_name,
             messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
                 {"role": "user", "content": prompt},
             ],
             stream=False,
         )
         return response.choices[0].message.content
     except Exception as e:
         return f"Error with Together model {model_name}: {str(e)}"
 
 
+def get_model_response(model_name, model_info, prompt):
     """Get response from appropriate API based on model organization"""
     if not model_info:
         return "Model not found or unsupported."
 
     api_model = model_info["api_model"]
     organization = model_info["organization"]
 
     try:
         if organization == "OpenAI":
+            return get_openai_response(api_model, prompt)
         elif organization == "Anthropic":
+            return get_anthropic_response(api_model, prompt)
         else:
             # All other organizations use Together API
+            return get_together_response(api_model, prompt)
     except Exception as e:
         return f"Error with {organization} model {model_name}: {str(e)}"
 
+
 def parse_model_response(response):
     try:
         # Debug print
         print(f"Raw model response: {response}")
 
         # First try to parse the entire response as JSON
         try:
             data = json.loads(response)
             return str(data.get("result", "N/A")), data.get("feedback", "N/A")
         except json.JSONDecodeError:
+            # If that fails (typically for smaller models), try to find JSON within the response
+            json_match = re.search(r"{.*}", response)
             if json_match:
                 data = json.loads(json_match.group(0))
                 return str(data.get("result", "N/A")), data.get("feedback", "N/A")
             else:
+                return "Error", f"Failed to parse response: {response}"
 
     except Exception as e:
         # Debug print for error case
         print(f"Failed to parse response: {str(e)}")
         return "Error", f"Failed to parse response: {response}"
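The simplified `parse_model_response` above expects the judge to return the JSON object requested by `SYSTEM_PROMPT`, and falls back to a regex search when a model wraps the JSON in extra prose. A quick illustration of both paths (the example strings are made up):

```python
# Well-formed judge output: parsed directly by json.loads
good = '{"feedback": "Clear and accurate answer.", "result": 5}'

# Chattier output from a smaller model: the regex fallback extracts the JSON blob
noisy = 'Sure! Here is my evaluation: {"feedback": "Partially correct.", "result": 3} Hope that helps.'

print(parse_model_response(good))   # -> ('5', 'Clear and accurate answer.')
print(parse_model_response(noisy))  # -> ('3', 'Partially correct.')
```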
leaderboard.py
DELETED
@@ -1,116 +0,0 @@
-from collections import defaultdict
-from datetime import datetime, timezone
-from typing import Dict, List
-
-# Constants
-DEFAULT_ELO = 1200  # Starting ELO for new models
-K_FACTOR = 32  # Standard chess K-factor
-
-def get_leaderboard(model_data: Dict, voting_data: List, show_preliminary=True):
-    """Generate leaderboard data using votes from MongoDB."""
-    # Initialize dictionaries for tracking
-    ratings = defaultdict(lambda: DEFAULT_ELO)
-    matches = defaultdict(int)
-
-    # Process each vote
-    for vote in voting_data:
-        try:
-            model_a = vote.get("model_a")
-            model_b = vote.get("model_b")
-            winner = vote.get("winner")
-
-            # Skip if models aren't in current model_data
-            if (
-                not all([model_a, model_b, winner])
-                or model_a not in model_data
-                or model_b not in model_data
-            ):
-                continue
-
-            # Update match counts
-            matches[model_a] += 1
-            matches[model_b] += 1
-
-            # Calculate ELO changes
-            elo_a = ratings[model_a]
-            elo_b = ratings[model_b]
-
-            # Expected scores
-            expected_a = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
-            expected_b = 1 - expected_a
-
-            # Actual scores
-            score_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
-            score_b = 1 - score_a
-
-            # Update ratings
-            ratings[model_a] += K_FACTOR * (score_a - expected_a)
-            ratings[model_b] += K_FACTOR * (score_b - expected_b)
-
-        except Exception as e:
-            print(f"Error processing vote: {e}")
-            continue
-
-    # Generate leaderboard data
-    leaderboard = []
-    for model in model_data.keys():
-        votes = matches[model]
-        # Skip models with < 300 votes if show_preliminary is False
-        if not show_preliminary and votes < 300:
-            continue
-
-        elo = ratings[model]
-        ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
-        data = {
-            "Model": model,
-            "ELO Score": f"{int(elo)}",
-            "95% CI": f"±{int(ci)}",
-            "# Votes": votes,
-            "Organization": model_data[model]["organization"],
-            "License": model_data[model]["license"],
-        }
-        leaderboard.append(data)
-
-    # Sort leaderboard by ELO score in descending order
-    leaderboard.sort(key=lambda x: float(x["ELO Score"]), reverse=True)
-
-    return leaderboard
-
-def get_leaderboard_stats(model_data: Dict, voting_data: List) -> str:
-    """Get summary statistics for the leaderboard."""
-    now = datetime.now(timezone.utc)
-    total_votes = len(voting_data)
-    total_models = len(model_data)
-    # last_updated = now.strftime("%B %d, %Y at %H:%M:%S UTC")
-
-    last_updated = now.replace(minute=0, second=0, microsecond=0).strftime(
-        "%B %d, %Y at %H:00 UTC"
-    )
-
-    return f"""
-### Leaderboard Stats
-- **Total Models**: {total_models}
-- **Total Votes**: {total_votes}
-- **Last Updated**: {last_updated}
-"""
-
-def calculate_elo_change(rating_a: float, rating_b: float, winner: str) -> tuple[float, float]:
-    """Calculate ELO rating changes for both players."""
-    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
-    expected_b = 1 - expected_a
-
-    if winner == "A":
-        score_a, score_b = 1, 0
-    elif winner == "B":
-        score_a, score_b = 0, 1
-    else:  # Handle ties
-        score_a, score_b = 0.5, 0.5
-
-    change_a = K_FACTOR * (score_a - expected_a)
-    change_b = K_FACTOR * (score_b - expected_b)
-
-    return change_a, change_b
-
-def get_model_rankings(leaderboard: List[Dict]) -> Dict[str, int]:
-    """Get current rankings of all models from leaderboard data."""
-    return {entry["Model"]: idx + 1 for idx, entry in enumerate(leaderboard)}
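For reference, the deleted module could be exercised on its own; a small sketch with made-up vote records in the shape `get_leaderboard` expects (`model_a`, `model_b`, and a `winner` of "A", "B", or "Tie"):

```python
model_data = {
    "GPT-4o": {"organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o"},
    "Claude 3.5 Sonnet": {"organization": "Anthropic", "license": "Proprietary",
                          "api_model": "claude-3-5-sonnet-latest"},
}

voting_data = [
    {"model_a": "GPT-4o", "model_b": "Claude 3.5 Sonnet", "winner": "A"},
    {"model_a": "Claude 3.5 Sonnet", "model_b": "GPT-4o", "winner": "Tie"},
]

board = get_leaderboard(model_data, voting_data, show_preliminary=True)
for row in board:
    print(row["Model"], row["ELO Score"], row["95% CI"], row["# Votes"])
```

Note that this module started models at `DEFAULT_ELO = 1200`, while the policy text in common.py quotes a starting rating of 1500.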
prompts.py
DELETED
@@ -1,210 +0,0 @@
|
|
1 |
-
# Default values for compatible mode
|
2 |
-
DEFAULT_EVAL_CRITERIA = """Does the model provide relevant and useful responses to the user's needs or questions?"""
|
3 |
-
|
4 |
-
DEFAULT_SCORE_1 = "The model's responses are irrelevant or unhelpful to the user's needs or queries."
|
5 |
-
DEFAULT_SCORE_2 = "The model sometimes provides helpful information, but often fails to address the user's actual needs or questions."
|
6 |
-
DEFAULT_SCORE_3 = "The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark."
|
7 |
-
DEFAULT_SCORE_4 = "The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies."
|
8 |
-
DEFAULT_SCORE_5 = "The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries."
|
9 |
-
|
10 |
-
# Default Eval Prompt
|
11 |
-
DEFAULT_EVAL_PROMPT = """Does the model provide relevant and useful responses to the user's needs or questions?
|
12 |
-
|
13 |
-
Scoring Rubric:
|
14 |
-
Score 1: The model's responses are irrelevant or unhelpful to the user's needs or queries.
|
15 |
-
Score 2: The model sometimes provides helpful information, but often fails to address the user's actual needs or questions.
|
16 |
-
Score 3: The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark.
|
17 |
-
Score 4: The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies.
|
18 |
-
Score 5: The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries.
|
19 |
-
|
20 |
-
[User Query]: {{input}}
|
21 |
-
|
22 |
-
[AI Response]: {{response}}"""
|
23 |
-
|
24 |
-
# Split the eval prompt into editable and fixed parts
|
25 |
-
DEFAULT_EVAL_PROMPT_EDITABLE = """Does the model provide relevant and useful responses to the user's needs or questions?
|
26 |
-
|
27 |
-
Scoring Rubric:
|
28 |
-
-Score 1: The model's responses are irrelevant or unhelpful to the user's needs or queries.
-Score 2: The model sometimes provides helpful information, but often fails to address the user's actual needs or questions.
-Score 3: The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark.
-Score 4: The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies.
-Score 5: The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries."""
-
-# Fixed suffix that will always be appended
-FIXED_EVAL_SUFFIX = """
-[User Query]: {{human_input}}
-
-[AI Response]: {{ai_response}}"""
-
-# Define the Prometheus prompt used by default (without reference)
-PROMETHEUS_PROMPT = """###Task Description:
-An instruction (might include an Input inside it) and a response to evaluate are given.
-1. Write a detailed feedback that assesses the quality of the response strictly based on the given score rubric, not evaluating in general.
-2. After writing the feedback, write a score that is an integer between 1 and 5.
-3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
-4. Please do not generate any other openings, closings, or explanations.
-
-###The instruction to evaluate:
-{human_input}
-
-###Response to evaluate:
-{ai_response}
-
-###Score Rubrics:
-[{eval_criteria}]
-Score 1: {score1_desc}
-Score 2: {score2_desc}
-Score 3: {score3_desc}
-Score 4: {score4_desc}
-Score 5: {score5_desc}
-
-###Feedback:
-"""
-
-# Define the Prometheus prompt with reference response
-PROMETHEUS_PROMPT_WITH_REFERENCE = """###Task Description:
-An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing an evaluation criteria are given.
-1. Write a detailed feedback that assesses the quality of the response strictly based on the given score rubric, not evaluating in general.
-2. After writing the feedback, write a score that is an integer between 1 and 5.
-3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
-4. Please do not generate any other openings, closings, or explanations.
-
-###The instruction to evaluate:
-{human_input}
-
-###Response to evaluate:
-{ai_response}
-
-###Reference Answer (Score 5):
-{ground_truth_input}
-
-###Score Rubrics:
-[{eval_criteria}]
-Score 1: {score1_desc}
-Score 2: {score2_desc}
-Score 3: {score3_desc}
-Score 4: {score4_desc}
-Score 5: {score5_desc}
-
-###Feedback:
-"""
-
-# Judge system prompt for non-Prometheus models
-JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
-
-ATLA_PROMPT = """You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric that serve as the evaluation standard. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score between 1 and 5, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.
-Here are some rules of the evaluation:
-(1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.
-
-Your reply should strictly follow this format:
-**Reasoning:** <Your feedback>
-
-**Result:** <an integer between 1 and 5>
-
-Here is the data:
-
-Instruction:
-```
-{human_input}
-```
-
-Response:
-```
-{ai_response}
-```
-
-Score Rubrics:
-[{eval_criteria}]
-Score 1: {score1_desc}
-Score 2: {score2_desc}
-Score 3: {score3_desc}
-Score 4: {score4_desc}
-Score 5: {score5_desc}"""
-
-ATLA_PROMPT_WITH_REFERENCE = """You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric and reference answer that serve as the evaluation standard. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score between 1 and 5, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.
-
-Here are some rules of the evaluation:
-(1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.
-
-Your reply should strictly follow this format:
-**Reasoning:** <Your feedback>
-
-**Result:** <an integer between 1 and 5>
-
-Here is the data:
-
-Instruction:
-```
-{human_input}
-```
-
-Response:
-```
-{ai_response}
-```
-
-Score Rubrics:
-[{eval_criteria}]
-Score 1: {score1_desc}
-Score 2: {score2_desc}
-Score 3: {score3_desc}
-Score 4: {score4_desc}
-Score 5: {score5_desc}
-
-Reference answer:
-{ground_truth_input}"""
-
-# Define the Flow Judge prompt
-FLOW_JUDGE_PROMPT = """# GOAL
-Your job is to evaluate a task carried out by an AI system powered by a large \
-language model.
-
-You will be provided with the inputs and output of the task, as well as the evaluation criteria \
-and scoring rubric. Your task is to evaluate the output of the AI system based on the evaluation \
-criteria and scoring rubric provided.
-
-# INPUT
-Below are the inputs required for performing the task:
-<inputs>
-{INPUTS}
-</inputs>
-
-# OUTPUT
-Below is the output of the task:
-<output>
-{OUTPUT}
-</output>
-
-# EVALUATION CRITERIA AND SCORING RUBRIC
-Here are the evaluation criteria and the rubric that you need to use for evaluating the task:
-<evaluation_criteria>
-{EVALUATION_CRITERIA}
-</evaluation_criteria>
-
-<scoring_rubric>
-{RUBRIC}
-</scoring_rubric>
-
-# INSTRUCTIONS FOR THE EVALUATION
-1. Understand the task and criteria: Familiarize yourself with the task to be evaluated. \
-Review the evaluation criteria and scoring rubric to understand the different levels of \
-performance and the descriptions for each score.
-2. Review the inputs and output: Look at the inputs provided for the task. Examine the output \
-generated from completing the task.
-3. Compare output to score descriptions: Compare the output against the criteria and score \
-descriptions in the scoring rubric. For each criterion,decide which description best matches the \
-output.
-4. After comparing the output to the score descriptions, pay attention to the small details that \
-might impact the final score that you assign. Sometimes a small difference can dictate the final \
-score.
-5. Write verbal feedback justifying your evaluation that includes a detailed rationale, referring \
-to specific aspects of the output and comparing them to the rubric.
-6. Assign a final score based on the scoring rubric.
-
-## FORMAT FOR THE EVALUATION
-- Write the verbal feedback inside <feedback> tags without any additional surrounding text.
-- Write the numeric score inside <score> tags, without any additional surrounding text and always \
-after the feedback.
-
-Please accurately evaluate the task. Strictly adhere to the evaluation criteria and rubric."""
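For context on how rubric templates like the ones removed above are consumed, here is a minimal sketch, assuming Python's str.format and a regex over the "[RESULT]" marker; TEMPLATE is a trimmed stand-in with the same placeholders as PROMETHEUS_PROMPT, and the sample inputs and parse_feedback_result helper are illustrative assumptions, not code from this repository:

import re

# Minimal sketch (not repository code): fill a Prometheus-style rubric template
# and parse the "Feedback: ... [RESULT] n" reply format it asks the judge to produce.
TEMPLATE = (
    "###The instruction to evaluate:\n{human_input}\n\n"
    "###Response to evaluate:\n{ai_response}\n\n"
    "###Score Rubrics:\n[{eval_criteria}]\n"
    "Score 1: {score1_desc}\nScore 2: {score2_desc}\nScore 3: {score3_desc}\n"
    "Score 4: {score4_desc}\nScore 5: {score5_desc}\n\n###Feedback:\n"
)

filled_prompt = TEMPLATE.format(
    human_input="How do muscles grow?",
    ai_response="Muscles grow through hypertrophy driven by resistance training.",
    eval_criteria="Does the response address the user's needs?",
    score1_desc="Irrelevant or unhelpful.",
    score2_desc="Sometimes helpful, often misses the need.",
    score3_desc="Generally helpful with occasional misses.",
    score4_desc="Regularly helpful with rare inaccuracies.",
    score5_desc="Consistently relevant and useful.",
)

def parse_feedback_result(reply: str):
    """Split a 'Feedback: ... [RESULT] n' reply into (feedback, score or None)."""
    match = re.search(r"Feedback:\s*(.*?)\s*\[RESULT\]\s*([1-5])", reply, re.DOTALL)
    if match:
        return match.group(1).strip(), int(match.group(2))
    return reply.strip(), None

print(parse_feedback_result("Feedback: Clear and accurate. [RESULT] 4"))
# -> ('Clear and accurate.', 4)

The JSON-based JUDGE_SYSTEM_PROMPT and the tag-based Flow Judge format would be parsed analogously, reading the {"feedback": ..., "result": ...} object or the <feedback>/<score> tags instead of the "[RESULT]" marker.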
random_sample_generation.py
DELETED
@@ -1,183 +0,0 @@
-from openai import OpenAI
-import anthropic
-import json
-import re
-import random
-import os
-from gen_api_answer import get_openai_response, get_anthropic_response
-
-# Initialize clients
-anthropic_client = anthropic.Anthropic()
-openai_client = OpenAI()
-
-GOOD_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": <AI assistant response>}. Ensure the output is valid JSON, without additional formatting or explanations."""
-BAD_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The response should contain incorrect information, logical fallacies, or misleading explanations. It should sound plausible but be fundamentally wrong. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": <AI assistant response>}. Ensure the output is valid JSON, without additional formatting or explanations."""
-AMBIGUOUS_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The response should mix correct and incorrect information - it should contain some accurate points but also include nuanced, questionable claims or exaggerations. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": <AI assistant response>}. Ensure the output is valid JSON, without additional formatting or explanations."""
-
-GOOD_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response generated should be a few sentences long and contain accurate information. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
-BAD_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response should be a few sentences long and contain incorrect information, logical fallacies, or misleading explanations. It should sound plausible but be fundamentally wrong. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
-AMBIGUOUS_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response should be a few sentences long and mix correct and incorrect information - it should contain some accurate points but also include nuanced, questionable claims or exaggerations. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
-
-GENERATION_PROMPT = """Please generate a random human message and an AI response in the format of a QA dataset. The human input should not be a one-word answer question like "What is the capital of France?". The AI response generated should be a few sentences long."""
-GENERATION_PROMPT_WITH_GROUND_TRUTH = """Please generate:
-1. A random human message (not a simple one-word answer question)
-2. An AI response (a few sentences long)
-3. A perfect reference answer that would score 5/5 on all criteria (e.g., concise, helpful, and accurate)
-
-Format as JSON with "human", "ai", and "ground_truth" fields."""
-
-RESPONSE_GENERATION_SYSTEM_PROMPT = "You are an assistant that generates random responses to human messages for testing purposes. Generate bad responses (with a mix of correct and incorrect information) 60% of the time and good responses 40% of the time. Do not say which type of response you are generating, just generate the response."
-
-def get_random_human_ai_pair():
-    # Select system prompt with specified probabilities
-    system_prompt = random.choices(
-        [GOOD_SYSTEM_PROMPT, BAD_SYSTEM_PROMPT, AMBIGUOUS_SYSTEM_PROMPT],
-        weights=[0.2, 0.2, 0.6] # 20% good, 20% bad, 60% ambiguous
-    )[0]
-
-    # Log which type of response is being generated
-    prompt_type = {
-        GOOD_SYSTEM_PROMPT: "good",
-        BAD_SYSTEM_PROMPT: "bad",
-        AMBIGUOUS_SYSTEM_PROMPT: "ambiguous"
-    }[system_prompt]
-    print(f"Generating {prompt_type} response")
-
-    # Randomly choose between GPT-3.5 and Claude with 65%/35% weights
-    model_choice = random.choices([
-        ("gpt-3.5-turbo", get_openai_response),
-        ("claude-3-5-haiku-latest", get_anthropic_response)
-    ], weights=[0.5, 0.5])[0]
-    model_name, api_func = model_choice
-
-    # Generate response using selected model
-    response = api_func(
-        model_name=model_name,
-        prompt=GENERATION_PROMPT,
-        system_prompt=system_prompt,
-        max_tokens=500,
-        temperature=1
-    )
-
-    # Define default messages
-    default_human = "How do muscles grow?"
-    default_ai = """Muscles grow through a process called skeletal muscle hypertrophy, which adds more myosin filaments to each muscle fiber, making the engine of the cell bigger and stronger over time. This is achieved through increased muscle tension and physical stress, breaking down muscle fiber. Muscle growth is also a direct consequence of resistance training and nutrition. People build muscle at different rates depending on their age, sex, and genetics, but muscle development significantly increases if exercise is done correctly and the body stores more protein through a process called protein synthesis."""
-
-    try:
-        # Clean the response by replacing newlines with spaces
-        cleaned_response = response.replace('\n', ' ').replace('\r', '')
-        data = json.loads(cleaned_response)
-
-        # Extract messages with fallbacks
-        human_message = data.get("human", default_human)
-        ai_message = data.get("ai", default_ai)
-
-        # Debug logging
-        print(f"Parsed response: human='{human_message}', ai='{ai_message[:50]}...'")
-
-    except Exception as e:
-        print(f"Failed to parse response: {str(e)}\n {response}")
-        human_message = default_human
-        ai_message = default_ai
-
-    return human_message, ai_message
-
-def get_random_human_ai_ground_truth_pair():
-    # Select system prompt with specified probabilities
-    system_prompts = {
-        "good": GOOD_SYSTEM_PROMPT_WITH_GROUND_TRUTH,
-        "bad": BAD_SYSTEM_PROMPT_WITH_GROUND_TRUTH,
-        "ambiguous": AMBIGUOUS_SYSTEM_PROMPT_WITH_GROUND_TRUTH
-    }
-
-    prompt_type = random.choices(
-        ["good", "bad", "ambiguous"],
-        weights=[0.2, 0.2, 0.6] # 20% good, 20% bad, 60% ambiguous
-    )[0]
-
-    system_prompt = system_prompts[prompt_type]
-    print(f"Generating {prompt_type} response with ground truth")
-
-    # Randomly choose between GPT-3.5 and Claude with 50/50 weights
-    model_choice = random.choices([
-        ("gpt-3.5-turbo", get_openai_response),
-        ("claude-3-5-haiku-latest", get_anthropic_response)
-    ], weights=[0.5, 0.5])[0]
-    model_name, api_func = model_choice
-
-    # Define default messages
-    defaults = {
-        "human": "How do muscles grow?",
-        "ai": """Muscles grow through a process called skeletal muscle hypertrophy, which adds more myosin filaments to each muscle fiber, making the engine of the cell bigger and stronger over time. This is achieved through increased muscle tension and physical stress, breaking down muscle fiber. Muscle growth is also a direct consequence of resistance training and nutrition. People build muscle at different rates depending on their age, sex, and genetics, but muscle development significantly increases if exercise is done correctly and the body stores more protein through a process called protein synthesis.""",
-        "ground_truth": """Muscle growth (hypertrophy) occurs through a complex biological process involving several key mechanisms:
-
-1. Mechanical Tension: Resistance training creates mechanical tension in muscle fibers, triggering molecular and cellular responses that promote growth.
-
-2. Metabolic Stress: The depletion of energy resources and accumulation of metabolic byproducts during exercise contributes to muscle growth signaling.
-
-3. Muscle Damage: Exercise-induced micro-damage to muscle fibers activates satellite cells, which help repair and build new muscle tissue.
-
-4. Protein Synthesis: After exercise, increased protein synthesis rates exceed protein breakdown, leading to net muscle protein accretion.
-
-5. Hormonal Response: Exercise triggers the release of growth-promoting hormones like testosterone, growth hormone, and IGF-1.
-
-6. Recovery: Adequate rest between training sessions allows for repair and growth, supported by proper nutrition, particularly protein intake (1.6-2.2g/kg/day).
-
-This process is influenced by factors including genetics, age, sex, nutrition, sleep quality, and training variables. Optimal muscle growth requires a structured resistance training program, adequate protein intake, sufficient calories, and proper recovery."""
-    }
-
-    # Generate response using selected model
-    response = api_func(
-        model_name=model_name,
-        prompt=GENERATION_PROMPT_WITH_GROUND_TRUTH,
-        system_prompt=system_prompt,
-        max_tokens=1000, # Increased token limit to accommodate ground truth
-        temperature=1
-    )
-
-    # Parse the response to get all three components
-    try:
-        # Clean the response by replacing newlines with spaces
-        cleaned_response = response.replace('\n', ' ').replace('\r', '')
-        data = json.loads(cleaned_response)
-
-        # Extract messages with fallbacks
-        human_message = data.get("human", defaults["human"])
-        ai_message = data.get("ai", defaults["ai"])
-        ground_truth = data.get("ground_truth", defaults["ground_truth"])
-
-        # Debug logging
-        print(f"Parsed response: human='{human_message}', ai='{ai_message[:50]}...', ground_truth='{ground_truth[:50]}...'")
-
-    except Exception as e:
-        print(f"Failed to parse response: {str(e)}\n {response}")
-        human_message = defaults["human"]
-        ai_message = defaults["ai"]
-        ground_truth = defaults["ground_truth"]
-
-    return human_message, ai_message, ground_truth
-
-def generate_ai_response(human_msg):
-    """Generate AI response using GPT-3.5-turbo"""
-    if not human_msg.strip():
-        return "", False
-
-    try:
-        response = get_openai_response(
-            "gpt-3.5-turbo",
-            human_msg,
-            system_prompt=RESPONSE_GENERATION_SYSTEM_PROMPT,
-            max_tokens=1000,
-            temperature=1
-        )
-        # Extract just the response content since we don't need JSON format here
-        if isinstance(response, str):
-            # Clean up any JSON formatting if present
-            try:
-                data = json.loads(response)
-                response = data.get("content", response)
-            except json.JSONDecodeError:
-                pass
-        return response, False # Return response and button interactive state
-    except Exception as e:
-        return f"Error generating response: {str(e)}", False
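The generators deleted above flatten newlines before json.loads, which can still fail when a model wraps its JSON in prose or code fences. A slightly more defensive parse is sketched below; extract_json_object is a hypothetical helper for illustration only, not part of this repository:

import json
import re

def extract_json_object(raw: str, fallback: dict) -> dict:
    """Best-effort extraction of the first JSON object from a model reply.

    Hypothetical helper: strips markdown code fences, tolerates raw newlines
    inside strings (strict=False), and falls back to supplied defaults when
    nothing parseable is found.
    """
    cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw.strip(), flags=re.MULTILINE)
    candidates = [cleaned]
    match = re.search(r"\{.*\}", cleaned, re.DOTALL)
    if match:
        candidates.append(match.group(0))
    for candidate in candidates:
        try:
            return json.loads(candidate, strict=False)
        except json.JSONDecodeError:
            continue
    return fallback

# Example: a code-fenced reply still parses.
reply = '```json\n{"human": "How do muscles grow?", "ai": "Through hypertrophy."}\n```'
print(extract_json_object(reply, {"human": "", "ai": ""})["ai"])  # -> Through hypertrophy.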
requirements.txt
CHANGED
@@ -4,5 +4,3 @@ python-dotenv
openai
anthropic
together
-cohere
-transformers