Files changed (8)
  1. app.py +458 -666
  2. common.py +39 -14
  3. data/models.jsonl +16 -27
  4. gen_api_answer.py +53 -409
  5. leaderboard.py +0 -116
  6. prompts.py +0 -210
  7. random_sample_generation.py +0 -183
  8. requirements.txt +0 -2
app.py CHANGED
@@ -2,57 +2,36 @@ import json
2
  import re
3
  import random
4
  from collections import defaultdict
5
- from datetime import datetime
6
  import hashlib
7
- import gradio as gr
8
 
9
  from dotenv import load_dotenv
10
- load_dotenv()
11
 
12
- from gen_api_answer import (
13
- get_model_response,
14
- parse_model_response,
15
- prometheus_parse_model_response,
16
- atla_parse_model_response,
17
- flow_judge_parse_model_response
18
- )
19
 
20
- from random_sample_generation import (
21
- get_random_human_ai_pair,
22
- get_random_human_ai_ground_truth_pair,
23
- generate_ai_response
24
- )
25
  from db import add_vote, create_db_connection, get_votes
26
-
27
  from utils import Vote
28
-
29
  from common import (
30
  POLICY_CONTENT,
31
  ACKNOWLEDGEMENTS,
32
  CSS_STYLES,
33
  MAIN_TITLE,
34
  HOW_IT_WORKS,
35
  )
36
- from prompts import (
37
- DEFAULT_EVAL_PROMPT,
38
- DEFAULT_EVAL_PROMPT_EDITABLE,
39
- FIXED_EVAL_SUFFIX,
40
- DEFAULT_EVAL_CRITERIA,
41
- DEFAULT_SCORE_1,
42
- DEFAULT_SCORE_2,
43
- DEFAULT_SCORE_3,
44
- DEFAULT_SCORE_4,
45
- DEFAULT_SCORE_5,
46
- )
47
- from leaderboard import (
48
- get_leaderboard,
49
- get_leaderboard_stats,
50
- get_model_rankings,
51
- DEFAULT_ELO,
52
- K_FACTOR
53
- )
54
 
55
 
56
  elo_scores = defaultdict(lambda: DEFAULT_ELO)
57
  vote_counts = defaultdict(int)
58
 
@@ -73,7 +52,6 @@ def load_model_data():
73
  "organization": model["organization"],
74
  "license": model["license"],
75
  "api_model": model["api_model"],
76
- "active": model["active"]
77
  }
78
  except FileNotFoundError:
79
  print("Warning: models.jsonl not found")
@@ -84,11 +62,9 @@ def load_model_data():
84
  model_data = load_model_data()
85
 
86
  def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
87
- prompt_value = prompt.value if hasattr(prompt, 'value') else prompt
88
-
89
  vote = Vote(
90
  timestamp=datetime.now().isoformat(),
91
- prompt=prompt_value,
92
  response_a=response_a,
93
  response_b=response_b,
94
  model_a=model_a,
@@ -117,6 +93,40 @@ def get_final_prompt(eval_prompt, variable_values):
117
  return eval_prompt
118
 
119
 
120
 
121
  def get_ip(request: gr.Request) -> str:
122
  """Get and hash the IP address from the request."""
@@ -133,26 +143,6 @@ def get_ip(request: gr.Request) -> str:
133
  return hashlib.sha256(ip.encode()).hexdigest()[:16]
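Note for reviewers: the judge_id recorded with each vote appears to be exactly this truncated SHA-256 of the caller's IP, so repeat voters can be grouped without storing raw addresses. A minimal standalone sketch of that hashing step (the sample IP below is made up):

```python
import hashlib

def hash_ip(ip: str) -> str:
    # Mirrors the final line of get_ip(): a stable, non-reversible
    # 16-hex-character identifier derived from the client IP.
    return hashlib.sha256(ip.encode()).hexdigest()[:16]

print(hash_ip("203.0.113.7"))  # prints a 16-character hex judge_id
```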
134
 
135
 
136
- def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
137
- """Generate appropriate message based on vote and model rankings.
138
- Returns (title, message) tuple."""
139
- # Get current rankings
140
- voting_data = get_current_votes()
141
- leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
142
- rankings = get_model_rankings(leaderboard)
143
- pos_a = rankings.get(model_a, 0)
144
- pos_b = rankings.get(model_b, 0)
145
-
146
- if choice == "Tie":
147
- return "It's a tie!", "Keep voting responsibly 🤗"
148
-
149
- # Check if vote aligns with leaderboard
150
- if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
151
- return "The favourite wins!", "Keep voting responsibly 🤗"
152
- else:
153
- return "The underdog wins!", "Keep voting responsibly 🤗"
154
-
155
-
156
  def vote(
157
  choice,
158
  model_a,
@@ -202,39 +192,16 @@ def vote(
202
  store_vote_data(
203
  final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
204
  )
205
-
206
- # Get model positions for display
207
- voting_data = get_current_votes()
208
- leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
209
- rankings = get_model_rankings(leaderboard)
210
- pos_a = rankings.get(model_a, 0)
211
- pos_b = rankings.get(model_b, 0)
212
-
213
- # Format model names with positions and win/loss indicators
214
- if choice == "Tie":
215
- model_a_display = f"*Model: {model_a} (Position #{pos_a})*"
216
- model_b_display = f"*Model: {model_b} (Position #{pos_b})*"
217
- else:
218
- winner = model_a if choice == "A" else model_b
219
- loser = model_b if choice == "A" else model_a
220
- winner_pos = pos_a if choice == "A" else pos_b
221
- loser_pos = pos_b if choice == "A" else pos_a
222
-
223
- model_a_display = f"*Model: {model_a} {'✅' if choice == 'A' else '❌'} (Position #{pos_a})*"
224
- model_b_display = f"*Model: {model_b} {'✅' if choice == 'B' else '❌'} (Position #{pos_b})*"
225
-
226
- # Generate vote message
227
- title, message = get_vote_message(choice, model_a, model_b)
228
-
229
  return [
230
- gr.update(interactive=False, variant="primary" if choice == "A" else "secondary"), # vote_a
231
- gr.update(interactive=False, variant="primary" if choice == "B" else "secondary"), # vote_b
232
- gr.update(interactive=False, variant="primary" if choice == "Tie" else "secondary"), # vote_tie
233
- gr.update(value=model_a_display), # model_name_a
234
- gr.update(value=model_b_display), # model_name_b
235
- gr.update(interactive=True, value="Regenerate judges", variant="secondary"), # send_btn
236
- gr.update(value="🎲 New round", variant="primary"), # random_btn
237
- gr.Info(message, title=title), # success message
238
  ]
239
 
240
 
@@ -243,24 +210,150 @@ def get_current_votes():
243
  return get_votes(db)
244
 
245
 
246
- # Update the refresh_leaderboard function
247
- def refresh_leaderboard(show_preliminary):
248
- """Refresh the leaderboard data and stats."""
249
  voting_data = get_current_votes()
250
- leaderboard = get_leaderboard(model_data, voting_data, show_preliminary)
251
- data = [
252
- [
253
- entry["Model"],
254
- float(entry["ELO Score"]),
255
- entry["95% CI"],
256
- entry["# Votes"],
257
- entry["Organization"],
258
- entry["License"],
259
- ]
260
- for entry in leaderboard
261
- ]
262
- stats = get_leaderboard_stats(model_data, voting_data)
263
- return [gr.update(value=data), gr.update(value=stats)]
264
 
265
 
266
  # Update the leaderboard table definition in the UI
@@ -270,30 +363,63 @@ leaderboard_table = gr.Dataframe(
270
  )
271
 
272
 
273
- def populate_random_example(request: gr.Request, compatible_mode: bool):
274
- """Generate a random human-AI conversation example and reset judge outputs."""
275
- if compatible_mode:
276
- # Generate all three components when compatible mode is enabled
277
- human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
278
- else:
279
- # Generate only human and AI messages when compatible mode is disabled
280
- human_msg, ai_msg = get_random_human_ai_pair()
281
- ground_truth_msg = ""
282
-
283
  return [
284
  gr.update(value=human_msg),
285
- gr.update(value=ai_msg),
286
- gr.update(value="🎲", variant="secondary"), # Reset random button appearance
287
- gr.update(value=""), # Clear score A
288
- gr.update(value=""), # Clear critique A
289
- gr.update(value=""), # Clear score B
290
- gr.update(value=""), # Clear critique B
291
- gr.update(interactive=False, variant="primary"), # Reset vote A
292
- gr.update(interactive=False, variant="primary"), # Reset vote B
293
- gr.update(interactive=False, variant="primary"), # Reset vote tie
294
- gr.update(value="*Model: Hidden*"), # Reset model name A
295
- gr.update(value="*Model: Hidden*"), # Reset model name B
296
- gr.update(value=ground_truth_msg, visible=compatible_mode), # Set ground truth and visibility
297
  ]
298
 
299
 
@@ -309,43 +435,27 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
309
 
310
  with gr.Tabs():
311
  with gr.TabItem("Judge Arena"):
 
312
  with gr.Row():
313
  # Left side - Input section
314
  with gr.Column(scale=1):
315
  with gr.Group():
316
  human_input = gr.TextArea(
317
- label="👩 User Input",
318
- lines=10,
319
  placeholder="Enter the human message here..."
320
  )
321
- with gr.Row():
322
- generate_btn = gr.Button(
323
- "Generate AI Response",
324
- size="sm",
325
- interactive=False
326
- )
327
 
328
  ai_response = gr.TextArea(
329
  label="🤖 AI Response",
330
- lines=15,
331
- placeholder="Enter the AI response here..."
332
- )
333
-
334
- # Ground truth response (initially hidden)
335
- ground_truth = gr.TextArea(
336
- label="🎯 Ground truth response",
337
  lines=12,
338
- placeholder="Enter the ground truth response here...",
339
- visible=False
340
  )
341
 
342
- with gr.Row():
343
- random_btn = gr.Button("🎲", scale=2)
344
  send_btn = gr.Button(
345
- value="Run judges",
346
  variant="primary",
347
- size="lg",
348
- scale=8
349
  )
350
 
351
  # Right side - Model outputs
@@ -355,15 +465,18 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
355
  model_name_a = gr.Markdown("*Model: Hidden*")
356
  with gr.Row():
357
  with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
358
- score_a = gr.Textbox(label="Score", lines=6, interactive=False)
359
- vote_a = gr.Button("Vote A", variant="primary", interactive=False)
360
  with gr.Column(scale=9, min_width=400): # Wider width for critique
361
- critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)
362
 
363
  # Tie button row
364
- with gr.Row() as tie_button_row:
365
  with gr.Column():
366
- vote_tie = gr.Button("Tie", variant="primary", interactive=False)
367
 
368
 
369
  gr.Markdown("### 🧑‍⚖️ Judge B")
@@ -371,90 +484,16 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
371
  model_name_b = gr.Markdown("*Model: Hidden*")
372
  with gr.Row():
373
  with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
374
- score_b = gr.Textbox(label="Score", lines=6, interactive=False)
375
- vote_b = gr.Button("Vote B", variant="primary", interactive=False)
376
  with gr.Column(scale=9, min_width=400): # Wider width for critique
377
- critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
378
- # Place Vote B button directly under Judge B
379
 
380
  gr.Markdown("<br>")
381
-
382
-
383
- # Replace the "Edit Judge Prompt" Accordion section with:
384
- with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
385
- gr.Markdown("<br>")
386
- use_reference_toggle = gr.Checkbox(
387
- label="Use a reference response",
388
- value=False
389
- )
390
-
391
- # Hide the default prompt editor
392
- with gr.Column(visible=False) as default_prompt_editor:
393
- eval_prompt_editable = gr.TextArea(
394
- value=DEFAULT_EVAL_PROMPT_EDITABLE,
395
- label="Evaluation Criteria",
396
- lines=12
397
- )
398
 
399
- with gr.Row(visible=False) as edit_buttons_row:
400
- cancel_prompt_btn = gr.Button("Cancel")
401
- save_prompt_btn = gr.Button("Save", variant="primary")
402
- gr.Markdown("*The sample being evaluated is always appended as:*")
403
- gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")
404
-
405
- # Show the compatible mode editor
406
- with gr.Column(visible=True) as compatible_prompt_editor:
407
- with gr.Row():
408
- # Left column - Evaluation Criteria
409
- with gr.Column(scale=1):
410
- eval_criteria_text = gr.TextArea(
411
- label="Evaluation Criteria",
412
- lines=12,
413
- value=DEFAULT_EVAL_CRITERIA,
414
- placeholder="Enter the evaluation criteria..."
415
- )
416
- prometheus_reference = gr.Markdown(
417
- "<br> *By default, we use the Prometheus absolute grading prompt template - see [here](https://huggingface.co/prometheus-eval/prometheus-7b-v2.0).*",
418
- visible=True
419
- )
420
-
421
- # Right column - Score Descriptions
422
- with gr.Column(scale=1):
423
- score1_description = gr.TextArea(
424
- label="Score 1",
425
- value=DEFAULT_SCORE_1,
426
- placeholder="Description for score 1",
427
- lines=2
428
- )
429
- score2_description = gr.TextArea(
430
- label="Score 2",
431
- value=DEFAULT_SCORE_2,
432
- placeholder="Description for score 2",
433
- lines=2
434
- )
435
- score3_description = gr.TextArea(
436
- label="Score 3",
437
- value=DEFAULT_SCORE_3,
438
- placeholder="Description for score 3",
439
- lines=2
440
- )
441
- score4_description = gr.TextArea(
442
- label="Score 4",
443
- value=DEFAULT_SCORE_4,
444
- placeholder="Description for score 4",
445
- lines=2
446
- )
447
- score5_description = gr.TextArea(
448
- label="Score 5",
449
- value=DEFAULT_SCORE_5,
450
- placeholder="Description for score 5",
451
- lines=2
452
- )
453
-
454
- # Add save/cancel buttons for compatible mode
455
- with gr.Row(visible=False) as compatible_edit_buttons_row:
456
- compatible_cancel_btn = gr.Button("Cancel")
457
- compatible_save_btn = gr.Button("Save", variant="primary")
458
 
459
  with gr.TabItem("Leaderboard"):
460
  with gr.Row():
@@ -462,7 +501,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
462
  show_preliminary = gr.Checkbox(
463
  label="Reveal preliminary results",
464
  value=True, # Checked by default
465
- info="Show all models, including models with less human ratings (< 300 votes)",
466
  interactive=True
467
  )
468
  stats_display = gr.Markdown()
@@ -470,13 +509,24 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
470
  headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
471
  datatype=["str", "number", "str", "number", "str", "str", "str"],
472
  )
473
-
474
- gr.Markdown("""<br>
475
- <br>
476
- Judge Arena uses Together AI for inference of open-source models. FP8 models are named as -- "Turbo" where the performance of the FP16 reference models is closely matched:
477
 
478
- [*"Together Turbo achieves this performance while maintaining full accuracy compared to Meta's reference implementation across all models. Llama-3.1-405B-Instruct-Turbo matches the accuracy of Meta reference models."*](https://www.together.ai/blog/together-inference-engine-2)
479
- """)
480
 
481
  # Add change handler for checkbox
482
  show_preliminary.change(
@@ -494,15 +544,67 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
494
 
495
  with gr.TabItem("Policy"):
496
  gr.Markdown(POLICY_CONTENT)
497
- gr.Markdown(ACKNOWLEDGEMENTS)
498
 
499
  # Define state variables for model tracking
500
  model_a_state = gr.State()
501
  model_b_state = gr.State()
502
  final_prompt_state = gr.State()
503
- eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE) # Initialize with default value
504
- is_editing = gr.State(False) # Track editing state
505
- compatible_mode_state = gr.State(False) # Track compatible mode state
506
 
507
  # Update model names after responses are generated
508
  def update_model_names(model_a, model_b):
@@ -517,7 +619,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
517
  vote_a.click(
518
  fn=vote,
519
  inputs=[
520
- gr.State("A"),
521
  model_a_state,
522
  model_b_state,
523
  final_prompt_state,
@@ -529,19 +631,18 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
529
  outputs=[
530
  vote_a,
531
  vote_b,
532
- vote_tie,
533
  model_name_a,
534
  model_name_b,
535
  send_btn,
536
- random_btn,
537
- gr.State(), # placeholder for success message
538
  ],
539
  )
540
 
541
  vote_b.click(
542
  fn=vote,
543
  inputs=[
544
- gr.State("B"),
545
  model_a_state,
546
  model_b_state,
547
  final_prompt_state,
@@ -553,19 +654,18 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
553
  outputs=[
554
  vote_a,
555
  vote_b,
556
- vote_tie,
557
  model_name_a,
558
  model_name_b,
559
  send_btn,
560
- random_btn,
561
- gr.State(), # placeholder for success message
562
  ],
563
  )
564
 
565
  vote_tie.click(
566
  fn=vote,
567
  inputs=[
568
- gr.State("Tie"),
569
  model_a_state,
570
  model_b_state,
571
  final_prompt_state,
@@ -577,250 +677,66 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
577
  outputs=[
578
  vote_a,
579
  vote_b,
580
- vote_tie,
581
  model_name_a,
582
  model_name_b,
583
  send_btn,
584
- random_btn,
585
- gr.State(), # placeholder for success message
586
  ],
587
  )
588
 
589
- # Add handlers for save/cancel buttons
590
- def save_prompt(new_prompt, previous_prompt):
591
- return [
592
- gr.update(value=new_prompt), # Update the prompt
593
- new_prompt, # Update the previous prompt state
594
- gr.update(visible=False) # Hide the buttons
595
- ]
596
-
597
- def cancel_prompt(previous_prompt):
598
- return [
599
- gr.update(value=previous_prompt), # Revert to previous prompt
600
- previous_prompt, # Keep the previous prompt state
601
- gr.update(visible=False) # Hide the buttons
602
- ]
 
603
 
604
- def show_edit_buttons(current_value, previous_value):
605
- # Show buttons only if the current value differs from the previous value
606
- return gr.update(visible=current_value != previous_value)
607
 
608
- # Add handlers for save/cancel buttons and prompt changes
609
- save_prompt_btn.click(
610
- fn=save_prompt,
611
- inputs=[eval_prompt_editable, eval_prompt_previous],
612
- outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
613
- )
614
 
615
- cancel_prompt_btn.click(
616
- fn=cancel_prompt,
617
- inputs=[eval_prompt_previous],
618
- outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
619
- )
620
-
621
- eval_prompt_editable.change(
622
- fn=show_edit_buttons,
623
- inputs=[eval_prompt_editable, eval_prompt_previous],
624
- outputs=edit_buttons_row
625
- )
626
 
627
- # Function to toggle visibility based on compatible mode
628
- def toggle_use_reference(checked):
629
- if checked:
630
- # Get new random samples with ground truth when enabling reference mode
631
- human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
632
- return {
633
- ground_truth: gr.update(visible=True, value=ground_truth_msg),
634
- human_input: gr.update(value=human_msg),
635
- ai_response: gr.update(value=ai_msg),
636
- # Reset other UI elements
637
- score_a: gr.update(value=""),
638
- critique_a: gr.update(value=""),
639
- score_b: gr.update(value=""),
640
- critique_b: gr.update(value=""),
641
- vote_a: gr.update(interactive=False, variant="primary"),
642
- vote_b: gr.update(interactive=False, variant="primary"),
643
- vote_tie: gr.update(interactive=False, variant="primary"),
644
- model_name_a: gr.update(value="*Model: Hidden*"),
645
- model_name_b: gr.update(value="*Model: Hidden*"),
646
- random_btn: gr.update(value="🎲", variant="secondary"),
647
- }
648
- else:
649
- # Just hide ground truth when disabling reference mode
650
- return {
651
- ground_truth: gr.update(visible=False)
652
- }
653
-
654
- # Update the change handler to include all necessary outputs
655
- use_reference_toggle.change(
656
- fn=toggle_use_reference,
657
- inputs=[use_reference_toggle],
658
- outputs=[
659
- ground_truth,
660
- human_input,
661
- ai_response,
662
  score_a,
663
  critique_a,
664
  score_b,
665
  critique_b,
666
- vote_a,
667
- vote_b,
668
- vote_tie,
669
- model_name_a,
670
- model_name_b,
671
- random_btn,
672
- ]
673
- )
674
-
675
- # Add a new state variable to track first game
676
- first_game_state = gr.State(True) # Initialize as True
677
-
678
- # Update the submit function to use the state variable
679
- def submit_and_store(
680
- use_reference,
681
- eval_criteria_text_input,
682
- human_input,
683
- ai_response,
684
- ground_truth_input,
685
- score1_description,
686
- score2_description,
687
- score3_description,
688
- score4_description,
689
- score5_description,
690
- ):
691
- # Build prompt data dictionary
692
- prompt_data = {
693
- 'human_input': human_input,
694
- 'ai_response': ai_response,
695
- 'ground_truth_input': ground_truth_input,
696
- 'eval_criteria': eval_criteria_text_input,
697
- 'score1_desc': score1_description,
698
- 'score2_desc': score2_description,
699
- 'score3_desc': score3_description,
700
- 'score4_desc': score4_description,
701
- 'score5_desc': score5_description,
702
- }
703
-
704
- # Get list of active models only for matches
705
- active_models = [name for name, info in model_data.items()
706
- if info.get("active", True) is True] # Explicitly check for True
707
-
708
- # Define new models list
709
- new_models = ["Atla Selene 1 Mini", "SFR-LLaMA-3.1-70B-Judge"]
710
-
711
- # New models appear 40% of the time
712
- if random.random() < 0.4:
713
- # Randomly choose between new models
714
- new_model = random.choice(new_models)
715
- other_models = [m for m in active_models if m not in new_models]
716
- other_model = random.choice(other_models)
717
-
718
- if random.random() < 0.5:
719
- model_a, model_b = new_model, other_model
720
- else:
721
- model_a, model_b = other_model, new_model
722
- else:
723
- # For other cases, exclude new models
724
- non_special_models = [m for m in active_models if m not in new_models]
725
- model1, model2 = random.sample(non_special_models, 2)
726
- model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
727
-
728
- # Get responses from models
729
- response_a = get_model_response(
730
  model_a,
731
- model_data.get(model_a),
732
- prompt_data,
733
- use_reference=use_reference
734
- )
735
- response_b = get_model_response(
736
  model_b,
737
- model_data.get(model_b),
738
- prompt_data,
739
- use_reference=use_reference
740
- )
741
-
742
-
743
- is_prometheus_a = model_data.get(model_a, {}).get('organization') == 'Prometheus'
744
- is_prometheus_b = model_data.get(model_b, {}).get('organization') == 'Prometheus'
745
- is_atla_a = model_data.get(model_a, {}).get('organization') == 'Atla'
746
- is_atla_b = model_data.get(model_b, {}).get('organization') == 'Atla'
747
- is_flow_judge_a = model_data.get(model_a, {}).get('organization') == 'Flow AI'
748
- is_flow_judge_b = model_data.get(model_b, {}).get('organization') == 'Flow AI'
749
- is_salesforce_a = model_data.get(model_a, {}).get('organization') == 'Salesforce'
750
- is_salesforce_b = model_data.get(model_b, {}).get('organization') == 'Salesforce'
751
-
752
- # Parse the responses based on model, using appropriate parsing for different models
753
- if is_prometheus_a:
754
- score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
755
- score_a_val = f"{score_a_val} / 5"
756
- elif is_atla_a or is_salesforce_a: # Same parser for Atla and Salesforce
757
- score_a_val, critique_a_val = atla_parse_model_response(response_a)
758
- score_a_val = f"{score_a_val} / 5"
759
- elif is_flow_judge_a:
760
- score_a_val, critique_a_val = flow_judge_parse_model_response(response_a)
761
- score_a_val = f"{score_a_val} / 5"
762
- else:
763
- score_a_val, critique_a_val = parse_model_response(response_a)
764
- score_a_val = f"{score_a_val} / 5"
765
-
766
- if is_prometheus_b:
767
- score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
768
- score_b_val = f"{score_b_val} / 5"
769
- elif is_atla_b or is_salesforce_b: # Same parser for Atla and Salesforce
770
- score_b_val, critique_b_val = atla_parse_model_response(response_b)
771
- score_b_val = f"{score_b_val} / 5"
772
- elif is_flow_judge_b:
773
- score_b_val, critique_b_val = flow_judge_parse_model_response(response_b)
774
- score_b_val = f"{score_b_val} / 5"
775
- else:
776
- score_b_val, critique_b_val = parse_model_response(response_b)
777
- score_b_val = f"{score_b_val} / 5"
778
-
779
- return (
780
- score_a_val,
781
- critique_a_val,
782
- score_b_val,
783
- critique_b_val,
784
- gr.update(interactive=True, variant="primary"), # vote_a
785
- gr.update(interactive=True, variant="primary"), # vote_b
786
- gr.update(interactive=True, variant="primary"), # vote_tie
787
- model_a,
788
- model_b,
789
- eval_prompt,
790
  gr.update(value="*Model: Hidden*"),
791
  gr.update(value="*Model: Hidden*"),
792
- gr.update(value="Regenerate judges", variant="secondary", interactive=True),
793
- gr.update(value="🎲"), # random_btn
794
- False, # Set first_game_state to False after first submission
795
  )
796
 
797
- # Update the click handler to use False for is_first_game after first submission
798
- def create_submit_handler():
799
- first_game = True
800
-
801
- def handler(*args):
802
- nonlocal first_game
803
- result = submit_and_store(*args)
804
- first_game = False # Set to False after first submission
805
- return result
806
-
807
- return handler
808
-
809
- # Update the send_btn click handler
810
  send_btn.click(
811
  fn=submit_and_store,
812
- inputs=[
813
- use_reference_toggle,
814
- eval_criteria_text,
815
- human_input,
816
- ai_response,
817
- ground_truth,
818
- score1_description,
819
- score2_description,
820
- score3_description,
821
- score4_description,
822
- score5_description,
823
- ],
824
  outputs=[
825
  score_a,
826
  critique_a,
@@ -828,225 +744,101 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
828
  critique_b,
829
  vote_a,
830
  vote_b,
831
- vote_tie,
832
  model_a_state,
833
  model_b_state,
834
  final_prompt_state,
835
  model_name_a,
836
  model_name_b,
837
  send_btn,
838
- random_btn,
839
  ],
840
  )
841
 
842
  # Add random button handler
843
  random_btn.click(
844
  fn=populate_random_example,
845
- inputs=[use_reference_toggle], # Use compatible mode toggle to decide behavior
846
- outputs=[
847
- human_input,
848
- ai_response,
849
- random_btn,
850
- score_a,
851
- critique_a,
852
- score_b,
853
- critique_b,
854
- vote_a,
855
- vote_b,
856
- vote_tie,
857
- model_name_a,
858
- model_name_b,
859
- ground_truth, # Set ground truth
860
- ]
861
  )
862
 
863
  # Add new input change handlers
864
  def handle_input_change():
865
- """Reset UI state when inputs are changed"""
866
- return [
867
- gr.update(interactive=False), # vote_a
868
- gr.update(interactive=False), # vote_b
869
- gr.update(interactive=False), # vote_tie
870
- gr.update(value="Run judges", variant="primary"), # send_btn
871
- gr.update(value="🎲", variant="secondary"), # random_btn
872
- ]
873
 
874
  # Update the change handlers for inputs
875
  human_input.change(
876
  fn=handle_input_change,
877
  inputs=[],
878
- outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
879
  )
880
 
881
  ai_response.change(
882
  fn=handle_input_change,
883
  inputs=[],
884
- outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
885
- )
886
-
887
- generate_btn.click(
888
- fn=lambda msg: (
889
- generate_ai_response(msg)[0], # Only take the response text
890
- gr.update(
891
- value="Generate AI Response", # Keep the label
892
- interactive=False # Disable the button
893
- )
894
- ),
895
- inputs=[human_input],
896
- outputs=[ai_response, generate_btn]
897
- )
898
-
899
- human_input.change(
900
- fn=lambda x: gr.update(interactive=bool(x.strip())),
901
- inputs=[human_input],
902
- outputs=[generate_btn]
903
  )
904
 
905
  # Update the demo.load to include the random example population
906
  demo.load(
907
- fn=lambda: populate_random_example(None, False), # Pass False for initial compatible_mode
908
  inputs=[],
909
- outputs=[
910
- human_input,
911
- ai_response,
912
- random_btn,
913
- score_a,
914
- critique_a,
915
- score_b,
916
- critique_b,
917
- vote_a,
918
- vote_b,
919
- vote_tie,
920
- model_name_a,
921
- model_name_b,
922
- ground_truth,
923
- ]
924
  )
925
 
926
- # Add new state variables for compatible mode
927
- eval_criteria_previous = gr.State(value=DEFAULT_EVAL_CRITERIA)
928
- score1_previous = gr.State(value=DEFAULT_SCORE_1)
929
- score2_previous = gr.State(value=DEFAULT_SCORE_2)
930
- score3_previous = gr.State(value=DEFAULT_SCORE_3)
931
- score4_previous = gr.State(value=DEFAULT_SCORE_4)
932
- score5_previous = gr.State(value=DEFAULT_SCORE_5)
933
-
934
- # Add new functions to handle compatible mode saves/cancels
935
- def save_compatible_prompt(criteria, score1, score2, score3, score4, score5):
936
- return [
937
- gr.update(value=criteria), # Update criteria
938
- criteria, # Update previous criteria state
939
- gr.update(value=score1),
940
- score1,
941
- gr.update(value=score2),
942
- score2,
943
- gr.update(value=score3),
944
- score3,
945
- gr.update(value=score4),
946
- score4,
947
- gr.update(value=score5),
948
- score5,
949
- gr.update(visible=False) # Hide buttons
950
- ]
951
-
952
- def cancel_compatible_prompt(prev_criteria, prev_score1, prev_score2, prev_score3, prev_score4, prev_score5):
953
- return [
954
- gr.update(value=prev_criteria),
955
- prev_criteria,
956
- gr.update(value=prev_score1),
957
- prev_score1,
958
- gr.update(value=prev_score2),
959
- prev_score2,
960
- gr.update(value=prev_score3),
961
- prev_score3,
962
- gr.update(value=prev_score4),
963
- prev_score4,
964
- gr.update(value=prev_score5),
965
- prev_score5,
966
- gr.update(visible=False)
967
- ]
968
-
969
- def show_compatible_edit_buttons(*current_values):
970
- previous_values = current_values[1::2] # Get previous values
971
- current_values = current_values[::2] # Get current values
972
- return gr.update(visible=any(curr != prev for curr, prev in zip(current_values, previous_values)))
973
-
974
- # Add click handlers for compatible mode buttons
975
- compatible_save_btn.click(
976
- fn=save_compatible_prompt,
977
- inputs=[
978
- eval_criteria_text,
979
- score1_description,
980
- score2_description,
981
- score3_description,
982
- score4_description,
983
- score5_description
984
- ],
985
- outputs=[
986
- eval_criteria_text,
987
- eval_criteria_previous,
988
- score1_description,
989
- score1_previous,
990
- score2_description,
991
- score2_previous,
992
- score3_description,
993
- score3_previous,
994
- score4_description,
995
- score4_previous,
996
- score5_description,
997
- score5_previous,
998
- compatible_edit_buttons_row
999
- ]
1000
- )
1001
-
1002
- compatible_cancel_btn.click(
1003
- fn=cancel_compatible_prompt,
1004
- inputs=[
1005
- eval_criteria_previous,
1006
- score1_previous,
1007
- score2_previous,
1008
- score3_previous,
1009
- score4_previous,
1010
- score5_previous
1011
- ],
1012
- outputs=[
1013
- eval_criteria_text,
1014
- eval_criteria_previous,
1015
- score1_description,
1016
- score1_previous,
1017
- score2_description,
1018
- score2_previous,
1019
- score3_description,
1020
- score3_previous,
1021
- score4_description,
1022
- score4_previous,
1023
- score5_description,
1024
- score5_previous,
1025
- compatible_edit_buttons_row
1026
- ]
1027
- )
1028
-
1029
- # Add change handlers for all compatible mode inputs
1030
- for component in [eval_criteria_text, score1_description, score2_description,
1031
- score3_description, score4_description, score5_description]:
1032
- component.change(
1033
- fn=show_compatible_edit_buttons,
1034
- inputs=[
1035
- eval_criteria_text,
1036
- eval_criteria_previous,
1037
- score1_description,
1038
- score1_previous,
1039
- score2_description,
1040
- score2_previous,
1041
- score3_description,
1042
- score3_previous,
1043
- score4_description,
1044
- score4_previous,
1045
- score5_description,
1046
- score5_previous
1047
- ],
1048
- outputs=compatible_edit_buttons_row
1049
- )
1050
-
1051
  if __name__ == "__main__":
1052
  demo.launch()
 
2
  import re
3
  import random
4
  from collections import defaultdict
5
+ from datetime import datetime, timezone
6
  import hashlib
 
7
 
8
  from dotenv import load_dotenv
 
9
 
10
+ load_dotenv()
11
 
12
+ import gradio as gr
13
+ from gen_api_answer import get_model_response, parse_model_response, get_random_human_ai_pair
14
  from db import add_vote, create_db_connection, get_votes
 
15
  from utils import Vote
 
16
  from common import (
17
  POLICY_CONTENT,
18
  ACKNOWLEDGEMENTS,
19
+ DEFAULT_EVAL_PROMPT,
20
+ DEFAULT_INPUT,
21
+ DEFAULT_RESPONSE,
22
  CSS_STYLES,
23
  MAIN_TITLE,
24
  HOW_IT_WORKS,
25
+ BATTLE_RULES,
26
+ EVAL_DESCRIPTION,
27
+ VOTING_HEADER,
28
  )
29
+ from example_metrics import EXAMPLE_METRICS
30
 
31
 
32
+ # Model and ELO score data
33
+ DEFAULT_ELO = 1200 # Starting ELO for new models
34
+ K_FACTOR = 32 # Standard chess K-factor, adjust as needed
35
  elo_scores = defaultdict(lambda: DEFAULT_ELO)
36
  vote_counts = defaultdict(int)
37
 
 
52
  "organization": model["organization"],
53
  "license": model["license"],
54
  "api_model": model["api_model"],
 
55
  }
56
  except FileNotFoundError:
57
  print("Warning: models.jsonl not found")
 
62
  model_data = load_model_data()
63
 
64
  def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
 
 
65
  vote = Vote(
66
  timestamp=datetime.now().isoformat(),
67
+ prompt=prompt,
68
  response_a=response_a,
69
  response_b=response_b,
70
  model_a=model_a,
 
93
  return eval_prompt
94
 
95
 
96
+ def submit_prompt(eval_prompt, *variable_values):
97
+ try:
98
+ variables = parse_variables(eval_prompt)
99
+ variable_values_dict = {var: val for var, val in zip(variables, variable_values)}
100
+ final_prompt = get_final_prompt(eval_prompt, variable_values_dict)
101
+
102
+ models = list(model_data.keys())
103
+ model1, model2 = random.sample(models, 2)
104
+ model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)
105
+
106
+ response_a = get_model_response(model_a, model_data.get(model_a), final_prompt)
107
+ response_b = get_model_response(model_b, model_data.get(model_b), final_prompt)
108
+
109
+ return (
110
+ response_a,
111
+ response_b,
112
+ gr.update(visible=True),
113
+ gr.update(visible=True),
114
+ model_a,
115
+ model_b,
116
+ final_prompt,
117
+ )
118
+ except Exception as e:
119
+ print(f"Error in submit_prompt: {str(e)}")
120
+ return (
121
+ "Error generating response",
122
+ "Error generating response",
123
+ gr.update(visible=False),
124
+ gr.update(visible=False),
125
+ None,
126
+ None,
127
+ None,
128
+ )
129
+
130
 
131
  def get_ip(request: gr.Request) -> str:
132
  """Get and hash the IP address from the request."""
 
143
  return hashlib.sha256(ip.encode()).hexdigest()[:16]
144
 
145
 
146
  def vote(
147
  choice,
148
  model_a,
 
192
  store_vote_data(
193
  final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
194
  )
195
+
196
+ # Return updates for UI components
197
  return [
198
+ gr.update(visible=False), # vote_a
199
+ gr.update(visible=False), # vote_b
200
+ gr.update(visible=False), # tie_button_row
201
+ gr.update(value=f"*Model: {model_a}*"), # model_name_a
202
+ gr.update(value=f"*Model: {model_b}*"), # model_name_b
203
+ gr.update(interactive=True, value="Run the evaluators", variant="primary"), # send_btn
204
+ gr.update(visible=True), # spacing_div
 
205
  ]
206
 
207
 
 
210
  return get_votes(db)
211
 
212
 
213
+ def get_leaderboard(show_preliminary=True):
214
+ """Generate leaderboard data using fresh votes from MongoDB."""
215
+ # Get fresh voting data
216
  voting_data = get_current_votes()
217
+ print(f"Fetched {len(voting_data)} votes from database") # Debug log
218
+
219
+ # Initialize dictionaries for tracking
220
+ ratings = defaultdict(lambda: DEFAULT_ELO)
221
+ matches = defaultdict(int)
222
+
223
+ # Process each vote
224
+ for vote in voting_data:
225
+ try:
226
+ model_a = vote.get("model_a")
227
+ model_b = vote.get("model_b")
228
+ winner = vote.get("winner")
229
+
230
+ # Skip if models aren't in current model_data
231
+ if (
232
+ not all([model_a, model_b, winner])
233
+ or model_a not in model_data
234
+ or model_b not in model_data
235
+ ):
236
+ continue
237
+
238
+ # Update match counts
239
+ matches[model_a] += 1
240
+ matches[model_b] += 1
241
+
242
+ # Calculate ELO changes
243
+ elo_a = ratings[model_a]
244
+ elo_b = ratings[model_b]
245
+
246
+ # Expected scores
247
+ expected_a = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
248
+ expected_b = 1 - expected_a
249
+
250
+ # Actual scores
251
+ score_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
252
+ score_b = 1 - score_a
253
+
254
+ # Update ratings
255
+ ratings[model_a] += K_FACTOR * (score_a - expected_a)
256
+ ratings[model_b] += K_FACTOR * (score_b - expected_b)
257
+
258
+ except Exception as e:
259
+ print(f"Error processing vote: {e}")
260
+ continue
261
+
262
+ # Generate leaderboard data
263
+ leaderboard = []
264
+ for model in model_data.keys():
265
+ votes = matches[model]
266
+ # Skip models with < 500 votes if show_preliminary is False
267
+ if not show_preliminary and votes < 500:
268
+ continue
269
+
270
+ elo = ratings[model]
271
+ ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
272
+ data = {
273
+ "Model": model,
274
+ "ELO Score": f"{int(elo)}",
275
+ "95% CI": f"±{int(ci)}",
276
+ "# Votes": votes,
277
+ "Organization": model_data[model]["organization"],
278
+ "License": model_data[model]["license"],
279
+ }
280
+ leaderboard.append(data)
281
+
282
+ # Sort leaderboard by ELO score in descending order
283
+ leaderboard.sort(key=lambda x: float(x["ELO Score"]), reverse=True)
284
+
285
+ return leaderboard
286
+
287
+
288
+ def calculate_elo_change(rating_a, rating_b, winner):
289
+ """Calculate ELO rating changes for both players."""
290
+ expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
291
+ expected_b = 1 - expected_a
292
+
293
+ if winner == "A":
294
+ score_a, score_b = 1, 0
295
+ elif winner == "B":
296
+ score_a, score_b = 0, 1
297
+ else: # Handle ties
298
+ score_a, score_b = 0.5, 0.5
299
+
300
+ change_a = K_FACTOR * (score_a - expected_a)
301
+ change_b = K_FACTOR * (score_b - expected_b)
302
+
303
+ return change_a, change_b
304
+
305
+
306
+ def update_leaderboard():
307
+ """Generate leaderboard DataFrame using fresh votes from MongoDB."""
308
+ # Get fresh voting data
309
+ voting_data = get_current_votes()
310
+ print(f"Found {len(voting_data)} votes in database")
311
+ matches = defaultdict(int)
312
+
313
+ # Process each vote chronologically
314
+ for vote in voting_data:
315
+ # Extract model names from the vote document
316
+ try:
317
+ model_a = vote.get("model_a")
318
+ model_b = vote.get("model_b")
319
+ winner = vote.get("winner")
320
+
321
+ print(f"Processing vote: {model_a} vs {model_b}, winner: {winner}")
322
+
323
+ # Skip if any required field is missing or models aren't in current model_data
324
+ if not all([model_a, model_b, winner]):
325
+ print(f"Missing required fields in vote: {vote}")
326
+ continue
327
+
328
+ if model_a not in model_data:
329
+ print(f"Model A '{model_a}' not found in model_data")
330
+ continue
331
+
332
+ if model_b not in model_data:
333
+ print(f"Model B '{model_b}' not found in model_data")
334
+ continue
335
+
336
+ # Update match counts
337
+ matches[model_a] += 1
338
+ matches[model_b] += 1
339
+ print(
340
+ f"Updated matches - {model_a}: {matches[model_a]}, {model_b}: {matches[model_b]}"
341
+ )
342
+ except Exception as e:
343
+ print(f"Error processing vote: {e}")
344
+ print(f"Problematic vote data: {vote}")
345
+ continue
346
+
347
+
348
+ # Update the display_leaderboard function
349
+ def display_leaderboard():
350
+ df = update_leaderboard()
351
+ return gr.DataFrame(
352
+ value=df,
353
+ headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
354
+ datatype=["str", "number", "str", "number", "str", "str", "str"],
355
+ row_count=(len(df) + 1, "dynamic"),
356
+ )
357
 
358
 
359
  # Update the leaderboard table definition in the UI
 
363
  )
364
 
365
 
366
+ def get_leaderboard_stats():
367
+ """Get summary statistics for the leaderboard."""
368
+ now = datetime.now(timezone.utc)
369
+ total_votes = len(get_current_votes())
370
+ total_models = len(model_data)
371
+ last_updated = now.replace(minute=0, second=0, microsecond=0).strftime(
372
+ "%B %d, %Y at %H:00 UTC"
373
+ )
374
+
375
+ return f"""
376
+ ### Leaderboard Stats
377
+ - **Total Models**: {total_models}
378
+ - **Total Votes**: {total_votes}
379
+ - **Last Updated**: {last_updated}
380
+ """
381
+
382
+
383
+ #def set_example_metric(metric_name):
384
+ # if metric_name == "Custom":
385
+ # variables = parse_variables(DEFAULT_EVAL_PROMPT)
386
+ # variable_values = []
387
+ # for var in variables:
388
+ # if var == "input":
389
+ # variable_values.append(DEFAULT_INPUT)
390
+ # elif var == "response":
391
+ # variable_values.append(DEFAULT_RESPONSE)
392
+ # else:
393
+ # variable_values.append("") # Default empty value
394
+ # Pad variable_values to match the length of variable_rows
395
+ # while len(variable_values) < len(variable_rows):
396
+ # variable_values.append("")
397
+ # return [DEFAULT_EVAL_PROMPT] + variable_values
398
+
399
+ # metric_data = EXAMPLE_METRICS[metric_name]
400
+ # variables = parse_variables(metric_data["prompt"])
401
+ # variable_values = []
402
+ # for var in variables:
403
+ # value = metric_data.get(var, "") # Default to empty string if not found
404
+ # variable_values.append(value)
405
+ # Pad variable_values to match the length of variable_rows
406
+ # while len(variable_values) < len(variable_rows):
407
+ # variable_values.append("")
408
+ # return [metric_data["prompt"]] + variable_values
409
+
410
+
411
+ # Select random metric at startup
412
+ # def get_random_metric():
413
+ # metrics = list(EXAMPLE_METRICS.keys())
414
+ # return set_example_metric(random.choice(metrics))
415
+
416
+
417
+ def populate_random_example(request: gr.Request):
418
+ """Generate a random human-AI conversation example."""
419
+ human_msg, ai_msg = get_random_human_ai_pair()
420
  return [
421
  gr.update(value=human_msg),
422
+ gr.update(value=ai_msg)
423
  ]
424
 
425
 
 
435
 
436
  with gr.Tabs():
437
  with gr.TabItem("Judge Arena"):
438
+ random_btn = gr.Button("🎲", scale=0)
439
  with gr.Row():
440
  # Left side - Input section
441
  with gr.Column(scale=1):
442
  with gr.Group():
443
  human_input = gr.TextArea(
444
+ label="👩 Human Input",
445
+ lines=12,
446
  placeholder="Enter the human message here..."
447
  )
448
 
449
  ai_response = gr.TextArea(
450
  label="🤖 AI Response",
451
  lines=12,
452
+ placeholder="Enter the AI response here..."
 
453
  )
454
 
 
 
455
  send_btn = gr.Button(
456
+ value="Run the evaluators",
457
  variant="primary",
458
+ size="lg"
 
459
  )
460
 
461
  # Right side - Model outputs
 
465
  model_name_a = gr.Markdown("*Model: Hidden*")
466
  with gr.Row():
467
  with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
468
+ score_a = gr.Textbox(label="Score", lines=5, interactive=False)
469
+ vote_a = gr.Button("Vote A", variant="primary", visible=False)
470
  with gr.Column(scale=9, min_width=400): # Wider width for critique
471
+ critique_a = gr.TextArea(label="Critique", lines=7, interactive=False)
472
+
473
+ # Spacing div that's visible only when tie button is hidden
474
+ spacing_div = gr.HTML('<div style="height: 42px;"></div>', visible=True, elem_id="spacing-div")
475
 
476
  # Tie button row
477
+ with gr.Row(visible=False) as tie_button_row:
478
  with gr.Column():
479
+ vote_tie = gr.Button("Tie", variant="secondary")
480
 
481
 
482
  gr.Markdown("### 🧑‍⚖️ Judge B")
 
484
  model_name_b = gr.Markdown("*Model: Hidden*")
485
  with gr.Row():
486
  with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
487
+ score_b = gr.Textbox(label="Score", lines=5, interactive=False)
488
+ vote_b = gr.Button("Vote B", variant="primary", visible=False)
489
  with gr.Column(scale=9, min_width=400): # Wider width for critique
490
+ critique_b = gr.TextArea(label="Critique", lines=7, interactive=False)
491
+ # Place Vote B button directly under Judge B
492
 
493
  gr.Markdown("<br>")
494
 
495
+ # Add spacing and acknowledgements at the bottom
496
+ gr.Markdown(ACKNOWLEDGEMENTS)
497
 
498
  with gr.TabItem("Leaderboard"):
499
  with gr.Row():
 
501
  show_preliminary = gr.Checkbox(
502
  label="Reveal preliminary results",
503
  value=True, # Checked by default
504
+ info="Show all models, including models with less few human ratings (< 500 votes)",
505
  interactive=True
506
  )
507
  stats_display = gr.Markdown()
 
509
  headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
510
  datatype=["str", "number", "str", "number", "str", "str", "str"],
511
  )
512
 
513
+ # Update refresh_leaderboard to use the checkbox value
514
+ def refresh_leaderboard(show_preliminary):
515
+ """Refresh the leaderboard data and stats."""
516
+ leaderboard = get_leaderboard(show_preliminary)
517
+ data = [
518
+ [
519
+ entry["Model"],
520
+ float(entry["ELO Score"]),
521
+ entry["95% CI"],
522
+ entry["# Votes"],
523
+ entry["Organization"],
524
+ entry["License"],
525
+ ]
526
+ for entry in leaderboard
527
+ ]
528
+ stats = get_leaderboard_stats()
529
+ return [gr.update(value=data), gr.update(value=stats)]
530
 
531
  # Add change handler for checkbox
532
  show_preliminary.change(
 
544
 
545
  with gr.TabItem("Policy"):
546
  gr.Markdown(POLICY_CONTENT)
 
547
 
548
  # Define state variables for model tracking
549
  model_a_state = gr.State()
550
  model_b_state = gr.State()
551
  final_prompt_state = gr.State()
552
+
553
+ # Update variable inputs based on the eval prompt
554
+ def update_variables(eval_prompt):
555
+ variables = parse_variables(eval_prompt)
556
+ updates = []
557
+
558
+ for i in range(len(variable_rows)):
559
+ var_row, var_input = variable_rows[i]
560
+ if i < len(variables):
561
+ var_name = variables[i]
562
+ # Set the number of lines based on the variable name
563
+ if var_name == "response":
564
+ lines = 4 # Adjust this number as needed
565
+ else:
566
+ lines = 1 # Default to single line for other variables
567
+ updates.extend(
568
+ [
569
+ gr.update(visible=True), # Show the variable row
570
+ gr.update(
571
+ label=var_name, visible=True, lines=lines
572
+ ), # Update label and lines
573
+ ]
574
+ )
575
+ else:
576
+ updates.extend(
577
+ [
578
+ gr.update(visible=False), # Hide the variable row
579
+ gr.update(value="", visible=False), # Clear value when hidden
580
+ ]
581
+ )
582
+ return updates
583
+
584
+ #eval_prompt.change(
585
+ # fn=update_variables,
586
+ # inputs=eval_prompt,
587
+ # outputs=[item for sublist in variable_rows for item in sublist],
588
+ #)
589
+
590
+ # Regenerate button functionality
591
+ #regenerate_button.click(
592
+ # fn=regenerate_prompt,
593
+ # inputs=[model_a_state, model_b_state, eval_prompt, human_input, ai_response],
594
+ # outputs=[
595
+ # score_a,
596
+ # critique_a,
597
+ # score_b,
598
+ # critique_b,
599
+ # vote_a,
600
+ # vote_b,
601
+ # tie_button_row,
602
+ # model_name_a,
603
+ # model_name_b,
604
+ # model_a_state,
605
+ # model_b_state,
606
+ # ],
607
+ #)
608
 
609
  # Update model names after responses are generated
610
  def update_model_names(model_a, model_b):
 
619
  vote_a.click(
620
  fn=vote,
621
  inputs=[
622
+ gr.State("A"), # Choice
623
  model_a_state,
624
  model_b_state,
625
  final_prompt_state,
 
631
  outputs=[
632
  vote_a,
633
  vote_b,
634
+ tie_button_row,
635
  model_name_a,
636
  model_name_b,
637
  send_btn,
638
+ spacing_div,
 
639
  ],
640
  )
641
 
642
  vote_b.click(
643
  fn=vote,
644
  inputs=[
645
+ gr.State("B"), # Choice
646
  model_a_state,
647
  model_b_state,
648
  final_prompt_state,
 
654
  outputs=[
655
  vote_a,
656
  vote_b,
657
+ tie_button_row,
658
  model_name_a,
659
  model_name_b,
660
  send_btn,
661
+ spacing_div,
 
662
  ],
663
  )
664
 
665
  vote_tie.click(
666
  fn=vote,
667
  inputs=[
668
+ gr.State("Tie"), # Choice
669
  model_a_state,
670
  model_b_state,
671
  final_prompt_state,
 
677
  outputs=[
678
  vote_a,
679
  vote_b,
680
+ tie_button_row,
681
  model_name_a,
682
  model_name_b,
683
  send_btn,
684
+ spacing_div,
 
685
  ],
686
  )
687
 
688
+ # Update the send button handler to store the submitted inputs
689
+ def submit_and_store(prompt, *variables):
690
+ # Create a copy of the current submission
691
+ current_submission = {"prompt": prompt, "variables": variables}
692
+
693
+ # Get the responses
694
+ (
695
+ response_a,
696
+ response_b,
697
+ buttons_visible,
698
+ regen_visible,
699
+ model_a,
700
+ model_b,
701
+ final_prompt,
702
+ ) = submit_prompt(prompt, *variables)
703
 
704
+ # Parse the responses
705
+ score_a, critique_a = parse_model_response(response_a)
706
+ score_b, critique_b = parse_model_response(response_b)
707
 
708
+ # Format scores with "/ 5"
709
+ score_a = f"{score_a} / 5"
710
+ score_b = f"{score_b} / 5"
711
 
712
+ # Update the last_submission state with the current values
713
+ last_submission.value = current_submission
714
 
715
+ return (
716
  score_a,
717
  critique_a,
718
  score_b,
719
  critique_b,
720
+ gr.update(visible=True), # vote_a
721
+ gr.update(visible=True), # vote_b
722
+ gr.update(visible=True), # tie_button_row
723
  model_a,
724
  model_b,
725
+ final_prompt, # Add final_prompt to state
726
  gr.update(value="*Model: Hidden*"),
727
  gr.update(value="*Model: Hidden*"),
728
+ # Change the button to "Regenerate" mode after evaluation
729
+ gr.update(
730
+ value="Regenerate with different models",
731
+ variant="secondary",
732
+ interactive=True
733
+ ),
734
+ gr.update(visible=False), # spacing_div
735
  )
736
 
737
  send_btn.click(
738
  fn=submit_and_store,
739
+ inputs=[eval_prompt, human_input, ai_response],
740
  outputs=[
741
  score_a,
742
  critique_a,
 
744
  critique_b,
745
  vote_a,
746
  vote_b,
747
+ tie_button_row,
748
  model_a_state,
749
  model_b_state,
750
  final_prompt_state,
751
  model_name_a,
752
  model_name_b,
753
  send_btn,
754
+ spacing_div,
755
  ],
756
  )
757
 
758
+ # Update the input change handlers to also disable regenerate button
759
+ def handle_input_changes(prompt, *variables):
760
+ """Enable send button and manage regenerate button based on input changes"""
761
+ last_inputs = last_submission.value
762
+ current_inputs = {"prompt": prompt, "variables": variables}
763
+ inputs_changed = last_inputs != current_inputs
764
+ return [
765
+ gr.update(interactive=True), # send button always enabled
766
+ gr.update(
767
+ interactive=not inputs_changed
768
+ ), # regenerate button disabled if inputs changed
769
+ ]
770
+
771
+ # Update the change handlers for prompt and variables
772
+ #eval_prompt.change(
773
+ # fn=handle_input_changes,
774
+ # inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
775
+ # outputs=[send_btn, regenerate_button],
776
+ #)
777
+
778
+ # for _, var_input in variable_rows:
779
+ # var_input.change(
780
+ # fn=handle_input_changes,
781
+ # inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
782
+ # outputs=[send_btn, regenerate_button],
783
+ # )
784
+
785
+ # Add click handlers for metric buttons
786
+ #outputs_list = [eval_prompt] + [var_input for _, var_input in variable_rows]
787
+
788
+ #custom_btn.click(fn=lambda: set_example_metric("Custom"), outputs=outputs_list)
789
+
790
+ #hallucination_btn.click(
791
+ # fn=lambda: set_example_metric("Hallucination"), outputs=outputs_list
792
+ #)
793
+
794
+ #precision_btn.click(fn=lambda: set_example_metric("Precision"), outputs=outputs_list)
795
+
796
+ #recall_btn.click(fn=lambda: set_example_metric("Recall"), outputs=outputs_list)
797
+
798
+ #coherence_btn.click(
799
+ # fn=lambda: set_example_metric("Logical_Coherence"), outputs=outputs_list
800
+ #)
801
+
802
+ #faithfulness_btn.click(
803
+ # fn=lambda: set_example_metric("Faithfulness"), outputs=outputs_list
804
+ #)
805
+
806
+ # Set default metric at startup
807
+ demo.load(
808
+ #fn=lambda: set_example_metric("Hallucination"),
809
+ #outputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
810
+ )
811
+
812
  # Add random button handler
813
  random_btn.click(
814
  fn=populate_random_example,
815
+ inputs=[],
816
+ outputs=[human_input, ai_response]
817
  )
818
 
819
  # Add new input change handlers
820
  def handle_input_change():
821
+ return gr.update(value="Run the evaluators", variant="primary")
822
 
823
  # Update the change handlers for inputs
824
  human_input.change(
825
  fn=handle_input_change,
826
  inputs=[],
827
+ outputs=[send_btn]
828
  )
829
 
830
  ai_response.change(
831
  fn=handle_input_change,
832
  inputs=[],
833
+ outputs=[send_btn]
834
  )
835
 
836
  # Update the demo.load to include the random example population
837
  demo.load(
838
+ fn=populate_random_example,
839
  inputs=[],
840
+ outputs=[human_input, ai_response]
841
  )
842
 
843
  if __name__ == "__main__":
844
  demo.launch()
common.py CHANGED
@@ -37,7 +37,7 @@ CSS_STYLES = """
37
  gap: 8px;
38
  }
39
  """
40
-
41
  # Default Eval Prompt
42
  EVAL_DESCRIPTION = """
43
  ## 📝 Tips
@@ -47,6 +47,27 @@ EVAL_DESCRIPTION = """
47
  - Examples (Optional)
48
  """
49
 
50
  # Voting Section Header
51
  VOTING_HEADER = """
52
  # Start Voting Now
@@ -68,50 +89,55 @@ POLICY_CONTENT = """
68
 
69
  Atla is an applied research organization that trains models as evaluators to capture human preferences. We're a team of researchers, engineers, and operational leaders, with experience spanning a variety of disciplines, all working together to build reliable and understandable AI systems. Our research is informed by our experiences conducting AI safety research at the UK AI Task Force, OpenAI and the Stanford Existential Risks Initiative.
70
  <br><br>
71
- # [Our Mission](https://www.atla-ai.com/company)
72
 
73
- By creating advanced evaluation models, we enable AI developers to identify and fix risks, leading to safer, more reliable AI that can be trusted and widely used. Our aim is to surpass the current state-of-the-art evaluation methods by training models specifically for evaluation. AIs will probably become very powerful, and perform tasks that are difficult for us to verify. We want to enable humans to oversee AI systems that are solving tasks too difficult for humans to evaluate.
74
- Read more about [our approach to scalable oversight](https://www.atla-ai.com/post/scaling-alignment) on our blog.
75
  <br><br>
76
  # Judge Arena Policy
77
 
78
  ## Overview
79
 
80
- Judge Arena is an open-source platform dedicated to determining which models make the best judges. Users can run evals and assess anonymized responses from two competing model judges, choosing the better judgement or declaring a tie. This policy outlines our commitments to maintain a fair and open environment :)
81
 
82
  ## Transparency
83
 
84
  - **Open-Source**: Judge Arena's code is open-source and available on GitHub. We encourage contributions from the community and anyone can replicate or modify the platform to suit their needs. We use proprietary model provider APIs where provided and Together AI's API to serve leading open-source models.
85
- - **Methodology**: All processes related to model evaluation, rating calculations, and model selection are openly documented.
86
  - **Data Sharing**: Periodically, we'll share 20% of the collected evaluation data with the community. The data collected from Judge Arena is restricted to an anonymized user ID, the final prompt sent, the model responses, the user vote, and the timestamp.
87
 
88
  ## Model Inclusion Criteria
89
 
90
  Judge Arena is specifically designed to assess AI models that function as evaluators (a.k.a judges). This includes but is not limited to powerful general-purpose models and the latest language models designed for evaluation tasks. Models are eligible for inclusion if they meet the following criteria:
91
 
92
- - **Judge Capability**: The model should possess the ability to score AND critique other models' outputs effectively.
93
- - **Promptable:** The model must be promptable to be evaluate in different scoring formats, for different criteria.
94
  - **Accessibility**:
95
  - **Public API Access**: Models accessible through public APIs without restrictive barriers.
96
  - **Open-Source Models**: Models with publicly available weights that can be downloaded and run by the community.
97
 
98
  ## Leaderboard Management
99
 
100
- - **ELO Ranking System**: Models are ranked on a public leaderboard based on aggregated user evaluations. We use an ELO rating system to rank AI judges on the public leaderboard. Each model begins with an initial rating of 1200, and we use a K-factor of 32 to determine the maximum rating adjustment after each evaluation.
101
  - **Minimum Period**: Listed models remain accessible on Judge Arena for a minimum period of two weeks so they can be comprehensively evaluated.
102
  - **Deprecation Policy**: Models may be removed from the leaderboard if they become inaccessible or are no longer publicly available.
103
 
104
- *This policy might be updated to reflect changes in our practices or in response to community feedback.*
105
- <br><br>
106
  # FAQ
107
 
108
  **Isn't this the same as Chatbot Arena?**
109
 
110
  We are big fans of what the LMSYS team have done with Chatbot Arena and fully credit them for the inspiration to develop this. We were looking for a dynamic leaderboard that graded on AI judge capabilities and didn't manage to find one, so we created Judge Arena. This UI is designed especially for evals; to match the format of the model-based eval prompts that you would use in your LLM evaluation / monitoring tool.
111
 
 
 
 
 
 
 
112
  **Why should I trust this leaderboard?**
113
 
114
- We have listed out our efforts to be fully transparent in the policies above. All of the code for this leaderboard is open-source and can be found on our [Github](https://github.com/atla-ai/judge-arena). Check out our [blog](https://www.atla-ai.com/blog) to stay up to date as we analyse the results from the leaderboard.
115
 
116
  **Who funds this effort?**
117
 
@@ -122,5 +148,4 @@ Atla currently funds this out of our own pocket. We are looking for API credits
122
  We are training a general-purpose evaluator that you will soon be able to run in this Judge Arena. Our next step will be to open-source a powerful model that the community can use to run fast and accurate evaluations.
123
  <br><br>
124
  # Get in touch
125
- We’d love to hear your feedback! For general feature requests or to submit / suggest new models to add to the arena, please open up a discussion in the [community](https://huggingface.co/spaces/AtlaAI/judge-arena/discussions) tab. You can also contact us directly on [X](https://x.com/Atla_AI) or [Discord](https://discord.com/invite/qFCMgkGwUK).
126
- \nPlease file any issues on our [Github](https://github.com/atla-ai/judge-arena)."""
 
37
  gap: 8px;
38
  }
39
  """
40
+
41
  # Default Eval Prompt
42
  EVAL_DESCRIPTION = """
43
  ## 📝 Tips
 
47
  - Examples (Optional)
48
  """
49
 
50
+ DEFAULT_EVAL_PROMPT = """You are assessing a chatbot response to a user's input based on how well it follows the user's instructions. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Do not allow the length of the response to influence your evaluation. Be as objective as possible and give a brief explanation for your score.
51
+
52
+ Score:
53
+ Score 1: The response ignores or misinterprets instructions, providing irrelevant or inaccurate content that fails to address the request.
54
+ Score 2: The response follows instructions partially but misses key elements, lacking depth or precision while containing minor inaccuracies.
55
+ Score 3: The response follows main instructions adequately, providing correct and relevant information with reasonable depth.
56
+ Score 4: The response follows instructions thoroughly with strong attention to detail, offering accurate, well-developed content that thoughtfully addresses needs.
57
+ Score 5: The response demonstrates exceptional instruction following with precise, comprehensive content that shows both insight and perfect alignment with the request.
58
+
59
+ [User Query]: {{input}}
60
+
61
+ [Response]: {{response}}"""
62
+
63
+ # Default Variable Values
64
+ DEFAULT_INPUT = """Which of these animals is least likely to be found in a rainforest?
65
+ A) Jaguar
66
+ B) Toucan
67
+ C) Polar Bear
68
+ D) Sloth"""
69
+ DEFAULT_RESPONSE = "C) Polar Bear"
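For illustration only, here is a minimal sketch of how the `{{input}}` and `{{response}}` placeholders in `DEFAULT_EVAL_PROMPT` could be filled with the default values above. It assumes simple string substitution; the app's actual wiring may differ.

```python
def fill_eval_prompt(template: str, variables: dict[str, str]) -> str:
    """Replace each {{name}} placeholder in the template with its value."""
    for name, value in variables.items():
        template = template.replace("{{" + name + "}}", value)
    return template

# Build a final judge prompt from the defaults defined above.
final_prompt = fill_eval_prompt(
    DEFAULT_EVAL_PROMPT,
    {"input": DEFAULT_INPUT, "response": DEFAULT_RESPONSE},
)
```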
70
+
71
  # Voting Section Header
72
  VOTING_HEADER = """
73
  # Start Voting Now
 
89
 
90
  Atla is an applied research organization that trains models as evaluators to capture human preferences. We're a team of researchers, engineers, and operational leaders, with experience spanning a variety of disciplines, all working together to build reliable and understandable AI systems. Our research is informed by our experiences conducting AI safety research at the UK AI Task Force, OpenAI and the Stanford Existential Risks Initiative.
91
  <br><br>
92
+ # Our Mission
93
 
94
+ By creating advanced evaluation models, we enable AI developers to identify and fix risks, leading to safer, more reliable AI that can be trusted and widely used. Our aim is to surpass the current state-of-the-art evaluation methods by training models specifically for evaluation. AIs will probably become very powerful, and perform tasks that are difficult for us to verify. We want to enable humans to oversee AI systems that are solving tasks too difficult for humans to evaluate. We have written more about [our approach to scalable oversight](https://www.atla-ai.com/post/scaling-alignment) on our blog.
 
95
  <br><br>
96
  # Judge Arena Policy
97
 
98
  ## Overview
99
 
100
+ Judge Arena is an open-source platform dedicated to improving the standard of evaluation of generative AI models in their role as judges. Users can run evals and assess anonymized responses from two competing model judges, choosing the better judgement or declaring a tie. This policy outlines our commitments to maintain a fair, open, and collaborative environment :)
101
 
102
  ## Transparency
103
 
104
  - **Open-Source**: Judge Arena's code is open-source and available on GitHub. We encourage contributions from the community and anyone can replicate or modify the platform to suit their needs. We use proprietary model provider APIs where provided and Together AI's API to serve leading open-source models.
105
+ - **Methodology**: All processes related to model evaluation, rating calculations, and model selection are openly documented. We'd like to ensure that our ranking system is understandable and reproducible by others!
106
  - **Data Sharing**: Periodically, we'll share 20% of the collected evaluation data with the community. The data collected from Judge Arena is restricted to an anonymized user ID, the final prompt sent, the model responses, the user vote, and the timestamp.
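As a concrete illustration, a single shared record would contain only fields like the following. The field names are hypothetical; only the categories listed above are collected.

```python
# Illustrative shape of one shared evaluation record (not the platform's exact schema).
example_record = {
    "judge_id": "a1b2c3d4e5f6a7b8",                  # anonymized (hashed) user ID
    "prompt": "<final prompt sent to both judges>",
    "response_a": "<judge A's evaluation>",
    "response_b": "<judge B's evaluation>",
    "winner": "A",                                    # the user's vote: "A", "B", or "Tie"
    "timestamp": "2024-11-19T12:00:00",
}
```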
107
 
108
  ## Model Inclusion Criteria
109
 
110
  Judge Arena is specifically designed to assess AI models that function as evaluators (a.k.a judges). This includes but is not limited to powerful general-purpose models and the latest language models designed for evaluation tasks. Models are eligible for inclusion if they meet the following criteria:
111
 
112
+ - **Judge Capability**: The model should possess the ability to score AND critique responses, content, or other models' outputs effectively.
113
+ - **Adaptable:** The model must be promptable to evaluate in different scoring formats and against different criteria.
114
  - **Accessibility**:
115
  - **Public API Access**: Models accessible through public APIs without restrictive barriers.
116
  - **Open-Source Models**: Models with publicly available weights that can be downloaded and run by the community.
117
 
118
  ## Leaderboard Management
119
 
120
+ - **ELO Ranking System**: Models are ranked on the public leaderboard using an ELO rating system based on aggregated user votes. Each model begins with an initial rating of 1500 (as used by the International Chess Federation), and we use a K-factor of 32 to determine the maximum rating adjustment after each evaluation (see the worked example below this list).
121
  - **Minimum Period**: Listed models remain accessible on Judge Arena for a minimum period of two weeks so they can be comprehensively evaluated.
122
  - **Deprecation Policy**: Models may be removed from the leaderboard if they become inaccessible or are no longer publicly available.
123
 
124
+ This policy might be updated to reflect changes in our practices or in response to community feedback.
125
+
126
  # FAQ
127
 
128
  **Isn't this the same as Chatbot Arena?**
129
 
130
  We are big fans of what the LMSYS team have done with Chatbot Arena and fully credit them for the inspiration to develop this. We were looking for a dynamic leaderboard that graded on AI judge capabilities and didn't manage to find one, so we created Judge Arena. This UI is designed especially for evals; to match the format of the model-based eval prompts that you would use in your LLM evaluation / monitoring tool.
131
 
132
+ **What are the Evaluator Prompt Templates based on?**
133
+
134
+ As a quick start, we've set up templates that cover the most popular evaluation metrics used in LLM evaluation / monitoring tools, often known as 'base metrics'. The data samples used in these were randomly picked from popular academic datasets - [ARC](https://huggingface.co/datasets/allenai/ai2_arc), [Preference Collection](https://huggingface.co/datasets/prometheus-eval/Preference-Collection), [RewardBench](https://huggingface.co/datasets/allenai/reward-bench), [RAGTruth](https://arxiv.org/abs/2401.00396).
135
+
136
+ These templates are designed as a starting point to showcase how to interact with the Judge Arena, especially for those less familiar with using LLM judges.
137
+
138
  **Why should I trust this leaderboard?**
139
 
140
+ We have listed out our efforts to be fully transparent in the policies above. All of the code for this leaderboard is open-source and can be found on our [Github](https://github.com/atla-ai/judge-arena).
141
 
142
  **Who funds this effort?**
143
 
 
148
  We are training a general-purpose evaluator that you will soon be able to run in this Judge Arena. Our next step will be to open-source a powerful model that the community can use to run fast and accurate evaluations.
149
  <br><br>
150
  # Get in touch
151
+ Feel free to email us at [support@atla-ai.com](mailto:support@atla-ai.com) or leave feedback on our [Github](https://github.com/atla-ai/judge-arena)!"""
 
data/models.jsonl CHANGED
@@ -1,27 +1,16 @@
1
- {"name": "Meta Llama 3.1 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", "active": false}
2
- {"name": "Meta Llama 3.1 405B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", "active": true}
3
- {"name": "Gemma 2 27B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-27b-it", "active": true}
4
- {"name": "Gemma 2 9B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-9b-it", "active": true}
5
- {"name": "Qwen 2 Instruct (72B)", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2-72B-Instruct", "active": true}
6
- {"name": "Mistral (7B) Instruct v0.3", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.3", "active": true}
7
- {"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o", "active": true}
8
- {"name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo", "active": true}
9
- {"name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo", "active": true}
10
- {"name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307", "active": true}
11
- {"name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229", "active": true}
12
- {"name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-latest", "active": true}
13
- {"name": "Meta Llama 3.1 8B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", "active": true}
14
- {"name": "Qwen 2.5 72B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo", "active": true}
15
- {"name": "Qwen 2.5 7B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo", "active": true}
16
- {"name": "Mistral (7B) Instruct v0.1", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.1", "active": true}
17
- {"name": "Claude 3.5 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-sonnet-latest", "active": true}
18
- {"name": "Claude 3.5 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-haiku-latest", "active": true}
19
- {"name": "Prometheus-7b v2", "organization": "Prometheus", "license": "Open Source", "api_model": "prometheus/prometheus-7b-v2", "active": false}
20
- {"name": "Command-R", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r", "active": true}
21
- {"name": "Command-R Plus", "organization": "Cohere", "license": "Proprietary", "api_model": "command-r-plus", "active": true}
22
- {"name": "Atla-8B-preview", "organization": "Atla", "license": "Open Source", "api_model": "Atla-8B-preview", "active": false}
23
- {"name": "Meta Llama 3.3 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", "active": true}
24
- {"name": "QwQ 32B Preview", "organization": "Qwen", "license": "Open Source", "api_model": "Qwen/QwQ-32B-Preview", "active": true}
25
- {"name": "Flow-Judge-v0.1", "organization": "Flow AI", "license": "Open Source", "api_model": "Flow-Judge-v0.1-4.65bpw-exl2", "active": false}
26
- {"name": "SFR-LLaMA-3.1-70B-Judge", "organization": "Salesforce", "license": "Proprietary", "api_model": "sfr-llama-3.1-70b-judge", "active": true}
27
- {"name": "Atla Selene 1 Mini", "organization": "Atla", "license": "Open Source", "api_model": "Atla-Selene-Mini", "active": true}
 
1
+ {"name": "Meta Llama 3.1 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"}
2
+ {"name": "Meta Llama 3.1 405B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"}
3
+ {"name": "Gemma 2 27B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-27b-it"}
4
+ {"name": "Gemma 2 9B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-9b-it"}
5
+ {"name": "Qwen 2 Instruct (72B)", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2-72B-Instruct"}
6
+ {"name": "Mistral (7B) Instruct v0.3", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.3"}
7
+ {"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o"}
8
+ {"name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo"}
9
+ {"name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo"}
10
+ {"name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307"}
11
+ {"name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229"}
12
+ {"name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-20240229"}
13
+ {"name": "Meta Llama 3.1 8B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"}
14
+ {"name": "Qwen 2.5 72B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo"}
15
+ {"name": "Qwen 2.5 7B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo"}
16
+ {"name": "Mistral (7B) Instruct v0.1", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.1"}
 
 
 
 
 
 
 
 
 
 
 
gen_api_answer.py CHANGED
@@ -1,238 +1,95 @@
1
  from openai import OpenAI
2
  import anthropic
3
  from together import Together
4
- import cohere
5
  import json
6
  import re
7
- import os
8
- import requests
9
- from prompts import (
10
- JUDGE_SYSTEM_PROMPT,
11
- PROMETHEUS_PROMPT,
12
- PROMETHEUS_PROMPT_WITH_REFERENCE,
13
- ATLA_PROMPT,
14
- ATLA_PROMPT_WITH_REFERENCE,
15
- FLOW_JUDGE_PROMPT
16
- )
17
- from transformers import AutoTokenizer
18
 
19
  # Initialize clients
20
  anthropic_client = anthropic.Anthropic()
21
  openai_client = OpenAI()
22
  together_client = Together()
23
- hf_api_key = os.getenv("HF_API_KEY")
24
- flow_judge_api_key = os.getenv("FLOW_JUDGE_API_KEY")
25
- cohere_client = cohere.ClientV2(os.getenv("CO_API_KEY"))
26
- salesforce_api_key = os.getenv("SALESFORCE_API_KEY")
27
- def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
28
  """Get response from OpenAI API"""
29
  try:
30
  response = openai_client.chat.completions.create(
31
  model=model_name,
32
  messages=[
33
- {"role": "system", "content": system_prompt},
34
  {"role": "user", "content": prompt},
35
  ],
36
- max_completion_tokens=max_tokens,
37
- temperature=temperature,
38
  )
39
  return response.choices[0].message.content
40
  except Exception as e:
41
  return f"Error with OpenAI model {model_name}: {str(e)}"
42
 
43
- def get_anthropic_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
 
44
  """Get response from Anthropic API"""
45
  try:
46
  response = anthropic_client.messages.create(
47
  model=model_name,
48
- max_tokens=max_tokens,
49
- temperature=temperature,
50
- system=system_prompt,
51
  messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
52
  )
53
  return response.content[0].text
54
  except Exception as e:
55
  return f"Error with Anthropic model {model_name}: {str(e)}"
56
 
57
- def get_together_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
 
58
  """Get response from Together API"""
59
  try:
60
  response = together_client.chat.completions.create(
61
  model=model_name,
62
  messages=[
63
- {"role": "system", "content": system_prompt},
64
  {"role": "user", "content": prompt},
65
  ],
66
- max_tokens=max_tokens,
67
- temperature=temperature,
68
  stream=False,
69
  )
70
  return response.choices[0].message.content
71
  except Exception as e:
72
  return f"Error with Together model {model_name}: {str(e)}"
73
 
74
- def get_prometheus_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
75
- """Get response from Hugging Face model"""
76
- try:
77
- headers = {
78
- "Accept": "application/json",
79
- "Authorization": f"Bearer {hf_api_key}",
80
- "Content-Type": "application/json"
81
- }
82
-
83
- # Create messages list for chat template
84
- messages = []
85
- if system_prompt:
86
- messages.append({"role": "system", "content": system_prompt})
87
- messages.append({"role": "user", "content": prompt})
88
-
89
- # Apply chat template
90
- model_id = "prometheus-eval/prometheus-7b-v2.0"
91
- tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_api_key)
92
- formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
93
-
94
- payload = {
95
- "inputs": formatted_prompt,
96
- "parameters": {
97
- "max_new_tokens": max_tokens,
98
- "return_full_text": False,
99
- "temperature": temperature
100
- }
101
- }
102
-
103
- response = requests.post(
104
- "https://otb7jglxy6r37af6.us-east-1.aws.endpoints.huggingface.cloud",
105
- headers=headers,
106
- json=payload
107
- )
108
- return response.json()[0]["generated_text"]
109
- except Exception as e:
110
- return f"Error with Hugging Face model {model_name}: {str(e)}"
111
-
112
- def get_atla_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
113
- """Get response from HF endpoint for Atla model"""
114
- try:
115
- headers = {
116
- "Accept": "application/json",
117
- "Authorization": f"Bearer {hf_api_key}",
118
- "Content-Type": "application/json"
119
- }
120
-
121
- # Create messages list for chat template
122
- messages = []
123
- if system_prompt:
124
- messages.append({"role": "system", "content": system_prompt})
125
- messages.append({"role": "user", "content": prompt})
126
-
127
- # Apply chat template
128
- model_id = "AtlaAI/Selene-1-Mini-Llama-3.1-8B"
129
- tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_api_key)
130
- formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
131
-
132
- payload = {
133
- "inputs": formatted_prompt,
134
- "parameters": {
135
- "max_new_tokens": max_tokens,
136
- "return_full_text": False,
137
- "temperature": temperature,
138
- "seed": 42,
139
- "add_generation_prompt": True
140
- }
141
- }
142
-
143
- response = requests.post(
144
- "https://bkp9p28gri93egqh.us-east-1.aws.endpoints.huggingface.cloud",
145
- headers=headers,
146
- json=payload
147
- )
148
- return response.json()[0]["generated_text"]
149
- except Exception as e:
150
- return f"Error with Atla model {model_name}: {str(e)}"
151
-
152
- def get_flow_judge_response(model_name, prompt, max_tokens=2048, temperature=0.1, top_p=0.95) -> str:
153
- """Get response from Flow Judge"""
154
- try:
155
- response = requests.post(
156
- "https://arena.flow-ai.io/v1/chat/completions",
157
- headers={
158
- "Content-Type": "application/json",
159
- "Authorization": f"Bearer {flow_judge_api_key}"
160
- },
161
- json={
162
- "model": model_name,
163
- "messages": [
164
- {"role": "user", "content": prompt}
165
- ],
166
- "max_tokens": max_tokens,
167
- "temperature": temperature,
168
- "top_p": top_p,
169
- "stop": None
170
- }
171
- )
172
- response.raise_for_status()
173
- return response.json()["choices"][0]['message']['content']
174
- except Exception as e:
175
- return f"Error with Flow Judge completions model {model_name}: {str(e)}"
176
-
177
- def get_cohere_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
178
- """Get response from Cohere API"""
179
- try:
180
- response = cohere_client.chat(
181
- model=model_name,
182
- messages=[
183
- {"role": "system", "content": system_prompt},
184
- {"role": "user", "content": prompt}
185
- ],
186
- max_tokens=max_tokens,
187
- temperature=temperature
188
- )
189
- # Extract the text from the content items
190
- content_items = response.message.content
191
- if isinstance(content_items, list):
192
- # Get the text from the first content item
193
- return content_items[0].text
194
- return str(content_items) # Fallback if it's not a list
195
- except Exception as e:
196
- return f"Error with Cohere model {model_name}: {str(e)}"
197
-
198
- def get_salesforce_response(model_name, prompt, system_prompt=None, max_tokens=2048, temperature=0):
199
- """Get response from Salesforce Research API"""
200
- try:
201
- headers = {
202
- 'accept': 'application/json',
203
- "content-type": "application/json",
204
- "X-Api-Key": salesforce_api_key,
205
- }
206
-
207
- # Create messages list
208
- messages = []
209
- messages.append({"role": "user", "content": prompt})
210
-
211
- json_data = {
212
- "prompts": messages,
213
- "temperature": temperature,
214
- "top_p": 1,
215
- "max_tokens": max_tokens,
216
- }
217
-
218
- response = requests.post(
219
- 'https://gateway.salesforceresearch.ai/sfr-judge/process',
220
- headers=headers,
221
- json=json_data
222
- )
223
- response.raise_for_status()
224
- return response.json()['result'][0]
225
- except Exception as e:
226
- return f"Error with Salesforce model {model_name}: {str(e)}"
227
 
228
- def get_model_response(
229
- model_name,
230
- model_info,
231
- prompt_data,
232
- use_reference=False,
233
- max_tokens=500,
234
- temperature=0
235
- ):
236
  """Get response from appropriate API based on model organization"""
237
  if not model_info:
238
  return "Model not found or unsupported."
@@ -240,250 +97,37 @@ def get_model_response(
240
  api_model = model_info["api_model"]
241
  organization = model_info["organization"]
242
 
243
- # Determine if model is Prometheus, Atla, Flow Judge, or Salesforce
244
- is_prometheus = (organization == "Prometheus")
245
- is_atla = (organization == "Atla")
246
- is_flow_judge = (organization == "Flow AI")
247
- is_salesforce = (organization == "Salesforce")
248
-
249
- # For non-Prometheus/Atla/Flow Judge/Salesforce models, use the Judge system prompt
250
- system_prompt = None if (is_prometheus or is_atla or is_flow_judge or is_salesforce) else JUDGE_SYSTEM_PROMPT
251
-
252
- # Select the appropriate base prompt
253
- if is_atla or is_salesforce: # Use same prompt for Atla and Salesforce
254
- base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
255
- elif is_flow_judge:
256
- base_prompt = FLOW_JUDGE_PROMPT
257
- else:
258
- base_prompt = PROMETHEUS_PROMPT_WITH_REFERENCE if use_reference else PROMETHEUS_PROMPT
259
-
260
- # For non-Prometheus/non-Atla/non-Salesforce models, use Prometheus but replace the output format with JSON
261
- if not (is_prometheus or is_atla or is_flow_judge or is_salesforce):
262
- base_prompt = base_prompt.replace(
263
- '3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"',
264
- '3. Your output format should strictly adhere to JSON as follows: {{"feedback": "<write feedback>", "result": <numerical score>}}. Ensure the output is valid JSON, without additional formatting or explanations.'
265
- )
266
-
267
- try:
268
- if not is_flow_judge:
269
- # Format the prompt with the provided data, only using available keys
270
- final_prompt = base_prompt.format(
271
- human_input=prompt_data['human_input'],
272
- ai_response=prompt_data['ai_response'],
273
- ground_truth_input=prompt_data.get('ground_truth_input', ''),
274
- eval_criteria=prompt_data['eval_criteria'],
275
- score1_desc=prompt_data['score1_desc'],
276
- score2_desc=prompt_data['score2_desc'],
277
- score3_desc=prompt_data['score3_desc'],
278
- score4_desc=prompt_data['score4_desc'],
279
- score5_desc=prompt_data['score5_desc']
280
- )
281
- else:
282
- human_input = f"<user_input>\n{prompt_data['human_input']}\n</user_input>"
283
- ai_response = f"<response>\n{prompt_data['ai_response']}\n</response>"
284
- ground_truth=prompt_data.get('ground_truth_input', '')
285
- if ground_truth:
286
- response_reference = f"<response_reference>\n{ground_truth}\n</response_reference>"
287
- else:
288
- response_reference = ""
289
- eval_criteria = prompt_data['eval_criteria']
290
- score1_desc = f"- Score 1: {prompt_data['score1_desc']}\n"
291
- score2_desc = f"- Score 2: {prompt_data['score2_desc']}\n"
292
- score3_desc = f"- Score 3: {prompt_data['score3_desc']}\n"
293
- score4_desc = f"- Score 4: {prompt_data['score4_desc']}\n"
294
- score5_desc = f"- Score 5: {prompt_data['score5_desc']}"
295
- rubric = score1_desc + score2_desc + score3_desc + score4_desc + score5_desc
296
- if response_reference:
297
- inputs = human_input + "\n"+ response_reference
298
- else:
299
- inputs = human_input
300
- final_prompt = base_prompt.format(
301
- INPUTS=inputs,
302
- OUTPUT=ai_response,
303
- EVALUATION_CRITERIA=eval_criteria,
304
- RUBRIC=rubric
305
- )
306
-
307
- except KeyError as e:
308
- return f"Error formatting prompt: Missing required field {str(e)}"
309
-
310
  try:
311
  if organization == "OpenAI":
312
- return get_openai_response(
313
- api_model, final_prompt, system_prompt, max_tokens, temperature
314
- )
315
  elif organization == "Anthropic":
316
- return get_anthropic_response(
317
- api_model, final_prompt, system_prompt, max_tokens, temperature
318
- )
319
- elif organization == "Prometheus":
320
- return get_prometheus_response(
321
- api_model, final_prompt, system_prompt, max_tokens, temperature = 0.01
322
- )
323
- elif organization == "Atla":
324
- return get_atla_response(
325
- api_model, final_prompt, system_prompt, max_tokens, temperature = 0.01
326
- )
327
- elif organization == "Cohere":
328
- return get_cohere_response(
329
- api_model, final_prompt, system_prompt, max_tokens, temperature
330
- )
331
- elif organization == "Flow AI":
332
- return get_flow_judge_response(
333
- api_model, final_prompt
334
- )
335
- elif organization == "Salesforce":
336
- response = get_salesforce_response(
337
- api_model, final_prompt, system_prompt, max_tokens, temperature
338
- )
339
- return response
340
  else:
341
  # All other organizations use Together API
342
- return get_together_response(
343
- api_model, final_prompt, system_prompt, max_tokens, temperature
344
- )
345
  except Exception as e:
346
  return f"Error with {organization} model {model_name}: {str(e)}"
347
 
 
348
  def parse_model_response(response):
349
  try:
350
  # Debug print
351
  print(f"Raw model response: {response}")
352
 
353
- # If response is already a dictionary, use it directly
354
- if isinstance(response, dict):
355
- return str(response.get("result", "N/A")), response.get("feedback", "N/A")
356
-
357
  # First try to parse the entire response as JSON
358
  try:
359
  data = json.loads(response)
360
  return str(data.get("result", "N/A")), data.get("feedback", "N/A")
361
  except json.JSONDecodeError:
362
- # If that fails, check if this is a Salesforce response (which uses ATLA format)
363
- if "**Reasoning:**" in response or "**Result:**" in response:
364
- # Use ATLA parser for Salesforce responses
365
- return atla_parse_model_response(response)
366
-
367
- # Otherwise try to find JSON within the response
368
- json_match = re.search(r"{.*}", response, re.DOTALL)
369
  if json_match:
370
  data = json.loads(json_match.group(0))
371
  return str(data.get("result", "N/A")), data.get("feedback", "N/A")
372
  else:
373
- return "Error", f"Invalid response format returned - here is the raw model response: {response}"
374
 
375
  except Exception as e:
376
  # Debug print for error case
377
  print(f"Failed to parse response: {str(e)}")
378
-
379
- # If the error message itself contains valid JSON, try to parse that
380
- try:
381
- error_json_match = re.search(r"{.*}", str(e), re.DOTALL)
382
- if error_json_match:
383
- data = json.loads(error_json_match.group(0))
384
- return str(data.get("result", "N/A")), data.get("feedback", "N/A")
385
- except:
386
- pass
387
-
388
  return "Error", f"Failed to parse response: {response}"
389
-
390
- def prometheus_parse_model_response(output):
391
- try:
392
- print(f"Raw model response: {output}")
393
- output = output.strip()
394
-
395
- # Remove "Feedback:" prefix if present (case insensitive)
396
- output = re.sub(r'^feedback:\s*', '', output, flags=re.IGNORECASE)
397
-
398
- # New pattern to match [RESULT] X at the beginning
399
- begin_result_pattern = r'^\[RESULT\]\s*(\d+)\s*\n*(.*?)$'
400
- begin_match = re.search(begin_result_pattern, output, re.DOTALL | re.IGNORECASE)
401
- if begin_match:
402
- score = int(begin_match.group(1))
403
- feedback = begin_match.group(2).strip()
404
- return str(score), feedback
405
-
406
- # Existing patterns for end-of-string results...
407
- pattern = r"(.*?)\s*\[RESULT\]\s*[\(\[]?(\d+)[\)\]]?"
408
- match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
409
- if match:
410
- feedback = match.group(1).strip()
411
- score = int(match.group(2))
412
- return str(score), feedback
413
-
414
- # If no match, try to match "... Score: X"
415
- pattern = r"(.*?)\s*(?:Score|Result)\s*:\s*[\(\[]?(\d+)[\)\]]?"
416
- match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
417
- if match:
418
- feedback = match.group(1).strip()
419
- score = int(match.group(2))
420
- return str(score), feedback
421
-
422
- # Pattern to handle [Score X] at the end
423
- pattern = r"(.*?)\s*\[(?:Score|Result)\s*[\(\[]?(\d+)[\)\]]?\]$"
424
- match = re.search(pattern, output, re.DOTALL)
425
- if match:
426
- feedback = match.group(1).strip()
427
- score = int(match.group(2))
428
- return str(score), feedback
429
-
430
- # Final fallback attempt
431
- pattern = r"[\(\[]?(\d+)[\)\]]?\s*\]?$"
432
- match = re.search(pattern, output)
433
- if match:
434
- score = int(match.group(1))
435
- feedback = output[:match.start()].rstrip()
436
- # Remove any trailing brackets from feedback
437
- feedback = re.sub(r'\s*\[[^\]]*$', '', feedback).strip()
438
- return str(score), feedback
439
-
440
- return "Error", f"Failed to parse response: {output}"
441
-
442
- except Exception as e:
443
- print(f"Failed to parse response: {str(e)}")
444
- return "Error", f"Exception during parsing: {str(e)}"
445
-
446
- def atla_parse_model_response(output):
447
- """Parse response from ATLA model"""
448
- try:
449
- print(f"Raw Atla model response: {output}")
450
- output = output.strip()
451
-
452
- # Look for the Reasoning and Result sections
453
- reasoning_match = re.search(r'\*\*Reasoning:\*\*(.*?)(?=\*\*Result:|$)', output, re.DOTALL)
454
- result_match = re.search(r'\*\*Result:\*\*\s*(\d+)', output)
455
-
456
- if reasoning_match and result_match:
457
- feedback = reasoning_match.group(1).strip()
458
- score = result_match.group(1)
459
- return str(score), feedback
460
-
461
- return "Error", f"Failed to parse ATLA response format: {output}"
462
-
463
- except Exception as e:
464
- print(f"Failed to parse ATLA response: {str(e)}")
465
- return "Error", f"Exception during parsing: {str(e)}"
466
-
467
- def flow_judge_parse_model_response(output):
468
- try:
469
- print(f"Raw model response: {output}")
470
- # Convert multiple line breaks to single ones and strip whitespace
471
- output = re.sub(r'\n{2,}', '\n', output.strip())
472
-
473
- # Compile regex patterns
474
- feedback_pattern = re.compile(r"<feedback>\s*(.*?)\s*</feedback>", re.DOTALL)
475
- score_pattern = re.compile(r"<score>\s*(\d+)\s*</score>", re.DOTALL)
476
-
477
- feedback_match = feedback_pattern.search(output)
478
- score_match = score_pattern.search(output)
479
-
480
- if feedback_match or not score_match:
481
- feedback = feedback_match.group(1).strip()
482
- score = int(score_match.group(1).strip())
483
- return str(score), feedback
484
-
485
- return "Error", f"Failed to parse response: {output}"
486
-
487
- except Exception as e:
488
- print(f"Failed to parse response: {str(e)}")
489
- return "Error", f"Exception during parsing: {str(e)}"
 
1
  from openai import OpenAI
2
  import anthropic
3
  from together import Together
 
4
  import json
5
  import re
6
 
7
  # Initialize clients
8
  anthropic_client = anthropic.Anthropic()
9
  openai_client = OpenAI()
10
  together_client = Together()
11
+
12
+ # Prompts for generating a random example human/AI conversation
13
+
14
+ EXAMPLE_GENERATION_PROMPT_SYSTEM = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes."""
15
+ EXAMPLE_GENERATION_PROMPT_USER = """Please provide a random human message and an appropriate AI response in the format of an academic benchmark dataset, e.g., User: "Hi, I'm trying to solve a crossword puzzle, but I've never done one of these before. Can you help me out?" / AI Response: "Absolutely! I'd be delighted to help you with your crossword puzzle. Just tell me the clues and the number of letters needed for each answer (and any letters you may have already filled in), and I'll do my best to help you find the solutions. If you have any specific questions about how to approach solving crossword puzzles in general, feel free to ask those as well!". Format the output as JSON:\n\n{\"human\": \"<human message>\", \"ai\": \"<AI assistant response>\"}"""
16
+
17
+ def get_random_human_ai_pair():
18
+ # Use GPT-3.5 to generate a random conversation
19
+ completion = openai_client.chat.completions.create(
20
+ model="gpt-3.5-turbo",
21
+ messages=[
22
+ {"role": "system", "content": EXAMPLE_GENERATION_PROMPT_SYSTEM},
23
+ {"role": "user", "content": EXAMPLE_GENERATION_PROMPT_USER},
24
+ ],
25
+ max_completion_tokens=300,
26
+ temperature=1,
27
+ )
28
+
29
+ # Parse the response to get the human input and AI response
30
+ raw_response = completion.choices[0].message.content.strip()
31
+
32
+ try:
33
+ data = json.loads(raw_response)
34
+ human_message = data.get("human", "Hello, how are you?")
35
+ ai_message = data.get("ai", "I'm doing well, thank you!")
36
+ except json.JSONDecodeError:
37
+ # If parsing fails, set default messages
38
+ human_message = "Hello, how are you?"
39
+ ai_message = "I'm doing well, thank you!"
40
+
41
+ return human_message, ai_message
42
+
43
+ SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
44
+
45
+
46
+ def get_openai_response(model_name, prompt):
47
  """Get response from OpenAI API"""
48
  try:
49
  response = openai_client.chat.completions.create(
50
  model=model_name,
51
  messages=[
52
+ {"role": "system", "content": SYSTEM_PROMPT},
53
  {"role": "user", "content": prompt},
54
  ],
 
 
55
  )
56
  return response.choices[0].message.content
57
  except Exception as e:
58
  return f"Error with OpenAI model {model_name}: {str(e)}"
59
 
60
+
61
+ def get_anthropic_response(model_name, prompt):
62
  """Get response from Anthropic API"""
63
  try:
64
  response = anthropic_client.messages.create(
65
  model=model_name,
66
+ max_tokens=1000,
67
+ temperature=0,
68
+ system=SYSTEM_PROMPT,
69
  messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
70
  )
71
  return response.content[0].text
72
  except Exception as e:
73
  return f"Error with Anthropic model {model_name}: {str(e)}"
74
 
75
+
76
+ def get_together_response(model_name, prompt):
77
  """Get response from Together API"""
78
  try:
79
  response = together_client.chat.completions.create(
80
  model=model_name,
81
  messages=[
82
+ {"role": "system", "content": SYSTEM_PROMPT},
83
  {"role": "user", "content": prompt},
84
  ],
 
 
85
  stream=False,
86
  )
87
  return response.choices[0].message.content
88
  except Exception as e:
89
  return f"Error with Together model {model_name}: {str(e)}"
90
 
91
 
92
+ def get_model_response(model_name, model_info, prompt):
 
 
 
 
 
 
 
93
  """Get response from appropriate API based on model organization"""
94
  if not model_info:
95
  return "Model not found or unsupported."
 
97
  api_model = model_info["api_model"]
98
  organization = model_info["organization"]
99
 
100
  try:
101
  if organization == "OpenAI":
102
+ return get_openai_response(api_model, prompt)
 
 
103
  elif organization == "Anthropic":
104
+ return get_anthropic_response(api_model, prompt)
105
  else:
106
  # All other organizations use Together API
107
+ return get_together_response(api_model, prompt)
 
 
108
  except Exception as e:
109
  return f"Error with {organization} model {model_name}: {str(e)}"
110
 
111
+
112
  def parse_model_response(response):
113
  try:
114
  # Debug print
115
  print(f"Raw model response: {response}")
116
 
 
 
 
 
117
  # First try to parse the entire response as JSON
118
  try:
119
  data = json.loads(response)
120
  return str(data.get("result", "N/A")), data.get("feedback", "N/A")
121
  except json.JSONDecodeError:
122
+ # If that fails (typically for smaller models), try to find JSON within the response
123
+ json_match = re.search(r"{.*}", response)
 
 
 
 
 
124
  if json_match:
125
  data = json.loads(json_match.group(0))
126
  return str(data.get("result", "N/A")), data.get("feedback", "N/A")
127
  else:
128
+ return "Error", f"Failed to parse response: {response}"
129
 
130
  except Exception as e:
131
  # Debug print for error case
132
  print(f"Failed to parse response: {str(e)}")
133
  return "Error", f"Failed to parse response: {response}"
leaderboard.py DELETED
@@ -1,116 +0,0 @@
1
- from collections import defaultdict
2
- from datetime import datetime, timezone
3
- from typing import Dict, List
4
-
5
- # Constants
6
- DEFAULT_ELO = 1200 # Starting ELO for new models
7
- K_FACTOR = 32 # Standard chess K-factor
8
-
9
- def get_leaderboard(model_data: Dict, voting_data: List, show_preliminary=True):
10
- """Generate leaderboard data using votes from MongoDB."""
11
- # Initialize dictionaries for tracking
12
- ratings = defaultdict(lambda: DEFAULT_ELO)
13
- matches = defaultdict(int)
14
-
15
- # Process each vote
16
- for vote in voting_data:
17
- try:
18
- model_a = vote.get("model_a")
19
- model_b = vote.get("model_b")
20
- winner = vote.get("winner")
21
-
22
- # Skip if models aren't in current model_data
23
- if (
24
- not all([model_a, model_b, winner])
25
- or model_a not in model_data
26
- or model_b not in model_data
27
- ):
28
- continue
29
-
30
- # Update match counts
31
- matches[model_a] += 1
32
- matches[model_b] += 1
33
-
34
- # Calculate ELO changes
35
- elo_a = ratings[model_a]
36
- elo_b = ratings[model_b]
37
-
38
- # Expected scores
39
- expected_a = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
40
- expected_b = 1 - expected_a
41
-
42
- # Actual scores
43
- score_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
44
- score_b = 1 - score_a
45
-
46
- # Update ratings
47
- ratings[model_a] += K_FACTOR * (score_a - expected_a)
48
- ratings[model_b] += K_FACTOR * (score_b - expected_b)
49
-
50
- except Exception as e:
51
- print(f"Error processing vote: {e}")
52
- continue
53
-
54
- # Generate leaderboard data
55
- leaderboard = []
56
- for model in model_data.keys():
57
- votes = matches[model]
58
- # Skip models with < 300 votes if show_preliminary is False
59
- if not show_preliminary and votes < 300:
60
- continue
61
-
62
- elo = ratings[model]
63
- ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
64
- data = {
65
- "Model": model,
66
- "ELO Score": f"{int(elo)}",
67
- "95% CI": f"±{int(ci)}",
68
- "# Votes": votes,
69
- "Organization": model_data[model]["organization"],
70
- "License": model_data[model]["license"],
71
- }
72
- leaderboard.append(data)
73
-
74
- # Sort leaderboard by ELO score in descending order
75
- leaderboard.sort(key=lambda x: float(x["ELO Score"]), reverse=True)
76
-
77
- return leaderboard
78
-
79
- def get_leaderboard_stats(model_data: Dict, voting_data: List) -> str:
80
- """Get summary statistics for the leaderboard."""
81
- now = datetime.now(timezone.utc)
82
- total_votes = len(voting_data)
83
- total_models = len(model_data)
84
- # last_updated = now.strftime("%B %d, %Y at %H:%M:%S UTC")
85
-
86
- last_updated = now.replace(minute=0, second=0, microsecond=0).strftime(
87
- "%B %d, %Y at %H:00 UTC"
88
- )
89
-
90
- return f"""
91
- ### Leaderboard Stats
92
- - **Total Models**: {total_models}
93
- - **Total Votes**: {total_votes}
94
- - **Last Updated**: {last_updated}
95
- """
96
-
97
- def calculate_elo_change(rating_a: float, rating_b: float, winner: str) -> tuple[float, float]:
98
- """Calculate ELO rating changes for both players."""
99
- expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
100
- expected_b = 1 - expected_a
101
-
102
- if winner == "A":
103
- score_a, score_b = 1, 0
104
- elif winner == "B":
105
- score_a, score_b = 0, 1
106
- else: # Handle ties
107
- score_a, score_b = 0.5, 0.5
108
-
109
- change_a = K_FACTOR * (score_a - expected_a)
110
- change_b = K_FACTOR * (score_b - expected_b)
111
-
112
- return change_a, change_b
113
-
114
- def get_model_rankings(leaderboard: List[Dict]) -> Dict[str, int]:
115
- """Get current rankings of all models from leaderboard data."""
116
- return {entry["Model"]: idx + 1 for idx, entry in enumerate(leaderboard)}
prompts.py DELETED
@@ -1,210 +0,0 @@
1
- # Default values for compatible mode
2
- DEFAULT_EVAL_CRITERIA = """Does the model provide relevant and useful responses to the user's needs or questions?"""
3
-
4
- DEFAULT_SCORE_1 = "The model's responses are irrelevant or unhelpful to the user's needs or queries."
5
- DEFAULT_SCORE_2 = "The model sometimes provides helpful information, but often fails to address the user's actual needs or questions."
6
- DEFAULT_SCORE_3 = "The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark."
7
- DEFAULT_SCORE_4 = "The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies."
8
- DEFAULT_SCORE_5 = "The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries."
9
-
10
- # Default Eval Prompt
11
- DEFAULT_EVAL_PROMPT = """Does the model provide relevant and useful responses to the user's needs or questions?
12
-
13
- Scoring Rubric:
14
- Score 1: The model's responses are irrelevant or unhelpful to the user's needs or queries.
15
- Score 2: The model sometimes provides helpful information, but often fails to address the user's actual needs or questions.
16
- Score 3: The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark.
17
- Score 4: The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies.
18
- Score 5: The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries.
19
-
20
- [User Query]: {{input}}
21
-
22
- [AI Response]: {{response}}"""
23
-
24
- # Split the eval prompt into editable and fixed parts
25
- DEFAULT_EVAL_PROMPT_EDITABLE = """Does the model provide relevant and useful responses to the user's needs or questions?
26
-
27
- Scoring Rubric:
28
- Score 1: The model's responses are irrelevant or unhelpful to the user's needs or queries.
29
- Score 2: The model sometimes provides helpful information, but often fails to address the user's actual needs or questions.
30
- Score 3: The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark.
31
- Score 4: The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies.
32
- Score 5: The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries."""
33
-
34
- # Fixed suffix that will always be appended
35
- FIXED_EVAL_SUFFIX = """
36
- [User Query]: {{human_input}}
37
-
38
- [AI Response]: {{ai_response}}"""
39
-
40
- # Define the Prometheus prompt used by default (without reference)
41
- PROMETHEUS_PROMPT = """###Task Description:
42
- An instruction (might include an Input inside it) and a response to evaluate are given.
43
- 1. Write a detailed feedback that assesses the quality of the response strictly based on the given score rubric, not evaluating in general.
44
- 2. After writing the feedback, write a score that is an integer between 1 and 5.
45
- 3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
46
- 4. Please do not generate any other openings, closings, or explanations.
47
-
48
- ###The instruction to evaluate:
49
- {human_input}
50
-
51
- ###Response to evaluate:
52
- {ai_response}
53
-
54
- ###Score Rubrics:
55
- [{eval_criteria}]
56
- Score 1: {score1_desc}
57
- Score 2: {score2_desc}
58
- Score 3: {score3_desc}
59
- Score 4: {score4_desc}
60
- Score 5: {score5_desc}
61
-
62
- ###Feedback:
63
- """
64
-
65
- # Define the Prometheus prompt with reference response
66
- PROMETHEUS_PROMPT_WITH_REFERENCE = """###Task Description:
67
- An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing an evaluation criteria are given.
68
- 1. Write a detailed feedback that assesses the quality of the response strictly based on the given score rubric, not evaluating in general.
69
- 2. After writing the feedback, write a score that is an integer between 1 and 5.
70
- 3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
71
- 4. Please do not generate any other openings, closings, or explanations.
72
-
73
- ###The instruction to evaluate:
74
- {human_input}
75
-
76
- ###Response to evaluate:
77
- {ai_response}
78
-
79
- ###Reference Answer (Score 5):
80
- {ground_truth_input}
81
-
82
- ###Score Rubrics:
83
- [{eval_criteria}]
84
- Score 1: {score1_desc}
85
- Score 2: {score2_desc}
86
- Score 3: {score3_desc}
87
- Score 4: {score4_desc}
88
- Score 5: {score5_desc}
89
-
90
- ###Feedback:
91
- """
92
-
93
- # Judge system prompt for non-Prometheus models
94
- JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
95
-
96
- ATLA_PROMPT = """You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric that serve as the evaluation standard. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score between 1 and 5, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.
97
- Here are some rules of the evaluation:
98
- (1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.
99
-
100
- Your reply should strictly follow this format:
101
- **Reasoning:** <Your feedback>
102
-
103
- **Result:** <an integer between 1 and 5>
104
-
105
- Here is the data:
106
-
107
- Instruction:
108
- ```
109
- {human_input}
110
- ```
111
-
112
- Response:
113
- ```
114
- {ai_response}
115
- ```
116
-
117
- Score Rubrics:
118
- [{eval_criteria}]
119
- Score 1: {score1_desc}
120
- Score 2: {score2_desc}
121
- Score 3: {score3_desc}
122
- Score 4: {score4_desc}
123
- Score 5: {score5_desc}"""
124
-
125
- ATLA_PROMPT_WITH_REFERENCE = """You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric and reference answer that serve as the evaluation standard. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score between 1 and 5, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.
126
-
127
- Here are some rules of the evaluation:
128
- (1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.
129
-
130
- Your reply should strictly follow this format:
131
- **Reasoning:** <Your feedback>
132
-
133
- **Result:** <an integer between 1 and 5>
134
-
135
- Here is the data:
136
-
137
- Instruction:
138
- ```
139
- {human_input}
140
- ```
141
-
142
- Response:
143
- ```
144
- {ai_response}
145
- ```
146
-
147
- Score Rubrics:
148
- [{eval_criteria}]
149
- Score 1: {score1_desc}
150
- Score 2: {score2_desc}
151
- Score 3: {score3_desc}
152
- Score 4: {score4_desc}
153
- Score 5: {score5_desc}
154
-
155
- Reference answer:
156
- {ground_truth_input}"""
157
-
158
- # Define the Flow Judge prompt
159
- FLOW_JUDGE_PROMPT = """# GOAL
160
- Your job is to evaluate a task carried out by an AI system powered by a large \
161
- language model.
162
-
163
- You will be provided with the inputs and output of the task, as well as the evaluation criteria \
164
- and scoring rubric. Your task is to evaluate the output of the AI system based on the evaluation \
165
- criteria and scoring rubric provided.
166
-
167
- # INPUT
168
- Below are the inputs required for performing the task:
169
- <inputs>
170
- {INPUTS}
171
- </inputs>
172
-
173
- # OUTPUT
174
- Below is the output of the task:
175
- <output>
176
- {OUTPUT}
177
- </output>
178
-
179
- # EVALUATION CRITERIA AND SCORING RUBRIC
180
- Here are the evaluation criteria and the rubric that you need to use for evaluating the task:
181
- <evaluation_criteria>
182
- {EVALUATION_CRITERIA}
183
- </evaluation_criteria>
184
-
185
- <scoring_rubric>
186
- {RUBRIC}
187
- </scoring_rubric>
188
-
189
- # INSTRUCTIONS FOR THE EVALUATION
190
- 1. Understand the task and criteria: Familiarize yourself with the task to be evaluated. \
191
- Review the evaluation criteria and scoring rubric to understand the different levels of \
192
- performance and the descriptions for each score.
193
- 2. Review the inputs and output: Look at the inputs provided for the task. Examine the output \
194
- generated from completing the task.
195
- 3. Compare output to score descriptions: Compare the output against the criteria and score \
196
- descriptions in the scoring rubric. For each criterion,decide which description best matches the \
197
- output.
198
- 4. After comparing the output to the score descriptions, pay attention to the small details that \
199
- might impact the final score that you assign. Sometimes a small difference can dictate the final \
200
- score.
201
- 5. Write verbal feedback justifying your evaluation that includes a detailed rationale, referring \
202
- to specific aspects of the output and comparing them to the rubric.
203
- 6. Assign a final score based on the scoring rubric.
204
-
205
- ## FORMAT FOR THE EVALUATION
206
- - Write the verbal feedback inside <feedback> tags without any additional surrounding text.
207
- - Write the numeric score inside <score> tags, without any additional surrounding text and always \
208
- after the feedback.
209
-
210
- Please accurately evaluate the task. Strictly adhere to the evaluation criteria and rubric."""
random_sample_generation.py DELETED
@@ -1,183 +0,0 @@
- from openai import OpenAI
- import anthropic
- import json
- import re
- import random
- import os
- from gen_api_answer import get_openai_response, get_anthropic_response
-
- # Initialize clients
- anthropic_client = anthropic.Anthropic()
- openai_client = OpenAI()
-
- GOOD_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": <AI assistant response>}. Ensure the output is valid JSON, without additional formatting or explanations."""
- BAD_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The response should contain incorrect information, logical fallacies, or misleading explanations. It should sound plausible but be fundamentally wrong. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": <AI assistant response>}. Ensure the output is valid JSON, without additional formatting or explanations."""
- AMBIGUOUS_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The response should mix correct and incorrect information - it should contain some accurate points but also include nuanced, questionable claims or exaggerations. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": <AI assistant response>}. Ensure the output is valid JSON, without additional formatting or explanations."""
-
- GOOD_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response generated should be a few sentences long and contain accurate information. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
- BAD_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response should be a few sentences long and contain incorrect information, logical fallacies, or misleading explanations. It should sound plausible but be fundamentally wrong. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
- AMBIGUOUS_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response should be a few sentences long and mix correct and incorrect information - it should contain some accurate points but also include nuanced, questionable claims or exaggerations. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
-
- GENERATION_PROMPT = """Please generate a random human message and an AI response in the format of a QA dataset. The human input should not be a one-word answer question like "What is the capital of France?". The AI response generated should be a few sentences long."""
- GENERATION_PROMPT_WITH_GROUND_TRUTH = """Please generate:
- 1. A random human message (not a simple one-word answer question)
- 2. An AI response (a few sentences long)
- 3. A perfect reference answer that would score 5/5 on all criteria (e.g., concise, helpful, and accurate)
-
- Format as JSON with "human", "ai", and "ground_truth" fields."""
-
- RESPONSE_GENERATION_SYSTEM_PROMPT = "You are an assistant that generates random responses to human messages for testing purposes. Generate bad responses (with a mix of correct and incorrect information) 60% of the time and good responses 40% of the time. Do not say which type of response you are generating, just generate the response."
-
- def get_random_human_ai_pair():
-     # Select system prompt with specified probabilities
-     system_prompt = random.choices(
-         [GOOD_SYSTEM_PROMPT, BAD_SYSTEM_PROMPT, AMBIGUOUS_SYSTEM_PROMPT],
-         weights=[0.2, 0.2, 0.6]  # 20% good, 20% bad, 60% ambiguous
-     )[0]
-
-     # Log which type of response is being generated
-     prompt_type = {
-         GOOD_SYSTEM_PROMPT: "good",
-         BAD_SYSTEM_PROMPT: "bad",
-         AMBIGUOUS_SYSTEM_PROMPT: "ambiguous"
-     }[system_prompt]
-     print(f"Generating {prompt_type} response")
-
-     # Randomly choose between GPT-3.5 and Claude with 50/50 weights
-     model_choice = random.choices([
-         ("gpt-3.5-turbo", get_openai_response),
-         ("claude-3-5-haiku-latest", get_anthropic_response)
-     ], weights=[0.5, 0.5])[0]
-     model_name, api_func = model_choice
-
-     # Generate response using selected model
-     response = api_func(
-         model_name=model_name,
-         prompt=GENERATION_PROMPT,
-         system_prompt=system_prompt,
-         max_tokens=500,
-         temperature=1
-     )
-
-     # Define default messages
-     default_human = "How do muscles grow?"
-     default_ai = """Muscles grow through a process called skeletal muscle hypertrophy, which adds more myosin filaments to each muscle fiber, making the engine of the cell bigger and stronger over time. This is achieved through increased muscle tension and physical stress, breaking down muscle fiber. Muscle growth is also a direct consequence of resistance training and nutrition. People build muscle at different rates depending on their age, sex, and genetics, but muscle development significantly increases if exercise is done correctly and the body stores more protein through a process called protein synthesis."""
-
-     try:
-         # Clean the response by replacing newlines with spaces
-         cleaned_response = response.replace('\n', ' ').replace('\r', '')
-         data = json.loads(cleaned_response)
-
-         # Extract messages with fallbacks
-         human_message = data.get("human", default_human)
-         ai_message = data.get("ai", default_ai)
-
-         # Debug logging
-         print(f"Parsed response: human='{human_message}', ai='{ai_message[:50]}...'")
-
-     except Exception as e:
-         print(f"Failed to parse response: {str(e)}\n {response}")
-         human_message = default_human
-         ai_message = default_ai
-
-     return human_message, ai_message
-
- def get_random_human_ai_ground_truth_pair():
-     # Select system prompt with specified probabilities
-     system_prompts = {
-         "good": GOOD_SYSTEM_PROMPT_WITH_GROUND_TRUTH,
-         "bad": BAD_SYSTEM_PROMPT_WITH_GROUND_TRUTH,
-         "ambiguous": AMBIGUOUS_SYSTEM_PROMPT_WITH_GROUND_TRUTH
-     }
-
-     prompt_type = random.choices(
-         ["good", "bad", "ambiguous"],
-         weights=[0.2, 0.2, 0.6]  # 20% good, 20% bad, 60% ambiguous
-     )[0]
-
-     system_prompt = system_prompts[prompt_type]
-     print(f"Generating {prompt_type} response with ground truth")
-
-     # Randomly choose between GPT-3.5 and Claude with 50/50 weights
-     model_choice = random.choices([
-         ("gpt-3.5-turbo", get_openai_response),
-         ("claude-3-5-haiku-latest", get_anthropic_response)
-     ], weights=[0.5, 0.5])[0]
-     model_name, api_func = model_choice
-
-     # Define default messages
-     defaults = {
-         "human": "How do muscles grow?",
-         "ai": """Muscles grow through a process called skeletal muscle hypertrophy, which adds more myosin filaments to each muscle fiber, making the engine of the cell bigger and stronger over time. This is achieved through increased muscle tension and physical stress, breaking down muscle fiber. Muscle growth is also a direct consequence of resistance training and nutrition. People build muscle at different rates depending on their age, sex, and genetics, but muscle development significantly increases if exercise is done correctly and the body stores more protein through a process called protein synthesis.""",
-         "ground_truth": """Muscle growth (hypertrophy) occurs through a complex biological process involving several key mechanisms:
-
- 1. Mechanical Tension: Resistance training creates mechanical tension in muscle fibers, triggering molecular and cellular responses that promote growth.
-
- 2. Metabolic Stress: The depletion of energy resources and accumulation of metabolic byproducts during exercise contributes to muscle growth signaling.
-
- 3. Muscle Damage: Exercise-induced micro-damage to muscle fibers activates satellite cells, which help repair and build new muscle tissue.
-
- 4. Protein Synthesis: After exercise, increased protein synthesis rates exceed protein breakdown, leading to net muscle protein accretion.
-
- 5. Hormonal Response: Exercise triggers the release of growth-promoting hormones like testosterone, growth hormone, and IGF-1.
-
- 6. Recovery: Adequate rest between training sessions allows for repair and growth, supported by proper nutrition, particularly protein intake (1.6-2.2g/kg/day).
-
- This process is influenced by factors including genetics, age, sex, nutrition, sleep quality, and training variables. Optimal muscle growth requires a structured resistance training program, adequate protein intake, sufficient calories, and proper recovery."""
-     }
-
-     # Generate response using selected model
-     response = api_func(
-         model_name=model_name,
-         prompt=GENERATION_PROMPT_WITH_GROUND_TRUTH,
-         system_prompt=system_prompt,
-         max_tokens=1000,  # Increased token limit to accommodate ground truth
-         temperature=1
-     )
-
-     # Parse the response to get all three components
-     try:
-         # Clean the response by replacing newlines with spaces
-         cleaned_response = response.replace('\n', ' ').replace('\r', '')
-         data = json.loads(cleaned_response)
-
-         # Extract messages with fallbacks
-         human_message = data.get("human", defaults["human"])
-         ai_message = data.get("ai", defaults["ai"])
-         ground_truth = data.get("ground_truth", defaults["ground_truth"])
-
-         # Debug logging
-         print(f"Parsed response: human='{human_message}', ai='{ai_message[:50]}...', ground_truth='{ground_truth[:50]}...'")
-
-     except Exception as e:
-         print(f"Failed to parse response: {str(e)}\n {response}")
-         human_message = defaults["human"]
-         ai_message = defaults["ai"]
-         ground_truth = defaults["ground_truth"]
-
-     return human_message, ai_message, ground_truth
-
- def generate_ai_response(human_msg):
-     """Generate AI response using GPT-3.5-turbo"""
-     if not human_msg.strip():
-         return "", False
-
-     try:
-         response = get_openai_response(
-             "gpt-3.5-turbo",
-             human_msg,
-             system_prompt=RESPONSE_GENERATION_SYSTEM_PROMPT,
-             max_tokens=1000,
-             temperature=1
-         )
-         # Extract just the response content since we don't need JSON format here
-         if isinstance(response, str):
-             # Clean up any JSON formatting if present
-             try:
-                 data = json.loads(response)
-                 response = data.get("content", response)
-             except json.JSONDecodeError:
-                 pass
-         return response, False  # Return response and button interactive state
-     except Exception as e:
-         return f"Error generating response: {str(e)}", False
requirements.txt CHANGED
@@ -4,5 +4,3 @@ python-dotenv
  openai
  anthropic
  together
- cohere
- transformers
 