shgao committed on
Commit
dd01482
·
1 Parent(s): 03e54bd
Files changed (1) hide show
  1. app.py +122 -171
app.py CHANGED
@@ -319,16 +319,17 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
319
  for (q_id, our_model, other_model) in full_question_ids_list
320
  if (q_id, our_model, other_model) not in matched_pairs
321
  ]
322
- print(f"Filtered question IDs: {full_question_ids_list}")
323
  print(f"Length of filtered question IDs: {len(full_question_ids_list)}")
 
324
 
325
  return full_question_ids_list, data_by_filename
326
 
327
- def go_to_page0_from_minus1():
328
- return gr.update(visible=False), gr.update(visible=True)
329
-
330
- def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id):
331
-
 
332
  # ADDED: Validate that name and email are non-empty before proceeding
333
  if not name or not email or not evaluator_id or not specialty_dd or not years_exp_radio:
334
  return gr.update(visible=True), gr.update(visible=False), None, "Please fill out all the required fields (name, email, evaluator ID, specialty, years of experience). If you are not a licensed physician with a specific specialty, please choose the specialty that most closely aligns with your biomedical expertise.", gr.Chatbot(), gr.Chatbot(), gr.HTML(),gr.Markdown(),gr.State(),gr.update(visible=False), ""
@@ -357,76 +358,98 @@ def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecia
357
  token = os.getenv("HF_TOKEN")
358
  )
359
 
360
- full_question_ids_list, data_by_filename = get_evaluator_questions(evaluator_id, all_files, evaluator_directory)
 
 
361
 
362
  if len(full_question_ids_list) == 0:
363
- return gr.update(visible=True), gr.update(visible=False), None, "Based on your submitted data, you have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!", gr.Chatbot(), gr.Chatbot(), gr.HTML(),gr.Markdown(),gr.State(),gr.update(visible=False),""
364
 
365
  full_question_ids_list = sorted(full_question_ids_list, key=lambda x: str(x[0])+str(x[1]))
366
  #selected question is the first element
367
- q_id, other_model_name = full_question_ids_list[0]
 
368
 
369
- #Constructing question_for_eval, the question to evaluate this round
 
 
 
370
  txagent_matched_entry = next(
371
- (entry for entry in data_by_filename['TxAgent-T1-Llama-3.1-8B'] if preprocess_question_id(entry.get("id")) == q_id),
372
  None
373
  )
 
 
 
 
374
  other_model_matched_entry = next(
375
  (entry for entry in data_by_filename[other_model_name] if preprocess_question_id(entry.get("id")) == q_id),
376
  None
377
  )
378
-
379
- models_list = [
380
- {
381
- "model": 'TxAgent-T1-Llama-3.1-8B',
382
- "reasoning_trace": txagent_matched_entry.get("solution")
383
- },
384
- {
385
  "model": other_model_name,
386
  "reasoning_trace": other_model_matched_entry.get("solution")
387
  }
388
- ]
 
389
  random.shuffle(models_list)
390
 
391
  question_for_eval = {
392
  "question": txagent_matched_entry.get("question"),
393
- "correct_answer": txagent_matched_entry.get("correct_answer"),
394
  "id": q_id,
395
  "models": models_list,
396
  }
 
 
397
 
398
- #update user_info
399
- user_info = (name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, q_id, evaluator_id)
400
  chat_A_value = format_chat(question_for_eval['models'][0]['reasoning_trace'], tool_database_labels)
401
  chat_B_value = format_chat(question_for_eval['models'][1]['reasoning_trace'], tool_database_labels)
402
  prompt_text = question_for_eval['question']
403
 
404
- # Construct the question-specific elements of the pairwise rating page (page 1)
405
  page1_prompt = gr.HTML(f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Prompt:</strong> {prompt_text}</div>')
406
- page1_reference_answer = gr.Markdown(txagent_matched_entry.get("correct_answer"))
407
  chat_a = gr.Chatbot(
408
- value=chat_A_value,
409
- type="messages",
410
- height=400,
411
- label="Model A Response",
412
- show_copy_button=False,
413
- show_label=True,
414
- render_markdown=True, # Required for markdown/HTML support in messages
415
- avatar_images=None, # Optional: omit user/assistant icons
416
- rtl=False
417
- )
418
  chat_b = gr.Chatbot(
419
- value=chat_B_value,
420
- type="messages",
421
- height=400,
422
- label="Model B Response",
423
- show_copy_button=False,
424
- show_label=True,
425
- render_markdown=True, # Required for markdown/HTML support in messages
426
- avatar_images=None, # Optional: omit user/assistant icons
427
- rtl=False
428
- )
429
- return gr.update(visible=True), gr.update(visible=False), user_info,"", chat_a, chat_b, page1_prompt, page1_reference_answer, question_for_eval, gr.update(visible=True), f"You are about to evaluate the next question. You have {len(full_question_ids_list)} question(s) remaining to evaluate."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
430
 
431
  #goes to page 1 from confirmation modal that tells users how many questions they have left to evaluate
432
  def go_to_page1():
@@ -454,7 +477,7 @@ def go_to_page2(data_subset_state,*pairwise_values):
454
  pairwise_results_for_display = [gr.Markdown(f"***As a reminder, your pairwise comparison answer for this criterion was: {pairwise_list[i]}. Your answer choices will be restricted based on your comparison answer, but you may go back and change the comparison answer if you wish.***") for i in range(len(criteria))]
455
 
456
  if any(answer is None for answer in pairwise_list):
457
- return gr.update(visible=True), gr.update(visible=False), None, None, "Error: Please select an option for every pairwise comparison.", gr.Chatbot(), gr.Chatbot(), gr.HTML(), gr.Markdown(),*pairwise_results_for_display
458
 
459
  chat_A_value = format_chat(data_subset_state['models'][0]['reasoning_trace'], tool_database_labels)
460
  chat_B_value = format_chat(data_subset_state['models'][1]['reasoning_trace'], tool_database_labels)
@@ -482,7 +505,7 @@ def go_to_page2(data_subset_state,*pairwise_values):
482
  page2_prompt = gr.HTML(f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Prompt:</strong> {prompt_text}</div>')
483
  page2_reference_answer = gr.Markdown(data_subset_state['correct_answer'])
484
 
485
- return gr.update(visible=False), gr.update(visible=True), pairwise_list, comparison_reasons_list, "", chat_A_rating, chat_B_rating, page2_prompt, page2_reference_answer, *pairwise_results_for_display
486
 
487
 
488
  # Callback to store scores for Response A.
@@ -543,9 +566,9 @@ def toggle_reference(selection):
543
  def mark_invalid_question(btn_clicked_status):
544
  new_status = not btn_clicked_status
545
  if new_status == True:
546
- return new_status, gr.update(value="Undo; this question makes biomedical sense", variant="primary")
547
  else:
548
- return new_status, gr.update(value="This question does not make sense or is not biomedically-relevant",variant="stop")
549
 
550
  centered_col_css = """
551
  #centered-column {
@@ -559,18 +582,24 @@ centered_col_css = """
559
  color: white !important;
560
  border-color: purple !important;
561
  }
 
 
 
 
 
562
  #clear_btn {
563
  background-color: #F08080 !important;
564
  color: white !important;
565
  border-color: #F08080 !important;
566
  }
567
- #reference-box {
568
- background-color: #9191FF !important;
569
- color: white !important;
570
  border: 1px solid #ccc;
571
  padding: 10px;
572
  border-radius: 5px;
573
  }
 
 
 
574
  """
575
  with gr.Blocks(css=centered_col_css) as demo:
576
  # States to save information between pages.
@@ -674,7 +703,7 @@ with gr.Blocks(css=centered_col_css) as demo:
674
  home_btn_0 = gr.Button("Home (your registration info will be saved)")
675
 
676
 
677
- with Modal(visible=False) as eval_progress_modal:
678
  eval_progress_text = gr.Markdown("You have X questions remaining.")
679
  eval_progress_proceed_btn = gr.Button("OK, proceed to question evaluation")
680
 
@@ -683,13 +712,30 @@ with gr.Blocks(css=centered_col_css) as demo:
683
  gr.Markdown("## Part 1/2: Pairwise Comparison") #Make the number controlled by question indexing!
684
  page1_prompt = gr.HTML()
685
 
 
 
 
 
 
 
 
 
 
 
 
686
  # Add small red button under the prompt
687
- nonsense_btn = gr.Button(
688
- "This question does not make sense or is not biomedically-relevant",
689
- size="sm",
690
- variant="stop", # red variant
691
- elem_id="invalid-question-btn"
692
- )
 
 
 
 
 
 
693
 
694
  nonsense_btn.click(
695
  fn=mark_invalid_question,
@@ -698,14 +744,6 @@ with gr.Blocks(css=centered_col_css) as demo:
698
  queue=False,
699
  )
700
 
701
- with gr.Accordion("Show reference answer (this is ONE correct answer, but there may be multiple)", open=False, elem_id="reference-box"):
702
- page1_reference_answer = gr.Markdown(
703
- """
704
- **Reference Answer:**
705
-
706
- This is the reference answer content.
707
- """
708
- )
709
  with gr.Row():
710
  # ADDED: Use gr.Chatbot to display the scrollable chat window for Response A.
711
  with gr.Column():
@@ -769,13 +807,14 @@ with gr.Blocks(css=centered_col_css) as demo:
769
  gr.Markdown("## Part 2/2: Rate Model Responses")
770
  # ### EDIT: Show a highlighted prompt as on previous pages.
771
  page2_prompt = gr.HTML()
772
- with gr.Accordion("Show reference answer (this is ONE correct answer, but there may be multiple)", open=False, elem_id="reference-box"):
773
  page2_reference_answer = gr.Markdown(
774
  """
775
  **Reference Answer:**
776
 
777
  This is the reference answer content.
778
- """
 
779
  )
780
  # ### EDIT: Display both responses side-by-side using Chatbot windows.
781
  with gr.Row():
@@ -991,39 +1030,13 @@ with gr.Blocks(css=centered_col_css) as demo:
991
  def final_submit(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args):
992
  # --- Part 1: Submit the current results (Existing Logic) ---
993
  row_dict = build_row_dict(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args)
994
- _, _, _, _, _, _, _, _, evaluator_id = user_info
995
  append_to_sheet(user_data=None, custom_row_dict=row_dict, custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME + f"_{evaluator_id}"), add_header_when_create_sheet=True)
996
-
997
- # --- Part 2: Recalculate remaining questions (Existing Logic + Modified Error Handling) ---
998
- # try:
999
-
1000
- # --- Re-fetch data and filter questions (Same logic as before) ---
1001
- question_map_path = hf_hub_download(
1002
- repo_id=REPO_ID,
1003
- filename=EVALUATOR_MAP_DICT,
1004
- repo_type="dataset", # or omit if it's a Model/Space
1005
- # force_download=True, # ← always fetch new copy
1006
- revision="main", # branch/tag/commit, fetches the most recent version of the dataset each time this command is called
1007
- token = os.getenv("HF_TOKEN")
1008
- )
1009
-
1010
- with open(question_map_path, 'r') as f:
1011
- question_map = json.load(f)
1012
-
1013
- evaluator_directory = question_map.get(evaluator_id, None)
1014
- all_files = list_repo_files(
1015
- repo_id=REPO_ID,
1016
- repo_type="dataset",
1017
- revision="main",
1018
- token = os.getenv("HF_TOKEN")
1019
  )
1020
 
1021
- full_question_ids_list, data_by_filename = get_evaluator_questions(evaluator_id, all_files, evaluator_directory)
1022
- remaining_count = len(full_question_ids_list)
1023
-
1024
- # --- Part 3: Determine UI updates based on remaining count ---
1025
  if remaining_count == 0:
1026
- # Success with NO remaining questions
1027
  return (
1028
  gr.update(visible=False), # page0 (Hide)
1029
  gr.update(visible=False), # page2 (Hide)
@@ -1036,72 +1049,9 @@ with gr.Blocks(css=centered_col_css) as demo:
1036
  None,
1037
  None,
1038
  None,
1039
- None
1040
- )
1041
-
1042
- full_question_ids_list = sorted(full_question_ids_list, key=lambda x: str(x[0])+str(x[1]))
1043
- #selected question is the first element
1044
- q_id, other_model_name = full_question_ids_list[0]
1045
-
1046
- #Constructing question_for_eval, the question to evaluate this round
1047
- txagent_matched_entry = next(
1048
- (entry for entry in data_by_filename['TxAgent-T1-Llama-3.1-8B'] if preprocess_question_id(entry.get("id")) == q_id),
1049
- None
1050
- )
1051
- other_model_matched_entry = next(
1052
- (entry for entry in data_by_filename[other_model_name] if preprocess_question_id(entry.get("id")) == q_id),
1053
- None
1054
- )
1055
-
1056
- models_list = [
1057
- {
1058
- "model": 'TxAgent-T1-Llama-3.1-8B',
1059
- "reasoning_trace": txagent_matched_entry.get("solution")
1060
- },
1061
- {
1062
- "model": other_model_name,
1063
- "reasoning_trace": other_model_matched_entry.get("solution")
1064
- }
1065
- ]
1066
- random.shuffle(models_list)
1067
-
1068
- question_for_eval = {
1069
- "question": txagent_matched_entry.get("question"),
1070
- "id": q_id,
1071
- "models": models_list,
1072
- }
1073
-
1074
- chat_A_value = format_chat(question_for_eval['models'][0]['reasoning_trace'], tool_database_labels)
1075
- chat_B_value = format_chat(question_for_eval['models'][1]['reasoning_trace'], tool_database_labels)
1076
- prompt_text = question_for_eval['question']
1077
-
1078
- # Construct the question-specific elements of the pairwise rating page (page 1)
1079
- page1_prompt = gr.HTML(f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Prompt:</strong> {prompt_text}</div>')
1080
- page1_reference_answer = gr.Markdown(txagent_matched_entry.get("correct_answer"))
1081
- chat_a = gr.Chatbot(
1082
- value=chat_A_value,
1083
- type="messages",
1084
- height=400,
1085
- label="Model A Response",
1086
- show_copy_button=False,
1087
- show_label=True,
1088
- render_markdown=True, # Required for markdown/HTML support in messages
1089
- avatar_images=None, # Optional: omit user/assistant icons
1090
- rtl=False
1091
- )
1092
- chat_b = gr.Chatbot(
1093
- value=chat_B_value,
1094
- type="messages",
1095
- height=400,
1096
- label="Model B Response",
1097
- show_copy_button=False,
1098
- show_label=True,
1099
- render_markdown=True, # Required for markdown/HTML support in messages
1100
- avatar_images=None, # Optional: omit user/assistant icons
1101
- rtl=False
1102
  )
1103
-
1104
- # Success with remaining questions
1105
  return (
1106
  gr.update(visible=False), # page0 (Hide)
1107
  gr.update(visible=False), # page2 (Hide)
@@ -1114,8 +1064,10 @@ with gr.Blocks(css=centered_col_css) as demo:
1114
  chat_b,
1115
  page1_prompt,
1116
  page1_reference_answer,
1117
- question_for_eval)
1118
-
 
 
1119
  def cancel_submission():
1120
  # Cancel final submission: just hide the confirmation modal.
1121
  return gr.update(visible=False)
@@ -1131,6 +1083,9 @@ with gr.Blocks(css=centered_col_css) as demo:
1131
  reset_ratings_B = [gr.update(value=None) for i in range(len(criteria))]
1132
 
1133
  return (
 
 
 
1134
 
1135
  # states
1136
  # gr.update(value=None), # user_info_state
@@ -1251,11 +1206,6 @@ with gr.Blocks(css=centered_col_css) as demo:
1251
  )
1252
 
1253
  # Finalize submission if user confirms.
1254
- # yes_btn.click(
1255
- # fn=final_submit,
1256
- # inputs=[data_subset_state, user_info_state, pairwise_state, comparison_reasons, *ratings_A, *ratings_B],
1257
- # outputs=[page2, final_page, confirm_modal]
1258
- # )
1259
  question_submission_event = yes_btn.click(
1260
  fn=final_submit,
1261
  inputs=[data_subset_state, user_info_state, pairwise_state, comparison_reasons, nonsense_btn_clicked, *ratings_A, *ratings_B],
@@ -1271,7 +1221,8 @@ with gr.Blocks(css=centered_col_css) as demo:
1271
  chat_b,
1272
  page1_prompt,
1273
  page1_reference_answer,
1274
- data_subset_state
 
1275
  ],
1276
  scroll_to_output=True
1277
  )
 
319
  for (q_id, our_model, other_model) in full_question_ids_list
320
  if (q_id, our_model, other_model) not in matched_pairs
321
  ]
 
322
  print(f"Length of filtered question IDs: {len(full_question_ids_list)}")
323
+
324
 
325
  return full_question_ids_list, data_by_filename
326
 
327
+ def get_next_eval_question(
328
+ name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, evaluator_id,
329
+ our_methods,
330
+ return_user_info=True, # Whether to return user_info tuple
331
+ include_correct_answer=True # Whether to return correct_answer
332
+ ):
333
  # ADDED: Validate that name and email are non-empty before proceeding
334
  if not name or not email or not evaluator_id or not specialty_dd or not years_exp_radio:
335
  return gr.update(visible=True), gr.update(visible=False), None, "Please fill out all the required fields (name, email, evaluator ID, specialty, years of experience). If you are not a licensed physician with a specific specialty, please choose the specialty that most closely aligns with your biomedical expertise.", gr.Chatbot(), gr.Chatbot(), gr.HTML(),gr.Markdown(),gr.State(),gr.update(visible=False), ""
 
358
  token = os.getenv("HF_TOKEN")
359
  )
360
 
361
+ # Get available questions for the evaluator
362
+ full_question_ids_list, data_by_filename = get_evaluator_questions(
363
+ evaluator_id, all_files, evaluator_directory, our_methods)
364
 
365
  if len(full_question_ids_list) == 0:
366
+ return None, None, None, None, None, None, 0
367
 
368
  full_question_ids_list = sorted(full_question_ids_list, key=lambda x: str(x[0])+str(x[1]))
369
  #selected question is the first element
370
+ q_id, our_model_name, other_model_name = full_question_ids_list[0]
371
+ print("Selected question ID:", q_id)
372
 
373
+ # Build model answer lists
374
+ models_list = []
375
+
376
+
377
  txagent_matched_entry = next(
378
+ (entry for entry in data_by_filename[our_model_name] if preprocess_question_id(entry.get("id")) == q_id),
379
  None
380
  )
381
+ our_model = {
382
+ "model": our_model_name,
383
+ "reasoning_trace": txagent_matched_entry.get("solution")
384
+ }
385
  other_model_matched_entry = next(
386
  (entry for entry in data_by_filename[other_model_name] if preprocess_question_id(entry.get("id")) == q_id),
387
  None
388
  )
389
+ compared_model = {
 
 
 
 
 
 
390
  "model": other_model_name,
391
  "reasoning_trace": other_model_matched_entry.get("solution")
392
  }
393
+
394
+ models_list = [our_model, compared_model]
395
  random.shuffle(models_list)
396
 
397
  question_for_eval = {
398
  "question": txagent_matched_entry.get("question"),
 
399
  "id": q_id,
400
  "models": models_list,
401
  }
402
+ if include_correct_answer:
403
+ question_for_eval["correct_answer"] = txagent_matched_entry.get("correct_answer")
404
 
405
+ # Prepare Gradio components
 
406
  chat_A_value = format_chat(question_for_eval['models'][0]['reasoning_trace'], tool_database_labels)
407
  chat_B_value = format_chat(question_for_eval['models'][1]['reasoning_trace'], tool_database_labels)
408
  prompt_text = question_for_eval['question']
409
 
 
410
  page1_prompt = gr.HTML(f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Prompt:</strong> {prompt_text}</div>')
411
+ page1_reference_answer = gr.Markdown(txagent_matched_entry.get("correct_answer")) if include_correct_answer else None
412
  chat_a = gr.Chatbot(
413
+ value=chat_A_value,
414
+ type="messages",
415
+ height=400,
416
+ label="Model A Response",
417
+ show_copy_button=False,
418
+ show_label=True,
419
+ render_markdown=True,
420
+ avatar_images=None,
421
+ rtl=False
422
+ )
423
  chat_b = gr.Chatbot(
424
+ value=chat_B_value,
425
+ type="messages",
426
+ height=400,
427
+ label="Model B Response",
428
+ show_copy_button=False,
429
+ show_label=True,
430
+ render_markdown=True,
431
+ avatar_images=None,
432
+ rtl=False
433
+ )
434
+
435
+ user_info = (name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, q_id, evaluator_id) if return_user_info else None
436
+ return user_info, chat_a, chat_b, page1_prompt, page1_reference_answer, question_for_eval, len(full_question_ids_list)
437
+
438
def go_to_page0_from_minus1():
    """Navigate from page -1 back to page 0.

    Returns visibility updates for (page_minus1, page0): hide the
    current page and reveal the registration page.
    """
    hide_current_page = gr.update(visible=False)
    show_page0 = gr.update(visible=True)
    return hide_current_page, show_page0
440
+
441
def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id):
    """Validate the registration form, fetch the evaluator's next question,
    and open the evaluation-progress modal.

    Returns an 11-tuple of Gradio updates/values wired positionally to the
    registration page, the error banner, the page-1 chat windows/prompt/
    reference answer, the question state, the progress modal, and its text.
    """

    # ADDED: Validate that name and email are non-empty before proceeding
    # (subspecialty, experience explanation, and NPI ID remain optional).
    if not name or not email or not evaluator_id or not specialty_dd or not years_exp_radio:
        return gr.update(visible=True), gr.update(visible=False), None, "Please fill out all the required fields (name, email, evaluator ID, specialty, years of experience). If you are not a licensed physician with a specific specialty, please choose the specialty that most closely aligns with your biomedical expertise.", gr.Chatbot(), gr.Chatbot(), gr.HTML(),gr.Markdown(),gr.State(),gr.update(visible=False), ""

    # Pull the next unevaluated question plus the ready-made page-1 components.
    # NOTE(review): `our_methods` is read from module scope here — confirm it
    # is defined before this handler can fire.
    user_info, chat_a, chat_b, page1_prompt, page1_reference_answer, question_for_eval, remaining_count = get_next_eval_question(
        name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, evaluator_id, our_methods
    )
    # No questions left: keep the registration page visible and show a
    # completion message instead of opening the progress modal.
    if remaining_count == 0:
        return gr.update(visible=True), gr.update(visible=False), None, "Based on your submitted data, you have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!", gr.Chatbot(), gr.Chatbot(), gr.HTML(),gr.Markdown(),gr.State(),gr.update(visible=False),""
    # Success: populate page-1 components and reveal the progress modal.
    return gr.update(visible=True), gr.update(visible=False), user_info,"", chat_a, chat_b, page1_prompt, page1_reference_answer, question_for_eval, gr.update(visible=True), f"You are about to evaluate the next question. You have {remaining_count} question(s) remaining to evaluate."
453
 
454
  #goes to page 1 from confirmation modal that tells users how many questions they have left to evaluate
455
  def go_to_page1():
 
477
  pairwise_results_for_display = [gr.Markdown(f"***As a reminder, your pairwise comparison answer for this criterion was: {pairwise_list[i]}. Your answer choices will be restricted based on your comparison answer, but you may go back and change the comparison answer if you wish.***") for i in range(len(criteria))]
478
 
479
  if any(answer is None for answer in pairwise_list):
480
+ return (gr.update(visible=True), gr.update(visible=False), None, None, "Error: Please select an option for every pairwise comparison.", gr.Chatbot(), gr.Chatbot(), gr.HTML(), gr.Markdown()) + tuple(pairwise_results_for_display)
481
 
482
  chat_A_value = format_chat(data_subset_state['models'][0]['reasoning_trace'], tool_database_labels)
483
  chat_B_value = format_chat(data_subset_state['models'][1]['reasoning_trace'], tool_database_labels)
 
505
  page2_prompt = gr.HTML(f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Prompt:</strong> {prompt_text}</div>')
506
  page2_reference_answer = gr.Markdown(data_subset_state['correct_answer'])
507
 
508
+ return (gr.update(visible=False), gr.update(visible=True), pairwise_list, comparison_reasons_list, "", chat_A_rating, chat_B_rating, page2_prompt, page2_reference_answer) + tuple(pairwise_results_for_display)
509
 
510
 
511
  # Callback to store scores for Response A.
 
566
def mark_invalid_question(btn_clicked_status):
    """Toggle the 'question is invalid' flag and relabel its button.

    Args:
        btn_clicked_status: Current boolean flag state.

    Returns:
        (new_status, button_update): the flipped flag, and a gr.update that
        swaps the button between the "undo" (primary) and "report" (stop)
        presentations.
    """
    new_status = not btn_clicked_status
    # Use truthiness instead of `== True` (PEP 8: never compare to True).
    if new_status:
        # Question is now flagged invalid; offer an undo action.
        return new_status, gr.update(value="Undo: Correct Question", variant="primary")
    else:
        # Flag cleared; restore the red report button.
        return new_status, gr.update(value="Wrong Question",variant="stop")
572
 
573
  centered_col_css = """
574
  #centered-column {
 
582
  color: white !important;
583
  border-color: purple !important;
584
  }
585
+ #answer-reference-btn {
586
+ background-color: #E6E6FA !important;
587
+ color: white !important;
588
+ border-color: #E6E6FA !important;
589
+ }
590
  #clear_btn {
591
  background-color: #F08080 !important;
592
  color: white !important;
593
  border-color: #F08080 !important;
594
  }
595
+ .reference-box {
 
 
596
  border: 1px solid #ccc;
597
  padding: 10px;
598
  border-radius: 5px;
599
  }
600
+ .short-btn { min-width: 80px !important; max-width: 120px !important; width: 100px !important; padding-left: 4px !important; padding-right: 4px !important; }
601
+ .light-stop-btn { background-color: #ffcccc !important; color: #b30000 !important; border-color: #ffcccc !important; }
602
+
603
  """
604
  with gr.Blocks(css=centered_col_css) as demo:
605
  # States to save information between pages.
 
703
  home_btn_0 = gr.Button("Home (your registration info will be saved)")
704
 
705
 
706
+ with Modal(visible=False, elem_id="confirm_modal") as eval_progress_modal:
707
  eval_progress_text = gr.Markdown("You have X questions remaining.")
708
  eval_progress_proceed_btn = gr.Button("OK, proceed to question evaluation")
709
 
 
712
  gr.Markdown("## Part 1/2: Pairwise Comparison") #Make the number controlled by question indexing!
713
  page1_prompt = gr.HTML()
714
 
715
+ with gr.Accordion("Click to reveal a reference answer—this is just one correct solution; others are possible.", open=False, elem_id="answer-reference-btn"):
716
+ page1_reference_answer = gr.Markdown(
717
+ """
718
+ **Reference Answer:**
719
+
720
+ This is the reference answer content.
721
+ """,
722
+ elem_classes="reference-box"
723
+ )
724
+
725
+
726
  # Add small red button under the prompt
727
+ with gr.Row():
728
+ nonsense_btn = gr.Button(
729
+ "Wrong Question?",
730
+ size="sm",
731
+ variant="stop", # red variant
732
+ elem_id="invalid-question-btn",
733
+ elem_classes=["short-btn"]
734
+ )
735
+ gr.Markdown(
736
+ "<span style='color: #b30000; font-weight: bold;'>Click the button if you think this question does not make sense or is not biomedically-relevant</span>",
737
+ render=True
738
+ )
739
 
740
  nonsense_btn.click(
741
  fn=mark_invalid_question,
 
744
  queue=False,
745
  )
746
 
 
 
 
 
 
 
 
 
747
  with gr.Row():
748
  # ADDED: Use gr.Chatbot to display the scrollable chat window for Response A.
749
  with gr.Column():
 
807
  gr.Markdown("## Part 2/2: Rate Model Responses")
808
  # ### EDIT: Show a highlighted prompt as on previous pages.
809
  page2_prompt = gr.HTML()
810
+ with gr.Accordion("Click to reveal a reference answer—this is just one correct solution; others are possible.", open=False, elem_id="answer-reference-btn"):
811
  page2_reference_answer = gr.Markdown(
812
  """
813
  **Reference Answer:**
814
 
815
  This is the reference answer content.
816
+ """,
817
+ elem_classes="reference-box"
818
  )
819
  # ### EDIT: Display both responses side-by-side using Chatbot windows.
820
  with gr.Row():
 
1030
  def final_submit(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args):
1031
  # --- Part 1: Submit the current results (Existing Logic) ---
1032
  row_dict = build_row_dict(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args)
 
1033
  append_to_sheet(user_data=None, custom_row_dict=row_dict, custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME + f"_{evaluator_id}"), add_header_when_create_sheet=True)
1034
+ name, email, specialty, subspecialty, years_exp_radio, exp_explanation_tb, npi_id, _, evaluator_id = user_info
1035
+ user_info_new, chat_a, chat_b, page1_prompt, page1_reference_answer, question_for_eval, remaining_count = get_next_eval_question(
1036
+ name, email, specialty, subspecialty, years_exp_radio, exp_explanation_tb, npi_id, our_methods
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1037
  )
1038
 
 
 
 
 
1039
  if remaining_count == 0:
 
1040
  return (
1041
  gr.update(visible=False), # page0 (Hide)
1042
  gr.update(visible=False), # page2 (Hide)
 
1049
  None,
1050
  None,
1051
  None,
1052
+ None,
1053
+ user_info_new,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1054
  )
 
 
1055
  return (
1056
  gr.update(visible=False), # page0 (Hide)
1057
  gr.update(visible=False), # page2 (Hide)
 
1064
  chat_b,
1065
  page1_prompt,
1066
  page1_reference_answer,
1067
+ question_for_eval,
1068
+ user_info_new
1069
+ )
1070
+
1071
  def cancel_submission():
1072
  # Cancel final submission: just hide the confirmation modal.
1073
  return gr.update(visible=False)
 
1083
  reset_ratings_B = [gr.update(value=None) for i in range(len(criteria))]
1084
 
1085
  return (
1086
+ # pages
1087
+ gr.update(visible=True), # page0
1088
+ gr.update(visible=False), # final_page
1089
 
1090
  # states
1091
  # gr.update(value=None), # user_info_state
 
1206
  )
1207
 
1208
  # Finalize submission if user confirms.
 
 
 
 
 
1209
  question_submission_event = yes_btn.click(
1210
  fn=final_submit,
1211
  inputs=[data_subset_state, user_info_state, pairwise_state, comparison_reasons, nonsense_btn_clicked, *ratings_A, *ratings_B],
 
1221
  chat_b,
1222
  page1_prompt,
1223
  page1_reference_answer,
1224
+ data_subset_state,
1225
+ user_info_state,
1226
  ],
1227
  scroll_to_output=True
1228
  )