Spaces:
Sleeping
Sleeping
update
Browse files
app.py
CHANGED
|
@@ -319,16 +319,17 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
|
|
| 319 |
for (q_id, our_model, other_model) in full_question_ids_list
|
| 320 |
if (q_id, our_model, other_model) not in matched_pairs
|
| 321 |
]
|
| 322 |
-
print(f"Filtered question IDs: {full_question_ids_list}")
|
| 323 |
print(f"Length of filtered question IDs: {len(full_question_ids_list)}")
|
|
|
|
| 324 |
|
| 325 |
return full_question_ids_list, data_by_filename
|
| 326 |
|
| 327 |
-
def
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
|
|
|
| 332 |
# ADDED: Validate that name and email are non-empty before proceeding
|
| 333 |
if not name or not email or not evaluator_id or not specialty_dd or not years_exp_radio:
|
| 334 |
return gr.update(visible=True), gr.update(visible=False), None, "Please fill out all the required fields (name, email, evaluator ID, specialty, years of experience). If you are not a licensed physician with a specific specialty, please choose the specialty that most closely aligns with your biomedical expertise.", gr.Chatbot(), gr.Chatbot(), gr.HTML(),gr.Markdown(),gr.State(),gr.update(visible=False), ""
|
|
@@ -357,76 +358,98 @@ def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecia
|
|
| 357 |
token = os.getenv("HF_TOKEN")
|
| 358 |
)
|
| 359 |
|
| 360 |
-
|
|
|
|
|
|
|
| 361 |
|
| 362 |
if len(full_question_ids_list) == 0:
|
| 363 |
-
return
|
| 364 |
|
| 365 |
full_question_ids_list = sorted(full_question_ids_list, key=lambda x: str(x[0])+str(x[1]))
|
| 366 |
#selected question is the first element
|
| 367 |
-
q_id, other_model_name = full_question_ids_list[0]
|
|
|
|
| 368 |
|
| 369 |
-
#
|
|
|
|
|
|
|
|
|
|
| 370 |
txagent_matched_entry = next(
|
| 371 |
-
(entry for entry in data_by_filename[
|
| 372 |
None
|
| 373 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
other_model_matched_entry = next(
|
| 375 |
(entry for entry in data_by_filename[other_model_name] if preprocess_question_id(entry.get("id")) == q_id),
|
| 376 |
None
|
| 377 |
)
|
| 378 |
-
|
| 379 |
-
models_list = [
|
| 380 |
-
{
|
| 381 |
-
"model": 'TxAgent-T1-Llama-3.1-8B',
|
| 382 |
-
"reasoning_trace": txagent_matched_entry.get("solution")
|
| 383 |
-
},
|
| 384 |
-
{
|
| 385 |
"model": other_model_name,
|
| 386 |
"reasoning_trace": other_model_matched_entry.get("solution")
|
| 387 |
}
|
| 388 |
-
|
|
|
|
| 389 |
random.shuffle(models_list)
|
| 390 |
|
| 391 |
question_for_eval = {
|
| 392 |
"question": txagent_matched_entry.get("question"),
|
| 393 |
-
"correct_answer": txagent_matched_entry.get("correct_answer"),
|
| 394 |
"id": q_id,
|
| 395 |
"models": models_list,
|
| 396 |
}
|
|
|
|
|
|
|
| 397 |
|
| 398 |
-
#
|
| 399 |
-
user_info = (name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, q_id, evaluator_id)
|
| 400 |
chat_A_value = format_chat(question_for_eval['models'][0]['reasoning_trace'], tool_database_labels)
|
| 401 |
chat_B_value = format_chat(question_for_eval['models'][1]['reasoning_trace'], tool_database_labels)
|
| 402 |
prompt_text = question_for_eval['question']
|
| 403 |
|
| 404 |
-
# Construct the question-specific elements of the pairwise rating page (page 1)
|
| 405 |
page1_prompt = gr.HTML(f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Prompt:</strong> {prompt_text}</div>')
|
| 406 |
-
page1_reference_answer = gr.Markdown(txagent_matched_entry.get("correct_answer"))
|
| 407 |
chat_a = gr.Chatbot(
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
chat_b = gr.Chatbot(
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 430 |
|
| 431 |
#goes to page 1 from confirmation modal that tells users how many questions they have left to evaluate
|
| 432 |
def go_to_page1():
|
|
@@ -454,7 +477,7 @@ def go_to_page2(data_subset_state,*pairwise_values):
|
|
| 454 |
pairwise_results_for_display = [gr.Markdown(f"***As a reminder, your pairwise comparison answer for this criterion was: {pairwise_list[i]}. Your answer choices will be restricted based on your comparison answer, but you may go back and change the comparison answer if you wish.***") for i in range(len(criteria))]
|
| 455 |
|
| 456 |
if any(answer is None for answer in pairwise_list):
|
| 457 |
-
return gr.update(visible=True), gr.update(visible=False), None, None, "Error: Please select an option for every pairwise comparison.", gr.Chatbot(), gr.Chatbot(), gr.HTML(), gr.Markdown()
|
| 458 |
|
| 459 |
chat_A_value = format_chat(data_subset_state['models'][0]['reasoning_trace'], tool_database_labels)
|
| 460 |
chat_B_value = format_chat(data_subset_state['models'][1]['reasoning_trace'], tool_database_labels)
|
|
@@ -482,7 +505,7 @@ def go_to_page2(data_subset_state,*pairwise_values):
|
|
| 482 |
page2_prompt = gr.HTML(f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Prompt:</strong> {prompt_text}</div>')
|
| 483 |
page2_reference_answer = gr.Markdown(data_subset_state['correct_answer'])
|
| 484 |
|
| 485 |
-
return gr.update(visible=False), gr.update(visible=True), pairwise_list, comparison_reasons_list, "", chat_A_rating, chat_B_rating, page2_prompt, page2_reference_answer
|
| 486 |
|
| 487 |
|
| 488 |
# Callback to store scores for Response A.
|
|
@@ -543,9 +566,9 @@ def toggle_reference(selection):
|
|
| 543 |
def mark_invalid_question(btn_clicked_status):
|
| 544 |
new_status = not btn_clicked_status
|
| 545 |
if new_status == True:
|
| 546 |
-
return new_status, gr.update(value="Undo
|
| 547 |
else:
|
| 548 |
-
return new_status, gr.update(value="
|
| 549 |
|
| 550 |
centered_col_css = """
|
| 551 |
#centered-column {
|
|
@@ -559,18 +582,24 @@ centered_col_css = """
|
|
| 559 |
color: white !important;
|
| 560 |
border-color: purple !important;
|
| 561 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 562 |
#clear_btn {
|
| 563 |
background-color: #F08080 !important;
|
| 564 |
color: white !important;
|
| 565 |
border-color: #F08080 !important;
|
| 566 |
}
|
| 567 |
-
|
| 568 |
-
background-color: #9191FF !important;
|
| 569 |
-
color: white !important;
|
| 570 |
border: 1px solid #ccc;
|
| 571 |
padding: 10px;
|
| 572 |
border-radius: 5px;
|
| 573 |
}
|
|
|
|
|
|
|
|
|
|
| 574 |
"""
|
| 575 |
with gr.Blocks(css=centered_col_css) as demo:
|
| 576 |
# States to save information between pages.
|
|
@@ -674,7 +703,7 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 674 |
home_btn_0 = gr.Button("Home (your registration info will be saved)")
|
| 675 |
|
| 676 |
|
| 677 |
-
with Modal(visible=False) as eval_progress_modal:
|
| 678 |
eval_progress_text = gr.Markdown("You have X questions remaining.")
|
| 679 |
eval_progress_proceed_btn = gr.Button("OK, proceed to question evaluation")
|
| 680 |
|
|
@@ -683,13 +712,30 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 683 |
gr.Markdown("## Part 1/2: Pairwise Comparison") #Make the number controlled by question indexing!
|
| 684 |
page1_prompt = gr.HTML()
|
| 685 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
# Add small red button under the prompt
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 693 |
|
| 694 |
nonsense_btn.click(
|
| 695 |
fn=mark_invalid_question,
|
|
@@ -698,14 +744,6 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 698 |
queue=False,
|
| 699 |
)
|
| 700 |
|
| 701 |
-
with gr.Accordion("Show reference answer (this is ONE correct answer, but there may be multiple)", open=False, elem_id="reference-box"):
|
| 702 |
-
page1_reference_answer = gr.Markdown(
|
| 703 |
-
"""
|
| 704 |
-
**Reference Answer:**
|
| 705 |
-
|
| 706 |
-
This is the reference answer content.
|
| 707 |
-
"""
|
| 708 |
-
)
|
| 709 |
with gr.Row():
|
| 710 |
# ADDED: Use gr.Chatbot to display the scrollable chat window for Response A.
|
| 711 |
with gr.Column():
|
|
@@ -769,13 +807,14 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 769 |
gr.Markdown("## Part 2/2: Rate Model Responses")
|
| 770 |
# ### EDIT: Show a highlighted prompt as on previous pages.
|
| 771 |
page2_prompt = gr.HTML()
|
| 772 |
-
with gr.Accordion("
|
| 773 |
page2_reference_answer = gr.Markdown(
|
| 774 |
"""
|
| 775 |
**Reference Answer:**
|
| 776 |
|
| 777 |
This is the reference answer content.
|
| 778 |
-
"""
|
|
|
|
| 779 |
)
|
| 780 |
# ### EDIT: Display both responses side-by-side using Chatbot windows.
|
| 781 |
with gr.Row():
|
|
@@ -991,39 +1030,13 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 991 |
def final_submit(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args):
|
| 992 |
# --- Part 1: Submit the current results (Existing Logic) ---
|
| 993 |
row_dict = build_row_dict(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args)
|
| 994 |
-
_, _, _, _, _, _, _, _, evaluator_id = user_info
|
| 995 |
append_to_sheet(user_data=None, custom_row_dict=row_dict, custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME + f"_{evaluator_id}"), add_header_when_create_sheet=True)
|
| 996 |
-
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
# --- Re-fetch data and filter questions (Same logic as before) ---
|
| 1001 |
-
question_map_path = hf_hub_download(
|
| 1002 |
-
repo_id=REPO_ID,
|
| 1003 |
-
filename=EVALUATOR_MAP_DICT,
|
| 1004 |
-
repo_type="dataset", # or omit if it's a Model/Space
|
| 1005 |
-
# force_download=True, # ← always fetch new copy
|
| 1006 |
-
revision="main", # branch/tag/commit, fetches the most recent version of the dataset each time this command is called
|
| 1007 |
-
token = os.getenv("HF_TOKEN")
|
| 1008 |
-
)
|
| 1009 |
-
|
| 1010 |
-
with open(question_map_path, 'r') as f:
|
| 1011 |
-
question_map = json.load(f)
|
| 1012 |
-
|
| 1013 |
-
evaluator_directory = question_map.get(evaluator_id, None)
|
| 1014 |
-
all_files = list_repo_files(
|
| 1015 |
-
repo_id=REPO_ID,
|
| 1016 |
-
repo_type="dataset",
|
| 1017 |
-
revision="main",
|
| 1018 |
-
token = os.getenv("HF_TOKEN")
|
| 1019 |
)
|
| 1020 |
|
| 1021 |
-
full_question_ids_list, data_by_filename = get_evaluator_questions(evaluator_id, all_files, evaluator_directory)
|
| 1022 |
-
remaining_count = len(full_question_ids_list)
|
| 1023 |
-
|
| 1024 |
-
# --- Part 3: Determine UI updates based on remaining count ---
|
| 1025 |
if remaining_count == 0:
|
| 1026 |
-
# Success with NO remaining questions
|
| 1027 |
return (
|
| 1028 |
gr.update(visible=False), # page0 (Hide)
|
| 1029 |
gr.update(visible=False), # page2 (Hide)
|
|
@@ -1036,72 +1049,9 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 1036 |
None,
|
| 1037 |
None,
|
| 1038 |
None,
|
| 1039 |
-
None
|
| 1040 |
-
|
| 1041 |
-
|
| 1042 |
-
full_question_ids_list = sorted(full_question_ids_list, key=lambda x: str(x[0])+str(x[1]))
|
| 1043 |
-
#selected question is the first element
|
| 1044 |
-
q_id, other_model_name = full_question_ids_list[0]
|
| 1045 |
-
|
| 1046 |
-
#Constructing question_for_eval, the question to evaluate this round
|
| 1047 |
-
txagent_matched_entry = next(
|
| 1048 |
-
(entry for entry in data_by_filename['TxAgent-T1-Llama-3.1-8B'] if preprocess_question_id(entry.get("id")) == q_id),
|
| 1049 |
-
None
|
| 1050 |
-
)
|
| 1051 |
-
other_model_matched_entry = next(
|
| 1052 |
-
(entry for entry in data_by_filename[other_model_name] if preprocess_question_id(entry.get("id")) == q_id),
|
| 1053 |
-
None
|
| 1054 |
-
)
|
| 1055 |
-
|
| 1056 |
-
models_list = [
|
| 1057 |
-
{
|
| 1058 |
-
"model": 'TxAgent-T1-Llama-3.1-8B',
|
| 1059 |
-
"reasoning_trace": txagent_matched_entry.get("solution")
|
| 1060 |
-
},
|
| 1061 |
-
{
|
| 1062 |
-
"model": other_model_name,
|
| 1063 |
-
"reasoning_trace": other_model_matched_entry.get("solution")
|
| 1064 |
-
}
|
| 1065 |
-
]
|
| 1066 |
-
random.shuffle(models_list)
|
| 1067 |
-
|
| 1068 |
-
question_for_eval = {
|
| 1069 |
-
"question": txagent_matched_entry.get("question"),
|
| 1070 |
-
"id": q_id,
|
| 1071 |
-
"models": models_list,
|
| 1072 |
-
}
|
| 1073 |
-
|
| 1074 |
-
chat_A_value = format_chat(question_for_eval['models'][0]['reasoning_trace'], tool_database_labels)
|
| 1075 |
-
chat_B_value = format_chat(question_for_eval['models'][1]['reasoning_trace'], tool_database_labels)
|
| 1076 |
-
prompt_text = question_for_eval['question']
|
| 1077 |
-
|
| 1078 |
-
# Construct the question-specific elements of the pairwise rating page (page 1)
|
| 1079 |
-
page1_prompt = gr.HTML(f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Prompt:</strong> {prompt_text}</div>')
|
| 1080 |
-
page1_reference_answer = gr.Markdown(txagent_matched_entry.get("correct_answer"))
|
| 1081 |
-
chat_a = gr.Chatbot(
|
| 1082 |
-
value=chat_A_value,
|
| 1083 |
-
type="messages",
|
| 1084 |
-
height=400,
|
| 1085 |
-
label="Model A Response",
|
| 1086 |
-
show_copy_button=False,
|
| 1087 |
-
show_label=True,
|
| 1088 |
-
render_markdown=True, # Required for markdown/HTML support in messages
|
| 1089 |
-
avatar_images=None, # Optional: omit user/assistant icons
|
| 1090 |
-
rtl=False
|
| 1091 |
-
)
|
| 1092 |
-
chat_b = gr.Chatbot(
|
| 1093 |
-
value=chat_B_value,
|
| 1094 |
-
type="messages",
|
| 1095 |
-
height=400,
|
| 1096 |
-
label="Model B Response",
|
| 1097 |
-
show_copy_button=False,
|
| 1098 |
-
show_label=True,
|
| 1099 |
-
render_markdown=True, # Required for markdown/HTML support in messages
|
| 1100 |
-
avatar_images=None, # Optional: omit user/assistant icons
|
| 1101 |
-
rtl=False
|
| 1102 |
)
|
| 1103 |
-
|
| 1104 |
-
# Success with remaining questions
|
| 1105 |
return (
|
| 1106 |
gr.update(visible=False), # page0 (Hide)
|
| 1107 |
gr.update(visible=False), # page2 (Hide)
|
|
@@ -1114,8 +1064,10 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 1114 |
chat_b,
|
| 1115 |
page1_prompt,
|
| 1116 |
page1_reference_answer,
|
| 1117 |
-
question_for_eval
|
| 1118 |
-
|
|
|
|
|
|
|
| 1119 |
def cancel_submission():
|
| 1120 |
# Cancel final submission: just hide the confirmation modal.
|
| 1121 |
return gr.update(visible=False)
|
|
@@ -1131,6 +1083,9 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 1131 |
reset_ratings_B = [gr.update(value=None) for i in range(len(criteria))]
|
| 1132 |
|
| 1133 |
return (
|
|
|
|
|
|
|
|
|
|
| 1134 |
|
| 1135 |
# states
|
| 1136 |
# gr.update(value=None), # user_info_state
|
|
@@ -1251,11 +1206,6 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 1251 |
)
|
| 1252 |
|
| 1253 |
# Finalize submission if user confirms.
|
| 1254 |
-
# yes_btn.click(
|
| 1255 |
-
# fn=final_submit,
|
| 1256 |
-
# inputs=[data_subset_state, user_info_state, pairwise_state, comparison_reasons, *ratings_A, *ratings_B],
|
| 1257 |
-
# outputs=[page2, final_page, confirm_modal]
|
| 1258 |
-
# )
|
| 1259 |
question_submission_event = yes_btn.click(
|
| 1260 |
fn=final_submit,
|
| 1261 |
inputs=[data_subset_state, user_info_state, pairwise_state, comparison_reasons, nonsense_btn_clicked, *ratings_A, *ratings_B],
|
|
@@ -1271,7 +1221,8 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 1271 |
chat_b,
|
| 1272 |
page1_prompt,
|
| 1273 |
page1_reference_answer,
|
| 1274 |
-
data_subset_state
|
|
|
|
| 1275 |
],
|
| 1276 |
scroll_to_output=True
|
| 1277 |
)
|
|
|
|
| 319 |
for (q_id, our_model, other_model) in full_question_ids_list
|
| 320 |
if (q_id, our_model, other_model) not in matched_pairs
|
| 321 |
]
|
|
|
|
| 322 |
print(f"Length of filtered question IDs: {len(full_question_ids_list)}")
|
| 323 |
+
|
| 324 |
|
| 325 |
return full_question_ids_list, data_by_filename
|
| 326 |
|
| 327 |
+
def get_next_eval_question(
|
| 328 |
+
name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, evaluator_id,
|
| 329 |
+
our_methods,
|
| 330 |
+
return_user_info=True, # Whether to return user_info tuple
|
| 331 |
+
include_correct_answer=True # Whether to return correct_answer
|
| 332 |
+
):
|
| 333 |
# ADDED: Validate that name and email are non-empty before proceeding
|
| 334 |
if not name or not email or not evaluator_id or not specialty_dd or not years_exp_radio:
|
| 335 |
return gr.update(visible=True), gr.update(visible=False), None, "Please fill out all the required fields (name, email, evaluator ID, specialty, years of experience). If you are not a licensed physician with a specific specialty, please choose the specialty that most closely aligns with your biomedical expertise.", gr.Chatbot(), gr.Chatbot(), gr.HTML(),gr.Markdown(),gr.State(),gr.update(visible=False), ""
|
|
|
|
| 358 |
token = os.getenv("HF_TOKEN")
|
| 359 |
)
|
| 360 |
|
| 361 |
+
# Get available questions for the evaluator
|
| 362 |
+
full_question_ids_list, data_by_filename = get_evaluator_questions(
|
| 363 |
+
evaluator_id, all_files, evaluator_directory, our_methods)
|
| 364 |
|
| 365 |
if len(full_question_ids_list) == 0:
|
| 366 |
+
return None, None, None, None, None, None, 0
|
| 367 |
|
| 368 |
full_question_ids_list = sorted(full_question_ids_list, key=lambda x: str(x[0])+str(x[1]))
|
| 369 |
#selected question is the first element
|
| 370 |
+
q_id, our_model_name, other_model_name = full_question_ids_list[0]
|
| 371 |
+
print("Selected question ID:", q_id)
|
| 372 |
|
| 373 |
+
# Build model answer lists
|
| 374 |
+
models_list = []
|
| 375 |
+
|
| 376 |
+
|
| 377 |
txagent_matched_entry = next(
|
| 378 |
+
(entry for entry in data_by_filename[our_model_name] if preprocess_question_id(entry.get("id")) == q_id),
|
| 379 |
None
|
| 380 |
)
|
| 381 |
+
our_model = {
|
| 382 |
+
"model": our_model_name,
|
| 383 |
+
"reasoning_trace": txagent_matched_entry.get("solution")
|
| 384 |
+
}
|
| 385 |
other_model_matched_entry = next(
|
| 386 |
(entry for entry in data_by_filename[other_model_name] if preprocess_question_id(entry.get("id")) == q_id),
|
| 387 |
None
|
| 388 |
)
|
| 389 |
+
compared_model = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 390 |
"model": other_model_name,
|
| 391 |
"reasoning_trace": other_model_matched_entry.get("solution")
|
| 392 |
}
|
| 393 |
+
|
| 394 |
+
models_list = [our_model, compared_model]
|
| 395 |
random.shuffle(models_list)
|
| 396 |
|
| 397 |
question_for_eval = {
|
| 398 |
"question": txagent_matched_entry.get("question"),
|
|
|
|
| 399 |
"id": q_id,
|
| 400 |
"models": models_list,
|
| 401 |
}
|
| 402 |
+
if include_correct_answer:
|
| 403 |
+
question_for_eval["correct_answer"] = txagent_matched_entry.get("correct_answer")
|
| 404 |
|
| 405 |
+
# Prepare Gradio components
|
|
|
|
| 406 |
chat_A_value = format_chat(question_for_eval['models'][0]['reasoning_trace'], tool_database_labels)
|
| 407 |
chat_B_value = format_chat(question_for_eval['models'][1]['reasoning_trace'], tool_database_labels)
|
| 408 |
prompt_text = question_for_eval['question']
|
| 409 |
|
|
|
|
| 410 |
page1_prompt = gr.HTML(f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Prompt:</strong> {prompt_text}</div>')
|
| 411 |
+
page1_reference_answer = gr.Markdown(txagent_matched_entry.get("correct_answer")) if include_correct_answer else None
|
| 412 |
chat_a = gr.Chatbot(
|
| 413 |
+
value=chat_A_value,
|
| 414 |
+
type="messages",
|
| 415 |
+
height=400,
|
| 416 |
+
label="Model A Response",
|
| 417 |
+
show_copy_button=False,
|
| 418 |
+
show_label=True,
|
| 419 |
+
render_markdown=True,
|
| 420 |
+
avatar_images=None,
|
| 421 |
+
rtl=False
|
| 422 |
+
)
|
| 423 |
chat_b = gr.Chatbot(
|
| 424 |
+
value=chat_B_value,
|
| 425 |
+
type="messages",
|
| 426 |
+
height=400,
|
| 427 |
+
label="Model B Response",
|
| 428 |
+
show_copy_button=False,
|
| 429 |
+
show_label=True,
|
| 430 |
+
render_markdown=True,
|
| 431 |
+
avatar_images=None,
|
| 432 |
+
rtl=False
|
| 433 |
+
)
|
| 434 |
+
|
| 435 |
+
user_info = (name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, q_id, evaluator_id) if return_user_info else None
|
| 436 |
+
return user_info, chat_a, chat_b, page1_prompt, page1_reference_answer, question_for_eval, len(full_question_ids_list)
|
| 437 |
+
|
| 438 |
+
def go_to_page0_from_minus1():
    """Return a pair of visibility updates: hide the first output, show the second.

    NOTE(review): judging by the name, the outputs are presumably the
    "page -1" and "page 0" containers; the event wiring is not visible
    here -- confirm against the caller.
    """
    hide_update = gr.update(visible=False)
    show_update = gr.update(visible=True)
    return hide_update, show_update
|
| 440 |
+
|
| 441 |
+
def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id):
    """Validate the registration form and load the evaluator's next question.

    Always returns an 11-item tuple. When a required field is empty, or when
    ``get_next_eval_question`` reports zero remaining questions, the tuple
    carries a user-facing message in position 4, empty placeholder
    components, and a ``visible=False`` update in position 10. Otherwise it
    carries the user info, both chat components, the prompt, the reference
    answer, the question payload, a ``visible=True`` update in position 10
    and a remaining-count message in position 11.

    NOTE(review): positions 10 and 11 presumably drive the evaluation-progress
    modal and its text; the output wiring is not visible here -- confirm.
    """

    def _stay_with_message(message):
        # Shared result for both paths that do not proceed to a question.
        return (
            gr.update(visible=True),
            gr.update(visible=False),
            None,
            message,
            gr.Chatbot(),
            gr.Chatbot(),
            gr.HTML(),
            gr.Markdown(),
            gr.State(),
            gr.update(visible=False),
            "",
        )

    # Name, email, evaluator ID, specialty and years of experience are required;
    # subspecialty, experience explanation and NPI ID are not checked.
    required_fields = (name, email, evaluator_id, specialty_dd, years_exp_radio)
    if not all(required_fields):
        return _stay_with_message(
            "Please fill out all the required fields (name, email, evaluator ID, specialty, years of experience). If you are not a licensed physician with a specific specialty, please choose the specialty that most closely aligns with your biomedical expertise."
        )

    # `our_methods` is not a parameter; it is read from the enclosing scope.
    next_question = get_next_eval_question(
        name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, evaluator_id, our_methods
    )
    (
        user_info,
        chat_a,
        chat_b,
        page1_prompt,
        page1_reference_answer,
        question_for_eval,
        remaining_count,
    ) = next_question

    if remaining_count == 0:
        return _stay_with_message(
            "Based on your submitted data, you have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!"
        )

    progress_message = f"You are about to evaluate the next question. You have {remaining_count} question(s) remaining to evaluate."
    return (
        gr.update(visible=True),
        gr.update(visible=False),
        user_info,
        "",
        chat_a,
        chat_b,
        page1_prompt,
        page1_reference_answer,
        question_for_eval,
        gr.update(visible=True),
        progress_message,
    )
|
| 453 |
|
| 454 |
#goes to page 1 from confirmation modal that tells users how many questions they have left to evaluate
|
| 455 |
def go_to_page1():
|
|
|
|
| 477 |
pairwise_results_for_display = [gr.Markdown(f"***As a reminder, your pairwise comparison answer for this criterion was: {pairwise_list[i]}. Your answer choices will be restricted based on your comparison answer, but you may go back and change the comparison answer if you wish.***") for i in range(len(criteria))]
|
| 478 |
|
| 479 |
if any(answer is None for answer in pairwise_list):
|
| 480 |
+
return (gr.update(visible=True), gr.update(visible=False), None, None, "Error: Please select an option for every pairwise comparison.", gr.Chatbot(), gr.Chatbot(), gr.HTML(), gr.Markdown()) + tuple(pairwise_results_for_display)
|
| 481 |
|
| 482 |
chat_A_value = format_chat(data_subset_state['models'][0]['reasoning_trace'], tool_database_labels)
|
| 483 |
chat_B_value = format_chat(data_subset_state['models'][1]['reasoning_trace'], tool_database_labels)
|
|
|
|
| 505 |
page2_prompt = gr.HTML(f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Prompt:</strong> {prompt_text}</div>')
|
| 506 |
page2_reference_answer = gr.Markdown(data_subset_state['correct_answer'])
|
| 507 |
|
| 508 |
+
return (gr.update(visible=False), gr.update(visible=True), pairwise_list, comparison_reasons_list, "", chat_A_rating, chat_B_rating, page2_prompt, page2_reference_answer) + tuple(pairwise_results_for_display)
|
| 509 |
|
| 510 |
|
| 511 |
# Callback to store scores for Response A.
|
|
|
|
| 566 |
def mark_invalid_question(btn_clicked_status):
    """Toggle the "wrong question" flag and build the matching button update.

    Args:
        btn_clicked_status: Current flag value; truthy means the question is
            currently marked as wrong.

    Returns:
        A ``(new_status, button_update)`` tuple: the negated flag (always a
        ``bool``) and a ``gr.update`` carrying the button's new label and
        variant.
    """
    new_status = not btn_clicked_status
    if new_status:
        # The flag was just set: offer the undo action.
        return new_status, gr.update(value="Undo: Correct Question", variant="primary")
    # The flag was just cleared: show the "mark as wrong" action again.
    # NOTE(review): the button is created with the label "Wrong Question?"
    # (with a question mark) but comes back here as "Wrong Question" --
    # confirm whether the difference is intentional.
    return new_status, gr.update(value="Wrong Question", variant="stop")
|
| 572 |
|
| 573 |
centered_col_css = """
|
| 574 |
#centered-column {
|
|
|
|
| 582 |
color: white !important;
|
| 583 |
border-color: purple !important;
|
| 584 |
}
|
| 585 |
+
#answer-reference-btn {
|
| 586 |
+
background-color: #E6E6FA !important;
|
| 587 |
+
color: white !important;
|
| 588 |
+
border-color: #E6E6FA !important;
|
| 589 |
+
}
|
| 590 |
#clear_btn {
|
| 591 |
background-color: #F08080 !important;
|
| 592 |
color: white !important;
|
| 593 |
border-color: #F08080 !important;
|
| 594 |
}
|
| 595 |
+
.reference-box {
|
|
|
|
|
|
|
| 596 |
border: 1px solid #ccc;
|
| 597 |
padding: 10px;
|
| 598 |
border-radius: 5px;
|
| 599 |
}
|
| 600 |
+
.short-btn { min-width: 80px !important; max-width: 120px !important; width: 100px !important; padding-left: 4px !important; padding-right: 4px !important; }
|
| 601 |
+
.light-stop-btn { background-color: #ffcccc !important; color: #b30000 !important; border-color: #ffcccc !important; }
|
| 602 |
+
|
| 603 |
"""
|
| 604 |
with gr.Blocks(css=centered_col_css) as demo:
|
| 605 |
# States to save information between pages.
|
|
|
|
| 703 |
home_btn_0 = gr.Button("Home (your registration info will be saved)")
|
| 704 |
|
| 705 |
|
| 706 |
+
with Modal(visible=False, elem_id="confirm_modal") as eval_progress_modal:
|
| 707 |
eval_progress_text = gr.Markdown("You have X questions remaining.")
|
| 708 |
eval_progress_proceed_btn = gr.Button("OK, proceed to question evaluation")
|
| 709 |
|
|
|
|
| 712 |
gr.Markdown("## Part 1/2: Pairwise Comparison") #Make the number controlled by question indexing!
|
| 713 |
page1_prompt = gr.HTML()
|
| 714 |
|
| 715 |
+
with gr.Accordion("Click to reveal a reference answer—this is just one correct solution; others are possible.", open=False, elem_id="answer-reference-btn"):
|
| 716 |
+
page1_reference_answer = gr.Markdown(
|
| 717 |
+
"""
|
| 718 |
+
**Reference Answer:**
|
| 719 |
+
|
| 720 |
+
This is the reference answer content.
|
| 721 |
+
""",
|
| 722 |
+
elem_classes="reference-box"
|
| 723 |
+
)
|
| 724 |
+
|
| 725 |
+
|
| 726 |
# Add small red button under the prompt
|
| 727 |
+
with gr.Row():
|
| 728 |
+
nonsense_btn = gr.Button(
|
| 729 |
+
"Wrong Question?",
|
| 730 |
+
size="sm",
|
| 731 |
+
variant="stop", # red variant
|
| 732 |
+
elem_id="invalid-question-btn",
|
| 733 |
+
elem_classes=["short-btn"]
|
| 734 |
+
)
|
| 735 |
+
gr.Markdown(
|
| 736 |
+
"<span style='color: #b30000; font-weight: bold;'>Click the button if you think this question does not make sense or is not biomedically-relevant</span>",
|
| 737 |
+
render=True
|
| 738 |
+
)
|
| 739 |
|
| 740 |
nonsense_btn.click(
|
| 741 |
fn=mark_invalid_question,
|
|
|
|
| 744 |
queue=False,
|
| 745 |
)
|
| 746 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 747 |
with gr.Row():
|
| 748 |
# ADDED: Use gr.Chatbot to display the scrollable chat window for Response A.
|
| 749 |
with gr.Column():
|
|
|
|
| 807 |
gr.Markdown("## Part 2/2: Rate Model Responses")
|
| 808 |
# ### EDIT: Show a highlighted prompt as on previous pages.
|
| 809 |
page2_prompt = gr.HTML()
|
| 810 |
+
with gr.Accordion("Click to reveal a reference answer—this is just one correct solution; others are possible.", open=False, elem_id="answer-reference-btn"):
|
| 811 |
page2_reference_answer = gr.Markdown(
|
| 812 |
"""
|
| 813 |
**Reference Answer:**
|
| 814 |
|
| 815 |
This is the reference answer content.
|
| 816 |
+
""",
|
| 817 |
+
elem_classes="reference-box"
|
| 818 |
)
|
| 819 |
# ### EDIT: Display both responses side-by-side using Chatbot windows.
|
| 820 |
with gr.Row():
|
|
|
|
| 1030 |
def final_submit(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args):
|
| 1031 |
# --- Part 1: Submit the current results (Existing Logic) ---
|
| 1032 |
row_dict = build_row_dict(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args)
|
|
|
|
| 1033 |
append_to_sheet(user_data=None, custom_row_dict=row_dict, custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME + f"_{evaluator_id}"), add_header_when_create_sheet=True)
|
| 1034 |
+
name, email, specialty, subspecialty, years_exp_radio, exp_explanation_tb, npi_id, _, evaluator_id = user_info
|
| 1035 |
+
user_info_new, chat_a, chat_b, page1_prompt, page1_reference_answer, question_for_eval, remaining_count = get_next_eval_question(
|
| 1036 |
+
name, email, specialty, subspecialty, years_exp_radio, exp_explanation_tb, npi_id, our_methods
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1037 |
)
|
| 1038 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1039 |
if remaining_count == 0:
|
|
|
|
| 1040 |
return (
|
| 1041 |
gr.update(visible=False), # page0 (Hide)
|
| 1042 |
gr.update(visible=False), # page2 (Hide)
|
|
|
|
| 1049 |
None,
|
| 1050 |
None,
|
| 1051 |
None,
|
| 1052 |
+
None,
|
| 1053 |
+
user_info_new,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1054 |
)
|
|
|
|
|
|
|
| 1055 |
return (
|
| 1056 |
gr.update(visible=False), # page0 (Hide)
|
| 1057 |
gr.update(visible=False), # page2 (Hide)
|
|
|
|
| 1064 |
chat_b,
|
| 1065 |
page1_prompt,
|
| 1066 |
page1_reference_answer,
|
| 1067 |
+
question_for_eval,
|
| 1068 |
+
user_info_new
|
| 1069 |
+
)
|
| 1070 |
+
|
| 1071 |
def cancel_submission():
|
| 1072 |
# Cancel final submission: just hide the confirmation modal.
|
| 1073 |
return gr.update(visible=False)
|
|
|
|
| 1083 |
reset_ratings_B = [gr.update(value=None) for i in range(len(criteria))]
|
| 1084 |
|
| 1085 |
return (
|
| 1086 |
+
# pages
|
| 1087 |
+
gr.update(visible=True), # page0
|
| 1088 |
+
gr.update(visible=False), # final_page
|
| 1089 |
|
| 1090 |
# states
|
| 1091 |
# gr.update(value=None), # user_info_state
|
|
|
|
| 1206 |
)
|
| 1207 |
|
| 1208 |
# Finalize submission if user confirms.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1209 |
question_submission_event = yes_btn.click(
|
| 1210 |
fn=final_submit,
|
| 1211 |
inputs=[data_subset_state, user_info_state, pairwise_state, comparison_reasons, nonsense_btn_clicked, *ratings_A, *ratings_B],
|
|
|
|
| 1221 |
chat_b,
|
| 1222 |
page1_prompt,
|
| 1223 |
page1_reference_answer,
|
| 1224 |
+
data_subset_state,
|
| 1225 |
+
user_info_state,
|
| 1226 |
],
|
| 1227 |
scroll_to_output=True
|
| 1228 |
)
|