update a lot
Browse files- app.py +296 -98
- index.html +1 -1
- utils.py +9 -2
app.py
CHANGED
@@ -339,7 +339,7 @@ def get_evaluator_questions(email, disease_map_data, drug_map_data, user_all_spe
|
|
339 |
for other_model_name in model_names:
|
340 |
for (q_id, dataset) in evaluator_question_ids:
|
341 |
full_question_ids_list.append((q_id, our_model_name, other_model_name, dataset))
|
342 |
-
|
343 |
results_df = read_sheet_to_df(custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME))
|
344 |
if (results_df is not None) and (not results_df.empty):
|
345 |
# collect all (question_ID, other_model) pairs already seen
|
@@ -360,7 +360,6 @@ def get_evaluator_questions(email, disease_map_data, drug_map_data, user_all_spe
|
|
360 |
for (q_id, our_model, other_model, dataset) in full_question_ids_list
|
361 |
if (q_id, our_model, other_model) not in matched_pairs
|
362 |
]
|
363 |
-
# print(f"Filtered question IDs: {full_question_ids_list}")
|
364 |
print(f"Length of filtered question IDs: {len(full_question_ids_list)}")
|
365 |
|
366 |
|
@@ -479,7 +478,6 @@ def get_next_eval_question(
|
|
479 |
)
|
480 |
|
481 |
user_info = (name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, q_id) if return_user_info else None
|
482 |
-
|
483 |
return user_info, chat_a, chat_b, page1_prompt, page1_reference_answer, question_for_eval, len(full_question_ids_list)
|
484 |
|
485 |
def go_to_page0_from_minus1():
|
@@ -611,12 +609,129 @@ def toggle_reference(selection):
|
|
611 |
return gr.update(visible=False)
|
612 |
|
613 |
#nonsense button helper
|
614 |
-
def mark_invalid_question(btn_clicked_status):
|
615 |
-
|
616 |
-
|
617 |
-
|
618 |
-
|
619 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
620 |
|
621 |
centered_col_css = """
|
622 |
#centered-column {
|
@@ -648,6 +763,21 @@ centered_col_css = """
|
|
648 |
.short-btn { min-width: 80px !important; max-width: 120px !important; width: 100px !important; padding-left: 4px !important; padding-right: 4px !important; }
|
649 |
.light-stop-btn { background-color: #ffcccc !important; color: #b30000 !important; border-color: #ffcccc !important; }
|
650 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
651 |
"""
|
652 |
with gr.Blocks(css=centered_col_css) as demo:
|
653 |
# States to save information between pages.
|
@@ -686,27 +816,59 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
686 |
<p>Welcome to the TxAgent Evaluation Portal.</p>
|
687 |
</div>
|
688 |
""")
|
689 |
-
with gr.Row():
|
690 |
-
#
|
691 |
-
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
|
696 |
-
|
697 |
-
|
698 |
-
|
699 |
-
|
700 |
-
|
701 |
-
|
702 |
-
|
703 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
704 |
gr.HTML(TxAgent_Project_Page_HTML)
|
705 |
|
706 |
# Define actions for the new buttons
|
707 |
# For the Google Form button, we'll use JavaScript to open a new tab.
|
708 |
# The URL for the Google Form should be replaced with the actual link.
|
709 |
-
google_form_url = "https://forms.gle/pYvyvEQQwS5gdupQA"
|
710 |
submit_questions_btn.click(
|
711 |
fn=None,
|
712 |
inputs=None,
|
@@ -716,57 +878,59 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
716 |
|
717 |
# Page 0: Welcome / Informational page.
|
718 |
with gr.Column(visible=False, elem_id="page0") as page0:
|
719 |
-
gr.Markdown("## Welcome to the TxAgent Evalution Study!")
|
720 |
-
gr.Markdown("Please read the following instructions and then enter your information to begin:")
|
721 |
-
# Existing informational markdown...
|
722 |
-
gr.Markdown("""
|
723 |
-
- Each session requires a minimum commitment of 5-10 minutes to complete one question.
|
724 |
-
- If you wish to evaluate multiple questions, you may do so; you will never be asked to re-evaluate questions you have already seen.
|
725 |
-
- When evaluating a question, you will be asked to compare the responses of two different models to the question and then rate each model's response on a scale of 1-5.
|
726 |
-
- If you feel that a question does not make sense or is not biomedically relevant, there is a RED BUTTON at the top of the first model comparison page to indicate this
|
727 |
-
- You may use the Back and Next buttons at the bottom of each page to edit any of your responses before submitting.
|
728 |
-
- You may use the Home Page button at the bottom of each page to the home page. Your progress will be saved but not submitted.
|
729 |
-
- You must submit your answers to the current question before moving on to evaluate the next question.
|
730 |
-
- You may stop in between questions and return at a later time; however, you must submit your answers to the current question if you would like them saved.
|
731 |
-
- Please review the example question and LLM model response below:
|
732 |
-
|
733 |
-
""")
|
734 |
-
# Assume 'your_image.png' is in the same directory
|
735 |
-
with open("anatomyofAgentResponse.jpg", "rb") as image_file:
|
736 |
-
img = Image.open(image_file)
|
737 |
-
new_size = (int(img.width * 0.5), int(img.height * 0.5))
|
738 |
-
img = img.resize(new_size, Image.LANCZOS)
|
739 |
-
buffer = io.BytesIO()
|
740 |
-
img.save(buffer, format="PNG")
|
741 |
-
encoded_string = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
742 |
-
#encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
|
743 |
-
|
744 |
-
image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
|
745 |
-
ReasoningTraceExampleHTML = f"""
|
746 |
-
<div>
|
747 |
-
{image_html}
|
748 |
-
</div>
|
749 |
-
"""
|
750 |
-
gr.HTML(ReasoningTraceExampleHTML)
|
751 |
-
gr.Markdown("""By clicking 'Next' below, you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
|
752 |
-
""")
|
753 |
-
gr.Markdown("## Please enter your information to get a question to evaluate. Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.")
|
754 |
-
name = gr.Textbox(label="Name (required)")
|
755 |
-
email = gr.Textbox(label="Email (required). Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.")
|
756 |
-
specialty_dd = gr.Dropdown(choices=specialties_list, label="Primary Medical Specialty (required). Go to https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categorization)", multiselect=True)
|
757 |
-
subspecialty_dd = gr.Dropdown(choices=subspecialties_list, label="Subspecialty (if applicable). Go to https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categorization)", multiselect=True)
|
758 |
-
npi_id = gr.Textbox(label="National Provider Identifier ID (optional). Got to https://npiregistry.cms.hhs.gov/search to search for your NPI ID. If you do not have an NPI ID, please leave this blank.")
|
759 |
-
years_exp_radio = gr.Radio(
|
760 |
-
choices=["0-2 years", "3-5 years", "6-10 years", "11-20 years", "20+ years", "Not Applicable"],
|
761 |
-
label="How many years have you been involved in clinical and/or research activities related to your biomedical area of expertise? (required)"
|
762 |
-
)
|
763 |
-
exp_explanation_tb = gr.Textbox(label="Please briefly explain your expertise/experience relevant to evaluating AI for clinical decision support (optional)")
|
764 |
-
|
765 |
-
page0_error_box = gr.Markdown("")
|
766 |
-
with gr.Row():
|
767 |
-
next_btn_0 = gr.Button("Next")
|
768 |
with gr.Row():
|
769 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
770 |
|
771 |
|
772 |
with Modal(visible=False, elem_id="confirm_modal") as eval_progress_modal:
|
@@ -802,14 +966,22 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
802 |
"<span style='color: #b30000; font-weight: bold;'>Click the button if you think this question does not make sense or is not biomedically-relevant</span>",
|
803 |
render=True
|
804 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
805 |
|
806 |
-
|
807 |
-
fn=mark_invalid_question,
|
808 |
-
inputs=[nonsense_btn_clicked],
|
809 |
-
outputs=[nonsense_btn_clicked, nonsense_btn],
|
810 |
-
queue=False,
|
811 |
-
)
|
812 |
|
|
|
813 |
with gr.Row():
|
814 |
# ADDED: Use gr.Chatbot to display the scrollable chat window for Response A.
|
815 |
with gr.Column():
|
@@ -845,22 +1017,25 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
845 |
pairwise_inputs = []
|
846 |
for crit in criteria_for_comparison:
|
847 |
with gr.Row():
|
848 |
-
gr.
|
849 |
-
|
850 |
-
|
851 |
-
|
852 |
-
|
853 |
-
|
854 |
-
|
855 |
-
|
856 |
-
|
857 |
-
|
|
|
|
|
|
|
858 |
pairwise_inputs.append(radio)
|
859 |
# ADDED: free text under each comparison
|
860 |
text_input = gr.Textbox(label=f"Reasons for your selection (optional)")
|
861 |
comparison_reasons_inputs.append(text_input)
|
862 |
|
863 |
-
|
864 |
with gr.Row():
|
865 |
back_btn_0 = gr.Button("Back")
|
866 |
next_btn_1 = gr.Button("Next: Rate Responses")
|
@@ -999,18 +1174,20 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
999 |
indices_for_change.append(index_component)
|
1000 |
|
1001 |
with gr.Column(elem_id="centered-column"):
|
1002 |
-
gr.Markdown(f'<div style="text-align: left;">{crit["text"][0]}</div>')
|
1003 |
-
gr.Markdown(f'<div style="text-align: left;">{crit["text"][1]}</div>')
|
1004 |
pairwise_results_for_display[i].render()
|
1005 |
with gr.Row():
|
1006 |
with gr.Column(scale=1):
|
1007 |
rating_a = gr.Radio(choices=["1", "2", "3", "4", "5", "Unable to Judge"],
|
1008 |
label=f"Score for Response A - {crit['label']}",
|
1009 |
-
interactive=True
|
|
|
1010 |
with gr.Column(scale=1):
|
1011 |
rating_b = gr.Radio(choices=["1", "2", "3", "4", "5", "Unable to Judge"],
|
1012 |
label=f"Score for Response B - {crit['label']}",
|
1013 |
-
interactive=True
|
|
|
1014 |
with gr.Row():
|
1015 |
clear_btn = gr.Button("Clear Selection", size="sm", elem_id="clear_btn")
|
1016 |
clear_btn.click(fn=clear_selection, outputs=[rating_a,rating_b])
|
@@ -1054,7 +1231,8 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
1054 |
with gr.Row():
|
1055 |
yes_btn = gr.Button("Yes, please submit")
|
1056 |
cancel_btn = gr.Button("Cancel")
|
1057 |
-
|
|
|
1058 |
# --- Define Callback Functions for Confirmation Flow ---
|
1059 |
def build_row_dict(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args):
|
1060 |
num_criteria = len(criteria)
|
@@ -1222,6 +1400,24 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
1222 |
inputs=None,
|
1223 |
outputs=[page0, page1]
|
1224 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1225 |
|
1226 |
# Transition from Page 1 (Pairwise) to the combined Rating Page (Page 2).
|
1227 |
next_btn_1.click(
|
@@ -1247,7 +1443,7 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
1247 |
gr.update(), # Show page 3
|
1248 |
gr.update(), # Hide final page
|
1249 |
gr.update(visible=True), # Show confirmation modal
|
1250 |
-
gr.update(visible=False),
|
1251 |
gr.update(value="") # EDIT: Clear the error_message_box
|
1252 |
)
|
1253 |
else:
|
@@ -1342,4 +1538,6 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
1342 |
]
|
1343 |
)
|
1344 |
|
|
|
|
|
1345 |
demo.launch(share=True, allowed_paths = ["."])
|
|
|
339 |
for other_model_name in model_names:
|
340 |
for (q_id, dataset) in evaluator_question_ids:
|
341 |
full_question_ids_list.append((q_id, our_model_name, other_model_name, dataset))
|
342 |
+
|
343 |
results_df = read_sheet_to_df(custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME))
|
344 |
if (results_df is not None) and (not results_df.empty):
|
345 |
# collect all (question_ID, other_model) pairs already seen
|
|
|
360 |
for (q_id, our_model, other_model, dataset) in full_question_ids_list
|
361 |
if (q_id, our_model, other_model) not in matched_pairs
|
362 |
]
|
|
|
363 |
print(f"Length of filtered question IDs: {len(full_question_ids_list)}")
|
364 |
|
365 |
|
|
|
478 |
)
|
479 |
|
480 |
user_info = (name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, q_id) if return_user_info else None
|
|
|
481 |
return user_info, chat_a, chat_b, page1_prompt, page1_reference_answer, question_for_eval, len(full_question_ids_list)
|
482 |
|
483 |
def go_to_page0_from_minus1():
|
|
|
609 |
return gr.update(visible=False)
|
610 |
|
611 |
#nonsense button helper
|
612 |
+
# def mark_invalid_question(btn_clicked_status):
|
613 |
+
# new_status = not btn_clicked_status
|
614 |
+
# if new_status == True:
|
615 |
+
# # Show skip modal when marking as Wrong Question
|
616 |
+
# return new_status, gr.update(value="Undo: Correct Question", variant="primary"), gr.update(visible=True)
|
617 |
+
# else:
|
618 |
+
# return new_status, gr.update(value="Wrong Question", variant="stop"), gr.update(visible=False)
|
619 |
+
|
620 |
+
# --- Skip Question Modal Callbacks ---
|
621 |
+
def skip_question_and_load_new(user_info_state, our_methods):
    """Fetch a replacement question after the evaluator chooses to skip.

    Args:
        user_info_state: ``None`` or an 8-item tuple
            ``(name, email, specialty_dd, subspecialty_dd, years_exp_radio,
            exp_explanation_tb, npi_id, q_id)``. The trailing ``q_id`` is
            ignored here.
        our_methods: Passed through unchanged as the last argument of
            ``get_next_eval_question``.

    Returns:
        A 9-tuple, in order: a "hide" visibility update, a second visibility
        update (shown only when a new question was loaded), the refreshed
        user info (or ``None``), a message string, chat A, chat B, the
        prompt, the reference answer, and the question record. When no
        question is loaded the last five slots are freshly constructed
        placeholder components.
    """

    def _nothing_loaded(message):
        # Shared shape for both early exits: everything hidden, no user info,
        # and blank placeholder components for the question widgets.
        return (
            gr.update(visible=False),
            gr.update(visible=False),
            None,
            message,
            gr.Chatbot(),
            gr.Chatbot(),
            gr.HTML(),
            gr.Markdown(),
            gr.State(),
        )

    # Defensive: without user info there is nothing to look up.
    if user_info_state is None:
        return _nothing_loaded("")

    (
        name,
        email,
        specialty_dd,
        subspecialty_dd,
        years_exp_radio,
        exp_explanation_tb,
        npi_id,
        _previous_q_id,
    ) = user_info_state

    (
        next_user_info,
        next_chat_a,
        next_chat_b,
        next_prompt,
        next_reference,
        next_question,
        questions_left,
    ) = get_next_eval_question(
        name,
        email,
        specialty_dd,
        subspecialty_dd,
        years_exp_radio,
        exp_explanation_tb,
        npi_id,
        our_methods,
    )

    # No more questions: report completion instead of loading anything.
    if questions_left == 0:
        return _nothing_loaded(
            "Based on your submitted data, you have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!"
        )

    return (
        gr.update(visible=False),
        gr.update(visible=True),
        next_user_info,
        "",
        next_chat_a,
        next_chat_b,
        next_prompt,
        next_reference,
        next_question,
    )
|
635 |
+
|
636 |
+
# --- Skip‑question handler for the "Wrong Question?" button -------------------
|
637 |
+
def skip_current_question(user_info_state, our_methods: list = our_methods):
    """Replace the question shown on Page 1 with the next unused one.

    Args:
        user_info_state: ``None`` when no session has been started, otherwise
            an 8-item tuple ``(name, email, specialty_dd, subspecialty_dd,
            yrs_exp, exp_desc, npi_id, q_id)``. The trailing ``q_id`` is
            ignored; the next question is chosen by
            ``get_next_eval_question``.
        our_methods: Forwarded as the last argument of
            ``get_next_eval_question``. Defaults to the module-level
            ``our_methods`` captured when this function is defined.

    Returns:
        A 7-tuple matching the click handlers' outputs, in order: user info
        state, Page 1 error/notice text, Chatbot A history, Chatbot B
        history, prompt HTML, reference answer, and the question record
        stored for later pages.
    """
    # Guard: user clicked before session started. Checked before any toast is
    # shown so the evaluator is not told a skip is in progress when nothing
    # can be skipped.
    if user_info_state is None:
        return (
            None,
            gr.update(value="Please start the evaluation before skipping questions."),
            gr.update(value=[]),   # Chatbot A history
            gr.update(value=[]),   # Chatbot B history
            gr.update(value=""),   # Prompt HTML
            gr.update(value=""),   # Reference answer
            # NOTE(review): a fresh gr.State() is returned as the new state
            # value; confirm this resets data_subset_state as intended on the
            # Gradio version in use.
            gr.State()             # data_subset_state
        )

    gr.Info("Skipping this question and loading the next one…", duration=5)

    # Unpack evaluator identity
    name, email, specialty_dd, subspecialty_dd, yrs_exp, exp_desc, npi_id, _ = user_info_state

    # Pull the next unused question. The component-shaped return values are
    # discarded because the display values are rebuilt below.
    (
        user_info_new,
        _chat_a_comp,
        _chat_b_comp,
        _prompt_comp,
        _ref_comp,
        question_for_eval,
        remaining,
    ) = get_next_eval_question(
        name, email, specialty_dd, subspecialty_dd, yrs_exp, exp_desc, npi_id, our_methods
    )

    # If the pool is exhausted, just notify the evaluator
    if remaining == 0 or question_for_eval is None:
        final_msg = (
            "Based on your submitted data, you have no more questions to evaluate. "
            "You may exit the page; we will follow‑up if we require anything else from you. "
            "Thank you!"
        )
        return (
            user_info_state,       # keep the existing session info
            gr.update(value=final_msg),
            gr.update(value=[]),
            gr.update(value=[]),
            gr.update(value=""),
            gr.update(value=""),
            # NOTE(review): see the note on gr.State() in the guard above.
            gr.State()
        )

    # --- Build fresh values for the existing UI components ---
    chat_a_value = format_chat(question_for_eval['models'][0]['reasoning_trace'], tool_database_labels)
    chat_b_value = format_chat(question_for_eval['models'][1]['reasoning_trace'], tool_database_labels)

    prompt_html = (
        f"<div style='background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; "
        f"border-radius: 5px; color: black;'><strong style='color: black;'>Prompt:</strong> "
        f"{question_for_eval['question']}</div>"
    )
    reference_md = question_for_eval.get("correct_answer", "")
    gr.Info("New question loaded…", duration=3)

    # Return updates to refresh Page 1 in place
    return (
        user_info_new,
        gr.update(value=""),               # clear any previous error text
        gr.update(value=chat_a_value),     # Chatbot A history
        gr.update(value=chat_b_value),     # Chatbot B history
        gr.update(value=prompt_html),      # Prompt
        gr.update(value=reference_md),     # Reference answer
        question_for_eval                  # store for later pages
    )
|
706 |
+
|
707 |
+
# --- Handler for "Wrong Question?": flags nonsense and skips
|
708 |
+
def flag_nonsense_and_skip(user_info_state):
    """Record a "question makes no sense" flag, then move to the next question.

    When a session exists, one row holding the timestamp, evaluator name and
    email, the question ID, and the flag is appended to the results sheet
    before the UI advances, so the feedback is kept even if the evaluator
    stops here. With no session, nothing is written.

    Args:
        user_info_state: ``None`` or an 8-item tuple whose first two items
            are the evaluator's name and email and whose last item is the
            current question ID.

    Returns:
        Whatever ``skip_current_question(user_info_state)`` returns.
    """
    session_started = user_info_state is not None
    if session_started:
        # Only name, email and question ID are written to the sheet; the
        # remaining profile fields are unpacked but unused.
        (
            evaluator_name,
            evaluator_email,
            _specialty,
            _subspecialty,
            _years_exp,
            _expertise,
            _npi,
            flagged_q_id,
        ) = user_info_state

        flag_row = {
            "Timestamp": datetime.datetime.now().isoformat(),
            "Name": evaluator_name,
            "Email": evaluator_email,
            "Question ID": flagged_q_id,
            "Question Makes No Sense or Biomedically Irrelevant": True,
        }
        append_to_sheet(
            user_data=None,
            custom_row_dict=flag_row,
            custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME),
            add_header_when_create_sheet=True,
        )

    # Advancing the UI is delegated to the ordinary skip handler.
    return skip_current_question(user_info_state)
|
735 |
|
736 |
centered_col_css = """
|
737 |
#centered-column {
|
|
|
763 |
.short-btn { min-width: 80px !important; max-width: 120px !important; width: 100px !important; padding-left: 4px !important; padding-right: 4px !important; }
|
764 |
.light-stop-btn { background-color: #ffcccc !important; color: #b30000 !important; border-color: #ffcccc !important; }
|
765 |
|
766 |
+
/* --- Added for larger criteria font --- */
|
767 |
+
.criteria-font-large {
|
768 |
+
font-size: 3em !important;
|
769 |
+
}
|
770 |
+
.gr-radio label, .criteria-radio-label {
|
771 |
+
font-size: 3em !important;
|
772 |
+
}
|
773 |
+
#participate-btn button {
|
774 |
+
font-size: 24px !important; /* Large readable text */
|
775 |
+
font-weight: 700 !important; /* Bold for emphasis */
|
776 |
+
padding: 28px 40px !important; /* Extra padding for height */
|
777 |
+
min-height: 120px !important; /* Make button visibly taller (multi‑line) */
|
778 |
+
width: 100% !important; /* Occupy full width of its column */
|
779 |
+
white-space: normal !important; /* Allow text to wrap onto multiple lines */
|
780 |
+
}
|
781 |
"""
|
782 |
with gr.Blocks(css=centered_col_css) as demo:
|
783 |
# States to save information between pages.
|
|
|
816 |
<p>Welcome to the TxAgent Evaluation Portal.</p>
|
817 |
</div>
|
818 |
""")
|
819 |
+
with gr.Row(elem_classes=["center-row"]):
|
820 |
+
# Row 1: the two buttons side by side
|
821 |
+
with gr.Column(scale=1):
|
822 |
+
participate_eval_btn = gr.Button(
|
823 |
+
value="Click to 🌟Participate in TxAgent Evaluation 🌟",
|
824 |
+
variant="primary",
|
825 |
+
size="lg",
|
826 |
+
elem_id="participate-btn"
|
827 |
+
)
|
828 |
+
with gr.Column(scale=1):
|
829 |
+
submit_questions_btn = gr.Button(
|
830 |
+
value="Click to 🚀 Submit Questions for TxAgent Evaluation 🚀",
|
831 |
+
variant="primary",
|
832 |
+
size="lg",
|
833 |
+
elem_id="submit-btn"
|
834 |
+
)
|
835 |
+
|
836 |
+
with gr.Row(elem_classes=["center-row"]):
|
837 |
+
# Row 2: one explanatory blurb under each button
|
838 |
+
with gr.Column(scale=1):
|
839 |
+
gr.Markdown(
|
840 |
+
"""
|
841 |
+
### Why participate in the evaluation?
|
842 |
+
Joining the **TxAgent human evaluation** lets you experience the model in real-time and give immediate feedback.
|
843 |
+
|
844 |
+
**By clicking the button above, you will:**
|
845 |
+
- See how the model responds to a variety of prompts.
|
846 |
+
- Provide instant thumbs-up / thumbs-down ratings.
|
847 |
+
- Shape the roadmap for upcoming releases.
|
848 |
+
|
849 |
+
_Thank you for helping us improve!_
|
850 |
+
"""
|
851 |
+
)
|
852 |
+
with gr.Column(scale=1):
|
853 |
+
gr.Markdown(
|
854 |
+
"""
|
855 |
+
### Why submit questions?
|
856 |
+
Help us build a richer evaluation set by sending unique, challenging prompts.
|
857 |
+
|
858 |
+
**By clicking the button above, you will:**
|
859 |
+
- Highlight edge-cases and identify blind spots.
|
860 |
+
- Push TxAgent to reason in new domains.
|
861 |
+
- Directly influence future model improvements.
|
862 |
+
|
863 |
+
_We're excited to see what you come up with!_
|
864 |
+
"""
|
865 |
+
)
|
866 |
gr.HTML(TxAgent_Project_Page_HTML)
|
867 |
|
868 |
# Define actions for the new buttons
|
869 |
# For the Google Form button, we'll use JavaScript to open a new tab.
|
870 |
# The URL for the Google Form should be replaced with the actual link.
|
871 |
+
google_form_url = "https://forms.gle/pYvyvEQQwS5gdupQA"
|
872 |
submit_questions_btn.click(
|
873 |
fn=None,
|
874 |
inputs=None,
|
|
|
878 |
|
879 |
# Page 0: Welcome / Informational page.
|
880 |
with gr.Column(visible=False, elem_id="page0") as page0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
881 |
with gr.Row():
|
882 |
+
with gr.Column():
|
883 |
+
gr.Markdown("## Welcome to the TxAgent Evalution Study!")
|
884 |
+
gr.Markdown("Please read the following instructions and then enter your information to begin:")
|
885 |
+
# Existing informational markdown...
|
886 |
+
gr.Markdown("""
|
887 |
+
- Each session requires a minimum commitment of 5-10 minutes to complete one question.
|
888 |
+
- If you wish to evaluate multiple questions, you may do so; you will never be asked to re-evaluate questions you have already seen.
|
889 |
+
- When evaluating a question, you will be asked to compare the responses of two different models to the question and then rate each model's response on a scale of 1-5.
|
890 |
+
- If you feel that a question does not make sense or is not biomedically relevant, there is a RED BUTTON at the top of the first model comparison page to indicate this
|
891 |
+
- You may use the Back and Next buttons at the bottom of each page to edit any of your responses before submitting.
|
892 |
+
- You may use the Home Page button at the bottom of each page to the home page. Your progress will be saved but not submitted.
|
893 |
+
- You must submit your answers to the current question before moving on to evaluate the next question.
|
894 |
+
- You may stop in between questions and return at a later time; however, you must submit your answers to the current question if you would like them saved.
|
895 |
+
- Please review the example question and LLM model response below:
|
896 |
+
|
897 |
+
""")
|
898 |
+
with open("anatomyofAgentResponse.jpg", "rb") as image_file:
|
899 |
+
img = Image.open(image_file)
|
900 |
+
new_size = (int(img.width * 0.5), int(img.height * 0.5))
|
901 |
+
img = img.resize(new_size, Image.LANCZOS)
|
902 |
+
buffer = io.BytesIO()
|
903 |
+
img.save(buffer, format="PNG")
|
904 |
+
encoded_string = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
905 |
+
#encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
|
906 |
+
|
907 |
+
image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
|
908 |
+
ReasoningTraceExampleHTML = f"""
|
909 |
+
<div>
|
910 |
+
{image_html}
|
911 |
+
</div>
|
912 |
+
"""
|
913 |
+
gr.HTML(ReasoningTraceExampleHTML)
|
914 |
+
with gr.Column():
|
915 |
+
gr.Markdown("## Please enter your information to get a question to evaluate. Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.")
|
916 |
+
name = gr.Textbox(label="Name (required)")
|
917 |
+
email = gr.Textbox(label="Email (required). Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.")
|
918 |
+
specialty_dd = gr.Dropdown(choices=specialties_list, label="Primary Medical Specialty (required). Go to https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categorization)", multiselect=True)
|
919 |
+
subspecialty_dd = gr.Dropdown(choices=subspecialties_list, label="Subspecialty (if applicable). Go to https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categorization)", multiselect=True)
|
920 |
+
npi_id = gr.Textbox(label="National Provider Identifier ID (optional). Got to https://npiregistry.cms.hhs.gov/search to search for your NPI ID. If you do not have an NPI ID, please leave this blank.")
|
921 |
+
years_exp_radio = gr.Radio(
|
922 |
+
choices=["0-2 years", "3-5 years", "6-10 years", "11-20 years", "20+ years", "Not Applicable"],
|
923 |
+
label="How many years have you been involved in clinical and/or research activities related to your biomedical area of expertise? (required)"
|
924 |
+
)
|
925 |
+
exp_explanation_tb = gr.Textbox(label="Please briefly explain your expertise/experience relevant to evaluating AI for clinical decision support (optional)")
|
926 |
+
|
927 |
+
page0_error_box = gr.Markdown("")
|
928 |
+
with gr.Row():
|
929 |
+
next_btn_0 = gr.Button("Next")
|
930 |
+
with gr.Row():
|
931 |
+
home_btn_0 = gr.Button("Home") # (your registration info will be saved)
|
932 |
+
gr.Markdown("""By clicking 'Next', you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
|
933 |
+
""")
|
934 |
|
935 |
|
936 |
with Modal(visible=False, elem_id="confirm_modal") as eval_progress_modal:
|
|
|
966 |
"<span style='color: #b30000; font-weight: bold;'>Click the button if you think this question does not make sense or is not biomedically-relevant</span>",
|
967 |
render=True
|
968 |
)
|
969 |
+
with gr.Row():
|
970 |
+
unfamiliar_btn = gr.Button(
|
971 |
+
"Unfamiliar Question?",
|
972 |
+
size="sm",
|
973 |
+
variant="stop", # red variant
|
974 |
+
elem_id="invalid-question-btn",
|
975 |
+
elem_classes=["short-btn"]
|
976 |
+
)
|
977 |
+
gr.Markdown(
|
978 |
+
"<span style='color: #b30000; font-weight: bold;'>Click the button if you are not familiar with this area</span>",
|
979 |
+
render=True
|
980 |
+
)
|
981 |
|
982 |
+
page1_error_box = gr.Markdown("") # ADDED: display validation errors
|
|
|
|
|
|
|
|
|
|
|
983 |
|
984 |
+
# --- Define chat_a and chat_b before using them in outputs ---
|
985 |
with gr.Row():
|
986 |
# ADDED: Use gr.Chatbot to display the scrollable chat window for Response A.
|
987 |
with gr.Column():
|
|
|
1017 |
pairwise_inputs = []
|
1018 |
for crit in criteria_for_comparison:
|
1019 |
with gr.Row():
|
1020 |
+
with gr.Column(scale=1):
|
1021 |
+
gr.Markdown(crit['text'], elem_classes="criteria-font-large") # <--- add class here
|
1022 |
+
with gr.Column(scale=1):
|
1023 |
+
radio = gr.Radio(
|
1024 |
+
choices=[
|
1025 |
+
"👈 Model A", # A
|
1026 |
+
"👉 Model B", # B
|
1027 |
+
"🤝 Tie", # tie
|
1028 |
+
"👎 Neither model did well" # neither
|
1029 |
+
],
|
1030 |
+
label="Which is better?",
|
1031 |
+
elem_classes="criteria-radio-label" # <--- add class here
|
1032 |
+
)
|
1033 |
pairwise_inputs.append(radio)
|
1034 |
# ADDED: free text under each comparison
|
1035 |
text_input = gr.Textbox(label=f"Reasons for your selection (optional)")
|
1036 |
comparison_reasons_inputs.append(text_input)
|
1037 |
|
1038 |
+
|
1039 |
with gr.Row():
|
1040 |
back_btn_0 = gr.Button("Back")
|
1041 |
next_btn_1 = gr.Button("Next: Rate Responses")
|
|
|
1174 |
indices_for_change.append(index_component)
|
1175 |
|
1176 |
with gr.Column(elem_id="centered-column"):
|
1177 |
+
gr.Markdown(f'<div style="text-align: left;">{crit["text"][0]}</div>', elem_classes="criteria-font-large")
|
1178 |
+
gr.Markdown(f'<div style="text-align: left;">{crit["text"][1]}</div>', elem_classes="criteria-font-large")
|
1179 |
pairwise_results_for_display[i].render()
|
1180 |
with gr.Row():
|
1181 |
with gr.Column(scale=1):
|
1182 |
rating_a = gr.Radio(choices=["1", "2", "3", "4", "5", "Unable to Judge"],
|
1183 |
label=f"Score for Response A - {crit['label']}",
|
1184 |
+
interactive=True,
|
1185 |
+
elem_classes="criteria-radio-label")
|
1186 |
with gr.Column(scale=1):
|
1187 |
rating_b = gr.Radio(choices=["1", "2", "3", "4", "5", "Unable to Judge"],
|
1188 |
label=f"Score for Response B - {crit['label']}",
|
1189 |
+
interactive=True,
|
1190 |
+
elem_classes="criteria-radio-label")
|
1191 |
with gr.Row():
|
1192 |
clear_btn = gr.Button("Clear Selection", size="sm", elem_id="clear_btn")
|
1193 |
clear_btn.click(fn=clear_selection, outputs=[rating_a,rating_b])
|
|
|
1231 |
with gr.Row():
|
1232 |
yes_btn = gr.Button("Yes, please submit")
|
1233 |
cancel_btn = gr.Button("Cancel")
|
1234 |
+
|
1235 |
+
|
1236 |
# --- Define Callback Functions for Confirmation Flow ---
|
1237 |
def build_row_dict(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args):
|
1238 |
num_criteria = len(criteria)
|
|
|
1400 |
inputs=None,
|
1401 |
outputs=[page0, page1]
|
1402 |
)
|
1403 |
+
|
1404 |
+
# Skip the current question and load a new one when the evaluator flags it
|
1405 |
+
nonsense_btn.click(
|
1406 |
+
fn=flag_nonsense_and_skip,
|
1407 |
+
inputs=[user_info_state],
|
1408 |
+
outputs=[user_info_state, page1_error_box, chat_a, chat_b,
|
1409 |
+
page1_prompt, page1_reference_answer, data_subset_state],
|
1410 |
+
scroll_to_output=True
|
1411 |
+
)
|
1412 |
+
|
1413 |
+
unfamiliar_btn.click(
|
1414 |
+
fn=skip_current_question,
|
1415 |
+
inputs=[user_info_state],
|
1416 |
+
outputs=[user_info_state, page1_error_box, chat_a, chat_b,
|
1417 |
+
page1_prompt, page1_reference_answer, data_subset_state],
|
1418 |
+
scroll_to_output=True
|
1419 |
+
)
|
1420 |
+
|
1421 |
|
1422 |
# Transition from Page 1 (Pairwise) to the combined Rating Page (Page 2).
|
1423 |
next_btn_1.click(
|
|
|
1443 |
gr.update(), # Show page 3
|
1444 |
gr.update(), # Hide final page
|
1445 |
gr.update(visible=True), # Show confirmation modal
|
1446 |
+
gr.update(visible=False), # Hide error modal
|
1447 |
gr.update(value="") # EDIT: Clear the error_message_box
|
1448 |
)
|
1449 |
else:
|
|
|
1538 |
]
|
1539 |
)
|
1540 |
|
1541 |
+
|
1542 |
+
|
1543 |
demo.launch(share=True, allowed_paths = ["."])
|
index.html
CHANGED
@@ -150,7 +150,7 @@
|
|
150 |
<div class="container is-max-desktop">
|
151 |
<div class="columns is-centered has-text-centered">
|
152 |
<div class="column is-four-fifths">
|
153 |
-
<h2 class="title is-3">
|
154 |
<div class="content has-text-justified">
|
155 |
<p>
|
156 |
Precision therapeutics require multimodal adaptive models that
|
|
|
150 |
<div class="container is-max-desktop">
|
151 |
<div class="columns is-centered has-text-centered">
|
152 |
<div class="column is-four-fifths">
|
153 |
+
<h2 class="title is-3">What is TxAgent?</h2>
|
154 |
<div class="content has-text-justified">
|
155 |
<p>
|
156 |
Precision therapeutics require multimodal adaptive models that
|
utils.py
CHANGED
@@ -141,6 +141,7 @@ def append_to_sheet(user_data=None, custom_row_dict=None, custom_sheet_name=None
|
|
141 |
"""
|
142 |
Append a new row to a Google Sheet. If 'custom_row_dict' is provided, append that row.
|
143 |
Otherwise, append a default row constructed from the provided user_data.
|
|
|
144 |
"""
|
145 |
if custom_sheet_name is None:
|
146 |
custom_sheet_name = GSHEET_NAME
|
@@ -165,6 +166,7 @@ def append_to_sheet(user_data=None, custom_row_dict=None, custom_sheet_name=None
|
|
165 |
existing_values = sheet.get_all_values()
|
166 |
is_empty = (existing_values == [[]]) #indicates empty spreadsheet that was cleared in the past
|
167 |
|
|
|
168 |
if (is_new or is_empty) and add_header_when_create_sheet:
|
169 |
# headers come from the keys of our row dict
|
170 |
if custom_row_dict is not None:
|
@@ -172,9 +174,14 @@ def append_to_sheet(user_data=None, custom_row_dict=None, custom_sheet_name=None
|
|
172 |
else:
|
173 |
headers = list(user_data.keys())
|
174 |
sheet.append_row(headers)
|
175 |
-
|
|
|
|
|
|
|
|
|
176 |
if custom_row_dict is not None:
|
177 |
-
|
|
|
178 |
else:
|
179 |
# Construct the default row with a timestamp and user_data fields
|
180 |
custom_row = [str(datetime.datetime.now()), user_data["question"], user_data["final_answer"], user_data["trace"]]
|
|
|
141 |
"""
|
142 |
Append a new row to a Google Sheet. If 'custom_row_dict' is provided, append that row.
|
143 |
Otherwise, append a default row constructed from the provided user_data.
|
144 |
+
Ensures that each value is aligned with the correct column header.
|
145 |
"""
|
146 |
if custom_sheet_name is None:
|
147 |
custom_sheet_name = GSHEET_NAME
|
|
|
166 |
existing_values = sheet.get_all_values()
|
167 |
is_empty = (existing_values == [[]]) #indicates empty spreadsheet that was cleared in the past
|
168 |
|
169 |
+
# --- Always ensure header row is present and get headers ---
|
170 |
if (is_new or is_empty) and add_header_when_create_sheet:
|
171 |
# headers come from the keys of our row dict
|
172 |
if custom_row_dict is not None:
|
|
|
174 |
else:
|
175 |
headers = list(user_data.keys())
|
176 |
sheet.append_row(headers)
|
177 |
+
else:
|
178 |
+
# Read headers from the first row of the sheet
|
179 |
+
headers = sheet.row_values(1) if sheet.row_count > 0 else []
|
180 |
+
|
181 |
+
# --- Build row aligned to headers ---
|
182 |
if custom_row_dict is not None:
|
183 |
+
# Ensure all values are aligned to headers, fill missing with ""
|
184 |
+
custom_row = [custom_row_dict.get(header, "") for header in headers]
|
185 |
else:
|
186 |
# Construct the default row with a timestamp and user_data fields
|
187 |
custom_row = [str(datetime.datetime.now()), user_data["question"], user_data["final_answer"], user_data["trace"]]
|