shgao committed on
Commit
dd01482
·
1 Parent(s): 03e54bd
Files changed (1) hide show
  1. app.py +122 -171
app.py CHANGED
@@ -319,16 +319,17 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
319
  for (q_id, our_model, other_model) in full_question_ids_list
320
  if (q_id, our_model, other_model) not in matched_pairs
321
  ]
322
- print(f"Filtered question IDs: {full_question_ids_list}")
323
  print(f"Length of filtered question IDs: {len(full_question_ids_list)}")
 
324
 
325
  return full_question_ids_list, data_by_filename
326
 
327
- def go_to_page0_from_minus1():
328
- return gr.update(visible=False), gr.update(visible=True)
329
-
330
- def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id):
331
-
 
332
  # ADDED: Validate that name and email are non-empty before proceeding
333
  if not name or not email or not evaluator_id or not specialty_dd or not years_exp_radio:
334
  return gr.update(visible=True), gr.update(visible=False), None, "Please fill out all the required fields (name, email, evaluator ID, specialty, years of experience). If you are not a licensed physician with a specific specialty, please choose the specialty that most closely aligns with your biomedical expertise.", gr.Chatbot(), gr.Chatbot(), gr.HTML(),gr.Markdown(),gr.State(),gr.update(visible=False), ""
@@ -357,76 +358,98 @@ def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecia
357
  token = os.getenv("HF_TOKEN")
358
  )
359
 
360
- full_question_ids_list, data_by_filename = get_evaluator_questions(evaluator_id, all_files, evaluator_directory)
 
 
361
 
362
  if len(full_question_ids_list) == 0:
363
- return gr.update(visible=True), gr.update(visible=False), None, "Based on your submitted data, you have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!", gr.Chatbot(), gr.Chatbot(), gr.HTML(),gr.Markdown(),gr.State(),gr.update(visible=False),""
364
 
365
  full_question_ids_list = sorted(full_question_ids_list, key=lambda x: str(x[0])+str(x[1]))
366
  #selected question is the first element
367
- q_id, other_model_name = full_question_ids_list[0]
 
368
 
369
- #Constructing question_for_eval, the question to evaluate this round
 
 
 
370
  txagent_matched_entry = next(
371
- (entry for entry in data_by_filename['TxAgent-T1-Llama-3.1-8B'] if preprocess_question_id(entry.get("id")) == q_id),
372
  None
373
  )
 
 
 
 
374
  other_model_matched_entry = next(
375
  (entry for entry in data_by_filename[other_model_name] if preprocess_question_id(entry.get("id")) == q_id),
376
  None
377
  )
378
-
379
- models_list = [
380
- {
381
- "model": 'TxAgent-T1-Llama-3.1-8B',
382
- "reasoning_trace": txagent_matched_entry.get("solution")
383
- },
384
- {
385
  "model": other_model_name,
386
  "reasoning_trace": other_model_matched_entry.get("solution")
387
  }
388
- ]
 
389
  random.shuffle(models_list)
390
 
391
  question_for_eval = {
392
  "question": txagent_matched_entry.get("question"),
393
- "correct_answer": txagent_matched_entry.get("correct_answer"),
394
  "id": q_id,
395
  "models": models_list,
396
  }
 
 
397
 
398
- #update user_info
399
- user_info = (name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, q_id, evaluator_id)
400
  chat_A_value = format_chat(question_for_eval['models'][0]['reasoning_trace'], tool_database_labels)
401
  chat_B_value = format_chat(question_for_eval['models'][1]['reasoning_trace'], tool_database_labels)
402
  prompt_text = question_for_eval['question']
403
 
404
- # Construct the question-specific elements of the pairwise rating page (page 1)
405
  page1_prompt = gr.HTML(f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Prompt:</strong> {prompt_text}</div>')
406
- page1_reference_answer = gr.Markdown(txagent_matched_entry.get("correct_answer"))
407
  chat_a = gr.Chatbot(
408
- value=chat_A_value,
409
- type="messages",
410
- height=400,
411
- label="Model A Response",
412
- show_copy_button=False,
413
- show_label=True,
414
- render_markdown=True, # Required for markdown/HTML support in messages
415
- avatar_images=None, # Optional: omit user/assistant icons
416
- rtl=False
417
- )
418
  chat_b = gr.Chatbot(
419
- value=chat_B_value,
420
- type="messages",
421
- height=400,
422
- label="Model B Response",
423
- show_copy_button=False,
424
- show_label=True,
425
- render_markdown=True, # Required for markdown/HTML support in messages
426
- avatar_images=None, # Optional: omit user/assistant icons
427
- rtl=False
428
- )
429
- return gr.update(visible=True), gr.update(visible=False), user_info,"", chat_a, chat_b, page1_prompt, page1_reference_answer, question_for_eval, gr.update(visible=True), f"You are about to evaluate the next question. You have {len(full_question_ids_list)} question(s) remaining to evaluate."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
430
 
431
  #goes to page 1 from confirmation modal that tells users how many questions they have left to evaluate
432
  def go_to_page1():
@@ -454,7 +477,7 @@ def go_to_page2(data_subset_state,*pairwise_values):
454
  pairwise_results_for_display = [gr.Markdown(f"***As a reminder, your pairwise comparison answer for this criterion was: {pairwise_list[i]}. Your answer choices will be restricted based on your comparison answer, but you may go back and change the comparison answer if you wish.***") for i in range(len(criteria))]
455
 
456
  if any(answer is None for answer in pairwise_list):
457
- return gr.update(visible=True), gr.update(visible=False), None, None, "Error: Please select an option for every pairwise comparison.", gr.Chatbot(), gr.Chatbot(), gr.HTML(), gr.Markdown(),*pairwise_results_for_display
458
 
459
  chat_A_value = format_chat(data_subset_state['models'][0]['reasoning_trace'], tool_database_labels)
460
  chat_B_value = format_chat(data_subset_state['models'][1]['reasoning_trace'], tool_database_labels)
@@ -482,7 +505,7 @@ def go_to_page2(data_subset_state,*pairwise_values):
482
  page2_prompt = gr.HTML(f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Prompt:</strong> {prompt_text}</div>')
483
  page2_reference_answer = gr.Markdown(data_subset_state['correct_answer'])
484
 
485
- return gr.update(visible=False), gr.update(visible=True), pairwise_list, comparison_reasons_list, "", chat_A_rating, chat_B_rating, page2_prompt, page2_reference_answer, *pairwise_results_for_display
486
 
487
 
488
  # Callback to store scores for Response A.
@@ -543,9 +566,9 @@ def toggle_reference(selection):
543
  def mark_invalid_question(btn_clicked_status):
544
  new_status = not btn_clicked_status
545
  if new_status == True:
546
- return new_status, gr.update(value="Undo; this question makes biomedical sense", variant="primary")
547
  else:
548
- return new_status, gr.update(value="This question does not make sense or is not biomedically-relevant",variant="stop")
549
 
550
  centered_col_css = """
551
  #centered-column {
@@ -559,18 +582,24 @@ centered_col_css = """
559
  color: white !important;
560
  border-color: purple !important;
561
  }
 
 
 
 
 
562
  #clear_btn {
563
  background-color: #F08080 !important;
564
  color: white !important;
565
  border-color: #F08080 !important;
566
  }
567
- #reference-box {
568
- background-color: #9191FF !important;
569
- color: white !important;
570
  border: 1px solid #ccc;
571
  padding: 10px;
572
  border-radius: 5px;
573
  }
 
 
 
574
  """
575
  with gr.Blocks(css=centered_col_css) as demo:
576
  # States to save information between pages.
@@ -674,7 +703,7 @@ with gr.Blocks(css=centered_col_css) as demo:
674
  home_btn_0 = gr.Button("Home (your registration info will be saved)")
675
 
676
 
677
- with Modal(visible=False) as eval_progress_modal:
678
  eval_progress_text = gr.Markdown("You have X questions remaining.")
679
  eval_progress_proceed_btn = gr.Button("OK, proceed to question evaluation")
680
 
@@ -683,13 +712,30 @@ with gr.Blocks(css=centered_col_css) as demo:
683
  gr.Markdown("## Part 1/2: Pairwise Comparison") #Make the number controlled by question indexing!
684
  page1_prompt = gr.HTML()
685
 
 
 
 
 
 
 
 
 
 
 
 
686
  # Add small red button under the prompt
687
- nonsense_btn = gr.Button(
688
- "This question does not make sense or is not biomedically-relevant",
689
- size="sm",
690
- variant="stop", # red variant
691
- elem_id="invalid-question-btn"
692
- )
 
 
 
 
 
 
693
 
694
  nonsense_btn.click(
695
  fn=mark_invalid_question,
@@ -698,14 +744,6 @@ with gr.Blocks(css=centered_col_css) as demo:
698
  queue=False,
699
  )
700
 
701
- with gr.Accordion("Show reference answer (this is ONE correct answer, but there may be multiple)", open=False, elem_id="reference-box"):
702
- page1_reference_answer = gr.Markdown(
703
- """
704
- **Reference Answer:**
705
-
706
- This is the reference answer content.
707
- """
708
- )
709
  with gr.Row():
710
  # ADDED: Use gr.Chatbot to display the scrollable chat window for Response A.
711
  with gr.Column():
@@ -769,13 +807,14 @@ with gr.Blocks(css=centered_col_css) as demo:
769
  gr.Markdown("## Part 2/2: Rate Model Responses")
770
  # ### EDIT: Show a highlighted prompt as on previous pages.
771
  page2_prompt = gr.HTML()
772
- with gr.Accordion("Show reference answer (this is ONE correct answer, but there may be multiple)", open=False, elem_id="reference-box"):
773
  page2_reference_answer = gr.Markdown(
774
  """
775
  **Reference Answer:**
776
 
777
  This is the reference answer content.
778
- """
 
779
  )
780
  # ### EDIT: Display both responses side-by-side using Chatbot windows.
781
  with gr.Row():
@@ -991,39 +1030,13 @@ with gr.Blocks(css=centered_col_css) as demo:
991
  def final_submit(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args):
992
  # --- Part 1: Submit the current results (Existing Logic) ---
993
  row_dict = build_row_dict(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args)
994
- _, _, _, _, _, _, _, _, evaluator_id = user_info
995
  append_to_sheet(user_data=None, custom_row_dict=row_dict, custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME + f"_{evaluator_id}"), add_header_when_create_sheet=True)
996
-
997
- # --- Part 2: Recalculate remaining questions (Existing Logic + Modified Error Handling) ---
998
- # try:
999
-
1000
- # --- Re-fetch data and filter questions (Same logic as before) ---
1001
- question_map_path = hf_hub_download(
1002
- repo_id=REPO_ID,
1003
- filename=EVALUATOR_MAP_DICT,
1004
- repo_type="dataset", # or omit if it's a Model/Space
1005
- # force_download=True, # ← always fetch new copy
1006
- revision="main", # branch/tag/commit, fetches the most recent version of the dataset each time this command is called
1007
- token = os.getenv("HF_TOKEN")
1008
- )
1009
-
1010
- with open(question_map_path, 'r') as f:
1011
- question_map = json.load(f)
1012
-
1013
- evaluator_directory = question_map.get(evaluator_id, None)
1014
- all_files = list_repo_files(
1015
- repo_id=REPO_ID,
1016
- repo_type="dataset",
1017
- revision="main",
1018
- token = os.getenv("HF_TOKEN")
1019
  )
1020
 
1021
- full_question_ids_list, data_by_filename = get_evaluator_questions(evaluator_id, all_files, evaluator_directory)
1022
- remaining_count = len(full_question_ids_list)
1023
-
1024
- # --- Part 3: Determine UI updates based on remaining count ---
1025
  if remaining_count == 0:
1026
- # Success with NO remaining questions
1027
  return (
1028
  gr.update(visible=False), # page0 (Hide)
1029
  gr.update(visible=False), # page2 (Hide)
@@ -1036,72 +1049,9 @@ with gr.Blocks(css=centered_col_css) as demo:
1036
  None,
1037
  None,
1038
  None,
1039
- None
1040
- )
1041
-
1042
- full_question_ids_list = sorted(full_question_ids_list, key=lambda x: str(x[0])+str(x[1]))
1043
- #selected question is the first element
1044
- q_id, other_model_name = full_question_ids_list[0]
1045
-
1046
- #Constructing question_for_eval, the question to evaluate this round
1047
- txagent_matched_entry = next(
1048
- (entry for entry in data_by_filename['TxAgent-T1-Llama-3.1-8B'] if preprocess_question_id(entry.get("id")) == q_id),
1049
- None
1050
- )
1051
- other_model_matched_entry = next(
1052
- (entry for entry in data_by_filename[other_model_name] if preprocess_question_id(entry.get("id")) == q_id),
1053
- None
1054
- )
1055
-
1056
- models_list = [
1057
- {
1058
- "model": 'TxAgent-T1-Llama-3.1-8B',
1059
- "reasoning_trace": txagent_matched_entry.get("solution")
1060
- },
1061
- {
1062
- "model": other_model_name,
1063
- "reasoning_trace": other_model_matched_entry.get("solution")
1064
- }
1065
- ]
1066
- random.shuffle(models_list)
1067
-
1068
- question_for_eval = {
1069
- "question": txagent_matched_entry.get("question"),
1070
- "id": q_id,
1071
- "models": models_list,
1072
- }
1073
-
1074
- chat_A_value = format_chat(question_for_eval['models'][0]['reasoning_trace'], tool_database_labels)
1075
- chat_B_value = format_chat(question_for_eval['models'][1]['reasoning_trace'], tool_database_labels)
1076
- prompt_text = question_for_eval['question']
1077
-
1078
- # Construct the question-specific elements of the pairwise rating page (page 1)
1079
- page1_prompt = gr.HTML(f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Prompt:</strong> {prompt_text}</div>')
1080
- page1_reference_answer = gr.Markdown(txagent_matched_entry.get("correct_answer"))
1081
- chat_a = gr.Chatbot(
1082
- value=chat_A_value,
1083
- type="messages",
1084
- height=400,
1085
- label="Model A Response",
1086
- show_copy_button=False,
1087
- show_label=True,
1088
- render_markdown=True, # Required for markdown/HTML support in messages
1089
- avatar_images=None, # Optional: omit user/assistant icons
1090
- rtl=False
1091
- )
1092
- chat_b = gr.Chatbot(
1093
- value=chat_B_value,
1094
- type="messages",
1095
- height=400,
1096
- label="Model B Response",
1097
- show_copy_button=False,
1098
- show_label=True,
1099
- render_markdown=True, # Required for markdown/HTML support in messages
1100
- avatar_images=None, # Optional: omit user/assistant icons
1101
- rtl=False
1102
  )
1103
-
1104
- # Success with remaining questions
1105
  return (
1106
  gr.update(visible=False), # page0 (Hide)
1107
  gr.update(visible=False), # page2 (Hide)
@@ -1114,8 +1064,10 @@ with gr.Blocks(css=centered_col_css) as demo:
1114
  chat_b,
1115
  page1_prompt,
1116
  page1_reference_answer,
1117
- question_for_eval)
1118
-
 
 
1119
  def cancel_submission():
1120
  # Cancel final submission: just hide the confirmation modal.
1121
  return gr.update(visible=False)
@@ -1131,6 +1083,9 @@ with gr.Blocks(css=centered_col_css) as demo:
1131
  reset_ratings_B = [gr.update(value=None) for i in range(len(criteria))]
1132
 
1133
  return (
 
 
 
1134
 
1135
  # states
1136
  # gr.update(value=None), # user_info_state
@@ -1251,11 +1206,6 @@ with gr.Blocks(css=centered_col_css) as demo:
1251
  )
1252
 
1253
  # Finalize submission if user confirms.
1254
- # yes_btn.click(
1255
- # fn=final_submit,
1256
- # inputs=[data_subset_state, user_info_state, pairwise_state, comparison_reasons, *ratings_A, *ratings_B],
1257
- # outputs=[page2, final_page, confirm_modal]
1258
- # )
1259
  question_submission_event = yes_btn.click(
1260
  fn=final_submit,
1261
  inputs=[data_subset_state, user_info_state, pairwise_state, comparison_reasons, nonsense_btn_clicked, *ratings_A, *ratings_B],
@@ -1271,7 +1221,8 @@ with gr.Blocks(css=centered_col_css) as demo:
1271
  chat_b,
1272
  page1_prompt,
1273
  page1_reference_answer,
1274
- data_subset_state
 
1275
  ],
1276
  scroll_to_output=True
1277
  )
 
319
  for (q_id, our_model, other_model) in full_question_ids_list
320
  if (q_id, our_model, other_model) not in matched_pairs
321
  ]
 
322
  print(f"Length of filtered question IDs: {len(full_question_ids_list)}")
323
+
324
 
325
  return full_question_ids_list, data_by_filename
326
 
327
+ def get_next_eval_question(
328
+ name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, evaluator_id,
329
+ our_methods,
330
+ return_user_info=True, # Whether to return user_info tuple
331
+ include_correct_answer=True # Whether to return correct_answer
332
+ ):
333
  # ADDED: Validate that name and email are non-empty before proceeding
334
  if not name or not email or not evaluator_id or not specialty_dd or not years_exp_radio:
335
  return gr.update(visible=True), gr.update(visible=False), None, "Please fill out all the required fields (name, email, evaluator ID, specialty, years of experience). If you are not a licensed physician with a specific specialty, please choose the specialty that most closely aligns with your biomedical expertise.", gr.Chatbot(), gr.Chatbot(), gr.HTML(),gr.Markdown(),gr.State(),gr.update(visible=False), ""
 
358
  token = os.getenv("HF_TOKEN")
359
  )
360
 
361
+ # Get available questions for the evaluator
362
+ full_question_ids_list, data_by_filename = get_evaluator_questions(
363
+ evaluator_id, all_files, evaluator_directory, our_methods)
364
 
365
  if len(full_question_ids_list) == 0:
366
+ return None, None, None, None, None, None, 0
367
 
368
  full_question_ids_list = sorted(full_question_ids_list, key=lambda x: str(x[0])+str(x[1]))
369
  #selected question is the first element
370
+ q_id, our_model_name, other_model_name = full_question_ids_list[0]
371
+ print("Selected question ID:", q_id)
372
 
373
+ # Build model answer lists
374
+ models_list = []
375
+
376
+
377
  txagent_matched_entry = next(
378
+ (entry for entry in data_by_filename[our_model_name] if preprocess_question_id(entry.get("id")) == q_id),
379
  None
380
  )
381
+ our_model = {
382
+ "model": our_model_name,
383
+ "reasoning_trace": txagent_matched_entry.get("solution")
384
+ }
385
  other_model_matched_entry = next(
386
  (entry for entry in data_by_filename[other_model_name] if preprocess_question_id(entry.get("id")) == q_id),
387
  None
388
  )
389
+ compared_model = {
 
 
 
 
 
 
390
  "model": other_model_name,
391
  "reasoning_trace": other_model_matched_entry.get("solution")
392
  }
393
+
394
+ models_list = [our_model, compared_model]
395
  random.shuffle(models_list)
396
 
397
  question_for_eval = {
398
  "question": txagent_matched_entry.get("question"),
 
399
  "id": q_id,
400
  "models": models_list,
401
  }
402
+ if include_correct_answer:
403
+ question_for_eval["correct_answer"] = txagent_matched_entry.get("correct_answer")
404
 
405
+ # Prepare Gradio components
 
406
  chat_A_value = format_chat(question_for_eval['models'][0]['reasoning_trace'], tool_database_labels)
407
  chat_B_value = format_chat(question_for_eval['models'][1]['reasoning_trace'], tool_database_labels)
408
  prompt_text = question_for_eval['question']
409
 
 
410
  page1_prompt = gr.HTML(f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Prompt:</strong> {prompt_text}</div>')
411
+ page1_reference_answer = gr.Markdown(txagent_matched_entry.get("correct_answer")) if include_correct_answer else None
412
  chat_a = gr.Chatbot(
413
+ value=chat_A_value,
414
+ type="messages",
415
+ height=400,
416
+ label="Model A Response",
417
+ show_copy_button=False,
418
+ show_label=True,
419
+ render_markdown=True,
420
+ avatar_images=None,
421
+ rtl=False
422
+ )
423
  chat_b = gr.Chatbot(
424
+ value=chat_B_value,
425
+ type="messages",
426
+ height=400,
427
+ label="Model B Response",
428
+ show_copy_button=False,
429
+ show_label=True,
430
+ render_markdown=True,
431
+ avatar_images=None,
432
+ rtl=False
433
+ )
434
+
435
+ user_info = (name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, q_id, evaluator_id) if return_user_info else None
436
+ return user_info, chat_a, chat_b, page1_prompt, page1_reference_answer, question_for_eval, len(full_question_ids_list)
437
+
438
def go_to_page0_from_minus1():
    """Navigate from page -1 back to page 0.

    Returns visibility updates for (page_minus1, page0): hide the
    current page and reveal the registration page.
    """
    hide_current_page = gr.update(visible=False)
    show_page0 = gr.update(visible=True)
    return hide_current_page, show_page0
440
+
441
def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id):
    """Validate the registration form, fetch the evaluator's next question,
    and open the evaluation-progress modal.

    Returns an 11-tuple of Gradio updates/values wired positionally to the
    registration page, the error banner, the page-1 chat windows/prompt/
    reference answer, the question state, the progress modal, and its text.
    """

    # ADDED: Validate that name and email are non-empty before proceeding
    # (subspecialty, experience explanation, and NPI ID remain optional).
    if not name or not email or not evaluator_id or not specialty_dd or not years_exp_radio:
        return gr.update(visible=True), gr.update(visible=False), None, "Please fill out all the required fields (name, email, evaluator ID, specialty, years of experience). If you are not a licensed physician with a specific specialty, please choose the specialty that most closely aligns with your biomedical expertise.", gr.Chatbot(), gr.Chatbot(), gr.HTML(),gr.Markdown(),gr.State(),gr.update(visible=False), ""

    # Pull the next unevaluated question plus the ready-made page-1 components.
    # NOTE(review): `our_methods` is read from module scope here — confirm it
    # is defined before this handler can fire.
    user_info, chat_a, chat_b, page1_prompt, page1_reference_answer, question_for_eval, remaining_count = get_next_eval_question(
        name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, evaluator_id, our_methods
    )
    # No questions left: keep the registration page visible and show a
    # completion message instead of opening the progress modal.
    if remaining_count == 0:
        return gr.update(visible=True), gr.update(visible=False), None, "Based on your submitted data, you have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!", gr.Chatbot(), gr.Chatbot(), gr.HTML(),gr.Markdown(),gr.State(),gr.update(visible=False),""
    # Success: populate page-1 components and reveal the progress modal.
    return gr.update(visible=True), gr.update(visible=False), user_info,"", chat_a, chat_b, page1_prompt, page1_reference_answer, question_for_eval, gr.update(visible=True), f"You are about to evaluate the next question. You have {remaining_count} question(s) remaining to evaluate."
453
 
454
  #goes to page 1 from confirmation modal that tells users how many questions they have left to evaluate
455
  def go_to_page1():
 
477
  pairwise_results_for_display = [gr.Markdown(f"***As a reminder, your pairwise comparison answer for this criterion was: {pairwise_list[i]}. Your answer choices will be restricted based on your comparison answer, but you may go back and change the comparison answer if you wish.***") for i in range(len(criteria))]
478
 
479
  if any(answer is None for answer in pairwise_list):
480
+ return (gr.update(visible=True), gr.update(visible=False), None, None, "Error: Please select an option for every pairwise comparison.", gr.Chatbot(), gr.Chatbot(), gr.HTML(), gr.Markdown()) + tuple(pairwise_results_for_display)
481
 
482
  chat_A_value = format_chat(data_subset_state['models'][0]['reasoning_trace'], tool_database_labels)
483
  chat_B_value = format_chat(data_subset_state['models'][1]['reasoning_trace'], tool_database_labels)
 
505
  page2_prompt = gr.HTML(f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Prompt:</strong> {prompt_text}</div>')
506
  page2_reference_answer = gr.Markdown(data_subset_state['correct_answer'])
507
 
508
+ return (gr.update(visible=False), gr.update(visible=True), pairwise_list, comparison_reasons_list, "", chat_A_rating, chat_B_rating, page2_prompt, page2_reference_answer) + tuple(pairwise_results_for_display)
509
 
510
 
511
  # Callback to store scores for Response A.
 
566
def mark_invalid_question(btn_clicked_status):
    """Toggle the 'question is invalid' flag and relabel its button.

    Args:
        btn_clicked_status: Current boolean flag state.

    Returns:
        (new_status, button_update): the flipped flag, and a gr.update that
        swaps the button between the "undo" (primary) and "report" (stop)
        presentations.
    """
    new_status = not btn_clicked_status
    # Use truthiness instead of `== True` (PEP 8: never compare to True).
    if new_status:
        # Question is now flagged invalid; offer an undo action.
        return new_status, gr.update(value="Undo: Correct Question", variant="primary")
    else:
        # Flag cleared; restore the red report button.
        return new_status, gr.update(value="Wrong Question",variant="stop")
572
 
573
  centered_col_css = """
574
  #centered-column {
 
582
  color: white !important;
583
  border-color: purple !important;
584
  }
585
+ #answer-reference-btn {
586
+ background-color: #E6E6FA !important;
587
+ color: white !important;
588
+ border-color: #E6E6FA !important;
589
+ }
590
  #clear_btn {
591
  background-color: #F08080 !important;
592
  color: white !important;
593
  border-color: #F08080 !important;
594
  }
595
+ .reference-box {
 
 
596
  border: 1px solid #ccc;
597
  padding: 10px;
598
  border-radius: 5px;
599
  }
600
+ .short-btn { min-width: 80px !important; max-width: 120px !important; width: 100px !important; padding-left: 4px !important; padding-right: 4px !important; }
601
+ .light-stop-btn { background-color: #ffcccc !important; color: #b30000 !important; border-color: #ffcccc !important; }
602
+
603
  """
604
  with gr.Blocks(css=centered_col_css) as demo:
605
  # States to save information between pages.
 
703
  home_btn_0 = gr.Button("Home (your registration info will be saved)")
704
 
705
 
706
+ with Modal(visible=False, elem_id="confirm_modal") as eval_progress_modal:
707
  eval_progress_text = gr.Markdown("You have X questions remaining.")
708
  eval_progress_proceed_btn = gr.Button("OK, proceed to question evaluation")
709
 
 
712
  gr.Markdown("## Part 1/2: Pairwise Comparison") #Make the number controlled by question indexing!
713
  page1_prompt = gr.HTML()
714
 
715
+ with gr.Accordion("Click to reveal a reference answer—this is just one correct solution; others are possible.", open=False, elem_id="answer-reference-btn"):
716
+ page1_reference_answer = gr.Markdown(
717
+ """
718
+ **Reference Answer:**
719
+
720
+ This is the reference answer content.
721
+ """,
722
+ elem_classes="reference-box"
723
+ )
724
+
725
+
726
  # Add small red button under the prompt
727
+ with gr.Row():
728
+ nonsense_btn = gr.Button(
729
+ "Wrong Question?",
730
+ size="sm",
731
+ variant="stop", # red variant
732
+ elem_id="invalid-question-btn",
733
+ elem_classes=["short-btn"]
734
+ )
735
+ gr.Markdown(
736
+ "<span style='color: #b30000; font-weight: bold;'>Click the button if you think this question does not make sense or is not biomedically-relevant</span>",
737
+ render=True
738
+ )
739
 
740
  nonsense_btn.click(
741
  fn=mark_invalid_question,
 
744
  queue=False,
745
  )
746
 
 
 
 
 
 
 
 
 
747
  with gr.Row():
748
  # ADDED: Use gr.Chatbot to display the scrollable chat window for Response A.
749
  with gr.Column():
 
807
  gr.Markdown("## Part 2/2: Rate Model Responses")
808
  # ### EDIT: Show a highlighted prompt as on previous pages.
809
  page2_prompt = gr.HTML()
810
+ with gr.Accordion("Click to reveal a reference answer—this is just one correct solution; others are possible.", open=False, elem_id="answer-reference-btn"):
811
  page2_reference_answer = gr.Markdown(
812
  """
813
  **Reference Answer:**
814
 
815
  This is the reference answer content.
816
+ """,
817
+ elem_classes="reference-box"
818
  )
819
  # ### EDIT: Display both responses side-by-side using Chatbot windows.
820
  with gr.Row():
 
1030
  def final_submit(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args):
1031
  # --- Part 1: Submit the current results (Existing Logic) ---
1032
  row_dict = build_row_dict(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args)
 
1033
  append_to_sheet(user_data=None, custom_row_dict=row_dict, custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME + f"_{evaluator_id}"), add_header_when_create_sheet=True)
1034
+ name, email, specialty, subspecialty, years_exp_radio, exp_explanation_tb, npi_id, _, evaluator_id = user_info
1035
+ user_info_new, chat_a, chat_b, page1_prompt, page1_reference_answer, question_for_eval, remaining_count = get_next_eval_question(
1036
+ name, email, specialty, subspecialty, years_exp_radio, exp_explanation_tb, npi_id, our_methods
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1037
  )
1038
 
 
 
 
 
1039
  if remaining_count == 0:
 
1040
  return (
1041
  gr.update(visible=False), # page0 (Hide)
1042
  gr.update(visible=False), # page2 (Hide)
 
1049
  None,
1050
  None,
1051
  None,
1052
+ None,
1053
+ user_info_new,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1054
  )
 
 
1055
  return (
1056
  gr.update(visible=False), # page0 (Hide)
1057
  gr.update(visible=False), # page2 (Hide)
 
1064
  chat_b,
1065
  page1_prompt,
1066
  page1_reference_answer,
1067
+ question_for_eval,
1068
+ user_info_new
1069
+ )
1070
+
1071
  def cancel_submission():
1072
  # Cancel final submission: just hide the confirmation modal.
1073
  return gr.update(visible=False)
 
1083
  reset_ratings_B = [gr.update(value=None) for i in range(len(criteria))]
1084
 
1085
  return (
1086
+ # pages
1087
+ gr.update(visible=True), # page0
1088
+ gr.update(visible=False), # final_page
1089
 
1090
  # states
1091
  # gr.update(value=None), # user_info_state
 
1206
  )
1207
 
1208
  # Finalize submission if user confirms.
 
 
 
 
 
1209
  question_submission_event = yes_btn.click(
1210
  fn=final_submit,
1211
  inputs=[data_subset_state, user_info_state, pairwise_state, comparison_reasons, nonsense_btn_clicked, *ratings_A, *ratings_B],
 
1221
  chat_b,
1222
  page1_prompt,
1223
  page1_reference_answer,
1224
+ data_subset_state,
1225
+ user_info_state,
1226
  ],
1227
  scroll_to_output=True
1228
  )