shgao committed on
Commit
6df2faf
·
1 Parent(s): c4045a3

update a lot

Browse files
Files changed (3) hide show
  1. app.py +296 -98
  2. index.html +1 -1
  3. utils.py +9 -2
app.py CHANGED
@@ -339,7 +339,7 @@ def get_evaluator_questions(email, disease_map_data, drug_map_data, user_all_spe
339
  for other_model_name in model_names:
340
  for (q_id, dataset) in evaluator_question_ids:
341
  full_question_ids_list.append((q_id, our_model_name, other_model_name, dataset))
342
- print("Length of full question IDs list before filtering:", len(full_question_ids_list))
343
  results_df = read_sheet_to_df(custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME))
344
  if (results_df is not None) and (not results_df.empty):
345
  # collect all (question_ID, other_model) pairs already seen
@@ -360,7 +360,6 @@ def get_evaluator_questions(email, disease_map_data, drug_map_data, user_all_spe
360
  for (q_id, our_model, other_model, dataset) in full_question_ids_list
361
  if (q_id, our_model, other_model) not in matched_pairs
362
  ]
363
- # print(f"Filtered question IDs: {full_question_ids_list}")
364
  print(f"Length of filtered question IDs: {len(full_question_ids_list)}")
365
 
366
 
@@ -479,7 +478,6 @@ def get_next_eval_question(
479
  )
480
 
481
  user_info = (name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, q_id) if return_user_info else None
482
-
483
  return user_info, chat_a, chat_b, page1_prompt, page1_reference_answer, question_for_eval, len(full_question_ids_list)
484
 
485
  def go_to_page0_from_minus1():
@@ -611,12 +609,129 @@ def toggle_reference(selection):
611
  return gr.update(visible=False)
612
 
613
  #nonsense button helper
614
def mark_invalid_question(btn_clicked_status):
    """Toggle the "wrong question" flag and relabel the button to match.

    Args:
        btn_clicked_status: current flag value; it is negated to get the
            new state.

    Returns:
        A ``(new_status, button_update)`` pair. When the flag becomes set
        the button offers to undo it; when cleared the button goes back to
        its red "Wrong Question" appearance.
    """
    new_status = not btn_clicked_status
    # `not` always yields a bool, so plain truthiness is enough here.
    if new_status:
        return new_status, gr.update(value="Undo: Correct Question", variant="primary")
    return new_status, gr.update(value="Wrong Question", variant="stop")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620
 
621
  centered_col_css = """
622
  #centered-column {
@@ -648,6 +763,21 @@ centered_col_css = """
648
  .short-btn { min-width: 80px !important; max-width: 120px !important; width: 100px !important; padding-left: 4px !important; padding-right: 4px !important; }
649
  .light-stop-btn { background-color: #ffcccc !important; color: #b30000 !important; border-color: #ffcccc !important; }
650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
651
  """
652
  with gr.Blocks(css=centered_col_css) as demo:
653
  # States to save information between pages.
@@ -686,27 +816,59 @@ with gr.Blocks(css=centered_col_css) as demo:
686
  <p>Welcome to the TxAgent Evaluation Portal.</p>
687
  </div>
688
  """)
689
- with gr.Row():
690
- # submit_questions_btn = gr.Button("Submit Questions for TxAgent Evaluation")
691
- # participate_eval_btn = gr.Button("Participate in TxAgent Evaluation")
692
- submit_questions_btn = gr.Button(
693
- value="🚀 Submit Questions for TxAgent Evaluation 🚀",
694
- variant="primary",
695
- size="lg",
696
- elem_id="participate-btn"
697
- )
698
- participate_eval_btn = gr.Button(
699
- value="🌟 Participate in TxAgent Evaluation 🌟",
700
- variant="primary",
701
- size="lg",
702
- elem_id="participate-btn"
703
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
704
  gr.HTML(TxAgent_Project_Page_HTML)
705
 
706
  # Define actions for the new buttons
707
  # For the Google Form button, we'll use JavaScript to open a new tab.
708
  # The URL for the Google Form should be replaced with the actual link.
709
- google_form_url = "https://forms.gle/pYvyvEQQwS5gdupQA" # Replace with your actual Google Form link
710
  submit_questions_btn.click(
711
  fn=None,
712
  inputs=None,
@@ -716,57 +878,59 @@ with gr.Blocks(css=centered_col_css) as demo:
716
 
717
  # Page 0: Welcome / Informational page.
718
  with gr.Column(visible=False, elem_id="page0") as page0:
719
- gr.Markdown("## Welcome to the TxAgent Evalution Study!")
720
- gr.Markdown("Please read the following instructions and then enter your information to begin:")
721
- # Existing informational markdown...
722
- gr.Markdown("""
723
- - Each session requires a minimum commitment of 5-10 minutes to complete one question.
724
- - If you wish to evaluate multiple questions, you may do so; you will never be asked to re-evaluate questions you have already seen.
725
- - When evaluating a question, you will be asked to compare the responses of two different models to the question and then rate each model's response on a scale of 1-5.
726
- - If you feel that a question does not make sense or is not biomedically relevant, there is a RED BUTTON at the top of the first model comparison page to indicate this
727
- - You may use the Back and Next buttons at the bottom of each page to edit any of your responses before submitting.
728
- - You may use the Home Page button at the bottom of each page to the home page. Your progress will be saved but not submitted.
729
- - You must submit your answers to the current question before moving on to evaluate the next question.
730
- - You may stop in between questions and return at a later time; however, you must submit your answers to the current question if you would like them saved.
731
- - Please review the example question and LLM model response below:
732
-
733
- """)
734
- # Assume 'your_image.png' is in the same directory
735
- with open("anatomyofAgentResponse.jpg", "rb") as image_file:
736
- img = Image.open(image_file)
737
- new_size = (int(img.width * 0.5), int(img.height * 0.5))
738
- img = img.resize(new_size, Image.LANCZOS)
739
- buffer = io.BytesIO()
740
- img.save(buffer, format="PNG")
741
- encoded_string = base64.b64encode(buffer.getvalue()).decode("utf-8")
742
- #encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
743
-
744
- image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
745
- ReasoningTraceExampleHTML = f"""
746
- <div>
747
- {image_html}
748
- </div>
749
- """
750
- gr.HTML(ReasoningTraceExampleHTML)
751
- gr.Markdown("""By clicking 'Next' below, you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
752
- """)
753
- gr.Markdown("## Please enter your information to get a question to evaluate. Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.")
754
- name = gr.Textbox(label="Name (required)")
755
- email = gr.Textbox(label="Email (required). Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.")
756
- specialty_dd = gr.Dropdown(choices=specialties_list, label="Primary Medical Specialty (required). Go to https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categorization)", multiselect=True)
757
- subspecialty_dd = gr.Dropdown(choices=subspecialties_list, label="Subspecialty (if applicable). Go to https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categorization)", multiselect=True)
758
- npi_id = gr.Textbox(label="National Provider Identifier ID (optional). Got to https://npiregistry.cms.hhs.gov/search to search for your NPI ID. If you do not have an NPI ID, please leave this blank.")
759
- years_exp_radio = gr.Radio(
760
- choices=["0-2 years", "3-5 years", "6-10 years", "11-20 years", "20+ years", "Not Applicable"],
761
- label="How many years have you been involved in clinical and/or research activities related to your biomedical area of expertise? (required)"
762
- )
763
- exp_explanation_tb = gr.Textbox(label="Please briefly explain your expertise/experience relevant to evaluating AI for clinical decision support (optional)")
764
-
765
- page0_error_box = gr.Markdown("")
766
- with gr.Row():
767
- next_btn_0 = gr.Button("Next")
768
  with gr.Row():
769
- home_btn_0 = gr.Button("Home (your registration info will be saved)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
770
 
771
 
772
  with Modal(visible=False, elem_id="confirm_modal") as eval_progress_modal:
@@ -802,14 +966,22 @@ with gr.Blocks(css=centered_col_css) as demo:
802
  "<span style='color: #b30000; font-weight: bold;'>Click the button if you think this question does not make sense or is not biomedically-relevant</span>",
803
  render=True
804
  )
 
 
 
 
 
 
 
 
 
 
 
 
805
 
806
- nonsense_btn.click(
807
- fn=mark_invalid_question,
808
- inputs=[nonsense_btn_clicked],
809
- outputs=[nonsense_btn_clicked, nonsense_btn],
810
- queue=False,
811
- )
812
 
 
813
  with gr.Row():
814
  # ADDED: Use gr.Chatbot to display the scrollable chat window for Response A.
815
  with gr.Column():
@@ -845,22 +1017,25 @@ with gr.Blocks(css=centered_col_css) as demo:
845
  pairwise_inputs = []
846
  for crit in criteria_for_comparison:
847
  with gr.Row():
848
- gr.Markdown(crit['text'])
849
- radio = gr.Radio(
850
- choices=[
851
- "👈 Model A", # A
852
- "👉 Model B", # B
853
- "🤝 Tie", # tie
854
- "👎 Neither model did well" # neither
855
- ],
856
- label="Which is better?"
857
- )
 
 
 
858
  pairwise_inputs.append(radio)
859
  # ADDED: free text under each comparison
860
  text_input = gr.Textbox(label=f"Reasons for your selection (optional)")
861
  comparison_reasons_inputs.append(text_input)
862
 
863
- page1_error_box = gr.Markdown("") # ADDED: display validation errors
864
  with gr.Row():
865
  back_btn_0 = gr.Button("Back")
866
  next_btn_1 = gr.Button("Next: Rate Responses")
@@ -999,18 +1174,20 @@ with gr.Blocks(css=centered_col_css) as demo:
999
  indices_for_change.append(index_component)
1000
 
1001
  with gr.Column(elem_id="centered-column"):
1002
- gr.Markdown(f'<div style="text-align: left;">{crit["text"][0]}</div>')
1003
- gr.Markdown(f'<div style="text-align: left;">{crit["text"][1]}</div>')
1004
  pairwise_results_for_display[i].render()
1005
  with gr.Row():
1006
  with gr.Column(scale=1):
1007
  rating_a = gr.Radio(choices=["1", "2", "3", "4", "5", "Unable to Judge"],
1008
  label=f"Score for Response A - {crit['label']}",
1009
- interactive=True)
 
1010
  with gr.Column(scale=1):
1011
  rating_b = gr.Radio(choices=["1", "2", "3", "4", "5", "Unable to Judge"],
1012
  label=f"Score for Response B - {crit['label']}",
1013
- interactive=True)
 
1014
  with gr.Row():
1015
  clear_btn = gr.Button("Clear Selection", size="sm", elem_id="clear_btn")
1016
  clear_btn.click(fn=clear_selection, outputs=[rating_a,rating_b])
@@ -1054,7 +1231,8 @@ with gr.Blocks(css=centered_col_css) as demo:
1054
  with gr.Row():
1055
  yes_btn = gr.Button("Yes, please submit")
1056
  cancel_btn = gr.Button("Cancel")
1057
-
 
1058
  # --- Define Callback Functions for Confirmation Flow ---
1059
  def build_row_dict(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args):
1060
  num_criteria = len(criteria)
@@ -1222,6 +1400,24 @@ with gr.Blocks(css=centered_col_css) as demo:
1222
  inputs=None,
1223
  outputs=[page0, page1]
1224
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1225
 
1226
  # Transition from Page 1 (Pairwise) to the combined Rating Page (Page 2).
1227
  next_btn_1.click(
@@ -1247,7 +1443,7 @@ with gr.Blocks(css=centered_col_css) as demo:
1247
  gr.update(), # Show page 3
1248
  gr.update(), # Hide final page
1249
  gr.update(visible=True), # Show confirmation modal
1250
- gr.update(visible=False), # Hide error modal
1251
  gr.update(value="") # EDIT: Clear the error_message_box
1252
  )
1253
  else:
@@ -1342,4 +1538,6 @@ with gr.Blocks(css=centered_col_css) as demo:
1342
  ]
1343
  )
1344
 
 
 
1345
  demo.launch(share=True, allowed_paths = ["."])
 
339
  for other_model_name in model_names:
340
  for (q_id, dataset) in evaluator_question_ids:
341
  full_question_ids_list.append((q_id, our_model_name, other_model_name, dataset))
342
+
343
  results_df = read_sheet_to_df(custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME))
344
  if (results_df is not None) and (not results_df.empty):
345
  # collect all (question_ID, other_model) pairs already seen
 
360
  for (q_id, our_model, other_model, dataset) in full_question_ids_list
361
  if (q_id, our_model, other_model) not in matched_pairs
362
  ]
 
363
  print(f"Length of filtered question IDs: {len(full_question_ids_list)}")
364
 
365
 
 
478
  )
479
 
480
  user_info = (name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, q_id) if return_user_info else None
 
481
  return user_info, chat_a, chat_b, page1_prompt, page1_reference_answer, question_for_eval, len(full_question_ids_list)
482
 
483
  def go_to_page0_from_minus1():
 
609
  return gr.update(visible=False)
610
 
611
  #nonsense button helper
612
+ # def mark_invalid_question(btn_clicked_status):
613
+ # new_status = not btn_clicked_status
614
+ # if new_status == True:
615
+ # # Show skip modal when marking as Wrong Question
616
+ # return new_status, gr.update(value="Undo: Correct Question", variant="primary"), gr.update(visible=True)
617
+ # else:
618
+ # return new_status, gr.update(value="Wrong Question", variant="stop"), gr.update(visible=False)
619
+
620
+ # --- Skip Question Modal Callbacks ---
621
def skip_question_and_load_new(user_info_state, our_methods):
    """Fetch the next question for the current evaluator after a skip.

    Args:
        user_info_state: ``None`` or an 8-tuple
            ``(name, email, specialty_dd, subspecialty_dd, years_exp_radio,
            exp_explanation_tb, npi_id, q_id)``.
        our_methods: passed through unchanged to ``get_next_eval_question``.

    Returns:
        A 9-tuple: two visibility updates, the new user info (or ``None``),
        a message string, then the chat A / chat B / prompt / reference
        answer values and the question record. When there is no user info
        or no question remains, freshly constructed blank components are
        returned in the last five positions.
    """

    def _nothing_to_show(message):
        # Both visibility updates hide their targets; the rest are blank
        # placeholder components built in the same order as the outputs.
        return (
            gr.update(visible=False),
            gr.update(visible=False),
            None,
            message,
            gr.Chatbot(),
            gr.Chatbot(),
            gr.HTML(),
            gr.Markdown(),
            gr.State(),
        )

    # Defensive: without user info there is nothing to look up.
    if user_info_state is None:
        return _nothing_to_show("")

    # The trailing element is the q_id of the question being skipped.
    (
        eval_name,
        eval_email,
        specialty,
        subspecialty,
        years_exp,
        exp_explanation,
        npi,
        _,
    ) = user_info_state

    fetched = get_next_eval_question(
        eval_name, eval_email, specialty, subspecialty, years_exp, exp_explanation, npi, our_methods
    )
    new_user_info, new_chat_a, new_chat_b, new_prompt, new_reference, new_question, remaining_count = fetched

    if remaining_count == 0:
        # Question pool exhausted for this evaluator.
        return _nothing_to_show(
            "Based on your submitted data, you have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!"
        )

    return (
        gr.update(visible=False),
        gr.update(visible=True),
        new_user_info,
        "",
        new_chat_a,
        new_chat_b,
        new_prompt,
        new_reference,
        new_question,
    )
635
+
636
+ # --- Skip-question handler shared by the "Unfamiliar Question?" and "Wrong Question?" buttons ---
637
def skip_current_question(user_info_state, our_methods: list = our_methods):
    """Drop the question on screen and refresh Page 1 with the next one.

    Args:
        user_info_state: ``None`` if the evaluator has not started yet,
            otherwise an 8-tuple ``(name, email, specialty, subspecialty,
            years_exp, exp_explanation, npi_id, q_id)``.
        our_methods: forwarded unchanged to ``get_next_eval_question``.
            The default is the module-level ``our_methods`` as it was
            bound when this function was defined.

    Returns:
        A 7-tuple, in this order: user info state, error/notice text
        update, chat A update, chat B update, prompt update, reference
        answer update, and the value for the question-data state.
    """

    def _cleared_page(state_value, message):
        # Empties both chat panes, the prompt and the reference answer
        # while showing `message` in the notice area.
        return (
            state_value,
            gr.update(value=message),
            gr.update(value=[]),   # Chatbot A history
            gr.update(value=[]),   # Chatbot B history
            gr.update(value=""),   # Prompt HTML
            gr.update(value=""),   # Reference answer
            gr.State(),            # data_subset_state
        )

    # Guard first: if the session has not started nothing is being skipped,
    # so the "Skipping..." toast must not appear next to the
    # "please start" notice.
    if user_info_state is None:
        return _cleared_page(
            None, "Please start the evaluation before skipping questions."
        )

    gr.Info("Skipping this question and loading the next one…", duration=5)

    # Evaluator identity; the trailing q_id of the skipped question is unused.
    name, email, specialty_dd, subspecialty_dd, yrs_exp, exp_desc, npi_id, _ = user_info_state

    # Only the refreshed user info, the question record and the remaining
    # count are used; the four display values in between are rebuilt below.
    user_info_new, _, _, _, _, question_for_eval, remaining = get_next_eval_question(
        name, email, specialty_dd, subspecialty_dd, yrs_exp, exp_desc, npi_id, our_methods
    )

    # Pool exhausted: keep the current state and just tell the evaluator.
    if remaining == 0 or question_for_eval is None:
        final_msg = (
            "Based on your submitted data, you have no more questions to evaluate. "
            "You may exit the page; we will follow‑up if we require anything else from you. "
            "Thank you!"
        )
        return _cleared_page(user_info_state, final_msg)

    # Build fresh values for the existing Page 1 components.
    # NOTE(review): format_chat / tool_database_labels are defined elsewhere;
    # presumably they turn a reasoning trace into chatbot history - confirm.
    chat_a_value = format_chat(question_for_eval['models'][0]['reasoning_trace'], tool_database_labels)
    chat_b_value = format_chat(question_for_eval['models'][1]['reasoning_trace'], tool_database_labels)

    prompt_html = (
        f"<div style='background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; "
        f"border-radius: 5px; color: black;'><strong style='color: black;'>Prompt:</strong> "
        f"{question_for_eval['question']}</div>"
    )
    reference_md = question_for_eval.get("correct_answer", "")
    gr.Info("New question loaded…", duration=3)

    # Updates that refresh Page 1 in place.
    return (
        user_info_new,
        gr.update(value=""),              # clear any previous error text
        gr.update(value=chat_a_value),    # Chatbot A history
        gr.update(value=chat_b_value),    # Chatbot B history
        gr.update(value=prompt_html),     # Prompt
        gr.update(value=reference_md),    # Reference answer
        question_for_eval,                # stored for later pages
    )
706
+
707
+ # --- Handler for "Wrong Question?": flags nonsense and skips
708
def flag_nonsense_and_skip(user_info_state):
    """Record a "nonsensical / irrelevant question" flag, then skip ahead.

    When ``user_info_state`` is available, a row holding the timestamp,
    the evaluator's name and email, the question ID and the flag is
    appended to the results sheet before the UI advances, so the feedback
    is kept even if the evaluator stops here. In every case the function
    then returns whatever ``skip_current_question`` returns.
    """
    has_session = user_info_state is not None
    if has_session:
        # Only name, email and q_id go into the row; the other five fields
        # are unpacked solely to keep the 8-element shape check.
        name, email, _, _, _, _, _, q_id = user_info_state
        flagged_at = datetime.datetime.now().isoformat()
        append_to_sheet(
            user_data=None,
            custom_row_dict={
                "Timestamp": flagged_at,
                "Name": name,
                "Email": email,
                "Question ID": q_id,
                "Question Makes No Sense or Biomedically Irrelevant": True,
            },
            custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME),
            add_header_when_create_sheet=True,
        )

    # Reuse the shared skip logic to advance the UI.
    return skip_current_question(user_info_state)
735
 
736
  centered_col_css = """
737
  #centered-column {
 
763
  .short-btn { min-width: 80px !important; max-width: 120px !important; width: 100px !important; padding-left: 4px !important; padding-right: 4px !important; }
764
  .light-stop-btn { background-color: #ffcccc !important; color: #b30000 !important; border-color: #ffcccc !important; }
765
 
766
+ /* --- Added for larger criteria font --- */
767
+ .criteria-font-large {
768
+ font-size: 3em !important;
769
+ }
770
+ .gr-radio label, .criteria-radio-label {
771
+ font-size: 3em !important;
772
+ }
773
+ #participate-btn button {
774
+ font-size: 24px !important; /* Large readable text */
775
+ font-weight: 700 !important; /* Bold for emphasis */
776
+ padding: 28px 40px !important; /* Extra padding for height */
777
+ min-height: 120px !important; /* Make button visibly taller (multi‑line) */
778
+ width: 100% !important; /* Occupy full width of its column */
779
+ white-space: normal !important; /* Allow text to wrap onto multiple lines */
780
+ }
781
  """
782
  with gr.Blocks(css=centered_col_css) as demo:
783
  # States to save information between pages.
 
816
  <p>Welcome to the TxAgent Evaluation Portal.</p>
817
  </div>
818
  """)
819
+ with gr.Row(elem_classes=["center-row"]):
820
+ # Row 1: the two buttons side by side
821
+ with gr.Column(scale=1):
822
+ participate_eval_btn = gr.Button(
823
+ value="Click to 🌟Participate in TxAgent Evaluation 🌟",
824
+ variant="primary",
825
+ size="lg",
826
+ elem_id="participate-btn"
827
+ )
828
+ with gr.Column(scale=1):
829
+ submit_questions_btn = gr.Button(
830
+ value="Click to 🚀 Submit Questions for TxAgent Evaluation 🚀",
831
+ variant="primary",
832
+ size="lg",
833
+ elem_id="submit-btn"
834
+ )
835
+
836
+ with gr.Row(elem_classes=["center-row"]):
837
+ # Row 2: one explanatory text block under each button
838
+ with gr.Column(scale=1):
839
+ gr.Markdown(
840
+ """
841
+ ### Why participate in the evaluation?
842
+ Joining the **TxAgent human evaluation** lets you experience the model in real-time and give immediate feedback.
843
+
844
+ **By clicking the button above, you will:**
845
+ - See how the model responds to a variety of prompts.
846
+ - Provide instant thumbs-up / thumbs-down ratings.
847
+ - Shape the roadmap for upcoming releases.
848
+
849
+ _Thank you for helping us improve!_
850
+ """
851
+ )
852
+ with gr.Column(scale=1):
853
+ gr.Markdown(
854
+ """
855
+ ### Why submit questions?
856
+ Help us build a richer evaluation set by sending unique, challenging prompts.
857
+
858
+ **By clicking the button above, you will:**
859
+ - Highlight edge-cases and identify blind spots.
860
+ - Push TxAgent to reason in new domains.
861
+ - Directly influence future model improvements.
862
+
863
+ _We're excited to see what you come up with!_
864
+ """
865
+ )
866
  gr.HTML(TxAgent_Project_Page_HTML)
867
 
868
  # Define actions for the new buttons
869
  # For the Google Form button, we'll use JavaScript to open a new tab.
870
  # The URL for the Google Form should be replaced with the actual link.
871
+ google_form_url = "https://forms.gle/pYvyvEQQwS5gdupQA"
872
  submit_questions_btn.click(
873
  fn=None,
874
  inputs=None,
 
878
 
879
  # Page 0: Welcome / Informational page.
880
  with gr.Column(visible=False, elem_id="page0") as page0:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
881
  with gr.Row():
882
+ with gr.Column():
883
+ gr.Markdown("## Welcome to the TxAgent Evalution Study!")
884
+ gr.Markdown("Please read the following instructions and then enter your information to begin:")
885
+ # Existing informational markdown...
886
+ gr.Markdown("""
887
+ - Each session requires a minimum commitment of 5-10 minutes to complete one question.
888
+ - If you wish to evaluate multiple questions, you may do so; you will never be asked to re-evaluate questions you have already seen.
889
+ - When evaluating a question, you will be asked to compare the responses of two different models to the question and then rate each model's response on a scale of 1-5.
890
+ - If you feel that a question does not make sense or is not biomedically relevant, there is a RED BUTTON at the top of the first model comparison page to indicate this
891
+ - You may use the Back and Next buttons at the bottom of each page to edit any of your responses before submitting.
892
+ - You may use the Home Page button at the bottom of each page to the home page. Your progress will be saved but not submitted.
893
+ - You must submit your answers to the current question before moving on to evaluate the next question.
894
+ - You may stop in between questions and return at a later time; however, you must submit your answers to the current question if you would like them saved.
895
+ - Please review the example question and LLM model response below:
896
+
897
+ """)
898
+ with open("anatomyofAgentResponse.jpg", "rb") as image_file:
899
+ img = Image.open(image_file)
900
+ new_size = (int(img.width * 0.5), int(img.height * 0.5))
901
+ img = img.resize(new_size, Image.LANCZOS)
902
+ buffer = io.BytesIO()
903
+ img.save(buffer, format="PNG")
904
+ encoded_string = base64.b64encode(buffer.getvalue()).decode("utf-8")
905
+ #encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
906
+
907
+ image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
908
+ ReasoningTraceExampleHTML = f"""
909
+ <div>
910
+ {image_html}
911
+ </div>
912
+ """
913
+ gr.HTML(ReasoningTraceExampleHTML)
914
+ with gr.Column():
915
+ gr.Markdown("## Please enter your information to get a question to evaluate. Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.")
916
+ name = gr.Textbox(label="Name (required)")
917
+ email = gr.Textbox(label="Email (required). Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.")
918
+ specialty_dd = gr.Dropdown(choices=specialties_list, label="Primary Medical Specialty (required). Go to https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categorization)", multiselect=True)
919
+ subspecialty_dd = gr.Dropdown(choices=subspecialties_list, label="Subspecialty (if applicable). Go to https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categorization)", multiselect=True)
920
+ npi_id = gr.Textbox(label="National Provider Identifier ID (optional). Got to https://npiregistry.cms.hhs.gov/search to search for your NPI ID. If you do not have an NPI ID, please leave this blank.")
921
+ years_exp_radio = gr.Radio(
922
+ choices=["0-2 years", "3-5 years", "6-10 years", "11-20 years", "20+ years", "Not Applicable"],
923
+ label="How many years have you been involved in clinical and/or research activities related to your biomedical area of expertise? (required)"
924
+ )
925
+ exp_explanation_tb = gr.Textbox(label="Please briefly explain your expertise/experience relevant to evaluating AI for clinical decision support (optional)")
926
+
927
+ page0_error_box = gr.Markdown("")
928
+ with gr.Row():
929
+ next_btn_0 = gr.Button("Next")
930
+ with gr.Row():
931
+ home_btn_0 = gr.Button("Home") # (your registration info will be saved)
932
+ gr.Markdown("""By clicking 'Next', you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
933
+ """)
934
 
935
 
936
  with Modal(visible=False, elem_id="confirm_modal") as eval_progress_modal:
 
966
  "<span style='color: #b30000; font-weight: bold;'>Click the button if you think this question does not make sense or is not biomedically-relevant</span>",
967
  render=True
968
  )
969
+ with gr.Row():
970
+ unfamiliar_btn = gr.Button(
971
+ "Unfamiliar Question?",
972
+ size="sm",
973
+ variant="stop", # red variant
974
+ elem_id="invalid-question-btn",
975
+ elem_classes=["short-btn"]
976
+ )
977
+ gr.Markdown(
978
+ "<span style='color: #b30000; font-weight: bold;'>Click the button if you are not familiar with this area</span>",
979
+ render=True
980
+ )
981
 
982
+ page1_error_box = gr.Markdown("") # ADDED: display validation errors
 
 
 
 
 
983
 
984
+ # --- Define chat_a and chat_b before using them in outputs ---
985
  with gr.Row():
986
  # ADDED: Use gr.Chatbot to display the scrollable chat window for Response A.
987
  with gr.Column():
 
1017
  pairwise_inputs = []
1018
  for crit in criteria_for_comparison:
1019
  with gr.Row():
1020
+ with gr.Column(scale=1):
1021
+ gr.Markdown(crit['text'], elem_classes="criteria-font-large") # <--- add class here
1022
+ with gr.Column(scale=1):
1023
+ radio = gr.Radio(
1024
+ choices=[
1025
+ "👈 Model A", # A
1026
+ "👉 Model B", # B
1027
+ "🤝 Tie", # tie
1028
+ "👎 Neither model did well" # neither
1029
+ ],
1030
+ label="Which is better?",
1031
+ elem_classes="criteria-radio-label" # <--- add class here
1032
+ )
1033
  pairwise_inputs.append(radio)
1034
  # ADDED: free text under each comparison
1035
  text_input = gr.Textbox(label=f"Reasons for your selection (optional)")
1036
  comparison_reasons_inputs.append(text_input)
1037
 
1038
+
1039
  with gr.Row():
1040
  back_btn_0 = gr.Button("Back")
1041
  next_btn_1 = gr.Button("Next: Rate Responses")
 
1174
  indices_for_change.append(index_component)
1175
 
1176
  with gr.Column(elem_id="centered-column"):
1177
+ gr.Markdown(f'<div style="text-align: left;">{crit["text"][0]}</div>', elem_classes="criteria-font-large")
1178
+ gr.Markdown(f'<div style="text-align: left;">{crit["text"][1]}</div>', elem_classes="criteria-font-large")
1179
  pairwise_results_for_display[i].render()
1180
  with gr.Row():
1181
  with gr.Column(scale=1):
1182
  rating_a = gr.Radio(choices=["1", "2", "3", "4", "5", "Unable to Judge"],
1183
  label=f"Score for Response A - {crit['label']}",
1184
+ interactive=True,
1185
+ elem_classes="criteria-radio-label")
1186
  with gr.Column(scale=1):
1187
  rating_b = gr.Radio(choices=["1", "2", "3", "4", "5", "Unable to Judge"],
1188
  label=f"Score for Response B - {crit['label']}",
1189
+ interactive=True,
1190
+ elem_classes="criteria-radio-label")
1191
  with gr.Row():
1192
  clear_btn = gr.Button("Clear Selection", size="sm", elem_id="clear_btn")
1193
  clear_btn.click(fn=clear_selection, outputs=[rating_a,rating_b])
 
1231
  with gr.Row():
1232
  yes_btn = gr.Button("Yes, please submit")
1233
  cancel_btn = gr.Button("Cancel")
1234
+
1235
+
1236
  # --- Define Callback Functions for Confirmation Flow ---
1237
  def build_row_dict(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args):
1238
  num_criteria = len(criteria)
 
1400
  inputs=None,
1401
  outputs=[page0, page1]
1402
  )
1403
+
1404
+ # Skip the current question and load a new one when the evaluator flags it
1405
+ nonsense_btn.click(
1406
+ fn=flag_nonsense_and_skip,
1407
+ inputs=[user_info_state],
1408
+ outputs=[user_info_state, page1_error_box, chat_a, chat_b,
1409
+ page1_prompt, page1_reference_answer, data_subset_state],
1410
+ scroll_to_output=True
1411
+ )
1412
+
1413
+ unfamiliar_btn.click(
1414
+ fn=skip_current_question,
1415
+ inputs=[user_info_state],
1416
+ outputs=[user_info_state, page1_error_box, chat_a, chat_b,
1417
+ page1_prompt, page1_reference_answer, data_subset_state],
1418
+ scroll_to_output=True
1419
+ )
1420
+
1421
 
1422
  # Transition from Page 1 (Pairwise) to the combined Rating Page (Page 2).
1423
  next_btn_1.click(
 
1443
  gr.update(), # Show page 3
1444
  gr.update(), # Hide final page
1445
  gr.update(visible=True), # Show confirmation modal
1446
+ gr.update(visible=False), # Hide error modal
1447
  gr.update(value="") # EDIT: Clear the error_message_box
1448
  )
1449
  else:
 
1538
  ]
1539
  )
1540
 
1541
+
1542
+
1543
  demo.launch(share=True, allowed_paths = ["."])
index.html CHANGED
@@ -150,7 +150,7 @@
150
  <div class="container is-max-desktop">
151
  <div class="columns is-centered has-text-centered">
152
  <div class="column is-four-fifths">
153
- <h2 class="title is-3">Abstract</h2>
154
  <div class="content has-text-justified">
155
  <p>
156
  Precision therapeutics require multimodal adaptive models that
 
150
  <div class="container is-max-desktop">
151
  <div class="columns is-centered has-text-centered">
152
  <div class="column is-four-fifths">
153
+ <h2 class="title is-3">What is TxAgent?</h2>
154
  <div class="content has-text-justified">
155
  <p>
156
  Precision therapeutics require multimodal adaptive models that
utils.py CHANGED
@@ -141,6 +141,7 @@ def append_to_sheet(user_data=None, custom_row_dict=None, custom_sheet_name=None
141
  """
142
  Append a new row to a Google Sheet. If 'custom_row' is provided, append that row.
143
  Otherwise, append a default row constructed from the provided user_data.
 
144
  """
145
  if custom_sheet_name is None:
146
  custom_sheet_name = GSHEET_NAME
@@ -165,6 +166,7 @@ def append_to_sheet(user_data=None, custom_row_dict=None, custom_sheet_name=None
165
  existing_values = sheet.get_all_values()
166
  is_empty = (existing_values == [[]]) #indicates empty spreadsheet that was cleared in the past
167
 
 
168
  if (is_new or is_empty) and add_header_when_create_sheet:
169
  # headers come from the keys of our row dict
170
  if custom_row_dict is not None:
@@ -172,9 +174,14 @@ def append_to_sheet(user_data=None, custom_row_dict=None, custom_sheet_name=None
172
  else:
173
  headers = list(user_data.keys())
174
  sheet.append_row(headers)
175
-
 
 
 
 
176
  if custom_row_dict is not None:
177
- custom_row = [custom_row_dict.get(header) for header in list(custom_row_dict.keys())]
 
178
  else:
179
  # Construct the default row with a timestamp and user_data fields
180
  custom_row = [str(datetime.datetime.now()), user_data["question"], user_data["final_answer"], user_data["trace"]]
 
141
  """
142
  Append a new row to a Google Sheet. If 'custom_row' is provided, append that row.
143
  Otherwise, append a default row constructed from the provided user_data.
144
+ Ensures that each value is aligned with the correct column header.
145
  """
146
  if custom_sheet_name is None:
147
  custom_sheet_name = GSHEET_NAME
 
166
  existing_values = sheet.get_all_values()
167
  is_empty = (existing_values == [[]]) #indicates empty spreadsheet that was cleared in the past
168
 
169
+ # --- Always ensure header row is present and get headers ---
170
  if (is_new or is_empty) and add_header_when_create_sheet:
171
  # headers come from the keys of our row dict
172
  if custom_row_dict is not None:
 
174
  else:
175
  headers = list(user_data.keys())
176
  sheet.append_row(headers)
177
+ else:
178
+ # Read headers from the first row of the sheet
179
+ headers = sheet.row_values(1) if sheet.row_count > 0 else []
180
+
181
+ # --- Build row aligned to headers ---
182
  if custom_row_dict is not None:
183
+ # Ensure all values are aligned to headers, fill missing with ""
184
+ custom_row = [custom_row_dict.get(header, "") for header in headers]
185
  else:
186
  # Construct the default row with a timestamp and user_data fields
187
  custom_row = [str(datetime.datetime.now()), user_data["question"], user_data["final_answer"], user_data["trace"]]