Spaces:

agenticx
/

TxAgentCrowdsourcingEval

Running

App Files Community

shgao committed on Jun 18

Commit

9b0e7e4

1 Parent(s): f84f9ce

update

Browse files

Files changed (2) hide show

app.py +68 -42
utils.py +8 -8

app.py CHANGED Viewed

@@ -124,7 +124,7 @@ criteria = [
         ]
     },
     {
-        "label": "Justification helpfulness",
         "text": "Is the model’s rationale helpful in determining whether the answer is correct?",
         "scores": [
             "1 No usable rationale. ",
@@ -218,7 +218,7 @@ criteria_for_comparison = [
         )
     },
     {
-        "label": "Justification helpfulness",
         "text": (
             "Which response offers a clearer, more detailed rationale that genuinely aids you in judging whether the answer is correct?"
         )
@@ -482,7 +482,7 @@ def get_next_eval_question(
     prompt_text = question_for_eval['question']
     page1_prompt = gr.HTML(
-        f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Prompt:</strong> {prompt_text}</div>')
     page1_reference_answer = gr.Markdown(txagent_matched_entry.get(
         "correct_answer")) if include_correct_answer else None
     chat_a_answer = gr.Chatbot(
@@ -513,7 +513,7 @@ def get_next_eval_question(
         value=chat_A_reasoning,
         type="messages",
         height=300,
-        label="Model A Reasoning",
         show_copy_button=False,
         show_label=True,
         render_markdown=True,
@@ -525,7 +525,7 @@ def get_next_eval_question(
         value=chat_B_reasoning,
         type="messages",
         height=300,
-        label="Model B Reasoning",
         show_copy_button=False,
         show_label=True,
         render_markdown=True,
@@ -668,7 +668,7 @@ def skip_current_question(user_info_state, our_methods: list = our_methods):
     prompt_html = (
         f"<div style='background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; "
-        f"border-radius: 5px; color: black;'><strong style='color: black;'>Prompt:</strong> "
         f"{question_for_eval['question']}</div>"
     )
     reference_md = question_for_eval.get("correct_answer", "")
@@ -1088,6 +1088,32 @@ centered_col_css = """
     width: 100% !important;          /* Occupy full width of its column */
     white-space: normal !important;  /* Allow text to wrap onto multiple lines */
 }
 """
 with gr.Blocks(css=centered_col_css) as demo:
     # States to save information between pages.
@@ -1217,41 +1243,41 @@ with gr.Blocks(css=centered_col_css) as demo:
             next_btn_0 = gr.Button("Next")
         gr.Markdown("""By clicking 'Next', you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
             """)
-        gr.Markdown("""
-                ## Instructions:
-                Please review these instructions and enter your information to begin:
-                - Each session requires at least 5-10 minutes per question.
-                - You can evaluate multiple questions; you will not repeat evaluations.
-                - For each question, compare responses from two models and rate them (scale: 1-5).
-                - If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page.
-                - Use the Back and Next buttons to edit responses before submission.
-                - Use the Home Page button to return to the homepage; progress will save but not submit.
-                - Submit answers to the current question before moving to the next.
-                - You can pause between questions and return later; ensure current answers are submitted to save them.
-            """)
-        with open("anatomyofAgentResponse.jpg", "rb") as image_file:
-            img = Image.open(image_file)
-            new_size = (int(img.width * 0.5), int(img.height * 0.5))
-            img = img.resize(new_size, Image.LANCZOS)
-            buffer = io.BytesIO()
-            img.save(buffer, format="PNG")
-            encoded_string = base64.b64encode(
-                buffer.getvalue()).decode("utf-8")
-        image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
-        ReasoningTraceExampleHTML = f"""
-            <div>
-                {image_html}
-            </div>
-            """
-        gr.HTML(ReasoningTraceExampleHTML)
     # Page 1: Pairwise Comparison.
     with gr.Column(visible=False) as page1:
         # Make the number controlled by question indexing!
-        gr.Markdown("Comparison")
         # Add small red button and comments text box in the same row
         page1_prompt = gr.HTML()
         with gr.Row():
@@ -1264,7 +1290,7 @@ with gr.Blocks(css=centered_col_css) as demo:
                 scale=1
             )
             skip_comments = gr.Textbox(
-                placeholder="(Optional) Comments about why you're skipping this question...",
                 show_label=False,
                 scale=3,
                 container=False,
@@ -1293,7 +1319,7 @@ with gr.Blocks(css=centered_col_css) as demo:
                     value=[],
                     type="messages",
                     height=300,
-                    label="Model A Reasoning",
                     show_copy_button=False,
                     show_label=True,
                     render_markdown=True,
@@ -1319,7 +1345,7 @@ with gr.Blocks(css=centered_col_css) as demo:
                     value=[],
                     type="messages",
                     height=300,
-                    label="Model B Reasoning",
                     show_copy_button=False,
                     show_label=True,
                     render_markdown=True,
@@ -1366,12 +1392,12 @@ with gr.Blocks(css=centered_col_css) as demo:
                     rating_a = gr.Radio(choices=sorted(crit_score["scores"]),  # ["1", "2", "3", "4", "5", "Unable to Judge"],
                                         label=f"Response A - {crit_score['text']}",
                                         interactive=True,
-                                        elem_classes="criteria-radio-label")
                 with gr.Column(scale=1):
                     rating_b = gr.Radio(choices=sorted(crit_score["scores"]),  # ["1", "2", "3", "4", "5", "Unable to Judge"],
                                         label=f"Response B - {crit_score['text']}",
                                         interactive=True,
-                                        elem_classes="criteria-radio-label")
             # Add clear button and wire up the restrictions
             with gr.Row():

         ]
     },
     {
+        "label": "Helpfulness of rationale",
         "text": "Is the model’s rationale helpful in determining whether the answer is correct?",
         "scores": [
             "1 No usable rationale. ",
         )
     },
     {
+        "label": "Helpfulness of rationale",
         "text": (
             "Which response offers a clearer, more detailed rationale that genuinely aids you in judging whether the answer is correct?"
         )
     prompt_text = question_for_eval['question']
     page1_prompt = gr.HTML(
+        f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Question:</strong> {prompt_text}</div>')
     page1_reference_answer = gr.Markdown(txagent_matched_entry.get(
         "correct_answer")) if include_correct_answer else None
     chat_a_answer = gr.Chatbot(
         value=chat_A_reasoning,
         type="messages",
         height=300,
+        label="Model A Reasoning - Rationale",
         show_copy_button=False,
         show_label=True,
         render_markdown=True,
         value=chat_B_reasoning,
         type="messages",
         height=300,
+        label="Model B Reasoning - Rationale",
         show_copy_button=False,
         show_label=True,
         render_markdown=True,
     prompt_html = (
         f"<div style='background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; "
+        f"border-radius: 5px; color: black;'><strong style='color: black;'>Question:</strong> "
         f"{question_for_eval['question']}</div>"
     )
     reference_md = question_for_eval.get("correct_answer", "")
     width: 100% !important;          /* Occupy full width of its column */
     white-space: normal !important;  /* Allow text to wrap onto multiple lines */
 }
+.criteria-radio-score-label [role="radiogroup"],
+.criteria-radio-score-label .gr-radio-group,
+.criteria-radio-score-label .flex {
+    display: flex !important;
+    flex-direction: column !important;
+    gap: 4px !important;                 /* 行间距，可按需调整 */
+}
+/* 更具体的选择器来确保垂直布局 */
+.criteria-radio-score-label fieldset {
+    display: flex !important;
+    flex-direction: column !important;
+    gap: 4px !important;
+}
+.criteria-radio-score-label .wrap {
+    display: flex !important;
+    flex-direction: column !important;
+    gap: 4px !important;
+}
+/* 确保每个单选按钮选项垂直排列 */
+.criteria-radio-score-label label {
+    display: block !important;
+    margin-bottom: 4px !important;
+}
 """
 with gr.Blocks(css=centered_col_css) as demo:
     # States to save information between pages.
             next_btn_0 = gr.Button("Next")
         gr.Markdown("""By clicking 'Next', you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
             """)
+        # with open("anatomyofAgentResponse.jpg", "rb") as image_file:
+        #     img = Image.open(image_file)
+        #     new_size = (int(img.width * 0.5), int(img.height * 0.5))
+        #     img = img.resize(new_size, Image.LANCZOS)
+        #     buffer = io.BytesIO()
+        #     img.save(buffer, format="PNG")
+        #     encoded_string = base64.b64encode(
+        #         buffer.getvalue()).decode("utf-8")
+        # image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
+        # ReasoningTraceExampleHTML = f"""
+        #     <div>
+        #         {image_html}
+        #     </div>
+        #     """
+        # gr.HTML(ReasoningTraceExampleHTML)
     # Page 1: Pairwise Comparison.
     with gr.Column(visible=False) as page1:
+        with gr.Accordion("Instructions", open=False):
+            gr.Markdown("""
+                    ## Instructions:
+                    Please review these instructions and enter your information to begin:
+                    - Each session requires at least 5-10 minutes per question.
+                    - You can evaluate multiple questions; you will not repeat evaluations.
+                    - For each question, compare responses from two models and rate them (scale: 1-5).
+                    - If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page.
+                    - Use the Back and Next buttons to edit responses before submission.
+                    - Use the Home Page button to return to the homepage; progress will save but not submit.
+                    - Submit answers to the current question before moving to the next.
+                    - You can pause between questions and return later; ensure current answers are submitted to save them.
+                """)
         # Make the number controlled by question indexing!
+        # gr.Markdown("Comparison")
         # Add small red button and comments text box in the same row
         page1_prompt = gr.HTML()
         with gr.Row():
                 scale=1
             )
             skip_comments = gr.Textbox(
+                placeholder="(Optional) Why do you want to skip this question...",
                 show_label=False,
                 scale=3,
                 container=False,
                     value=[],
                     type="messages",
                     height=300,
+                    label="Model A Reasoning - Rationale",
                     show_copy_button=False,
                     show_label=True,
                     render_markdown=True,
                     value=[],
                     type="messages",
                     height=300,
+                    label="Model B Reasoning - Rationale",
                     show_copy_button=False,
                     show_label=True,
                     render_markdown=True,
                     rating_a = gr.Radio(choices=sorted(crit_score["scores"]),  # ["1", "2", "3", "4", "5", "Unable to Judge"],
                                         label=f"Response A - {crit_score['text']}",
                                         interactive=True,
+                                        elem_classes="criteria-radio-score-label")
                 with gr.Column(scale=1):
                     rating_b = gr.Radio(choices=sorted(crit_score["scores"]),  # ["1", "2", "3", "4", "5", "Unable to Judge"],
                                         label=f"Response B - {crit_score['text']}",
                                         interactive=True,
+                                        elem_classes="criteria-radio-score-label")
             # Add clear button and wire up the restrictions
             with gr.Row():

utils.py CHANGED Viewed

@@ -248,14 +248,14 @@ def format_chat(response, tool_database_labels):
             # Clear after rendering
             last_tool_calls = []
-    if chat_history:
-        last_msg = chat_history[-1]
-        if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:
-            # Find the first assistant message
-            for msg in chat_history:
-                if msg.role == "assistant" and isinstance(msg.content, str):
-                    msg.content = "**Reasoning:**\n" + msg.content
-                    break
     if chat_history:
         last_msg = chat_history[-1]
         if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:

             # Clear after rendering
             last_tool_calls = []
+    # if chat_history:
+    #     last_msg = chat_history[-1]
+    #     if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:
+    #         # Find the first assistant message
+    #         for msg in chat_history:
+    #             if msg.role == "assistant" and isinstance(msg.content, str):
+    #                 msg.content = "**Reasoning:**\n" + msg.content
+    #                 break
     if chat_history:
         last_msg = chat_history[-1]
         if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content: