shgao committed on
Commit
9b0e7e4
·
1 Parent(s): f84f9ce
Files changed (2) hide show
  1. app.py +68 -42
  2. utils.py +8 -8
app.py CHANGED
@@ -124,7 +124,7 @@ criteria = [
124
  ]
125
  },
126
  {
127
- "label": "Justification helpfulness",
128
  "text": "Is the model’s rationale helpful in determining whether the answer is correct?",
129
  "scores": [
130
  "1 No usable rationale. ",
@@ -218,7 +218,7 @@ criteria_for_comparison = [
218
  )
219
  },
220
  {
221
- "label": "Justification helpfulness",
222
  "text": (
223
  "Which response offers a clearer, more detailed rationale that genuinely aids you in judging whether the answer is correct?"
224
  )
@@ -482,7 +482,7 @@ def get_next_eval_question(
482
  prompt_text = question_for_eval['question']
483
 
484
  page1_prompt = gr.HTML(
485
- f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Prompt:</strong> {prompt_text}</div>')
486
  page1_reference_answer = gr.Markdown(txagent_matched_entry.get(
487
  "correct_answer")) if include_correct_answer else None
488
  chat_a_answer = gr.Chatbot(
@@ -513,7 +513,7 @@ def get_next_eval_question(
513
  value=chat_A_reasoning,
514
  type="messages",
515
  height=300,
516
- label="Model A Reasoning",
517
  show_copy_button=False,
518
  show_label=True,
519
  render_markdown=True,
@@ -525,7 +525,7 @@ def get_next_eval_question(
525
  value=chat_B_reasoning,
526
  type="messages",
527
  height=300,
528
- label="Model B Reasoning",
529
  show_copy_button=False,
530
  show_label=True,
531
  render_markdown=True,
@@ -668,7 +668,7 @@ def skip_current_question(user_info_state, our_methods: list = our_methods):
668
 
669
  prompt_html = (
670
  f"<div style='background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; "
671
- f"border-radius: 5px; color: black;'><strong style='color: black;'>Prompt:</strong> "
672
  f"{question_for_eval['question']}</div>"
673
  )
674
  reference_md = question_for_eval.get("correct_answer", "")
@@ -1088,6 +1088,32 @@ centered_col_css = """
1088
  width: 100% !important; /* Occupy full width of its column */
1089
  white-space: normal !important; /* Allow text to wrap onto multiple lines */
1090
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1091
  """
1092
  with gr.Blocks(css=centered_col_css) as demo:
1093
  # States to save information between pages.
@@ -1217,41 +1243,41 @@ with gr.Blocks(css=centered_col_css) as demo:
1217
  next_btn_0 = gr.Button("Next")
1218
  gr.Markdown("""By clicking 'Next', you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
1219
  """)
1220
-
1221
- gr.Markdown("""
1222
- ## Instructions:
1223
- Please review these instructions and enter your information to begin:
1224
-
1225
- - Each session requires at least 5-10 minutes per question.
1226
- - You can evaluate multiple questions; you will not repeat evaluations.
1227
- - For each question, compare responses from two models and rate them (scale: 1-5).
1228
- - If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page.
1229
- - Use the Back and Next buttons to edit responses before submission.
1230
- - Use the Home Page button to return to the homepage; progress will save but not submit.
1231
- - Submit answers to the current question before moving to the next.
1232
- - You can pause between questions and return later; ensure current answers are submitted to save them.
1233
- """)
1234
- with open("anatomyofAgentResponse.jpg", "rb") as image_file:
1235
- img = Image.open(image_file)
1236
- new_size = (int(img.width * 0.5), int(img.height * 0.5))
1237
- img = img.resize(new_size, Image.LANCZOS)
1238
- buffer = io.BytesIO()
1239
- img.save(buffer, format="PNG")
1240
- encoded_string = base64.b64encode(
1241
- buffer.getvalue()).decode("utf-8")
1242
-
1243
- image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
1244
- ReasoningTraceExampleHTML = f"""
1245
- <div>
1246
- {image_html}
1247
- </div>
1248
- """
1249
- gr.HTML(ReasoningTraceExampleHTML)
1250
 
1251
  # Page 1: Pairwise Comparison.
1252
  with gr.Column(visible=False) as page1:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1253
  # Make the number controlled by question indexing!
1254
- gr.Markdown("Comparison")
1255
  # Add small red button and comments text box in the same row
1256
  page1_prompt = gr.HTML()
1257
  with gr.Row():
@@ -1264,7 +1290,7 @@ with gr.Blocks(css=centered_col_css) as demo:
1264
  scale=1
1265
  )
1266
  skip_comments = gr.Textbox(
1267
- placeholder="(Optional) Comments about why you're skipping this question...",
1268
  show_label=False,
1269
  scale=3,
1270
  container=False,
@@ -1293,7 +1319,7 @@ with gr.Blocks(css=centered_col_css) as demo:
1293
  value=[],
1294
  type="messages",
1295
  height=300,
1296
- label="Model A Reasoning",
1297
  show_copy_button=False,
1298
  show_label=True,
1299
  render_markdown=True,
@@ -1319,7 +1345,7 @@ with gr.Blocks(css=centered_col_css) as demo:
1319
  value=[],
1320
  type="messages",
1321
  height=300,
1322
- label="Model B Reasoning",
1323
  show_copy_button=False,
1324
  show_label=True,
1325
  render_markdown=True,
@@ -1366,12 +1392,12 @@ with gr.Blocks(css=centered_col_css) as demo:
1366
  rating_a = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"],
1367
  label=f"Response A - {crit_score['text']}",
1368
  interactive=True,
1369
- elem_classes="criteria-radio-label")
1370
  with gr.Column(scale=1):
1371
  rating_b = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"],
1372
  label=f"Response B - {crit_score['text']}",
1373
  interactive=True,
1374
- elem_classes="criteria-radio-label")
1375
 
1376
  # Add clear button and wire up the restrictions
1377
  with gr.Row():
 
124
  ]
125
  },
126
  {
127
+ "label": "Helpfulness of rationale",
128
  "text": "Is the model’s rationale helpful in determining whether the answer is correct?",
129
  "scores": [
130
  "1 No usable rationale. ",
 
218
  )
219
  },
220
  {
221
+ "label": "Helpfulness of rationale",
222
  "text": (
223
  "Which response offers a clearer, more detailed rationale that genuinely aids you in judging whether the answer is correct?"
224
  )
 
482
  prompt_text = question_for_eval['question']
483
 
484
  page1_prompt = gr.HTML(
485
+ f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Question:</strong> {prompt_text}</div>')
486
  page1_reference_answer = gr.Markdown(txagent_matched_entry.get(
487
  "correct_answer")) if include_correct_answer else None
488
  chat_a_answer = gr.Chatbot(
 
513
  value=chat_A_reasoning,
514
  type="messages",
515
  height=300,
516
+ label="Model A Reasoning - Rationale",
517
  show_copy_button=False,
518
  show_label=True,
519
  render_markdown=True,
 
525
  value=chat_B_reasoning,
526
  type="messages",
527
  height=300,
528
+ label="Model B Reasoning - Rationale",
529
  show_copy_button=False,
530
  show_label=True,
531
  render_markdown=True,
 
668
 
669
  prompt_html = (
670
  f"<div style='background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; "
671
+ f"border-radius: 5px; color: black;'><strong style='color: black;'>Question:</strong> "
672
  f"{question_for_eval['question']}</div>"
673
  )
674
  reference_md = question_for_eval.get("correct_answer", "")
 
1088
  width: 100% !important; /* Occupy full width of its column */
1089
  white-space: normal !important; /* Allow text to wrap onto multiple lines */
1090
  }
1091
+ .criteria-radio-score-label [role="radiogroup"],
1092
+ .criteria-radio-score-label .gr-radio-group,
1093
+ .criteria-radio-score-label .flex {
1094
+ display: flex !important;
1095
+ flex-direction: column !important;
1096
+ gap: 4px !important; /* Row spacing; adjust as needed */
1097
+ }
1098
+
1099
+ /* More specific selectors to ensure a vertical layout */
1100
+ .criteria-radio-score-label fieldset {
1101
+ display: flex !important;
1102
+ flex-direction: column !important;
1103
+ gap: 4px !important;
1104
+ }
1105
+
1106
+ .criteria-radio-score-label .wrap {
1107
+ display: flex !important;
1108
+ flex-direction: column !important;
1109
+ gap: 4px !important;
1110
+ }
1111
+
1112
+ /* Ensure each radio button option is stacked vertically */
1113
+ .criteria-radio-score-label label {
1114
+ display: block !important;
1115
+ margin-bottom: 4px !important;
1116
+ }
1117
  """
1118
  with gr.Blocks(css=centered_col_css) as demo:
1119
  # States to save information between pages.
 
1243
  next_btn_0 = gr.Button("Next")
1244
  gr.Markdown("""By clicking 'Next', you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
1245
  """)
1246
+ # with open("anatomyofAgentResponse.jpg", "rb") as image_file:
1247
+ # img = Image.open(image_file)
1248
+ # new_size = (int(img.width * 0.5), int(img.height * 0.5))
1249
+ # img = img.resize(new_size, Image.LANCZOS)
1250
+ # buffer = io.BytesIO()
1251
+ # img.save(buffer, format="PNG")
1252
+ # encoded_string = base64.b64encode(
1253
+ # buffer.getvalue()).decode("utf-8")
1254
+
1255
+ # image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
1256
+ # ReasoningTraceExampleHTML = f"""
1257
+ # <div>
1258
+ # {image_html}
1259
+ # </div>
1260
+ # """
1261
+ # gr.HTML(ReasoningTraceExampleHTML)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1262
 
1263
  # Page 1: Pairwise Comparison.
1264
  with gr.Column(visible=False) as page1:
1265
+ with gr.Accordion("Instructions", open=False):
1266
+ gr.Markdown("""
1267
+ ## Instructions:
1268
+ Please review these instructions and enter your information to begin:
1269
+
1270
+ - Each session requires at least 5-10 minutes per question.
1271
+ - You can evaluate multiple questions; you will not repeat evaluations.
1272
+ - For each question, compare responses from two models and rate them (scale: 1-5).
1273
+ - If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page.
1274
+ - Use the Back and Next buttons to edit responses before submission.
1275
+ - Use the Home Page button to return to the homepage; progress will save but not submit.
1276
+ - Submit answers to the current question before moving to the next.
1277
+ - You can pause between questions and return later; ensure current answers are submitted to save them.
1278
+ """)
1279
  # Make the number controlled by question indexing!
1280
+ # gr.Markdown("Comparison")
1281
  # Add small red button and comments text box in the same row
1282
  page1_prompt = gr.HTML()
1283
  with gr.Row():
 
1290
  scale=1
1291
  )
1292
  skip_comments = gr.Textbox(
1293
+ placeholder="(Optional) Why do you want to skip this question...",
1294
  show_label=False,
1295
  scale=3,
1296
  container=False,
 
1319
  value=[],
1320
  type="messages",
1321
  height=300,
1322
+ label="Model A Reasoning - Rationale",
1323
  show_copy_button=False,
1324
  show_label=True,
1325
  render_markdown=True,
 
1345
  value=[],
1346
  type="messages",
1347
  height=300,
1348
+ label="Model B Reasoning - Rationale",
1349
  show_copy_button=False,
1350
  show_label=True,
1351
  render_markdown=True,
 
1392
  rating_a = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"],
1393
  label=f"Response A - {crit_score['text']}",
1394
  interactive=True,
1395
+ elem_classes="criteria-radio-score-label")
1396
  with gr.Column(scale=1):
1397
  rating_b = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"],
1398
  label=f"Response B - {crit_score['text']}",
1399
  interactive=True,
1400
+ elem_classes="criteria-radio-score-label")
1401
 
1402
  # Add clear button and wire up the restrictions
1403
  with gr.Row():
utils.py CHANGED
@@ -248,14 +248,14 @@ def format_chat(response, tool_database_labels):
248
  # Clear after rendering
249
  last_tool_calls = []
250
 
251
- if chat_history:
252
- last_msg = chat_history[-1]
253
- if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:
254
- # Find the first assistant message
255
- for msg in chat_history:
256
- if msg.role == "assistant" and isinstance(msg.content, str):
257
- msg.content = "**Reasoning:**\n" + msg.content
258
- break
259
  if chat_history:
260
  last_msg = chat_history[-1]
261
  if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:
 
248
  # Clear after rendering
249
  last_tool_calls = []
250
 
251
+ # if chat_history:
252
+ # last_msg = chat_history[-1]
253
+ # if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:
254
+ # # Find the first assistant message
255
+ # for msg in chat_history:
256
+ # if msg.role == "assistant" and isinstance(msg.content, str):
257
+ # msg.content = "**Reasoning:**\n" + msg.content
258
+ # break
259
  if chat_history:
260
  last_msg = chat_history[-1]
261
  if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content: