update
Browse files
app.py
CHANGED
@@ -124,7 +124,7 @@ criteria = [
|
|
124 |
]
|
125 |
},
|
126 |
{
|
127 |
-
"label": "
|
128 |
"text": "Is the model’s rationale helpful in determining whether the answer is correct?",
|
129 |
"scores": [
|
130 |
"1 No usable rationale. ",
|
@@ -218,7 +218,7 @@ criteria_for_comparison = [
|
|
218 |
)
|
219 |
},
|
220 |
{
|
221 |
-
"label": "
|
222 |
"text": (
|
223 |
"Which response offers a clearer, more detailed rationale that genuinely aids you in judging whether the answer is correct?"
|
224 |
)
|
@@ -482,7 +482,7 @@ def get_next_eval_question(
|
|
482 |
prompt_text = question_for_eval['question']
|
483 |
|
484 |
page1_prompt = gr.HTML(
|
485 |
-
f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">
|
486 |
page1_reference_answer = gr.Markdown(txagent_matched_entry.get(
|
487 |
"correct_answer")) if include_correct_answer else None
|
488 |
chat_a_answer = gr.Chatbot(
|
@@ -513,7 +513,7 @@ def get_next_eval_question(
|
|
513 |
value=chat_A_reasoning,
|
514 |
type="messages",
|
515 |
height=300,
|
516 |
-
label="Model A Reasoning",
|
517 |
show_copy_button=False,
|
518 |
show_label=True,
|
519 |
render_markdown=True,
|
@@ -525,7 +525,7 @@ def get_next_eval_question(
|
|
525 |
value=chat_B_reasoning,
|
526 |
type="messages",
|
527 |
height=300,
|
528 |
-
label="Model B Reasoning",
|
529 |
show_copy_button=False,
|
530 |
show_label=True,
|
531 |
render_markdown=True,
|
@@ -668,7 +668,7 @@ def skip_current_question(user_info_state, our_methods: list = our_methods):
|
|
668 |
|
669 |
prompt_html = (
|
670 |
f"<div style='background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; "
|
671 |
-
f"border-radius: 5px; color: black;'><strong style='color: black;'>
|
672 |
f"{question_for_eval['question']}</div>"
|
673 |
)
|
674 |
reference_md = question_for_eval.get("correct_answer", "")
|
@@ -1088,6 +1088,32 @@ centered_col_css = """
|
|
1088 |
width: 100% !important; /* Occupy full width of its column */
|
1089 |
white-space: normal !important; /* Allow text to wrap onto multiple lines */
|
1090 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1091 |
"""
|
1092 |
with gr.Blocks(css=centered_col_css) as demo:
|
1093 |
# States to save information between pages.
|
@@ -1217,41 +1243,41 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
1217 |
next_btn_0 = gr.Button("Next")
|
1218 |
gr.Markdown("""By clicking 'Next', you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
|
1219 |
""")
|
1220 |
-
|
1221 |
-
|
1222 |
-
|
1223 |
-
|
1224 |
-
|
1225 |
-
|
1226 |
-
|
1227 |
-
|
1228 |
-
|
1229 |
-
|
1230 |
-
|
1231 |
-
|
1232 |
-
|
1233 |
-
|
1234 |
-
|
1235 |
-
|
1236 |
-
new_size = (int(img.width * 0.5), int(img.height * 0.5))
|
1237 |
-
img = img.resize(new_size, Image.LANCZOS)
|
1238 |
-
buffer = io.BytesIO()
|
1239 |
-
img.save(buffer, format="PNG")
|
1240 |
-
encoded_string = base64.b64encode(
|
1241 |
-
buffer.getvalue()).decode("utf-8")
|
1242 |
-
|
1243 |
-
image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
|
1244 |
-
ReasoningTraceExampleHTML = f"""
|
1245 |
-
<div>
|
1246 |
-
{image_html}
|
1247 |
-
</div>
|
1248 |
-
"""
|
1249 |
-
gr.HTML(ReasoningTraceExampleHTML)
|
1250 |
|
1251 |
# Page 1: Pairwise Comparison.
|
1252 |
with gr.Column(visible=False) as page1:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1253 |
# Make the number controlled by question indexing!
|
1254 |
-
gr.Markdown("Comparison")
|
1255 |
# Add small red button and comments text box in the same row
|
1256 |
page1_prompt = gr.HTML()
|
1257 |
with gr.Row():
|
@@ -1264,7 +1290,7 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
1264 |
scale=1
|
1265 |
)
|
1266 |
skip_comments = gr.Textbox(
|
1267 |
-
placeholder="(Optional)
|
1268 |
show_label=False,
|
1269 |
scale=3,
|
1270 |
container=False,
|
@@ -1293,7 +1319,7 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
1293 |
value=[],
|
1294 |
type="messages",
|
1295 |
height=300,
|
1296 |
-
label="Model A Reasoning",
|
1297 |
show_copy_button=False,
|
1298 |
show_label=True,
|
1299 |
render_markdown=True,
|
@@ -1319,7 +1345,7 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
1319 |
value=[],
|
1320 |
type="messages",
|
1321 |
height=300,
|
1322 |
-
label="Model B Reasoning",
|
1323 |
show_copy_button=False,
|
1324 |
show_label=True,
|
1325 |
render_markdown=True,
|
@@ -1366,12 +1392,12 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
1366 |
rating_a = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"],
|
1367 |
label=f"Response A - {crit_score['text']}",
|
1368 |
interactive=True,
|
1369 |
-
elem_classes="criteria-radio-label")
|
1370 |
with gr.Column(scale=1):
|
1371 |
rating_b = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"],
|
1372 |
label=f"Response B - {crit_score['text']}",
|
1373 |
interactive=True,
|
1374 |
-
elem_classes="criteria-radio-label")
|
1375 |
|
1376 |
# Add clear button and wire up the restrictions
|
1377 |
with gr.Row():
|
|
|
124 |
]
|
125 |
},
|
126 |
{
|
127 |
+
"label": "Helpfulness of rationale",
|
128 |
"text": "Is the model’s rationale helpful in determining whether the answer is correct?",
|
129 |
"scores": [
|
130 |
"1 No usable rationale. ",
|
|
|
218 |
)
|
219 |
},
|
220 |
{
|
221 |
+
"label": "Helpfulness of rationale",
|
222 |
"text": (
|
223 |
"Which response offers a clearer, more detailed rationale that genuinely aids you in judging whether the answer is correct?"
|
224 |
)
|
|
|
482 |
prompt_text = question_for_eval['question']
|
483 |
|
484 |
page1_prompt = gr.HTML(
|
485 |
+
f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Question:</strong> {prompt_text}</div>')
|
486 |
page1_reference_answer = gr.Markdown(txagent_matched_entry.get(
|
487 |
"correct_answer")) if include_correct_answer else None
|
488 |
chat_a_answer = gr.Chatbot(
|
|
|
513 |
value=chat_A_reasoning,
|
514 |
type="messages",
|
515 |
height=300,
|
516 |
+
label="Model A Reasoning - Rationale",
|
517 |
show_copy_button=False,
|
518 |
show_label=True,
|
519 |
render_markdown=True,
|
|
|
525 |
value=chat_B_reasoning,
|
526 |
type="messages",
|
527 |
height=300,
|
528 |
+
label="Model B Reasoning - Rationale",
|
529 |
show_copy_button=False,
|
530 |
show_label=True,
|
531 |
render_markdown=True,
|
|
|
668 |
|
669 |
prompt_html = (
|
670 |
f"<div style='background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; "
|
671 |
+
f"border-radius: 5px; color: black;'><strong style='color: black;'>Question:</strong> "
|
672 |
f"{question_for_eval['question']}</div>"
|
673 |
)
|
674 |
reference_md = question_for_eval.get("correct_answer", "")
|
|
|
1088 |
width: 100% !important; /* Occupy full width of its column */
|
1089 |
white-space: normal !important; /* Allow text to wrap onto multiple lines */
|
1090 |
}
|
1091 |
+
.criteria-radio-score-label [role="radiogroup"],
|
1092 |
+
.criteria-radio-score-label .gr-radio-group,
|
1093 |
+
.criteria-radio-score-label .flex {
|
1094 |
+
display: flex !important;
|
1095 |
+
flex-direction: column !important;
|
1096 |
+
gap: 4px !important; /* Row spacing; adjust as needed */
|
1097 |
+
}
|
1098 |
+
|
1099 |
+
/* More specific selectors to ensure a vertical layout */
|
1100 |
+
.criteria-radio-score-label fieldset {
|
1101 |
+
display: flex !important;
|
1102 |
+
flex-direction: column !important;
|
1103 |
+
gap: 4px !important;
|
1104 |
+
}
|
1105 |
+
|
1106 |
+
.criteria-radio-score-label .wrap {
|
1107 |
+
display: flex !important;
|
1108 |
+
flex-direction: column !important;
|
1109 |
+
gap: 4px !important;
|
1110 |
+
}
|
1111 |
+
|
1112 |
+
/* Ensure each radio-button option is stacked vertically */
|
1113 |
+
.criteria-radio-score-label label {
|
1114 |
+
display: block !important;
|
1115 |
+
margin-bottom: 4px !important;
|
1116 |
+
}
|
1117 |
"""
|
1118 |
with gr.Blocks(css=centered_col_css) as demo:
|
1119 |
# States to save information between pages.
|
|
|
1243 |
next_btn_0 = gr.Button("Next")
|
1244 |
gr.Markdown("""By clicking 'Next', you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
|
1245 |
""")
|
1246 |
+
# with open("anatomyofAgentResponse.jpg", "rb") as image_file:
|
1247 |
+
# img = Image.open(image_file)
|
1248 |
+
# new_size = (int(img.width * 0.5), int(img.height * 0.5))
|
1249 |
+
# img = img.resize(new_size, Image.LANCZOS)
|
1250 |
+
# buffer = io.BytesIO()
|
1251 |
+
# img.save(buffer, format="PNG")
|
1252 |
+
# encoded_string = base64.b64encode(
|
1253 |
+
# buffer.getvalue()).decode("utf-8")
|
1254 |
+
|
1255 |
+
# image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
|
1256 |
+
# ReasoningTraceExampleHTML = f"""
|
1257 |
+
# <div>
|
1258 |
+
# {image_html}
|
1259 |
+
# </div>
|
1260 |
+
# """
|
1261 |
+
# gr.HTML(ReasoningTraceExampleHTML)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1262 |
|
1263 |
# Page 1: Pairwise Comparison.
|
1264 |
with gr.Column(visible=False) as page1:
|
1265 |
+
with gr.Accordion("Instructions", open=False):
|
1266 |
+
gr.Markdown("""
|
1267 |
+
## Instructions:
|
1268 |
+
Please review these instructions and enter your information to begin:
|
1269 |
+
|
1270 |
+
- Each session requires at least 5-10 minutes per question.
|
1271 |
+
- You can evaluate multiple questions; you will not repeat evaluations.
|
1272 |
+
- For each question, compare responses from two models and rate them (scale: 1-5).
|
1273 |
+
- If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page.
|
1274 |
+
- Use the Back and Next buttons to edit responses before submission.
|
1275 |
+
- Use the Home Page button to return to the homepage; progress will save but not submit.
|
1276 |
+
- Submit answers to the current question before moving to the next.
|
1277 |
+
- You can pause between questions and return later; ensure current answers are submitted to save them.
|
1278 |
+
""")
|
1279 |
# Make the number controlled by question indexing!
|
1280 |
+
# gr.Markdown("Comparison")
|
1281 |
# Add small red button and comments text box in the same row
|
1282 |
page1_prompt = gr.HTML()
|
1283 |
with gr.Row():
|
|
|
1290 |
scale=1
|
1291 |
)
|
1292 |
skip_comments = gr.Textbox(
|
1293 |
+
placeholder="(Optional) Why do you want to skip this question...",
|
1294 |
show_label=False,
|
1295 |
scale=3,
|
1296 |
container=False,
|
|
|
1319 |
value=[],
|
1320 |
type="messages",
|
1321 |
height=300,
|
1322 |
+
label="Model A Reasoning - Rationale",
|
1323 |
show_copy_button=False,
|
1324 |
show_label=True,
|
1325 |
render_markdown=True,
|
|
|
1345 |
value=[],
|
1346 |
type="messages",
|
1347 |
height=300,
|
1348 |
+
label="Model B Reasoning - Rationale",
|
1349 |
show_copy_button=False,
|
1350 |
show_label=True,
|
1351 |
render_markdown=True,
|
|
|
1392 |
rating_a = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"],
|
1393 |
label=f"Response A - {crit_score['text']}",
|
1394 |
interactive=True,
|
1395 |
+
elem_classes="criteria-radio-score-label")
|
1396 |
with gr.Column(scale=1):
|
1397 |
rating_b = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"],
|
1398 |
label=f"Response B - {crit_score['text']}",
|
1399 |
interactive=True,
|
1400 |
+
elem_classes="criteria-radio-score-label")
|
1401 |
|
1402 |
# Add clear button and wire up the restrictions
|
1403 |
with gr.Row():
|
utils.py
CHANGED
@@ -248,14 +248,14 @@ def format_chat(response, tool_database_labels):
|
|
248 |
# Clear after rendering
|
249 |
last_tool_calls = []
|
250 |
|
251 |
-
if chat_history:
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
if chat_history:
|
260 |
last_msg = chat_history[-1]
|
261 |
if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:
|
|
|
248 |
# Clear after rendering
|
249 |
last_tool_calls = []
|
250 |
|
251 |
+
# if chat_history:
|
252 |
+
# last_msg = chat_history[-1]
|
253 |
+
# if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:
|
254 |
+
# # Find the first assistant message
|
255 |
+
# for msg in chat_history:
|
256 |
+
# if msg.role == "assistant" and isinstance(msg.content, str):
|
257 |
+
# msg.content = "**Reasoning:**\n" + msg.content
|
258 |
+
# break
|
259 |
if chat_history:
|
260 |
last_msg = chat_history[-1]
|
261 |
if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:
|