update
app.py
CHANGED
|
@@ -15,8 +15,7 @@ REPO_ID = "agenticx/TxAgentEvalData"
|
|
| 15 |
EVALUATOR_MAP_DICT = "evaluator_map_dict.json"
|
| 16 |
TXAGENT_RESULTS_SHEET_BASE_NAME = "TxAgent_Human_Eval_Results_CROWDSOURCED"
|
| 17 |
our_methods = ['TxAgent-T1-Llama-3.1-8B', 'Q3-8B-qlora-biov13_merged']
|
| 18 |
-
# Load tool lists from 'tool_lists' subdirectory
|
| 19 |
-
# with the latest from ToolUniverse if necessary!
|
| 20 |
tools_dir = os.path.join(os.getcwd(), 'tool_lists')
|
| 21 |
|
| 22 |
# Initialize an empty dictionary to store the results
|
|
@@ -39,7 +38,7 @@ for filename in os.listdir(tools_dir):
|
|
| 39 |
print(f"Error processing {filename}: {e}")
|
| 40 |
results[key] = [f"Error loading {filename}"]
|
| 41 |
|
| 42 |
-
#
|
| 43 |
tool_database_labels_raw = {
|
| 44 |
"chembl_tools": "**from the ChEMBL database**",
|
| 45 |
"efo_tools": "**from the Experimental Factor Ontology**",
|
|
@@ -295,10 +294,9 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
|
|
| 295 |
local_path = hf_hub_download(
|
| 296 |
repo_id=REPO_ID,
|
| 297 |
repo_type="dataset",
|
| 298 |
-
#
|
| 299 |
revision="main",
|
| 300 |
filename=remote_path,
|
| 301 |
-
# force_download=True,
|
| 302 |
token=os.getenv("HF_TOKEN")
|
| 303 |
)
|
| 304 |
with open(local_path, "r") as f:
|
|
@@ -316,7 +314,8 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
|
|
| 316 |
if not evaluator_question_ids:
|
| 317 |
return [], data_by_filename
|
| 318 |
|
| 319 |
-
#
|
|
|
|
| 320 |
model_names = [key for key in data_by_filename.keys()
|
| 321 |
if key not in our_methods]
|
| 322 |
full_question_ids_list = []
|
|
@@ -330,7 +329,7 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
|
|
| 330 |
results_df = read_sheet_to_df(custom_sheet_name=str(
|
| 331 |
TXAGENT_RESULTS_SHEET_BASE_NAME + f"_{str(evaluator_id)}"))
|
| 332 |
if results_df is not None and not results_df.empty:
|
| 333 |
-
#
|
| 334 |
comparison_cols = [
|
| 335 |
f"Criterion_{c['label']} Comparison: Which is Better?"
|
| 336 |
for c in criteria_for_comparison
|
|
@@ -341,7 +340,7 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
|
|
| 341 |
for _, row in results_df.iterrows():
|
| 342 |
q = row.get("Question ID")
|
| 343 |
a, b = row.get("ResponseA_Model"), row.get("ResponseB_Model")
|
| 344 |
-
#
|
| 345 |
if a in our_methods and b not in our_methods:
|
| 346 |
pair = (q, a, b)
|
| 347 |
elif b in our_methods and a not in our_methods:
|
|
@@ -349,12 +348,12 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
|
|
| 349 |
else:
|
| 350 |
continue
|
| 351 |
complete = True
|
| 352 |
-
#
|
| 353 |
for col in comparison_cols:
|
| 354 |
if not row.get(col):
|
| 355 |
complete = False
|
| 356 |
break
|
| 357 |
-
#
|
| 358 |
if complete:
|
| 359 |
for col in scoreA_cols + scoreB_cols:
|
| 360 |
if not row.get(col):
|
|
@@ -362,7 +361,7 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
|
|
| 362 |
break
|
| 363 |
if complete:
|
| 364 |
matched_pairs.add(pair)
|
| 365 |
-
#
|
| 366 |
full_question_ids_list = [
|
| 367 |
t for t in full_question_ids_list if t not in matched_pairs
|
| 368 |
]
|
|
@@ -442,7 +441,7 @@ def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecia
|
|
| 442 |
# Validate required fields
|
| 443 |
validation_error = validate_required_fields(
|
| 444 |
name, email, evaluator_id, specialty_dd, years_exp_radio)
|
| 445 |
-
print(f"\033[93mIn go_to_eval_progress_modal, validation_error={validation_error}\033[0m")
|
| 446 |
if validation_error:
|
| 447 |
return (
|
| 448 |
gr.update(visible=True), # page0
|
|
@@ -511,7 +510,7 @@ def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecia
|
|
| 511 |
# Use advance_workflow to get all UI updates
|
| 512 |
ui_updates = advance_workflow(progress_state, data_subset_state)
|
| 513 |
|
| 514 |
-
print(f"\033[93mIn go_to_eval_progress_modal, using advance_workflow results: mode={progress_state.get('mode')}\033[0m")
|
| 515 |
|
| 516 |
# Map advance_workflow outputs to the required return format
|
| 517 |
num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
|
|
@@ -573,7 +572,7 @@ def get_next_unscored_pair(progress_state):
|
|
| 573 |
completed the pairwise comparison phase.
|
| 574 |
"""
|
| 575 |
for idx, pair in enumerate(progress_state['all_pairs']):
|
| 576 |
-
#
|
| 577 |
if pair in progress_state.get('pairwise_done', set()) and pair not in progress_state['scoring_done_pairs']:
|
| 578 |
progress_state['current_score_pair_index'] = idx
|
| 579 |
return pair
|
|
@@ -591,7 +590,7 @@ def load_progress_state(evaluator_id, question_id):
|
|
| 591 |
if df is None or df.empty:
|
| 592 |
return None
|
| 593 |
|
| 594 |
-
#
|
| 595 |
df_q = df[df["Question ID"] == question_id]
|
| 596 |
if df_q.empty:
|
| 597 |
return None
|
|
@@ -601,7 +600,7 @@ def load_progress_state(evaluator_id, question_id):
|
|
| 601 |
scoring_done_pairs = set()
|
| 602 |
pairwise_scores = {}
|
| 603 |
|
| 604 |
-
#
|
| 605 |
for _, row in df_q.iterrows():
|
| 606 |
a, b = row["ResponseA_Model"], row["ResponseB_Model"]
|
| 607 |
pair = (a, b)
|
|
@@ -616,10 +615,10 @@ def load_progress_state(evaluator_id, question_id):
|
|
| 616 |
comps.append(mapped_value)
|
| 617 |
pairwise_results[pair] = comps
|
| 618 |
|
| 619 |
-
#
|
| 620 |
first_score = f"ScoreA_{criteria[0]['label']}"
|
| 621 |
if first_score in row and row[first_score] not in (None, ""):
|
| 622 |
-
#
|
| 623 |
scores_A = [row.get(f"ScoreA_{c['label']}") for c in criteria]
|
| 624 |
scores_B = [row.get(f"ScoreB_{c['label']}") for c in criteria]
|
| 625 |
scoring_done_pairs.add(pair)
|
|
@@ -628,31 +627,31 @@ def load_progress_state(evaluator_id, question_id):
|
|
| 628 |
pairwise_scores[a] = scores_A
|
| 629 |
pairwise_scores[b] = scores_B
|
| 630 |
|
| 631 |
-
#
|
| 632 |
-
# 1.
|
| 633 |
-
# 2.
|
| 634 |
-
# 3.
|
| 635 |
|
| 636 |
-
determined_mode = "pairwise" #
|
| 637 |
|
| 638 |
if pairwise_done:
|
| 639 |
-
#
|
| 640 |
-
#
|
| 641 |
unscored_pairs = pairwise_done - scoring_done_pairs
|
| 642 |
if unscored_pairs:
|
| 643 |
-
#
|
| 644 |
determined_mode = "scoring"
|
| 645 |
-
print(f"\033[93mload_progress_state: Found {len(unscored_pairs)} unscored pairs, setting mode to 'scoring'\033[0m")
|
| 646 |
else:
|
| 647 |
-
#
|
| 648 |
-
determined_mode = "pairwise" #
|
| 649 |
-
print(f"\033[93mload_progress_state: All pairwise comparisons are scored, setting mode to 'pairwise' (will be corrected by advance_workflow)\033[0m")
|
| 650 |
else:
|
| 651 |
-
#
|
| 652 |
determined_mode = "pairwise"
|
| 653 |
-
print(f"\033[93mload_progress_state: No completed pairwise comparisons, setting mode to 'pairwise'\033[0m")
|
| 654 |
|
| 655 |
-
#
|
| 656 |
progress_state = {
|
| 657 |
"current_question_index": 0,
|
| 658 |
"current_pair_index": 0,
|
|
@@ -661,12 +660,12 @@ def load_progress_state(evaluator_id, question_id):
|
|
| 661 |
"pairwise_results": pairwise_results,
|
| 662 |
"scoring_done_pairs": scoring_done_pairs,
|
| 663 |
"pairwise_scores": pairwise_scores,
|
| 664 |
-
"all_pairs": [], #
|
| 665 |
-
"all_models": [], #
|
| 666 |
"evaluator_id": evaluator_id,
|
| 667 |
-
"mode": determined_mode, #
|
| 668 |
}
|
| 669 |
-
print(
|
| 670 |
return progress_state
|
| 671 |
|
| 672 |
|
|
@@ -926,11 +925,11 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
|
|
| 926 |
|
| 927 |
# Create enhanced pairwise results display with conflict warnings
|
| 928 |
pairwise_results_for_display = []
|
| 929 |
-
for i in range(len_criteria): #
|
| 930 |
-
#
|
| 931 |
choice = pairwise_results[i] if i < len(pairwise_results) else None
|
| 932 |
|
| 933 |
-
#
|
| 934 |
normalized_choice = choice
|
| 935 |
if choice in mapping:
|
| 936 |
normalized_choice = mapping[choice]
|
|
@@ -987,7 +986,7 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
|
|
| 987 |
pairwise_results_for_display.append(gr.Markdown(display_html, visible=True))
|
| 988 |
|
| 989 |
|
| 990 |
-
#
|
| 991 |
assert len(pairwise_results_for_display) == len_criteria, f"Expected {len_criteria}, got {len(pairwise_results_for_display)}"
|
| 992 |
|
| 993 |
# Create enhanced chatbot components for scoring page
|
|
@@ -1030,7 +1029,7 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
|
|
| 1030 |
'chat_a_page2': None, # Scoring content (unused in pairwise)
|
| 1031 |
'chat_b_page2': None, # Scoring content (unused in pairwise)
|
| 1032 |
'page2_prompt': None, # Scoring prompt (unused in pairwise)
|
| 1033 |
-
'pairwise_results_for_display': [gr.update(visible=False) for _ in range(len_criteria)] #
|
| 1034 |
}
|
| 1035 |
|
| 1036 |
|
|
@@ -1205,30 +1204,30 @@ def apply_scoring_restrictions(progress_state, index, score_a, score_b):
|
|
| 1205 |
2. If one score exists, lock it and apply restrictions to the other based on the locked score
|
| 1206 |
3. If no scores exist, apply normal pairwise restrictions
|
| 1207 |
"""
|
| 1208 |
-
print(f"\033[93m[DEBUG] apply_scoring_restrictions called - criterion {index}: input score_a={score_a}, score_b={score_b}\033[0m")
|
| 1209 |
|
| 1210 |
# Extract the pairwise choice for the current criterion (needed for restrictions)
|
| 1211 |
pairwise_choice = _extract_pairwise_choice(progress_state, index)
|
| 1212 |
|
| 1213 |
if pairwise_choice is not None:
|
| 1214 |
-
print(f"\033[93m[Auto-restrict] Found pairwise choice for criterion {index}: {pairwise_choice}\033[0m")
|
| 1215 |
else:
|
| 1216 |
-
print(f"\033[93m[Auto-restrict] No pairwise choice found for criterion {index}\033[0m")
|
| 1217 |
|
| 1218 |
# Case 1: Both scores exist - COMPARISON-FIRST logic
|
| 1219 |
if score_a is not None and score_b is not None:
|
| 1220 |
-
print(f"\033[93m[COMPARISON-FIRST] Both scores exist: A={score_a}, B={score_b} - checking for conflicts with pairwise choice\033[0m")
|
| 1221 |
|
| 1222 |
# Use unified conflict detection
|
| 1223 |
conflict_detected = _detect_scoring_conflict(pairwise_choice, score_a, score_b)
|
| 1224 |
|
| 1225 |
if conflict_detected:
|
| 1226 |
-
print(f"\033[93m[CONFLICT] Pairwise choice conflicts with scores: A={score_a}, B={score_b}\033[0m")
|
| 1227 |
-
print(f"\033[93m[COMPARISON-FIRST] Conflict detected - prioritizing pairwise comparison, clearing existing scores\033[0m")
|
| 1228 |
# Apply normal pairwise restrictions with cleared values to prioritize pairwise choice
|
| 1229 |
result_A, result_B = _apply_rating_restrictions(pairwise_choice, None, None, include_values=True)
|
| 1230 |
else:
|
| 1231 |
-
print(f"\033[93m[NO-CONFLICT] Existing scores are compatible with pairwise choice - locking both\033[0m")
|
| 1232 |
# Ensure scores are strings for Gradio compatibility
|
| 1233 |
score_a_str = str(score_a)
|
| 1234 |
score_b_str = str(score_b)
|
|
@@ -1237,7 +1236,7 @@ def apply_scoring_restrictions(progress_state, index, score_a, score_b):
|
|
| 1237 |
|
| 1238 |
# Case 2: Only score_a exists - lock A, restrict B based on A's value (B value stays None)
|
| 1239 |
elif score_a is not None and score_b is None:
|
| 1240 |
-
print(f"\033[93m[LOCK] Found existing score_a={score_a}, locking A and restricting B based on A (B value=None)\033[0m")
|
| 1241 |
score_a_str = str(score_a)
|
| 1242 |
result_A = gr.update(choices=[score_a_str], value=score_a_str)
|
| 1243 |
# Apply restrictions to B based on A's locked value and pairwise choice, but keep B value as None
|
|
@@ -1247,7 +1246,7 @@ def apply_scoring_restrictions(progress_state, index, score_a, score_b):
|
|
| 1247 |
|
| 1248 |
# Case 3: Only score_b exists - lock B, restrict A based on B's value (A value stays None)
|
| 1249 |
elif score_a is None and score_b is not None:
|
| 1250 |
-
print(f"\033[93m[LOCK] Found existing score_b={score_b}, locking B and restricting A based on B (A value=None)\033[0m")
|
| 1251 |
score_b_str = str(score_b)
|
| 1252 |
result_B = gr.update(choices=[score_b_str], value=score_b_str)
|
| 1253 |
# Apply restrictions to A based on B's locked value and pairwise choice, but keep A value as None
|
|
@@ -1257,11 +1256,11 @@ def apply_scoring_restrictions(progress_state, index, score_a, score_b):
|
|
| 1257 |
|
| 1258 |
# Case 4: No scores exist - apply normal pairwise restrictions with None values to ensure clearing
|
| 1259 |
else:
|
| 1260 |
-
print(f"\033[93m[CLEAR] No existing scores - applying normal restrictions with None values to ensure clearing\033[0m")
|
| 1261 |
# Apply restrictions using the shared utility function with explicit None values
|
| 1262 |
result_A, result_B = _apply_rating_restrictions(pairwise_choice, None, None, include_values=True)
|
| 1263 |
|
| 1264 |
-
print(f"\033[93m[DEBUG] apply_scoring_restrictions result - criterion {index}: returning gradio updates\033[0m")
|
| 1265 |
|
| 1266 |
return result_A, result_B
|
| 1267 |
|
|
@@ -1270,7 +1269,7 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
|
|
| 1270 |
"""
|
| 1271 |
Unified workflow manager that handles all state transitions and UI updates.
|
| 1272 |
"""
|
| 1273 |
-
print(f"\033[93mAdvance workflow called, previous mode: {progress_state.get('mode')}\033[0m")
|
| 1274 |
print(progress_state)
|
| 1275 |
default_pairwise_results = [gr.update(visible=False) for _ in range(len_criteria)]
|
| 1276 |
|
|
@@ -1320,15 +1319,15 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
|
|
| 1320 |
next_pairwise_pair = get_next_uncompleted_pair(progress_state)
|
| 1321 |
next_scoring_pair = get_next_unscored_pair(progress_state)
|
| 1322 |
|
| 1323 |
-
# 2. Determine workflow phase and set mode
|
| 1324 |
if next_pairwise_pair is not None:
|
| 1325 |
progress_state['mode'] = 'pairwise'
|
| 1326 |
next_pair = next_pairwise_pair
|
| 1327 |
-
print(f"\033[93mPairwise mode: next pair {next_pair}\033[0m")
|
| 1328 |
elif next_scoring_pair is not None:
|
| 1329 |
progress_state['mode'] = 'scoring'
|
| 1330 |
next_pair = next_scoring_pair
|
| 1331 |
-
print(f"\033[93mScoring mode: next pair {next_pair}\033[0m")
|
| 1332 |
else:
|
| 1333 |
progress_state['mode'] = 'completed'
|
| 1334 |
next_pair = None
|
|
@@ -1353,41 +1352,41 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
|
|
| 1353 |
'pairwise_results_for_display': default_pairwise_results
|
| 1354 |
}
|
| 1355 |
|
| 1356 |
-
# 4.
|
| 1357 |
if current_mode == 'scoring' and next_pair is not None:
|
| 1358 |
-
#
|
| 1359 |
current_model_A, current_model_B = next_pair
|
| 1360 |
|
| 1361 |
-
#
|
| 1362 |
relevant_scores_A = progress_state.get('pairwise_scores', {}).get(current_model_A)
|
| 1363 |
relevant_scores_B = progress_state.get('pairwise_scores', {}).get(current_model_B)
|
| 1364 |
|
| 1365 |
-
print(f"\033[93mRelevant scores for {next_pair}: A={relevant_scores_A}, B={relevant_scores_B}\033[0m")
|
| 1366 |
|
| 1367 |
# Debug check for None values
|
| 1368 |
if relevant_scores_A is None:
|
| 1369 |
-
print(f"\033[93m[DEBUG] relevant_scores_A is None for pair {next_pair}\033[0m")
|
| 1370 |
if relevant_scores_B is None:
|
| 1371 |
-
print(f"\033[93m[DEBUG] relevant_scores_B is None for pair {next_pair}\033[0m")
|
| 1372 |
|
| 1373 |
-
#
|
| 1374 |
ratings_A_updates = []
|
| 1375 |
ratings_B_updates = []
|
| 1376 |
|
| 1377 |
-
#
|
| 1378 |
for i in range(len_criteria):
|
| 1379 |
-
#
|
| 1380 |
score_A = relevant_scores_A[i] if relevant_scores_A and i < len(relevant_scores_A) else None
|
| 1381 |
-
#
|
| 1382 |
score_B = relevant_scores_B[i] if relevant_scores_B and i < len(relevant_scores_B) else None
|
| 1383 |
|
| 1384 |
-
print(f"\033[93m[DEBUG] Criterion {i}: Before apply_scoring_restrictions - score_A={score_A}, score_B={score_B}\033[0m")
|
| 1385 |
|
| 1386 |
-
#
|
| 1387 |
restricted_update_A, restricted_update_B = apply_scoring_restrictions(
|
| 1388 |
progress_state, i, score_A, score_B)
|
| 1389 |
|
| 1390 |
-
print(f"\033[93m[DEBUG] Criterion {i}: After apply_scoring_restrictions - restricted_update_A={restricted_update_A}, restricted_update_B={restricted_update_B}\033[0m")
|
| 1391 |
|
| 1392 |
ratings_A_updates.append(restricted_update_A)
|
| 1393 |
ratings_B_updates.append(restricted_update_B)
|
|
@@ -1395,10 +1394,10 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
|
|
| 1395 |
ui_updates['ratings_A'] = ratings_A_updates
|
| 1396 |
ui_updates['ratings_B'] = ratings_B_updates
|
| 1397 |
|
| 1398 |
-
print(f"\033[93mLoaded relevant scores for pair {next_pair}: A={relevant_scores_A}, B={relevant_scores_B}\033[0m")
|
| 1399 |
-
print(f"\033[93mApplied automatic scoring restrictions based on pairwise choices\033[0m")
|
| 1400 |
else:
|
| 1401 |
-
#
|
| 1402 |
ui_updates['ratings_A'] = [gr.update(value=None) for _ in range(len_criteria)]
|
| 1403 |
ui_updates['ratings_B'] = [gr.update(value=None) for _ in range(len_criteria)]
|
| 1404 |
|
|
@@ -1406,17 +1405,16 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
|
|
| 1406 |
if next_pair is not None:
|
| 1407 |
print("debug: Extracting UI content for next pair")
|
| 1408 |
print("progress_state:", progress_state)
|
| 1409 |
-
# print("data_subset_state:", data_subset_state)
|
| 1410 |
print("next_pair:", next_pair)
|
| 1411 |
content_updates = extract_ui_content_by_mode(progress_state, data_subset_state, next_pair)
|
| 1412 |
ui_updates.update(content_updates)
|
| 1413 |
|
| 1414 |
-
#
|
| 1415 |
-
#
|
| 1416 |
if 'pairwise_results_for_display' in content_updates:
|
| 1417 |
pairwise_results = content_updates['pairwise_results_for_display']
|
| 1418 |
if pairwise_results is None or len(pairwise_results) != len_criteria:
|
| 1419 |
-
print(f"\033[93mWarning: pairwise_results_for_display has incorrect length {len(pairwise_results) if pairwise_results else 'None'}, expected {len_criteria}\033[0m")
|
| 1420 |
ui_updates['pairwise_results_for_display'] = default_pairwise_results
|
| 1421 |
|
| 1422 |
# 6. Ensure all required arrays have correct lengths before returning
|
|
@@ -1428,8 +1426,8 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
|
|
| 1428 |
|
| 1429 |
# Validate pairwise_results_for_display always has the right length
|
| 1430 |
if len(ui_updates.get('pairwise_results_for_display', [])) != len_criteria:
|
| 1431 |
-
print(f"\033[93m[ADVANCE_WORKFLOW] WARNING: Overriding pairwise_results_for_display due to length mismatch!\033[0m")
|
| 1432 |
-
print(f"\033[93mOriginal length: {len(ui_updates.get('pairwise_results_for_display', []))}, Expected: {len_criteria}\033[0m")
|
| 1433 |
ui_updates['pairwise_results_for_display'] = [gr.update(visible=False) for _ in range(len_criteria)]
|
| 1434 |
if 'pairwise_results_for_display' in ui_updates:
|
| 1435 |
if 'ui_buffer' not in progress_state:
|
|
@@ -1442,9 +1440,9 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
|
|
| 1442 |
|
| 1443 |
def submit_pairwise_comparison(progress_state, data_subset_state, user_info, *pairwise_comparisons):
|
| 1444 |
"""Process pairwise comparison submission and transition to scoring mode."""
|
| 1445 |
-
print(f"\033[93m=== SUBMIT PAIRWISE COMPARISON DEBUG ===\033[0m")
|
| 1446 |
-
print(f"\033[93mInput progress_state: {progress_state}\033[0m")
|
| 1447 |
-
print(f"\033[93mPairwise comparisons: {pairwise_comparisons}\033[0m")
|
| 1448 |
|
| 1449 |
# Process input parameters
|
| 1450 |
criteria_count = len_criteria
|
|
@@ -1497,7 +1495,7 @@ def submit_pairwise_comparison(progress_state, data_subset_state, user_info, *pa
|
|
| 1497 |
# Check individual elements if it's a list
|
| 1498 |
if isinstance(pairwise_results_for_display, list):
|
| 1499 |
for i, item in enumerate(pairwise_results_for_display):
|
| 1500 |
-
print(f"\033[93mElement {i}: {type(item)} - {item}\033[0m")
|
| 1501 |
|
| 1502 |
return [
|
| 1503 |
ui_updates.get('page1_visible'),
|
|
@@ -1520,9 +1518,9 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
|
|
| 1520 |
Submit scoring results and proceed to the next step.
|
| 1521 |
Simplified to use unified workflow management.
|
| 1522 |
"""
|
| 1523 |
-
print(f"\033[93m=== SUBMIT PAIRWISE SCORING DEBUG ===\033[0m")
|
| 1524 |
-
print(f"\033[93mInput progress_state: {progress_state}\033[0m")
|
| 1525 |
-
print(f"\033[93mRatings: {ratings}\033[0m")
|
| 1526 |
|
| 1527 |
# Save current ratings - now store by method instead of by pair
|
| 1528 |
pair = progress_state['all_pairs'][progress_state['current_score_pair_index']]
|
|
@@ -1541,7 +1539,6 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
|
|
| 1541 |
# Determine modal visibility based on completion status
|
| 1542 |
all_scoring_done = (len(progress_state['scoring_done_pairs']) ==
|
| 1543 |
len(progress_state['all_pairs']))
|
| 1544 |
-
# next_question_modal_visibility = gr.update(visible=all_scoring_done)
|
| 1545 |
|
| 1546 |
return [
|
| 1547 |
ui_updates.get('page1_visible'), # 5
|
|
@@ -1575,10 +1572,10 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
|
|
| 1575 |
|
| 1576 |
progress_state['scoring_done_pairs'].add(pair)
|
| 1577 |
|
| 1578 |
-
print(f"\033[93m[SCORE_SAVE] Saved scores for {model_A}: {ratings_A}\033[0m")
|
| 1579 |
-
print(f"\033[93m[SCORE_SAVE] Saved scores for {model_B}: {ratings_B}\033[0m")
|
| 1580 |
|
| 1581 |
-
#
|
| 1582 |
# Extract pairwise comparison results for this pair
|
| 1583 |
pairwise_results = progress_state['pairwise_results'].get(pair, [None] * len_criteria)
|
| 1584 |
comparison_reasons = [None] * len_criteria # We don't have reasons for scoring page
|
|
@@ -1602,7 +1599,6 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
|
|
| 1602 |
# Determine modal visibility based on completion status
|
| 1603 |
all_scoring_done = (len(progress_state['scoring_done_pairs']) ==
|
| 1604 |
len(progress_state['all_pairs']))
|
| 1605 |
-
# next_question_modal_visibility = gr.update(visible=all_scoring_done)
|
| 1606 |
|
| 1607 |
|
| 1608 |
if not all_scoring_done:
|
|
@@ -1634,7 +1630,7 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
|
|
| 1634 |
user_info, our_methods
|
| 1635 |
)
|
| 1636 |
|
| 1637 |
-
if remaining_count == 0: #
|
| 1638 |
gr.Info("You have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!")
|
| 1639 |
return [
|
| 1640 |
ui_updates.get('page1_visible'), # 5
|
|
@@ -1656,11 +1652,10 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
|
|
| 1656 |
*ui_updates.get('ratings_A'),
|
| 1657 |
*ui_updates.get('ratings_B'),
|
| 1658 |
*ui_updates.get('pairwise_results_for_display'), # 26-33
|
| 1659 |
-
# next_question_modal_visibility # 34
|
| 1660 |
]
|
| 1661 |
# Use advance_workflow to get all UI updates
|
| 1662 |
ui_updates = advance_workflow(progress_state, data_subset_state)
|
| 1663 |
-
print(f"\033[93mIn submit_pairwise_scoring, using advance_workflow results: mode={progress_state.get('mode')}\033[0m")
|
| 1664 |
num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
|
| 1665 |
gr.Info(f"You are about to evaluate the next question. You have {num_remaining_questions} question(s) remaining to evaluate.") # eval_progress_text
|
| 1666 |
return (
|
|
@@ -1686,37 +1681,6 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
|
|
| 1686 |
# next_question_modal_visibility
|
| 1687 |
)
|
| 1688 |
|
| 1689 |
-
|
| 1690 |
-
# def proceed_to_next_question(user_info):
|
| 1691 |
-
# """
|
| 1692 |
-
# Refactored to reuse code from go_to_eval_progress_modal by implementing it using advance_workflow.
|
| 1693 |
-
# This eliminates code duplication and ensures consistent UI behavior.
|
| 1694 |
-
# """
|
| 1695 |
-
# # Fetch next question state
|
| 1696 |
-
# user_info_new, chat_a, chat_b, page1_prompt, data_subset_state, remaining_count, progress_state = get_next_eval_question(
|
| 1697 |
-
# user_info['name'], user_info['email'], user_info['specialty'], user_info['subspecialty'],
|
| 1698 |
-
# user_info['years_exp'], user_info['exp_explanation'], user_info['npi_id'], user_info['evaluator_id'], our_methods
|
| 1699 |
-
# )
|
| 1700 |
-
|
| 1701 |
-
# # Use advance_workflow to get all UI updates (same pattern as go_to_eval_progress_modal)
|
| 1702 |
-
# ui_updates = advance_workflow(progress_state, data_subset_state)
|
| 1703 |
-
|
| 1704 |
-
# print(f"\033[93mIn proceed_to_next_question, using advance_workflow results: mode={progress_state.get('mode')}\033[0m")
|
| 1705 |
-
|
| 1706 |
-
# # Return exactly the elements bound in next_question_btn.click:
|
| 1707 |
-
# return [
|
| 1708 |
-
# user_info_new,
|
| 1709 |
-
# ui_updates.get('chat_a_page1', chat_a), # Use content appropriate for the current mode
|
| 1710 |
-
# ui_updates.get('chat_b_page1', chat_b), # Use content appropriate for the current mode
|
| 1711 |
-
# ui_updates.get('page1_prompt', page1_prompt), # Use the prompt appropriate for the current mode
|
| 1712 |
-
# page1_reference_answer,
|
| 1713 |
-
# data_subset_state, # data_subset_state slot
|
| 1714 |
-
# ui_updates.get('progress_state', progress_state), # progress_state slot
|
| 1715 |
-
# ui_updates.get('page1_visible', gr.update(visible=True)), # page1 visibility based on mode
|
| 1716 |
-
# ui_updates.get('page2_visible', gr.update(visible=False)), # page2 visibility based on mode
|
| 1717 |
-
# gr.update(visible=False) # next_question_modal hidden
|
| 1718 |
-
# ]
|
| 1719 |
-
|
| 1720 |
# --- Define Callback Functions for Confirmation Flow ---
|
| 1721 |
def build_row_dict(
|
| 1722 |
data_subset_state,
|
|
@@ -1812,18 +1776,18 @@ def restrict_choices(progress_state, index, score_a, score_b):
|
|
| 1812 |
Enforces rating constraints based on the pairwise choice for the given criterion index.
|
| 1813 |
"""
|
| 1814 |
print(
|
| 1815 |
-
f"\033[93mRestricting choices for index {index} with scores A: {score_a}, B: {score_b}\033[0m")
|
| 1816 |
print(
|
| 1817 |
-
f"\033[93mProgress state keys: {list(progress_state.keys()) if progress_state else 'None'}\033[0m")
|
| 1818 |
|
| 1819 |
# Extract the pairwise choice for the current criterion
|
| 1820 |
pairwise_choice = _extract_pairwise_choice(progress_state, index)
|
| 1821 |
|
| 1822 |
if pairwise_choice is not None:
|
| 1823 |
print(
|
| 1824 |
-
f"\033[93mFound pairwise choice for criterion {index}: {pairwise_choice}\033[0m")
|
| 1825 |
else:
|
| 1826 |
-
print(f"\033[93mNo pairwise results found for criterion {index}\033[0m")
|
| 1827 |
|
| 1828 |
# Skip if both scores are None
|
| 1829 |
if score_a is None and score_b is None:
|
|
@@ -1939,7 +1903,6 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 1939 |
img.save(buffer, format="PNG")
|
| 1940 |
encoded_string = base64.b64encode(
|
| 1941 |
buffer.getvalue()).decode("utf-8")
|
| 1942 |
-
# encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
|
| 1943 |
|
| 1944 |
image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
|
| 1945 |
ReasoningTraceExampleHTML = f"""
|
|
@@ -1951,9 +1914,9 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 1951 |
gr.Markdown("""By clicking 'Next' below, you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
|
| 1952 |
""")
|
| 1953 |
gr.Markdown("## Please enter your information to get a question to evaluate. Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.")
|
| 1954 |
-
name = gr.Textbox(label="Name (required)", value="
|
| 1955 |
email = gr.Textbox(
|
| 1956 |
-
label="Email (required). Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.", value="
|
| 1957 |
evaluator_id = gr.Textbox(
|
| 1958 |
label="Evaluator ID (auto-filled from email above)", interactive=False, visible=False)
|
| 1959 |
|
|
@@ -1992,8 +1955,6 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 1992 |
|
| 1993 |
with Modal(visible=False, elem_id="confirm_modal") as eval_progress_modal:
|
| 1994 |
eval_progress_text = gr.Markdown("You have X questions remaining.")
|
| 1995 |
-
# eval_progress_proceed_btn = gr.Button(
|
| 1996 |
-
# "OK, proceed to question evaluation")
|
| 1997 |
|
| 1998 |
# Page 1: Pairwise Comparison.
|
| 1999 |
with gr.Column(visible=False) as page1:
|
|
@@ -2218,22 +2179,6 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 2218 |
# Clicking OK hides the modal.
|
| 2219 |
ok_btn.click(lambda: gr.update(visible=False), None, error_modal)
|
| 2220 |
|
| 2221 |
-
# Confirmation Modal: Ask for final submission confirmation.
|
| 2222 |
-
# with Modal("Confirm Submission", visible=False, elem_id="confirm_modal") as confirm_modal:
|
| 2223 |
-
# gr.Markdown(
|
| 2224 |
-
# "Are you sure you want to submit? Once submitted, you cannot edit your responses.")
|
| 2225 |
-
# with gr.Row():
|
| 2226 |
-
# yes_btn = gr.Button("Yes, please submit")
|
| 2227 |
-
# cancel_btn = gr.Button("Cancel")
|
| 2228 |
-
|
| 2229 |
-
# Add modal for proceeding to next question
|
| 2230 |
-
# with Modal("Next Question", visible=False, elem_id="next_question_modal") as next_question_modal:
|
| 2231 |
-
# gr.Markdown(
|
| 2232 |
-
# "You have completed this question. Click below to proceed to the next question.")
|
| 2233 |
-
# next_question_btn = gr.Button("Next Question")
|
| 2234 |
-
|
| 2235 |
-
|
| 2236 |
-
|
| 2237 |
# --- Define Transitions Between Pages ---
|
| 2238 |
|
| 2239 |
# For the "Participate in Evaluation" button, transition to page0
|
|
@@ -2267,13 +2212,6 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 2267 |
home_btn_2.click(lambda: (gr.update(visible=True), gr.update(
|
| 2268 |
visible=False)), None, [page_minus1, page2], scroll_to_output=True)
|
| 2269 |
|
| 2270 |
-
# Transition from Page 1 to Page 0 (Back button).
|
| 2271 |
-
# back_btn_0.click(
|
| 2272 |
-
# fn=lambda: (gr.update(visible=True), gr.update(visible=False)),
|
| 2273 |
-
# inputs=None,
|
| 2274 |
-
# outputs=[page0, page1]
|
| 2275 |
-
# )
|
| 2276 |
-
|
| 2277 |
# Transition from Page 1 (Pairwise) to the combined Rating Page (Page 2).
|
| 2278 |
next_btn_1.click(
|
| 2279 |
fn=submit_pairwise_comparison,
|
|
@@ -2291,27 +2229,6 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 2291 |
scroll_to_output=True,
|
| 2292 |
)
|
| 2293 |
|
| 2294 |
-
# Transition from Rating Page (Page 2) back to Pairwise page.
|
| 2295 |
-
# back_btn_2.click(
|
| 2296 |
-
# fn=lambda: (gr.update(visible=True), gr.update(visible=False)),
|
| 2297 |
-
# inputs=None,
|
| 2298 |
-
# outputs=[page1, page2],
|
| 2299 |
-
# scroll_to_output=True
|
| 2300 |
-
# )
|
| 2301 |
-
|
| 2302 |
-
# Wire up the modal button to proceed_to_next_question and reset all UI for the new question
|
| 2303 |
-
# next_question_btn.click(
|
| 2304 |
-
# fn=proceed_to_next_question,
|
| 2305 |
-
# inputs=[user_info_state],
|
| 2306 |
-
# outputs=[
|
| 2307 |
-
# user_info_state,
|
| 2308 |
-
# chat_a_page1, chat_b_page1, page1_prompt, page1_reference_answer,
|
| 2309 |
-
# data_subset_state, progress_state,
|
| 2310 |
-
# page1, page2, next_question_modal
|
| 2311 |
-
# ],
|
| 2312 |
-
# scroll_to_output=True
|
| 2313 |
-
# )
|
| 2314 |
-
|
| 2315 |
submit_btn.click(
|
| 2316 |
fn=submit_pairwise_scoring,
|
| 2317 |
inputs=[progress_state,
|
|
|
|
| 15 |
EVALUATOR_MAP_DICT = "evaluator_map_dict.json"
|
| 16 |
TXAGENT_RESULTS_SHEET_BASE_NAME = "TxAgent_Human_Eval_Results_CROWDSOURCED"
|
| 17 |
our_methods = ['TxAgent-T1-Llama-3.1-8B', 'Q3-8B-qlora-biov13_merged']
|
| 18 |
+
# Load tool lists from 'tool_lists' subdirectory
|
|
|
|
| 19 |
tools_dir = os.path.join(os.getcwd(), 'tool_lists')
|
| 20 |
|
| 21 |
# Initialize an empty dictionary to store the results
|
|
|
|
| 38 |
print(f"Error processing {filename}: {e}")
|
| 39 |
results[key] = [f"Error loading {filename}"]
|
| 40 |
|
| 41 |
+
# Tool database labels for different tool calls in format_chat
|
| 42 |
tool_database_labels_raw = {
|
| 43 |
"chembl_tools": "**from the ChEMBL database**",
|
| 44 |
"efo_tools": "**from the Experimental Factor Ontology**",
|
|
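The error branch restored at lines 38-39 above implies a simple loading loop over `tool_lists`. A minimal sketch follows; the one-JSON-file-per-tool-list layout and the `key` derivation are assumptions, since only the error path appears verbatim in the diff.

```python
import json
import os

results = {}
tools_dir = os.path.join(os.getcwd(), 'tool_lists')
for filename in os.listdir(tools_dir):
    key = os.path.splitext(filename)[0]  # assumed: one tool list per JSON file
    try:
        with open(os.path.join(tools_dir, filename)) as fh:
            results[key] = json.load(fh)
    except Exception as e:
        print(f"Error processing {filename}: {e}")
        results[key] = [f"Error loading {filename}"]
```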
|
|
| 294 |
local_path = hf_hub_download(
|
| 295 |
repo_id=REPO_ID,
|
| 296 |
repo_type="dataset",
|
| 297 |
+
# Fetches the most recent version of the dataset on every call
|
| 298 |
revision="main",
|
| 299 |
filename=remote_path,
|
|
|
|
| 300 |
token=os.getenv("HF_TOKEN")
|
| 301 |
)
|
| 302 |
with open(local_path, "r") as f:
|
|
|
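The new comment above notes that `revision="main"` re-fetches the latest dataset state on every call. A hedged standalone variant is below; the filename is illustrative (app.py passes `remote_path`), and pinning a tag or commit SHA in place of `"main"` makes runs reproducible.

```python
import os
from huggingface_hub import hf_hub_download

local_path = hf_hub_download(
    repo_id="agenticx/TxAgentEvalData",
    repo_type="dataset",
    revision="main",  # swap in a pinned tag or commit SHA for reproducible runs
    filename="evaluator_map_dict.json",  # illustrative; app.py passes remote_path
    token=os.getenv("HF_TOKEN"),
)
```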
|
| 314 |
if not evaluator_question_ids:
|
| 315 |
return [], data_by_filename
|
| 316 |
|
| 317 |
+
# Check if evaluator has already completed any questions
|
| 318 |
+
# Must go through every tuple of (question_ID, TxAgent, other model)
|
| 319 |
model_names = [key for key in data_by_filename.keys()
|
| 320 |
if key not in our_methods]
|
| 321 |
full_question_ids_list = []
|
|
|
|
| 329 |
results_df = read_sheet_to_df(custom_sheet_name=str(
|
| 330 |
TXAGENT_RESULTS_SHEET_BASE_NAME + f"_{str(evaluator_id)}"))
|
| 331 |
if results_df is not None and not results_df.empty:
|
| 332 |
+
# Only consider records where both "Pairwise comparison" and "scoring" fields are filled
|
| 333 |
comparison_cols = [
|
| 334 |
f"Criterion_{c['label']} Comparison: Which is Better?"
|
| 335 |
for c in criteria_for_comparison
|
|
|
|
| 340 |
for _, row in results_df.iterrows():
|
| 341 |
q = row.get("Question ID")
|
| 342 |
a, b = row.get("ResponseA_Model"), row.get("ResponseB_Model")
|
| 343 |
+
# Ensure our_methods comes first
|
| 344 |
if a in our_methods and b not in our_methods:
|
| 345 |
pair = (q, a, b)
|
| 346 |
elif b in our_methods and a not in our_methods:
|
|
|
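The branch above normalizes each (question, model, model) tuple so the in-house method always sits in the first slot. The `elif` body is elided between hunks, but by symmetry it presumably swaps the order; a sketch under that assumption:

```python
def ordered_pair(q, a, b, our_methods):
    # Put our method first; skip rows that don't pit ours against a baseline.
    if a in our_methods and b not in our_methods:
        return (q, a, b)
    if b in our_methods and a not in our_methods:
        return (q, b, a)  # assumed swap; the elif body is not shown in the hunk
    return None
```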
|
| 348 |
else:
|
| 349 |
continue
|
| 350 |
complete = True
|
| 351 |
+
# Check all pairwise comparison columns
|
| 352 |
for col in comparison_cols:
|
| 353 |
if not row.get(col):
|
| 354 |
complete = False
|
| 355 |
break
|
| 356 |
+
# If pairwise is complete, check all scoring columns
|
| 357 |
if complete:
|
| 358 |
for col in scoreA_cols + scoreB_cols:
|
| 359 |
if not row.get(col):
|
|
|
|
| 361 |
break
|
| 362 |
if complete:
|
| 363 |
matched_pairs.add(pair)
|
| 364 |
+
# Only filter out truly completed pairs; incomplete ones (with missing values) are retained
|
| 365 |
full_question_ids_list = [
|
| 366 |
t for t in full_question_ids_list if t not in matched_pairs
|
| 367 |
]
|
|
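Taken together, the two loops above mean a pair is filtered out only when every comparison column and every score column is non-empty. A condensed restatement (hypothetical helper, not part of app.py):

```python
def row_complete(row, comparison_cols, scoreA_cols, scoreB_cols):
    # Done only if all pairwise comparisons and all scores are filled in.
    return all(row.get(col) for col in comparison_cols + scoreA_cols + scoreB_cols)
```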
|
|
| 441 |
# Validate required fields
|
| 442 |
validation_error = validate_required_fields(
|
| 443 |
name, email, evaluator_id, specialty_dd, years_exp_radio)
|
| 444 |
+
print(f"In go_to_eval_progress_modal, validation_error={validation_error}")
|
| 445 |
if validation_error:
|
| 446 |
return (
|
| 447 |
gr.update(visible=True), # page0
|
|
|
|
| 510 |
# Use advance_workflow to get all UI updates
|
| 511 |
ui_updates = advance_workflow(progress_state, data_subset_state)
|
| 512 |
|
| 513 |
+
print(f"In go_to_eval_progress_modal, using advance_workflow results: mode={progress_state.get('mode')}")
|
| 514 |
|
| 515 |
# Map advance_workflow outputs to the required return format
|
| 516 |
num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
|
|
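Since `remaining_count` counts unfinished (question, ours, baseline) tuples while `all_pairs` holds the pairs for a single question, integer division recovers the number of whole questions left. A toy check with made-up values:

```python
remaining_count = 6        # six unfinished (question, ours, baseline) tuples
pairs_per_question = 3     # e.g. len(progress_state['all_pairs'])
assert remaining_count // pairs_per_question == 2  # two questions remain
```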
|
|
| 572 |
completed the pairwise comparison phase.
|
| 573 |
"""
|
| 574 |
for idx, pair in enumerate(progress_state['all_pairs']):
|
| 575 |
+
# Ensure the pair has completed the pairwise comparison phase and hasn't been scored yet
|
| 576 |
if pair in progress_state.get('pairwise_done', set()) and pair not in progress_state['scoring_done_pairs']:
|
| 577 |
progress_state['current_score_pair_index'] = idx
|
| 578 |
return pair
|
|
|
|
| 590 |
if df is None or df.empty:
|
| 591 |
return None
|
| 592 |
|
| 593 |
+
# Only keep rows for current question_id
|
| 594 |
df_q = df[df["Question ID"] == question_id]
|
| 595 |
if df_q.empty:
|
| 596 |
return None
|
|
|
|
| 600 |
scoring_done_pairs = set()
|
| 601 |
pairwise_scores = {}
|
| 602 |
|
| 603 |
+
# Iterate through each record to extract model pairs, comparison results and scores
|
| 604 |
for _, row in df_q.iterrows():
|
| 605 |
a, b = row["ResponseA_Model"], row["ResponseB_Model"]
|
| 606 |
pair = (a, b)
|
|
|
|
| 615 |
comps.append(mapped_value)
|
| 616 |
pairwise_results[pair] = comps
|
| 617 |
|
| 618 |
+
# Collect scores if scoring columns exist
|
| 619 |
first_score = f"ScoreA_{criteria[0]['label']}"
|
| 620 |
if first_score in row and row[first_score] not in (None, ""):
|
| 621 |
+
# Store scores by method instead of by pair
|
| 622 |
scores_A = [row.get(f"ScoreA_{c['label']}") for c in criteria]
|
| 623 |
scores_B = [row.get(f"ScoreB_{c['label']}") for c in criteria]
|
| 624 |
scoring_done_pairs.add(pair)
|
|
|
|
| 627 |
pairwise_scores[a] = scores_A
|
| 628 |
pairwise_scores[b] = scores_B
|
| 629 |
|
| 630 |
+
# Intelligently set mode based on existing data
|
| 631 |
+
# 1. If there are completed pairwise comparisons without corresponding scores, enter scoring mode
|
| 632 |
+
# 2. If both pairwise comparisons and scores are complete, let advance_workflow determine whether incomplete pairs remain
|
| 633 |
+
# 3. If no pairwise comparisons are complete, stay in pairwise comparison mode
|
| 634 |
|
| 635 |
+
determined_mode = "pairwise" # Default mode
|
| 636 |
|
| 637 |
if pairwise_done:
|
| 638 |
+
# Some pairwise comparisons are complete
|
| 639 |
+
# Check for pairs that have been compared but not yet scored
|
| 640 |
unscored_pairs = pairwise_done - scoring_done_pairs
|
| 641 |
if unscored_pairs:
|
| 642 |
+
# Compared-but-unscored pairs exist, so enter scoring mode
|
| 643 |
determined_mode = "scoring"
|
| 644 |
+
print(f"load_progress_state: Found {len(unscored_pairs)} unscored pairs, setting mode to 'scoring'")
|
| 645 |
else:
|
| 646 |
+
# All compared pairs are scored; let advance_workflow decide the next step
|
| 647 |
+
determined_mode = "pairwise" # May still have unpaired ones
|
| 648 |
+
print(f"load_progress_state: All pairwise comparisons are scored, setting mode to 'pairwise' (will be corrected by advance_workflow)")
|
| 649 |
else:
|
| 650 |
+
# No pairwise comparisons are complete, so this must be pairwise mode
|
| 651 |
determined_mode = "pairwise"
|
| 652 |
+
print(f"load_progress_state: No completed pairwise comparisons, setting mode to 'pairwise'")
|
| 653 |
|
| 654 |
+
# Construct complete progress_state (all_pairs, all_models will be overwritten later)
|
| 655 |
progress_state = {
|
| 656 |
"current_question_index": 0,
|
| 657 |
"current_pair_index": 0,
|
|
|
|
| 660 |
"pairwise_results": pairwise_results,
|
| 661 |
"scoring_done_pairs": scoring_done_pairs,
|
| 662 |
"pairwise_scores": pairwise_scores,
|
| 663 |
+
"all_pairs": [], # Reset later based on models_full
|
| 664 |
+
"all_models": [], # Reset later based on models_full
|
| 665 |
"evaluator_id": evaluator_id,
|
| 666 |
+
"mode": determined_mode, # Intelligently set mode
|
| 667 |
}
|
| 668 |
+
print(progress_state)
|
| 669 |
return progress_state
|
| 670 |
|
| 671 |
|
|
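The branching above reduces to one rule: any compared-but-unscored pair forces scoring mode; otherwise default to pairwise and let advance_workflow correct it. A standalone sketch (names are illustrative, not part of app.py):

```python
def determine_mode(pairwise_done: set, scoring_done_pairs: set) -> str:
    if pairwise_done and (pairwise_done - scoring_done_pairs):
        return "scoring"   # compared pairs still awaiting scores
    return "pairwise"      # nothing compared yet, or advance_workflow decides

assert determine_mode({("A", "B")}, set()) == "scoring"
assert determine_mode(set(), set()) == "pairwise"
```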
|
|
| 925 |
|
| 926 |
# Create enhanced pairwise results display with conflict warnings
|
| 927 |
pairwise_results_for_display = []
|
| 928 |
+
for i in range(len_criteria): # Use len_criteria to generate the correct number of display elements
|
| 929 |
+
# Get corresponding comparison result
|
| 930 |
choice = pairwise_results[i] if i < len(pairwise_results) else None
|
| 931 |
|
| 932 |
+
# Normalize the choice; accept both mapping keys and canonical values
|
| 933 |
normalized_choice = choice
|
| 934 |
if choice in mapping:
|
| 935 |
normalized_choice = mapping[choice]
|
|
|
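The normalization above accepts either a raw UI label or an already-canonical value. A minimal sketch; the actual contents of `mapping` are not shown in the diff, so the entries below are placeholders:

```python
mapping = {"Response A": "A", "Response B": "B", "Tie": "Tie"}  # placeholder entries

def normalize_choice(choice):
    return mapping.get(choice, choice)  # pass through values already canonical
```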
|
| 986 |
pairwise_results_for_display.append(gr.Markdown(display_html, visible=True))
|
| 987 |
|
| 988 |
|
| 989 |
+
# Ensure correct length
|
| 990 |
assert len(pairwise_results_for_display) == len_criteria, f"Expected {len_criteria}, got {len(pairwise_results_for_display)}"
|
| 991 |
|
| 992 |
# Create enhanced chatbot components for scoring page
|
|
|
|
| 1029 |
'chat_a_page2': None, # Scoring content (unused in pairwise)
|
| 1030 |
'chat_b_page2': None, # Scoring content (unused in pairwise)
|
| 1031 |
'page2_prompt': None, # Scoring prompt (unused in pairwise)
|
| 1032 |
+
'pairwise_results_for_display': [gr.update(visible=False) for _ in range(len_criteria)] # Ensure correct length
|
| 1033 |
}
|
| 1034 |
|
| 1035 |
|
|
|
|
| 1204 |
2. If one score exists, lock it and apply restrictions to the other based on the locked score
|
| 1205 |
3. If no scores exist, apply normal pairwise restrictions
|
| 1206 |
"""
|
| 1207 |
+
print(f"[DEBUG] apply_scoring_restrictions called - criterion {index}: input score_a={score_a}, score_b={score_b}")
|
| 1208 |
|
| 1209 |
# Extract the pairwise choice for the current criterion (needed for restrictions)
|
| 1210 |
pairwise_choice = _extract_pairwise_choice(progress_state, index)
|
| 1211 |
|
| 1212 |
if pairwise_choice is not None:
|
| 1213 |
+
print(f"[Auto-restrict] Found pairwise choice for criterion {index}: {pairwise_choice}")
|
| 1214 |
else:
|
| 1215 |
+
print(f"[Auto-restrict] No pairwise choice found for criterion {index}")
|
| 1216 |
|
| 1217 |
# Case 1: Both scores exist - COMPARISON-FIRST logic
|
| 1218 |
if score_a is not None and score_b is not None:
|
| 1219 |
+
print(f"[COMPARISON-FIRST] Both scores exist: A={score_a}, B={score_b} - checking for conflicts with pairwise choice")
|
| 1220 |
|
| 1221 |
# Use unified conflict detection
|
| 1222 |
conflict_detected = _detect_scoring_conflict(pairwise_choice, score_a, score_b)
|
| 1223 |
|
| 1224 |
if conflict_detected:
|
| 1225 |
+
print(f"[CONFLICT] Pairwise choice conflicts with scores: A={score_a}, B={score_b}")
|
| 1226 |
+
print(f"[COMPARISON-FIRST] Conflict detected - prioritizing pairwise comparison, clearing existing scores")
|
| 1227 |
# Apply normal pairwise restrictions with cleared values to prioritize pairwise choice
|
| 1228 |
result_A, result_B = _apply_rating_restrictions(pairwise_choice, None, None, include_values=True)
|
| 1229 |
else:
|
| 1230 |
+
print(f"[NO-CONFLICT] Existing scores are compatible with pairwise choice - locking both")
|
| 1231 |
# Ensure scores are strings for Gradio compatibility
|
| 1232 |
score_a_str = str(score_a)
|
| 1233 |
score_b_str = str(score_b)
|
|
|
|
| 1236 |
|
| 1237 |
# Case 2: Only score_a exists - lock A, restrict B based on A's value (B value stays None)
|
| 1238 |
elif score_a is not None and score_b is None:
|
| 1239 |
+
print(f"[LOCK] Found existing score_a={score_a}, locking A and restricting B based on A (B value=None)")
|
| 1240 |
score_a_str = str(score_a)
|
| 1241 |
result_A = gr.update(choices=[score_a_str], value=score_a_str)
|
| 1242 |
# Apply restrictions to B based on A's locked value and pairwise choice, but keep B value as None
|
|
|
|
| 1246 |
|
| 1247 |
# Case 3: Only score_b exists - lock B, restrict A based on B's value (A value stays None)
|
| 1248 |
elif score_a is None and score_b is not None:
|
| 1249 |
+
print(f"[LOCK] Found existing score_b={score_b}, locking B and restricting A based on B (A value=None)")
|
| 1250 |
score_b_str = str(score_b)
|
| 1251 |
result_B = gr.update(choices=[score_b_str], value=score_b_str)
|
| 1252 |
# Apply restrictions to A based on B's locked value and pairwise choice, but keep A value as None
|
|
|
|
| 1256 |
|
| 1257 |
# Case 4: No scores exist - apply normal pairwise restrictions with None values to ensure clearing
|
| 1258 |
else:
|
| 1259 |
+
print(f"[CLEAR] No existing scores - applying normal restrictions with None values to ensure clearing")
|
| 1260 |
# Apply restrictions using the shared utility function with explicit None values
|
| 1261 |
result_A, result_B = _apply_rating_restrictions(pairwise_choice, None, None, include_values=True)
|
| 1262 |
|
| 1263 |
+
print(f"[DEBUG] apply_scoring_restrictions result - criterion {index}: returning gradio updates")
|
| 1264 |
|
| 1265 |
return result_A, result_B
|
| 1266 |
|
|
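The four branches above can be summarized as a dispatch on which scores already exist. This sketch only names the cases; it omits the real `gr.update` plumbing and stubs the conflict flag, since `_detect_scoring_conflict` is not shown in the diff:

```python
def restriction_case(score_a, score_b, conflict: bool) -> str:
    if score_a is not None and score_b is not None:
        return "clear-both" if conflict else "lock-both"  # comparison-first logic
    if score_a is not None:
        return "lock-a-restrict-b"
    if score_b is not None:
        return "lock-b-restrict-a"
    return "pairwise-restrictions-only"
```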
|
|
| 1269 |
"""
|
| 1270 |
Unified workflow manager that handles all state transitions and UI updates.
|
| 1271 |
"""
|
| 1272 |
+
print(f"Advance workflow called, previous mode: {progress_state.get('mode')}")
|
| 1273 |
print(progress_state)
|
| 1274 |
default_pairwise_results = [gr.update(visible=False) for _ in range(len_criteria)]
|
| 1275 |
|
|
|
|
| 1319 |
next_pairwise_pair = get_next_uncompleted_pair(progress_state)
|
| 1320 |
next_scoring_pair = get_next_unscored_pair(progress_state)
|
| 1321 |
|
| 1322 |
+
# 2. Determine workflow phase and set mode
|
| 1323 |
if next_pairwise_pair is not None:
|
| 1324 |
progress_state['mode'] = 'pairwise'
|
| 1325 |
next_pair = next_pairwise_pair
|
| 1326 |
+
print(f"Pairwise mode: next pair {next_pair}")
|
| 1327 |
elif next_scoring_pair is not None:
|
| 1328 |
progress_state['mode'] = 'scoring'
|
| 1329 |
next_pair = next_scoring_pair
|
| 1330 |
+
print(f"Scoring mode: next pair {next_pair}")
|
| 1331 |
else:
|
| 1332 |
progress_state['mode'] = 'completed'
|
| 1333 |
next_pair = None
|
|
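The mode selection above encodes a strict precedence: drain pairwise comparisons first, then scoring, then mark the question completed. As a pure function (hypothetical name, for illustration only):

```python
def next_phase(next_pairwise_pair, next_scoring_pair):
    if next_pairwise_pair is not None:
        return "pairwise", next_pairwise_pair
    if next_scoring_pair is not None:
        return "scoring", next_scoring_pair
    return "completed", None
```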
|
|
| 1352 |
'pairwise_results_for_display': default_pairwise_results
|
| 1353 |
}
|
| 1354 |
|
| 1355 |
+
# 4. If in scoring mode, check historical scoring records
|
| 1356 |
if current_mode == 'scoring' and next_pair is not None:
|
| 1357 |
+
# Get historical scores for current model pair (if exists)
|
| 1358 |
current_model_A, current_model_B = next_pair
|
| 1359 |
|
| 1360 |
+
# Search for historical scores directly from method-keyed storage - simpler and more efficient!
|
| 1361 |
relevant_scores_A = progress_state.get('pairwise_scores', {}).get(current_model_A)
|
| 1362 |
relevant_scores_B = progress_state.get('pairwise_scores', {}).get(current_model_B)
|
| 1363 |
|
| 1364 |
+
print(f"Relevant scores for {next_pair}: A={relevant_scores_A}, B={relevant_scores_B}")
|
| 1365 |
|
| 1366 |
# Debug check for None values
|
| 1367 |
if relevant_scores_A is None:
|
| 1368 |
+
print(f"[DEBUG] relevant_scores_A is None for pair {next_pair}")
|
| 1369 |
if relevant_scores_B is None:
|
| 1370 |
+
print(f"[DEBUG] relevant_scores_B is None for pair {next_pair}")
|
| 1371 |
|
| 1372 |
+
# Set initial values for UI scoring controls
|
| 1373 |
ratings_A_updates = []
|
| 1374 |
ratings_B_updates = []
|
| 1375 |
|
| 1376 |
+
# Set initial values for A and B model scoring controls and apply scoring restrictions
|
| 1377 |
for i in range(len_criteria):
|
| 1378 |
+
# For model A: use the historical score if available, otherwise clear
|
| 1379 |
score_A = relevant_scores_A[i] if relevant_scores_A and i < len(relevant_scores_A) else None
|
| 1380 |
+
# For model B: use the historical score if available, otherwise clear
|
| 1381 |
score_B = relevant_scores_B[i] if relevant_scores_B and i < len(relevant_scores_B) else None
|
| 1382 |
|
| 1383 |
+
print(f"[DEBUG] Criterion {i}: Before apply_scoring_restrictions - score_A={score_A}, score_B={score_B}")
|
| 1384 |
|
| 1385 |
+
# Apply scoring restrictions based on pairwise comparison results
|
| 1386 |
restricted_update_A, restricted_update_B = apply_scoring_restrictions(
|
| 1387 |
progress_state, i, score_A, score_B)
|
| 1388 |
|
| 1389 |
+
print(f"[DEBUG] Criterion {i}: After apply_scoring_restrictions - restricted_update_A={restricted_update_A}, restricted_update_B={restricted_update_B}")
|
| 1390 |
|
| 1391 |
ratings_A_updates.append(restricted_update_A)
|
| 1392 |
ratings_B_updates.append(restricted_update_B)
|
|
|
|
| 1394 |
ui_updates['ratings_A'] = ratings_A_updates
|
| 1395 |
ui_updates['ratings_B'] = ratings_B_updates
|
| 1396 |
|
| 1397 |
+
print(f"Loaded relevant scores for pair {next_pair}: A={relevant_scores_A}, B={relevant_scores_B}")
|
| 1398 |
+
print(f"Applied automatic scoring restrictions based on pairwise choices")
|
| 1399 |
else:
|
| 1400 |
+
# Non-scoring mode: leave the ratings blank
|
| 1401 |
ui_updates['ratings_A'] = [gr.update(value=None) for _ in range(len_criteria)]
|
| 1402 |
ui_updates['ratings_B'] = [gr.update(value=None) for _ in range(len_criteria)]
|
| 1403 |
|
|
|
|
| 1405 |
if next_pair is not None:
|
| 1406 |
print("debug: Extracting UI content for next pair")
|
| 1407 |
print("progress_state:", progress_state)
|
|
|
|
| 1408 |
print("next_pair:", next_pair)
|
| 1409 |
content_updates = extract_ui_content_by_mode(progress_state, data_subset_state, next_pair)
|
| 1410 |
ui_updates.update(content_updates)
|
| 1411 |
|
| 1412 |
+
# If extract_ui_content_by_mode returned new pairwise_results_for_display,
|
| 1413 |
+
# ensure it has the correct length
|
| 1414 |
if 'pairwise_results_for_display' in content_updates:
|
| 1415 |
pairwise_results = content_updates['pairwise_results_for_display']
|
| 1416 |
if pairwise_results is None or len(pairwise_results) != len_criteria:
|
| 1417 |
+
print(f"Warning: pairwise_results_for_display has incorrect length {len(pairwise_results) if pairwise_results else 'None'}, expected {len_criteria}")
|
| 1418 |
ui_updates['pairwise_results_for_display'] = default_pairwise_results
|
| 1419 |
|
| 1420 |
# 6. Ensure all required arrays have correct lengths before returning
|
|
|
|
| 1426 |
|
| 1427 |
# Validate pairwise_results_for_display always has the right length
|
| 1428 |
if len(ui_updates.get('pairwise_results_for_display', [])) != len_criteria:
|
| 1429 |
+
print(f"[ADVANCE_WORKFLOW] WARNING: Overriding pairwise_results_for_display due to length mismatch!")
|
| 1430 |
+
print(f" Original length: {len(ui_updates.get('pairwise_results_for_display', []))}, Expected: {len_criteria}")
|
| 1431 |
ui_updates['pairwise_results_for_display'] = [gr.update(visible=False) for _ in range(len_criteria)]
|
| 1432 |
if 'pairwise_results_for_display' in ui_updates:
|
| 1433 |
if 'ui_buffer' not in progress_state:
|
|
|
|
| 1440 |
|
| 1441 |
def submit_pairwise_comparison(progress_state, data_subset_state, user_info, *pairwise_comparisons):
|
| 1442 |
"""Process pairwise comparison submission and transition to scoring mode."""
|
| 1443 |
+
print(f"=== SUBMIT PAIRWISE COMPARISON DEBUG ===")
|
| 1444 |
+
print(f"Input progress_state: {progress_state}")
|
| 1445 |
+
print(f"Pairwise comparisons: {pairwise_comparisons}")
|
| 1446 |
|
| 1447 |
# Process input parameters
|
| 1448 |
criteria_count = len_criteria
|
|
|
|
| 1495 |
# Check individual elements if it's a list
|
| 1496 |
if isinstance(pairwise_results_for_display, list):
|
| 1497 |
for i, item in enumerate(pairwise_results_for_display):
|
| 1498 |
+
print(f" Element {i}: {type(item)} - {item}")
|
| 1499 |
|
| 1500 |
return [
|
| 1501 |
ui_updates.get('page1_visible'),
|
|
|
|
| 1518 |
Submit scoring results and proceed to the next step.
|
| 1519 |
Simplified to use unified workflow management.
|
| 1520 |
"""
|
| 1521 |
+
print(f"=== SUBMIT PAIRWISE SCORING DEBUG ===")
|
| 1522 |
+
print(f"Input progress_state: {progress_state}")
|
| 1523 |
+
print(f"Ratings: {ratings}")
|
| 1524 |
|
| 1525 |
# Save current ratings - now store by method instead of by pair
|
| 1526 |
pair = progress_state['all_pairs'][progress_state['current_score_pair_index']]
|
|
|
|
| 1539 |
# Determine modal visibility based on completion status
|
| 1540 |
all_scoring_done = (len(progress_state['scoring_done_pairs']) ==
|
| 1541 |
len(progress_state['all_pairs']))
|
|
|
|
| 1542 |
|
| 1543 |
return [
|
| 1544 |
ui_updates.get('page1_visible'), # 5
|
|
|
|
| 1572 |
|
| 1573 |
progress_state['scoring_done_pairs'].add(pair)
|
| 1574 |
|
| 1575 |
+
print(f"[SCORE_SAVE] Saved scores for {model_A}: {ratings_A}")
|
| 1576 |
+
print(f"[SCORE_SAVE] Saved scores for {model_B}: {ratings_B}")
|
| 1577 |
|
| 1578 |
+
# Save results to the database, as submit_pairwise_comparison does
|
| 1579 |
# Extract pairwise comparison results for this pair
|
| 1580 |
pairwise_results = progress_state['pairwise_results'].get(pair, [None] * len_criteria)
|
| 1581 |
comparison_reasons = [None] * len_criteria # We don't have reasons for scoring page
|
|
|
|
| 1599 |
# Determine modal visibility based on completion status
|
| 1600 |
all_scoring_done = (len(progress_state['scoring_done_pairs']) ==
|
| 1601 |
len(progress_state['all_pairs']))
|
|
|
|
| 1602 |
|
| 1603 |
|
| 1604 |
if not all_scoring_done:
|
|
|
|
| 1630 |
user_info, our_methods
|
| 1631 |
)
|
| 1632 |
|
| 1633 |
+
if remaining_count == 0: # TODO: handle completion
|
| 1634 |
gr.Info("You have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!")
|
| 1635 |
return [
|
| 1636 |
ui_updates.get('page1_visible'), # 5
|
|
|
|
| 1652 |
*ui_updates.get('ratings_A'),
|
| 1653 |
*ui_updates.get('ratings_B'),
|
| 1654 |
*ui_updates.get('pairwise_results_for_display'), # 26-33
|
|
|
|
| 1655 |
]
|
| 1656 |
# Use advance_workflow to get all UI updates
|
| 1657 |
ui_updates = advance_workflow(progress_state, data_subset_state)
|
| 1658 |
+
print(f"In submit_pairwise_scoring, using advance_workflow results: mode={progress_state.get('mode')}")
|
| 1659 |
num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
|
| 1660 |
gr.Info(f"You are about to evaluate the next question. You have {num_remaining_questions} question(s) remaining to evaluate.") # eval_progress_text
|
| 1661 |
return (
|
|
|
|
| 1681 |
# next_question_modal_visibility
|
| 1682 |
)
|
| 1683 |
|
|
|
| 1684 |
# --- Define Callback Functions for Confirmation Flow ---
|
| 1685 |
def build_row_dict(
|
| 1686 |
data_subset_state,
|
|
|
|
| 1776 |
Enforces rating constraints based on the pairwise choice for the given criterion index.
|
| 1777 |
"""
|
| 1778 |
print(
|
| 1779 |
+
f"Restricting choices for index {index} with scores A: {score_a}, B: {score_b}")
|
| 1780 |
print(
|
| 1781 |
+
f"Progress state keys: {list(progress_state.keys()) if progress_state else 'None'}")
|
| 1782 |
|
| 1783 |
# Extract the pairwise choice for the current criterion
|
| 1784 |
pairwise_choice = _extract_pairwise_choice(progress_state, index)
|
| 1785 |
|
| 1786 |
if pairwise_choice is not None:
|
| 1787 |
print(
|
| 1788 |
+
f"Found pairwise choice for criterion {index}: {pairwise_choice}")
|
| 1789 |
else:
|
| 1790 |
+
print(f"No pairwise results found for criterion {index}")
|
| 1791 |
|
| 1792 |
# Skip if both scores are None
|
| 1793 |
if score_a is None and score_b is None:
|
|
|
|
| 1903 |
img.save(buffer, format="PNG")
|
| 1904 |
encoded_string = base64.b64encode(
|
| 1905 |
buffer.getvalue()).decode("utf-8")
|
|
|
|
| 1906 |
|
| 1907 |
image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
|
| 1908 |
ReasoningTraceExampleHTML = f"""
|
|
|
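The snippet above inlines a PNG into HTML via base64. A self-contained helper that mirrors it (PIL assumed, as in app.py) is handy for previewing the markup outside Gradio:

```python
import base64
import io
from PIL import Image

def png_data_uri(img: Image.Image) -> str:
    buffer = io.BytesIO()
    img.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{encoded}"

# usage: f'<img src="{png_data_uri(img)}" alt="Your Image">'
```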
|
| 1914 |
gr.Markdown("""By clicking 'Next' below, you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
|
| 1915 |
""")
|
| 1916 |
gr.Markdown("## Please enter your information to get a question to evaluate. Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.")
|
| 1917 |
+
name = gr.Textbox(label="Name (required)", value="")
|
| 1918 |
email = gr.Textbox(
|
| 1919 |
+
label="Email (required). Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.", value="")
|
| 1920 |
evaluator_id = gr.Textbox(
|
| 1921 |
label="Evaluator ID (auto-filled from email above)", interactive=False, visible=False)
|
| 1922 |
|
|
|
|
| 1955 |
|
| 1956 |
with Modal(visible=False, elem_id="confirm_modal") as eval_progress_modal:
|
| 1957 |
eval_progress_text = gr.Markdown("You have X questions remaining.")
|
|
|
|
|
|
|
| 1958 |
|
| 1959 |
# Page 1: Pairwise Comparison.
|
| 1960 |
with gr.Column(visible=False) as page1:
|
|
|
|
| 2179 |
# Clicking OK hides the modal.
|
| 2180 |
ok_btn.click(lambda: gr.update(visible=False), None, error_modal)
|
| 2181 |
|
|
| 2182 |
# --- Define Transitions Between Pages ---
|
| 2183 |
|
| 2184 |
# For the "Participate in Evaluation" button, transition to page0
|
|
|
|
| 2212 |
home_btn_2.click(lambda: (gr.update(visible=True), gr.update(
|
| 2213 |
visible=False)), None, [page_minus1, page2], scroll_to_output=True)
|
| 2214 |
|
|
| 2215 |
# Transition from Page 1 (Pairwise) to the combined Rating Page (Page 2).
|
| 2216 |
next_btn_1.click(
|
| 2217 |
fn=submit_pairwise_comparison,
|
|
|
|
| 2229 |
scroll_to_output=True,
|
| 2230 |
)
|
| 2231 |
|
|
| 2232 |
submit_btn.click(
|
| 2233 |
fn=submit_pairwise_scoring,
|
| 2234 |
inputs=[progress_state,
|