shgao committed on
Commit
a23fc7c
·
1 Parent(s): 6f0352a
Files changed (1)
  1. app.py +95 -178
app.py CHANGED
@@ -15,8 +15,7 @@ REPO_ID = "agenticx/TxAgentEvalData"
15
  EVALUATOR_MAP_DICT = "evaluator_map_dict.json"
16
  TXAGENT_RESULTS_SHEET_BASE_NAME = "TxAgent_Human_Eval_Results_CROWDSOURCED"
17
  our_methods = ['TxAgent-T1-Llama-3.1-8B', 'Q3-8B-qlora-biov13_merged']
18
- # Load tool lists from 'tool_lists' subdirectory---make sure to update this
19
- # with the latest from ToolUniverse if necessary!
20
  tools_dir = os.path.join(os.getcwd(), 'tool_lists')
21
 
22
  # Initialize an empty dictionary to store the results
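For reference, the loading loop that fills `results` amounts to the following sketch, assuming each file in `tool_lists/` holds a JSON array of tool definitions (the error-handling lines match the hunk below):

    import json
    import os

    # Sketch of the tool-list loading loop; keys are filenames minus ".json".
    tools_dir = os.path.join(os.getcwd(), 'tool_lists')
    results = {}
    for filename in os.listdir(tools_dir):
        if not filename.endswith('.json'):
            continue
        key = filename[:-len('.json')]  # e.g. 'chembl_tools'
        try:
            with open(os.path.join(tools_dir, filename)) as f:
                results[key] = json.load(f)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            results[key] = [f"Error loading {filename}"]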
@@ -39,7 +38,7 @@ for filename in os.listdir(tools_dir):
39
  print(f"Error processing {filename}: {e}")
40
  results[key] = [f"Error loading {filename}"]
41
 
42
- # for labeling the different tool calls in format_chat
43
  tool_database_labels_raw = {
44
  "chembl_tools": "**from the ChEMBL database**",
45
  "efo_tools": "**from the Experimental Factor Ontology**",
@@ -295,10 +294,9 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
295
  local_path = hf_hub_download(
296
  repo_id=REPO_ID,
297
  repo_type="dataset",
298
- # fetches the most recent version of the dataset each time this command is called
299
  revision="main",
300
  filename=remote_path,
301
- # force_download=True,
302
  token=os.getenv("HF_TOKEN")
303
  )
304
  with open(local_path, "r") as f:
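The removed comment is worth keeping in mind: with `revision="main"`, `hf_hub_download` re-resolves to the newest commit of the dataset on every call. Pinning a commit hash makes reads reproducible; a hedged sketch (the hash and filename below are placeholders, not values from this repo):

    import os
    from huggingface_hub import hf_hub_download

    # revision="main" resolves to the latest dataset commit on each call;
    # pin a specific commit hash (placeholder below) to freeze a snapshot.
    local_path = hf_hub_download(
        repo_id="agenticx/TxAgentEvalData",
        repo_type="dataset",
        revision="main",  # or a commit hash such as "abc123..." to pin
        filename="some/remote_path.json",  # hypothetical path
        token=os.getenv("HF_TOKEN"),
    )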
@@ -316,7 +314,8 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
316
  if not evaluator_question_ids:
317
  return [], data_by_filename
318
 
319
- # FINALLY, MAKE SURE THEY DIDN'T ALREADY FILL IT OUT. Must go through every tuple of (question_ID, TxAgent, other model) where other model could be any of the other files in data_by_filename
 
320
  model_names = [key for key in data_by_filename.keys()
321
  if key not in our_methods]
322
  full_question_ids_list = []
@@ -330,7 +329,7 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
330
  results_df = read_sheet_to_df(custom_sheet_name=str(
331
  TXAGENT_RESULTS_SHEET_BASE_NAME + f"_{str(evaluator_id)}"))
332
  if results_df is not None and not results_df.empty:
333
- # Only consider records where both "Pairwise comparison" and "scoring" fields are filled as complete
334
  comparison_cols = [
335
  f"Criterion_{c['label']} Comparison: Which is Better?"
336
  for c in criteria_for_comparison
@@ -341,7 +340,7 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
341
  for _, row in results_df.iterrows():
342
  q = row.get("Question ID")
343
  a, b = row.get("ResponseA_Model"), row.get("ResponseB_Model")
344
- # Ensure our_methods comes first
345
  if a in our_methods and b not in our_methods:
346
  pair = (q, a, b)
347
  elif b in our_methods and a not in our_methods:
@@ -349,12 +348,12 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
349
  else:
350
  continue
351
  complete = True
352
- # Check all pairwise comparison columns
353
  for col in comparison_cols:
354
  if not row.get(col):
355
  complete = False
356
  break
357
- # If pairwise is complete, check all scoring columns
358
  if complete:
359
  for col in scoreA_cols + scoreB_cols:
360
  if not row.get(col):
@@ -362,7 +361,7 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
362
  break
363
  if complete:
364
  matched_pairs.add(pair)
365
- # Only filter out truly completed pairs; incomplete ones (with missing values) are retained
366
  full_question_ids_list = [
367
  t for t in full_question_ids_list if t not in matched_pairs
368
  ]
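Condensed, the completion check in this hunk treats a (question, our model, other model) tuple as done only when every comparison column and every score column is non-empty; a sketch, assuming a pandas DataFrame with the column names used above (the reversed tuple in the elif branch is assumed to keep our method first, matching the comment):

    # Sketch of the completion filter over the results sheet.
    def completed_pairs(results_df, comparison_cols, scoreA_cols, scoreB_cols, our_methods):
        done = set()
        required = comparison_cols + scoreA_cols + scoreB_cols
        for _, row in results_df.iterrows():
            q = row.get("Question ID")
            a, b = row.get("ResponseA_Model"), row.get("ResponseB_Model")
            if a in our_methods and b not in our_methods:
                pair = (q, a, b)
            elif b in our_methods and a not in our_methods:
                pair = (q, b, a)
            else:
                continue
            if all(row.get(col) for col in required):  # every field filled
                done.add(pair)
        return done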
@@ -442,7 +441,7 @@ def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecia
442
  # Validate required fields
443
  validation_error = validate_required_fields(
444
  name, email, evaluator_id, specialty_dd, years_exp_radio)
445
- print(f"\033[93mIn go_to_eval_progress_modal, validation_error={validation_error}\033[0m")
446
  if validation_error:
447
  return (
448
  gr.update(visible=True), # page0
@@ -511,7 +510,7 @@ def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecia
511
  # Use advance_workflow to get all UI updates
512
  ui_updates = advance_workflow(progress_state, data_subset_state)
513
 
514
- print(f"\033[93mIn go_to_eval_progress_modal, using advance_workflow results: mode={progress_state.get('mode')}\033[0m")
515
 
516
  # Map advance_workflow outputs to the required return format
517
  num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
@@ -573,7 +572,7 @@ def get_next_unscored_pair(progress_state):
573
  completed the pairwise comparison phase.
574
  """
575
  for idx, pair in enumerate(progress_state['all_pairs']):
576
- # Ensure the pair has completed the pairwise comparison phase and hasn't been scored yet
577
  if pair in progress_state.get('pairwise_done', set()) and pair not in progress_state['scoring_done_pairs']:
578
  progress_state['current_score_pair_index'] = idx
579
  return pair
@@ -591,7 +590,7 @@ def load_progress_state(evaluator_id, question_id):
591
  if df is None or df.empty:
592
  return None
593
 
594
- # Only keep rows for the current question_id
595
  df_q = df[df["Question ID"] == question_id]
596
  if df_q.empty:
597
  return None
@@ -601,7 +600,7 @@ def load_progress_state(evaluator_id, question_id):
601
  scoring_done_pairs = set()
602
  pairwise_scores = {}
603
 
604
- # Iterate through each record to extract model pairs, comparison results and scores
605
  for _, row in df_q.iterrows():
606
  a, b = row["ResponseA_Model"], row["ResponseB_Model"]
607
  pair = (a, b)
@@ -616,10 +615,10 @@ def load_progress_state(evaluator_id, question_id):
616
  comps.append(mapped_value)
617
  pairwise_results[pair] = comps
618
 
619
- # Collect scores if scoring columns exist
620
  first_score = f"ScoreA_{criteria[0]['label']}"
621
  if first_score in row and row[first_score] not in (None, ""):
622
- # NEW: Store scores by method instead of by pair
623
  scores_A = [row.get(f"ScoreA_{c['label']}") for c in criteria]
624
  scores_B = [row.get(f"ScoreB_{c['label']}") for c in criteria]
625
  scoring_done_pairs.add(pair)
@@ -628,31 +627,31 @@ def load_progress_state(evaluator_id, question_id):
628
  pairwise_scores[a] = scores_A
629
  pairwise_scores[b] = scores_B
630
 
631
- # Intelligently set mode based on existing data
632
- # 1. If there are completed pairwise comparisons but no corresponding scores, enter scoring mode
633
- # 2. If both pairwise comparisons and scores are complete, let advance_workflow determine whether incomplete pairs remain
634
- # 3. If there are no completed pairwise comparisons, stay in pairwise comparison mode
635
 
636
- determined_mode = "pairwise" # Default mode
637
 
638
  if pairwise_done:
639
- # There are completed pairwise comparisons
640
- # Check for pairs that are compared but not yet scored
641
  unscored_pairs = pairwise_done - scoring_done_pairs
642
  if unscored_pairs:
643
- # There are compared-but-unscored pairs, so enter scoring mode
644
  determined_mode = "scoring"
645
- print(f"\033[93mload_progress_state: Found {len(unscored_pairs)} unscored pairs, setting mode to 'scoring'\033[0m")
646
  else:
647
- # All compared pairs are scored; let advance_workflow decide the next step
648
- determined_mode = "pairwise" # There may still be uncompared pairs
649
- print(f"\033[93mload_progress_state: All pairwise comparisons are scored, setting mode to 'pairwise' (will be corrected by advance_workflow)\033[0m")
650
  else:
651
- # No completed pairwise comparisons, so definitely pairwise comparison mode
652
  determined_mode = "pairwise"
653
- print(f"\033[93mload_progress_state: No completed pairwise comparisons, setting mode to 'pairwise'\033[0m")
654
 
655
- # Construct complete progress_state (all_pairs, all_models will be overwritten later)
656
  progress_state = {
657
  "current_question_index": 0,
658
  "current_pair_index": 0,
@@ -661,12 +660,12 @@ def load_progress_state(evaluator_id, question_id):
661
  "pairwise_results": pairwise_results,
662
  "scoring_done_pairs": scoring_done_pairs,
663
  "pairwise_scores": pairwise_scores,
664
- "all_pairs": [], # Reset later based on models_full
665
- "all_models": [], # Reset later based on models_full
666
  "evaluator_id": evaluator_id,
667
- "mode": determined_mode, # Intelligently set mode
668
  }
669
- print("\033[92m", progress_state, "\033[0m")
670
  return progress_state
671
 
672
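The mode decision above reduces to one set difference; a minimal sketch, with `pairwise_done` and `scoring_done_pairs` as the sets built in this function:

    # Minimal sketch of the resume-mode decision: compared-but-unscored
    # pairs force scoring mode; everything else defaults to pairwise,
    # which advance_workflow corrects later if more pairs remain.
    def determine_mode(pairwise_done, scoring_done_pairs):
        if pairwise_done - scoring_done_pairs:
            return "scoring"
        return "pairwise"

    assert determine_mode({("A", "B")}, set()) == "scoring"
    assert determine_mode({("A", "B")}, {("A", "B")}) == "pairwise"
    assert determine_mode(set(), set()) == "pairwise"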
 
@@ -926,11 +925,11 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
926
 
927
  # Create enhanced pairwise results display with conflict warnings
928
  pairwise_results_for_display = []
929
- for i in range(len_criteria): # Fix: use len_criteria to generate the correct number of display elements
930
- # Get the corresponding comparison result
931
  choice = pairwise_results[i] if i < len(pairwise_results) else None
932
 
933
- # Normalize choice, supporting both mapping key and value forms
934
  normalized_choice = choice
935
  if choice in mapping:
936
  normalized_choice = mapping[choice]
@@ -987,7 +986,7 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
987
  pairwise_results_for_display.append(gr.Markdown(display_html, visible=True))
988
 
989
 
990
- # Ensure correct length - it should already be correct at this point
991
  assert len(pairwise_results_for_display) == len_criteria, f"Expected {len_criteria}, got {len(pairwise_results_for_display)}"
992
 
993
  # Create enhanced chatbot components for scoring page
@@ -1030,7 +1029,7 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
1030
  'chat_a_page2': None, # Scoring content (unused in pairwise)
1031
  'chat_b_page2': None, # Scoring content (unused in pairwise)
1032
  'page2_prompt': None, # Scoring prompt (unused in pairwise)
1033
- 'pairwise_results_for_display': [gr.update(visible=False) for _ in range(len_criteria)] # Ensure correct length
1034
  }
1035
 
1036
 
@@ -1205,30 +1204,30 @@ def apply_scoring_restrictions(progress_state, index, score_a, score_b):
1205
  2. If one score exists, lock it and apply restrictions to the other based on the locked score
1206
  3. If no scores exist, apply normal pairwise restrictions
1207
  """
1208
- print(f"\033[96m[DEBUG] apply_scoring_restrictions called - criterion {index}: input score_a={score_a}, score_b={score_b}\033[0m")
1209
 
1210
  # Extract the pairwise choice for the current criterion (needed for restrictions)
1211
  pairwise_choice = _extract_pairwise_choice(progress_state, index)
1212
 
1213
  if pairwise_choice is not None:
1214
- print(f"\033[92m[Auto-restrict] Found pairwise choice for criterion {index}: {pairwise_choice}\033[0m")
1215
  else:
1216
- print(f"\033[93m[Auto-restrict] No pairwise choice found for criterion {index}\033[0m")
1217
 
1218
  # Case 1: Both scores exist - COMPARISON-FIRST logic
1219
  if score_a is not None and score_b is not None:
1220
- print(f"\033[95m[COMPARISON-FIRST] Both scores exist: A={score_a}, B={score_b} - checking for conflicts with pairwise choice\033[0m")
1221
 
1222
  # Use unified conflict detection
1223
  conflict_detected = _detect_scoring_conflict(pairwise_choice, score_a, score_b)
1224
 
1225
  if conflict_detected:
1226
- print(f"\033[91m[CONFLICT] Pairwise choice conflicts with scores: A={score_a}, B={score_b}\033[0m")
1227
- print(f"\033[91m[COMPARISON-FIRST] Conflict detected - prioritizing pairwise comparison, clearing existing scores\033[0m")
1228
  # Apply normal pairwise restrictions with cleared values to prioritize pairwise choice
1229
  result_A, result_B = _apply_rating_restrictions(pairwise_choice, None, None, include_values=True)
1230
  else:
1231
- print(f"\033[95m[NO-CONFLICT] Existing scores are compatible with pairwise choice - locking both\033[0m")
1232
  # Ensure scores are strings for Gradio compatibility
1233
  score_a_str = str(score_a)
1234
  score_b_str = str(score_b)
@@ -1237,7 +1236,7 @@ def apply_scoring_restrictions(progress_state, index, score_a, score_b):
1237
 
1238
  # Case 2: Only score_a exists - lock A, restrict B based on A's value (B value stays None)
1239
  elif score_a is not None and score_b is None:
1240
- print(f"\033[95m[LOCK] Found existing score_a={score_a}, locking A and restricting B based on A (B value=None)\033[0m")
1241
  score_a_str = str(score_a)
1242
  result_A = gr.update(choices=[score_a_str], value=score_a_str)
1243
  # Apply restrictions to B based on A's locked value and pairwise choice, but keep B value as None
@@ -1247,7 +1246,7 @@ def apply_scoring_restrictions(progress_state, index, score_a, score_b):
1247
 
1248
  # Case 3: Only score_b exists - lock B, restrict A based on B's value (A value stays None)
1249
  elif score_a is None and score_b is not None:
1250
- print(f"\033[95m[LOCK] Found existing score_b={score_b}, locking B and restricting A based on B (A value=None)\033[0m")
1251
  score_b_str = str(score_b)
1252
  result_B = gr.update(choices=[score_b_str], value=score_b_str)
1253
  # Apply restrictions to A based on B's locked value and pairwise choice, but keep A value as None
@@ -1257,11 +1256,11 @@ def apply_scoring_restrictions(progress_state, index, score_a, score_b):
1257
 
1258
  # Case 4: No scores exist - apply normal pairwise restrictions with None values to ensure clearing
1259
  else:
1260
- print(f"\033[91m[CLEAR] No existing scores - applying normal restrictions with None values to ensure clearing\033[0m")
1261
  # Apply restrictions using the shared utility function with explicit None values
1262
  result_A, result_B = _apply_rating_restrictions(pairwise_choice, None, None, include_values=True)
1263
 
1264
- print(f"\033[96m[DEBUG] apply_scoring_restrictions result - criterion {index}: returning gradio updates\033[0m")
1265
 
1266
  return result_A, result_B
1267
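The comparison-first branch relies on `_detect_scoring_conflict`, whose exact rule isn't shown in this diff; a plausible sketch of such a check, under the assumption that a "better" verdict means the winner's score must not fall below the loser's:

    # Hedged sketch of a pairwise/score conflict check: a "Response A"
    # verdict should not coexist with score_a < score_b, and vice versa.
    # The real _detect_scoring_conflict in app.py may encode more rules.
    def detect_scoring_conflict(pairwise_choice, score_a, score_b):
        if pairwise_choice is None or score_a is None or score_b is None:
            return False
        a, b = int(score_a), int(score_b)
        if pairwise_choice == "Response A":
            return a < b
        if pairwise_choice == "Response B":
            return b < a
        if pairwise_choice == "Tie":
            return a != b
        return False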
 
@@ -1270,7 +1269,7 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
1270
  """
1271
  Unified workflow manager that handles all state transitions and UI updates.
1272
  """
1273
- print(f"\033[94mAdvance workflow called, previous mode: {progress_state.get('mode')}\033[0m")
1274
  print(progress_state)
1275
  default_pairwise_results = [gr.update(visible=False) for _ in range(len_criteria)]
1276
 
@@ -1320,15 +1319,15 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
1320
  next_pairwise_pair = get_next_uncompleted_pair(progress_state)
1321
  next_scoring_pair = get_next_unscored_pair(progress_state)
1322
 
1323
- # 2. Determine workflow phase and set mode (kept in one place only)
1324
  if next_pairwise_pair is not None:
1325
  progress_state['mode'] = 'pairwise'
1326
  next_pair = next_pairwise_pair
1327
- print(f"\033[92mPairwise mode: next pair {next_pair}\033[0m")
1328
  elif next_scoring_pair is not None:
1329
  progress_state['mode'] = 'scoring'
1330
  next_pair = next_scoring_pair
1331
- print(f"\033[91mScoring mode: next pair {next_pair}\033[0m")
1332
  else:
1333
  progress_state['mode'] = 'completed'
1334
  next_pair = None
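Phase selection here is a simple priority chain: any uncompared pair wins, then any unscored pair, then completion. As a standalone sketch of the branching above:

    # Sketch of the phase priority used in advance_workflow.
    def select_phase(next_pairwise_pair, next_scoring_pair):
        if next_pairwise_pair is not None:
            return "pairwise", next_pairwise_pair
        if next_scoring_pair is not None:
            return "scoring", next_scoring_pair
        return "completed", None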
@@ -1353,41 +1352,41 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
1353
  'pairwise_results_for_display': default_pairwise_results
1354
  }
1355
 
1356
- # 4. If in scoring mode, check historical scoring records
1357
  if current_mode == 'scoring' and next_pair is not None:
1358
- # Get historical scores for the current model pair (if any)
1359
  current_model_A, current_model_B = next_pair
1360
 
1361
- # Look up historical scores directly from method-keyed storage - simpler and more efficient!
1362
  relevant_scores_A = progress_state.get('pairwise_scores', {}).get(current_model_A)
1363
  relevant_scores_B = progress_state.get('pairwise_scores', {}).get(current_model_B)
1364
 
1365
- print(f"\033[93mRelevant scores for {next_pair}: A={relevant_scores_A}, B={relevant_scores_B}\033[0m")
1366
 
1367
  # Debug check for None values
1368
  if relevant_scores_A is None:
1369
- print(f"\033[91m[DEBUG] relevant_scores_A is None for pair {next_pair}\033[0m")
1370
  if relevant_scores_B is None:
1371
- print(f"\033[91m[DEBUG] relevant_scores_B is None for pair {next_pair}\033[0m")
1372
 
1373
- # Set initial values for the UI scoring controls
1374
  ratings_A_updates = []
1375
  ratings_B_updates = []
1376
 
1377
- # Set initial values for the model A and model B scoring controls and apply scoring restrictions
1378
  for i in range(len_criteria):
1379
- # For model A: use the historical score if available, otherwise clear
1380
  score_A = relevant_scores_A[i] if relevant_scores_A and i < len(relevant_scores_A) else None
1381
- # For model B: use the historical score if available, otherwise clear
1382
  score_B = relevant_scores_B[i] if relevant_scores_B and i < len(relevant_scores_B) else None
1383
 
1384
- print(f"\033[94m[DEBUG] Criterion {i}: Before apply_scoring_restrictions - score_A={score_A}, score_B={score_B}\033[0m")
1385
 
1386
- # Apply scoring restrictions based on pairwise comparison results
1387
  restricted_update_A, restricted_update_B = apply_scoring_restrictions(
1388
  progress_state, i, score_A, score_B)
1389
 
1390
- print(f"\033[94m[DEBUG] Criterion {i}: After apply_scoring_restrictions - restricted_update_A={restricted_update_A}, restricted_update_B={restricted_update_B}\033[0m")
1391
 
1392
  ratings_A_updates.append(restricted_update_A)
1393
  ratings_B_updates.append(restricted_update_B)
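Because scores are keyed by method rather than by pair, prefilling either side of a new pair is just two dict lookups; a sketch, assuming `pairwise_scores` maps a model name to its per-criterion score list as in the surrounding code:

    # Sketch of the method-keyed prefill: scores entered for a model in
    # an earlier pair are reused when the same model appears again.
    def prefill_scores(pairwise_scores, pair, n_criteria):
        model_a, model_b = pair
        scores_a = pairwise_scores.get(model_a) or [None] * n_criteria
        scores_b = pairwise_scores.get(model_b) or [None] * n_criteria
        return scores_a[:n_criteria], scores_b[:n_criteria]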
@@ -1395,10 +1394,10 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
1395
  ui_updates['ratings_A'] = ratings_A_updates
1396
  ui_updates['ratings_B'] = ratings_B_updates
1397
 
1398
- print(f"\033[93mLoaded relevant scores for pair {next_pair}: A={relevant_scores_A}, B={relevant_scores_B}\033[0m")
1399
- print(f"\033[93mApplied automatic scoring restrictions based on pairwise choices\033[0m")
1400
  else:
1401
- # Non-scoring mode, keep blank
1402
  ui_updates['ratings_A'] = [gr.update(value=None) for _ in range(len_criteria)]
1403
  ui_updates['ratings_B'] = [gr.update(value=None) for _ in range(len_criteria)]
1404
 
@@ -1406,17 +1405,16 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
1406
  if next_pair is not None:
1407
  print("debug: Extracting UI content for next pair")
1408
  print("progress_state:", progress_state)
1409
- # print("data_subset_state:", data_subset_state)
1410
  print("next_pair:", next_pair)
1411
  content_updates = extract_ui_content_by_mode(progress_state, data_subset_state, next_pair)
1412
  ui_updates.update(content_updates)
1413
 
1414
- # If extract_ui_content_by_mode returned a new pairwise_results_for_display,
1415
- # ensure it has the correct length
1416
  if 'pairwise_results_for_display' in content_updates:
1417
  pairwise_results = content_updates['pairwise_results_for_display']
1418
  if pairwise_results is None or len(pairwise_results) != len_criteria:
1419
- print(f"\033[93mWarning: pairwise_results_for_display has incorrect length {len(pairwise_results) if pairwise_results else 'None'}, expected {len_criteria}\033[0m")
1420
  ui_updates['pairwise_results_for_display'] = default_pairwise_results
1421
 
1422
  # 6. Ensure all required arrays have correct lengths before returning
@@ -1428,8 +1426,8 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
1428
 
1429
  # Validate pairwise_results_for_display always has the right length
1430
  if len(ui_updates.get('pairwise_results_for_display', [])) != len_criteria:
1431
- print(f"\033[91m[ADVANCE_WORKFLOW] WARNING: Overriding pairwise_results_for_display due to length mismatch!\033[0m")
1432
- print(f"\033[91m Original length: {len(ui_updates.get('pairwise_results_for_display', []))}, Expected: {len_criteria}\033[0m")
1433
  ui_updates['pairwise_results_for_display'] = [gr.update(visible=False) for _ in range(len_criteria)]
1434
  if 'pairwise_results_for_display' in ui_updates:
1435
  if 'ui_buffer' not in progress_state:
@@ -1442,9 +1440,9 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
1442
 
1443
  def submit_pairwise_comparison(progress_state, data_subset_state, user_info, *pairwise_comparisons):
1444
  """Process pairwise comparison submission and transition to scoring mode."""
1445
- print(f"\033[95m=== SUBMIT PAIRWISE COMPARISON DEBUG ===\033[0m")
1446
- print(f"\033[95mInput progress_state: {progress_state}\033[0m")
1447
- print(f"\033[95mPairwise comparisons: {pairwise_comparisons}\033[0m")
1448
 
1449
  # Process input parameters
1450
  criteria_count = len_criteria
@@ -1497,7 +1495,7 @@ def submit_pairwise_comparison(progress_state, data_subset_state, user_info, *pa
1497
  # Check individual elements if it's a list
1498
  if isinstance(pairwise_results_for_display, list):
1499
  for i, item in enumerate(pairwise_results_for_display):
1500
- print(f"\033[96m Element {i}: {type(item)} - {item}\033[0m")
1501
 
1502
  return [
1503
  ui_updates.get('page1_visible'),
@@ -1520,9 +1518,9 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
1520
  Submit scoring results and proceed to the next step.
1521
  Simplified to use unified workflow management.
1522
  """
1523
- print(f"\033[95m=== SUBMIT PAIRWISE SCORING DEBUG ===\033[0m")
1524
- print(f"\033[95mInput progress_state: {progress_state}\033[0m")
1525
- print(f"\033[95mRatings: {ratings}\033[0m")
1526
 
1527
  # Save current ratings - now store by method instead of by pair
1528
  pair = progress_state['all_pairs'][progress_state['current_score_pair_index']]
@@ -1541,7 +1539,6 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
1541
  # Determine modal visibility based on completion status
1542
  all_scoring_done = (len(progress_state['scoring_done_pairs']) ==
1543
  len(progress_state['all_pairs']))
1544
- # next_question_modal_visibility = gr.update(visible=all_scoring_done)
1545
 
1546
  return [
1547
  ui_updates.get('page1_visible'), # 5
@@ -1575,10 +1572,10 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
1575
 
1576
  progress_state['scoring_done_pairs'].add(pair)
1577
 
1578
- print(f"\033[92m[SCORE_SAVE] Saved scores for {model_A}: {ratings_A}\033[0m")
1579
- print(f"\033[92m[SCORE_SAVE] Saved scores for {model_B}: {ratings_B}\033[0m")
1580
 
1581
- # **ADDED: Save results to database like submit_pairwise_comparison does**
1582
  # Extract pairwise comparison results for this pair
1583
  pairwise_results = progress_state['pairwise_results'].get(pair, [None] * len_criteria)
1584
  comparison_reasons = [None] * len_criteria # We don't have reasons for scoring page
@@ -1602,7 +1599,6 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
1602
  # Determine modal visibility based on completion status
1603
  all_scoring_done = (len(progress_state['scoring_done_pairs']) ==
1604
  len(progress_state['all_pairs']))
1605
- # next_question_modal_visibility = gr.update(visible=all_scoring_done)
1606
 
1607
 
1608
  if not all_scoring_done:
@@ -1634,7 +1630,7 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
1634
  user_info, our_methods
1635
  )
1636
 
1637
- if remaining_count == 0: # code TODO
1638
  gr.Info("You have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!")
1639
  return [
1640
  ui_updates.get('page1_visible'), # 5
@@ -1656,11 +1652,10 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
1656
  *ui_updates.get('ratings_A'),
1657
  *ui_updates.get('ratings_B'),
1658
  *ui_updates.get('pairwise_results_for_display'), # 26-33
1659
- # next_question_modal_visibility # 34
1660
  ]
1661
  # Use advance_workflow to get all UI updates
1662
  ui_updates = advance_workflow(progress_state, data_subset_state)
1663
- print(f"\033[93mIn submit_pairwise_scoring, using advance_workflow results: mode={progress_state.get('mode')}\033[0m")
1664
  num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
1665
  gr.Info(f"You are about to evaluate the next question. You have {num_remaining_questions} question(s) remaining to evaluate.") # eval_progress_text
1666
  return (
@@ -1686,37 +1681,6 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
1686
  # next_question_modal_visibility
1687
  )
1688
 
1689
-
1690
- # def proceed_to_next_question(user_info):
1691
- # """
1692
- # Refactored to reuse code from go_to_eval_progress_modal by implementing it using advance_workflow.
1693
- # This eliminates code duplication and ensures consistent UI behavior.
1694
- # """
1695
- # # Fetch next question state
1696
- # user_info_new, chat_a, chat_b, page1_prompt, data_subset_state, remaining_count, progress_state = get_next_eval_question(
1697
- # user_info['name'], user_info['email'], user_info['specialty'], user_info['subspecialty'],
1698
- # user_info['years_exp'], user_info['exp_explanation'], user_info['npi_id'], user_info['evaluator_id'], our_methods
1699
- # )
1700
-
1701
- # # Use advance_workflow to get all UI updates (same pattern as go_to_eval_progress_modal)
1702
- # ui_updates = advance_workflow(progress_state, data_subset_state)
1703
-
1704
- # print(f"\033[93mIn proceed_to_next_question, using advance_workflow results: mode={progress_state.get('mode')}\033[0m")
1705
-
1706
- # # Return exactly the elements bound in next_question_btn.click:
1707
- # return [
1708
- # user_info_new,
1709
- # ui_updates.get('chat_a_page1', chat_a), # Use content appropriate for the current mode
1710
- # ui_updates.get('chat_b_page1', chat_b), # Use content appropriate for the current mode
1711
- # ui_updates.get('page1_prompt', page1_prompt), # Use the prompt appropriate for the current mode
1712
- # page1_reference_answer,
1713
- # data_subset_state, # data_subset_state slot
1714
- # ui_updates.get('progress_state', progress_state), # progress_state slot
1715
- # ui_updates.get('page1_visible', gr.update(visible=True)), # page1 visibility based on mode
1716
- # ui_updates.get('page2_visible', gr.update(visible=False)), # page2 visibility based on mode
1717
- # gr.update(visible=False) # next_question_modal hidden
1718
- # ]
1719
-
1720
  # --- Define Callback Functions for Confirmation Flow ---
1721
  def build_row_dict(
1722
  data_subset_state,
@@ -1812,18 +1776,18 @@ def restrict_choices(progress_state, index, score_a, score_b):
1812
  Enforces rating constraints based on the pairwise choice for the given criterion index.
1813
  """
1814
  print(
1815
- f"\033[91mRestricting choices for index {index} with scores A: {score_a}, B: {score_b}\033[0m")
1816
  print(
1817
- f"\033[91mProgress state keys: {list(progress_state.keys()) if progress_state else 'None'}\033[0m")
1818
 
1819
  # Extract the pairwise choice for the current criterion
1820
  pairwise_choice = _extract_pairwise_choice(progress_state, index)
1821
 
1822
  if pairwise_choice is not None:
1823
  print(
1824
- f"\033[92mFound pairwise choice for criterion {index}: {pairwise_choice}\033[0m")
1825
  else:
1826
- print(f"\033[93mNo pairwise results found for criterion {index}\033[0m")
1827
 
1828
  # Skip if both scores are None
1829
  if score_a is None and score_b is None:
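`restrict_choices` and `apply_scoring_restrictions` both funnel into `_apply_rating_restrictions`, which narrows each radio's choices so the scores cannot contradict the pairwise verdict. A hedged sketch of that idea, assuming a 1-5 scale and that "better" means greater-or-equal (the real helper in app.py may differ):

    import gradio as gr

    # Hedged sketch: given the verdict and A's locked score, compute the
    # scores B may still take (ties allowed here; app.py may differ).
    def allowed_b_choices(pairwise_choice, locked_a):
        a = int(locked_a)
        if pairwise_choice == "Response A":   # A better: require B <= A
            return [str(v) for v in range(1, a + 1)]
        if pairwise_choice == "Response B":   # B better: require B >= A
            return [str(v) for v in range(a, 6)]
        if pairwise_choice == "Tie":
            return [str(a)]
        return [str(v) for v in range(1, 6)]

    result_B = gr.update(choices=allowed_b_choices("Response A", 4), value=None)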
@@ -1939,7 +1903,6 @@ with gr.Blocks(css=centered_col_css) as demo:
1939
  img.save(buffer, format="PNG")
1940
  encoded_string = base64.b64encode(
1941
  buffer.getvalue()).decode("utf-8")
1942
- # encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
1943
 
1944
  image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
1945
  ReasoningTraceExampleHTML = f"""
@@ -1951,9 +1914,9 @@ with gr.Blocks(css=centered_col_css) as demo:
1951
  gr.Markdown("""By clicking 'Next' below, you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
1952
  """)
1953
  gr.Markdown("## Please enter your information to get a question to evaluate. Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.")
1954
- name = gr.Textbox(label="Name (required)", value="Curt Ginder")
1955
  email = gr.Textbox(
1956
- label="Email (required). Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.", value="[email protected]")
1957
  evaluator_id = gr.Textbox(
1958
  label="Evaluator ID (auto-filled from email above)", interactive=False, visible=False)
1959
 
@@ -1992,8 +1955,6 @@ with gr.Blocks(css=centered_col_css) as demo:
1992
 
1993
  with Modal(visible=False, elem_id="confirm_modal") as eval_progress_modal:
1994
  eval_progress_text = gr.Markdown("You have X questions remaining.")
1995
- # eval_progress_proceed_btn = gr.Button(
1996
- # "OK, proceed to question evaluation")
1997
 
1998
  # Page 1: Pairwise Comparison.
1999
  with gr.Column(visible=False) as page1:
@@ -2218,22 +2179,6 @@ with gr.Blocks(css=centered_col_css) as demo:
2218
  # Clicking OK hides the modal.
2219
  ok_btn.click(lambda: gr.update(visible=False), None, error_modal)
2220
 
2221
- # Confirmation Modal: Ask for final submission confirmation.
2222
- # with Modal("Confirm Submission", visible=False, elem_id="confirm_modal") as confirm_modal:
2223
- # gr.Markdown(
2224
- # "Are you sure you want to submit? Once submitted, you cannot edit your responses.")
2225
- # with gr.Row():
2226
- # yes_btn = gr.Button("Yes, please submit")
2227
- # cancel_btn = gr.Button("Cancel")
2228
-
2229
- # Add modal for proceeding to next question
2230
- # with Modal("Next Question", visible=False, elem_id="next_question_modal") as next_question_modal:
2231
- # gr.Markdown(
2232
- # "You have completed this question. Click below to proceed to the next question.")
2233
- # next_question_btn = gr.Button("Next Question")
2234
-
2235
-
2236
-
2237
  # --- Define Transitions Between Pages ---
2238
 
2239
  # For the "Participate in Evaluation" button, transition to page0
@@ -2267,13 +2212,6 @@ with gr.Blocks(css=centered_col_css) as demo:
2267
  home_btn_2.click(lambda: (gr.update(visible=True), gr.update(
2268
  visible=False)), None, [page_minus1, page2], scroll_to_output=True)
2269
 
2270
- # Transition from Page 1 to Page 0 (Back button).
2271
- # back_btn_0.click(
2272
- # fn=lambda: (gr.update(visible=True), gr.update(visible=False)),
2273
- # inputs=None,
2274
- # outputs=[page0, page1]
2275
- # )
2276
-
2277
  # Transition from Page 1 (Pairwise) to the combined Rating Page (Page 2).
2278
  next_btn_1.click(
2279
  fn=submit_pairwise_comparison,
@@ -2291,27 +2229,6 @@ with gr.Blocks(css=centered_col_css) as demo:
2291
  scroll_to_output=True,
2292
  )
2293
 
2294
- # Transition from Rating Page (Page 2) back to Pairwise page.
2295
- # back_btn_2.click(
2296
- # fn=lambda: (gr.update(visible=True), gr.update(visible=False)),
2297
- # inputs=None,
2298
- # outputs=[page1, page2],
2299
- # scroll_to_output=True
2300
- # )
2301
-
2302
- # Wire up the modal button to proceed_to_next_question and reset all UI for the new question
2303
- # next_question_btn.click(
2304
- # fn=proceed_to_next_question,
2305
- # inputs=[user_info_state],
2306
- # outputs=[
2307
- # user_info_state,
2308
- # chat_a_page1, chat_b_page1, page1_prompt, page1_reference_answer,
2309
- # data_subset_state, progress_state,
2310
- # page1, page2, next_question_modal
2311
- # ],
2312
- # scroll_to_output=True
2313
- # )
2314
-
2315
  submit_btn.click(
2316
  fn=submit_pairwise_scoring,
2317
  inputs=[progress_state,
 
15
  EVALUATOR_MAP_DICT = "evaluator_map_dict.json"
16
  TXAGENT_RESULTS_SHEET_BASE_NAME = "TxAgent_Human_Eval_Results_CROWDSOURCED"
17
  our_methods = ['TxAgent-T1-Llama-3.1-8B', 'Q3-8B-qlora-biov13_merged']
18
+ # Load tool lists from 'tool_lists' subdirectory
 
19
  tools_dir = os.path.join(os.getcwd(), 'tool_lists')
20
 
21
  # Initialize an empty dictionary to store the results
 
38
  print(f"Error processing {filename}: {e}")
39
  results[key] = [f"Error loading {filename}"]
40
 
41
+ # Tool database labels for different tool calls in format_chat
42
  tool_database_labels_raw = {
43
  "chembl_tools": "**from the ChEMBL database**",
44
  "efo_tools": "**from the Experimental Factor Ontology**",
 
294
  local_path = hf_hub_download(
295
  repo_id=REPO_ID,
296
  repo_type="dataset",
297
+ # Fetches the most recent version of the dataset each time this command is called
298
  revision="main",
299
  filename=remote_path,
 
300
  token=os.getenv("HF_TOKEN")
301
  )
302
  with open(local_path, "r") as f:
 
314
  if not evaluator_question_ids:
315
  return [], data_by_filename
316
 
317
+ # Check if evaluator has already completed any questions
318
+ # Must go through every tuple of (question_ID, TxAgent, other model)
319
  model_names = [key for key in data_by_filename.keys()
320
  if key not in our_methods]
321
  full_question_ids_list = []
 
329
  results_df = read_sheet_to_df(custom_sheet_name=str(
330
  TXAGENT_RESULTS_SHEET_BASE_NAME + f"_{str(evaluator_id)}"))
331
  if results_df is not None and not results_df.empty:
332
+ # Only consider records where both "Pairwise comparison" and "scoring" fields are filled
333
  comparison_cols = [
334
  f"Criterion_{c['label']} Comparison: Which is Better?"
335
  for c in criteria_for_comparison
 
340
  for _, row in results_df.iterrows():
341
  q = row.get("Question ID")
342
  a, b = row.get("ResponseA_Model"), row.get("ResponseB_Model")
343
+ # Ensure our_methods comes first
344
  if a in our_methods and b not in our_methods:
345
  pair = (q, a, b)
346
  elif b in our_methods and a not in our_methods:
 
348
  else:
349
  continue
350
  complete = True
351
+ # Check all pairwise comparison columns
352
  for col in comparison_cols:
353
  if not row.get(col):
354
  complete = False
355
  break
356
+ # If pairwise is complete, check all scoring columns
357
  if complete:
358
  for col in scoreA_cols + scoreB_cols:
359
  if not row.get(col):
 
361
  break
362
  if complete:
363
  matched_pairs.add(pair)
364
+ # Only filter out truly completed pairs; incomplete ones (with missing values) are retained
365
  full_question_ids_list = [
366
  t for t in full_question_ids_list if t not in matched_pairs
367
  ]
 
441
  # Validate required fields
442
  validation_error = validate_required_fields(
443
  name, email, evaluator_id, specialty_dd, years_exp_radio)
444
+ print(f"In go_to_eval_progress_modal, validation_error={validation_error}")
445
  if validation_error:
446
  return (
447
  gr.update(visible=True), # page0
 
510
  # Use advance_workflow to get all UI updates
511
  ui_updates = advance_workflow(progress_state, data_subset_state)
512
 
513
+ print(f"In go_to_eval_progress_modal, using advance_workflow results: mode={progress_state.get('mode')}")
514
 
515
  # Map advance_workflow outputs to the required return format
516
  num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
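`remaining_count` counts remaining (question, model pair) tuples, so integer division by the number of pairs per question yields whole questions; for example:

    # Worked example of the remaining-questions arithmetic above.
    remaining_count = 6                 # six unevaluated (question, pair) tuples
    pairs_per_question = 2              # len(progress_state['all_pairs'])
    assert remaining_count // pairs_per_question == 3  # three questions left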
 
572
  completed the pairwise comparison phase.
573
  """
574
  for idx, pair in enumerate(progress_state['all_pairs']):
575
+ # Ensure the pair has completed the pairwise comparison phase and hasn't been scored yet
576
  if pair in progress_state.get('pairwise_done', set()) and pair not in progress_state['scoring_done_pairs']:
577
  progress_state['current_score_pair_index'] = idx
578
  return pair
 
590
  if df is None or df.empty:
591
  return None
592
 
593
+ # Only keep rows for current question_id
594
  df_q = df[df["Question ID"] == question_id]
595
  if df_q.empty:
596
  return None
 
600
  scoring_done_pairs = set()
601
  pairwise_scores = {}
602
 
603
+ # Iterate through each record to extract model pairs, comparison results and scores
604
  for _, row in df_q.iterrows():
605
  a, b = row["ResponseA_Model"], row["ResponseB_Model"]
606
  pair = (a, b)
 
615
  comps.append(mapped_value)
616
  pairwise_results[pair] = comps
617
 
618
+ # Collect scores if scoring columns exist
619
  first_score = f"ScoreA_{criteria[0]['label']}"
620
  if first_score in row and row[first_score] not in (None, ""):
621
+ # Store scores by method instead of by pair
622
  scores_A = [row.get(f"ScoreA_{c['label']}") for c in criteria]
623
  scores_B = [row.get(f"ScoreB_{c['label']}") for c in criteria]
624
  scoring_done_pairs.add(pair)
 
627
  pairwise_scores[a] = scores_A
628
  pairwise_scores[b] = scores_B
629
 
630
+ # Intelligently set mode based on existing data
631
+ # 1. If there are completed pairwise comparisons but no corresponding scores, enter scoring mode
632
+ # 2. If both pairwise comparisons and scores are complete, let advance_workflow determine whether incomplete pairs remain
633
+ # 3. If there are no completed pairwise comparisons, stay in pairwise comparison mode
634
 
635
+ determined_mode = "pairwise" # Default mode
636
 
637
  if pairwise_done:
638
+ # There are completed pairwise comparisons
639
+ # Check for pairs that are compared but not yet scored
640
  unscored_pairs = pairwise_done - scoring_done_pairs
641
  if unscored_pairs:
642
+ # There are compared-but-unscored pairs, so enter scoring mode
643
  determined_mode = "scoring"
644
+ print(f"load_progress_state: Found {len(unscored_pairs)} unscored pairs, setting mode to 'scoring'")
645
  else:
646
+ # All compared pairs are scored; let advance_workflow decide the next step
647
+ determined_mode = "pairwise" # There may still be uncompared pairs
648
+ print(f"load_progress_state: All pairwise comparisons are scored, setting mode to 'pairwise' (will be corrected by advance_workflow)")
649
  else:
650
+ # No completed pairwise comparisons, so definitely pairwise comparison mode
651
  determined_mode = "pairwise"
652
+ print(f"load_progress_state: No completed pairwise comparisons, setting mode to 'pairwise'")
653
 
654
+ # Construct complete progress_state (all_pairs, all_models will be overwritten later)
655
  progress_state = {
656
  "current_question_index": 0,
657
  "current_pair_index": 0,
 
660
  "pairwise_results": pairwise_results,
661
  "scoring_done_pairs": scoring_done_pairs,
662
  "pairwise_scores": pairwise_scores,
663
+ "all_pairs": [], # Reset later based on models_full
664
+ "all_models": [], # Reset later based on models_full
665
  "evaluator_id": evaluator_id,
666
+ "mode": determined_mode, # Intelligently set mode
667
  }
668
+ print(progress_state)
669
  return progress_state
670
 
671
 
 
925
 
926
  # Create enhanced pairwise results display with conflict warnings
927
  pairwise_results_for_display = []
928
+ for i in range(len_criteria): # Use len_criteria to generate the correct number of display elements
929
+ # Get the corresponding comparison result
930
  choice = pairwise_results[i] if i < len(pairwise_results) else None
931
 
932
+ # Normalize choice, supporting both mapping key and value forms
933
  normalized_choice = choice
934
  if choice in mapping:
935
  normalized_choice = mapping[choice]
 
986
  pairwise_results_for_display.append(gr.Markdown(display_html, visible=True))
987
 
988
 
989
+ # Ensure correct length
990
  assert len(pairwise_results_for_display) == len_criteria, f"Expected {len_criteria}, got {len(pairwise_results_for_display)}"
991
 
992
  # Create enhanced chatbot components for scoring page
 
1029
  'chat_a_page2': None, # Scoring content (unused in pairwise)
1030
  'chat_b_page2': None, # Scoring content (unused in pairwise)
1031
  'page2_prompt': None, # Scoring prompt (unused in pairwise)
1032
+ 'pairwise_results_for_display': [gr.update(visible=False) for _ in range(len_criteria)] # Ensure correct length
1033
  }
1034
 
1035
 
 
1204
  2. If one score exists, lock it and apply restrictions to the other based on the locked score
1205
  3. If no scores exist, apply normal pairwise restrictions
1206
  """
1207
+ print(f"[DEBUG] apply_scoring_restrictions called - criterion {index}: input score_a={score_a}, score_b={score_b}")
1208
 
1209
  # Extract the pairwise choice for the current criterion (needed for restrictions)
1210
  pairwise_choice = _extract_pairwise_choice(progress_state, index)
1211
 
1212
  if pairwise_choice is not None:
1213
+ print(f"[Auto-restrict] Found pairwise choice for criterion {index}: {pairwise_choice}")
1214
  else:
1215
+ print(f"[Auto-restrict] No pairwise choice found for criterion {index}")
1216
 
1217
  # Case 1: Both scores exist - COMPARISON-FIRST logic
1218
  if score_a is not None and score_b is not None:
1219
+ print(f"[COMPARISON-FIRST] Both scores exist: A={score_a}, B={score_b} - checking for conflicts with pairwise choice")
1220
 
1221
  # Use unified conflict detection
1222
  conflict_detected = _detect_scoring_conflict(pairwise_choice, score_a, score_b)
1223
 
1224
  if conflict_detected:
1225
+ print(f"[CONFLICT] Pairwise choice conflicts with scores: A={score_a}, B={score_b}")
1226
+ print(f"[COMPARISON-FIRST] Conflict detected - prioritizing pairwise comparison, clearing existing scores")
1227
  # Apply normal pairwise restrictions with cleared values to prioritize pairwise choice
1228
  result_A, result_B = _apply_rating_restrictions(pairwise_choice, None, None, include_values=True)
1229
  else:
1230
+ print(f"[NO-CONFLICT] Existing scores are compatible with pairwise choice - locking both")
1231
  # Ensure scores are strings for Gradio compatibility
1232
  score_a_str = str(score_a)
1233
  score_b_str = str(score_b)
 
1236
 
1237
  # Case 2: Only score_a exists - lock A, restrict B based on A's value (B value stays None)
1238
  elif score_a is not None and score_b is None:
1239
+ print(f"[LOCK] Found existing score_a={score_a}, locking A and restricting B based on A (B value=None)")
1240
  score_a_str = str(score_a)
1241
  result_A = gr.update(choices=[score_a_str], value=score_a_str)
1242
  # Apply restrictions to B based on A's locked value and pairwise choice, but keep B value as None
 
1246
 
1247
  # Case 3: Only score_b exists - lock B, restrict A based on B's value (A value stays None)
1248
  elif score_a is None and score_b is not None:
1249
+ print(f"[LOCK] Found existing score_b={score_b}, locking B and restricting A based on B (A value=None)")
1250
  score_b_str = str(score_b)
1251
  result_B = gr.update(choices=[score_b_str], value=score_b_str)
1252
  # Apply restrictions to A based on B's locked value and pairwise choice, but keep A value as None
 
1256
 
1257
  # Case 4: No scores exist - apply normal pairwise restrictions with None values to ensure clearing
1258
  else:
1259
+ print(f"[CLEAR] No existing scores - applying normal restrictions with None values to ensure clearing")
1260
  # Apply restrictions using the shared utility function with explicit None values
1261
  result_A, result_B = _apply_rating_restrictions(pairwise_choice, None, None, include_values=True)
1262
 
1263
+ print(f"[DEBUG] apply_scoring_restrictions result - criterion {index}: returning gradio updates")
1264
 
1265
  return result_A, result_B
1266
 
 
1269
  """
1270
  Unified workflow manager that handles all state transitions and UI updates.
1271
  """
1272
+ print(f"Advance workflow called, previous mode: {progress_state.get('mode')}")
1273
  print(progress_state)
1274
  default_pairwise_results = [gr.update(visible=False) for _ in range(len_criteria)]
1275
 
 
1319
  next_pairwise_pair = get_next_uncompleted_pair(progress_state)
1320
  next_scoring_pair = get_next_unscored_pair(progress_state)
1321
 
1322
+ # 2. Determine workflow phase and set mode
1323
  if next_pairwise_pair is not None:
1324
  progress_state['mode'] = 'pairwise'
1325
  next_pair = next_pairwise_pair
1326
+ print(f"Pairwise mode: next pair {next_pair}")
1327
  elif next_scoring_pair is not None:
1328
  progress_state['mode'] = 'scoring'
1329
  next_pair = next_scoring_pair
1330
+ print(f"Scoring mode: next pair {next_pair}")
1331
  else:
1332
  progress_state['mode'] = 'completed'
1333
  next_pair = None
 
1352
  'pairwise_results_for_display': default_pairwise_results
1353
  }
1354
 
1355
+ # 4. If in scoring mode, check historical scoring records
1356
  if current_mode == 'scoring' and next_pair is not None:
1357
+ # Get historical scores for current model pair (if exists)
1358
  current_model_A, current_model_B = next_pair
1359
 
1360
+ # Search for historical scores directly from method-keyed storage - simpler and more efficient!
1361
  relevant_scores_A = progress_state.get('pairwise_scores', {}).get(current_model_A)
1362
  relevant_scores_B = progress_state.get('pairwise_scores', {}).get(current_model_B)
1363
 
1364
+ print(f"Relevant scores for {next_pair}: A={relevant_scores_A}, B={relevant_scores_B}")
1365
 
1366
  # Debug check for None values
1367
  if relevant_scores_A is None:
1368
+ print(f"[DEBUG] relevant_scores_A is None for pair {next_pair}")
1369
  if relevant_scores_B is None:
1370
+ print(f"[DEBUG] relevant_scores_B is None for pair {next_pair}")
1371
 
1372
+ # Set initial values for UI scoring controls
1373
  ratings_A_updates = []
1374
  ratings_B_updates = []
1375
 
1376
+ # Set initial values for the model A and model B scoring controls and apply scoring restrictions
1377
  for i in range(len_criteria):
1378
+ # For model A: use the historical score if available, otherwise clear
1379
  score_A = relevant_scores_A[i] if relevant_scores_A and i < len(relevant_scores_A) else None
1380
+ # For model B: use the historical score if available, otherwise clear
1381
  score_B = relevant_scores_B[i] if relevant_scores_B and i < len(relevant_scores_B) else None
1382
 
1383
+ print(f"[DEBUG] Criterion {i}: Before apply_scoring_restrictions - score_A={score_A}, score_B={score_B}")
1384
 
1385
+ # Apply scoring restrictions based on pairwise comparison results
1386
  restricted_update_A, restricted_update_B = apply_scoring_restrictions(
1387
  progress_state, i, score_A, score_B)
1388
 
1389
+ print(f"[DEBUG] Criterion {i}: After apply_scoring_restrictions - restricted_update_A={restricted_update_A}, restricted_update_B={restricted_update_B}")
1390
 
1391
  ratings_A_updates.append(restricted_update_A)
1392
  ratings_B_updates.append(restricted_update_B)
 
1394
  ui_updates['ratings_A'] = ratings_A_updates
1395
  ui_updates['ratings_B'] = ratings_B_updates
1396
 
1397
+ print(f"Loaded relevant scores for pair {next_pair}: A={relevant_scores_A}, B={relevant_scores_B}")
1398
+ print(f"Applied automatic scoring restrictions based on pairwise choices")
1399
  else:
1400
+ # Non-scoring mode, keep blank
1401
  ui_updates['ratings_A'] = [gr.update(value=None) for _ in range(len_criteria)]
1402
  ui_updates['ratings_B'] = [gr.update(value=None) for _ in range(len_criteria)]
1403
 
 
1405
  if next_pair is not None:
1406
  print("debug: Extracting UI content for next pair")
1407
  print("progress_state:", progress_state)
 
1408
  print("next_pair:", next_pair)
1409
  content_updates = extract_ui_content_by_mode(progress_state, data_subset_state, next_pair)
1410
  ui_updates.update(content_updates)
1411
 
1412
+ # If extract_ui_content_by_mode returned a new pairwise_results_for_display,
1413
+ # ensure it has the correct length
1414
  if 'pairwise_results_for_display' in content_updates:
1415
  pairwise_results = content_updates['pairwise_results_for_display']
1416
  if pairwise_results is None or len(pairwise_results) != len_criteria:
1417
+ print(f"Warning: pairwise_results_for_display has incorrect length {len(pairwise_results) if pairwise_results else 'None'}, expected {len_criteria}")
1418
  ui_updates['pairwise_results_for_display'] = default_pairwise_results
1419
 
1420
  # 6. Ensure all required arrays have correct lengths before returning
 
1426
 
1427
  # Validate pairwise_results_for_display always has the right length
1428
  if len(ui_updates.get('pairwise_results_for_display', [])) != len_criteria:
1429
+ print(f"[ADVANCE_WORKFLOW] WARNING: Overriding pairwise_results_for_display due to length mismatch!")
1430
+ print(f" Original length: {len(ui_updates.get('pairwise_results_for_display', []))}, Expected: {len_criteria}")
1431
  ui_updates['pairwise_results_for_display'] = [gr.update(visible=False) for _ in range(len_criteria)]
1432
  if 'pairwise_results_for_display' in ui_updates:
1433
  if 'ui_buffer' not in progress_state:
 
1440
 
1441
  def submit_pairwise_comparison(progress_state, data_subset_state, user_info, *pairwise_comparisons):
1442
  """Process pairwise comparison submission and transition to scoring mode."""
1443
+ print(f"=== SUBMIT PAIRWISE COMPARISON DEBUG ===")
1444
+ print(f"Input progress_state: {progress_state}")
1445
+ print(f"Pairwise comparisons: {pairwise_comparisons}")
1446
 
1447
  # Process input parameters
1448
  criteria_count = len_criteria
 
1495
  # Check individual elements if it's a list
1496
  if isinstance(pairwise_results_for_display, list):
1497
  for i, item in enumerate(pairwise_results_for_display):
1498
+ print(f" Element {i}: {type(item)} - {item}")
1499
 
1500
  return [
1501
  ui_updates.get('page1_visible'),
 
1518
  Submit scoring results and proceed to the next step.
1519
  Simplified to use unified workflow management.
1520
  """
1521
+ print(f"=== SUBMIT PAIRWISE SCORING DEBUG ===")
1522
+ print(f"Input progress_state: {progress_state}")
1523
+ print(f"Ratings: {ratings}")
1524
 
1525
  # Save current ratings - now store by method instead of by pair
1526
  pair = progress_state['all_pairs'][progress_state['current_score_pair_index']]
 
1539
  # Determine modal visibility based on completion status
1540
  all_scoring_done = (len(progress_state['scoring_done_pairs']) ==
1541
  len(progress_state['all_pairs']))
 
1542
 
1543
  return [
1544
  ui_updates.get('page1_visible'), # 5
 
1572
 
1573
  progress_state['scoring_done_pairs'].add(pair)
1574
 
1575
+ print(f"[SCORE_SAVE] Saved scores for {model_A}: {ratings_A}")
1576
+ print(f"[SCORE_SAVE] Saved scores for {model_B}: {ratings_B}")
1577
 
1578
+ # Save results to the database, as submit_pairwise_comparison does
1579
  # Extract pairwise comparison results for this pair
1580
  pairwise_results = progress_state['pairwise_results'].get(pair, [None] * len_criteria)
1581
  comparison_reasons = [None] * len_criteria # We don't have reasons for scoring page
 
1599
  # Determine modal visibility based on completion status
1600
  all_scoring_done = (len(progress_state['scoring_done_pairs']) ==
1601
  len(progress_state['all_pairs']))
 
1602
 
1603
 
1604
  if not all_scoring_done:
 
1630
  user_info, our_methods
1631
  )
1632
 
1633
+ if remaining_count == 0: # TODO: handle completion
1634
  gr.Info("You have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!")
1635
  return [
1636
  ui_updates.get('page1_visible'), # 5
 
1652
  *ui_updates.get('ratings_A'),
1653
  *ui_updates.get('ratings_B'),
1654
  *ui_updates.get('pairwise_results_for_display'), # 26-33
 
1655
  ]
1656
  # Use advance_workflow to get all UI updates
1657
  ui_updates = advance_workflow(progress_state, data_subset_state)
1658
+ print(f"In submit_pairwise_scoring, using advance_workflow results: mode={progress_state.get('mode')}")
1659
  num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
1660
  gr.Info(f"You are about to evaluate the next question. You have {num_remaining_questions} question(s) remaining to evaluate.") # eval_progress_text
1661
  return (
 
1681
  # next_question_modal_visibility
1682
  )
1683
1684
  # --- Define Callback Functions for Confirmation Flow ---
1685
  def build_row_dict(
1686
  data_subset_state,
 
1776
  Enforces rating constraints based on the pairwise choice for the given criterion index.
1777
  """
1778
  print(
1779
+ f"Restricting choices for index {index} with scores A: {score_a}, B: {score_b}")
1780
  print(
1781
+ f"Progress state keys: {list(progress_state.keys()) if progress_state else 'None'}")
1782
 
1783
  # Extract the pairwise choice for the current criterion
1784
  pairwise_choice = _extract_pairwise_choice(progress_state, index)
1785
 
1786
  if pairwise_choice is not None:
1787
  print(
1788
+ f"Found pairwise choice for criterion {index}: {pairwise_choice}")
1789
  else:
1790
+ print(f"No pairwise results found for criterion {index}")
1791
 
1792
  # Skip if both scores are None
1793
  if score_a is None and score_b is None:
 
1903
  img.save(buffer, format="PNG")
1904
  encoded_string = base64.b64encode(
1905
  buffer.getvalue()).decode("utf-8")
 
1906
 
1907
  image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
1908
  ReasoningTraceExampleHTML = f"""
 
1914
  gr.Markdown("""By clicking 'Next' below, you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
1915
  """)
1916
  gr.Markdown("## Please enter your information to get a question to evaluate. Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.")
1917
+ name = gr.Textbox(label="Name (required)", value="")
1918
  email = gr.Textbox(
1919
+ label="Email (required). Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.", value="")
1920
  evaluator_id = gr.Textbox(
1921
  label="Evaluator ID (auto-filled from email above)", interactive=False, visible=False)
1922
 
 
1955
 
1956
  with Modal(visible=False, elem_id="confirm_modal") as eval_progress_modal:
1957
  eval_progress_text = gr.Markdown("You have X questions remaining.")
 
 
1958
 
1959
  # Page 1: Pairwise Comparison.
1960
  with gr.Column(visible=False) as page1:
 
2179
  # Clicking OK hides the modal.
2180
  ok_btn.click(lambda: gr.update(visible=False), None, error_modal)
2181
2182
  # --- Define Transitions Between Pages ---
2183
 
2184
  # For the "Participate in Evaluation" button, transition to page0
 
2212
  home_btn_2.click(lambda: (gr.update(visible=True), gr.update(
2213
  visible=False)), None, [page_minus1, page2], scroll_to_output=True)
2214
2215
  # Transition from Page 1 (Pairwise) to the combined Rating Page (Page 2).
2216
  next_btn_1.click(
2217
  fn=submit_pairwise_comparison,
 
2229
  scroll_to_output=True,
2230
  )
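All of the page transitions wired in this block follow the same `Button.click(fn, inputs, outputs)` pattern, toggling column visibility; a minimal self-contained sketch (component names here are illustrative, not the ones in app.py):

    import gradio as gr

    with gr.Blocks() as demo:
        with gr.Column(visible=True) as page_a:
            go = gr.Button("Next")
        with gr.Column(visible=False) as page_b:
            gr.Markdown("Second page")

        # Hide page_a and reveal page_b on click, mirroring the
        # show/hide hand-off used throughout this file.
        go.click(
            fn=lambda: (gr.update(visible=False), gr.update(visible=True)),
            inputs=None,
            outputs=[page_a, page_b],
        )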
2231
2232
  submit_btn.click(
2233
  fn=submit_pairwise_scoring,
2234
  inputs=[progress_state,