update
app.py
CHANGED
|
@@ -15,8 +15,7 @@ REPO_ID = "agenticx/TxAgentEvalData"
|
|
| 15 |
EVALUATOR_MAP_DICT = "evaluator_map_dict.json"
|
| 16 |
TXAGENT_RESULTS_SHEET_BASE_NAME = "TxAgent_Human_Eval_Results_CROWDSOURCED"
|
| 17 |
our_methods = ['TxAgent-T1-Llama-3.1-8B', 'Q3-8B-qlora-biov13_merged']
|
| 18 |
-
# Load tool lists from 'tool_lists' subdirectory
|
| 19 |
-
# with the latest from ToolUniverse if necessary!
|
| 20 |
tools_dir = os.path.join(os.getcwd(), 'tool_lists')
|
| 21 |
|
| 22 |
# Initialize an empty dictionary to store the results
|
|
@@ -39,7 +38,7 @@ for filename in os.listdir(tools_dir):
|
|
| 39 |
print(f"Error processing {filename}: {e}")
|
| 40 |
results[key] = [f"Error loading {filename}"]
|
| 41 |
|
| 42 |
-
#
|
| 43 |
tool_database_labels_raw = {
|
| 44 |
"chembl_tools": "**from the ChEMBL database**",
|
| 45 |
"efo_tools": "**from the Experimental Factor Ontology**",
|
|
@@ -295,10 +294,9 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
|
|
| 295 |
local_path = hf_hub_download(
|
| 296 |
repo_id=REPO_ID,
|
| 297 |
repo_type="dataset",
|
| 298 |
-
#
|
| 299 |
revision="main",
|
| 300 |
filename=remote_path,
|
| 301 |
-
# force_download=True,
|
| 302 |
token=os.getenv("HF_TOKEN")
|
| 303 |
)
|
| 304 |
with open(local_path, "r") as f:
|
|
@@ -316,7 +314,8 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
|
|
| 316 |
if not evaluator_question_ids:
|
| 317 |
return [], data_by_filename
|
| 318 |
|
| 319 |
-
#
|
|
|
|
| 320 |
model_names = [key for key in data_by_filename.keys()
|
| 321 |
if key not in our_methods]
|
| 322 |
full_question_ids_list = []
|
|
@@ -330,7 +329,7 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
|
|
| 330 |
results_df = read_sheet_to_df(custom_sheet_name=str(
|
| 331 |
TXAGENT_RESULTS_SHEET_BASE_NAME + f"_{str(evaluator_id)}"))
|
| 332 |
if results_df is not None and not results_df.empty:
|
| 333 |
-
#
|
| 334 |
comparison_cols = [
|
| 335 |
f"Criterion_{c['label']} Comparison: Which is Better?"
|
| 336 |
for c in criteria_for_comparison
|
|
@@ -341,7 +340,7 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
|
|
| 341 |
for _, row in results_df.iterrows():
|
| 342 |
q = row.get("Question ID")
|
| 343 |
a, b = row.get("ResponseA_Model"), row.get("ResponseB_Model")
|
| 344 |
-
#
|
| 345 |
if a in our_methods and b not in our_methods:
|
| 346 |
pair = (q, a, b)
|
| 347 |
elif b in our_methods and a not in our_methods:
|
|
@@ -349,12 +348,12 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
|
|
| 349 |
else:
|
| 350 |
continue
|
| 351 |
complete = True
|
| 352 |
-
#
|
| 353 |
for col in comparison_cols:
|
| 354 |
if not row.get(col):
|
| 355 |
complete = False
|
| 356 |
break
|
| 357 |
-
#
|
| 358 |
if complete:
|
| 359 |
for col in scoreA_cols + scoreB_cols:
|
| 360 |
if not row.get(col):
|
|
@@ -362,7 +361,7 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_me
|
|
| 362 |
break
|
| 363 |
if complete:
|
| 364 |
matched_pairs.add(pair)
|
| 365 |
-
#
|
| 366 |
full_question_ids_list = [
|
| 367 |
t for t in full_question_ids_list if t not in matched_pairs
|
| 368 |
]
|
|
@@ -442,7 +441,7 @@ def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecia
|
|
| 442 |
# Validate required fields
|
| 443 |
validation_error = validate_required_fields(
|
| 444 |
name, email, evaluator_id, specialty_dd, years_exp_radio)
|
| 445 |
-
print(f"\033[93mIn go_to_eval_progress_modal, validation_error={validation_error}\033[0m")
|
| 446 |
if validation_error:
|
| 447 |
return (
|
| 448 |
gr.update(visible=True), # page0
|
|
@@ -511,7 +510,7 @@ def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecia
|
|
| 511 |
# Use advance_workflow to get all UI updates
|
| 512 |
ui_updates = advance_workflow(progress_state, data_subset_state)
|
| 513 |
|
| 514 |
-
print(f"\033[93mIn go_to_eval_progress_modal, using advance_workflow results: mode={progress_state.get('mode')}\033[0m")
|
| 515 |
|
| 516 |
# Map advance_workflow outputs to the required return format
|
| 517 |
num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
|
|
@@ -573,7 +572,7 @@ def get_next_unscored_pair(progress_state):
|
|
| 573 |
completed the pairwise comparison phase.
|
| 574 |
"""
|
| 575 |
for idx, pair in enumerate(progress_state['all_pairs']):
|
| 576 |
-
#
|
| 577 |
if pair in progress_state.get('pairwise_done', set()) and pair not in progress_state['scoring_done_pairs']:
|
| 578 |
progress_state['current_score_pair_index'] = idx
|
| 579 |
return pair
|
|
@@ -591,7 +590,7 @@ def load_progress_state(evaluator_id, question_id):
|
|
| 591 |
if df is None or df.empty:
|
| 592 |
return None
|
| 593 |
|
| 594 |
-
#
|
| 595 |
df_q = df[df["Question ID"] == question_id]
|
| 596 |
if df_q.empty:
|
| 597 |
return None
|
|
@@ -601,7 +600,7 @@ def load_progress_state(evaluator_id, question_id):
|
|
| 601 |
scoring_done_pairs = set()
|
| 602 |
pairwise_scores = {}
|
| 603 |
|
| 604 |
-
#
|
| 605 |
for _, row in df_q.iterrows():
|
| 606 |
a, b = row["ResponseA_Model"], row["ResponseB_Model"]
|
| 607 |
pair = (a, b)
|
|
@@ -616,10 +615,10 @@ def load_progress_state(evaluator_id, question_id):
|
|
| 616 |
comps.append(mapped_value)
|
| 617 |
pairwise_results[pair] = comps
|
| 618 |
|
| 619 |
-
#
|
| 620 |
first_score = f"ScoreA_{criteria[0]['label']}"
|
| 621 |
if first_score in row and row[first_score] not in (None, ""):
|
| 622 |
-
#
|
| 623 |
scores_A = [row.get(f"ScoreA_{c['label']}") for c in criteria]
|
| 624 |
scores_B = [row.get(f"ScoreB_{c['label']}") for c in criteria]
|
| 625 |
scoring_done_pairs.add(pair)
|
|
@@ -628,31 +627,31 @@ def load_progress_state(evaluator_id, question_id):
|
|
| 628 |
pairwise_scores[a] = scores_A
|
| 629 |
pairwise_scores[b] = scores_B
|
| 630 |
|
| 631 |
-
#
|
| 632 |
-
# 1.
|
| 633 |
-
# 2.
|
| 634 |
-
# 3.
|
| 635 |
|
| 636 |
-
determined_mode = "pairwise" #
|
| 637 |
|
| 638 |
if pairwise_done:
|
| 639 |
-
#
|
| 640 |
-
#
|
| 641 |
unscored_pairs = pairwise_done - scoring_done_pairs
|
| 642 |
if unscored_pairs:
|
| 643 |
-
#
|
| 644 |
determined_mode = "scoring"
|
| 645 |
-
print(f"\033[93mload_progress_state: Found {len(unscored_pairs)} unscored pairs, setting mode to 'scoring'\033[0m")
|
| 646 |
else:
|
| 647 |
-
#
|
| 648 |
-
determined_mode = "pairwise" #
|
| 649 |
-
print(f"\033[93mload_progress_state: All pairwise comparisons are scored, setting mode to 'pairwise' (will be corrected by advance_workflow)\033[0m")
|
| 650 |
else:
|
| 651 |
-
#
|
| 652 |
determined_mode = "pairwise"
|
| 653 |
-
print(f"\033[93mload_progress_state: No completed pairwise comparisons, setting mode to 'pairwise'\033[0m")
|
| 654 |
|
| 655 |
-
#
|
| 656 |
progress_state = {
|
| 657 |
"current_question_index": 0,
|
| 658 |
"current_pair_index": 0,
|
|
@@ -661,12 +660,12 @@ def load_progress_state(evaluator_id, question_id):
|
|
| 661 |
"pairwise_results": pairwise_results,
|
| 662 |
"scoring_done_pairs": scoring_done_pairs,
|
| 663 |
"pairwise_scores": pairwise_scores,
|
| 664 |
-
"all_pairs": [], #
|
| 665 |
-
"all_models": [], #
|
| 666 |
"evaluator_id": evaluator_id,
|
| 667 |
-
"mode": determined_mode, #
|
| 668 |
}
|
| 669 |
-
print(
|
| 670 |
return progress_state
|
| 671 |
|
| 672 |
|
|
@@ -926,11 +925,11 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
|
|
| 926 |
|
| 927 |
# Create enhanced pairwise results display with conflict warnings
|
| 928 |
pairwise_results_for_display = []
|
| 929 |
-
for i in range(len_criteria): #
|
| 930 |
-
#
|
| 931 |
choice = pairwise_results[i] if i < len(pairwise_results) else None
|
| 932 |
|
| 933 |
-
#
|
| 934 |
normalized_choice = choice
|
| 935 |
if choice in mapping:
|
| 936 |
normalized_choice = mapping[choice]
|
|
@@ -987,7 +986,7 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
|
|
| 987 |
pairwise_results_for_display.append(gr.Markdown(display_html, visible=True))
|
| 988 |
|
| 989 |
|
| 990 |
-
#
|
| 991 |
assert len(pairwise_results_for_display) == len_criteria, f"Expected {len_criteria}, got {len(pairwise_results_for_display)}"
|
| 992 |
|
| 993 |
# Create enhanced chatbot components for scoring page
|
|
@@ -1030,7 +1029,7 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
|
|
| 1030 |
'chat_a_page2': None, # Scoring content (unused in pairwise)
|
| 1031 |
'chat_b_page2': None, # Scoring content (unused in pairwise)
|
| 1032 |
'page2_prompt': None, # Scoring prompt (unused in pairwise)
|
| 1033 |
-
'pairwise_results_for_display': [gr.update(visible=False) for _ in range(len_criteria)] #
|
| 1034 |
}
|
| 1035 |
|
| 1036 |
|
|
@@ -1205,30 +1204,30 @@ def apply_scoring_restrictions(progress_state, index, score_a, score_b):
|
|
| 1205 |
2. If one score exists, lock it and apply restrictions to the other based on the locked score
|
| 1206 |
3. If no scores exist, apply normal pairwise restrictions
|
| 1207 |
"""
|
| 1208 |
-
print(f"\033[93m[DEBUG] apply_scoring_restrictions called - criterion {index}: input score_a={score_a}, score_b={score_b}\033[0m")
|
| 1209 |
|
| 1210 |
# Extract the pairwise choice for the current criterion (needed for restrictions)
|
| 1211 |
pairwise_choice = _extract_pairwise_choice(progress_state, index)
|
| 1212 |
|
| 1213 |
if pairwise_choice is not None:
|
| 1214 |
-
print(f"\033[93m[Auto-restrict] Found pairwise choice for criterion {index}: {pairwise_choice}\033[0m")
|
| 1215 |
else:
|
| 1216 |
-
print(f"\033[93m[Auto-restrict] No pairwise choice found for criterion {index}\033[0m")
|
| 1217 |
|
| 1218 |
# Case 1: Both scores exist - COMPARISON-FIRST logic
|
| 1219 |
if score_a is not None and score_b is not None:
|
| 1220 |
-
print(f"\033[93m[COMPARISON-FIRST] Both scores exist: A={score_a}, B={score_b} - checking for conflicts with pairwise choice\033[0m")
|
| 1221 |
|
| 1222 |
# Use unified conflict detection
|
| 1223 |
conflict_detected = _detect_scoring_conflict(pairwise_choice, score_a, score_b)
|
| 1224 |
|
| 1225 |
if conflict_detected:
|
| 1226 |
-
print(f"\033[93m[CONFLICT] Pairwise choice conflicts with scores: A={score_a}, B={score_b}\033[0m")
|
| 1227 |
-
print(f"\033[93m[COMPARISON-FIRST] Conflict detected - prioritizing pairwise comparison, clearing existing scores\033[0m")
|
| 1228 |
# Apply normal pairwise restrictions with cleared values to prioritize pairwise choice
|
| 1229 |
result_A, result_B = _apply_rating_restrictions(pairwise_choice, None, None, include_values=True)
|
| 1230 |
else:
|
| 1231 |
-
print(f"\033[93m[NO-CONFLICT] Existing scores are compatible with pairwise choice - locking both\033[0m")
|
| 1232 |
# Ensure scores are strings for Gradio compatibility
|
| 1233 |
score_a_str = str(score_a)
|
| 1234 |
score_b_str = str(score_b)
|
|
@@ -1237,7 +1236,7 @@ def apply_scoring_restrictions(progress_state, index, score_a, score_b):
|
|
| 1237 |
|
| 1238 |
# Case 2: Only score_a exists - lock A, restrict B based on A's value (B value stays None)
|
| 1239 |
elif score_a is not None and score_b is None:
|
| 1240 |
-
print(f"\033[93m[LOCK] Found existing score_a={score_a}, locking A and restricting B based on A (B value=None)\033[0m")
|
| 1241 |
score_a_str = str(score_a)
|
| 1242 |
result_A = gr.update(choices=[score_a_str], value=score_a_str)
|
| 1243 |
# Apply restrictions to B based on A's locked value and pairwise choice, but keep B value as None
|
|
@@ -1247,7 +1246,7 @@ def apply_scoring_restrictions(progress_state, index, score_a, score_b):
|
|
| 1247 |
|
| 1248 |
# Case 3: Only score_b exists - lock B, restrict A based on B's value (A value stays None)
|
| 1249 |
elif score_a is None and score_b is not None:
|
| 1250 |
-
print(f"\033[93m[LOCK] Found existing score_b={score_b}, locking B and restricting A based on B (A value=None)\033[0m")
|
| 1251 |
score_b_str = str(score_b)
|
| 1252 |
result_B = gr.update(choices=[score_b_str], value=score_b_str)
|
| 1253 |
# Apply restrictions to A based on B's locked value and pairwise choice, but keep A value as None
|
|
@@ -1257,11 +1256,11 @@ def apply_scoring_restrictions(progress_state, index, score_a, score_b):
|
|
| 1257 |
|
| 1258 |
# Case 4: No scores exist - apply normal pairwise restrictions with None values to ensure clearing
|
| 1259 |
else:
|
| 1260 |
-
print(f"\033[93m[CLEAR] No existing scores - applying normal restrictions with None values to ensure clearing\033[0m")
|
| 1261 |
# Apply restrictions using the shared utility function with explicit None values
|
| 1262 |
result_A, result_B = _apply_rating_restrictions(pairwise_choice, None, None, include_values=True)
|
| 1263 |
|
| 1264 |
-
print(f"\033[93m[DEBUG] apply_scoring_restrictions result - criterion {index}: returning gradio updates\033[0m")
|
| 1265 |
|
| 1266 |
return result_A, result_B
|
| 1267 |
|
|
@@ -1270,7 +1269,7 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
|
|
| 1270 |
"""
|
| 1271 |
Unified workflow manager that handles all state transitions and UI updates.
|
| 1272 |
"""
|
| 1273 |
-
print(f"\033[93mAdvance workflow called, previous mode: {progress_state.get('mode')}\033[0m")
|
| 1274 |
print(progress_state)
|
| 1275 |
default_pairwise_results = [gr.update(visible=False) for _ in range(len_criteria)]
|
| 1276 |
|
|
@@ -1320,15 +1319,15 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
|
|
| 1320 |
next_pairwise_pair = get_next_uncompleted_pair(progress_state)
|
| 1321 |
next_scoring_pair = get_next_unscored_pair(progress_state)
|
| 1322 |
|
| 1323 |
-
# 2. Determine workflow phase and set mode
|
| 1324 |
if next_pairwise_pair is not None:
|
| 1325 |
progress_state['mode'] = 'pairwise'
|
| 1326 |
next_pair = next_pairwise_pair
|
| 1327 |
-
print(f"\033[93mPairwise mode: next pair {next_pair}\033[0m")
|
| 1328 |
elif next_scoring_pair is not None:
|
| 1329 |
progress_state['mode'] = 'scoring'
|
| 1330 |
next_pair = next_scoring_pair
|
| 1331 |
-
print(f"\033[93mScoring mode: next pair {next_pair}\033[0m")
|
| 1332 |
else:
|
| 1333 |
progress_state['mode'] = 'completed'
|
| 1334 |
next_pair = None
|
|
@@ -1353,41 +1352,41 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
|
|
| 1353 |
'pairwise_results_for_display': default_pairwise_results
|
| 1354 |
}
|
| 1355 |
|
| 1356 |
-
# 4.
|
| 1357 |
if current_mode == 'scoring' and next_pair is not None:
|
| 1358 |
-
#
|
| 1359 |
current_model_A, current_model_B = next_pair
|
| 1360 |
|
| 1361 |
-
#
|
| 1362 |
relevant_scores_A = progress_state.get('pairwise_scores', {}).get(current_model_A)
|
| 1363 |
relevant_scores_B = progress_state.get('pairwise_scores', {}).get(current_model_B)
|
| 1364 |
|
| 1365 |
-
print(f"\033[93mRelevant scores for {next_pair}: A={relevant_scores_A}, B={relevant_scores_B}\033[0m")
|
| 1366 |
|
| 1367 |
# Debug check for None values
|
| 1368 |
if relevant_scores_A is None:
|
| 1369 |
-
print(f"\033[93m[DEBUG] relevant_scores_A is None for pair {next_pair}\033[0m")
|
| 1370 |
if relevant_scores_B is None:
|
| 1371 |
-
print(f"\033[93m[DEBUG] relevant_scores_B is None for pair {next_pair}\033[0m")
|
| 1372 |
|
| 1373 |
-
#
|
| 1374 |
ratings_A_updates = []
|
| 1375 |
ratings_B_updates = []
|
| 1376 |
|
| 1377 |
-
#
|
| 1378 |
for i in range(len_criteria):
|
| 1379 |
-
#
|
| 1380 |
score_A = relevant_scores_A[i] if relevant_scores_A and i < len(relevant_scores_A) else None
|
| 1381 |
-
#
|
| 1382 |
score_B = relevant_scores_B[i] if relevant_scores_B and i < len(relevant_scores_B) else None
|
| 1383 |
|
| 1384 |
-
print(f"\033[93m[DEBUG] Criterion {i}: Before apply_scoring_restrictions - score_A={score_A}, score_B={score_B}\033[0m")
|
| 1385 |
|
| 1386 |
-
#
|
| 1387 |
restricted_update_A, restricted_update_B = apply_scoring_restrictions(
|
| 1388 |
progress_state, i, score_A, score_B)
|
| 1389 |
|
| 1390 |
-
print(f"\033[93m[DEBUG] Criterion {i}: After apply_scoring_restrictions - restricted_update_A={restricted_update_A}, restricted_update_B={restricted_update_B}\033[0m")
|
| 1391 |
|
| 1392 |
ratings_A_updates.append(restricted_update_A)
|
| 1393 |
ratings_B_updates.append(restricted_update_B)
|
|
@@ -1395,10 +1394,10 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
|
|
| 1395 |
ui_updates['ratings_A'] = ratings_A_updates
|
| 1396 |
ui_updates['ratings_B'] = ratings_B_updates
|
| 1397 |
|
| 1398 |
-
print(f"\033[93mLoaded relevant scores for pair {next_pair}: A={relevant_scores_A}, B={relevant_scores_B}\033[0m")
|
| 1399 |
-
print(f"\033[93mApplied automatic scoring restrictions based on pairwise choices\033[0m")
|
| 1400 |
else:
|
| 1401 |
-
#
|
| 1402 |
ui_updates['ratings_A'] = [gr.update(value=None) for _ in range(len_criteria)]
|
| 1403 |
ui_updates['ratings_B'] = [gr.update(value=None) for _ in range(len_criteria)]
|
| 1404 |
|
|
@@ -1406,17 +1405,16 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
|
|
| 1406 |
if next_pair is not None:
|
| 1407 |
print("debug: Extracting UI content for next pair")
|
| 1408 |
print("progress_state:", progress_state)
|
| 1409 |
-
# print("data_subset_state:", data_subset_state)
|
| 1410 |
print("next_pair:", next_pair)
|
| 1411 |
content_updates = extract_ui_content_by_mode(progress_state, data_subset_state, next_pair)
|
| 1412 |
ui_updates.update(content_updates)
|
| 1413 |
|
| 1414 |
-
#
|
| 1415 |
-
#
|
| 1416 |
if 'pairwise_results_for_display' in content_updates:
|
| 1417 |
pairwise_results = content_updates['pairwise_results_for_display']
|
| 1418 |
if pairwise_results is None or len(pairwise_results) != len_criteria:
|
| 1419 |
-
print(f"\033[93mWarning: pairwise_results_for_display has incorrect length {len(pairwise_results) if pairwise_results else 'None'}, expected {len_criteria}\033[0m")
|
| 1420 |
ui_updates['pairwise_results_for_display'] = default_pairwise_results
|
| 1421 |
|
| 1422 |
# 6. Ensure all required arrays have correct lengths before returning
|
|
@@ -1428,8 +1426,8 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
|
|
| 1428 |
|
| 1429 |
# Validate pairwise_results_for_display always has the right length
|
| 1430 |
if len(ui_updates.get('pairwise_results_for_display', [])) != len_criteria:
|
| 1431 |
-
print(f"\033[93m[ADVANCE_WORKFLOW] WARNING: Overriding pairwise_results_for_display due to length mismatch!\033[0m")
|
| 1432 |
-
print(f"\033[93mOriginal length: {len(ui_updates.get('pairwise_results_for_display', []))}, Expected: {len_criteria}\033[0m")
|
| 1433 |
ui_updates['pairwise_results_for_display'] = [gr.update(visible=False) for _ in range(len_criteria)]
|
| 1434 |
if 'pairwise_results_for_display' in ui_updates:
|
| 1435 |
if 'ui_buffer' not in progress_state:
|
|
@@ -1442,9 +1440,9 @@ def advance_workflow(progress_state, data_subset_state, current_pairwise=None, c
|
|
| 1442 |
|
| 1443 |
def submit_pairwise_comparison(progress_state, data_subset_state, user_info, *pairwise_comparisons):
|
| 1444 |
"""Process pairwise comparison submission and transition to scoring mode."""
|
| 1445 |
-
print(f"\033[93m=== SUBMIT PAIRWISE COMPARISON DEBUG ===\033[0m")
|
| 1446 |
-
print(f"\033[93mInput progress_state: {progress_state}\033[0m")
|
| 1447 |
-
print(f"\033[93mPairwise comparisons: {pairwise_comparisons}\033[0m")
|
| 1448 |
|
| 1449 |
# Process input parameters
|
| 1450 |
criteria_count = len_criteria
|
|
@@ -1497,7 +1495,7 @@ def submit_pairwise_comparison(progress_state, data_subset_state, user_info, *pa
|
|
| 1497 |
# Check individual elements if it's a list
|
| 1498 |
if isinstance(pairwise_results_for_display, list):
|
| 1499 |
for i, item in enumerate(pairwise_results_for_display):
|
| 1500 |
-
print(f"\033[93mElement {i}: {type(item)} - {item}\033[0m")
|
| 1501 |
|
| 1502 |
return [
|
| 1503 |
ui_updates.get('page1_visible'),
|
|
@@ -1520,9 +1518,9 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
|
|
| 1520 |
Submit scoring results and proceed to the next step.
|
| 1521 |
Simplified to use unified workflow management.
|
| 1522 |
"""
|
| 1523 |
-
print(f"\033[93m=== SUBMIT PAIRWISE SCORING DEBUG ===\033[0m")
|
| 1524 |
-
print(f"\033[93mInput progress_state: {progress_state}\033[0m")
|
| 1525 |
-
print(f"\033[93mRatings: {ratings}\033[0m")
|
| 1526 |
|
| 1527 |
# Save current ratings - now store by method instead of by pair
|
| 1528 |
pair = progress_state['all_pairs'][progress_state['current_score_pair_index']]
|
|
@@ -1541,7 +1539,6 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
|
|
| 1541 |
# Determine modal visibility based on completion status
|
| 1542 |
all_scoring_done = (len(progress_state['scoring_done_pairs']) ==
|
| 1543 |
len(progress_state['all_pairs']))
|
| 1544 |
-
# next_question_modal_visibility = gr.update(visible=all_scoring_done)
|
| 1545 |
|
| 1546 |
return [
|
| 1547 |
ui_updates.get('page1_visible'), # 5
|
|
@@ -1575,10 +1572,10 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
|
|
| 1575 |
|
| 1576 |
progress_state['scoring_done_pairs'].add(pair)
|
| 1577 |
|
| 1578 |
-
print(f"\033[93m[SCORE_SAVE] Saved scores for {model_A}: {ratings_A}\033[0m")
|
| 1579 |
-
print(f"\033[93m[SCORE_SAVE] Saved scores for {model_B}: {ratings_B}\033[0m")
|
| 1580 |
|
| 1581 |
-
#
|
| 1582 |
# Extract pairwise comparison results for this pair
|
| 1583 |
pairwise_results = progress_state['pairwise_results'].get(pair, [None] * len_criteria)
|
| 1584 |
comparison_reasons = [None] * len_criteria # We don't have reasons for scoring page
|
|
@@ -1602,7 +1599,6 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
|
|
| 1602 |
# Determine modal visibility based on completion status
|
| 1603 |
all_scoring_done = (len(progress_state['scoring_done_pairs']) ==
|
| 1604 |
len(progress_state['all_pairs']))
|
| 1605 |
-
# next_question_modal_visibility = gr.update(visible=all_scoring_done)
|
| 1606 |
|
| 1607 |
|
| 1608 |
if not all_scoring_done:
|
|
@@ -1634,7 +1630,7 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
|
|
| 1634 |
user_info, our_methods
|
| 1635 |
)
|
| 1636 |
|
| 1637 |
-
if remaining_count == 0: #
|
| 1638 |
gr.Info("You have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!")
|
| 1639 |
return [
|
| 1640 |
ui_updates.get('page1_visible'), # 5
|
|
@@ -1656,11 +1652,10 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
|
|
| 1656 |
*ui_updates.get('ratings_A'),
|
| 1657 |
*ui_updates.get('ratings_B'),
|
| 1658 |
*ui_updates.get('pairwise_results_for_display'), # 26-33
|
| 1659 |
-
# next_question_modal_visibility # 34
|
| 1660 |
]
|
| 1661 |
# Use advance_workflow to get all UI updates
|
| 1662 |
ui_updates = advance_workflow(progress_state, data_subset_state)
|
| 1663 |
-
print(f"\033[93mIn submit_pairwise_scoring, using advance_workflow results: mode={progress_state.get('mode')}\033[0m")
|
| 1664 |
num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
|
| 1665 |
gr.Info(f"You are about to evaluate the next question. You have {num_remaining_questions} question(s) remaining to evaluate.") # eval_progress_text
|
| 1666 |
return (
|
|
@@ -1686,37 +1681,6 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *ratin
|
|
| 1686 |
# next_question_modal_visibility
|
| 1687 |
)
|
| 1688 |
|
| 1689 |
-
|
| 1690 |
-
# def proceed_to_next_question(user_info):
|
| 1691 |
-
# """
|
| 1692 |
-
# Refactored to reuse code from go_to_eval_progress_modal by implementing it using advance_workflow.
|
| 1693 |
-
# This eliminates code duplication and ensures consistent UI behavior.
|
| 1694 |
-
# """
|
| 1695 |
-
# # Fetch next question state
|
| 1696 |
-
# user_info_new, chat_a, chat_b, page1_prompt, data_subset_state, remaining_count, progress_state = get_next_eval_question(
|
| 1697 |
-
# user_info['name'], user_info['email'], user_info['specialty'], user_info['subspecialty'],
|
| 1698 |
-
# user_info['years_exp'], user_info['exp_explanation'], user_info['npi_id'], user_info['evaluator_id'], our_methods
|
| 1699 |
-
# )
|
| 1700 |
-
|
| 1701 |
-
# # Use advance_workflow to get all UI updates (same pattern as go_to_eval_progress_modal)
|
| 1702 |
-
# ui_updates = advance_workflow(progress_state, data_subset_state)
|
| 1703 |
-
|
| 1704 |
-
# print(f"\033[93mIn proceed_to_next_question, using advance_workflow results: mode={progress_state.get('mode')}\033[0m")
|
| 1705 |
-
|
| 1706 |
-
# # Return exactly the elements bound in next_question_btn.click:
|
| 1707 |
-
# return [
|
| 1708 |
-
# user_info_new,
|
| 1709 |
-
# ui_updates.get('chat_a_page1', chat_a), # Use content appropriate for the current mode
|
| 1710 |
-
# ui_updates.get('chat_b_page1', chat_b), # Use content appropriate for the current mode
|
| 1711 |
-
# ui_updates.get('page1_prompt', page1_prompt), # Use the prompt appropriate for the current mode
|
| 1712 |
-
# page1_reference_answer,
|
| 1713 |
-
# data_subset_state, # data_subset_state slot
|
| 1714 |
-
# ui_updates.get('progress_state', progress_state), # progress_state slot
|
| 1715 |
-
# ui_updates.get('page1_visible', gr.update(visible=True)), # page1 visibility based on mode
|
| 1716 |
-
# ui_updates.get('page2_visible', gr.update(visible=False)), # page2 visibility based on mode
|
| 1717 |
-
# gr.update(visible=False) # next_question_modal hidden
|
| 1718 |
-
# ]
|
| 1719 |
-
|
| 1720 |
# --- Define Callback Functions for Confirmation Flow ---
|
| 1721 |
def build_row_dict(
|
| 1722 |
data_subset_state,
|
|
@@ -1812,18 +1776,18 @@ def restrict_choices(progress_state, index, score_a, score_b):
|
|
| 1812 |
Enforces rating constraints based on the pairwise choice for the given criterion index.
|
| 1813 |
"""
|
| 1814 |
print(
|
| 1815 |
-
f"\033[93mRestricting choices for index {index} with scores A: {score_a}, B: {score_b}\033[0m")
|
| 1816 |
print(
|
| 1817 |
-
f"\033[93mProgress state keys: {list(progress_state.keys()) if progress_state else 'None'}\033[0m")
|
| 1818 |
|
| 1819 |
# Extract the pairwise choice for the current criterion
|
| 1820 |
pairwise_choice = _extract_pairwise_choice(progress_state, index)
|
| 1821 |
|
| 1822 |
if pairwise_choice is not None:
|
| 1823 |
print(
|
| 1824 |
-
f"\033[93mFound pairwise choice for criterion {index}: {pairwise_choice}\033[0m")
|
| 1825 |
else:
|
| 1826 |
-
print(f"\033[93mNo pairwise results found for criterion {index}\033[0m")
|
| 1827 |
|
| 1828 |
# Skip if both scores are None
|
| 1829 |
if score_a is None and score_b is None:
|
|
@@ -1939,7 +1903,6 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 1939 |
img.save(buffer, format="PNG")
|
| 1940 |
encoded_string = base64.b64encode(
|
| 1941 |
buffer.getvalue()).decode("utf-8")
|
| 1942 |
-
# encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
|
| 1943 |
|
| 1944 |
image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
|
| 1945 |
ReasoningTraceExampleHTML = f"""
|
|
@@ -1951,9 +1914,9 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 1951 |
gr.Markdown("""By clicking 'Next' below, you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
|
| 1952 |
""")
|
| 1953 |
gr.Markdown("## Please enter your information to get a question to evaluate. Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.")
|
| 1954 |
-
name = gr.Textbox(label="Name (required)", value="
|
| 1955 |
email = gr.Textbox(
|
| 1956 |
-
label="Email (required). Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.", value="
|
| 1957 |
evaluator_id = gr.Textbox(
|
| 1958 |
label="Evaluator ID (auto-filled from email above)", interactive=False, visible=False)
|
| 1959 |
|
|
@@ -1992,8 +1955,6 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 1992 |
|
| 1993 |
with Modal(visible=False, elem_id="confirm_modal") as eval_progress_modal:
|
| 1994 |
eval_progress_text = gr.Markdown("You have X questions remaining.")
|
| 1995 |
-
# eval_progress_proceed_btn = gr.Button(
|
| 1996 |
-
# "OK, proceed to question evaluation")
|
| 1997 |
|
| 1998 |
# Page 1: Pairwise Comparison.
|
| 1999 |
with gr.Column(visible=False) as page1:
|
|
@@ -2218,22 +2179,6 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 2218 |
# Clicking OK hides the modal.
|
| 2219 |
ok_btn.click(lambda: gr.update(visible=False), None, error_modal)
|
| 2220 |
|
| 2221 |
-
# Confirmation Modal: Ask for final submission confirmation.
|
| 2222 |
-
# with Modal("Confirm Submission", visible=False, elem_id="confirm_modal") as confirm_modal:
|
| 2223 |
-
# gr.Markdown(
|
| 2224 |
-
# "Are you sure you want to submit? Once submitted, you cannot edit your responses.")
|
| 2225 |
-
# with gr.Row():
|
| 2226 |
-
# yes_btn = gr.Button("Yes, please submit")
|
| 2227 |
-
# cancel_btn = gr.Button("Cancel")
|
| 2228 |
-
|
| 2229 |
-
# Add modal for proceeding to next question
|
| 2230 |
-
# with Modal("Next Question", visible=False, elem_id="next_question_modal") as next_question_modal:
|
| 2231 |
-
# gr.Markdown(
|
| 2232 |
-
# "You have completed this question. Click below to proceed to the next question.")
|
| 2233 |
-
# next_question_btn = gr.Button("Next Question")
|
| 2234 |
-
|
| 2235 |
-
|
| 2236 |
-
|
| 2237 |
# --- Define Transitions Between Pages ---
|
| 2238 |
|
| 2239 |
# For the "Participate in Evaluation" button, transition to page0
|
|
@@ -2267,13 +2212,6 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 2267 |
home_btn_2.click(lambda: (gr.update(visible=True), gr.update(
|
| 2268 |
visible=False)), None, [page_minus1, page2], scroll_to_output=True)
|
| 2269 |
|
| 2270 |
-
# Transition from Page 1 to Page 0 (Back button).
|
| 2271 |
-
# back_btn_0.click(
|
| 2272 |
-
# fn=lambda: (gr.update(visible=True), gr.update(visible=False)),
|
| 2273 |
-
# inputs=None,
|
| 2274 |
-
# outputs=[page0, page1]
|
| 2275 |
-
# )
|
| 2276 |
-
|
| 2277 |
# Transition from Page 1 (Pairwise) to the combined Rating Page (Page 2).
|
| 2278 |
next_btn_1.click(
|
| 2279 |
fn=submit_pairwise_comparison,
|
|
@@ -2291,27 +2229,6 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
| 2291 |
scroll_to_output=True,
|
| 2292 |
)
|
| 2293 |
|
| 2294 |
-
# Transition from Rating Page (Page 2) back to Pairwise page.
|
| 2295 |
-
# back_btn_2.click(
|
| 2296 |
-
# fn=lambda: (gr.update(visible=True), gr.update(visible=False)),
|
| 2297 |
-
# inputs=None,
|
| 2298 |
-
# outputs=[page1, page2],
|
| 2299 |
-
# scroll_to_output=True
|
| 2300 |
-
# )
|
| 2301 |
-
|
| 2302 |
-
# Wire up the modal button to proceed_to_next_question and reset all UI for the new question
|
| 2303 |
-
# next_question_btn.click(
|
| 2304 |
-
# fn=proceed_to_next_question,
|
| 2305 |
-
# inputs=[user_info_state],
|
| 2306 |
-
# outputs=[
|
| 2307 |
-
# user_info_state,
|
| 2308 |
-
# chat_a_page1, chat_b_page1, page1_prompt, page1_reference_answer,
|
| 2309 |
-
# data_subset_state, progress_state,
|
| 2310 |
-
# page1, page2, next_question_modal
|
| 2311 |
-
# ],
|
| 2312 |
-
# scroll_to_output=True
|
| 2313 |
-
# )
|
| 2314 |
-
|
| 2315 |
submit_btn.click(
|
| 2316 |
fn=submit_pairwise_scoring,
|
| 2317 |
inputs=[progress_state,
|
|
|
|
| 15 |
EVALUATOR_MAP_DICT = "evaluator_map_dict.json"
|
| 16 |
TXAGENT_RESULTS_SHEET_BASE_NAME = "TxAgent_Human_Eval_Results_CROWDSOURCED"
|
| 17 |
our_methods = ['TxAgent-T1-Llama-3.1-8B', 'Q3-8B-qlora-biov13_merged']
|
| 18 |
+
# Load tool lists from 'tool_lists' subdirectory
|
|
|
|
| 19 |
tools_dir = os.path.join(os.getcwd(), 'tool_lists')
|
| 20 |
|
| 21 |
# Initialize an empty dictionary to store the results
|
|
|
|
| 38 |
print(f"Error processing {filename}: {e}")
|
| 39 |
results[key] = [f"Error loading {filename}"]
|
| 40 |
|
| 41 |
+
# Tool database labels for different tool calls in format_chat
|
| 42 |
tool_database_labels_raw = {
|
| 43 |
"chembl_tools": "**from the ChEMBL database**",
|
| 44 |
"efo_tools": "**from the Experimental Factor Ontology**",
|
|
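The error branch restored at lines 38-39 above implies a simple loading loop over `tool_lists`. A minimal sketch follows; the one-JSON-file-per-tool-list layout and the `key` derivation are assumptions, since only the error path appears verbatim in the diff.

```python
import json
import os

results = {}
tools_dir = os.path.join(os.getcwd(), 'tool_lists')
for filename in os.listdir(tools_dir):
    key = os.path.splitext(filename)[0]  # assumed: one tool list per JSON file
    try:
        with open(os.path.join(tools_dir, filename)) as fh:
            results[key] = json.load(fh)
    except Exception as e:
        print(f"Error processing {filename}: {e}")
        results[key] = [f"Error loading {filename}"]
```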
|
|
| 294 |
local_path = hf_hub_download(
|
| 295 |
repo_id=REPO_ID,
|
| 296 |
repo_type="dataset",
|
| 297 |
+
# Fetches the most recent version of the dataset on every call
|
| 298 |
revision="main",
|
| 299 |
filename=remote_path,
|
|
|
|
| 300 |
token=os.getenv("HF_TOKEN")
|
| 301 |
)
|
| 302 |
with open(local_path, "r") as f:
|
|
|
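The new comment above notes that `revision="main"` re-fetches the latest dataset state on every call. A hedged standalone variant is below; the filename is illustrative (app.py passes `remote_path`), and pinning a tag or commit SHA in place of `"main"` makes runs reproducible.

```python
import os
from huggingface_hub import hf_hub_download

local_path = hf_hub_download(
    repo_id="agenticx/TxAgentEvalData",
    repo_type="dataset",
    revision="main",  # swap in a pinned tag or commit SHA for reproducible runs
    filename="evaluator_map_dict.json",  # illustrative; app.py passes remote_path
    token=os.getenv("HF_TOKEN"),
)
```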
|
| 314 |
if not evaluator_question_ids:
|
| 315 |
return [], data_by_filename
|
| 316 |
|
| 317 |
+
# Check if evaluator has already completed any questions
|
| 318 |
+
# Must go through every tuple of (question_ID, TxAgent, other model)
|
| 319 |
model_names = [key for key in data_by_filename.keys()
|
| 320 |
if key not in our_methods]
|
| 321 |
full_question_ids_list = []
|
|
|
|
| 329 |
results_df = read_sheet_to_df(custom_sheet_name=str(
|
| 330 |
TXAGENT_RESULTS_SHEET_BASE_NAME + f"_{str(evaluator_id)}"))
|
| 331 |
if results_df is not None and not results_df.empty:
|
| 332 |
+
# Only consider records where both "Pairwise comparison" and "scoring" fields are filled
|
| 333 |
comparison_cols = [
|
| 334 |
f"Criterion_{c['label']} Comparison: Which is Better?"
|
| 335 |
for c in criteria_for_comparison
|
|
|
|
| 340 |
for _, row in results_df.iterrows():
|
| 341 |
q = row.get("Question ID")
|
| 342 |
a, b = row.get("ResponseA_Model"), row.get("ResponseB_Model")
|
| 343 |
+
# Ensure our_methods comes first
|
| 344 |
if a in our_methods and b not in our_methods:
|
| 345 |
pair = (q, a, b)
|
| 346 |
elif b in our_methods and a not in our_methods:
|
|
|
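The branch above normalizes each (question, model, model) tuple so the in-house method always sits in the first slot. The `elif` body is elided between hunks, but by symmetry it presumably swaps the order; a sketch under that assumption:

```python
def ordered_pair(q, a, b, our_methods):
    # Put our method first; skip rows that don't pit ours against a baseline.
    if a in our_methods and b not in our_methods:
        return (q, a, b)
    if b in our_methods and a not in our_methods:
        return (q, b, a)  # assumed swap; the elif body is not shown in the hunk
    return None
```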
|
| 348 |
else:
|
| 349 |
continue
|
| 350 |
complete = True
|
| 351 |
+
# Check all pairwise comparison columns
|
| 352 |
for col in comparison_cols:
|
| 353 |
if not row.get(col):
|
| 354 |
complete = False
|
| 355 |
break
|
| 356 |
+
# If pairwise is complete, check all scoring columns
|
| 357 |
if complete:
|
| 358 |
for col in scoreA_cols + scoreB_cols:
|
| 359 |
if not row.get(col):
|
|
|
|
| 361 |
break
|
| 362 |
if complete:
|
| 363 |
matched_pairs.add(pair)
|
| 364 |
+
# Only filter out truly completed pairs; incomplete ones (with missing values) are retained
|
| 365 |
full_question_ids_list = [
|
| 366 |
t for t in full_question_ids_list if t not in matched_pairs
|
| 367 |
]
|
|
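Taken together, the two loops above mean a pair is filtered out only when every comparison column and every score column is non-empty. A condensed restatement (hypothetical helper, not part of app.py):

```python
def row_complete(row, comparison_cols, scoreA_cols, scoreB_cols):
    # Done only if all pairwise comparisons and all scores are filled in.
    return all(row.get(col) for col in comparison_cols + scoreA_cols + scoreB_cols)
```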
|
|
| 441 |
# Validate required fields
|
| 442 |
validation_error = validate_required_fields(
|
| 443 |
name, email, evaluator_id, specialty_dd, years_exp_radio)
|
| 444 |
+
print(f"In go_to_eval_progress_modal, validation_error={validation_error}")
|
| 445 |
if validation_error:
|
| 446 |
return (
|
| 447 |
gr.update(visible=True), # page0
|
|
|
|
| 510 |
# Use advance_workflow to get all UI updates
|
| 511 |
ui_updates = advance_workflow(progress_state, data_subset_state)
|
| 512 |
|
| 513 |
+
print(f"In go_to_eval_progress_modal, using advance_workflow results: mode={progress_state.get('mode')}")
|
| 514 |
|
| 515 |
# Map advance_workflow outputs to the required return format
|
| 516 |
num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
|
|
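Since `remaining_count` counts unfinished (question, ours, baseline) tuples while `all_pairs` holds the pairs for a single question, integer division recovers the number of whole questions left. A toy check with made-up values:

```python
remaining_count = 6        # six unfinished (question, ours, baseline) tuples
pairs_per_question = 3     # e.g. len(progress_state['all_pairs'])
assert remaining_count // pairs_per_question == 2  # two questions remain
```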
|
|
| 572 |
completed the pairwise comparison phase.
|
| 573 |
"""
|
| 574 |
for idx, pair in enumerate(progress_state['all_pairs']):
|
| 575 |
+
# Ensure the pair has completed the pairwise comparison phase and hasn't been scored yet
|
| 576 |
if pair in progress_state.get('pairwise_done', set()) and pair not in progress_state['scoring_done_pairs']:
|
| 577 |
progress_state['current_score_pair_index'] = idx
|
| 578 |
return pair
|
|
|
|
| 590 |
if df is None or df.empty:
|
| 591 |
return None
|
| 592 |
|
| 593 |
+
# Only keep rows for current question_id
|
| 594 |
df_q = df[df["Question ID"] == question_id]
|
| 595 |
if df_q.empty:
|
| 596 |
return None
|
|
|
|
| 600 |
scoring_done_pairs = set()
|
| 601 |
pairwise_scores = {}
|
| 602 |
|
| 603 |
+
# Iterate through each record to extract model pairs, comparison results and scores
|
| 604 |
for _, row in df_q.iterrows():
|
| 605 |
a, b = row["ResponseA_Model"], row["ResponseB_Model"]
|
| 606 |
pair = (a, b)
|
|
|
|
| 615 |
comps.append(mapped_value)
|
| 616 |
pairwise_results[pair] = comps
|
| 617 |
|
| 618 |
+
# Collect scores if scoring columns exist
|
| 619 |
first_score = f"ScoreA_{criteria[0]['label']}"
|
| 620 |
if first_score in row and row[first_score] not in (None, ""):
|
| 621 |
+
# Store scores by method instead of by pair
|
| 622 |
scores_A = [row.get(f"ScoreA_{c['label']}") for c in criteria]
|
| 623 |
scores_B = [row.get(f"ScoreB_{c['label']}") for c in criteria]
|
| 624 |
scoring_done_pairs.add(pair)
|
|
|
|
| 627 |
pairwise_scores[a] = scores_A
|
| 628 |
pairwise_scores[b] = scores_B
|
| 629 |
|
| 630 |
+
# Intelligently set mode based on existing data
|
| 631 |
+
# 1. If there are completed pairwise comparisons without corresponding scores, enter scoring mode
|
| 632 |
+
# 2. If both pairwise comparisons and scores are complete, let advance_workflow determine whether incomplete pairs remain
|
| 633 |
+
# 3. If no pairwise comparisons are complete, stay in pairwise comparison mode
|
| 634 |
|
| 635 |
+
determined_mode = "pairwise" # Default mode
|
| 636 |
|
| 637 |
if pairwise_done:
|
| 638 |
+
# Some pairwise comparisons are complete
|
| 639 |
+
# Check for pairs that have been compared but not yet scored
|
| 640 |
unscored_pairs = pairwise_done - scoring_done_pairs
|
| 641 |
if unscored_pairs:
|
| 642 |
+
# Compared-but-unscored pairs exist, so enter scoring mode
|
| 643 |
determined_mode = "scoring"
|
| 644 |
+
print(f"load_progress_state: Found {len(unscored_pairs)} unscored pairs, setting mode to 'scoring'")
|
| 645 |
else:
|
| 646 |
+
# All compared pairs are scored; let advance_workflow decide the next step
|
| 647 |
+
determined_mode = "pairwise" # May still have unpaired ones
|
| 648 |
+
print(f"load_progress_state: All pairwise comparisons are scored, setting mode to 'pairwise' (will be corrected by advance_workflow)")
|
| 649 |
else:
|
| 650 |
+
# No pairwise comparisons are complete, so this must be pairwise mode
|
| 651 |
determined_mode = "pairwise"
|
| 652 |
+
print(f"load_progress_state: No completed pairwise comparisons, setting mode to 'pairwise'")
|
| 653 |
|
| 654 |
+
# Construct complete progress_state (all_pairs, all_models will be overwritten later)
|
| 655 |
progress_state = {
|
| 656 |
"current_question_index": 0,
|
| 657 |
"current_pair_index": 0,
|
|
|
|
| 660 |
"pairwise_results": pairwise_results,
|
| 661 |
"scoring_done_pairs": scoring_done_pairs,
|
| 662 |
"pairwise_scores": pairwise_scores,
|
| 663 |
+
"all_pairs": [], # Reset later based on models_full
|
| 664 |
+
"all_models": [], # Reset later based on models_full
|
| 665 |
"evaluator_id": evaluator_id,
|
| 666 |
+
"mode": determined_mode, # Intelligently set mode
|
| 667 |
}
|
| 668 |
+
print(progress_state)
|
| 669 |
return progress_state
|
| 670 |
|
| 671 |
|
|
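The branching above reduces to one rule: any compared-but-unscored pair forces scoring mode; otherwise default to pairwise and let advance_workflow correct it. A standalone sketch (names are illustrative, not part of app.py):

```python
def determine_mode(pairwise_done: set, scoring_done_pairs: set) -> str:
    if pairwise_done and (pairwise_done - scoring_done_pairs):
        return "scoring"   # compared pairs still awaiting scores
    return "pairwise"      # nothing compared yet, or advance_workflow decides

assert determine_mode({("A", "B")}, set()) == "scoring"
assert determine_mode(set(), set()) == "pairwise"
```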
|
|
| 925 |
|
| 926 |
# Create enhanced pairwise results display with conflict warnings
|
| 927 |
pairwise_results_for_display = []
|
| 928 |
+
for i in range(len_criteria): # Use len_criteria to generate the correct number of display elements
|
| 929 |
+
# Get corresponding comparison result
|
| 930 |
choice = pairwise_results[i] if i < len(pairwise_results) else None
|
| 931 |
|
| 932 |
+
# Normalize the choice; accept both mapping keys and canonical values
|
| 933 |
normalized_choice = choice
|
| 934 |
if choice in mapping:
|
| 935 |
normalized_choice = mapping[choice]
|
|
|
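The normalization above accepts either a raw UI label or an already-canonical value. A minimal sketch; the actual contents of `mapping` are not shown in the diff, so the entries below are placeholders:

```python
mapping = {"Response A": "A", "Response B": "B", "Tie": "Tie"}  # placeholder entries

def normalize_choice(choice):
    return mapping.get(choice, choice)  # pass through values already canonical
```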
|
| 986 |
pairwise_results_for_display.append(gr.Markdown(display_html, visible=True))
|
| 987 |
|
| 988 |
|
| 989 |
+
# Ensure correct length
|
| 990 |
assert len(pairwise_results_for_display) == len_criteria, f"Expected {len_criteria}, got {len(pairwise_results_for_display)}"
|
| 991 |
|
| 992 |
# Create enhanced chatbot components for scoring page
|
|
|
|
| 1029 |
'chat_a_page2': None, # Scoring content (unused in pairwise)
|
| 1030 |
'chat_b_page2': None, # Scoring content (unused in pairwise)
|
| 1031 |
'page2_prompt': None, # Scoring prompt (unused in pairwise)
|
| 1032 |
+
'pairwise_results_for_display': [gr.update(visible=False) for _ in range(len_criteria)] # Ensure correct length
|
| 1033 |
}
|
| 1034 |
|
| 1035 |
|
|
|
|
| 1204 |
2. If one score exists, lock it and apply restrictions to the other based on the locked score
|
| 1205 |
3. If no scores exist, apply normal pairwise restrictions
|
| 1206 |
"""
|
| 1207 |
+
print(f"[DEBUG] apply_scoring_restrictions called - criterion {index}: input score_a={score_a}, score_b={score_b}")
|
| 1208 |
|
| 1209 |
# Extract the pairwise choice for the current criterion (needed for restrictions)
|
| 1210 |
pairwise_choice = _extract_pairwise_choice(progress_state, index)
|
| 1211 |
|
| 1212 |
if pairwise_choice is not None:
|
| 1213 |
+
print(f"[Auto-restrict] Found pairwise choice for criterion {index}: {pairwise_choice}")
|
| 1214 |
else:
|
| 1215 |
+
print(f"[Auto-restrict] No pairwise choice found for criterion {index}")
|
| 1216 |
|
| 1217 |
# Case 1: Both scores exist - COMPARISON-FIRST logic
|
| 1218 |
if score_a is not None and score_b is not None:
|
| 1219 |
+
print(f"[COMPARISON-FIRST] Both scores exist: A={score_a}, B={score_b} - checking for conflicts with pairwise choice")
|
| 1220 |
|
| 1221 |
# Use unified conflict detection
|
| 1222 |
conflict_detected = _detect_scoring_conflict(pairwise_choice, score_a, score_b)
|
| 1223 |
|
| 1224 |
if conflict_detected:
|
| 1225 |
+
print(f"[CONFLICT] Pairwise choice conflicts with scores: A={score_a}, B={score_b}")
|
| 1226 |
+
print(f"[COMPARISON-FIRST] Conflict detected - prioritizing pairwise comparison, clearing existing scores")
|
| 1227 |
# Apply normal pairwise restrictions with cleared values to prioritize pairwise choice
|
| 1228 |
result_A, result_B = _apply_rating_restrictions(pairwise_choice, None, None, include_values=True)
|
| 1229 |
else:
|
| 1230 |
+
print(f"[NO-CONFLICT] Existing scores are compatible with pairwise choice - locking both")
|
| 1231 |
# Ensure scores are strings for Gradio compatibility
|
| 1232 |
score_a_str = str(score_a)
|
| 1233 |
score_b_str = str(score_b)
|
|
|
|
| 1236 |
|
| 1237 |
# Case 2: Only score_a exists - lock A, restrict B based on A's value (B value stays None)
|
| 1238 |
elif score_a is not None and score_b is None:
|
| 1239 |
+
print(f"[LOCK] Found existing score_a={score_a}, locking A and restricting B based on A (B value=None)")
|
| 1240 |
score_a_str = str(score_a)
|
| 1241 |
result_A = gr.update(choices=[score_a_str], value=score_a_str)
|
| 1242 |
# Apply restrictions to B based on A's locked value and pairwise choice, but keep B value as None
|
|
|
|
| 1246 |
|
| 1247 |
# Case 3: Only score_b exists - lock B, restrict A based on B's value (A value stays None)
|
| 1248 |
elif score_a is None and score_b is not None:
|
| 1249 |
+
print(f"[LOCK] Found existing score_b={score_b}, locking B and restricting A based on B (A value=None)")
|
| 1250 |
score_b_str = str(score_b)
|
| 1251 |
result_B = gr.update(choices=[score_b_str], value=score_b_str)
|
| 1252 |
# Apply restrictions to A based on B's locked value and pairwise choice, but keep A value as None
|
|
|
|
| 1256 |
|
| 1257 |
# Case 4: No scores exist - apply normal pairwise restrictions with None values to ensure clearing
|
| 1258 |
else:
|
| 1259 |
+
print(f"[CLEAR] No existing scores - applying normal restrictions with None values to ensure clearing")
|
| 1260 |
# Apply restrictions using the shared utility function with explicit None values
|
| 1261 |
result_A, result_B = _apply_rating_restrictions(pairwise_choice, None, None, include_values=True)
|
| 1262 |
|
| 1263 |
+
print(f"[DEBUG] apply_scoring_restrictions result - criterion {index}: returning gradio updates")
|
| 1264 |
|
| 1265 |
return result_A, result_B
|
| 1266 |
|
|
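The four branches above can be summarized as a dispatch on which scores already exist. This sketch only names the cases; it omits the real `gr.update` plumbing and stubs the conflict flag, since `_detect_scoring_conflict` is not shown in the diff:

```python
def restriction_case(score_a, score_b, conflict: bool) -> str:
    if score_a is not None and score_b is not None:
        return "clear-both" if conflict else "lock-both"  # comparison-first logic
    if score_a is not None:
        return "lock-a-restrict-b"
    if score_b is not None:
        return "lock-b-restrict-a"
    return "pairwise-restrictions-only"
```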
|
|
| 1269 |
"""
|
| 1270 |
Unified workflow manager that handles all state transitions and UI updates.
|
| 1271 |
"""
|
| 1272 |
+
print(f"Advance workflow called, previous mode: {progress_state.get('mode')}")
|
| 1273 |
print(progress_state)
|
| 1274 |
default_pairwise_results = [gr.update(visible=False) for _ in range(len_criteria)]
|
| 1275 |
|
|
|
|
| 1319 |
next_pairwise_pair = get_next_uncompleted_pair(progress_state)
|
| 1320 |
next_scoring_pair = get_next_unscored_pair(progress_state)
|
| 1321 |
|
| 1322 |
+
# 2. Determine workflow phase and set mode
|
| 1323 |
if next_pairwise_pair is not None:
|
| 1324 |
progress_state['mode'] = 'pairwise'
|
| 1325 |
next_pair = next_pairwise_pair
|
| 1326 |
+
print(f"Pairwise mode: next pair {next_pair}")
|
| 1327 |
elif next_scoring_pair is not None:
|
| 1328 |
progress_state['mode'] = 'scoring'
|
| 1329 |
next_pair = next_scoring_pair
|
| 1330 |
+
print(f"Scoring mode: next pair {next_pair}")
|
| 1331 |
else:
|
| 1332 |
progress_state['mode'] = 'completed'
|
| 1333 |
next_pair = None
|
|
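The mode selection above encodes a strict precedence: drain pairwise comparisons first, then scoring, then mark the question completed. As a pure function (hypothetical name, for illustration only):

```python
def next_phase(next_pairwise_pair, next_scoring_pair):
    if next_pairwise_pair is not None:
        return "pairwise", next_pairwise_pair
    if next_scoring_pair is not None:
        return "scoring", next_scoring_pair
    return "completed", None
```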
|
|
| 1352 |
'pairwise_results_for_display': default_pairwise_results
|
| 1353 |
}
|
| 1354 |
|
| 1355 |
+
# 4. If in scoring mode, check historical scoring records
|
| 1356 |
if current_mode == 'scoring' and next_pair is not None:
|
| 1357 |
+
# Get historical scores for current model pair (if exists)
|
| 1358 |
current_model_A, current_model_B = next_pair
|
| 1359 |
|
| 1360 |
+
# Search for historical scores directly from method-keyed storage - simpler and more efficient!
|
| 1361 |
relevant_scores_A = progress_state.get('pairwise_scores', {}).get(current_model_A)
|
| 1362 |
relevant_scores_B = progress_state.get('pairwise_scores', {}).get(current_model_B)
|
| 1363 |
|
| 1364 |
+
print(f"Relevant scores for {next_pair}: A={relevant_scores_A}, B={relevant_scores_B}")
|
| 1365 |
|
| 1366 |
# Debug check for None values
|
| 1367 |
if relevant_scores_A is None:
|
| 1368 |
+
print(f"[DEBUG] relevant_scores_A is None for pair {next_pair}")
|
| 1369 |
if relevant_scores_B is None:
|
| 1370 |
+
print(f"[DEBUG] relevant_scores_B is None for pair {next_pair}")
|
| 1371 |
|
| 1372 |
+
# Set initial values for UI scoring controls
|
| 1373 |
ratings_A_updates = []
|
| 1374 |
ratings_B_updates = []
|
| 1375 |
|
| 1376 |
+
# Set initial values for A and B model scoring controls and apply scoring restrictions
|
| 1377 |
for i in range(len_criteria):
|
| 1378 |
+
# For model A: use the historical score if available, otherwise clear
|
| 1379 |
score_A = relevant_scores_A[i] if relevant_scores_A and i < len(relevant_scores_A) else None
|
| 1380 |
+
# For model B: use the historical score if available, otherwise clear
|
| 1381 |
score_B = relevant_scores_B[i] if relevant_scores_B and i < len(relevant_scores_B) else None
|
| 1382 |
|
| 1383 |
+
print(f"[DEBUG] Criterion {i}: Before apply_scoring_restrictions - score_A={score_A}, score_B={score_B}")
|
| 1384 |
|
| 1385 |
+
# Apply scoring restrictions based on pairwise comparison results
|
| 1386 |
restricted_update_A, restricted_update_B = apply_scoring_restrictions(
|
| 1387 |
progress_state, i, score_A, score_B)
|
| 1388 |
|
| 1389 |
+
print(f"[DEBUG] Criterion {i}: After apply_scoring_restrictions - restricted_update_A={restricted_update_A}, restricted_update_B={restricted_update_B}")
|
| 1390 |
|
| 1391 |
ratings_A_updates.append(restricted_update_A)
|
| 1392 |
ratings_B_updates.append(restricted_update_B)
|
|
|
|
| 1394 |
ui_updates['ratings_A'] = ratings_A_updates
|
| 1395 |
ui_updates['ratings_B'] = ratings_B_updates
|
| 1396 |
|
| 1397 |
+
print(f"Loaded relevant scores for pair {next_pair}: A={relevant_scores_A}, B={relevant_scores_B}")
|
| 1398 |
+
print(f"Applied automatic scoring restrictions based on pairwise choices")
|
| 1399 |
else:
|
| 1400 |
+
# Non-scoring mode: leave the ratings blank
|
| 1401 |
ui_updates['ratings_A'] = [gr.update(value=None) for _ in range(len_criteria)]
|
| 1402 |
ui_updates['ratings_B'] = [gr.update(value=None) for _ in range(len_criteria)]
|
| 1403 |
|
|
|
|
| 1405 |
if next_pair is not None:
|
| 1406 |
print("debug: Extracting UI content for next pair")
|
| 1407 |
print("progress_state:", progress_state)
|
|
|
|
| 1408 |
print("next_pair:", next_pair)
|
| 1409 |
content_updates = extract_ui_content_by_mode(progress_state, data_subset_state, next_pair)
|
| 1410 |
ui_updates.update(content_updates)
|
| 1411 |
|
| 1412 |
+
# If extract_ui_content_by_mode returned new pairwise_results_for_display,
|
| 1413 |
+
# ensure it has the correct length
|
| 1414 |
if 'pairwise_results_for_display' in content_updates:
|
| 1415 |
pairwise_results = content_updates['pairwise_results_for_display']
|
| 1416 |
if pairwise_results is None or len(pairwise_results) != len_criteria:
|
| 1417 |
+
print(f"Warning: pairwise_results_for_display has incorrect length {len(pairwise_results) if pairwise_results else 'None'}, expected {len_criteria}")
|
| 1418 |
ui_updates['pairwise_results_for_display'] = default_pairwise_results
|
| 1419 |
|
| 1420 |
# 6. Ensure all required arrays have correct lengths before returning
|
|
|
|
| 1426 |
|
| 1427 |
# Validate pairwise_results_for_display always has the right length
|
| 1428 |
if len(ui_updates.get('pairwise_results_for_display', [])) != len_criteria:
|
| 1429 |
+
print(f"[ADVANCE_WORKFLOW] WARNING: Overriding pairwise_results_for_display due to length mismatch!")
|
| 1430 |
+
print(f" Original length: {len(ui_updates.get('pairwise_results_for_display', []))}, Expected: {len_criteria}")
|
| 1431 |
ui_updates['pairwise_results_for_display'] = [gr.update(visible=False) for _ in range(len_criteria)]
|
| 1432 |
if 'pairwise_results_for_display' in ui_updates:
|
| 1433 |
if 'ui_buffer' not in progress_state:
|
|
|
|
| 1440 |
|
| 1441 |
def submit_pairwise_comparison(progress_state, data_subset_state, user_info, *pairwise_comparisons):
|
| 1442 |
"""Process pairwise comparison submission and transition to scoring mode."""
|
| 1443 |
+
print(f"=== SUBMIT PAIRWISE COMPARISON DEBUG ===")
|
| 1444 |
+
print(f"Input progress_state: {progress_state}")
|
| 1445 |
+
print(f"Pairwise comparisons: {pairwise_comparisons}")
|
| 1446 |
|
| 1447 |
# Process input parameters
|
| 1448 |
criteria_count = len_criteria
|
|
|
|
| 1495 |
# Check individual elements if it's a list
|
| 1496 |
if isinstance(pairwise_results_for_display, list):
|
| 1497 |
for i, item in enumerate(pairwise_results_for_display):
|
| 1498 |
+
print(f" Element {i}: {type(item)} - {item}")
|
| 1499 |
|
| 1500 |
return [
|
| 1501 |
ui_updates.get('page1_visible'),
|
|
|
|
| 1518 |
Submit scoring results and proceed to the next step.
|
| 1519 |
Simplified to use unified workflow management.
|
| 1520 |
"""
|
| 1521 |
+
print(f"=== SUBMIT PAIRWISE SCORING DEBUG ===")
|
| 1522 |
+
print(f"Input progress_state: {progress_state}")
|
| 1523 |
+
print(f"Ratings: {ratings}")
|
| 1524 |
|
| 1525 |
# Save current ratings - now store by method instead of by pair
|
| 1526 |
pair = progress_state['all_pairs'][progress_state['current_score_pair_index']]
|
|
|
|
| 1539 |
# Determine modal visibility based on completion status
|
| 1540 |
all_scoring_done = (len(progress_state['scoring_done_pairs']) ==
|
| 1541 |
len(progress_state['all_pairs']))
|
|
|
|
| 1542 |
|
| 1543 |
return [
|
| 1544 |
ui_updates.get('page1_visible'), # 5
|
|
|
|
| 1572 |
|
| 1573 |
progress_state['scoring_done_pairs'].add(pair)
|
| 1574 |
|
| 1575 |
+
print(f"[SCORE_SAVE] Saved scores for {model_A}: {ratings_A}")
|
| 1576 |
+
print(f"[SCORE_SAVE] Saved scores for {model_B}: {ratings_B}")
|
| 1577 |
|
| 1578 |
+
# Save results to the database, as submit_pairwise_comparison does
|
| 1579 |
# Extract pairwise comparison results for this pair
|
| 1580 |
pairwise_results = progress_state['pairwise_results'].get(pair, [None] * len_criteria)
|
| 1581 |
comparison_reasons = [None] * len_criteria # We don't have reasons for scoring page
|
|
|
|
| 1599 |
# Determine modal visibility based on completion status
|
| 1600 |
all_scoring_done = (len(progress_state['scoring_done_pairs']) ==
|
| 1601 |
len(progress_state['all_pairs']))
|
|
|
|
| 1602 |
|
| 1603 |
|
| 1604 |
if not all_scoring_done:
|
|
|
|
| 1630 |
user_info, our_methods
|
| 1631 |
)
|
| 1632 |
|
| 1633 |
+
if remaining_count == 0: # TODO: handle completion
|
| 1634 |
gr.Info("You have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!")
|
| 1635 |
return [
|
| 1636 |
ui_updates.get('page1_visible'), # 5
|
|
|
|
| 1652 |
*ui_updates.get('ratings_A'),
|
| 1653 |
*ui_updates.get('ratings_B'),
|
| 1654 |
*ui_updates.get('pairwise_results_for_display'), # 26-33
|
|
|
|
| 1655 |
]
|
| 1656 |
# Use advance_workflow to get all UI updates
|
| 1657 |
ui_updates = advance_workflow(progress_state, data_subset_state)
|
| 1658 |
+
print(f"In submit_pairwise_scoring, using advance_workflow results: mode={progress_state.get('mode')}")
|
| 1659 |
num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
|
| 1660 |
gr.Info(f"You are about to evaluate the next question. You have {num_remaining_questions} question(s) remaining to evaluate.") # eval_progress_text
|
| 1661 |
return (
|
|
|
|
| 1681 |
# next_question_modal_visibility
|
| 1682 |
)
|
| 1683 |
|
|
|
| 1684 |
# --- Define Callback Functions for Confirmation Flow ---
|
| 1685 |
def build_row_dict(
|
| 1686 |
data_subset_state,
|
|
|
|
| 1776 |
Enforces rating constraints based on the pairwise choice for the given criterion index.
|
| 1777 |
"""
|
| 1778 |
print(
|
| 1779 |
+
f"Restricting choices for index {index} with scores A: {score_a}, B: {score_b}")
|
| 1780 |
print(
|
| 1781 |
+
f"Progress state keys: {list(progress_state.keys()) if progress_state else 'None'}")
|
| 1782 |
|
| 1783 |
# Extract the pairwise choice for the current criterion
|
| 1784 |
pairwise_choice = _extract_pairwise_choice(progress_state, index)
|
| 1785 |
|
| 1786 |
if pairwise_choice is not None:
|
| 1787 |
print(
|
| 1788 |
+
f"Found pairwise choice for criterion {index}: {pairwise_choice}")
|
| 1789 |
else:
|
| 1790 |
+
print(f"No pairwise results found for criterion {index}")
|
| 1791 |
|
| 1792 |
# Skip if both scores are None
|
| 1793 |
if score_a is None and score_b is None:
|
|
|
|
| 1903 |
img.save(buffer, format="PNG")
|
| 1904 |
encoded_string = base64.b64encode(
|
| 1905 |
buffer.getvalue()).decode("utf-8")
|
|
|
|
| 1906 |
|
| 1907 |
image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
|
| 1908 |
ReasoningTraceExampleHTML = f"""
|
|
|
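The snippet above inlines a PNG into HTML via base64. A self-contained helper that mirrors it (PIL assumed, as in app.py) is handy for previewing the markup outside Gradio:

```python
import base64
import io
from PIL import Image

def png_data_uri(img: Image.Image) -> str:
    buffer = io.BytesIO()
    img.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{encoded}"

# usage: f'<img src="{png_data_uri(img)}" alt="Your Image">'
```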
|
| 1914 |
gr.Markdown("""By clicking 'Next' below, you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
|
| 1915 |
""")
|
| 1916 |
gr.Markdown("## Please enter your information to get a question to evaluate. Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.")
|
| 1917 |
+
name = gr.Textbox(label="Name (required)", value="")
|
| 1918 |
email = gr.Textbox(
|
| 1919 |
+
label="Email (required). Please use the same email every time you log onto this evaluation portal, as we use your email to prevent showing repeat questions.", value="")
|
| 1920 |
evaluator_id = gr.Textbox(
|
| 1921 |
label="Evaluator ID (auto-filled from email above)", interactive=False, visible=False)
|
| 1922 |
|
|
|
|
| 1955 |
|
| 1956 |
with Modal(visible=False, elem_id="confirm_modal") as eval_progress_modal:
|
| 1957 |
eval_progress_text = gr.Markdown("You have X questions remaining.")
|
|
|
|
|
|
|
| 1958 |
|
| 1959 |
# Page 1: Pairwise Comparison.
|
| 1960 |
with gr.Column(visible=False) as page1:
|
|
|
|
| 2179 |
# Clicking OK hides the modal.
|
| 2180 |
ok_btn.click(lambda: gr.update(visible=False), None, error_modal)
|
| 2181 |
|
|
| 2182 |
# --- Define Transitions Between Pages ---
|
| 2183 |
|
| 2184 |
# For the "Participate in Evaluation" button, transition to page0
|
|
|
|
| 2212 |
home_btn_2.click(lambda: (gr.update(visible=True), gr.update(
|
| 2213 |
visible=False)), None, [page_minus1, page2], scroll_to_output=True)
|
| 2214 |
|
|
| 2215 |
# Transition from Page 1 (Pairwise) to the combined Rating Page (Page 2).
|
| 2216 |
next_btn_1.click(
|
| 2217 |
fn=submit_pairwise_comparison,
|
|
|
|
| 2229 |
scroll_to_output=True,
|
| 2230 |
)
|
| 2231 |
|
|
| 2232 |
submit_btn.click(
|
| 2233 |
fn=submit_pairwise_scoring,
|
| 2234 |
inputs=[progress_state,
|