Turing-test-web-en

Sleeping

App Files Files

xet

Community

intersteller2887 committed on Jul 11

Commit

ea9a277

verified ·

1 Parent(s): 7f467e0

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -78

app.py CHANGED Viewed

@@ -6,17 +6,21 @@ import random
 import shutil
 import time
 import collections
 from datasets import load_dataset, Audio
 from huggingface_hub import HfApi
 dataset = load_dataset("intersteller2887/Turing-test-dataset", split="train")
-dataset = dataset.cast_column("audio", Audio(decode=False))
 target_audio_dir = "/home/user/app/audio"
 os.makedirs(target_audio_dir, exist_ok=True)
 COUNT_JSON_PATH = "/home/user/app/count.json"
-COUNT_JSON_REPO_PATH = "submissions/count.json"
 local_audio_paths = []
 for item in dataset:
@@ -30,9 +34,8 @@ for item in dataset:
 all_data_audio_paths = local_audio_paths
 sample1_audio_path = local_audio_paths[0]
-# sample1_audio_path = next((p for p in all_data_audio_paths if p.endswith("sample1.wav")), None)
-print(sample1_audio_path)
 # ==============================================================================
 # 数据定义 (Data Definition)
@@ -89,42 +92,30 @@ DIMENSIONS_DATA = [
 ]
 DIMENSION_TITLES = [d["title"] for d in DIMENSIONS_DATA]
-"""def load_or_initialize_count_json(audio_paths):
-    if os.path.exists(COUNT_JSON_PATH):
-        with open(COUNT_JSON_PATH, "r", encoding="utf-8") as f:
-            # 使用 object_pairs_hook 保持原始顺序
-            count_data = json.load(f, object_pairs_hook=collections.OrderedDict)
-    else:
-        count_data = collections.OrderedDict()
-    updated = False
-    # 获取所有样例音频文件名
-    sample_audio_files = {os.path.basename(d["audio"]) for d in DIMENSIONS_DATA}
-    for path in audio_paths:
-        filename = os.path.basename(path)
-        if filename not in count_data:
-            # 如果是样例音频，直接设置为最大值
-            if filename in sample_audio_files:
-                count_data[filename] = 999  # 设置为很大的值，确保不会被选中
-            else:
-                count_data[filename] = 0
-            updated = True
-    if updated or not os.path.exists(COUNT_JSON_PATH):
-        with open(COUNT_JSON_PATH, "w", encoding="utf-8") as f:
-            # 确保写入时也保持顺序
-            json.dump(count_data, f, indent=4, ensure_ascii=False)
-    return count_data"""
-from filelock import FileLock
 def load_or_initialize_count_json(audio_paths):
     lock_path = COUNT_JSON_PATH + ".lock"
-    with FileLock(lock_path, timeout=10):  # 最多等 10 秒
         if os.path.exists(COUNT_JSON_PATH):
             with open(COUNT_JSON_PATH, "r", encoding="utf-8") as f:
                 count_data = json.load(f, object_pairs_hook=collections.OrderedDict)
@@ -134,6 +125,8 @@ def load_or_initialize_count_json(audio_paths):
         updated = False
         sample_audio_files = {os.path.basename(d["audio"]) for d in DIMENSIONS_DATA}
         for path in audio_paths:
             filename = os.path.basename(path)
             if filename not in count_data:
@@ -149,6 +142,7 @@ def load_or_initialize_count_json(audio_paths):
     return count_data
 def append_cache_buster(audio_path):
     return f"{audio_path}?t={int(time.time() * 1000)}"
@@ -173,8 +167,7 @@ def append_cache_buster(audio_path):
     return selected, count_data"""
-# Version 2，相近的两份卷子会得到一样的试题
-def sample_audio_paths(audio_paths, count_data, k=5, max_count=1):
     eligible_paths = [p for p in audio_paths if count_data.get(os.path.basename(p), 0) < max_count]
     if len(eligible_paths) < k:
@@ -193,24 +186,10 @@ def sample_audio_paths(audio_paths, count_data, k=5, max_count=1):
     return selected, count_data
-count_data = load_or_initialize_count_json(all_data_audio_paths)
-selected_audio_paths, updated_count_data = sample_audio_paths(all_data_audio_paths, count_data, k=5)
-QUESTION_SET = [
-    {"audio": path, "desc": f"这是音频文件 {os.path.basename(path)} 的描述"}
-    for path in selected_audio_paths
-]
-MAX_SUB_DIMS = max(len(d['sub_dims']) for d in DIMENSIONS_DATA)
-# ==============================================================================
-# 功能函数定义 (Function Definitions)
-# ==============================================================================
-"""def start_challenge():
-    return gr.update(visible=False), gr.update(visible=True)"""
-def start_challenge(user_data_state):
-    global QUESTION_SET, updated_count_data
     # 每次点击“开始挑战”时重新抽题
     count_data = load_or_initialize_count_json(all_data_audio_paths)
@@ -223,33 +202,23 @@ def start_challenge(user_data_state):
     # 重置 user_data 中的状态（也可以留空）
     user_data_state.clear()
-    return gr.update(visible=False), gr.update(visible=True), user_data_state
-"""def start_challenge(user_data_state):
-    global QUESTION_SET
-    # Step 1: 读取最新 count.json
     count_data = load_or_initialize_count_json(all_data_audio_paths)
-    # Step 2: 从 eligible 音频中抽题
-    selected_audio_paths, _ = sample_audio_paths(
-        audio_paths=all_data_audio_paths,
-        count_data=count_data.copy(),  # 不修改原始数据
-        k=5,
-        max_count=1
-    )
-    # Step 3: 保存抽到的音频文件名在用户状态中
-    user_data_state["selected_filenames"] = [os.path.basename(p) for p in selected_audio_paths]
-    # Step 4: 设置题目列表
-    QUESTION_SET = [
         {"audio": path, "desc": f"这是音频文件 {os.path.basename(path)} 的描述"}
         for path in selected_audio_paths
     ]
-    return gr.update(visible=False), gr.update(visible=True), user_data_state"""
 def toggle_education_other(choice):
     is_other = (choice == "其他（请注明）")
     return gr.update(visible=is_other, interactive=is_other, value="")
@@ -329,8 +298,10 @@ def update_test_dimension_view(d_idx, selections):
 def init_test_question(user_data, q_idx):
     d_idx = 0
-    question = QUESTION_SET[q_idx]
-    progress_q = f"第 {q_idx + 1} / {len(QUESTION_SET)} 题"
     initial_updates = update_test_dimension_view(d_idx, {})
     dim_title_update, prev_btn_update, next_btn_update = initial_updates[:3]
@@ -394,15 +365,23 @@ def navigate_dimensions(direction, q_idx, d_idx, selections, *slider_values):
 def submit_question_and_advance(q_idx, d_idx, selections, final_choice, all_results, user_data):
     selections["final_choice"] = final_choice
-    final_question_result = {
         "question_id": q_idx, "audio_file": QUESTION_SET[q_idx]['audio'],
         "selections": selections
     }
     all_results.append(final_question_result)
     q_idx += 1
-    if q_idx < len(QUESTION_SET):
         init_q_updates = init_test_question(user_data, q_idx)
         return init_q_updates + (all_results, gr.update(value=""))
     else:
@@ -705,7 +684,8 @@ if __name__ == "__main__":
         os.makedirs("audio")
     if "SPACE_ID" in os.environ:
         print("Running in a Hugging Face Space, checking for audio files...")
-        all_files = [q["audio"] for q in QUESTION_SET] + [d["audio"] for d in DIMENSIONS_DATA]
         for audio_file in set(all_files):
             if not os.path.exists(audio_file):
                 print(f"⚠️ Warning: Audio file not found: {audio_file}")

 import shutil
 import time
 import collections
+from filelock import FileLock
 from datasets import load_dataset, Audio
 from huggingface_hub import HfApi
 dataset = load_dataset("intersteller2887/Turing-test-dataset", split="train")
+dataset = dataset.cast_column("audio", Audio(decode=False)) # Prevent calling 'torchcodec' from newer version of 'datasets'
+# Huggingface space working directory: "/home/user/app"
 target_audio_dir = "/home/user/app/audio"
 os.makedirs(target_audio_dir, exist_ok=True)
 COUNT_JSON_PATH = "/home/user/app/count.json"
+COUNT_JSON_REPO_PATH = "submissions/count.json" # Output directory (Huggingface dataset directory)
+# Copy recordings to the working directory
 local_audio_paths = []
 for item in dataset:
 all_data_audio_paths = local_audio_paths
+# Use the first file of the dataset as the sample recording
 sample1_audio_path = local_audio_paths[0]
 # ==============================================================================
 # 数据定义 (Data Definition)
 ]
 DIMENSION_TITLES = [d["title"] for d in DIMENSIONS_DATA]
+MAX_SUB_DIMS = max(len(d['sub_dims']) for d in DIMENSIONS_DATA)
+"""
+# Issue: this is initialized only once, when the Space starts, so it might not be refreshed (overwritten) afterwards
+count_data = load_or_initialize_count_json(all_data_audio_paths)
+selected_audio_paths, updated_count_data = sample_audio_paths(all_data_audio_paths, count_data, k=5)
+QUESTION_SET = [
+    {"audio": path, "desc": f"这是音频文件 {os.path.basename(path)} 的描述"}
+    for path in selected_audio_paths
+]"""
+# ==============================================================================
+# 功能函数定义 (Function Definitions)
+# ==============================================================================
+# Load count.json, or initialize it if it does not exist yet
 def load_or_initialize_count_json(audio_paths):
+    # Guard count.json with a file lock; the lock file is COUNT_JSON_PATH + ".lock"
     lock_path = COUNT_JSON_PATH + ".lock"
+    # Wait up to 10 seconds for any other holder to release the lock, then acquire it before touching count.json
+    with FileLock(lock_path, timeout=10):
+        # If count.json exists, load it into count_data
+        # Otherwise, initialize count_data as an OrderedDict
         if os.path.exists(COUNT_JSON_PATH):
             with open(COUNT_JSON_PATH, "r", encoding="utf-8") as f:
                 count_data = json.load(f, object_pairs_hook=collections.OrderedDict)
         updated = False
         sample_audio_files = {os.path.basename(d["audio"]) for d in DIMENSIONS_DATA}
+        # Ensure the sample recordings are never taken into the question pool
+        # Register newly added recordings in count.json
         for path in audio_paths:
             filename = os.path.basename(path)
             if filename not in count_data:
     return count_data
+# Append a timestamp query parameter (cache buster) to shorten how long the previous audio keeps playing after moving to the next question
 def append_cache_buster(audio_path):
     return f"{audio_path}?t={int(time.time() * 1000)}"
     return selected, count_data"""
+def sample_audio_paths(audio_paths, count_data, k=5, max_count=1): # k for questions per test; max_count for question limit in total
     eligible_paths = [p for p in audio_paths if count_data.get(os.path.basename(p), 0) < max_count]
     if len(eligible_paths) < k:
     return selected, count_data
+"""def start_challenge(user_data_state):
+    # global QUESTION_SET, updated_count_data
+    # Issue: global variables in a Hugging Face Space are shared by all threads
     # 每次点击“开始挑战”时重新抽题
     count_data = load_or_initialize_count_json(all_data_audio_paths)
     # 重置 user_data 中的状态（也可以留空）
     user_data_state.clear()
+    return gr.update(visible=False), gr.update(visible=True), user_data_state"""
+# Save question_set in each user_data_state, preventing global sharing
+def start_challenge(user_data_state):
     count_data = load_or_initialize_count_json(all_data_audio_paths)
+    selected_audio_paths, updated_count_data = sample_audio_paths(all_data_audio_paths, count_data, k=5)
+    question_set = [
         {"audio": path, "desc": f"这是音频文件 {os.path.basename(path)} 的描述"}
         for path in selected_audio_paths
     ]
+    user_data_state["question_set"] = question_set
+    user_data_state["updated_count_data"] = updated_count_data
+    return gr.update(visible=False), gr.update(visible=True), user_data_state
 def toggle_education_other(choice):
     is_other = (choice == "其他（请注明）")
     return gr.update(visible=is_other, interactive=is_other, value="")
 def init_test_question(user_data, q_idx):
     d_idx = 0
+    # question = QUESTION_SET[q_idx]
+    # progress_q = f"第 {q_idx + 1} / {len(QUESTION_SET)} 题"
+    question = user_data["question_set"][q_idx]
+    progress_q = f"第 {q_idx + 1} / {len(user_data["question_set"])} 题"
     initial_updates = update_test_dimension_view(d_idx, {})
     dim_title_update, prev_btn_update, next_btn_update = initial_updates[:3]
 def submit_question_and_advance(q_idx, d_idx, selections, final_choice, all_results, user_data):
     selections["final_choice"] = final_choice
+    """final_question_result = {
         "question_id": q_idx, "audio_file": QUESTION_SET[q_idx]['audio'],
         "selections": selections
     }
+    """
+    final_question_result = {
+        "question_id": q_idx, "audio_file": user_data["question_set"][q_idx]['audio'],
+        "selections": selections
+    }
     all_results.append(final_question_result)
     q_idx += 1
+    # if q_idx < len(QUESTION_SET):
+    if q_idx < len(user_data["question_set"]):
         init_q_updates = init_test_question(user_data, q_idx)
         return init_q_updates + (all_results, gr.update(value=""))
     else:
         os.makedirs("audio")
     if "SPACE_ID" in os.environ:
         print("Running in a Hugging Face Space, checking for audio files...")
+        # all_files = [q["audio"] for q in QUESTION_SET] + [d["audio"] for d in DIMENSIONS_DATA]
+        all_files = [[d["audio"] for d in DIMENSIONS_DATA]
         for audio_file in set(all_files):
             if not os.path.exists(audio_file):
                 print(f"⚠️ Warning: Audio file not found: {audio_file}")