Update app.py
app.py (CHANGED)
@@ -6,17 +6,21 @@ import random
 import shutil
 import time
 import collections
+from filelock import FileLock
 from datasets import load_dataset, Audio
 from huggingface_hub import HfApi
 
 dataset = load_dataset("intersteller2887/Turing-test-dataset", split="train")
-dataset = dataset.cast_column("audio", Audio(decode=False))
+dataset = dataset.cast_column("audio", Audio(decode=False))  # Prevent calling 'torchcodec' from newer version of 'datasets'
 
+# Huggingface space working directory: "/home/user/app"
 target_audio_dir = "/home/user/app/audio"
 os.makedirs(target_audio_dir, exist_ok=True)
 COUNT_JSON_PATH = "/home/user/app/count.json"
-COUNT_JSON_REPO_PATH = "submissions/count.json"
 
+COUNT_JSON_REPO_PATH = "submissions/count.json"  # Output directory (Huggingface dataset directory)
+
+# Copy recordings to the working directory
 local_audio_paths = []
 
 for item in dataset:
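
The hunk's context stops before the loop that fills local_audio_paths, so the copy step itself is not shown. A minimal sketch of what that elided loop presumably does (the exact field handling is an assumption): with Audio(decode=False), each item's "audio" entry is a plain dict carrying the raw file as "bytes" and/or a cached "path" instead of a decoded waveform, which is why no torchcodec backend is needed.

    # Sketch only: copy whichever of "bytes" / "path" is available into target_audio_dir.
    for item in dataset:
        audio_info = item["audio"]
        filename = os.path.basename(audio_info["path"])
        target_path = os.path.join(target_audio_dir, filename)
        if audio_info.get("bytes"):
            # Raw bytes shipped inside the dataset: write them out directly.
            with open(target_path, "wb") as f:
                f.write(audio_info["bytes"])
        else:
            # Otherwise the entry points at a file in the local datasets cache.
            shutil.copy(audio_info["path"], target_path)
        local_audio_paths.append(target_path)
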
@@ -30,9 +34,8 @@ for item in dataset:
 
 all_data_audio_paths = local_audio_paths
 
+# Take the first file of the dataset as the sample
 sample1_audio_path = local_audio_paths[0]
-# sample1_audio_path = next((p for p in all_data_audio_paths if p.endswith("sample1.wav")), None)
-print(sample1_audio_path)
 
 # ==============================================================================
 # 数据定义 (Data Definition)
@@ -89,42 +92,30 @@ DIMENSIONS_DATA = [
 ]
 
 DIMENSION_TITLES = [d["title"] for d in DIMENSIONS_DATA]
+MAX_SUB_DIMS = max(len(d['sub_dims']) for d in DIMENSIONS_DATA)
 
-"""
-
-
-
-                count_data = json.load(f, object_pairs_hook=collections.OrderedDict)
-    else:
-        count_data = collections.OrderedDict()
-
-    updated = False
-
-    # 获取所有样例音频文件名
-    sample_audio_files = {os.path.basename(d["audio"]) for d in DIMENSIONS_DATA}
-
-    for path in audio_paths:
-        filename = os.path.basename(path)
-        if filename not in count_data:
-            # 如果是样例音频,直接设置为最大值
-            if filename in sample_audio_files:
-                count_data[filename] = 999 # 设置为很大的值,确保不会被选中
-            else:
-                count_data[filename] = 0
-            updated = True
+"""
+# Issue: this was initialized once when the Space started, so later sessions might not be covered
+count_data = load_or_initialize_count_json(all_data_audio_paths)
+selected_audio_paths, updated_count_data = sample_audio_paths(all_data_audio_paths, count_data, k=5)
 
-
-
-
-
-
-    return count_data"""
+QUESTION_SET = [
+    {"audio": path, "desc": f"这是音频文件 {os.path.basename(path)} 的描述"}
+    for path in selected_audio_paths
+]"""
 
-
+# ==============================================================================
+# 功能函数定义 (Function Definitions)
+# ==============================================================================
 
+# Load or initialize count.json
 def load_or_initialize_count_json(audio_paths):
+    # Add a file lock for count.json (COUNT_JSON_PATH)
     lock_path = COUNT_JSON_PATH + ".lock"
-
+    # A read of count.json waits up to 10 seconds for another thread to release the lock, then acquires it
+    with FileLock(lock_path, timeout=10):
+        # If count.json exists, load it into count_data;
+        # else initialize count_data as an OrderedDict
        if os.path.exists(COUNT_JSON_PATH):
            with open(COUNT_JSON_PATH, "r", encoding="utf-8") as f:
                count_data = json.load(f, object_pairs_hook=collections.OrderedDict)
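
For reference, FileLock from the filelock package blocks for up to the given timeout while another process or thread holds the lock, and raises filelock.Timeout if it cannot acquire it; the counter data stays in count.json, the extra ".lock" file only coordinates access. A standalone sketch of a locked read-or-initialize, with an illustrative path rather than the Space's real one:

    import collections
    import json
    import os

    from filelock import FileLock, Timeout

    COUNT_JSON = "count.json"  # illustrative path

    def read_or_init_counts(path=COUNT_JSON):
        try:
            # Blocks for up to 10 seconds; writers must take the same lock,
            # otherwise concurrent writers can still clobber each other.
            with FileLock(path + ".lock", timeout=10):
                if os.path.exists(path):
                    with open(path, "r", encoding="utf-8") as f:
                        # OrderedDict preserves the key order of the JSON file
                        return json.load(f, object_pairs_hook=collections.OrderedDict)
                return collections.OrderedDict()
        except Timeout:
            # Another worker held the lock too long; fall back to an empty counter.
            return collections.OrderedDict()
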
@@ -134,6 +125,8 @@ def load_or_initialize_count_json(audio_paths):
         updated = False
         sample_audio_files = {os.path.basename(d["audio"]) for d in DIMENSIONS_DATA}
 
+        # Guarantee that the sample recordings won't be taken into the question pool
+        # Record newly added recordings in count.json
         for path in audio_paths:
             filename = os.path.basename(path)
             if filename not in count_data:
@@ -149,6 +142,7 @@ def load_or_initialize_count_json(audio_paths):
 
     return count_data
 
+# Cache-bust audio URLs so the previous question's audio is not replayed on the next question
 def append_cache_buster(audio_path):
     return f"{audio_path}?t={int(time.time() * 1000)}"
 
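
append_cache_buster only tacks a millisecond timestamp onto the path as a query string, so every question gets a URL the browser has not cached and the player does not resume the previous clip. A hypothetical handler (the component wiring is not part of this diff) would pass the busted path back to the audio output:

    import gradio as gr

    def show_question_audio(question):
        # question["audio"] is a local path such as "audio/foo.wav"; the changing
        # query string forces a fresh fetch instead of reusing the cached file.
        return gr.update(value=append_cache_buster(question["audio"]))
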
@@ -173,8 +167,7 @@ def append_cache_buster(audio_path):
 
     return selected, count_data"""
 
-#
-def sample_audio_paths(audio_paths, count_data, k=5, max_count=1):
+def sample_audio_paths(audio_paths, count_data, k=5, max_count=1):  # k for questions per test; max_count for question limit in total
     eligible_paths = [p for p in audio_paths if count_data.get(os.path.basename(p), 0) < max_count]
 
     if len(eligible_paths) < k:
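
Most of sample_audio_paths sits outside this hunk. A sketch consistent with the visible signature and the eligibility filter, under the assumption that the elided part tops up with the least-used remaining recordings and bumps the counters, might look like this:

    import os
    import random

    def sample_audio_paths_sketch(audio_paths, count_data, k=5, max_count=1):
        # Recordings picked fewer than max_count times are still eligible.
        eligible = [p for p in audio_paths
                    if count_data.get(os.path.basename(p), 0) < max_count]
        if len(eligible) < k:
            # Assumed fallback: pad with the least-used recordings that were filtered out.
            leftover = sorted((p for p in audio_paths if p not in eligible),
                              key=lambda p: count_data.get(os.path.basename(p), 0))
            selected = eligible + leftover[:k - len(eligible)]
        else:
            selected = random.sample(eligible, k)
        # Count every selection so the recording eventually drops out of the pool.
        for p in selected:
            name = os.path.basename(p)
            count_data[name] = count_data.get(name, 0) + 1
        return selected, count_data
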
@@ -193,24 +186,10 @@ def sample_audio_paths(audio_paths, count_data, k=5, max_count=1):
 
     return selected, count_data
 
-
-
-
-
-    {"audio": path, "desc": f"这是音频文件 {os.path.basename(path)} 的描述"}
-    for path in selected_audio_paths
-]
-
-MAX_SUB_DIMS = max(len(d['sub_dims']) for d in DIMENSIONS_DATA)
-
-# ==============================================================================
-# 功能函数定义 (Function Definitions)
-# ==============================================================================
-"""def start_challenge():
-    return gr.update(visible=False), gr.update(visible=True)"""
-
-def start_challenge(user_data_state):
-    global QUESTION_SET, updated_count_data
+"""def start_challenge(user_data_state):
+
+    # global QUESTION_SET, updated_count_data
+    # Issue: global variables in a Hugging Face Space are shared by all user sessions
 
     # 每次点击“开始挑战”时重新抽题 (re-draw the questions each time “开始挑战” is clicked)
     count_data = load_or_initialize_count_json(all_data_audio_paths)
@@ -223,33 +202,23 @@ def start_challenge(user_data_state):
 
     # 重置 user_data 中的状态(也可以留空) (reset the state in user_data; may be left empty)
     user_data_state.clear()
-    return gr.update(visible=False), gr.update(visible=True), user_data_state
+    return gr.update(visible=False), gr.update(visible=True), user_data_state"""
 
-
-
+# Save question_set in each user_data_state, preventing global sharing
+def start_challenge(user_data_state):
 
-    # Step 1: 读取最新 count.json
     count_data = load_or_initialize_count_json(all_data_audio_paths)
+    selected_audio_paths, updated_count_data = sample_audio_paths(all_data_audio_paths, count_data, k=5)
 
-
-    selected_audio_paths, _ = sample_audio_paths(
-        audio_paths=all_data_audio_paths,
-        count_data=count_data.copy(), # 不修改原始数据
-        k=5,
-        max_count=1
-    )
-
-    # Step 3: 保存抽到的音频文件名在用户状态中
-    user_data_state["selected_filenames"] = [os.path.basename(p) for p in selected_audio_paths]
-
-    # Step 4: 设置题目列表
-    QUESTION_SET = [
+    question_set = [
         {"audio": path, "desc": f"这是音频文件 {os.path.basename(path)} 的描述"}
         for path in selected_audio_paths
     ]
 
-
-
+    user_data_state["question_set"] = question_set
+    user_data_state["updated_count_data"] = updated_count_data
+    return gr.update(visible=False), gr.update(visible=True), user_data_state
+
 def toggle_education_other(choice):
     is_other = (choice == "其他(请注明)")
     return gr.update(visible=is_other, interactive=is_other, value="")
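
The point of this rewrite: in a Gradio Space, module-level globals such as QUESTION_SET are shared by every visitor, while a gr.State value lives per browser session. The wiring below is a hypothetical sketch (component names are not from this diff) of how start_challenge's three return values would map onto outputs:

    import gradio as gr

    with gr.Blocks() as demo:
        # Each browser session gets its own copy of this dict, unlike a module global.
        user_data_state = gr.State({})

        with gr.Column(visible=True) as intro_page:
            start_btn = gr.Button("开始挑战")
        with gr.Column(visible=False) as test_page:
            gr.Markdown("...")  # the actual test UI lives here

        # start_challenge returns (hide intro, show test, per-session dict).
        start_btn.click(
            start_challenge,
            inputs=[user_data_state],
            outputs=[intro_page, test_page, user_data_state],
        )
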
@@ -329,8 +298,10 @@ def update_test_dimension_view(d_idx, selections):
 
 def init_test_question(user_data, q_idx):
     d_idx = 0
-    question = QUESTION_SET[q_idx]
-    progress_q = f"第 {q_idx + 1} / {len(QUESTION_SET)} 题"
+    # question = QUESTION_SET[q_idx]
+    # progress_q = f"第 {q_idx + 1} / {len(QUESTION_SET)} 题"
+    question = user_data["question_set"][q_idx]
+    progress_q = f"第 {q_idx + 1} / {len(user_data['question_set'])} 题"
 
     initial_updates = update_test_dimension_view(d_idx, {})
     dim_title_update, prev_btn_update, next_btn_update = initial_updates[:3]
@@ -394,15 +365,23 @@ def navigate_dimensions(direction, q_idx, d_idx, selections, *slider_values):
 def submit_question_and_advance(q_idx, d_idx, selections, final_choice, all_results, user_data):
     selections["final_choice"] = final_choice
 
-    final_question_result = {
+    """final_question_result = {
         "question_id": q_idx, "audio_file": QUESTION_SET[q_idx]['audio'],
         "selections": selections
     }
+    """
+
+    final_question_result = {
+        "question_id": q_idx, "audio_file": user_data["question_set"][q_idx]['audio'],
+        "selections": selections
+    }
+
     all_results.append(final_question_result)
 
     q_idx += 1
 
-    if q_idx < len(QUESTION_SET):
+    # if q_idx < len(QUESTION_SET):
+    if q_idx < len(user_data["question_set"]):
         init_q_updates = init_test_question(user_data, q_idx)
         return init_q_updates + (all_results, gr.update(value=""))
     else:
@@ -705,7 +684,8 @@ if __name__ == "__main__":
         os.makedirs("audio")
     if "SPACE_ID" in os.environ:
         print("Running in a Hugging Face Space, checking for audio files...")
-        all_files = [q["audio"] for q in QUESTION_SET] + [d["audio"] for d in DIMENSIONS_DATA]
+        # all_files = [q["audio"] for q in QUESTION_SET] + [d["audio"] for d in DIMENSIONS_DATA]
+        all_files = [d["audio"] for d in DIMENSIONS_DATA]
         for audio_file in set(all_files):
             if not os.path.exists(audio_file):
                 print(f"⚠️ Warning: Audio file not found: {audio_file}")
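
COUNT_JSON_REPO_PATH and the HfApi import suggest count.json is pushed back to a dataset repo somewhere outside these hunks; none of that code is shown here. The usual huggingface_hub pattern, with the repo id left as a placeholder and the token assumed to come from the Space's environment, would be:

    from huggingface_hub import HfApi

    def push_count_json(repo_id="your-user/your-dataset"):  # placeholder repo id
        api = HfApi()  # picks up the access token from the environment (e.g. HF_TOKEN)
        api.upload_file(
            path_or_fileobj=COUNT_JSON_PATH,    # "/home/user/app/count.json"
            path_in_repo=COUNT_JSON_REPO_PATH,  # "submissions/count.json"
            repo_id=repo_id,
            repo_type="dataset",
        )
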