intersteller2887 committed on
Commit
ea9a277
·
verified ·
1 Parent(s): 7f467e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -78
app.py CHANGED
@@ -6,17 +6,21 @@ import random
6
  import shutil
7
  import time
8
  import collections
 
9
  from datasets import load_dataset, Audio
10
  from huggingface_hub import HfApi
11
 
12
  dataset = load_dataset("intersteller2887/Turing-test-dataset", split="train")
13
- dataset = dataset.cast_column("audio", Audio(decode=False))
14
 
 
15
  target_audio_dir = "/home/user/app/audio"
16
  os.makedirs(target_audio_dir, exist_ok=True)
17
  COUNT_JSON_PATH = "/home/user/app/count.json"
18
- COUNT_JSON_REPO_PATH = "submissions/count.json"
19
 
 
 
 
20
  local_audio_paths = []
21
 
22
  for item in dataset:
@@ -30,9 +34,8 @@ for item in dataset:
30
 
31
  all_data_audio_paths = local_audio_paths
32
 
 
33
  sample1_audio_path = local_audio_paths[0]
34
- # sample1_audio_path = next((p for p in all_data_audio_paths if p.endswith("sample1.wav")), None)
35
- print(sample1_audio_path)
36
 
37
  # ==============================================================================
38
  # 数据定义 (Data Definition)
@@ -89,42 +92,30 @@ DIMENSIONS_DATA = [
89
  ]
90
 
91
  DIMENSION_TITLES = [d["title"] for d in DIMENSIONS_DATA]
 
92
 
93
- """def load_or_initialize_count_json(audio_paths):
94
- if os.path.exists(COUNT_JSON_PATH):
95
- with open(COUNT_JSON_PATH, "r", encoding="utf-8") as f:
96
- # 使用 object_pairs_hook 保持原始顺序
97
- count_data = json.load(f, object_pairs_hook=collections.OrderedDict)
98
- else:
99
- count_data = collections.OrderedDict()
100
-
101
- updated = False
102
-
103
- # 获取所有样例音频文件名
104
- sample_audio_files = {os.path.basename(d["audio"]) for d in DIMENSIONS_DATA}
105
-
106
- for path in audio_paths:
107
- filename = os.path.basename(path)
108
- if filename not in count_data:
109
- # 如果是样例音频,直接设置为最大值
110
- if filename in sample_audio_files:
111
- count_data[filename] = 999 # 设置为很大的值,确保不会被选中
112
- else:
113
- count_data[filename] = 0
114
- updated = True
115
 
116
- if updated or not os.path.exists(COUNT_JSON_PATH):
117
- with open(COUNT_JSON_PATH, "w", encoding="utf-8") as f:
118
- # 确保写入时也保持顺序
119
- json.dump(count_data, f, indent=4, ensure_ascii=False)
120
-
121
- return count_data"""
122
 
123
- from filelock import FileLock
 
 
124
 
 
125
  def load_or_initialize_count_json(audio_paths):
 
126
  lock_path = COUNT_JSON_PATH + ".lock"
127
- with FileLock(lock_path, timeout=10): # 最多等 10
 
 
 
128
  if os.path.exists(COUNT_JSON_PATH):
129
  with open(COUNT_JSON_PATH, "r", encoding="utf-8") as f:
130
  count_data = json.load(f, object_pairs_hook=collections.OrderedDict)
@@ -134,6 +125,8 @@ def load_or_initialize_count_json(audio_paths):
134
  updated = False
135
  sample_audio_files = {os.path.basename(d["audio"]) for d in DIMENSIONS_DATA}
136
 
 
 
137
  for path in audio_paths:
138
  filename = os.path.basename(path)
139
  if filename not in count_data:
@@ -149,6 +142,7 @@ def load_or_initialize_count_json(audio_paths):
149
 
150
  return count_data
151
 
 
152
  def append_cache_buster(audio_path):
153
  return f"{audio_path}?t={int(time.time() * 1000)}"
154
 
@@ -173,8 +167,7 @@ def append_cache_buster(audio_path):
173
 
174
  return selected, count_data"""
175
 
176
- # Version 2,相近的两份卷子会得到一样的试题
177
- def sample_audio_paths(audio_paths, count_data, k=5, max_count=1):
178
  eligible_paths = [p for p in audio_paths if count_data.get(os.path.basename(p), 0) < max_count]
179
 
180
  if len(eligible_paths) < k:
@@ -193,24 +186,10 @@ def sample_audio_paths(audio_paths, count_data, k=5, max_count=1):
193
 
194
  return selected, count_data
195
 
196
- count_data = load_or_initialize_count_json(all_data_audio_paths)
197
- selected_audio_paths, updated_count_data = sample_audio_paths(all_data_audio_paths, count_data, k=5)
198
-
199
- QUESTION_SET = [
200
- {"audio": path, "desc": f"这是音频文件 {os.path.basename(path)} 的描述"}
201
- for path in selected_audio_paths
202
- ]
203
-
204
- MAX_SUB_DIMS = max(len(d['sub_dims']) for d in DIMENSIONS_DATA)
205
-
206
- # ==============================================================================
207
- # 功能函数定义 (Function Definitions)
208
- # ==============================================================================
209
- """def start_challenge():
210
- return gr.update(visible=False), gr.update(visible=True)"""
211
-
212
- def start_challenge(user_data_state):
213
- global QUESTION_SET, updated_count_data
214
 
215
  # 每次点击“开始挑战”时重新抽题
216
  count_data = load_or_initialize_count_json(all_data_audio_paths)
@@ -223,33 +202,23 @@ def start_challenge(user_data_state):
223
 
224
  # 重置 user_data 中的状态(也可以留空)
225
  user_data_state.clear()
226
- return gr.update(visible=False), gr.update(visible=True), user_data_state
227
 
228
- """def start_challenge(user_data_state):
229
- global QUESTION_SET
230
 
231
- # Step 1: 读取最新 count.json
232
  count_data = load_or_initialize_count_json(all_data_audio_paths)
 
233
 
234
- # Step 2: 从 eligible 音频中抽题
235
- selected_audio_paths, _ = sample_audio_paths(
236
- audio_paths=all_data_audio_paths,
237
- count_data=count_data.copy(), # 不修改原始数据
238
- k=5,
239
- max_count=1
240
- )
241
-
242
- # Step 3: 保存抽到的音频文件名在用户状态中
243
- user_data_state["selected_filenames"] = [os.path.basename(p) for p in selected_audio_paths]
244
-
245
- # Step 4: 设置题目列表
246
- QUESTION_SET = [
247
  {"audio": path, "desc": f"这是音频文件 {os.path.basename(path)} 的描述"}
248
  for path in selected_audio_paths
249
  ]
250
 
251
- return gr.update(visible=False), gr.update(visible=True), user_data_state"""
252
-
 
 
253
  def toggle_education_other(choice):
254
  is_other = (choice == "其他(请注明)")
255
  return gr.update(visible=is_other, interactive=is_other, value="")
@@ -329,8 +298,10 @@ def update_test_dimension_view(d_idx, selections):
329
 
330
  def init_test_question(user_data, q_idx):
331
  d_idx = 0
332
- question = QUESTION_SET[q_idx]
333
- progress_q = f"第 {q_idx + 1} / {len(QUESTION_SET)} 题"
 
 
334
 
335
  initial_updates = update_test_dimension_view(d_idx, {})
336
  dim_title_update, prev_btn_update, next_btn_update = initial_updates[:3]
@@ -394,15 +365,23 @@ def navigate_dimensions(direction, q_idx, d_idx, selections, *slider_values):
394
  def submit_question_and_advance(q_idx, d_idx, selections, final_choice, all_results, user_data):
395
  selections["final_choice"] = final_choice
396
 
397
- final_question_result = {
398
  "question_id": q_idx, "audio_file": QUESTION_SET[q_idx]['audio'],
399
  "selections": selections
400
  }
 
 
 
 
 
 
 
401
  all_results.append(final_question_result)
402
 
403
  q_idx += 1
404
 
405
- if q_idx < len(QUESTION_SET):
 
406
  init_q_updates = init_test_question(user_data, q_idx)
407
  return init_q_updates + (all_results, gr.update(value=""))
408
  else:
@@ -705,7 +684,8 @@ if __name__ == "__main__":
705
  os.makedirs("audio")
706
  if "SPACE_ID" in os.environ:
707
  print("Running in a Hugging Face Space, checking for audio files...")
708
- all_files = [q["audio"] for q in QUESTION_SET] + [d["audio"] for d in DIMENSIONS_DATA]
 
709
  for audio_file in set(all_files):
710
  if not os.path.exists(audio_file):
711
  print(f"⚠️ Warning: Audio file not found: {audio_file}")
 
6
  import shutil
7
  import time
8
  import collections
9
+ from filelock import FileLock
10
  from datasets import load_dataset, Audio
11
  from huggingface_hub import HfApi
12
 
13
  dataset = load_dataset("intersteller2887/Turing-test-dataset", split="train")
14
+ dataset = dataset.cast_column("audio", Audio(decode=False)) # Prevent calling 'torchcodec' from newer version of 'datasets'
15
 
16
+ # Huggingface space working directory: "/home/user/app"
17
  target_audio_dir = "/home/user/app/audio"
18
  os.makedirs(target_audio_dir, exist_ok=True)
19
  COUNT_JSON_PATH = "/home/user/app/count.json"
 
20
 
21
+ COUNT_JSON_REPO_PATH = "submissions/count.json" # Output directory (Huggingface dataset directory)
22
+
23
+ # Copy recordings to the working directory
24
  local_audio_paths = []
25
 
26
  for item in dataset:
 
34
 
35
  all_data_audio_paths = local_audio_paths
36
 
37
+ # Use the first file of the dataset as the sample recording
38
  sample1_audio_path = local_audio_paths[0]
 
 
39
 
40
  # ==============================================================================
41
  # 数据定义 (Data Definition)
 
92
  ]
93
 
94
  DIMENSION_TITLES = [d["title"] for d in DIMENSIONS_DATA]
95
+ MAX_SUB_DIMS = max(len(d['sub_dims']) for d in DIMENSIONS_DATA)
96
 
97
+ """
98
+ # Issue: this is initialized on the starting of the space, might somehow not covered
99
+ count_data = load_or_initialize_count_json(all_data_audio_paths)
100
+ selected_audio_paths, updated_count_data = sample_audio_paths(all_data_audio_paths, count_data, k=5)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
+ QUESTION_SET = [
103
+ {"audio": path, "desc": f"这是音频文件 {os.path.basename(path)} 的描述"}
104
+ for path in selected_audio_paths
105
+ ]"""
 
 
106
 
107
+ # ==============================================================================
108
+ # 功能函数定义 (Function Definitions)
109
+ # ==============================================================================
110
 
111
+ # Load count.json, or initialize it when it does not exist yet
112
  def load_or_initialize_count_json(audio_paths):
113
+ # Guard count.json (COUNT_JSON_PATH) with a file lock kept next to it as count.json.lock
114
  lock_path = COUNT_JSON_PATH + ".lock"
115
+ # Wait up to 10 seconds for any other holder to release the lock, then acquire it before reading count.json (filelock raises Timeout if the wait expires)
116
+ with FileLock(lock_path, timeout=10):
117
+ # If count.json exists, load it into count_data;
118
+ # otherwise initialize count_data as an OrderedDict
119
  if os.path.exists(COUNT_JSON_PATH):
120
  with open(COUNT_JSON_PATH, "r", encoding="utf-8") as f:
121
  count_data = json.load(f, object_pairs_hook=collections.OrderedDict)
 
125
  updated = False
126
  sample_audio_files = {os.path.basename(d["audio"]) for d in DIMENSIONS_DATA}
127
 
128
+ # Guarantee that the sample recordings are never taken into the question pool
129
+ # Add recordings that are not yet present in count.json
130
  for path in audio_paths:
131
  filename = os.path.basename(path)
132
  if filename not in count_data:
 
142
 
143
  return count_data
144
 
145
+ # Append a millisecond timestamp as a cache buster so the previous question's audio does not linger when the next question loads
146
  def append_cache_buster(audio_path):
147
  return f"{audio_path}?t={int(time.time() * 1000)}"
148
 
 
167
 
168
  return selected, count_data"""
169
 
170
+ def sample_audio_paths(audio_paths, count_data, k=5, max_count=1): # k for questions per test; max_count for question limit in total
 
171
  eligible_paths = [p for p in audio_paths if count_data.get(os.path.basename(p), 0) < max_count]
172
 
173
  if len(eligible_paths) < k:
 
186
 
187
  return selected, count_data
188
 
189
+ """def start_challenge(user_data_state):
190
+
191
+ # global QUESTION_SET, updated_count_data
192
+ # Issue: global variables in huggingface hub is shared by all threads
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
  # 每次点击“开始挑战”时重新抽题
195
  count_data = load_or_initialize_count_json(all_data_audio_paths)
 
202
 
203
  # 重置 user_data 中的状态(也可以留空)
204
  user_data_state.clear()
205
+ return gr.update(visible=False), gr.update(visible=True), user_data_state"""
206
 
207
+ # Store question_set in each user's own user_data_state so it is not shared globally
208
+ def start_challenge(user_data_state):
209
 
 
210
  count_data = load_or_initialize_count_json(all_data_audio_paths)
211
+ selected_audio_paths, updated_count_data = sample_audio_paths(all_data_audio_paths, count_data, k=5)
212
 
213
+ question_set = [
 
 
 
 
 
 
 
 
 
 
 
 
214
  {"audio": path, "desc": f"这是音频文件 {os.path.basename(path)} 的描述"}
215
  for path in selected_audio_paths
216
  ]
217
 
218
+ user_data_state["question_set"] = question_set
219
+ user_data_state["updated_count_data"] = updated_count_data
220
+ return gr.update(visible=False), gr.update(visible=True), user_data_state
221
+
222
  def toggle_education_other(choice):
223
  is_other = (choice == "其他(请注明)")
224
  return gr.update(visible=is_other, interactive=is_other, value="")
 
298
 
299
  def init_test_question(user_data, q_idx):
300
  d_idx = 0
301
+ # question = QUESTION_SET[q_idx]
302
+ # progress_q = f"第 {q_idx + 1} / {len(QUESTION_SET)} 题"
303
+ question = user_data["question_set"][q_idx]
304
+ progress_q = f"第 {q_idx + 1} / {len(user_data["question_set"])} 题"
305
 
306
  initial_updates = update_test_dimension_view(d_idx, {})
307
  dim_title_update, prev_btn_update, next_btn_update = initial_updates[:3]
 
365
  def submit_question_and_advance(q_idx, d_idx, selections, final_choice, all_results, user_data):
366
  selections["final_choice"] = final_choice
367
 
368
+ """final_question_result = {
369
  "question_id": q_idx, "audio_file": QUESTION_SET[q_idx]['audio'],
370
  "selections": selections
371
  }
372
+ """
373
+
374
+ final_question_result = {
375
+ "question_id": q_idx, "audio_file": user_data["question_set"][q_idx]['audio'],
376
+ "selections": selections
377
+ }
378
+
379
  all_results.append(final_question_result)
380
 
381
  q_idx += 1
382
 
383
+ # if q_idx < len(QUESTION_SET):
384
+ if q_idx < len(user_data["question_set"]):
385
  init_q_updates = init_test_question(user_data, q_idx)
386
  return init_q_updates + (all_results, gr.update(value=""))
387
  else:
 
684
  os.makedirs("audio")
685
  if "SPACE_ID" in os.environ:
686
  print("Running in a Hugging Face Space, checking for audio files...")
687
+ # all_files = [q["audio"] for q in QUESTION_SET] + [d["audio"] for d in DIMENSIONS_DATA]
688
+ all_files = [[d["audio"] for d in DIMENSIONS_DATA]
689
  for audio_file in set(all_files):
690
  if not os.path.exists(audio_file):
691
  print(f"⚠️ Warning: Audio file not found: {audio_file}")