Spaces:
Sleeping
Sleeping
Commit
·
d85221f
1
Parent(s):
7fc9887
modify param
Browse files- dataset/audio/segments/test1_segment_1.wav +2 -2
- dataset/audio/segments/test1_segment_10.wav +2 -2
- dataset/audio/segments/test1_segment_11.wav +2 -2
- dataset/audio/segments/test1_segment_12.wav +2 -2
- dataset/audio/segments/test1_segment_13.wav +3 -0
- dataset/audio/segments/test1_segment_14.wav +3 -0
- dataset/audio/segments/test1_segment_15.wav +3 -0
- dataset/audio/segments/test1_segment_16.wav +3 -0
- dataset/audio/segments/test1_segment_17.wav +3 -0
- dataset/audio/segments/test1_segment_2.wav +2 -2
- dataset/audio/segments/test1_segment_3.wav +2 -2
- dataset/audio/segments/test1_segment_4.wav +2 -2
- dataset/audio/segments/test1_segment_5.wav +2 -2
- dataset/audio/segments/test1_segment_6.wav +2 -2
- dataset/audio/segments/test1_segment_7.wav +2 -2
- dataset/audio/segments/test1_segment_8.wav +2 -2
- dataset/audio/segments/test1_segment_9.wav +2 -2
- dataset/transcripts/test1_segment_1_20250423_162821.json +177 -0
- vad/audio_processor.py +9 -5
dataset/audio/segments/test1_segment_1.wav
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:28c322856596e185c7405464090742ad114d1c2830bbc2b86d410c252534bb14
|
3 |
+
size 21164
|
dataset/audio/segments/test1_segment_10.wav
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b29c94ca02c45ee8145dade0866f0f1c2451af7a11ee3b1b98b11fb867ec1a2a
|
3 |
+
size 97324
|
dataset/audio/segments/test1_segment_11.wav
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3c923e80fb6189b28fdcc6d901fe53c7e5af4410fd41bbe2ab49d1f7386c3df6
|
3 |
+
size 39084
|
dataset/audio/segments/test1_segment_12.wav
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:82984e994f0a59fe2cbaa02ec6805946b8b982e92d6d785395e472cc792eb5c8
|
3 |
+
size 95404
|
dataset/audio/segments/test1_segment_13.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3cbc350f2224afe3c79ea985efd38ec02ae67c7b7c645327a51004f82ecabd1f
|
3 |
+
size 35884
|
dataset/audio/segments/test1_segment_14.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bc8b91e09a1097f8477fe5180891d6fdbf9a150e52b983cf600fd33aa0adc298
|
3 |
+
size 32044
|
dataset/audio/segments/test1_segment_15.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:840350c64d3056851547ecd8a81952b87b064cc3d87e037f084aa9ce77d27c89
|
3 |
+
size 49324
|
dataset/audio/segments/test1_segment_16.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4d124dd2d7d788dfb01168267f124baa9d176a8656561dfc080aa402130055f0
|
3 |
+
size 113964
|
dataset/audio/segments/test1_segment_17.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9a9e2067b592e25358eb1a35728fb9319fd0472230f1184c1d11af6195e050d2
|
3 |
+
size 73644
|
dataset/audio/segments/test1_segment_2.wav
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8618632e403ddef0f05b160710ed6eab17a5d4b7042b5cdda435212c57e5efcf
|
3 |
+
size 39084
|
dataset/audio/segments/test1_segment_3.wav
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:63859aead158b66a043a6fd16496a49211ae3516378c894174adfccb04a218b7
|
3 |
+
size 22444
|
dataset/audio/segments/test1_segment_4.wav
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:04e1bb445589f502605ec58654c08f2e66958973ff49a9f408933bffb16b9ee0
|
3 |
+
size 116524
|
dataset/audio/segments/test1_segment_5.wav
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1acd47e3c4c68dcf959cf62d9faf988507e90f21145ca33a1d8829314d2d01a9
|
3 |
+
size 126124
|
dataset/audio/segments/test1_segment_6.wav
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b94d1a2e269cf95774b15fd75f43fc72ccd1a1d2e57985be0bb20334603532d7
|
3 |
+
size 46124
|
dataset/audio/segments/test1_segment_7.wav
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b075f47a9499e873430eaf99f5e0149e0f52e3fe7e1172be09fa4ce9eb56d7ae
|
3 |
+
size 42924
|
dataset/audio/segments/test1_segment_8.wav
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4eb3a808a08cdac833973dec3811d2bfac7258b6259dfaf5f7a0c69ed6cef898
|
3 |
+
size 129964
|
dataset/audio/segments/test1_segment_9.wav
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:15079fb8b683f5d8b3cad064bcd21488c3acf9837a2dc7095c922df3dd2a3f4b
|
3 |
+
size 61484
|
dataset/transcripts/test1_segment_1_20250423_162821.json
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "dataset/audio/segments\\test1_segment_1.wav",
|
3 |
+
"timestamp": "20250423_162821",
|
4 |
+
"segments": [
|
5 |
+
{
|
6 |
+
"text": "第一單元",
|
7 |
+
"start_time": 3.26,
|
8 |
+
"end_time": 3.94,
|
9 |
+
"confidence": 0.661865234375,
|
10 |
+
"verified": false,
|
11 |
+
"verified_text": null,
|
12 |
+
"verification_notes": null
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"text": "音频数据处理",
|
16 |
+
"start_time": 4.34,
|
17 |
+
"end_time": 6.34,
|
18 |
+
"confidence": 0.36962890625,
|
19 |
+
"verified": false,
|
20 |
+
"verified_text": null,
|
21 |
+
"verification_notes": null
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"text": "单元简介",
|
25 |
+
"start_time": 7.1,
|
26 |
+
"end_time": 7.859999999999999,
|
27 |
+
"confidence": 0.77685546875,
|
28 |
+
"verified": false,
|
29 |
+
"verified_text": null,
|
30 |
+
"verification_notes": null
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"text": "所有音频或语音相关的任务都需要使用音频文件",
|
34 |
+
"start_time": 8.8,
|
35 |
+
"end_time": 12.4,
|
36 |
+
"confidence": 0.9898605346679688,
|
37 |
+
"verified": false,
|
38 |
+
"verified_text": null,
|
39 |
+
"verification_notes": null
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"text": "在我们先入了解这些任务之前",
|
43 |
+
"start_time": 12.8,
|
44 |
+
"end_time": 14.8,
|
45 |
+
"confidence": 0.83056640625,
|
46 |
+
"verified": false,
|
47 |
+
"verified_text": null,
|
48 |
+
"verification_notes": null
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"text": "我们需要了解音频文件的实际内容",
|
52 |
+
"start_time": 14.8,
|
53 |
+
"end_time": 16.8,
|
54 |
+
"confidence": 0.83056640625,
|
55 |
+
"verified": false,
|
56 |
+
"verified_text": null,
|
57 |
+
"verification_notes": null
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"text": "以及如何利用一篇文件",
|
61 |
+
"start_time": 17.32,
|
62 |
+
"end_time": 18.72,
|
63 |
+
"confidence": 0.9420166015625,
|
64 |
+
"verified": false,
|
65 |
+
"verified_text": null,
|
66 |
+
"verification_notes": null
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"text": "本来员将会你介绍的",
|
70 |
+
"start_time": 19.76,
|
71 |
+
"end_time": 21.040000000000003,
|
72 |
+
"confidence": 0.90777587890625,
|
73 |
+
"verified": false,
|
74 |
+
"verified_text": null,
|
75 |
+
"verification_notes": null
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"text": "本单员将为你介绍于音频数据相关的基本概念",
|
79 |
+
"start_time": 21.62,
|
80 |
+
"end_time": 25.62,
|
81 |
+
"confidence": 0.92706298828125,
|
82 |
+
"verified": false,
|
83 |
+
"verified_text": null,
|
84 |
+
"verification_notes": null
|
85 |
+
},
|
86 |
+
{
|
87 |
+
"text": "包括波形、彩虹、綠和平不圖",
|
88 |
+
"start_time": 26.28,
|
89 |
+
"end_time": 28.28,
|
90 |
+
"confidence": 0.944183349609375,
|
91 |
+
"verified": false,
|
92 |
+
"verified_text": null,
|
93 |
+
"verification_notes": null
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"text": "你会学习到如何使用音频输语集",
|
97 |
+
"start_time": 28.56,
|
98 |
+
"end_time": 30.36,
|
99 |
+
"confidence": 0.98211669921875,
|
100 |
+
"verified": false,
|
101 |
+
"verified_text": null,
|
102 |
+
"verification_notes": null
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"text": "包括音频输语夹仔",
|
106 |
+
"start_time": 30.36,
|
107 |
+
"end_time": 31.64,
|
108 |
+
"confidence": 0.98211669921875,
|
109 |
+
"verified": false,
|
110 |
+
"verified_text": null,
|
111 |
+
"verification_notes": null
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"text": "音频书记预处理一期",
|
115 |
+
"start_time": 31.98,
|
116 |
+
"end_time": 33.26,
|
117 |
+
"confidence": 0.9024658203125,
|
118 |
+
"verified": false,
|
119 |
+
"verified_text": null,
|
120 |
+
"verification_notes": null
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"text": "高效加载大规模音频数级的流适加载方法",
|
124 |
+
"start_time": 33.54,
|
125 |
+
"end_time": 36.54,
|
126 |
+
"confidence": 0.94915771484375,
|
127 |
+
"verified": false,
|
128 |
+
"verified_text": null,
|
129 |
+
"verification_notes": null
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"text": "完成本單元的學期後",
|
133 |
+
"start_time": 37.82,
|
134 |
+
"end_time": 38.98,
|
135 |
+
"confidence": 0.9828643798828125,
|
136 |
+
"verified": false,
|
137 |
+
"verified_text": null,
|
138 |
+
"verification_notes": null
|
139 |
+
},
|
140 |
+
{
|
141 |
+
"text": "你会掌握",
|
142 |
+
"start_time": 39.34,
|
143 |
+
"end_time": 40.34,
|
144 |
+
"confidence": 0.8743896484375,
|
145 |
+
"verified": false,
|
146 |
+
"verified_text": null,
|
147 |
+
"verification_notes": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"text": "基础的音频相关数",
|
151 |
+
"start_time": 40.86,
|
152 |
+
"end_time": 42.46,
|
153 |
+
"confidence": 0.75927734375,
|
154 |
+
"verified": false,
|
155 |
+
"verified_text": null,
|
156 |
+
"verification_notes": null
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"text": "并且掌握针对不同应用的音频数据处理工具",
|
160 |
+
"start_time": 43.04,
|
161 |
+
"end_time": 46.64,
|
162 |
+
"confidence": 0.9911956787109375,
|
163 |
+
"verified": false,
|
164 |
+
"verified_text": null,
|
165 |
+
"verification_notes": null
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"text": "本难援的知识会成为后面章节的基础",
|
169 |
+
"start_time": 47.5,
|
170 |
+
"end_time": 49.94,
|
171 |
+
"confidence": 0.954833984375,
|
172 |
+
"verified": false,
|
173 |
+
"verified_text": null,
|
174 |
+
"verification_notes": null
|
175 |
+
}
|
176 |
+
]
|
177 |
+
}
|
vad/audio_processor.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import numpy as np
|
2 |
import soundfile as sf
|
3 |
from typing import List, Tuple, Optional, Dict
|
@@ -18,11 +19,11 @@ class AudioSegment:
|
|
18 |
class AudioProcessor:
|
19 |
def __init__(self,
|
20 |
sample_rate: int = 16000,
|
21 |
-
frame_duration_ms: int =
|
22 |
-
vad_level: int =
|
23 |
-
min_silence_duration: float = 0.
|
24 |
-
min_speech_duration: float =
|
25 |
-
amplitude_threshold: float = 0.
|
26 |
"""
|
27 |
初始化音频处理器
|
28 |
|
@@ -160,6 +161,9 @@ class AudioProcessor:
|
|
160 |
print(f"总帧数: {total_frames}")
|
161 |
print(f"语音帧数: {speech_frames}")
|
162 |
print(f"检测到的语音片段数: {len(segments)}")
|
|
|
|
|
|
|
163 |
|
164 |
# 保存中间结果到临时文件
|
165 |
temp_dir = "../dataset/audio/temp"
|
|
|
1 |
+
import sys
|
2 |
import numpy as np
|
3 |
import soundfile as sf
|
4 |
from typing import List, Tuple, Optional, Dict
|
|
|
19 |
class AudioProcessor:
|
20 |
def __init__(self,
|
21 |
sample_rate: int = 16000,
|
22 |
+
frame_duration_ms: int = 20,
|
23 |
+
vad_level: int = 0, # 降低VAD灵敏度
|
24 |
+
min_silence_duration: float = 0.3, # 静音持续时间
|
25 |
+
min_speech_duration: float = 0.3, # 增加最小语音持续时间,确保完整句子
|
26 |
+
amplitude_threshold: float = 0.0015): # 进一步降低振幅阈值
|
27 |
"""
|
28 |
初始化音频处理器
|
29 |
|
|
|
161 |
print(f"总帧数: {total_frames}")
|
162 |
print(f"语音帧数: {speech_frames}")
|
163 |
print(f"检测到的语音片段数: {len(segments)}")
|
164 |
+
# print("中断程序以进行调试")
|
165 |
+
# sys.exit(1)
|
166 |
+
|
167 |
|
168 |
# 保存中间结果到临时文件
|
169 |
temp_dir = "../dataset/audio/temp"
|