tianyaogavin commited on
Commit
d85221f
·
1 Parent(s): 7fc9887

modify param

Browse files
dataset/audio/segments/test1_segment_1.wav CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:877aee33d778b34af2f0b819ac822d80316e97b73cb3823c1f436dbef8efcb0e
3
- size 35564
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28c322856596e185c7405464090742ad114d1c2830bbc2b86d410c252534bb14
3
+ size 21164
dataset/audio/segments/test1_segment_10.wav CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3135d983a5260d846e6cf165583efa3a0ef379bd86c885e678a63b41f66f548b
3
- size 48044
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b29c94ca02c45ee8145dade0866f0f1c2451af7a11ee3b1b98b11fb867ec1a2a
3
+ size 97324
dataset/audio/segments/test1_segment_11.wav CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a966cbb2e82ebd278692adad509a18061306b73b715fc4a93468c27ed61627b
3
- size 111404
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c923e80fb6189b28fdcc6d901fe53c7e5af4410fd41bbe2ab49d1f7386c3df6
3
+ size 39084
dataset/audio/segments/test1_segment_12.wav CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:52cfbcdc17cc5f190df467310f1a91c89e27f79662b2ce13f4ff5ec07015afec
3
- size 71084
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82984e994f0a59fe2cbaa02ec6805946b8b982e92d6d785395e472cc792eb5c8
3
+ size 95404
dataset/audio/segments/test1_segment_13.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cbc350f2224afe3c79ea985efd38ec02ae67c7b7c645327a51004f82ecabd1f
3
+ size 35884
dataset/audio/segments/test1_segment_14.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc8b91e09a1097f8477fe5180891d6fdbf9a150e52b983cf600fd33aa0adc298
3
+ size 32044
dataset/audio/segments/test1_segment_15.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:840350c64d3056851547ecd8a81952b87b064cc3d87e037f084aa9ce77d27c89
3
+ size 49324
dataset/audio/segments/test1_segment_16.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d124dd2d7d788dfb01168267f124baa9d176a8656561dfc080aa402130055f0
3
+ size 113964
dataset/audio/segments/test1_segment_17.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a9e2067b592e25358eb1a35728fb9319fd0472230f1184c1d11af6195e050d2
3
+ size 73644
dataset/audio/segments/test1_segment_2.wav CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81375721eb3a532941083c9781f53f5e0f1ccbe1ef4108f98a019de400f5c564
3
- size 117164
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8618632e403ddef0f05b160710ed6eab17a5d4b7042b5cdda435212c57e5efcf
3
+ size 39084
dataset/audio/segments/test1_segment_3.wav CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd6120ff04e7365640b9e3a1fb062bc1c31ce0dc54904bd27e25ac5a0b068cde
3
- size 149804
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63859aead158b66a043a6fd16496a49211ae3516378c894174adfccb04a218b7
3
+ size 22444
dataset/audio/segments/test1_segment_4.wav CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99bc0d18ffd0d10742b8d6b5450e537eccd1497c2247e714fa8efe6beb602abd
3
- size 41324
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04e1bb445589f502605ec58654c08f2e66958973ff49a9f408933bffb16b9ee0
3
+ size 116524
dataset/audio/segments/test1_segment_5.wav CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a9e2196db3537028898b87442f074523251b33219302e6eb8518fb33396c30bd
3
- size 122924
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1acd47e3c4c68dcf959cf62d9faf988507e90f21145ca33a1d8829314d2d01a9
3
+ size 126124
dataset/audio/segments/test1_segment_6.wav CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e929f7966a425a559b7442a2914cb99b0df74f1d02938264642dc71f160fc383
3
- size 113324
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b94d1a2e269cf95774b15fd75f43fc72ccd1a1d2e57985be0bb20334603532d7
3
+ size 46124
dataset/audio/segments/test1_segment_7.wav CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:65d800356647c415d80e59fac63db01df31ce51a497aacf43f98aa0e6ec468cb
3
- size 77804
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b075f47a9499e873430eaf99f5e0149e0f52e3fe7e1172be09fa4ce9eb56d7ae
3
+ size 42924
dataset/audio/segments/test1_segment_8.wav CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1c574a7c20332f85c6260febf6eae232473a798404ca29f1b54ac39e5b2d35c
3
- size 91244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4eb3a808a08cdac833973dec3811d2bfac7258b6259dfaf5f7a0c69ed6cef898
3
+ size 129964
dataset/audio/segments/test1_segment_9.wav CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f943b20eb3aafa0befb884f5d125e0596d3f419d8a3c5546ff3cf878603c36b8
3
- size 67244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15079fb8b683f5d8b3cad064bcd21488c3acf9837a2dc7095c922df3dd2a3f4b
3
+ size 61484
dataset/transcripts/test1_segment_1_20250423_162821.json ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "dataset/audio/segments\\test1_segment_1.wav",
3
+ "timestamp": "20250423_162821",
4
+ "segments": [
5
+ {
6
+ "text": "第一單元",
7
+ "start_time": 3.26,
8
+ "end_time": 3.94,
9
+ "confidence": 0.661865234375,
10
+ "verified": false,
11
+ "verified_text": null,
12
+ "verification_notes": null
13
+ },
14
+ {
15
+ "text": "音频数据处理",
16
+ "start_time": 4.34,
17
+ "end_time": 6.34,
18
+ "confidence": 0.36962890625,
19
+ "verified": false,
20
+ "verified_text": null,
21
+ "verification_notes": null
22
+ },
23
+ {
24
+ "text": "单元简介",
25
+ "start_time": 7.1,
26
+ "end_time": 7.859999999999999,
27
+ "confidence": 0.77685546875,
28
+ "verified": false,
29
+ "verified_text": null,
30
+ "verification_notes": null
31
+ },
32
+ {
33
+ "text": "所有音频或语音相关的任务都需要使用音频文件",
34
+ "start_time": 8.8,
35
+ "end_time": 12.4,
36
+ "confidence": 0.9898605346679688,
37
+ "verified": false,
38
+ "verified_text": null,
39
+ "verification_notes": null
40
+ },
41
+ {
42
+ "text": "在我们先入了解这些任务之前",
43
+ "start_time": 12.8,
44
+ "end_time": 14.8,
45
+ "confidence": 0.83056640625,
46
+ "verified": false,
47
+ "verified_text": null,
48
+ "verification_notes": null
49
+ },
50
+ {
51
+ "text": "我们需要了解音频文件的实际内容",
52
+ "start_time": 14.8,
53
+ "end_time": 16.8,
54
+ "confidence": 0.83056640625,
55
+ "verified": false,
56
+ "verified_text": null,
57
+ "verification_notes": null
58
+ },
59
+ {
60
+ "text": "以及如何利用一篇文件",
61
+ "start_time": 17.32,
62
+ "end_time": 18.72,
63
+ "confidence": 0.9420166015625,
64
+ "verified": false,
65
+ "verified_text": null,
66
+ "verification_notes": null
67
+ },
68
+ {
69
+ "text": "本来员将会你介绍的",
70
+ "start_time": 19.76,
71
+ "end_time": 21.040000000000003,
72
+ "confidence": 0.90777587890625,
73
+ "verified": false,
74
+ "verified_text": null,
75
+ "verification_notes": null
76
+ },
77
+ {
78
+ "text": "本单员将为你介绍于音频数据相关的基本概念",
79
+ "start_time": 21.62,
80
+ "end_time": 25.62,
81
+ "confidence": 0.92706298828125,
82
+ "verified": false,
83
+ "verified_text": null,
84
+ "verification_notes": null
85
+ },
86
+ {
87
+ "text": "包括波形、彩虹、綠和平不圖",
88
+ "start_time": 26.28,
89
+ "end_time": 28.28,
90
+ "confidence": 0.944183349609375,
91
+ "verified": false,
92
+ "verified_text": null,
93
+ "verification_notes": null
94
+ },
95
+ {
96
+ "text": "你会学习到如何使用音频输语集",
97
+ "start_time": 28.56,
98
+ "end_time": 30.36,
99
+ "confidence": 0.98211669921875,
100
+ "verified": false,
101
+ "verified_text": null,
102
+ "verification_notes": null
103
+ },
104
+ {
105
+ "text": "包括音频输语夹仔",
106
+ "start_time": 30.36,
107
+ "end_time": 31.64,
108
+ "confidence": 0.98211669921875,
109
+ "verified": false,
110
+ "verified_text": null,
111
+ "verification_notes": null
112
+ },
113
+ {
114
+ "text": "音频书记预处理一期",
115
+ "start_time": 31.98,
116
+ "end_time": 33.26,
117
+ "confidence": 0.9024658203125,
118
+ "verified": false,
119
+ "verified_text": null,
120
+ "verification_notes": null
121
+ },
122
+ {
123
+ "text": "高效加载大规模音频数级的流适加载方法",
124
+ "start_time": 33.54,
125
+ "end_time": 36.54,
126
+ "confidence": 0.94915771484375,
127
+ "verified": false,
128
+ "verified_text": null,
129
+ "verification_notes": null
130
+ },
131
+ {
132
+ "text": "完成本單元的學期後",
133
+ "start_time": 37.82,
134
+ "end_time": 38.98,
135
+ "confidence": 0.9828643798828125,
136
+ "verified": false,
137
+ "verified_text": null,
138
+ "verification_notes": null
139
+ },
140
+ {
141
+ "text": "你会掌握",
142
+ "start_time": 39.34,
143
+ "end_time": 40.34,
144
+ "confidence": 0.8743896484375,
145
+ "verified": false,
146
+ "verified_text": null,
147
+ "verification_notes": null
148
+ },
149
+ {
150
+ "text": "基础的音频相关数",
151
+ "start_time": 40.86,
152
+ "end_time": 42.46,
153
+ "confidence": 0.75927734375,
154
+ "verified": false,
155
+ "verified_text": null,
156
+ "verification_notes": null
157
+ },
158
+ {
159
+ "text": "并且掌握针对不同应用的音频数据处理工具",
160
+ "start_time": 43.04,
161
+ "end_time": 46.64,
162
+ "confidence": 0.9911956787109375,
163
+ "verified": false,
164
+ "verified_text": null,
165
+ "verification_notes": null
166
+ },
167
+ {
168
+ "text": "本难援的知识会成为后面章节的基础",
169
+ "start_time": 47.5,
170
+ "end_time": 49.94,
171
+ "confidence": 0.954833984375,
172
+ "verified": false,
173
+ "verified_text": null,
174
+ "verification_notes": null
175
+ }
176
+ ]
177
+ }
vad/audio_processor.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import numpy as np
2
  import soundfile as sf
3
  from typing import List, Tuple, Optional, Dict
@@ -18,11 +19,11 @@ class AudioSegment:
18
  class AudioProcessor:
19
  def __init__(self,
20
  sample_rate: int = 16000,
21
- frame_duration_ms: int = 30,
22
- vad_level: int = 1, # 降低VAD灵敏度
23
- min_silence_duration: float = 0.5, # 静音持续时间
24
- min_speech_duration: float = 1.0, # 增加最小语音持续时间,确保完整句子
25
- amplitude_threshold: float = 0.003): # 进一步降低振幅阈值
26
  """
27
  初始化音频处理器
28
 
@@ -160,6 +161,9 @@ class AudioProcessor:
160
  print(f"总帧数: {total_frames}")
161
  print(f"语音帧数: {speech_frames}")
162
  print(f"检测到的语音片段数: {len(segments)}")
 
 
 
163
 
164
  # 保存中间结果到临时文件
165
  temp_dir = "../dataset/audio/temp"
 
1
+ import sys
2
  import numpy as np
3
  import soundfile as sf
4
  from typing import List, Tuple, Optional, Dict
 
19
  class AudioProcessor:
20
  def __init__(self,
21
  sample_rate: int = 16000,
22
+ frame_duration_ms: int = 20,
23
+ vad_level: int = 0, # 降低VAD灵敏度
24
+ min_silence_duration: float = 0.3, # 静音持续时间
25
+ min_speech_duration: float = 0.3, # 增加最小语音持续时间,确保完整句子
26
+ amplitude_threshold: float = 0.0015): # 进一步降低振幅阈值
27
  """
28
  初始化音频处理器
29
 
 
161
  print(f"总帧数: {total_frames}")
162
  print(f"语音帧数: {speech_frames}")
163
  print(f"检测到的语音片段数: {len(segments)}")
164
+ # print("中断程序以进行调试")
165
+ # sys.exit(1)
166
+
167
 
168
  # 保存中间结果到临时文件
169
  temp_dir = "../dataset/audio/temp"