Spaces:
Sleeping
Sleeping
Update transcription.py
Browse files- __pycache__/process.cpython-310.pyc +0 -0
- __pycache__/transcription.cpython-310.pyc +0 -0
- app.py +4 -6
- process.py +31 -4
- transcription.py +93 -8
__pycache__/process.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/process.cpython-310.pyc and b/__pycache__/process.cpython-310.pyc differ
|
|
|
__pycache__/transcription.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/transcription.cpython-310.pyc and b/__pycache__/transcription.cpython-310.pyc differ
|
|
|
app.py
CHANGED
|
@@ -123,8 +123,7 @@ def transcription():
|
|
| 123 |
try:
|
| 124 |
if not total_audio or not os.path.exists(total_audio):
|
| 125 |
return jsonify({"error": "No audio segments provided"}), 400
|
| 126 |
-
|
| 127 |
-
transcription_text = transcripter.create_transcription(audio_directory)
|
| 128 |
print("transcription")
|
| 129 |
print(transcription_text)
|
| 130 |
except Exception as e:
|
|
@@ -336,10 +335,9 @@ def upload_audio():
|
|
| 336 |
user_rates = {users[i]: rates[i] for i in range(len(users))}
|
| 337 |
return jsonify({"rates": rates, "user_rates": user_rates}), 200
|
| 338 |
else:
|
| 339 |
-
matched_time, unmatched_time,
|
| 340 |
-
total_audio = transcripter.
|
| 341 |
print("単一ユーザーの処理")
|
| 342 |
-
print(total_audio)
|
| 343 |
total_time = matched_time + unmatched_time
|
| 344 |
rate = (matched_time / total_time) * 100 if total_time > 0 else 0
|
| 345 |
return jsonify({"rate": rate, "user": users[0]}), 200
|
|
@@ -382,7 +380,7 @@ def reset():
|
|
| 382 |
# 一時ディレクトリのクリーンアップ
|
| 383 |
if total_audio:
|
| 384 |
process.delete_files_in_directory(total_audio)
|
| 385 |
-
|
| 386 |
|
| 387 |
# 書き起こしテキストの削除
|
| 388 |
if os.path.exists(transcription_text):
|
|
|
|
| 123 |
try:
|
| 124 |
if not total_audio or not os.path.exists(total_audio):
|
| 125 |
return jsonify({"error": "No audio segments provided"}), 400
|
| 126 |
+
transcription_text = transcripter.create_transcription(total_audio)
|
|
|
|
| 127 |
print("transcription")
|
| 128 |
print(transcription_text)
|
| 129 |
except Exception as e:
|
|
|
|
| 335 |
user_rates = {users[i]: rates[i] for i in range(len(users))}
|
| 336 |
return jsonify({"rates": rates, "user_rates": user_rates}), 200
|
| 337 |
else:
|
| 338 |
+
matched_time, unmatched_time, merged_segments = process.process_audio(reference_paths[0], audio_path, threshold=0.05)
|
| 339 |
+
total_audio = transcripter.save_marged_segments(merged_segments)
|
| 340 |
print("単一ユーザーの処理")
|
|
|
|
| 341 |
total_time = matched_time + unmatched_time
|
| 342 |
rate = (matched_time / total_time) * 100 if total_time > 0 else 0
|
| 343 |
return jsonify({"rate": rate, "user": users[0]}), 200
|
|
|
|
| 380 |
# 一時ディレクトリのクリーンアップ
|
| 381 |
if total_audio:
|
| 382 |
process.delete_files_in_directory(total_audio)
|
| 383 |
+
process.delete_files_in_directory('/tmp/data/transcription_audio')
|
| 384 |
|
| 385 |
# 書き起こしテキストの削除
|
| 386 |
if os.path.exists(transcription_text):
|
process.py
CHANGED
|
@@ -250,6 +250,10 @@ class AudioProcessor():
|
|
| 250 |
入力音声からリファレンス音声に類似したセグメントを抽出する
|
| 251 |
|
| 252 |
Parameters:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
reference_path (str): リファレンス音声のパス
|
| 254 |
input_path (str): 入力音声のパス
|
| 255 |
output_folder (str): 類似セグメントを保存するディレクトリ
|
|
@@ -257,8 +261,14 @@ class AudioProcessor():
|
|
| 257 |
threshold (float): 類似度の閾値
|
| 258 |
|
| 259 |
Returns:
|
| 260 |
-
tuple: (マッチした時間(ミリ秒), マッチしなかった時間(ミリ秒),
|
| 261 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
try:
|
| 263 |
# リファレンス音声のエンベディングを計算(長さを標準化)
|
| 264 |
reference_embedding = self.calculate_embedding(reference_path)
|
|
@@ -294,15 +304,32 @@ class AudioProcessor():
|
|
| 294 |
if similarity > threshold:
|
| 295 |
shutil.copy(segment_file, output_folder)
|
| 296 |
matched_time_ms += len(AudioSegment.from_file(segment_file))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
except Exception as e:
|
| 298 |
print(f"セグメント {file} の類似度計算でエラーが発生しました: {e}")
|
| 299 |
-
|
|
|
|
|
|
|
|
|
|
| 300 |
unmatched_time_ms = total_duration_ms - matched_time_ms
|
| 301 |
-
return matched_time_ms, unmatched_time_ms,
|
| 302 |
|
| 303 |
except Exception as e:
|
| 304 |
print(f"音声処理でエラーが発生しました: {e}")
|
| 305 |
-
return 0, 0,
|
| 306 |
|
| 307 |
def process_multi_audio(self, reference_pathes, input_path, output_folder='/tmp/data/matched_multi_segments', seg_duration=1.0, threshold=0.5):
|
| 308 |
"""
|
|
|
|
| 250 |
入力音声からリファレンス音声に類似したセグメントを抽出する
|
| 251 |
|
| 252 |
Parameters:
|
| 253 |
+
isSpeaking(bool): 現在のセグメントがリファレンス音声と類似しているか
|
| 254 |
+
wasSpeaking(bool): 1つ前のセグメントがリファレンス音声と類似しているか
|
| 255 |
+
current_segment(list): 一致している、または一致しない話者のセグメントのストック
|
| 256 |
+
merged_segments(list): 要素は(一致するか(bool), セグメントのリスト)。書き起こしに利用。
|
| 257 |
reference_path (str): リファレンス音声のパス
|
| 258 |
input_path (str): 入力音声のパス
|
| 259 |
output_folder (str): 類似セグメントを保存するディレクトリ
|
|
|
|
| 261 |
threshold (float): 類似度の閾値
|
| 262 |
|
| 263 |
Returns:
|
| 264 |
+
tuple: (マッチした時間(ミリ秒), マッチしなかった時間(ミリ秒), 分類済みのセグメント)
|
| 265 |
"""
|
| 266 |
+
|
| 267 |
+
isSpeaking = None
|
| 268 |
+
wasSpeaking = None
|
| 269 |
+
current_segment=[]
|
| 270 |
+
merged_segments=[]
|
| 271 |
+
|
| 272 |
try:
|
| 273 |
# リファレンス音声のエンベディングを計算(長さを標準化)
|
| 274 |
reference_embedding = self.calculate_embedding(reference_path)
|
|
|
|
| 304 |
if similarity > threshold:
|
| 305 |
shutil.copy(segment_file, output_folder)
|
| 306 |
matched_time_ms += len(AudioSegment.from_file(segment_file))
|
| 307 |
+
isSpeaking = True
|
| 308 |
+
else:
|
| 309 |
+
isSpeaking = False
|
| 310 |
+
|
| 311 |
+
# 話者が変わった場合、保存
|
| 312 |
+
if wasSpeaking != isSpeaking:
|
| 313 |
+
if current_segment:
|
| 314 |
+
merged_segments.append((wasSpeaking, current_segment))
|
| 315 |
+
wasSpeaking = isSpeaking
|
| 316 |
+
current_segment = [segment_file]
|
| 317 |
+
# 変わらなかった場合、結合
|
| 318 |
+
else:
|
| 319 |
+
current_segment.append(segment_file)
|
| 320 |
+
|
| 321 |
except Exception as e:
|
| 322 |
print(f"セグメント {file} の類似度計算でエラーが発生しました: {e}")
|
| 323 |
+
# 余りを保存
|
| 324 |
+
if current_segment:
|
| 325 |
+
merged_segments.append((wasSpeaking, current_segment))
|
| 326 |
+
|
| 327 |
unmatched_time_ms = total_duration_ms - matched_time_ms
|
| 328 |
+
return matched_time_ms, unmatched_time_ms, merged_segments
|
| 329 |
|
| 330 |
except Exception as e:
|
| 331 |
print(f"音声処理でエラーが発生しました: {e}")
|
| 332 |
+
return 0, 0, merged_segments
|
| 333 |
|
| 334 |
def process_multi_audio(self, reference_pathes, input_path, output_folder='/tmp/data/matched_multi_segments', seg_duration=1.0, threshold=0.5):
|
| 335 |
"""
|
transcription.py
CHANGED
|
@@ -22,41 +22,86 @@ class TranscriptionMaker():
|
|
| 22 |
|
| 23 |
#音声ファイルのディレクトリを受け取り、書き起こしファイルを作成する
|
| 24 |
def create_transcription(self,audio_directory):
|
| 25 |
-
|
| 26 |
|
| 27 |
#ディレクトリ内のファイルを全て取得
|
| 28 |
if not os.path.isdir(audio_directory):
|
| 29 |
raise ValueError(f"The specified path is not a valid directory: {audio_directory}")
|
| 30 |
-
audio_files =
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
| 33 |
if os.path.splitext(audio_file)[-1].lower() != '.wav':
|
| 34 |
continue
|
| 35 |
-
audio_path = os.path.join(
|
| 36 |
try:
|
| 37 |
segments,info = list(self.model.transcribe(audio_path))
|
| 38 |
except Exception as e:
|
| 39 |
print(f"Error transcripting file {audio_path}: {e}")
|
| 40 |
raise
|
| 41 |
sorted_segments = sorted(segments, key=lambda s: s.start)
|
|
|
|
| 42 |
for segment in sorted_segments:
|
| 43 |
results.append({
|
| 44 |
"start": segment.start,
|
| 45 |
"end": segment.end,
|
| 46 |
"text": segment.text
|
| 47 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
#ファイルの書き込み。ファイル名は"transcription.txt"
|
| 49 |
output_file=os.path.join(self.output_dir,"transcription.txt")
|
|
|
|
| 50 |
try:
|
| 51 |
with open(output_file,"w",encoding="utf-8") as f:
|
| 52 |
-
for result in
|
| 53 |
-
f.write(
|
| 54 |
except OSError as e:
|
| 55 |
print(f"Error writing transcription file: {e}")
|
| 56 |
raise
|
| 57 |
return output_file
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
#ファイル名が連続しているならくっつける
|
|
|
|
| 60 |
def merge_segments(self,segments_dir,output_dir = "/tmp/data/merged_segment"):
|
| 61 |
if not os.path.exists(output_dir):
|
| 62 |
os.makedirs(output_dir, exist_ok=True)
|
|
@@ -97,7 +142,47 @@ class TranscriptionMaker():
|
|
| 97 |
output_file = os.path.join(output_dir, self.generate_filename(3))
|
| 98 |
combined_audio.export(output_file, format='wav')
|
| 99 |
|
| 100 |
-
return output_dir
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
def generate_random_string(self,length):
|
| 103 |
letters = string.ascii_letters + string.digits
|
|
|
|
| 22 |
|
| 23 |
#音声ファイルのディレクトリを受け取り、書き起こしファイルを作成する
|
| 24 |
def create_transcription(self, audio_directory):
    """Transcribe every .wav file in *audio_directory* and write a speaker-labelled
    transcript to "<self.output_dir>/transcription.txt".

    The files are first ordered chronologically, grouped into consecutive
    per-speaker turns, re-saved as one merged file per turn, and each merged
    file is then transcribed as a single conversational turn.

    Parameters:
        audio_directory (str): directory containing the per-segment .wav files.

    Returns:
        str: path of the written transcription file.

    Raises:
        ValueError: if *audio_directory* is not a directory.
        OSError: if the transcription file cannot be written.
    """
    conversation = []

    if not os.path.isdir(audio_directory):
        raise ValueError(f"The specified path is not a valid directory: {audio_directory}")
    # Sort, merge consecutive same-speaker segments, and re-save them.
    audio_files = self.sort_audio_files_in_directory(audio_directory)
    merged_segments = self.combine_audio(audio_files)
    merged_audio_directory = self.save_marged_segments(merged_segments, output_directory='/tmp/data/transcription_audio')
    merged_files = self.sort_audio_files_in_directory(merged_audio_directory)

    for audio_file in merged_files:
        if os.path.splitext(audio_file)[-1].lower() != '.wav':
            continue
        # NOTE(review): merged_files already holds absolute paths, so this join
        # simply returns audio_file unchanged — kept for safety.
        audio_path = os.path.join(merged_audio_directory, audio_file)
        try:
            segments, info = list(self.model.transcribe(audio_path))
        except Exception as e:
            print(f"Error transcripting file {audio_path}: {e}")
            raise
        sorted_segments = sorted(segments, key=lambda s: s.start)
        results = []
        for segment in sorted_segments:
            results.append({
                "start": segment.start,
                "end": segment.end,
                "text": segment.text
            })
        combined_text = "".join([result["text"] for result in results])
        # The speaker label is the part of the file name before the first "_".
        speaker = os.path.basename(audio_file).split("_")[0]
        # Skip silent (empty-text) turns.
        if not combined_text:
            continue
        conversation.append(f"{speaker}: {combined_text}")

    # Write the transcript; the file name is "transcription.txt".
    output_file = os.path.join(self.output_dir, "transcription.txt")
    print(conversation)
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            for result in conversation:
                # BUG FIX: the original wrote turns with no separator, so the
                # whole conversation ended up on a single line.
                f.write(result + "\n")
    except OSError as e:
        print(f"Error writing transcription file: {e}")
        raise
    return output_file
| 70 |
|
| 71 |
+
def combine_audio(self, audio_files):
    """Group consecutive audio files into per-speaker turns.

    The speaker label is the part of each file name before the first "_".

    Parameters:
        audio_files (list[str]): chronologically ordered audio file paths.

    Returns:
        list[tuple[str, list[str]]]: (speaker, files of that consecutive turn),
        in the original order.

    Raises:
        ValueError: if *audio_files* is empty.
    """
    if not audio_files:
        # BUG FIX: a bare `raise` outside an except block raises
        # "RuntimeError: No active exception to re-raise" — raise a real error.
        raise ValueError("audio_files must not be empty")
    merged_segments = []
    current_speaker = None
    current_segment = []
    for segment in audio_files:
        speaker = os.path.basename(segment).split("_")[0]
        if speaker != current_speaker:
            # Speaker changed: flush the previous turn.
            if current_segment:
                merged_segments.append((current_speaker, current_segment))
            current_speaker = speaker
            current_segment = [segment]
        else:
            # Same speaker: extend the current turn.
            current_segment.append(segment)
    # Flush the final turn.
    if current_segment:
        merged_segments.append((current_speaker, current_segment))

    return merged_segments
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# ディレクトリ内の音声ファイルを並べ替える
|
| 96 |
+
def sort_audio_files_in_directory(self, directory):
|
| 97 |
+
files = os.listdir(directory)
|
| 98 |
+
audio_files = [f for f in files if f.endswith(".wav")]
|
| 99 |
+
|
| 100 |
+
audio_files.sort(key=lambda x: datetime.strptime(x.split("_")[1].split(".")[0], "%Y%m%d%H%M%S"))
|
| 101 |
+
return [os.path.join(directory, f) for f in audio_files]
|
| 102 |
+
|
| 103 |
#ファイル名が連続しているならくっつける
|
| 104 |
+
'''
|
| 105 |
def merge_segments(self,segments_dir,output_dir = "/tmp/data/merged_segment"):
|
| 106 |
if not os.path.exists(output_dir):
|
| 107 |
os.makedirs(output_dir, exist_ok=True)
|
|
|
|
| 142 |
output_file = os.path.join(output_dir, self.generate_filename(3))
|
| 143 |
combined_audio.export(output_file, format='wav')
|
| 144 |
|
| 145 |
+
return output_dir'''
|
| 146 |
+
|
| 147 |
+
def save_marged_segments(self, merged_segments, output_directory='/tmp/data/conversations'):
    """Combine each (speaker, segments) turn into one .wav file and save it.

    NOTE(review): "marged" is a typo for "merged", kept because external
    callers use this name.

    Parameters:
        merged_segments (list[tuple[str, list]]): (speaker label, segments of
            that turn) — segments may be file paths or AudioSegment objects.
        output_directory (str): directory the combined files are written to;
            created if missing.

    Returns:
        str: *output_directory*.

    Raises:
        ValueError: if *merged_segments* is empty.
    """
    # Local import so the (unseen) module-level import block stays untouched.
    from datetime import timedelta

    if not merged_segments:
        print("merged_segmentsが見つかりませんでした。")
        # BUG FIX: a bare `raise` here raised
        # "RuntimeError: No active exception to re-raise".
        raise ValueError("merged_segments must not be empty")

    conversation = []
    for speaker, segments in merged_segments:
        combined_audio = self.merge_segments(segments)
        conversation.append((speaker, combined_audio))
    os.makedirs(output_directory, exist_ok=True)

    base_time = datetime.now()
    for i, (speaker, combined_audio) in enumerate(conversation):
        # BUG FIX: the original stamped every file with the same strftime
        # second, so later turns of a repeated speaker silently overwrote
        # earlier files. Offsetting by the turn index keeps each name unique
        # AND keeps sort_audio_files_in_directory's chronological order.
        current_time = (base_time + timedelta(seconds=i)).strftime("%Y%m%d%H%M%S")
        filename = f"{speaker}_{current_time}.wav"
        file_path = os.path.join(output_directory, filename)
        combined_audio.export(file_path, format="wav")
        print(f"Saved: {file_path}")

    return output_directory
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def merge_segments(self, segments):
    """Concatenate audio segments into a single AudioSegment.

    Parameters:
        segments (list): each item is either a file path (str) or an
            AudioSegment instance.

    Returns:
        AudioSegment: all segments joined end to end.

    Raises:
        ValueError: if an item is neither a path nor an AudioSegment.
    """
    combined = AudioSegment.empty()

    for item in segments:
        if isinstance(item, AudioSegment):
            # Already decoded audio — use it as-is.
            piece = item
        elif isinstance(item, str):
            # A file path — decode it from disk first.
            piece = AudioSegment.from_file(item)
        else:
            raise ValueError("Invalid segment type. Must be file path or AudioSegment.")
        combined = combined + piece

    return combined
|
| 184 |
+
|
| 185 |
+
|
| 186 |
|
| 187 |
def generate_random_string(self,length):
|
| 188 |
letters = string.ascii_letters + string.digits
|