katospiegel committed
Commit 73b906e · 1 Parent(s): db46672

First Test

Files changed (7):
  1. README.md +25 -12
  2. app.py +62 -0
  3. audio.py +852 -0
  4. helpers.py +40 -0
  5. packages.txt +2 -0
  6. requirements.txt +17 -0
  7. transcription.py +218 -0
README.md CHANGED
@@ -1,12 +1,25 @@
- ---
- title: Amanu
- emoji: 🌍
- colorFrom: purple
- colorTo: indigo
- sdk: gradio
- sdk_version: 3.47.1
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # This repo's goal is to support the transcription and annotation of audio recordings.
+
+
+ ## Parts
+
+ - `audio.py`: Everything related to audio preprocessing and analysis.
+ - `transcription.py`: All code for transcribing audio with faster-whisper.
+ - `diarization.py`: Everything related to pyannote diarization.
+ - `textformatting.py`: Everything related to formatting the text into specific output formats.
+
+ ## UI parts
+
+ 1. Transcription.
+ 2. Diarization.
+ 3. Revision.
+ 4. Output formatting.
+
+ ## How to access the service?
+
+ Users will log in with a username and password that I specify; I will manage those credentials manually.
+
+ ## Pricing
+
+ 1. Calculate the fixed cost of a server running for a long period of time.
+ 2. Check whether I can use the hibernation period to save some money.
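The parts listed above are meant to chain into a single pipeline. Below is a minimal sketch (not part of this commit) of how the functions added in `audio.py`, `transcription.py`, and `helpers.py` could compose; `transcribe_file` and `example.wav` are illustrative placeholders.

```python
# Sketch only: mirrors the flow used by transcribe() in app.py below.
from audio import normalizeAudio, separateVoiceInstrumental, mp3_to_wav
from transcription import fast_transcription
from helpers import guardar_en_archivo

def transcribe_file(path, model="large-v2", language="es"):
    """Normalize -> split vocals/instrumental -> transcribe the vocals -> save a .txt transcript."""
    normalized = normalizeAudio(path, ".wav")                             # loudness-normalized WAV
    instrumental_mp3, vocals_mp3 = separateVoiceInstrumental(normalized)  # Demucs source separation
    vocals_wav = mp3_to_wav(vocals_mp3, "vocal")                          # faster-whisper reads the WAV
    lines = fast_transcription(vocals_wav, model, language)               # ["<start> <text>", ...]
    return guardar_en_archivo(lines)                                      # path of transcription_<timestamp>.txt

# transcript_path = transcribe_file("example.wav")  # placeholder filename
```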
app.py ADDED
@@ -0,0 +1,62 @@
+ import torch
+
+ import gradio as gr
+ from transformers import pipeline
+ from transformers.pipelines.audio_utils import ffmpeg_read
+
+ from transcription import fast_transcription, speech_to_text
+ from audio import normalizeAudio, separateVoiceInstrumental, mp3_to_wav, stereo_to_mono, cutaudio, compose_audio
+ from audio import overlay_audios, compose_audio, total_duration, append_wav_files
+ from helpers import guardar_en_archivo
+
+
+ def transcribe(audiofile, model):
+
+     audio_path = audiofile[0].name
+
+     audio_normalized_path = normalizeAudio(audio_path, ".wav")
+
+     novocal_path, vocal_path = separateVoiceInstrumental(audio_normalized_path)
+
+     novocal_path = mp3_to_wav(novocal_path, "novocal")
+     vocal_path = mp3_to_wav(vocal_path, "vocal")
+
+     out = fast_transcription(vocal_path, model, "es")
+     transcript = "\n".join(out)
+     # Save the transcript lines to a text file
+     nombre_archivo = guardar_en_archivo(out)
+
+     return audio_path, audio_normalized_path, vocal_path, novocal_path, transcript, nombre_archivo
+
+
+ transcribeI = gr.Interface(
+     fn=transcribe,
+     inputs=[
+         gr.File(label="Upload Files", file_count="multiple"),
+         gr.Radio(["base", "small", "medium", "large-v2"], label="Models", value="large-v2"),
+     ],
+     outputs=[gr.Audio(type="filepath", label="original"),
+              gr.Audio(type="filepath", label="normalized"),
+              gr.Audio(type="filepath", label="vocal"),
+              gr.Audio(type="filepath", label="no_vocal"),
+              gr.TextArea(label="Transcription"),
+              gr.File(label="Archivo generado")
+              ],
+     theme="huggingface",
+     title="Transcripción",
+     description=(
+         "Sound extraction, processing, and dialogue transcription.\n"
+         "Upload one or more audio files to transcribe.\n"
+     ),
+     allow_flagging="never",
+     #examples=[[None, "COSER-4004-01-00_5m.wav", "large-v2"]]
+
+ )
+
+ demo = gr.Blocks()
+ with demo:
+     gr.Markdown("# Dubbing")
+     gr.TabbedInterface([transcribeI], ["Transcripción"])
+
+ #demo.queue(concurrency_count=1).launch(enable_queue=True, auth=(os.environ['USER'], os.environ['PASSWORD']))
+ demo.launch(enable_queue=True)
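The commented-out `launch` call above already hints at the password login described in the README. A hedged sketch of enabling it, assuming it replaces the final two lines of `app.py` and that `USER` and `PASSWORD` are provided as environment variables (for example as Space secrets):

```python
# Sketch only: single-user login for the Gradio app, per the README's access section.
import os

demo.queue(concurrency_count=1)
demo.launch(
    enable_queue=True,
    auth=(os.environ["USER"], os.environ["PASSWORD"]),  # simple username/password gate
)
```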
audio.py ADDED
@@ -0,0 +1,852 @@
1
+ from utils import *
2
+ import datetime
3
+
4
+ from pydub import AudioSegment, effects
5
+
6
+
7
+ def normalizeAudio(file, format):
8
+ #https://stackoverflow.com/questions/42492246/how-to-normalize-the-volume-of-an-audio-file-in-python
9
+ rawsound = AudioSegment.from_file(file, format)
10
+ normalizedsound = effects.normalize(rawsound)
11
+ timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
12
+
13
+
14
+ output_file = f"normalized_{timestamp}.wav"
15
+ normalizedsound.export(output_file, format="wav")
16
+
17
+ return output_file
18
+
19
+
20
+ def mp3_to_wav(mp3_path, tag):
21
+ # Load the MP3 file
22
+ audio = AudioSegment.from_mp3(mp3_path)
23
+
24
+ outfile = mp3_path.split(".")[0] + tag +".wav"
25
+
26
+ # Export the audio in WAV format
27
+ audio.export(outfile, format="wav")
28
+
29
+ return outfile
30
+
31
+ def stereo_to_mono(wav_path):
32
+ # Load the stereo WAV file
33
+ audio = AudioSegment.from_wav(wav_path)
34
+
35
+ # Convert to mono
36
+ audio_mono = audio.set_channels(1)
37
+
38
+ # Export the audio in WAV format
39
+ audio_mono.export(wav_path, format="wav")
40
+
41
+ return wav_path
42
+
43
+ def cutaudio(audiopath, start_time, end_time):
44
+ audio = AudioSegment.from_wav(audiopath)[start_time:end_time]
45
+ exportname = str(start_time)+"_"+str(end_time)+".wav"
46
+ audio.export(exportname, format="wav")
47
+
48
+ return exportname
49
+
50
+
51
+ def compose_audio(audio_files, timestamps, output_file):
52
+ # Example usage:
53
+ # audio_files = ["clip1.wav", "clip2.wav", "clip3.wav"]
54
+ # timestamps = [0, 5000, 10000, 15000]  # in ms: clip1 starts at 0 s, clip2 at 5 s, clip3 at 10 s; output ends at 15 s
55
+ # output_file = "composed_audio.wav"
56
+ # compose_audio(audio_files, timestamps, output_file)
57
+
58
+ # Check if lengths are consistent
59
+ if len(audio_files) != len(timestamps) - 1:
60
+ raise ValueError("Number of timestamps should be one more than number of audio files")
61
+
62
+ # Load the first audio file
63
+ final_audio = AudioSegment.silent(duration=timestamps[0])
64
+
65
+ for i, audio_file in enumerate(audio_files):
66
+ # Load the audio clip
67
+ clip = AudioSegment.from_wav(audio_file) # Change this if you're using a different format
68
+
69
+ # Calculate the amount of silence needed before the clip
70
+ silence_duration = (timestamps[i + 1] - timestamps[i] - len(clip) ) # in milliseconds
71
+
72
+ if silence_duration < 0:
73
+ print(f"Warning: Clip {audio_file} is longer than the gap between timestamps {i} and {i + 1}. Trimming the audio.")
74
+ clip = clip[:timestamps[i + 1] - timestamps[i]] # Trim the clip
75
+ silence_duration = 0
76
+
77
+ final_audio += clip + AudioSegment.silent(duration=silence_duration)
78
+
79
+ # Export final audio
80
+ #timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
81
+
82
+ #output_file_time = f"{output_file}_{timestamp}.wav"
83
+ final_audio.export(output_file, format="wav")
84
+
85
+ return output_file
86
+
87
+ def append_wav_files(filenames, output_filename):
88
+ # Load the first WAV file
89
+ combined = AudioSegment.from_wav(filenames[0])
90
+
91
+ # Load each subsequent WAV file and append to the combined segment
92
+ for filename in filenames[1:]:
93
+ audio = AudioSegment.from_wav(filename)
94
+ combined += audio
95
+
96
+ # Export the combined audio
97
+ combined.export(output_filename, format="wav")
98
+
99
+ return output_filename
100
+
101
+ # def generateAudio(respuesta, elabs_key):
102
+
103
+ # user = ElevenLabsUser(elabs_key)
104
+ # premadeVoice = user.get_voices_by_name("Rachel")[0]
105
+ # playbackOptions = PlaybackOptions(runInBackground=False)
106
+ # generationOptions = GenerationOptions(model_id="eleven_multilingual_v1", stability=0.3, similarity_boost=0.7, style=0.6, #eleven_english_v2
107
+ # use_speaker_boost=True)
108
+ # audioData, historyID = premadeVoice.generate_audio_v2(respuesta, generationOptions)
109
+ # #generationData = premadeVoice.generate_play_audio_v2(text, PlaybackOptions(runInBackground=False), GenerationOptions(stability=0.4))
110
+
111
+ # filename = "output.wav"
112
+ # #Save them to disk, in ogg format (can be any format supported by SoundFile)
113
+ # save_audio_bytes(audioData, filename, outputFormat="wav")
114
+
115
+ # return filename
116
+
117
+ def overlay_audios(audio_paths, output_file):
118
+ # Load all the audios
119
+ audios = [AudioSegment.from_wav(path) for path in audio_paths] # assuming WAV format
120
+
121
+ # Find the length of the longest audio
122
+ max_length = max(audio.duration_seconds for audio in audios)
123
+
124
+ # Pad all audios to the length of the longest one
125
+ padded_audios = [audio + AudioSegment.silent(duration=(max_length - audio.duration_seconds) * 1000) for audio in audios]
126
+
127
+ # Start with the first padded audio
128
+ overlay_audio = padded_audios[0]
129
+
130
+ # Overlay the rest of the audios on top
131
+ for audio in padded_audios[1:]:
132
+ overlay_audio = overlay_audio.overlay(audio)
133
+
134
+ overlay_audio.export(output_file, format="wav")
135
+
136
+ return output_file
137
+
138
+ def total_duration(audiofile):
139
+ audiofile = Path(audiofile)
140
+ format = audiofile.suffix.replace(".","")
141
+ song = AudioSegment.from_file(audiofile, format=format)
142
+ #song = load_audio_segment(audiofile, audiofile.split(".")[-1])
143
+ n_msecs = len(song)
144
+ return n_msecs
145
+
146
+ ###########################################################################
147
+
148
+ def separateVoiceInstrumental(audiofile):
149
+
150
+ audiofile = Path(audiofile)
151
+ filename = audiofile.stem
152
+ format = audiofile.suffix.replace(".","")
153
+
154
+ song = AudioSegment.from_file(audiofile, format=format)
155
+ #song = load_audio_segment(audiofile, audiofile.split(".")[-1])
156
+ n_secs = round(len(song) / 1000)
157
+
158
+ start_time = 0
159
+ end_time = n_secs
160
+
161
+ model_name, file_sources = ("htdemucs", ["vocals.mp3", "no_vocals.mp3"])
162
+ out_path = Path("output")
163
+ stem = "vocals"
164
+
165
+
166
+ separator(
167
+ tracks=[audiofile],
168
+ out=out_path,
169
+ model=model_name,
170
+ shifts=1,
171
+ overlap=0.5,
172
+ stem=stem,
173
+ int24=False,
174
+ float32=False,
175
+ clip_mode="rescale",
176
+ mp3=True,
177
+ mp3_bitrate=320,
178
+ verbose=True,
179
+ start_time=start_time,
180
+ end_time=end_time,
181
+ )
182
+
183
+ instrumentalFile = f"output/htdemucs/{filename}/no_vocals.mp3"
184
+ voiceFile = f"output/htdemucs/{filename}/vocals.mp3"
185
+
186
+ return instrumentalFile, voiceFile
187
+
188
+
189
+ ################################################################################
190
+
191
+ import argparse
192
+ import sys
193
+ from pathlib import Path
194
+ from typing import List
195
+ import os
196
+ from dora.log import fatal
197
+ import torch as th
198
+
199
+ from demucs.apply import apply_model, BagOfModels
200
+ from demucs.audio import save_audio
201
+ from demucs.pretrained import get_model_from_args, ModelLoadingError
202
+ from demucs.separate import load_track
203
+
204
+ def separator(
205
+ tracks: List[Path],
206
+ out: Path,
207
+ model: str,
208
+ shifts: int,
209
+ overlap: float,
210
+ stem: str,
211
+ int24: bool,
212
+ float32: bool,
213
+ clip_mode: str,
214
+ mp3: bool,
215
+ mp3_bitrate: int,
216
+ verbose: bool,
217
+ *args,
218
+ **kwargs,
219
+ ):
220
+ """Separate the sources for the given tracks
221
+ Args:
222
+ tracks (Path): Path to tracks
223
+ out (Path): Folder where to put extracted tracks. A subfolder with the model name will be
224
+ created.
225
+ model (str): Model name
226
+ shifts (int): Number of random shifts for equivariant stabilization.
227
+ Increase separation time but improves quality for Demucs.
228
+ 10 was used in the original paper.
229
+ overlap (float): Overlap
230
+ stem (str): Only separate audio into {STEM} and no_{STEM}.
231
+ int24 (bool): Save wav output as 24 bits wav.
232
+ float32 (bool): Save wav output as float32 (2x bigger).
233
+ clip_mode (str): Strategy for avoiding clipping: rescaling entire signal if necessary
234
+ (rescale) or hard clipping (clamp).
235
+ mp3 (bool): Convert the output wavs to mp3.
236
+ mp3_bitrate (int): Bitrate of converted mp3.
237
+ verbose (bool): Verbose
238
+ """
239
+
240
+ if os.environ.get("LIMIT_CPU", False):
241
+ th.set_num_threads(1)
242
+ jobs = 1
243
+ else:
244
+ # Number of jobs. This can increase memory usage but will be much faster when
245
+ # multiple cores are available.
246
+ jobs = os.cpu_count()
247
+
248
+ if th.cuda.is_available():
249
+ device = "cuda"
250
+ else:
251
+ device = "cpu"
252
+ args = argparse.Namespace()
253
+ args.tracks = tracks
254
+ args.out = out
255
+ args.model = model
256
+ args.device = device
257
+ args.shifts = shifts
258
+ args.overlap = overlap
259
+ args.stem = stem
260
+ args.int24 = int24
261
+ args.float32 = float32
262
+ args.clip_mode = clip_mode
263
+ args.mp3 = mp3
264
+ args.mp3_bitrate = mp3_bitrate
265
+ args.jobs = jobs
266
+ args.verbose = verbose
267
+ args.filename = "{track}/{stem}.{ext}"
268
+ args.split = True
269
+ args.segment = None
270
+ args.name = model
271
+ args.repo = None
272
+
273
+ try:
274
+ model = get_model_from_args(args)
275
+ except ModelLoadingError as error:
276
+ fatal(error.args[0])
277
+
278
+ if args.segment is not None and args.segment < 8:
279
+ fatal("Segment must be greater than 8.")
280
+
281
+ if ".." in args.filename.replace("\\", "/").split("/"):
282
+ fatal('".." must not appear in filename. ')
283
+
284
+ if isinstance(model, BagOfModels):
285
+ print(
286
+ f"Selected model is a bag of {len(model.models)} models. "
287
+ "You will see that many progress bars per track."
288
+ )
289
+ if args.segment is not None:
290
+ for sub in model.models:
291
+ sub.segment = args.segment
292
+ else:
293
+ if args.segment is not None:
294
+ model.segment = args.segment
295
+
296
+ model.cpu()
297
+ model.eval()
298
+
299
+ if args.stem is not None and args.stem not in model.sources:
300
+ fatal(
301
+ 'error: stem "{stem}" is not in selected model. STEM must be one of {sources}.'.format(
302
+ stem=args.stem, sources=", ".join(model.sources)
303
+ )
304
+ )
305
+ out = args.out / args.name
306
+ out.mkdir(parents=True, exist_ok=True)
307
+ print(f"Separated tracks will be stored in {out.resolve()}")
308
+ for track in args.tracks:
309
+ if not track.exists():
310
+ print(
311
+ f"File {track} does not exist. If the path contains spaces, "
312
+ 'please try again after surrounding the entire path with quotes "".',
313
+ file=sys.stderr,
314
+ )
315
+ continue
316
+ print(f"Separating track {track}")
317
+ wav = load_track(track, model.audio_channels, model.samplerate)
318
+
319
+ ref = wav.mean(0)
320
+ wav = (wav - ref.mean()) / ref.std()
321
+ sources = apply_model(
322
+ model,
323
+ wav[None],
324
+ device=args.device,
325
+ shifts=args.shifts,
326
+ split=args.split,
327
+ overlap=args.overlap,
328
+ progress=True,
329
+ num_workers=args.jobs,
330
+ )[0]
331
+ sources = sources * ref.std() + ref.mean()
332
+
333
+ if args.mp3:
334
+ ext = "mp3"
335
+ else:
336
+ ext = "wav"
337
+ kwargs = {
338
+ "samplerate": model.samplerate,
339
+ "bitrate": args.mp3_bitrate,
340
+ "clip": args.clip_mode,
341
+ "as_float": args.float32,
342
+ "bits_per_sample": 24 if args.int24 else 16,
343
+ }
344
+ if args.stem is None:
345
+ for source, name in zip(sources, model.sources):
346
+ stem = out / args.filename.format(
347
+ track=track.name.rsplit(".", 1)[0],
348
+ trackext=track.name.rsplit(".", 1)[-1],
349
+ stem=name,
350
+ ext=ext,
351
+ )
352
+ stem.parent.mkdir(parents=True, exist_ok=True)
353
+ save_audio(source, str(stem), **kwargs)
354
+ else:
355
+ sources = list(sources)
356
+ stem = out / args.filename.format(
357
+ track=track.name.rsplit(".", 1)[0],
358
+ trackext=track.name.rsplit(".", 1)[-1],
359
+ stem=args.stem,
360
+ ext=ext,
361
+ )
362
+ stem.parent.mkdir(parents=True, exist_ok=True)
363
+ save_audio(sources.pop(model.sources.index(args.stem)), str(stem), **kwargs)
364
+ # Warning: after popping the stem, the selected stem is no longer in the list 'sources'
365
+ other_stem = th.zeros_like(sources[0])
366
+ for i in sources:
367
+ other_stem += i
368
+ stem = out / args.filename.format(
369
+ track=track.name.rsplit(".", 1)[0],
370
+ trackext=track.name.rsplit(".", 1)[-1],
371
+ stem="no_" + args.stem,
372
+ ext=ext,
373
+ )
374
+ stem.parent.mkdir(parents=True, exist_ok=True)
375
+ save_audio(other_stem, str(stem), **kwargs)
376
+
377
+
378
+ ##############################################################################
379
+
380
+ import os
381
+ import logging
382
+ import librosa
383
+ import numpy as np
384
+ import soundfile as sf
385
+ import torch
386
+ from pydub import AudioSegment
387
+
388
+ if os.environ.get("LIMIT_CPU", False):
389
+ torch.set_num_threads(1)
390
+
391
+
392
+ def merge_artifacts(y_mask, thres=0.05, min_range=64, fade_size=32):
393
+ if min_range < fade_size * 2:
394
+ raise ValueError("min_range must be >= fade_size * 2")
395
+
396
+ idx = np.where(y_mask.min(axis=(0, 1)) > thres)[0]
397
+ start_idx = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0])
398
+ end_idx = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1])
399
+ artifact_idx = np.where(end_idx - start_idx > min_range)[0]
400
+ weight = np.zeros_like(y_mask)
401
+ if len(artifact_idx) > 0:
402
+ start_idx = start_idx[artifact_idx]
403
+ end_idx = end_idx[artifact_idx]
404
+ old_e = None
405
+ for s, e in zip(start_idx, end_idx):
406
+ if old_e is not None and s - old_e < fade_size:
407
+ s = old_e - fade_size * 2
408
+
409
+ if s != 0:
410
+ weight[:, :, s : s + fade_size] = np.linspace(0, 1, fade_size)
411
+ else:
412
+ s -= fade_size
413
+
414
+ if e != y_mask.shape[2]:
415
+ weight[:, :, e - fade_size : e] = np.linspace(1, 0, fade_size)
416
+ else:
417
+ e += fade_size
418
+
419
+ weight[:, :, s + fade_size : e - fade_size] = 1
420
+ old_e = e
421
+
422
+ v_mask = 1 - y_mask
423
+ y_mask += weight * v_mask
424
+
425
+ return y_mask
426
+
427
+
428
+ def make_padding(width, cropsize, offset):
429
+ left = offset
430
+ roi_size = cropsize - offset * 2
431
+ if roi_size == 0:
432
+ roi_size = cropsize
433
+ right = roi_size - (width % roi_size) + left
434
+
435
+ return left, right, roi_size
436
+
437
+
438
+ def wave_to_spectrogram(wave, hop_length, n_fft):
439
+ wave_left = np.asfortranarray(wave[0])
440
+ wave_right = np.asfortranarray(wave[1])
441
+
442
+ spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length)
443
+ spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)
444
+ spec = np.asfortranarray([spec_left, spec_right])
445
+
446
+ return spec
447
+
448
+
449
+ def spectrogram_to_wave(spec, hop_length=1024):
450
+ if spec.ndim == 2:
451
+ wave = librosa.istft(spec, hop_length=hop_length)
452
+ elif spec.ndim == 3:
453
+ spec_left = np.asfortranarray(spec[0])
454
+ spec_right = np.asfortranarray(spec[1])
455
+
456
+ wave_left = librosa.istft(spec_left, hop_length=hop_length)
457
+ wave_right = librosa.istft(spec_right, hop_length=hop_length)
458
+ wave = np.asfortranarray([wave_left, wave_right])
459
+
460
+ return wave
461
+
462
+
463
+ class Separator(object):
464
+ def __init__(self, model, device, batchsize, cropsize, postprocess=False, progress_bar=None):
465
+ self.model = model
466
+ self.offset = model.offset
467
+ self.device = device
468
+ self.batchsize = batchsize
469
+ self.cropsize = cropsize
470
+ self.postprocess = postprocess
471
+ self.progress_bar = progress_bar
472
+
473
+ def _separate(self, X_mag_pad, roi_size):
474
+ X_dataset = []
475
+ patches = (X_mag_pad.shape[2] - 2 * self.offset) // roi_size
476
+ for i in range(patches):
477
+ start = i * roi_size
478
+ X_mag_crop = X_mag_pad[:, :, start : start + self.cropsize]
479
+ X_dataset.append(X_mag_crop)
480
+
481
+ X_dataset = np.asarray(X_dataset)
482
+
483
+ self.model.eval()
484
+ with torch.no_grad():
485
+ mask = []
486
+ # To reduce the overhead, dataloader is not used.
487
+ for i in range(0, patches, self.batchsize):
488
+ X_batch = X_dataset[i : i + self.batchsize]
489
+ X_batch = torch.from_numpy(X_batch).to(self.device)
490
+
491
+ pred = self.model.predict_mask(X_batch)
492
+
493
+ pred = pred.detach().cpu().numpy()
494
+ pred = np.concatenate(pred, axis=2)
495
+ mask.append(pred)
496
+
497
+ mask = np.concatenate(mask, axis=2)
498
+
499
+ return mask
500
+
501
+ def _preprocess(self, X_spec):
502
+ X_mag = np.abs(X_spec)
503
+ X_phase = np.angle(X_spec)
504
+
505
+ return X_mag, X_phase
506
+
507
+ def _postprocess(self, mask, X_mag, X_phase):
508
+ if self.postprocess:
509
+ mask = merge_artifacts(mask)
510
+
511
+ y_spec = mask * X_mag * np.exp(1.0j * X_phase)
512
+ v_spec = (1 - mask) * X_mag * np.exp(1.0j * X_phase)
513
+
514
+ return y_spec, v_spec
515
+
516
+ def separate(self, X_spec):
517
+ X_mag, X_phase = self._preprocess(X_spec)
518
+
519
+ n_frame = X_mag.shape[2]
520
+ pad_l, pad_r, roi_size = make_padding(n_frame, self.cropsize, self.offset)
521
+ X_mag_pad = np.pad(X_mag, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
522
+ X_mag_pad /= X_mag_pad.max()
523
+
524
+ mask = self._separate(X_mag_pad, roi_size)
525
+ mask = mask[:, :, :n_frame]
526
+
527
+ y_spec, v_spec = self._postprocess(mask, X_mag, X_phase)
528
+
529
+ return y_spec, v_spec
530
+
531
+
532
+ def load_model(pretrained_model, n_fft=2048):
533
+ model = CascadedNet(n_fft, 32, 128)
534
+ if torch.cuda.is_available():
535
+ device = torch.device("cuda:0")
536
+ model.to(device)
537
+ # elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
538
+ # device = torch.device("mps")
539
+ # model.to(device)
540
+ else:
541
+ device = torch.device("cpu")
542
+ model.load_state_dict(torch.load(pretrained_model, map_location=device))
543
+ return model, device
544
+
545
+
546
+ def separate(
547
+ input,
548
+ model,
549
+ device,
550
+ output_dir,
551
+ batchsize=4,
552
+ cropsize=256,
553
+ postprocess=False,
554
+ hop_length=1024,
555
+ n_fft=2048,
556
+ sr=44100,
557
+ progress_bar=None,
558
+ only_no_vocals=False,
559
+ ):
560
+ X, sr = librosa.load(input, sr=sr, mono=False, dtype=np.float32, res_type="kaiser_fast")
561
+ basename = os.path.splitext(os.path.basename(input))[0]
562
+
563
+ if X.ndim == 1:
564
+ # mono to stereo
565
+ X = np.asarray([X, X])
566
+
567
+ X_spec = wave_to_spectrogram(X, hop_length, n_fft)
568
+
569
+ with torch.no_grad():
570
+ sp = Separator(model, device, batchsize, cropsize, postprocess, progress_bar=progress_bar)
571
+ y_spec, v_spec = sp.separate(X_spec)
572
+
573
+ base_dir = f"{output_dir}/vocal_remover/{basename}"
574
+ os.makedirs(base_dir, exist_ok=True)
575
+
576
+ wave = spectrogram_to_wave(y_spec, hop_length=hop_length)
577
+ try:
578
+ sf.write(f"{base_dir}/no_vocals.mp3", wave.T, sr)
579
+ except Exception:
580
+ logging.error("Failed to write no_vocals.mp3, trying pydub...")
581
+ pydub_write(wave, f"{base_dir}/no_vocals.mp3", sr)
582
+ if only_no_vocals:
583
+ return
584
+ wave = spectrogram_to_wave(v_spec, hop_length=hop_length)
585
+ try:
586
+ sf.write(f"{base_dir}/vocals.mp3", wave.T, sr)
587
+ except Exception:
588
+ logging.error("Failed to write vocals.mp3, trying pydub...")
589
+ pydub_write(wave, f"{base_dir}/vocals.mp3", sr)
590
+
591
+
592
+ def pydub_write(wave, output_path, frame_rate, audio_format="mp3"):
593
+ # Ensure the wave data is in the right format for pydub (mono and 16-bit depth)
594
+ wave_16bit = (wave * 32767).astype(np.int16)
595
+
596
+ audio_segment = AudioSegment(
597
+ wave_16bit.tobytes(),
598
+ frame_rate=frame_rate,
599
+ sample_width=wave_16bit.dtype.itemsize,
600
+ channels=1,
601
+ )
602
+ audio_segment.export(output_path, format=audio_format)
603
+
604
+ #####################################################################################
605
+
606
+ import torch
607
+ from torch import nn
608
+ import torch.nn.functional as F
609
+
610
+
611
+ class BaseNet(nn.Module):
612
+ def __init__(self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))):
613
+ super(BaseNet, self).__init__()
614
+ self.enc1 = Conv2DBNActiv(nin, nout, 3, 1, 1)
615
+ self.enc2 = Encoder(nout, nout * 2, 3, 2, 1)
616
+ self.enc3 = Encoder(nout * 2, nout * 4, 3, 2, 1)
617
+ self.enc4 = Encoder(nout * 4, nout * 6, 3, 2, 1)
618
+ self.enc5 = Encoder(nout * 6, nout * 8, 3, 2, 1)
619
+
620
+ self.aspp = ASPPModule(nout * 8, nout * 8, dilations, dropout=True)
621
+
622
+ self.dec4 = Decoder(nout * (6 + 8), nout * 6, 3, 1, 1)
623
+ self.dec3 = Decoder(nout * (4 + 6), nout * 4, 3, 1, 1)
624
+ self.dec2 = Decoder(nout * (2 + 4), nout * 2, 3, 1, 1)
625
+ self.lstm_dec2 = LSTMModule(nout * 2, nin_lstm, nout_lstm)
626
+ self.dec1 = Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1)
627
+
628
+ def __call__(self, x):
629
+ e1 = self.enc1(x)
630
+ e2 = self.enc2(e1)
631
+ e3 = self.enc3(e2)
632
+ e4 = self.enc4(e3)
633
+ e5 = self.enc5(e4)
634
+
635
+ h = self.aspp(e5)
636
+
637
+ h = self.dec4(h, e4)
638
+ h = self.dec3(h, e3)
639
+ h = self.dec2(h, e2)
640
+ h = torch.cat([h, self.lstm_dec2(h)], dim=1)
641
+ h = self.dec1(h, e1)
642
+
643
+ return h
644
+
645
+
646
+ class CascadedNet(nn.Module):
647
+ def __init__(self, n_fft, nout=32, nout_lstm=128):
648
+ super(CascadedNet, self).__init__()
649
+ self.max_bin = n_fft // 2
650
+ self.output_bin = n_fft // 2 + 1
651
+ self.nin_lstm = self.max_bin // 2
652
+ self.offset = 64
653
+
654
+ self.stg1_low_band_net = nn.Sequential(
655
+ BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm),
656
+ Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0),
657
+ )
658
+ self.stg1_high_band_net = BaseNet(2, nout // 4, self.nin_lstm // 2, nout_lstm // 2)
659
+
660
+ self.stg2_low_band_net = nn.Sequential(
661
+ BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm),
662
+ Conv2DBNActiv(nout, nout // 2, 1, 1, 0),
663
+ )
664
+ self.stg2_high_band_net = BaseNet(
665
+ nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2
666
+ )
667
+
668
+ self.stg3_full_band_net = BaseNet(3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm)
669
+
670
+ self.out = nn.Conv2d(nout, 2, 1, bias=False)
671
+ self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False)
672
+
673
+ def forward(self, x):
674
+ x = x[:, :, : self.max_bin]
675
+
676
+ bandw = x.size()[2] // 2
677
+ l1_in = x[:, :, :bandw]
678
+ h1_in = x[:, :, bandw:]
679
+ l1 = self.stg1_low_band_net(l1_in)
680
+ h1 = self.stg1_high_band_net(h1_in)
681
+ aux1 = torch.cat([l1, h1], dim=2)
682
+
683
+ l2_in = torch.cat([l1_in, l1], dim=1)
684
+ h2_in = torch.cat([h1_in, h1], dim=1)
685
+ l2 = self.stg2_low_band_net(l2_in)
686
+ h2 = self.stg2_high_band_net(h2_in)
687
+ aux2 = torch.cat([l2, h2], dim=2)
688
+
689
+ f3_in = torch.cat([x, aux1, aux2], dim=1)
690
+ f3 = self.stg3_full_band_net(f3_in)
691
+
692
+ mask = torch.sigmoid(self.out(f3))
693
+ mask = F.pad(
694
+ input=mask,
695
+ pad=(0, 0, 0, self.output_bin - mask.size()[2]),
696
+ mode="replicate",
697
+ )
698
+
699
+ if self.training:
700
+ aux = torch.cat([aux1, aux2], dim=1)
701
+ aux = torch.sigmoid(self.aux_out(aux))
702
+ aux = F.pad(
703
+ input=aux,
704
+ pad=(0, 0, 0, self.output_bin - aux.size()[2]),
705
+ mode="replicate",
706
+ )
707
+ return mask, aux
708
+ else:
709
+ return mask
710
+
711
+ def predict_mask(self, x):
712
+ mask = self.forward(x)
713
+
714
+ if self.offset > 0:
715
+ mask = mask[:, :, :, self.offset : -self.offset]
716
+ assert mask.size()[3] > 0
717
+
718
+ return mask
719
+
720
+ def predict(self, x):
721
+ mask = self.forward(x)
722
+ pred_mag = x * mask
723
+
724
+ if self.offset > 0:
725
+ pred_mag = pred_mag[:, :, :, self.offset : -self.offset]
726
+ assert pred_mag.size()[3] > 0
727
+
728
+ return pred_mag
729
+
730
+ ##############################################################################
731
+
732
+ def crop_center(h1, h2):
733
+ h1_shape = h1.size()
734
+ h2_shape = h2.size()
735
+
736
+ if h1_shape[3] == h2_shape[3]:
737
+ return h1
738
+ elif h1_shape[3] < h2_shape[3]:
739
+ raise ValueError("h1_shape[3] must be greater than h2_shape[3]")
740
+
741
+ s_time = (h1_shape[3] - h2_shape[3]) // 2
742
+ e_time = s_time + h2_shape[3]
743
+ h1 = h1[:, :, :, s_time:e_time]
744
+
745
+ return h1
746
+
747
+
748
+ class Conv2DBNActiv(nn.Module):
749
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
750
+ super(Conv2DBNActiv, self).__init__()
751
+ self.conv = nn.Sequential(
752
+ nn.Conv2d(
753
+ nin,
754
+ nout,
755
+ kernel_size=ksize,
756
+ stride=stride,
757
+ padding=pad,
758
+ dilation=dilation,
759
+ bias=False,
760
+ ),
761
+ nn.BatchNorm2d(nout),
762
+ activ(),
763
+ )
764
+
765
+ def __call__(self, x):
766
+ return self.conv(x)
767
+
768
+
769
+ class Encoder(nn.Module):
770
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
771
+ super(Encoder, self).__init__()
772
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
773
+ self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
774
+
775
+ def __call__(self, x):
776
+ h = self.conv1(x)
777
+ h = self.conv2(h)
778
+
779
+ return h
780
+
781
+
782
+ class Decoder(nn.Module):
783
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
784
+ super(Decoder, self).__init__()
785
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
786
+ self.dropout = nn.Dropout2d(0.1) if dropout else None
787
+
788
+ def __call__(self, x, skip=None):
789
+ x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
790
+
791
+ if skip is not None:
792
+ skip = crop_center(skip, x)
793
+ x = torch.cat([x, skip], dim=1)
794
+
795
+ h = self.conv1(x)
796
+ # h = self.conv2(h)
797
+
798
+ if self.dropout is not None:
799
+ h = self.dropout(h)
800
+
801
+ return h
802
+
803
+
804
+ class ASPPModule(nn.Module):
805
+ def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
806
+ super(ASPPModule, self).__init__()
807
+ self.conv1 = nn.Sequential(
808
+ nn.AdaptiveAvgPool2d((1, None)),
809
+ Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
810
+ )
811
+ self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
812
+ self.conv3 = Conv2DBNActiv(nin, nout, 3, 1, dilations[0], dilations[0], activ=activ)
813
+ self.conv4 = Conv2DBNActiv(nin, nout, 3, 1, dilations[1], dilations[1], activ=activ)
814
+ self.conv5 = Conv2DBNActiv(nin, nout, 3, 1, dilations[2], dilations[2], activ=activ)
815
+ self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
816
+ self.dropout = nn.Dropout2d(0.1) if dropout else None
817
+
818
+ def forward(self, x):
819
+ _, _, h, w = x.size()
820
+ feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
821
+ feat2 = self.conv2(x)
822
+ feat3 = self.conv3(x)
823
+ feat4 = self.conv4(x)
824
+ feat5 = self.conv5(x)
825
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
826
+ out = self.bottleneck(out)
827
+
828
+ if self.dropout is not None:
829
+ out = self.dropout(out)
830
+
831
+ return out
832
+
833
+
834
+ class LSTMModule(nn.Module):
835
+ def __init__(self, nin_conv, nin_lstm, nout_lstm):
836
+ super(LSTMModule, self).__init__()
837
+ self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
838
+ self.lstm = nn.LSTM(input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True)
839
+ self.dense = nn.Sequential(
840
+ nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()
841
+ )
842
+
843
+ def forward(self, x):
844
+ N, _, nbins, nframes = x.size()
845
+ h = self.conv(x)[:, 0] # N, nbins, nframes
846
+ h = h.permute(2, 0, 1) # nframes, N, nbins
847
+ h, _ = self.lstm(h)
848
+ h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins
849
+ h = h.reshape(nframes, N, 1, nbins)
850
+ h = h.permute(1, 2, 3, 0)
851
+
852
+ return h
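For reference, a small usage sketch (not part of the commit) combining `total_duration`, `cutaudio`, and `append_wav_files` defined above to split a long WAV into fixed-size chunks; `chunk_ms` and the filenames are placeholders.

```python
# Sketch only: chunk a long recording with the pydub-based helpers above.
from audio import total_duration, cutaudio, append_wav_files

def split_into_chunks(wav_path, chunk_ms=60_000):
    """Cut wav_path into consecutive chunks of chunk_ms milliseconds each."""
    length_ms = total_duration(wav_path)               # pydub lengths are in milliseconds
    chunks = []
    for start in range(0, length_ms, chunk_ms):
        end = min(start + chunk_ms, length_ms)
        chunks.append(cutaudio(wav_path, start, end))  # writes "<start>_<end>.wav"
    return chunks

# parts = split_into_chunks("long_recording.wav")      # placeholder filename
# append_wav_files(parts, "rejoined.wav")              # stitch the chunks back together
```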
helpers.py ADDED
@@ -0,0 +1,40 @@
+ import datetime
+
+ def guardar_en_archivo(lista_strings):
+     # Format the current date and time for the filename
+     fecha_actual = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+     nombre_archivo = f"transcription_{fecha_actual}.txt"
+
+     # Write the list of strings to the file, one per line
+     with open(nombre_archivo, 'w') as archivo:
+         for linea in lista_strings:
+             archivo.write(linea + '\n')
+
+     return nombre_archivo
+
+ def leer_del_archivo(nombre_archivo):
+     with open(nombre_archivo, 'r') as archivo:
+         # Read the lines and strip the trailing newline from each
+         contenido = [linea.strip() for linea in archivo.readlines()]
+     return contenido
+
+ def guardar_dataframe_en_csv(df):
+     # Get the current date and time and format it
+     fecha_actual = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
+     # Build the output filename
+     nombre_archivo = f"transcription_{fecha_actual}.csv"
+
+     # Save the DataFrame to a CSV file
+     df.to_csv(nombre_archivo, index=False)
+
+     return nombre_archivo
+
+ def dataframe_a_lista(df):
+     # Convert every column to string
+     df_str = df.astype(str)
+
+     # Concatenate the columns row by row
+     lista_strings = df_str.apply(lambda row: ' '.join(row), axis=1).tolist()
+
+     return lista_strings
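A short usage sketch (not part of the commit) showing how these helpers fit together; the sample data below is made up for illustration.

```python
# Sketch only: round-trip a transcript and export a diarization table.
import pandas as pd
from helpers import guardar_en_archivo, leer_del_archivo, dataframe_a_lista, guardar_dataframe_en_csv

segments = ["0.0 Hola, buenos días.", "3.2 ¿Cómo estás?"]   # placeholder transcript lines
txt_path = guardar_en_archivo(segments)                     # transcription_<timestamp>.txt
assert leer_del_archivo(txt_path) == segments               # read back with newlines stripped

df = pd.DataFrame({"Start": ["0:00:00"], "End": ["0:00:03"],
                   "Speaker": ["SPEAKER 1"], "Text": ["Hola"]})
csv_path = guardar_dataframe_en_csv(df)                     # transcription_<timestamp>.csv
print(dataframe_a_lista(df))                                # ['0:00:00 0:00:03 SPEAKER 1 Hola']
```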
packages.txt ADDED
@@ -0,0 +1,2 @@
+ ffmpeg
+ portaudio19-dev
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ #git+https://github.com/huggingface/transformers
+ torch
+ yt-dlp
+ openai
+ pydub
+ faster-whisper
+ scikit-learn
+ pandas
+ numpy
+ pytube
+ https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
+ pyannote.core
+ gpuinfo
+ psutil
+ wave
+ demucs
+ moviepy
transcription.py ADDED
@@ -0,0 +1,218 @@
1
+ #################################################################################################
2
+ # Taking code from https://huggingface.co/spaces/vumichien/Whisper_speaker_diarization/blob/main/app.py
3
+
4
+ from faster_whisper import WhisperModel
5
+ #import datetime
6
+ #import subprocess
7
+ import gradio as gr
8
+ from pathlib import Path
9
+ import pandas as pd
10
+ #import re
11
+ import time
12
+ import os
13
+ import numpy as np
14
+ from sklearn.cluster import AgglomerativeClustering
15
+ from sklearn.metrics import silhouette_score
16
+
17
+ from pytube import YouTube
18
+ #import yt_dlp
19
+ import torch
20
+ #import pyannote.audio
21
+ from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
22
+ from pyannote.audio import Audio
23
+ from pyannote.core import Segment
24
+
25
+ from gpuinfo import GPUInfo
26
+
27
+ import wave
28
+ import contextlib
29
+ from transformers import pipeline
30
+ import psutil
31
+
32
+ embedding_model = PretrainedSpeakerEmbedding(
33
+ "speechbrain/spkrec-ecapa-voxceleb",
34
+ device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
35
+
36
+ def fast_transcription(audio_file, whisper_model, language):
+ """
+ Transcribe an audio file with faster-whisper.
+
+ Splits the audio into Whisper segments and returns one line per segment,
+ formatted as "<start time> <text>".
+
+ Speech recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
+ """
46
+
47
+ # model = whisper.load_model(whisper_model)
48
+ # model = WhisperModel(whisper_model, device="cuda", compute_type="int8_float16")
49
+ model = WhisperModel(whisper_model, compute_type="int8")
50
+ time_start = time.time()
51
+ # if(video_file_path == None):
52
+ # raise ValueError("Error no video input")
53
+ # print(video_file_path)
54
+
55
+ try:
56
+ # Get duration
57
+ with contextlib.closing(wave.open(audio_file,'r')) as f:
58
+ frames = f.getnframes()
59
+ rate = f.getframerate()
60
+ duration = frames / float(rate)
61
+ print(f"conversion to wav ready, duration of audio file: {duration}")
62
+
63
+ # Transcribe audio
64
+ options = dict(language=language, beam_size=5, best_of=5)
65
+ transcribe_options = dict(task="transcribe", **options)
66
+ segments_raw, info = model.transcribe(audio_file, **transcribe_options)
67
+
68
+ # Convert back to original openai format
69
+ segments = []
70
+ i = 0
71
+ for segment_chunk in segments_raw:
72
+ chunk = {}
73
+ chunk["start"] = segment_chunk.start
74
+ chunk["end"] = segment_chunk.end
75
+ chunk["text"] = segment_chunk.text
76
+ segments.append(chunk)
77
+ i += 1
78
+ print("transcribe audio done with fast whisper")
79
+ except Exception as e:
80
+ raise RuntimeError("Error transcribing audio file") from e
81
+
82
+ #text from the list
83
+
84
+ return [str(s["start"]) + " " + s["text"] for s in segments] #pd.DataFrame(segments)
85
+
86
+ import datetime
87
+
88
+ def convert_time(secs):
89
+ return datetime.timedelta(seconds=round(secs))
90
+
91
+ def speech_to_text(audio_file, selected_source_lang, whisper_model, num_speakers):
+ """
+ Transcribe an audio file with faster-whisper and label the speakers.
+
+ 1. Use the Whisper model to split the audio into segments and generate transcripts.
+ 2. Generate a speaker embedding for each segment.
+ 3. Apply agglomerative clustering on the embeddings to identify the speaker of each segment.
+
+ Speech recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
+ Speaker diarization model and pipeline from https://github.com/pyannote/pyannote-audio
+ """
101
+
102
+ # model = whisper.load_model(whisper_model)
103
+ # model = WhisperModel(whisper_model, device="cuda", compute_type="int8_float16")
104
+ model = WhisperModel(whisper_model, compute_type="int8")
105
+ time_start = time.time()
106
+ # if(video_file_path == None):
107
+ # raise ValueError("Error no video input")
108
+ # print(video_file_path)
109
+
110
+ try:
111
+ # # Read and convert youtube video
112
+ # _,file_ending = os.path.splitext(f'{video_file_path}')
113
+ # print(f'file enging is {file_ending}')
114
+ # audio_file = video_file_path.replace(file_ending, ".wav")
115
+ # print("starting conversion to wav")
116
+ # os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file}"')
117
+
118
+ # Get duration
119
+ with contextlib.closing(wave.open(audio_file,'r')) as f:
120
+ frames = f.getnframes()
121
+ rate = f.getframerate()
122
+ duration = frames / float(rate)
123
+ print(f"conversion to wav ready, duration of audio file: {duration}")
124
+
125
+ # Transcribe audio
126
+ options = dict(language=selected_source_lang, beam_size=5, best_of=5)
127
+ transcribe_options = dict(task="transcribe", **options)
128
+ segments_raw, info = model.transcribe(audio_file, **transcribe_options)
129
+
130
+ # Convert back to original openai format
131
+ segments = []
132
+ i = 0
133
+ for segment_chunk in segments_raw:
134
+ chunk = {}
135
+ chunk["start"] = segment_chunk.start
136
+ chunk["end"] = segment_chunk.end
137
+ chunk["text"] = segment_chunk.text
138
+ segments.append(chunk)
139
+ i += 1
140
+ print("transcribe audio done with fast whisper")
141
+ except Exception as e:
142
+ raise RuntimeError("Error transcribing audio file") from e
143
+
144
+ try:
145
+ # Create embedding
146
+ def segment_embedding(segment):
147
+ audio = Audio()
148
+ start = segment["start"]
149
+ # Whisper overshoots the end timestamp in the last segment
150
+ end = min(duration, segment["end"])
151
+ clip = Segment(start, end)
152
+ waveform, sample_rate = audio.crop(audio_file, clip)
153
+ return embedding_model(waveform[None])
154
+
155
+ embeddings = np.zeros(shape=(len(segments), 192))
156
+ for i, segment in enumerate(segments):
157
+ embeddings[i] = segment_embedding(segment)
158
+ embeddings = np.nan_to_num(embeddings)
159
+ print(f'Embedding shape: {embeddings.shape}')
160
+
161
+ if num_speakers == 0:
162
+ # Find the best number of speakers
163
+ score_num_speakers = {}
164
+
165
+ for num_speakers in range(2, 10+1):
166
+ clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
167
+ score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
168
+ score_num_speakers[num_speakers] = score
169
+
170
+ best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
171
+ print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
172
+ else:
173
+ best_num_speaker = num_speakers
174
+
175
+ # Assign speaker label
176
+ clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
177
+ labels = clustering.labels_
178
+ for i in range(len(segments)):
179
+ segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
180
+
181
+ # Make output
182
+ objects = {
183
+ 'Start' : [],
184
+ 'End': [],
185
+ 'Speaker': [],
186
+ 'Text': []
187
+ }
188
+ text = ''
189
+ for (i, segment) in enumerate(segments):
190
+ if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
191
+ objects['Start'].append(str(convert_time(segment["start"])))
192
+ objects['Speaker'].append(segment["speaker"])
193
+ if i != 0:
194
+ objects['End'].append(str(convert_time(segments[i - 1]["end"])))
195
+ objects['Text'].append(text)
196
+ text = ''
197
+ text += segment["text"] + ' '
198
+ objects['End'].append(str(convert_time(segments[i - 1]["end"])))
199
+ objects['Text'].append(text)
200
+
201
+ time_end = time.time()
202
+ time_diff = time_end - time_start
203
+ memory = psutil.virtual_memory()
204
+ gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
205
+ gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
206
+ gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
207
+ system_info = f"""
208
+ *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
209
+ *Processing time: {time_diff:.5} seconds.*
210
+ *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
211
+ """
212
+ save_path = "transcript_result.csv"
213
+ df_results = pd.DataFrame(objects)
214
+ #df_results.to_csv(save_path)
215
+ return df_results, system_info, save_path
216
+
217
+ except Exception as e:
218
+ raise RuntimeError("Error running inference with local model") from e
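Finally, a usage sketch (not part of the commit) for the diarized path; the filename and keyword values are placeholders, and `num_speakers=0` triggers the silhouette-score search implemented above.

```python
# Sketch only: diarized transcription of a local PCM WAV file.
from transcription import speech_to_text
from helpers import guardar_dataframe_en_csv

df, system_info, _ = speech_to_text(
    "interview.wav",              # placeholder path; must be a WAV the wave module can open
    selected_source_lang="es",
    whisper_model="base",
    num_speakers=0,               # 0 = choose the best speaker count via silhouette score
)
print(system_info)
csv_path = guardar_dataframe_en_csv(df)   # saves the Start/End/Speaker/Text table
```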