liuyang committed on
Commit 28823e9 · 1 Parent(s): 5d33cf4

Update the speaker diarization model and refactor the WhisperTranscriber alignment process. Introduce an align_timestamp method for improved word-level alignment and streamline segment handling. Adjust print statements for clarity and remove unnecessary comments.

Files changed (1)
  1. app.py +121 -31
app.py CHANGED
@@ -433,7 +433,7 @@ def _preload_alignment_and_diarization_models():
     torch.set_float32_matmul_precision('high')
 
     _diarizer = Pipeline.from_pretrained(
-        "pyannote/speaker-diarization-3.1",
+        "pyannote/speaker-diarization-community-1",
         use_auth_token=os.getenv("HF_TOKEN"),
     ).to(torch.device("cuda"))
 
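The commit swaps only the model name, treating the new checkpoint as a drop-in replacement. As a minimal standalone sketch of the hunk above (the audio path is illustrative, and HF_TOKEN must grant access to the gated pyannote model):

import os

import torch
from pyannote.audio import Pipeline

# Load the diarization pipeline named in the hunk above.
diarizer = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-community-1",
    use_auth_token=os.getenv("HF_TOKEN"),
).to(torch.device("cuda"))

# "meeting.wav" is a placeholder input file.
diarization = diarizer("meeting.wav")
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"{turn.start:.2f}s-{turn.end:.2f}s {speaker}")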
@@ -538,24 +538,24 @@ class WhisperTranscriber:
                     whisperx_model_name,
                     device=device,
                     compute_type=compute_type,
-                    download_root=CACHE_ROOT,
-                    asr_options=transcribe_options
+                    download_root=CACHE_ROOT,
+                    asr_options=transcribe_options
                 )
                 _whipser_x_transcribe_models[model_name] = whisper_model
                 print(f"WhisperX transcribe model '{model_name}' loaded successfully")
             else:
                 whisper_model = _whipser_x_transcribe_models[model_name]
-
-            print(f"Transcribing full audio with WhisperX model '{model_name}' and batch size {batch_size}...")
-            result = whisper_model.transcribe(
-                audio,
-                language=language,
-                batch_size=batch_size,
-                #initial_prompt=prompt,
-                #task="translate" if translate else "transcribe"
-            )
-            detected_language = result.get("language", detected_language)
-            initial_segments = result.get("segments", [])
+
+            print(f"Transcribing full audio with WhisperX model '{model_name}' and batch size {batch_size}...")
+            result = whisper_model.transcribe(
+                audio,
+                language=language,
+                batch_size=batch_size,
+                #initial_prompt=prompt,
+                #task="translate" if translate else "transcribe"
+            )
+            detected_language = result.get("language", detected_language)
+            initial_segments = result.get("segments", [])
 
         elif engine == "faster_whisper":
             # Lazy-load Faster-Whisper model on first use
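For readers unfamiliar with WhisperX: load_model wraps a batched faster-whisper backend, and transcribe returns a dict carrying 'language' and 'segments', which is what the code above reads back. A minimal sketch under assumed defaults (model size, audio path, and batch size are illustrative):

import whisperx

# Standalone version of the load/transcribe path shown in the hunk above.
audio = whisperx.load_audio("sample.wav")        # decoded to 16 kHz mono
model = whisperx.load_model("large-v3", "cuda", compute_type="float16")
result = model.transcribe(audio, batch_size=16)  # omitting language => auto-detect
print(result["language"], len(result["segments"]))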
@@ -671,28 +671,24 @@ class WhisperTranscriber:
             raise ValueError(f"Unknown engine '{engine}'. Supported: 'whisperx', 'faster_whisper'")
 
         print(f"Detected language: {detected_language}, segments: {len(initial_segments)}, transcribing done in {time.time() - start_time:.2f} seconds")
-        # Align with WhisperX if supported for detected language (always attempt when available)
+        # Align with centralized alignment method when available
        segments = initial_segments
         if detected_language in _whipser_x_align_models:
-            print(f"Performing WhisperX alignment for language '{detected_language}'...")
-            align_start = time.time()
             try:
-                align_info = _whipser_x_align_models[detected_language]
-                align_result = whisperx.align(
-                    initial_segments,
-                    align_info["model"],
-                    align_info["metadata"],
-                    audio,
-                    "cuda",
-                    return_char_alignments=False
+                align_out = self.align_timestamp(
+                    audio_url=audio_path,
+                    text=None,
+                    language=detected_language,
+                    engine="whisperx",
+                    options={"segments": initial_segments},
                 )
-                segments = align_result.get("segments", segments)
-                print(f"WhisperX alignment completed in {time.time() - align_start:.2f} seconds")
+                if isinstance(align_out, dict) and align_out.get("segments"):
+                    segments = align_out["segments"]
             except Exception as e:
-                print(f"WhisperX alignment failed: {e}, using original timestamps")
+                print(f"Alignment via align_timestamp failed: {e}, using original timestamps")
         else:
             print(f"No WhisperX alignment model available for language '{detected_language}', using original timestamps")
-
+
         # Process segments into the expected format
         results = []
         for seg in segments:
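The branch above no longer calls whisperx.align inline; it forwards the raw Whisper output to the new align_timestamp method through options. Per that method's docstring later in this diff, the expected payload is a list of {start, end, text} dicts, for example (timings and text are illustrative):

# Shape of the payload passed as options={"segments": initial_segments} above.
initial_segments = [
    {"start": 0.00, "end": 4.20, "text": "Hello and welcome to the show."},
    {"start": 4.20, "end": 9.75, "text": "Today we look at speaker diarization."},
]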
@@ -706,7 +702,7 @@ class WhisperTranscriber:
                     "probability": word.get("score", 1.0),
                     "speaker": "SPEAKER_00"
                 })
-
+
             results.append({
                 "start": float(seg.get("start", 0.0)) + float(base_offset_s),
                 "end": float(seg.get("end", 0.0)) + float(base_offset_s),
@@ -714,13 +710,107 @@ class WhisperTranscriber:
                 "speaker": "SPEAKER_00",
                 "avg_logprob": seg.get("avg_logprob", 0.0) if "avg_logprob" in seg else 0.0,
                 "words": words_list,
-                "duration": float(seg.get("end", 0.0)) - float(seg.get("start", 0.0))
+                "duration": float(seg.get("end", 0.0)) - float(seg.get("start", 0.0)),
+                "language": detected_language,
             })
 
         print(results)
         transcription_time = time.time() - start_time
         print(f"Full audio transcribed and aligned in {transcription_time:.2f} seconds using batch size {batch_size}")
         return results, detected_language
+
+    @spaces.GPU  # alignment requires GPU
+    def align_timestamp(self, audio_url, text, language, engine="whisperx", options: dict = None):
+        """Return word-level alignment for the given text/audio using the specified engine.
+
+        Args:
+            audio_url: Path or URL to the audio file.
+            text: String text to align. If options contains 'segments', this can be None.
+            language: Language code (e.g., 'en'). Must be supported by WhisperX align models.
+            engine: Currently only 'whisperx' is supported.
+            options: Optional dict. Recognized keys:
+                - 'segments': list of {start, end, text} to align (preferred for segment-aware alignment)
+
+        Returns:
+            dict with keys:
+                - 'segments': aligned segments including word timings (if available)
+                - 'words': flat list of aligned words across all segments
+        """
+        global _whipser_x_align_models
+
+        if engine != "whisperx":
+            raise ValueError(f"align_timestamp engine '{engine}' not supported. Only 'whisperx' is supported")
+
+        if language not in _whipser_x_align_models:
+            raise ValueError(f"No WhisperX alignment model available for language '{language}'")
+
+        # Resolve audio path (download if URL)
+        local_path = None
+        tmp_file = None
+        try:
+            if isinstance(audio_url, str) and audio_url.startswith(("http://", "https://")):
+                resp = requests.get(audio_url, stream=True, timeout=60)
+                resp.raise_for_status()
+                tmp_f = tempfile.NamedTemporaryFile(suffix=".audio", delete=False)
+                for chunk in resp.iter_content(chunk_size=8192):
+                    if chunk:
+                        tmp_f.write(chunk)
+                tmp_f.flush()
+                tmp_f.close()
+                tmp_file = tmp_f.name
+                local_path = tmp_file
+            else:
+                local_path = audio_url
+
+            # Load audio and decide segments to align
+            audio = whisperx.load_audio(local_path)
+            sr = 16000.0  # whisperx loads at 16k
+            audio_duration = float(len(audio)) / sr if hasattr(audio, "__len__") else None
+            segments_to_align = None
+
+            if options and isinstance(options, dict) and options.get("segments"):
+                segments_to_align = options.get("segments")
+            else:
+                if not text or not str(text).strip():
+                    raise ValueError("align_timestamp requires 'text' when 'segments' are not provided in options")
+                if audio_duration is None:
+                    raise ValueError("Could not determine audio duration for alignment")
+                segments_to_align = [{
+                    "text": str(text),
+                    "start": 0.0,
+                    "end": audio_duration,
+                }]
+
+            # Perform alignment
+            align_info = _whipser_x_align_models[language]
+            aligned = whisperx.align(
+                segments_to_align,
+                align_info["model"],
+                align_info["metadata"],
+                audio,
+                "cuda",
+                return_char_alignments=False,
+            )
+
+            aligned_segments = aligned.get("segments", segments_to_align)
+            words_flat = []
+            for seg in aligned_segments:
+                for w in seg.get("words", []) or []:
+                    words_flat.append({
+                        "start": float(w.get("start", 0.0)),
+                        "end": float(w.get("end", 0.0)),
+                        "word": w.get("word", ""),
+                        "probability": w.get("score", 1.0)
+                    })
+
+            return {"segments": aligned_segments, "words": words_flat, "language": language}
+
+        finally:
+            if tmp_file:
+                try:
+                    os.unlink(tmp_file)
+                except Exception:
+                    pass
 
     # Removed audio cutting; transcription is done once on the full (preprocessed) audio
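Taken together, the new method supports two call modes: align raw text across the whole clip, or re-align pre-computed segments. A hypothetical caller (the instance name, paths, and text below are illustrative, not part of this commit) might look like:

# Mode 1: align a known transcript against a local file.
out = transcriber.align_timestamp(
    audio_url="sample.wav",
    text="hello world this is a test",
    language="en",
)
print(out["words"])      # flat list of {start, end, word, probability}

# Mode 2: re-align existing segments; 'text' may be None.
out = transcriber.align_timestamp(
    audio_url="https://example.com/sample.wav",
    text=None,
    language="en",
    options={"segments": [{"start": 0.0, "end": 3.0, "text": "hello world"}]},
)
print(out["segments"])   # segments enriched with word-level timings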