Spaces: Running on Zero

liuyang committed
Commit · 4a29c47
1 Parent(s): 37d6160

modify workflow

app.py CHANGED
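The commit reorders the processing pipeline: diarization now runs first, the audio is cut per speaker turn, and each cut is transcribed on its own. As a reading aid for the diff below, here is a minimal sketch of the records passed between the new steps; the dictionary keys come from the added code, while the concrete values and the temporary file path are invented for illustration.

# Illustrative only - keys match the code added in this commit, values are made up.

# perform_diarization(audio_path, num_speakers) -> (segments, speaker_count)
diarization_segment = {"start": 0.0, "end": 4.2, "speaker": "SPEAKER_00"}

# cut_audio_segments(audio_path, diarization_segments) adds a temporary WAV per turn
audio_segment = {
    "audio_path": "/tmp/tmpabc123.wav",   # hypothetical temp file name
    "start": 0.0,
    "end": 4.2,
    "speaker": "SPEAKER_00",
}

# transcribe_audio_segments(audio_segments, ...) returns one entry per segment
transcription_entry = {
    "start_time": 0.0,
    "end_time": 4.2,
    "speaker_label": "SPEAKER_00",
    "text": "Hello and welcome.",
}

# process_audio(...) wraps everything up
result = {"speaker_count": 1, "transcription": [transcription_entry]}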
@@ -35,7 +35,7 @@ pipe = pipeline(
     model="openai/whisper-large-v3-turbo",
     torch_dtype=torch.float16,
     device="cuda",
-    model_kwargs={"attn_implementation": "
+    model_kwargs={"attn_implementation": "flash_attention_2"},#flash_attention_2
     return_timestamps=True,
 )
 
@@ -87,20 +87,41 @@ class WhisperTranscriber:
         except subprocess.CalledProcessError as e:
             raise RuntimeError(f"Audio conversion failed: {e}")
 
-        # self.setup_models()
+    def cut_audio_segments(self, audio_path, diarization_segments):
+        """Cut audio into segments based on diarization results"""
+        print("Cutting audio into segments...")
+
+        # Load the full audio
+        waveform, sample_rate = torchaudio.load(audio_path)
+
+        audio_segments = []
+        for segment in diarization_segments:
+            start_sample = int(segment["start"] * sample_rate)
+            end_sample = int(segment["end"] * sample_rate)
+
+            # Extract the segment
+            segment_waveform = waveform[:, start_sample:end_sample]
+
+            # Create temporary file for this segment
+            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+            temp_file.close()
+
+            # Save the segment
+            torchaudio.save(temp_file.name, segment_waveform, sample_rate)
+
+            audio_segments.append({
+                "audio_path": temp_file.name,
+                "start": segment["start"],
+                "end": segment["end"],
+                "speaker": segment["speaker"]
+            })
+
+        return audio_segments
+
+    @spaces.GPU
+    def transcribe_audio_segments(self, audio_segments, language=None, translate=False, prompt=None):
+        """Transcribe multiple audio segments"""
+        print(f"Transcribing {len(audio_segments)} audio segments...")
         start_time = time.time()
 
         # Prepare generation kwargs
@@ -111,47 +132,54 @@ class WhisperTranscriber:
             generate_kwargs["task"] = "translate"
         if prompt:
            generate_kwargs["prompt_ids"] = self.pipe.tokenizer.encode(prompt)
 
-            segment
-            segments.append(segment)
-        else:
-            # Fallback for different result format
-            segments = [{
-                "start": 0.0,
-                "end": 0.0,
-                "text": result["text"]
-            }]
+
+        results = []
+        for i, segment in enumerate(audio_segments):
+            print(f"Processing segment {i+1}/{len(audio_segments)}")
+
+            # Transcribe this segment
+            result = self.pipe(
+                segment["audio_path"],
+                return_timestamps=True,
+                generate_kwargs=generate_kwargs,
+                chunk_length_s=30,
+                batch_size=128,
+            )
+
+            # Extract text
+            text = result["text"].strip() if "text" in result else ""
+
+            # Create result entry
+            results.append({
+                "start_time": segment["start"],
+                "end_time": segment["end"],
+                "speaker_label": segment["speaker"],
+                "text": text
+            })
+
+        # Clean up temporary files
+        for segment in audio_segments:
+            if os.path.exists(segment["audio_path"]):
+                os.unlink(segment["audio_path"])
 
         transcription_time = time.time() - start_time
-        print(f"
+        print(f"All segments transcribed in {transcription_time:.2f} seconds")
 
-        return
+        return results
 
+    @spaces.GPU
     def perform_diarization(self, audio_path, num_speakers=None):
         """Perform speaker diarization"""
         if self.diarization_model is None:
-            print("Diarization model not available,
+            print("Diarization model not available, creating single speaker segment")
+            # Load audio to get duration
+            waveform, sample_rate = torchaudio.load(audio_path)
+            duration = waveform.shape[1] / sample_rate
+            return [{
+                "start": 0.0,
+                "end": duration,
+                "speaker": "SPEAKER_00"
+            }], 1
 
         print("Starting diarization...")
         start_time = time.time()
@@ -176,7 +204,7 @@ class WhisperTranscriber:
                 "speaker": speaker
             })
 
-        unique_speakers = {speaker for
+        unique_speakers = {speaker for segment in diarize_segments for speaker in [segment["speaker"]]}
         detected_num_speakers = len(unique_speakers)
 
         diarization_time = time.time() - start_time
@@ -184,129 +212,35 @@ class WhisperTranscriber:
 
         return diarize_segments, detected_num_speakers
 
-    def merge_transcription_and_diarization(self, transcription_segments, diarization_segments):
-        """Merge transcription segments with speaker information"""
-        if not diarization_segments:
-            # No diarization available, assign single speaker
-            for segment in transcription_segments:
-                segment["speaker"] = "SPEAKER_00"
-            return transcription_segments
-
-        print("Merging transcription and diarization...")
-        diarize_df = pd.DataFrame(diarization_segments)
-
-        final_segments = []
-        for segment in transcription_segments:
-            # Calculate intersection with diarization segments
-            diarize_df["intersection"] = np.maximum(0,
-                np.minimum(diarize_df["end"], segment["end"]) -
-                np.maximum(diarize_df["start"], segment["start"])
-            )
-
-            # Find speaker with maximum intersection
-            dia_tmp = diarize_df[diarize_df["intersection"] > 0]
-            if len(dia_tmp) > 0:
-                speaker = (
-                    dia_tmp.groupby("speaker")["intersection"]
-                    .sum()
-                    .sort_values(ascending=False)
-                    .index[0]
-                )
-            else:
-                speaker = "SPEAKER_00"
-
-            segment["speaker"] = speaker
-            segment["duration"] = segment["end"] - segment["start"]
-            final_segments.append(segment)
-
-        return final_segments
-
-    def group_segments_by_speaker(self, segments, max_gap=1.0, max_duration=30.0):
-        """Group consecutive segments from the same speaker"""
-        if not segments:
-            return segments
-
-        grouped_segments = []
-        current_group = segments[0].copy()
-        sentence_end_pattern = r"[.!?]+\s*$"
-
-        for segment in segments[1:]:
-            time_gap = segment["start"] - current_group["end"]
-            current_duration = current_group["end"] - current_group["start"]
-
-            # Conditions for combining segments
-            can_combine = (
-                segment["speaker"] == current_group["speaker"] and
-                time_gap <= max_gap and
-                current_duration < max_duration and
-                not re.search(sentence_end_pattern, current_group["text"])
-            )
-
-            if can_combine:
-                # Merge segments
-                current_group["end"] = segment["end"]
-                current_group["text"] += " " + segment["text"]
-                current_group["duration"] = current_group["end"] - current_group["start"]
-            else:
-                # Start new group
-                grouped_segments.append(current_group)
-                current_group = segment.copy()
-
-        grouped_segments.append(current_group)
-
-        # Clean up text
-        for segment in grouped_segments:
-            segment["text"] = re.sub(r"\s+", " ", segment["text"]).strip()
-            segment["text"] = re.sub(r"\s+([.,!?])", r"\1", segment["text"])
-
-        return grouped_segments
-
     @spaces.GPU
     def process_audio(self, audio_file, num_speakers=None, language=None,
                       translate=False, prompt=None, group_segments=True):
-        """Main processing function"""
+        """Main processing function - diarization first, then transcription"""
         if audio_file is None:
             return {"error": "No audio file provided"}
 
         try:
-            #self.setup_models()
-
-                transcription_segments, diarization_segments
-            )
-
-            # Group segments if requested
-            if group_segments:
-                final_segments = self.group_segments_by_speaker(final_segments)
-
-            return {
-                "segments": final_segments,
-                "language": detected_language,
-                "num_speakers": detected_num_speakers or 1,
-                "total_segments": len(final_segments)
-            }
-
-        finally:
-            # Clean up temporary file
-            if os.path.exists(audio_file):
-                os.unlink(audio_file)
+            print("Starting new processing pipeline...")
+
+            # Step 1: Perform diarization first
+            diarization_segments, detected_num_speakers = self.perform_diarization(
+                audio_file, num_speakers
+            )
+
+            # Step 2: Cut audio into segments based on diarization
+            audio_segments = self.cut_audio_segments(audio_file, diarization_segments)
+
+            # Step 3: Transcribe each segment
+            transcription_results = self.transcribe_audio_segments(
+                audio_segments, language, translate, prompt
+            )
+
+            # Step 4: Return in requested format
+            return {
+                "speaker_count": detected_num_speakers,
+                "transcription": transcription_results
+            }
 
         except Exception as e:
             import traceback
             traceback.print_exc()
@@ -320,21 +254,19 @@ def format_segments_for_display(result):
     if "error" in result:
         return f"❌ Error: {result['error']}"
 
-    num_speakers = result.get("num_speakers", 1)
+    speaker_count = result.get("speaker_count", 1)
+    transcription = result.get("transcription", [])
 
     output = f"🎯 **Detection Results:**\n"
-    output += f"-
-    output += f"-
-    output += f"- Segments: {len(segments)}\n\n"
+    output += f"- Speakers: {speaker_count}\n"
+    output += f"- Segments: {len(transcription)}\n\n"
 
     output += "📝 **Transcription:**\n\n"
 
-    for i, segment in enumerate(
-        start_time = str(datetime.timedelta(seconds=int(segment["
-        end_time = str(datetime.timedelta(seconds=int(segment["
-        speaker = segment.get("
+    for i, segment in enumerate(transcription, 1):
+        start_time = str(datetime.timedelta(seconds=int(segment["start_time"])))
+        end_time = str(datetime.timedelta(seconds=int(segment["end_time"])))
+        speaker = segment.get("speaker_label", "SPEAKER_00")
         text = segment["text"]
 
         output += f"**{speaker}** ({start_time} → {end_time})\n"