liuyang committed on
Commit
f800f63
·
1 Parent(s): aa984fe

Enhance speaker assignment in transcription: introduce interval-overlap calculations and smoothing techniques to improve the accuracy of speaker labeling. Add logic for determining the dominant speaker of a segment and for stabilizing segment boundaries.

Browse files
Files changed (1) hide show
  1. app.py +98 -25
app.py CHANGED
@@ -568,41 +568,114 @@ class WhisperTranscriber:
568
  """Assign speakers to words and segments based on overlap with diarization segments."""
569
  if not diarization_segments:
570
  return transcription_results
571
- # simple helper to find speaker at given time
572
  def speaker_at(t: float):
573
- for seg in diarization_segments:
574
- if seg["start"] <= t < seg["end"]:
575
- return seg["speaker"]
576
  # if not inside, return closest segment's speaker
577
  closest = None
578
- best = float("inf")
579
- for seg in diarization_segments:
580
- if t < seg["start"]:
581
- d = seg["start"] - t
582
- elif t > seg["end"]:
583
- d = t - seg["end"]
584
  else:
585
  d = 0.0
586
- if d < best:
587
- best = d
588
- closest = seg
589
  return closest["speaker"] if closest else "SPEAKER_00"
590
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
591
  for seg in transcription_results:
592
- # Assign per-word speakers
593
  if seg.get("words"):
594
- speaker_counts = {}
595
- for w in seg["words"]:
596
- mid = (float(w["start"]) + float(w["end"])) / 2.0
597
- spk = speaker_at(mid)
598
- w["speaker"] = spk
599
- speaker_counts[spk] = speaker_counts.get(spk, 0) + (float(w["end"]) - float(w["start"]))
600
- # Segment speaker = speaker with max accumulated word duration
601
- if speaker_counts:
602
- seg["speaker"] = max(speaker_counts.items(), key=lambda kv: kv[1])[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
603
  else:
604
- mid = (float(seg["start"]) + float(seg["end"])) / 2.0
605
- seg["speaker"] = speaker_at(mid)
606
  return transcription_results
607
 
608
  def group_segments_by_speaker(self, segments, max_gap=1.0, max_duration=30.0):
 
568
  """Assign speakers to words and segments based on overlap with diarization segments."""
569
  if not diarization_segments:
570
  return transcription_results
571
+ # Helper: find the diarization speaker active at time t, or closest
572
  def speaker_at(t: float):
573
+ for dseg in diarization_segments:
574
+ if float(dseg["start"]) <= t < float(dseg["end"]):
575
+ return dseg["speaker"]
576
  # if not inside, return closest segment's speaker
577
  closest = None
578
+ best_dist = float("inf")
579
+ for dseg in diarization_segments:
580
+ if t < float(dseg["start"]):
581
+ d = float(dseg["start"]) - t
582
+ elif t > float(dseg["end"]):
583
+ d = t - float(dseg["end"])
584
  else:
585
  d = 0.0
586
+ if d < best_dist:
587
+ best_dist = d
588
+ closest = dseg
589
  return closest["speaker"] if closest else "SPEAKER_00"
590
 
591
+ # Helper: overlap length between two intervals
592
+ def interval_overlap(a_start: float, a_end: float, b_start: float, b_end: float) -> float:
593
+ return max(0.0, min(a_end, b_end) - max(a_start, b_start))
594
+
595
+ # Helper: choose speaker for an interval by maximum overlap with diarization
596
+ def best_speaker_for_interval(start_t: float, end_t: float) -> str:
597
+ best_spk = None
598
+ best_ov = -1.0
599
+ for dseg in diarization_segments:
600
+ ov = interval_overlap(float(start_t), float(end_t), float(dseg["start"]), float(dseg["end"]))
601
+ if ov > best_ov:
602
+ best_ov = ov
603
+ best_spk = dseg["speaker"]
604
+ if best_ov > 0.0 and best_spk is not None:
605
+ return best_spk
606
+ # fallback to nearest by midpoint
607
+ mid = (float(start_t) + float(end_t)) / 2.0
608
+ return speaker_at(mid)
609
+
610
  for seg in transcription_results:
611
+ # Assign per-word speakers using overlap, then smooth and stabilize boundaries
612
  if seg.get("words"):
613
+ words = seg["words"]
614
+ # 1) Initial assignment by overlap
615
+ for w in words:
616
+ w_start = float(w["start"])
617
+ w_end = float(w["end"])
618
+ w["speaker"] = best_speaker_for_interval(w_start, w_end)
619
+
620
+ # 2) Small median filter (window=3) to fix isolated outliers
621
+ if len(words) >= 3:
622
+ smoothed = [words[i]["speaker"] for i in range(len(words))]
623
+ for i in range(1, len(words) - 1):
624
+ prev_spk = words[i - 1]["speaker"]
625
+ curr_spk = words[i]["speaker"]
626
+ next_spk = words[i + 1]["speaker"]
627
+ if prev_spk == next_spk and curr_spk != prev_spk:
628
+ smoothed[i] = prev_spk
629
+ for i in range(len(words)):
630
+ words[i]["speaker"] = smoothed[i]
631
+
632
+ # 3) Determine dominant speaker by summed word durations
633
+ speaker_dur = {}
634
+ total_word_dur = 0.0
635
+ for w in words:
636
+ dur = max(0.0, float(w["end"]) - float(w["start"]))
637
+ total_word_dur += dur
638
+ spk = w.get("speaker", "SPEAKER_00")
639
+ speaker_dur[spk] = speaker_dur.get(spk, 0.0) + dur
640
+ if speaker_dur:
641
+ dominant_speaker = max(speaker_dur.items(), key=lambda kv: kv[1])[0]
642
+ else:
643
+ dominant_speaker = speaker_at((float(seg["start"]) + float(seg["end"])) / 2.0)
644
+
645
+ # 4) Boundary stabilization: relabel tiny prefix/suffix runs to dominant
646
+ seg_duration = max(1e-6, float(seg["end"]) - float(seg["start"]))
647
+ max_boundary_sec = 0.5 # hard cap for how much to relabel at edges
648
+ max_boundary_frac = 0.2 # or up to 20% of the segment duration
649
+
650
+ # prefix
651
+ prefix_dur = 0.0
652
+ prefix_count = 0
653
+ for w in words:
654
+ if w.get("speaker") == dominant_speaker:
655
+ break
656
+ prefix_dur += max(0.0, float(w["end"]) - float(w["start"]))
657
+ prefix_count += 1
658
+ if prefix_count > 0 and prefix_dur <= min(max_boundary_sec, max_boundary_frac * seg_duration):
659
+ for i in range(prefix_count):
660
+ words[i]["speaker"] = dominant_speaker
661
+
662
+ # suffix
663
+ suffix_dur = 0.0
664
+ suffix_count = 0
665
+ for w in reversed(words):
666
+ if w.get("speaker") == dominant_speaker:
667
+ break
668
+ suffix_dur += max(0.0, float(w["end"]) - float(w["start"]))
669
+ suffix_count += 1
670
+ if suffix_count > 0 and suffix_dur <= min(max_boundary_sec, max_boundary_frac * seg_duration):
671
+ for i in range(len(words) - suffix_count, len(words)):
672
+ words[i]["speaker"] = dominant_speaker
673
+
674
+ # 5) Final segment speaker
675
+ seg["speaker"] = dominant_speaker
676
  else:
677
+ # No word timings: choose by overlap with diarization over the whole segment
678
+ seg["speaker"] = best_speaker_for_interval(float(seg["start"]), float(seg["end"]))
679
  return transcription_results
680
 
681
  def group_segments_by_speaker(self, segments, max_gap=1.0, max_duration=30.0):