liuyang committed on
Commit
5a14daf
·
1 Parent(s): 5b655f4

Update waveform handling in WhisperTranscriber to maintain channel dimension during embedding calculations. Adjust comments for clarity on input shape requirements.

Browse files
Files changed (1) hide show
  1. app.py +4 -3
app.py CHANGED
@@ -385,8 +385,8 @@ class WhisperTranscriber:
385
  speaker_embeddings = {}
386
  try:
387
  embedder = self._load_embedder()
388
- # waveform is (1, T); embedder expects mono 1D
389
- emb = embedder({"waveform": waveform.squeeze(0), "sample_rate": sample_rate})
390
  speaker_embeddings["SPEAKER_00"] = emb.squeeze().tolist()
391
  except Exception:
392
  pass
@@ -431,7 +431,8 @@ class WhisperTranscriber:
431
  start_sample = int(float(turn.start) * sample_rate)
432
  end_sample = int(float(turn.end) * sample_rate)
433
  if end_sample > start_sample:
434
- seg_wav = waveform[0, start_sample:end_sample].contiguous()
 
435
  emb = embedder({"waveform": seg_wav, "sample_rate": sample_rate})
436
  spk_to_embs[speaker].append(emb.squeeze())
437
  # average
 
385
  speaker_embeddings = {}
386
  try:
387
  embedder = self._load_embedder()
388
+ # Provide waveform as (channel, time)
389
+ emb = embedder({"waveform": waveform, "sample_rate": sample_rate})
390
  speaker_embeddings["SPEAKER_00"] = emb.squeeze().tolist()
391
  except Exception:
392
  pass
 
431
  start_sample = int(float(turn.start) * sample_rate)
432
  end_sample = int(float(turn.end) * sample_rate)
433
  if end_sample > start_sample:
434
+ # Keep channel dimension: (channel, time)
435
+ seg_wav = waveform[:, start_sample:end_sample].contiguous()
436
  emb = embedder({"waveform": seg_wav, "sample_rate": sample_rate})
437
  spk_to_embs[speaker].append(emb.squeeze())
438
  # average