Commit 51fd668
Parent(s): 2503b95

Update asr_diarizer.py

asr_diarizer.py  CHANGED  (+82 -11)
@@ -2,8 +2,13 @@ from typing import List, Optional, Union
 
 import numpy as np
 import torch
+from torchaudio import functional as F
+
+import requests
+
 from pyannote.audio import Pipeline
 from transformers import pipeline
+from transformers.pipelines.audio_utils import ffmpeg_read
 
 
 class ASRDiarizationPipeline:
@@ -14,14 +19,16 @@ class ASRDiarizationPipeline:
     ):
         self.asr_pipeline = asr_pipeline
         self.diarization_pipeline = diarization_pipeline
+
+        self.sampling_rate = self.asr_pipeline.feature_extractor.sampling_rate
 
     @classmethod
     def from_pretrained(
         cls,
         asr_model: Optional[str] = "openai/whisper-small",
         diarizer_model: Optional[str] = "pyannote/speaker-diarization",
-        chunk_length_s: int = 30,
-        use_auth_token: Union[str, bool] = True,
+        chunk_length_s: Optional[int] = 30,
+        use_auth_token: Optional[Union[str, bool]] = True,
         **kwargs,
     ):
         asr_pipeline = pipeline(
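Because the constructor now caches `self.sampling_rate` from the ASR feature extractor, callers no longer pass a sampling rate at call time (see the `__call__` change in the next hunk). A minimal instantiation sketch, assuming the file is importable as a module named `asr_diarizer`:

from asr_diarizer import ASRDiarizationPipeline  # assumed module name, matching this file

pipe = ASRDiarizationPipeline.from_pretrained(
    asr_model="openai/whisper-small",
    diarizer_model="pyannote/speaker-diarization",
    use_auth_token=True,  # the pyannote model is gated and needs a Hugging Face token
)
print(pipe.sampling_rate)  # 16000 for Whisper checkpoints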
@@ -37,21 +44,42 @@ class ASRDiarizationPipeline:
     def __call__(
         self,
         inputs: Union[np.ndarray, List[np.ndarray]],
-        sampling_rate: int,
         group_by_speaker: bool = True,
         **kwargs,
     ):
-
-
-
-
+        """
+        Transcribe the audio sequence(s) given as inputs to text.
+
+        Args:
+            inputs (`np.ndarray` or `bytes` or `str` or `dict`):
+                The input is either:
+                    - `str`: the filename of the audio file; the file will be read at the correct sampling rate
+                      to get the waveform using *ffmpeg*. This requires *ffmpeg* to be installed on the system.
+                    - `bytes`: the content of an audio file, interpreted by *ffmpeg* in the
+                      same way.
+                    - (`np.ndarray` of shape (n,) of type `np.float32` or `np.float64`)
+                        Raw audio at the correct sampling rate (no further check will be done).
+                    - `dict` form can be used to pass raw audio sampled at an arbitrary `sampling_rate` and let this
+                      pipeline do the resampling. The dict must be in the format `{"sampling_rate": int, "raw":
+                      np.array}` with optionally a `"stride": (left: int, right: int)` that can ask the pipeline to
+                      treat the first `left` samples and last `right` samples to be ignored in decoding (but used at
+                      inference to provide more context to the model). Only use `stride` with CTC models.
+
+        Return:
+            `Dict`: A dictionary with the following keys:
+                - **text** (`str`) -- The recognized text.
+                - **chunks** (*optional*, `List[Dict]`)
+                    When using `return_timestamps`, the `chunks` will become a list containing all the various text
+                    chunks identified by the model, *e.g.* `[{"text": "hi ", "timestamp": (0.5, 0.9)}, {"text":
+                    "there", "timestamp": (1.0, 1.5)}]`. The original full text can roughly be recovered by doing
+                    `"".join(chunk["text"] for chunk in output["chunks"])`.
+        """
+        inputs, diarizer_inputs = self.preprocess(inputs)
 
-        diarizer_inputs = torch.from_numpy(inputs).float().unsqueeze(0)
         diarization = self.diarization_pipeline(
-            {"waveform": diarizer_inputs, "sample_rate": sampling_rate},
+            {"waveform": diarizer_inputs, "sample_rate": self.sampling_rate},
             **kwargs,
         )
-        del diarizer_inputs
 
         segments = diarization.for_json()["content"]
 
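The new docstring lists the accepted input types; the `dict` form is resampled internally by the `preprocess` method added at the end of this diff. A hedged calling sketch, using an illustrative 8 kHz array (the audio contents and rate are made up for the example):

import numpy as np

from asr_diarizer import ASRDiarizationPipeline  # assumed module name

pipe = ASRDiarizationPipeline.from_pretrained()  # defaults: whisper-small + pyannote diarization

# Illustrative input: 5 seconds of silence sampled at 8 kHz. preprocess() will
# resample it to pipe.sampling_rate before diarization and transcription.
raw = np.zeros(8_000 * 5, dtype=np.float32)
out = pipe({"raw": raw, "sampling_rate": 8_000}, group_by_speaker=True)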
@@ -78,7 +106,7 @@ class ASRDiarizationPipeline:
         )
 
         asr_out = self.asr_pipeline(
-            {"array": inputs, "sampling_rate": sampling_rate},
+            {"array": inputs, "sampling_rate": self.sampling_rate},
             return_timestamps=True,
             **kwargs,
         )
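For context, with `return_timestamps=True` the transformers ASR pipeline returns chunk-level timestamps, which the (unchanged) alignment code below matches against the diarization segments. Roughly, with illustrative values only:

# Approximate shape of asr_out from the transformers ASR pipeline when
# return_timestamps=True is set (text and timestamps are illustrative):
asr_out = {
    "text": " Hi there. Hello!",
    "chunks": [
        {"text": " Hi there.", "timestamp": (0.0, 1.4)},
        {"text": " Hello!", "timestamp": (1.6, 2.3)},
    ],
}
end_timestamps = [chunk["timestamp"][1] for chunk in asr_out["chunks"]]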
@@ -110,3 +138,46 @@ class ASRDiarizationPipeline:
             end_timestamps = end_timestamps[upto_idx + 1 :]
 
         return segmented_preds
+
+    def preprocess(self, inputs):
+        if isinstance(inputs, str):
+            if inputs.startswith("http://") or inputs.startswith("https://"):
+                # We need to actually check for a real protocol, otherwise it's impossible to use a local file
+                # like http_huggingface_co.png
+                inputs = requests.get(inputs).content
+            else:
+                with open(inputs, "rb") as f:
+                    inputs = f.read()
+
+        if isinstance(inputs, bytes):
+            inputs = ffmpeg_read(inputs, self.sampling_rate)
+
+        if isinstance(inputs, dict):
+            # Accepting `"array"` which is the key defined in `datasets` for better integration
+            if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)):
+                raise ValueError(
+                    "When passing a dictionary to ASRDiarizePipeline, the dict needs to contain a "
+                    '"raw" key containing the numpy array representing the audio and a "sampling_rate" key, '
+                    "containing the sampling_rate associated with that array"
+                )
+
+            _inputs = inputs.pop("raw", None)
+            if _inputs is None:
+                # Remove path which will not be used from `datasets`.
+                inputs.pop("path", None)
+                _inputs = inputs.pop("array", None)
+            in_sampling_rate = inputs.pop("sampling_rate")
+            inputs = _inputs
+            if in_sampling_rate != self.sampling_rate:
+                inputs = F.resample(
+                    torch.from_numpy(inputs), in_sampling_rate, self.sampling_rate
+                ).numpy()
+
+        if not isinstance(inputs, np.ndarray):
+            raise ValueError(f"We expect a numpy ndarray as input, got `{type(inputs)}`")
+        if len(inputs.shape) != 1:
+            raise ValueError("We expect a single channel audio input for ASRDiarizePipeline")
+
+        diarizer_inputs = torch.from_numpy(inputs).float().unsqueeze(0)
+
+        return inputs, diarizer_inputs
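The new `preprocess` method centralizes loading (URL, path, bytes, dict or raw array), resampling, and shaping for pyannote. A rough standalone equivalent for the local-file case, assuming ffmpeg is installed and a 16 kHz target rate (Whisper's default); the function name is illustrative:

import torch
from transformers.pipelines.audio_utils import ffmpeg_read

def load_for_diarization(path: str, sampling_rate: int = 16_000):
    # Decode the file straight to the target rate, as preprocess() does for bytes input.
    with open(path, "rb") as f:
        inputs = ffmpeg_read(f.read(), sampling_rate)  # mono float32 array of shape (n,)
    # pyannote expects a (channels, samples) tensor; the ASR pipeline keeps the numpy array.
    diarizer_inputs = torch.from_numpy(inputs).float().unsqueeze(0)
    return inputs, diarizer_inputs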