Dupaja committed on
Commit
d65f95a
·
1 Parent(s): 1ddbd3e

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +21 -23
handler.py CHANGED
@@ -1,37 +1,35 @@
1
- from typing import Dict
2
- from transformers import pipeline
3
  import torch
 
 
4
  import soundfile as sf
5
- import io
6
 
7
class EndpointHandler:
    """Pre-commit revision: synthesizes speech locally via a transformers pipeline.

    NOTE(review): this revision calls ``load_dataset()`` but no import of it is
    visible in the file's import block — ``__init__`` would raise NameError.
    The replacing revision in this commit adds ``from datasets import
    load_dataset``.
    """

    def __init__(self, path=""):
        # Local text-to-speech pipeline (model weights fetched on first use).
        self.synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
        # Speaker x-vector dataset; load_dataset is NOT imported in this revision.
        self.embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """Synthesize speech for ``data["inputs"]`` and return it as WAV bytes.

        NOTE(review): the annotated return type ``Dict[str, str]`` does not
        match the actual payload (``body`` holds raw bytes, ``headers`` a
        dict) — left as written in the original.
        """
        text = data.get("inputs", "")
        # Speaker embedding: x-vector at dataset row 7306, given a leading
        # batch dimension via unsqueeze(0).
        speaker_embedding = torch.tensor(self.embeddings_dataset[7306]["xvector"]).unsqueeze(0)

        # Generate speech using the synthesiser
        speech = self.synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})

        # Convert numpy audio array to a WAV byte stream.
        audio_buffer = io.BytesIO()
        sf.write(file=audio_buffer, data=speech["audio"], samplerate=speech["sampling_rate"], format='WAV')
        audio_buffer.seek(0)
        audio_wav = audio_buffer.read()

        # Prepare the response headers.
        headers = {
            "Content-Type": "audio/wav"
        }

        # Create the response as raw audio bytes.
        response = {
            "statusCode": 200,
            "body": audio_wav,
            "headers": headers
        }

        return response
 
 
 
1
  import torch
2
+ from transformers import pipeline
3
+ from datasets import load_dataset
4
  import soundfile as sf
5
+ from huggingface_hub.inference_api import InferenceApi
6
 
7
class EndpointHandler:
    """Post-commit revision: delegates synthesis to the hosted Inference API
    instead of running a local transformers pipeline."""

    def __init__(self):
        # Remote TTS endpoint; no local model weights are loaded.
        self.api = InferenceApi(repo_id="microsoft/speecht5_tts", task="text-to-speech")
        # Speaker x-vector dataset — downloaded at construction time (network).
        self.embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

    def __call__(self, data):
        """Handle one inference request.

        Only ``data["inputs"]`` (the text to speak) is read; it defaults to
        the empty string when absent.

        NOTE(review): the code assumes the InferenceApi response is a mapping
        with "audio" and "sampling_rate" keys — confirm against the
        text-to-speech task's actual output format; some tasks return raw
        audio bytes instead.
        """
        text = data.get("inputs", "")
        # Extract speaker_embedding using the index from your dataset, or replace with your own logic.
        speaker_embedding = torch.tensor(self.embeddings_dataset[7306]["xvector"]).unsqueeze(0)
        # Convert embedding to a plain nested list so it survives JSON serialization.
        speaker_embedding_list = speaker_embedding.tolist()

        # Use the API to run the model; wait_for_model blocks until the hosted
        # model is loaded rather than failing fast.
        response = self.api(inputs=text, parameters={"forward_params": {"speaker_embeddings": speaker_embedding_list}}, options={"wait_for_model": True})

        # Write the response audio to a file
        # Note: This might not be possible in all environments (read-only or
        # ephemeral filesystems) — ensure this is suitable for your deployment.
        sf.write("speech.wav", response["audio"], samplerate=response["sampling_rate"])

        # Return the expected response format
        return {
            "statusCode": 200,
            "body": {
                "audio": response["audio"],  # Consider encoding this to a suitable format (e.g. base64 WAV)
                "sampling_rate": response["sampling_rate"]
            }
        }
34
 
35
# Module-level singleton: constructing it runs __init__, which downloads the
# embeddings dataset — network access happens at import time.
handler = EndpointHandler()