File size: 1,607 Bytes
aaf0168
d65f95a
 
aaf0168
d65f95a
7b5f670
 
aaf0168
 
7b5f670
d65f95a
aaf0168
 
9f3bcf5
402e7d8
aaf0168
d65f95a
aaf0168
d65f95a
 
aaf0168
a4c5e15
d65f95a
a4c5e15
 
aaf0168
d65f95a
 
 
74cf751
d65f95a
 
aaf0168
d65f95a
 
 
 
aaf0168
 
d65f95a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import torch
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
from huggingface_hub.inference_api import InferenceApi
from typing import Dict, List, Any


class EndpointHandler:
    """Inference endpoint handler that forwards text-to-speech requests
    to the hosted ``microsoft/speecht5_tts`` model via the HF Inference API.
    """

    def __init__(self, path: str = ""):
        """Create the API client and pre-compute the speaker embedding.

        Args:
            path: Unused; kept for compatibility with the standard
                EndpointHandler constructor signature.
        """
        self.api = InferenceApi(repo_id="microsoft/speecht5_tts", task="text-to-speech")
        self.embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        # Pre-compute the speaker embedding once here instead of rebuilding
        # the tensor on every request. Index 7306 selects a fixed voice;
        # replace with your own selection logic if needed.
        # NOTE(review): currently unused by the API call below (the
        # forward_params hookup is commented out) — kept for when it is wired in.
        self.speaker_embedding = torch.tensor(
            self.embeddings_dataset[7306]["xvector"]
        ).unsqueeze(0)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Synthesize speech for ``data["inputs"]``.

        Args:
            data: Request payload; the text to synthesize is read from
                the ``"inputs"`` key.

        Returns:
            A dict with ``statusCode`` and a ``body`` holding the audio
            samples and sampling rate, or an error message on bad input.
        """
        text = data.get("inputs", "")
        if not text:
            # Guard: don't send an empty request to the remote model.
            return {
                "statusCode": 400,
                "body": {"error": "No 'inputs' text provided."},
            }

        # Run the model remotely; wait_for_model blocks until it is loaded.
        # To pass the pre-computed speaker embedding once supported, use:
        # parameters={"forward_params": {"speaker_embeddings": self.speaker_embedding.tolist()}}
        response = self.api(inputs=text, options={"wait_for_model": True})

        # Persist the audio locally for inspection.
        # NOTE(review): assumes `response` is a dict with "audio" and
        # "sampling_rate" keys — the raw Inference API may return bytes
        # for audio tasks; verify against the deployed client version.
        # Writing to the CWD may fail in read-only deployment environments.
        sf.write("speech.wav", response["audio"], samplerate=response["sampling_rate"])

        # Return the expected response format.
        return {
            "statusCode": 200,
            "body": {
                "audio": response["audio"],  # Consider base64-encoding for JSON transport
                "sampling_rate": response["sampling_rate"],
            },
        }

handler = EndpointHandler()