from typing import Any, Dict

import soundfile as sf
import torch
from datasets import load_dataset
from huggingface_hub.inference_api import InferenceApi


class EndpointHandler:
    def __init__(self, path=""):
        # Client for the hosted Inference API serving the SpeechT5 text-to-speech checkpoint.
        self.api = InferenceApi(repo_id="microsoft/speecht5_tts", task="text-to-speech")
        # CMU ARCTIC x-vectors: speaker embeddings that give SpeechT5 a specific voice.
        self.embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        # Pre-select one x-vector (index 7306, the voice used in the SpeechT5 examples). The
        # remote API call below does not consume it; it is kept for a variant of this handler
        # that runs SpeechT5 locally and passes it as `speaker_embeddings`.
        self.speaker_embedding = torch.tensor(self.embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        text = data.get("inputs", "")

        # Forward the text to the hosted model. InferenceApi already sends
        # wait_for_model=True by default, so no extra options are needed here.
        response = self.api(inputs=text)

        # Write the waveform to disk for inspection. This assumes the endpoint returns a JSON
        # payload with "audio" (waveform samples) and "sampling_rate" keys; adjust the parsing
        # if your deployment returns raw audio bytes instead.
        sf.write("speech.wav", response["audio"], samplerate=response["sampling_rate"])

        return {
            "statusCode": 200,
            "body": {
                "audio": response["audio"],
                "sampling_rate": response["sampling_rate"],
            },
        }


handler = EndpointHandler()
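

# Minimal local smoke test (an assumption, not part of the Inference Endpoints contract):
# running this module directly exercises the handler with the same payload shape the
# serving toolkit sends, i.e. {"inputs": "<text to synthesize>"}. The sentence is illustrative.
if __name__ == "__main__":
    result = handler({"inputs": "Hello, this is a test of the SpeechT5 handler."})
    print(result["statusCode"], result["body"]["sampling_rate"])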