import torch
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
from huggingface_hub.inference_api import InferenceApi
from typing import Dict, List, Any


class EndpointHandler:
    """Inference-endpoint handler that synthesizes speech from text.

    Delegates text-to-speech to the hosted ``microsoft/speecht5_tts`` model
    through the Hugging Face Inference API rather than running it locally.
    """

    def __init__(self, path=""):
        # `path` is part of the HF endpoint-handler contract; it is unused here
        # because inference is delegated to the remote API, not a local model.
        self.api = InferenceApi(repo_id="microsoft/speecht5_tts", task="text-to-speech")
        # Speaker x-vector embeddings; index 7306 (used below) picks one fixed voice.
        self.embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

    def __call__(self, data):
        """Handle a single inference request.

        Parameters:
            data: request payload; the text to synthesize is read from the
                "inputs" key (defaults to "" when absent).

        Returns:
            dict with "statusCode" 200 and a "body" holding the raw audio
            samples and sampling rate exactly as returned by the remote model.
        """
        text = data.get("inputs", "")

        # Extract speaker_embedding using the index from your dataset, or
        # replace with your own logic.
        speaker_embedding = torch.tensor(self.embeddings_dataset[7306]["xvector"]).unsqueeze(0)
        # Convert embedding to list to avoid serialization issues
        speaker_embedding_list = speaker_embedding.tolist()

        # Use the API to run the model.
        # NOTE(review): the speaker embedding above is computed but never sent —
        # the `parameters=` line below was deliberately disabled, so the remote
        # model's default voice is used. Confirm whether this is intended.
        # parameters={"forward_params": {"speaker_embeddings": speaker_embedding_list}}
        response = self.api(inputs=text, options={"wait_for_model": True})

        # Write the response audio to a file.
        # Note: this might not be possible in all environments; ensure this is
        # suitable for your deployment.
        sf.write("speech.wav", response["audio"], samplerate=response["sampling_rate"])

        # Return the expected response format.
        return {
            "statusCode": 200,
            "body": {
                "audio": response["audio"],  # Consider encoding this to a suitable format
                "sampling_rate": response["sampling_rate"],
            },
        }


handler = EndpointHandler()