import torch
from loguru import logger
from sentence_transformers import SentenceTransformer

from hfendpoints import EndpointConfig, Handler, __version__
from hfendpoints.openai import Context, run
from hfendpoints.openai.embeddings import EmbeddingEndpoint, EmbeddingRequest, EmbeddingResponse


class SentenceTransformerHandler(Handler):
    __slots__ = ("_config", "_model", "_model_config", "_model_name")

    def __init__(self, config: EndpointConfig):
        self._config = config
        # Load the model on CPU in bfloat16 to cut memory usage and speed up inference.
        self._model = SentenceTransformer(
            config.model_id,
            device="cpu",
            model_kwargs={"torch_dtype": "bfloat16"},
        )
        # The first module of a SentenceTransformer pipeline is the underlying transformer.
        self._model_config = self._model[0].auto_model.config
        self._model_name = config.model_id

        self._post_init()

    def _post_init(self):
        # channels_last memory format tends to perform better with oneDNN CPU kernels.
        self._model = self._model.to(memory_format=torch.channels_last)

    @torch.compile
    def forward(self, documents: str | list[str]):
        # TODO: Ask Tom how to do this better without tokenizing twice?
        tokens = self._model.tokenize(documents)
        vectors = self._model.encode(
            documents,
            output_value="sentence_embedding",
            normalize_embeddings=True,
        )
        return tokens, vectors

    async def __call__(self, request: EmbeddingRequest, ctx: Context) -> EmbeddingResponse:
        # Emit oneDNN kernel-creation logs, disable autograd, and run under bfloat16 autocast.
        with torch.backends.mkldnn.verbose(torch.backends.mkldnn.VERBOSE_ON_CREATION):
            with torch.inference_mode():
                with torch.amp.autocast("cpu", dtype=torch.bfloat16, cache_enabled=True):
                    tokens, vectors = self.forward(request.input)
                    # TODO: attach the computed embeddings and token usage to the
                    # response; as written, only the model name is returned.
                    return EmbeddingResponse(model=self._model_name)


def entrypoint():
    config = EndpointConfig.from_env()
    logger.info(f"[Hugging Face Endpoint v{__version__}] Serving: {config.model_id}")

    endpoint = EmbeddingEndpoint(SentenceTransformerHandler(config))
    run(endpoint, config.interface, config.port)


if __name__ == "__main__":
    entrypoint()
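
# A minimal sketch of exercising the endpoint once it is running, assuming it
# exposes the OpenAI-compatible /v1/embeddings route and that
# config.interface/config.port resolve to localhost:8000 (both are
# assumptions; adjust to your deployment):
#
#   curl http://localhost:8000/v1/embeddings \
#     -H "Content-Type: application/json" \
#     -d '{"model": "<your-model-id>", "input": "Hello, world!"}'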