Morgan Funtowicz
committed on
Commit
·
49b79e1
1
Parent(s):
5460614
feat(embedding): move to common I/O representation
Browse files- handler.py +13 -13
handler.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
| 1 |
import platform
|
| 2 |
-
from
|
| 3 |
-
from typing import Generator, Tuple
|
| 4 |
|
|
|
|
| 5 |
import torch
|
| 6 |
from hfendpoints.http import Context, run
|
| 7 |
-
from hfendpoints.
|
|
|
|
|
|
|
| 8 |
from intel_extension_for_pytorch.cpu.runtime import pin
|
| 9 |
from loguru import logger
|
| 10 |
from sentence_transformers import SentenceTransformer
|
|
@@ -45,7 +47,6 @@ def get_cores_pinning_strategy() -> "CPUPool":
|
|
| 45 |
|
| 46 |
logger.info(f"Pinning CPU cores to {pinned_cpu_cores_id}")
|
| 47 |
return ipex.cpu.runtime.CPUPool(pinned_cpu_cores_id)
|
| 48 |
-
# return ipex.cpu.runtime.CPUPool(node_id=0)
|
| 49 |
|
| 50 |
|
| 51 |
def get_usage(mask: torch.IntTensor) -> Usage:
|
|
@@ -54,7 +55,7 @@ def get_usage(mask: torch.IntTensor) -> Usage:
|
|
| 54 |
:param mask: Attention mask tensor, as returned by the model
|
| 55 |
:return: Usage object matching OpenAI specifications
|
| 56 |
"""
|
| 57 |
-
num_tokens =
|
| 58 |
return Usage(prompt_tokens=num_tokens, total_tokens=num_tokens)
|
| 59 |
|
| 60 |
|
|
@@ -65,9 +66,12 @@ class SentenceTransformerWithUsage(Module):
|
|
| 65 |
super().__init__()
|
| 66 |
self._model = model
|
| 67 |
|
| 68 |
-
def forward(self, sentences: list[str]) -> Tuple[
|
| 69 |
vectors = self._model.encode(sentences, output_value=None)
|
| 70 |
-
return
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
|
| 73 |
class SentenceTransformerHandler(Handler):
|
|
@@ -119,13 +123,9 @@ class SentenceTransformerHandler(Handler):
|
|
| 119 |
with pin(self._pinned_cores):
|
| 120 |
mask, vectors = self._model(request.input if request.is_batched else [request.input])
|
| 121 |
|
| 122 |
-
|
| 123 |
-
for (index, embedding) in enumerate(vectors):
|
| 124 |
-
embedding = Embedding(index=index, embedding=embedding.tolist())
|
| 125 |
-
embeddings[index] = embedding
|
| 126 |
-
|
| 127 |
usage = get_usage(mask)
|
| 128 |
-
return EmbeddingResponse(
|
| 129 |
|
| 130 |
|
| 131 |
def entrypoint():
|
|
|
|
| 1 |
import platform
|
| 2 |
+
from typing import Tuple, List
|
|
|
|
| 3 |
|
| 4 |
+
import numpy as np
|
| 5 |
import torch
|
| 6 |
from hfendpoints.http import Context, run
|
| 7 |
+
from hfendpoints.openai.embedding import EmbeddingEndpoint
|
| 8 |
+
from hfendpoints.tasks import Usage
|
| 9 |
+
from hfendpoints.tasks.embedding import EmbeddingRequest, EmbeddingResponse
|
| 10 |
from intel_extension_for_pytorch.cpu.runtime import pin
|
| 11 |
from loguru import logger
|
| 12 |
from sentence_transformers import SentenceTransformer
|
|
|
|
| 47 |
|
| 48 |
logger.info(f"Pinning CPU cores to {pinned_cpu_cores_id}")
|
| 49 |
return ipex.cpu.runtime.CPUPool(pinned_cpu_cores_id)
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
def get_usage(mask: torch.IntTensor) -> Usage:
|
|
|
|
| 55 |
:param mask: Attention mask tensor, as returned by the model
|
| 56 |
:return: Usage object matching OpenAI specifications
|
| 57 |
"""
|
| 58 |
+
num_tokens = np.sum(mask)
|
| 59 |
return Usage(prompt_tokens=num_tokens, total_tokens=num_tokens)
|
| 60 |
|
| 61 |
|
|
|
|
| 66 |
super().__init__()
|
| 67 |
self._model = model
|
| 68 |
|
| 69 |
+
def forward(self, sentences: list[str]) -> Tuple[List[List[int]], List[List[float]]]:
    """Encode a batch of sentences and return their attention masks and embeddings.

    :param sentences: Batch of input sentences to embed
    :return: Tuple of (attention masks, sentence embeddings), one inner list
             per input sentence. Masks are 0/1 ints; embeddings are floats
             (the previous annotation incorrectly declared them as ints).
    """
    # output_value=None makes SentenceTransformer.encode() return, per sentence,
    # a dict of all output features (including 'sentence_embedding' and
    # 'attention_mask') instead of just the embedding tensor.
    vectors = self._model.encode(sentences, output_value=None)
    return (
        [vector['attention_mask'].tolist() for vector in vectors],
        [vector['sentence_embedding'].tolist() for vector in vectors]
    )
|
| 75 |
|
| 76 |
|
| 77 |
class SentenceTransformerHandler(Handler):
|
|
|
|
| 123 |
with pin(self._pinned_cores):
|
| 124 |
mask, vectors = self._model(request.input if request.is_batched else [request.input])
|
| 125 |
|
| 126 |
+
# TODO: Change the way we return usage
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
usage = get_usage(mask)
|
| 128 |
+
return EmbeddingResponse(embeddings=vectors, num_tokens=usage.total_tokens)
|
| 129 |
|
| 130 |
|
| 131 |
def entrypoint():
|