Morgan Funtowicz committed on
Commit
49b79e1
·
1 Parent(s): 5460614

feat(embedding): move to common I/O representation

Browse files
Files changed (1) hide show
  1. handler.py +13 -13
handler.py CHANGED
@@ -1,10 +1,12 @@
1
  import platform
2
- from operator import itemgetter
3
- from typing import Generator, Tuple
4
 
 
5
  import torch
6
  from hfendpoints.http import Context, run
7
- from hfendpoints.http.embeddings import Embedding, EmbeddingEndpoint, EmbeddingRequest, EmbeddingResponse, Usage
 
 
8
  from intel_extension_for_pytorch.cpu.runtime import pin
9
  from loguru import logger
10
  from sentence_transformers import SentenceTransformer
@@ -45,7 +47,6 @@ def get_cores_pinning_strategy() -> "CPUPool":
45
 
46
  logger.info(f"Pinning CPU cores to {pinned_cpu_cores_id}")
47
  return ipex.cpu.runtime.CPUPool(pinned_cpu_cores_id)
48
- # return ipex.cpu.runtime.CPUPool(node_id=0)
49
 
50
 
51
  def get_usage(mask: torch.IntTensor) -> Usage:
@@ -54,7 +55,7 @@ def get_usage(mask: torch.IntTensor) -> Usage:
54
  :param mask: Attention mask tensor, as returned by the model
55
  :return: Usage object matching OpenAI specifications
56
  """
57
- num_tokens = sum(m.sum().item() for m in mask)
58
  return Usage(prompt_tokens=num_tokens, total_tokens=num_tokens)
59
 
60
 
@@ -65,9 +66,12 @@ class SentenceTransformerWithUsage(Module):
65
  super().__init__()
66
  self._model = model
67
 
68
- def forward(self, sentences: list[str]) -> Tuple[Generator[torch.Tensor], Generator[torch.Tensor]]:
69
  vectors = self._model.encode(sentences, output_value=None)
70
- return map(itemgetter('attention_mask'), vectors), map(itemgetter('sentence_embedding'), vectors)
 
 
 
71
 
72
 
73
  class SentenceTransformerHandler(Handler):
@@ -119,13 +123,9 @@ class SentenceTransformerHandler(Handler):
119
  with pin(self._pinned_cores):
120
  mask, vectors = self._model(request.input if request.is_batched else [request.input])
121
 
122
- embeddings = [None] * len(request)
123
- for (index, embedding) in enumerate(vectors):
124
- embedding = Embedding(index=index, embedding=embedding.tolist())
125
- embeddings[index] = embedding
126
-
127
  usage = get_usage(mask)
128
- return EmbeddingResponse(model=self._model_name, embeddings=embeddings, usage=usage)
129
 
130
 
131
  def entrypoint():
 
1
  import platform
2
+ from typing import Tuple, List
 
3
 
4
+ import numpy as np
5
  import torch
6
  from hfendpoints.http import Context, run
7
+ from hfendpoints.openai.embedding import EmbeddingEndpoint
8
+ from hfendpoints.tasks import Usage
9
+ from hfendpoints.tasks.embedding import EmbeddingRequest, EmbeddingResponse
10
  from intel_extension_for_pytorch.cpu.runtime import pin
11
  from loguru import logger
12
  from sentence_transformers import SentenceTransformer
 
47
 
48
  logger.info(f"Pinning CPU cores to {pinned_cpu_cores_id}")
49
  return ipex.cpu.runtime.CPUPool(pinned_cpu_cores_id)
 
50
 
51
 
52
  def get_usage(mask: torch.IntTensor) -> Usage:
 
55
  :param mask: Attention mask tensor, as returned by the model
56
  :return: Usage object matching OpenAI specifications
57
  """
58
+ num_tokens = np.sum(mask)
59
  return Usage(prompt_tokens=num_tokens, total_tokens=num_tokens)
60
 
61
 
 
66
  super().__init__()
67
  self._model = model
68
 
69
+ def forward(self, sentences: list[str]) -> Tuple[List[List[int]], List[List[int]]]:
70
  vectors = self._model.encode(sentences, output_value=None)
71
+ return (
72
+ [vector['attention_mask'].tolist() for vector in vectors],
73
+ [vector['sentence_embedding'].tolist() for vector in vectors]
74
+ )
75
 
76
 
77
  class SentenceTransformerHandler(Handler):
 
123
  with pin(self._pinned_cores):
124
  mask, vectors = self._model(request.input if request.is_batched else [request.input])
125
 
126
+ # TODO: Change the way we return usage
 
 
 
 
127
  usage = get_usage(mask)
128
+ return EmbeddingResponse(embeddings=vectors, num_tokens=usage.total_tokens)
129
 
130
 
131
  def entrypoint():