Morgan Funtowicz committed
Commit · 5e1abf0
1 Parent(s): 6ce5654

feat(embeddings): do not tokenize twice

handler.py  +61 -20

handler.py CHANGED
@@ -1,13 +1,17 @@
 import platform
-from …
+from functools import reduce
+from operator import itemgetter
+from typing import Generator, Tuple
 
 import torch
 from hfendpoints.openai import Context, run
 from hfendpoints.openai.embeddings import Embedding, EmbeddingEndpoint, EmbeddingRequest, EmbeddingResponse, Usage
-from …
+from intel_extension_for_pytorch.cpu.runtime import pin
 from loguru import logger
-from …
+from hfendpoints import EndpointConfig, Handler, __version__
 from sentence_transformers import SentenceTransformer
+from torch.nn import Module
+from torch.backends.mkldnn import VERBOSE_ON_CREATION, VERBOSE_OFF
 
 # Not used for now
 SUPPORTED_AMP_DTYPES = {torch.float32, torch.bfloat16}
@@ -27,17 +31,47 @@ def has_bf16_support() -> bool:
     return torch.cpu._is_avx512_bf16_supported() or torch.cpu._is_avx512_supported()
 
 
-def …
+def get_cores_pinning_strategy() -> "CPUPool":
+    import intel_extension_for_pytorch as ipex
+
+    # Retrieve the number of nodes
+    num_nodes = ipex.cpu.runtime.runtime_utils.get_num_nodes()
+    cpu_cores_id = [ipex.cpu.runtime.runtime_utils.get_core_list_of_node_id(node_id) for node_id in range(num_nodes)]
+
+    if num_nodes == 1:
+        pinned_cpu_cores_id = cpu_cores_id[0]
+    else:
+        pinned_cpu_cores_id = [core_id for node in cpu_cores_id for core_id in node]
+
+    logger.info(f"Pinning CPU cores to {pinned_cpu_cores_id}")
+    return ipex.cpu.runtime.CPUPool(pinned_cpu_cores_id)
+    # return ipex.cpu.runtime.CPUPool(node_id=0)
+
+
+def get_usage(mask: torch.IntTensor) -> Usage:
     """
     Compute the number of processed tokens and return as Usage object matching OpenAI
-    :param …
+    :param mask: Attention mask tensor, as returned by the model
     :return: Usage object matching OpenAI specifications
     """
-    num_tokens = …
+    num_tokens = sum(m.sum().item() for m in mask)
     return Usage(prompt_tokens=num_tokens, total_tokens=num_tokens)
 
+
+class SentenceTransformerWithUsage(Module):
+    __slots__ = ("_model", )
+
+    def __init__(self, model: SentenceTransformer):
+        super().__init__()
+        self._model = model
+
+    def forward(self, sentences: list[str]) -> Tuple[Generator[torch.Tensor], Generator[torch.Tensor]]:
+        vectors = self._model.encode(sentences, output_value=None)
+        return map(itemgetter('attention_mask'), vectors), map(itemgetter('sentence_embedding'), vectors)
+
+
 class SentenceTransformerHandler(Handler):
-    __slots__ = ("_config", "_dtype", "_model", "_model_name", "_use_amp")
+    __slots__ = ("_config", "_dtype", "_model", "_model_name", "_pinned_cores", "_use_amp")
 
     def __init__(self, config: EndpointConfig):
         self._config = config
@@ -47,44 +81,51 @@ class SentenceTransformerHandler(Handler):
         self._allocate_model()
 
     def _allocate_model(self):
-        # Denormal number is used to store …
+        # Denormal number is used to store tiny numbers that are close to 0.
         # Computations with denormal numbers are remarkably slower than normalized number.
         torch.set_flush_denormal(True)
 
         dtype = torch.bfloat16 if has_bf16_support() else torch.float32
         model = SentenceTransformer(self._config.model_id, device="cpu", model_kwargs={"torch_dtype": dtype})
 
+
         if platform.machine() == "x86_64":
             import intel_extension_for_pytorch as ipex
             logger.info(f"x64 platform detected: {platform.processor()}")
 
+            # Retrieve all the physical cores ID for all the CPU nodes
+            self._pinned_cores = get_cores_pinning_strategy()
+
+            # Optimize the model for inference
             with torch.inference_mode():
                 model = model.eval()
                 model = model.to(memory_format=torch.channels_last)
-…
+
+                # Apply IPEx optimizations
+                model = ipex.optimize(model, dtype=dtype, weights_prepack=True, graph_mode=True, concat_linear=True)
                 model = torch.compile(model, dynamic=True, backend="ipex")
+
+                # model = ipex.cpu.runtime.MultiStreamModule(SentenceTransformerWithUsage(model), num_streams=1)
+
         else:
             model = torch.compile(model)
 
-        self._model = model
         self._dtype = dtype
         self._use_amp = dtype in SUPPORTED_AMP_DTYPES
+        self._model = SentenceTransformerWithUsage(model)
 
     async def __call__(self, request: EmbeddingRequest, ctx: Context) -> EmbeddingResponse:
         with torch.backends.mkldnn.verbose(VERBOSE_ON_CREATION if self._config.is_debug else VERBOSE_OFF):
             with torch.inference_mode(), torch.amp.autocast("cpu", dtype=self._dtype, enabled=self._use_amp):
-…
-…
+                with pin(self._pinned_cores):
+                    mask, vectors = self._model(request.input if request.is_batched else [request.input])
 
                 embeddings = [None] * len(request)
-…
-…
-…
-…
-…
-                    embeddings[index] = embedding
-…
-                usage = get_usage(tokens)
+                for (index, embedding) in enumerate(vectors):
+                    embedding = Embedding(index=index, embedding=embedding.tolist())
+                    embeddings[index] = embedding
+
+                usage = get_usage(mask)
                 return EmbeddingResponse(model=self._model_name, embeddings=embeddings, usage=usage)
 
 
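The core of the "do not tokenize twice" change is visible in SentenceTransformerWithUsage.forward: calling SentenceTransformer.encode with output_value=None returns, for every input sentence, a dict that already contains both the pooled sentence_embedding and the attention_mask produced during encoding, so token usage can be counted from that mask instead of tokenizing the request a second time. A minimal standalone sketch of that pattern follows; it is not part of the commit, and the model id is only an example.

# Minimal sketch (not from this commit): count tokens from the outputs of a single
# encode() call instead of running the tokenizer again.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")  # example model id

# output_value=None returns one dict per sentence, including 'sentence_embedding'
# and 'attention_mask'.
outputs = model.encode(["first sentence", "a second, slightly longer sentence"], output_value=None)

embeddings = [o["sentence_embedding"].tolist() for o in outputs]
num_tokens = sum(o["attention_mask"].sum().item() for o in outputs)
print(len(embeddings), num_tokens)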
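get_usage simply sums the attention-mask entries across all sentences, so padding positions are not counted. A small worked example with made-up masks:

# Illustration only: usage accounting from attention masks.
import torch

# Two sentences padded to length 4: 3 real tokens in the first, 2 in the second.
masks = [torch.tensor([1, 1, 1, 0]), torch.tensor([1, 1, 0, 0])]
num_tokens = sum(m.sum().item() for m in masks)  # 3 + 2 = 5
# get_usage(masks) would therefore return Usage(prompt_tokens=5, total_tokens=5).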
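get_cores_pinning_strategy builds an intel_extension_for_pytorch CPUPool from the core ids of every NUMA node, and __call__ enters it with pin() so the encode pass runs on those physical cores. A minimal sketch of the same IPEX runtime pattern, assuming intel_extension_for_pytorch is installed; here the pool is built from node 0 only, like the commented-out variant in the diff:

# Illustration only: pin a region of work to a fixed set of CPU cores with IPEX.
import torch
import intel_extension_for_pytorch as ipex
from intel_extension_for_pytorch.cpu.runtime import pin

pool = ipex.cpu.runtime.CPUPool(node_id=0)  # cores of NUMA node 0

with torch.inference_mode(), pin(pool):
    # Work executed here is scheduled on the pool's cores only.
    y = torch.nn.functional.relu(torch.randn(8, 128))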
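Finally, _allocate_model combines ipex.optimize with torch.compile's "ipex" backend on x86_64. The sketch below reduces that to its two main steps on a toy module; it is an illustration rather than the handler's code, and it assumes a CPU where bfloat16 is usable (the handler checks this with has_bf16_support).

# Illustration only: IPEX optimization followed by torch.compile with the IPEX backend.
import torch
import intel_extension_for_pytorch as ipex

model = torch.nn.Sequential(torch.nn.Linear(128, 256), torch.nn.ReLU()).eval()

# 1. Let IPEX rewrite the module (weight prepacking, operator fusion, bf16 weights).
model = ipex.optimize(model, dtype=torch.bfloat16)

# 2. Compile through TorchDynamo with the backend registered by IPEX.
model = torch.compile(model, dynamic=True, backend="ipex")

with torch.inference_mode(), torch.amp.autocast("cpu", dtype=torch.bfloat16):
    out = model(torch.randn(4, 128))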