Morgan Funtowicz commited on
Commit
12f7a48
·
1 Parent(s): e90e6a1

initial commit

Browse files
Files changed (3) hide show
  1. Dockerfile +21 -0
  2. handler.py +121 -0
  3. requirements.txt +3 -0
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Stage 1: pull the hfendpoints SDK wheel out of the published SDK image.
ARG SDK_VERSION=latest
FROM huggingface/hfendpoints-sdk:${SDK_VERSION} AS sdk

# Stage 2: runtime image based on the vLLM OpenAI-compatible server.
FROM vllm/vllm-openai:v0.9.0.1

# Bind-mount the SDK wheels and the requirements file (nothing is copied into
# the layer), then install CPU torch, the endpoint requirements, and the SDK.
RUN --mount=type=bind,from=sdk,source=/opt/hfendpoints/dist,target=/usr/local/endpoints/dist \
    --mount=type=bind,source=requirements.txt,target=/tmp/requirements.txt \
    python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu && \
    python3 -m pip install -r /tmp/requirements.txt && \
    python3 -m pip install /usr/local/endpoints/dist/*.whl

COPY handler.py /usr/local/endpoint/

# Network interface the endpoint binds to
ENV INTERFACE=0.0.0.0 PORT=80

EXPOSE 80

ENTRYPOINT ["python3"]
CMD ["/usr/local/endpoint/handler.py"]
handler.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import List, Optional, Dict, Any
3
+
4
+ from huggingface_hub import hf_hub_download
5
+ from huggingface_hub.errors import EntryNotFoundError
6
+ from loguru import logger
7
+ from vllm import (
8
+ AsyncLLMEngine, AsyncEngineArgs,
9
+ PoolingParams, EmbeddingRequestOutput,
10
+ )
11
+
12
+ from hfendpoints import EndpointConfig, Handler, __version__
13
+ from hfendpoints.http import Context, run
14
+ from hfendpoints.tasks import Usage
15
+ from hfendpoints.tasks.embedding import EmbeddingRequest, EmbeddingResponse
16
+
17
+
18
def get_sentence_transformers_config(config: EndpointConfig) -> Optional[Dict[str, Any]]:
    """Load the model's ``config_sentence_transformers.json``, if present.

    Outside debug mode the file is first looked up inside the local
    repository folder; otherwise (or when missing locally) it is fetched
    from the Hub.

    Returns:
        The parsed JSON mapping, or ``None`` when the model repository
        ships no Sentence-Transformers configuration file.
    """
    from json import load

    local_candidate = None
    if not config.is_debug:
        local_candidate = Path(config.repository) / "config_sentence_transformers.json"

    if local_candidate is not None and local_candidate.exists():
        resolved_path = local_candidate
    else:
        # Fall back to downloading the file from the Hub.
        try:
            resolved_path = hf_hub_download(config.model_id, filename="config_sentence_transformers.json")
        except EntryNotFoundError:
            logger.info(f"Sentence Transformers config not found on {config.model_id}")
            return None

    with open(resolved_path, "r", encoding="utf-8") as config_f:
        return load(config_f)
33
+
34
+
35
class VllmEmbeddingHandler(Handler):
    """Embedding handler backed by a vLLM ``AsyncLLMEngine``.

    Optionally prepends a Sentence-Transformers prompt (read from the model
    repository's ``config_sentence_transformers.json``) to each input before
    encoding.
    """

    # Fixed: original spelled this `__slot__` (a typo Python silently
    # ignores) and listed `_sentence_transformer_config`, which does not
    # match the attribute actually assigned in __init__.
    __slots__ = ("_engine", "_sentence_transformers_config",)

    def __init__(self, config: EndpointConfig):
        """Build the async vLLM engine for the configured model repository."""
        self._sentence_transformers_config = get_sentence_transformers_config(config)
        self._engine = AsyncLLMEngine.from_engine_args(
            AsyncEngineArgs(
                str(config.repository),
                task="embed",
                device="auto",
                dtype="bfloat16",
                kv_cache_dtype="auto",
                enforce_eager=False,
                enable_prefix_caching=True,
                disable_log_requests=True,
            )
        )

    async def embeds(
            self,
            prompts: str,
            pooling: PoolingParams,
            request_id: str
    ) -> List[EmbeddingRequestOutput]:
        """Encode ``prompts`` and collect every embedding output emitted for the request."""
        outputs = []
        async for item in self._engine.encode(
                prompts,
                pooling_params=pooling,
                request_id=request_id,
                lora_request=None,
        ):
            outputs.append(EmbeddingRequestOutput.from_base(item))

        return outputs

    async def __call__(self, request: EmbeddingRequest, ctx: Context) -> EmbeddingResponse:
        """Handle a single embedding request (single input or batch).

        Recognized request parameters:
          - "dimension": forwarded as ``PoolingParams(dimensions=...)``.
            NOTE(review): OpenAI's embeddings API spells this "dimensions" —
            confirm against the hfendpoints request schema.
          - "prompt_name": name of a Sentence-Transformers prompt to prepend
            to every input, when the model ships such a config.
        """
        if "dimension" in request.parameters:
            pooling_params = PoolingParams(dimensions=request.parameters["dimension"])
        else:
            pooling_params = None

        if "prompt_name" in request.parameters and self._sentence_transformers_config:
            prompt_name = request.parameters["prompt_name"]
            tokenizer = await self._engine.get_tokenizer()
            prompt = self._sentence_transformers_config.get("prompts", {}).get(prompt_name, None)
            # NOTE(review): counted once even though the prompt is prepended
            # to every document of a batch — usage may undercount; confirm.
            num_prompt_tokens = len(tokenizer.tokenize(prompt)) if prompt else 0
        else:
            prompt = None
            num_prompt_tokens = 0

        if request.is_batched:
            embeddings = []
            num_tokens = 0
            for idx, document in enumerate(request.inputs):
                # `text` renamed from `input` (shadowed the builtin); the
                # stray debug print() is now routed through the logger.
                text = f"{prompt} {document}" if prompt else document
                logger.debug(text)

                output = await self.embeds(text, pooling_params, f"{ctx.request_id}-{idx}")
                num_tokens += len(output[0].prompt_token_ids)
                embeddings.append(output[0].outputs.embedding)
        else:
            text = f"{prompt} {request.inputs}" if prompt else request.inputs
            logger.debug(text)
            output = await self.embeds(text, pooling_params, ctx.request_id)
            num_tokens = len(output[0].prompt_token_ids)
            embeddings = output[0].outputs.embedding

        return EmbeddingResponse(embeddings, prompt_tokens=num_prompt_tokens, num_tokens=num_tokens)
103
+
104
+
105
def entrypoint():
    """Read the endpoint configuration from the environment and serve it."""
    from hfendpoints.openai.embedding import EmbeddingEndpoint

    # Endpoint configuration is provided through environment variables
    config = EndpointConfig.from_env()
    logger.info(f"[Hugging Face Endpoint v{__version__}] Serving: {config.model_id}")

    # Wire the vLLM-backed handler into an OpenAI-compatible embedding endpoint
    endpoint = EmbeddingEndpoint(VllmEmbeddingHandler(config))
    run(endpoint, config.interface, config.port)


if __name__ == "__main__":
    entrypoint()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ loguru>=0.7
2
+ torch>=2.7.0
3
+ vllm>=0.9.0.1