Update README.md
README.md CHANGED
```diff
@@ -157,6 +157,9 @@ from huggingface_hub import snapshot_download
 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
 from vllm.lora.request import LoRARequest
 
+SYS_MESSAGE = 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.'
+tokenizer = transformers.AutoTokenizer.from_pretrained("")
+
 def create_test_prompts(lora_path: str) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
     """Create a list of test prompts with their sampling parameters.
     1 requests for base model, 1 request for the LoRA.
@@ -173,6 +176,7 @@ def create_test_prompts(lora_path: str) -> list[tuple[str, SamplingParams, Optio
             top_k=20,
             logprobs=1,
             prompt_logprobs=1,
+            stop=["</s>", "<eos>"],
             max_tokens=4096),
         LoRARequest("reasoning-lora", 1, lora_path)),
 
@@ -187,7 +191,8 @@ def process_requests(engine: LLMEngine,
 
     while test_prompts or engine.has_unfinished_requests():
         if test_prompts:
-            prompt, sampling_params, lora_request = test_prompts.pop(0)
+            input, sampling_params, lora_request = test_prompts.pop(0)
+            prompt = tokenizer.apply_chat_template([{'role':'system', 'content': SYS_MESSAGE},{'role': 'user', 'content': input}], tokenize = False, add_generation_prompt = True)
             engine.add_request(str(request_id),
                                prompt,
                                sampling_params,
```
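The key change is that each user input is now wrapped in a chat template with a reasoning-style system message before it reaches the engine. Below is a minimal, self-contained sketch of that step; the model id is a placeholder (the commit itself leaves `from_pretrained("")` empty), `build_prompt` is a helper name introduced here for illustration, and `SYS_MESSAGE` is abbreviated to keep the block short. Note the snippet also needs `import transformers`, which is not visible in the hunks shown.

```python
# Sketch of the prompt-construction step this commit adds. The model id is a
# placeholder (the diff leaves from_pretrained("") empty), and SYS_MESSAGE is
# abbreviated here; the full text appears in the diff above.
import transformers

SYS_MESSAGE = (
    "A conversation between User and Assistant. ... <think> reasoning process "
    "here </think> <answer> answer here </answer>."
)

# Placeholder model id; any model that ships a chat template works the same way.
tokenizer = transformers.AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")

def build_prompt(user_input: str) -> str:
    # tokenize=False returns the rendered prompt string rather than token ids;
    # add_generation_prompt=True appends the assistant-turn header so the model
    # starts generating a reply instead of continuing the user turn.
    return tokenizer.apply_chat_template(
        [
            {"role": "system", "content": SYS_MESSAGE},
            {"role": "user", "content": user_input},
        ],
        tokenize=False,
        add_generation_prompt=True,
    )

print(build_prompt("What is 17 * 24?"))
```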
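Only fragments of the surrounding loop are visible in the hunks, but the structure follows vLLM's multi-LoRA example: pop an (input, sampling params, LoRA request) tuple, render the prompt, and hand it to the engine while `engine.step()` drains finished outputs. The `stop=["</s>", "<eos>"]` added to `SamplingParams` halts generation at either end-of-sequence marker. A sketch under those assumptions (the full body of `process_requests` and the printing logic are reconstructions, not part of the diff):

```python
from vllm import LLMEngine

def process_requests(engine: LLMEngine, test_prompts: list) -> None:
    """Feed templated prompts to the engine and print completions as they finish."""
    request_id = 0
    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            user_input, sampling_params, lora_request = test_prompts.pop(0)
            # Chat-template step introduced by the commit (build_prompt is the
            # helper sketched above).
            prompt = build_prompt(user_input)
            engine.add_request(str(request_id),
                               prompt,
                               sampling_params,
                               lora_request=lora_request)
            request_id += 1
        # step() runs one scheduler iteration; print requests that completed
        # (generation stops at the stop strings or at max_tokens=4096).
        for request_output in engine.step():
            if request_output.finished:
                print(request_output.outputs[0].text)
```

In vLLM's example this is driven by `engine = LLMEngine.from_engine_args(EngineArgs(model=..., enable_lora=True))` and `test_prompts = create_test_prompts(lora_path)`; tuples carrying `LoRARequest("reasoning-lora", 1, lora_path)` are routed through the adapter, while entries with `None` use the base model.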