Update README.md
README.md CHANGED
```diff
@@ -157,6 +157,9 @@ from huggingface_hub import snapshot_download
 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
 from vllm.lora.request import LoRARequest
 
+SYS_MESSAGE = 'A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.'
+tokenizer = transformers.AutoTokenizer.from_pretrained("")
+
 def create_test_prompts(lora_path: str) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
     """Create a list of test prompts with their sampling parameters.
     1 requests for base model, 1 request for the LoRA.
@@ -173,6 +176,7 @@ def create_test_prompts(lora_path: str) -> list[tuple[str, SamplingParams, Optio
             top_k=20,
             logprobs=1,
             prompt_logprobs=1,
+            stop=["</s>", "<eos>"],
             max_tokens=4096),
         LoRARequest("reasoning-lora", 1, lora_path)),
 
@@ -187,7 +191,8 @@ def process_requests(engine: LLMEngine,
 
     while test_prompts or engine.has_unfinished_requests():
         if test_prompts:
-            prompt, sampling_params, lora_request = test_prompts.pop(0)
+            input, sampling_params, lora_request = test_prompts.pop(0)
+            prompt = tokenizer.apply_chat_template([{'role':'system', 'content': SYS_MESSAGE},{'role': 'user', 'content': input}], tokenize = False, add_generation_prompt = True)
             engine.add_request(str(request_id),
                                prompt,
                                sampling_params,
```
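The key change is that each user input is now wrapped in a chat template with a reasoning-style system message before it reaches the engine. Below is a minimal, self-contained sketch of that step; the model id is a placeholder (the commit itself leaves `from_pretrained("")` empty), `build_prompt` is a helper name introduced here for illustration, and `SYS_MESSAGE` is abbreviated to keep the block short. Note the snippet also needs `import transformers`, which is not visible in the hunks shown.

```python
# Sketch of the prompt-construction step this commit adds. The model id is a
# placeholder (the diff leaves from_pretrained("") empty), and SYS_MESSAGE is
# abbreviated here; the full text appears in the diff above.
import transformers

SYS_MESSAGE = (
    "A conversation between User and Assistant. ... <think> reasoning process "
    "here </think> <answer> answer here </answer>."
)

# Placeholder model id; any model that ships a chat template works the same way.
tokenizer = transformers.AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")

def build_prompt(user_input: str) -> str:
    # tokenize=False returns the rendered prompt string rather than token ids;
    # add_generation_prompt=True appends the assistant-turn header so the model
    # starts generating a reply instead of continuing the user turn.
    return tokenizer.apply_chat_template(
        [
            {"role": "system", "content": SYS_MESSAGE},
            {"role": "user", "content": user_input},
        ],
        tokenize=False,
        add_generation_prompt=True,
    )

print(build_prompt("What is 17 * 24?"))
```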
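Only fragments of the surrounding loop are visible in the hunks, but the structure follows vLLM's multi-LoRA example: pop an (input, sampling params, LoRA request) tuple, render the prompt, and hand it to the engine while `engine.step()` drains finished outputs. The `stop=["</s>", "<eos>"]` added to `SamplingParams` halts generation at either end-of-sequence marker. A sketch under those assumptions (the full body of `process_requests` and the printing logic are reconstructions, not part of the diff):

```python
from vllm import LLMEngine

def process_requests(engine: LLMEngine, test_prompts: list) -> None:
    """Feed templated prompts to the engine and print completions as they finish."""
    request_id = 0
    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            user_input, sampling_params, lora_request = test_prompts.pop(0)
            # Chat-template step introduced by the commit (build_prompt is the
            # helper sketched above).
            prompt = build_prompt(user_input)
            engine.add_request(str(request_id),
                               prompt,
                               sampling_params,
                               lora_request=lora_request)
            request_id += 1
        # step() runs one scheduler iteration; print requests that completed
        # (generation stops at the stop strings or at max_tokens=4096).
        for request_output in engine.step():
            if request_output.finished:
                print(request_output.outputs[0].text)
```

In vLLM's example this is driven by `engine = LLMEngine.from_engine_args(EngineArgs(model=..., enable_lora=True))` and `test_prompts = create_test_prompts(lora_path)`; tuples carrying `LoRARequest("reasoning-lora", 1, lora_path)` are routed through the adapter, while entries with `None` use the base model.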