What parameters should I use with vLLM?

#30 opened by daiwk

Do I need to add that apply_chat_template step or not? I'm using the GGUF from https://hf-mirror.com/lmstudio-community/QwQ-32B-GGUF, and it seems I can't extract a tokenizer from it.

from vllm import LLM, SamplingParams

# A single conversation in chat format; llm.chat() expects a list of these.
prompt_final = [{"role": "user", "content": "xxx"}]

tensor_parallel_size = 1
pipeline_parallel_size = 1
ckpt_path = "./QwQ-32B-Q4_K_M.gguf"

sampling_params = SamplingParams(temperature=0.6, max_tokens=1000)

batch_prompts = [prompt_final]

llm = LLM(
    model=ckpt_path,
    tensor_parallel_size=tensor_parallel_size,
    pipeline_parallel_size=pipeline_parallel_size,
    distributed_executor_backend="mp",
)

preds = llm.chat(batch_prompts, sampling_params)
for output in preds:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}\n")
