import chatglm_cpp
import gradio as gr
from pathlib import Path

model_file_path = "chatglm3-ggml_q4_0.bin"
chatglm_llm = chatglm_cpp.Pipeline(Path(model_file_path))

examples = [
    "哈利波特和赫敏是什么关系?",  # What is the relationship between Harry Potter and Hermione?
    "如何学好历史?",  # How can I learn history well?
    "明朝内阁制度的特点是什么?",  # What were the features of the Ming-dynasty cabinet system?
    "如何进行经济建设?",  # How should economic development be carried out?
    "How to promote traditional Chinese culture?",
    "你听说过马克思吗?",  # Have you heard of Marx?
]


def process_stream(instruction, temperature, top_p, top_k, max_new_tokens, seed):
    # `seed` comes from the UI but is not forwarded to chatglm_cpp here, so it
    # currently has no effect on generation.
    if "[SEP]" not in instruction:
        # Single-turn input: stream tokens from generate().
        streamer = chatglm_llm.generate(
            prompt=instruction,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            max_length=max_new_tokens,
            stream=True,
        )
    else:
        # Multi-turn input: "[SEP]"-separated turns become a history list for
        # the history-style chat() API (newer chatglm_cpp releases take
        # ChatMessage objects instead).
        history = instruction.split("[SEP]")
        streamer = chatglm_llm.chat(
            history=history,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            max_length=max_new_tokens,
            do_sample=False,
            stream=True,
        )
    # Accumulate streamed chunks and yield the running text so the output
    # Markdown component updates incrementally.
    response = ""
    for new_text in streamer:
        response += new_text
        yield response
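
# NOTE (assumption about the pinned chatglm_cpp version): the binding also
# accepts a `num_threads` generation kwarg (0 = auto); passing num_threads=2
# in the generate()/chat() calls above would pin inference to the 2 CPU cores
# advertised in the page header.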

with gr.Blocks(
    theme=gr.themes.Soft(),
    css="""
    .header img { float: middle; width: 33px; height: 33px; }
    .header h1 { top: 18px; left: 10px; }
    .disclaimer { font-variant-caps: all-small-caps; }
    """,
) as demo:
    gr.HTML(
        """
        <div class="header">
            <h1>ChatGLM3 on CPU in C++</h1>
        </div>
        """
    )

    gr.Markdown(
        "This demo uses the [chatglm.cpp](https://github.com/li-plus/chatglm.cpp) "
        "library on 2 CPU cores."
    )
    with gr.Row():
        with gr.Column():
            with gr.Row():
                instruction = gr.Textbox(
                    placeholder="Enter your question or instruction here",
                    label="Question/Instruction",
                    elem_id="q-input",
                )
            with gr.Accordion("Advanced Options:", open=False):
                with gr.Row():
                    with gr.Column():
                        with gr.Row():
                            temperature = gr.Slider(
                                label="Temperature",
                                value=0.5,
                                minimum=0.1,
                                maximum=1.0,
                                step=0.1,
                                interactive=True,
                                info="Higher values produce more diverse outputs",
                            )
                    with gr.Column():
                        with gr.Row():
                            top_p = gr.Slider(
                                label="Top-p (nucleus sampling)",
                                value=0.95,
                                minimum=0.0,
                                maximum=1.0,
                                step=0.01,
                                interactive=True,
                                info=(
                                    "Sample from the smallest possible set of tokens whose "
                                    "cumulative probability exceeds top_p. "
                                    "Set to 1 to disable and sample from all tokens."
                                ),
                            )
                    with gr.Column():
                        with gr.Row():
                            top_k = gr.Slider(
                                label="Top-k",
                                value=40,
                                minimum=5,
                                maximum=80,
                                step=1,
                                interactive=True,
                                info=(
                                    "Sample from a shortlist of the top-k most likely tokens. "
                                    "Set to 0 to disable and sample from all tokens."
                                ),
                            )
                    with gr.Column():
                        with gr.Row():
                            max_new_tokens = gr.Slider(
                                label="Maximum new tokens",
                                value=256,
                                minimum=0,
                                maximum=1024,
                                step=5,
                                interactive=True,
                                info="The maximum number of new tokens to generate",
                            )
                    with gr.Column():
                        with gr.Row():
                            seed = gr.Number(
                                label="Seed",
                                value=42,
                                interactive=True,
                                info="The seed to use for generation",
                                precision=0,
                            )
    with gr.Row():
        submit = gr.Button("Submit")
    with gr.Row():
        with gr.Tab(label="ChatGLM3-6B"):
            gr.Markdown("**ChatGLM3-6B**")
            output_6b = gr.Markdown()
    with gr.Row():
        gr.Examples(
            examples=examples,
            inputs=[instruction],
            cache_examples=False,
            fn=process_stream,
            outputs=output_6b,
        )
    submit.click(
        process_stream,
        inputs=[instruction, temperature, top_p, top_k, max_new_tokens, seed],
        outputs=output_6b,
    )
    instruction.submit(
        process_stream,
        inputs=[instruction, temperature, top_p, top_k, max_new_tokens, seed],
        outputs=output_6b,
    )

# Streaming (generator) handlers need the request queue in Gradio 3.x; it is
# enabled by default in Gradio 4.x.
demo.queue()
demo.launch(server_name="0.0.0.0", debug=True)
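
# To produce the quantized weights loaded at the top, chatglm.cpp ships a
# converter script; a typical invocation per the project README (the Hugging
# Face model id "THUDM/chatglm3-6b" is an assumption for this demo):
#   python chatglm_cpp/convert.py -i THUDM/chatglm3-6b -t q4_0 -o chatglm3-ggml_q4_0.bin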