import chatglm_cpp
import gradio as gr
from pathlib import Path

model_file_path = "chatglm3-ggml_q4_0.bin"
chatglm_llm = chatglm_cpp.Pipeline(Path(model_file_path))

examples = [
    "哈利波特和赫敏是什么关系?",  # What is the relationship between Harry Potter and Hermione?
    "如何学好历史?",  # How can I learn history well?
    "明朝内阁制度的特点是什么?",  # What were the features of the Ming-dynasty cabinet system?
    "如何进行经济建设?",  # How should economic development be carried out?
    "How to promote traditional Chinese culture?",
    "你听说过马克思吗?",  # Have you heard of Marx?
]


def process_stream(instruction, temperature, top_p, top_k, max_new_tokens, seed):
    # `seed` comes from the UI but is not forwarded to chatglm_cpp here, so it
    # currently has no effect on generation.
    if "[SEP]" not in instruction:
        # Single-turn input: stream tokens from generate().
        streamer = chatglm_llm.generate(
            prompt=instruction,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            max_length=max_new_tokens,
            stream=True,
        )
    else:
        # Multi-turn input: "[SEP]"-separated turns become a history list for
        # the history-style chat() API (newer chatglm_cpp releases take
        # ChatMessage objects instead).
        history = instruction.split("[SEP]")
        streamer = chatglm_llm.chat(
            history=history,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            max_length=max_new_tokens,
            do_sample=False,
            stream=True,
        )
    # Accumulate streamed chunks and yield the running text so the output
    # Markdown component updates incrementally.
    response = ""
    for new_text in streamer:
        response += new_text
        yield response
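
# NOTE (assumption about the pinned chatglm_cpp version): the binding also
# accepts a `num_threads` generation kwarg (0 = auto); passing num_threads=2
# in the generate()/chat() calls above would pin inference to the 2 CPU cores
# advertised in the page header.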

with gr.Blocks(
    theme=gr.themes.Soft(),
    css="""
    .header img { float: middle; width: 33px; height: 33px; }
    .header h1 { top: 18px; left: 10px; }
    .disclaimer { font-variant-caps: all-small-caps; }
    """,
) as demo:
    gr.HTML(
        """
        <div class="header">
            <h1>ChatGLM3 on CPU in C++</h1>
        </div>
        """
    )

    gr.Markdown(
        "This demo uses the [chatglm.cpp](https://github.com/li-plus/chatglm.cpp) "
        "library on 2 CPU cores."
    )
    with gr.Row():
        with gr.Column():
            with gr.Row():
                instruction = gr.Textbox(
                    placeholder="Enter your question or instruction here",
                    label="Question/Instruction",
                    elem_id="q-input",
                )
            with gr.Accordion("Advanced Options:", open=False):
                with gr.Row():
                    with gr.Column():
                        with gr.Row():
                            temperature = gr.Slider(
                                label="Temperature",
                                value=0.5,
                                minimum=0.1,
                                maximum=1.0,
                                step=0.1,
                                interactive=True,
                                info="Higher values produce more diverse outputs",
                            )
                    with gr.Column():
                        with gr.Row():
                            top_p = gr.Slider(
                                label="Top-p (nucleus sampling)",
                                value=0.95,
                                minimum=0.0,
                                maximum=1.0,
                                step=0.01,
                                interactive=True,
                                info=(
                                    "Sample from the smallest possible set of tokens whose "
                                    "cumulative probability exceeds top_p. "
                                    "Set to 1 to disable and sample from all tokens."
                                ),
                            )
                    with gr.Column():
                        with gr.Row():
                            top_k = gr.Slider(
                                label="Top-k",
                                value=40,
                                minimum=5,
                                maximum=80,
                                step=1,
                                interactive=True,
                                info=(
                                    "Sample from a shortlist of the top-k most likely tokens. "
                                    "Set to 0 to disable and sample from all tokens."
                                ),
                            )
                    with gr.Column():
                        with gr.Row():
                            max_new_tokens = gr.Slider(
                                label="Maximum new tokens",
                                value=256,
                                minimum=0,
                                maximum=1024,
                                step=5,
                                interactive=True,
                                info="The maximum number of new tokens to generate",
                            )
                    with gr.Column():
                        with gr.Row():
                            seed = gr.Number(
                                label="Seed",
                                value=42,
                                interactive=True,
                                info="The seed to use for generation",
                                precision=0,
                            )
    with gr.Row():
        submit = gr.Button("Submit")
    with gr.Row():
        with gr.Tab(label="ChatGLM3-6B"):
            gr.Markdown("**ChatGLM3-6B**")
            output_6b = gr.Markdown()
    with gr.Row():
        gr.Examples(
            examples=examples,
            inputs=[instruction],
            cache_examples=False,
            fn=process_stream,
            outputs=output_6b,
        )
    submit.click(
        process_stream,
        inputs=[instruction, temperature, top_p, top_k, max_new_tokens, seed],
        outputs=output_6b,
    )
    instruction.submit(
        process_stream,
        inputs=[instruction, temperature, top_p, top_k, max_new_tokens, seed],
        outputs=output_6b,
    )

# Streaming (generator) handlers need the request queue in Gradio 3.x; it is
# enabled by default in Gradio 4.x.
demo.queue()
demo.launch(server_name="0.0.0.0", debug=True)
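
# To produce the quantized weights loaded at the top, chatglm.cpp ships a
# converter script; a typical invocation per the project README (the Hugging
# Face model id "THUDM/chatglm3-6b" is an assumption for this demo):
#   python chatglm_cpp/convert.py -i THUDM/chatglm3-6b -t q4_0 -o chatglm3-ggml_q4_0.bin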