"""Gradio demo that streams completions from a GGML build of MPT-7B-Instruct."""

from typing import Iterator

import gradio as gr
from llm_rs import AutoModel, SessionConfig, GenerationConfig, Precision

repo_name = "rustformers/mpt-7b-ggml"
file_name = "mpt-7b-instruct-q5_1-ggjt.bin"

# Sample instructions shown below the output panel. Clicking one only fills
# the instruction textbox (cache_examples=False, so nothing is pre-computed).
# NOTE(review): the original script referenced `examples` without defining it,
# which raised NameError at start-up; these entries are placeholders.
examples = [
    ["Explain the difference between a list and a tuple in Python."],
    ["Write a short poem about the ocean."],
    ["Summarize the plot of Romeo and Juliet in two sentences."],
]

session_config = SessionConfig(threads=2, batch_size=2)
model = AutoModel.from_pretrained(
    repo_name,
    model_file=file_name,
    session_config=session_config,
    verbose=True,
)


def process_stream(
    instruction: str,
    temperature: float,
    top_p: float,
    top_k: int,
    max_new_tokens: int,
    seed: int,
) -> Iterator[str]:
    """Stream the model's answer to *instruction*.

    The instruction is wrapped in the instruct prompt template and passed to
    ``model.stream`` together with the sampling settings.

    Args:
        instruction: The user's question or instruction.
        temperature: Sampling temperature forwarded to ``GenerationConfig``.
        top_p: Nucleus-sampling threshold forwarded to ``GenerationConfig``.
        top_k: Top-k shortlist size forwarded to ``GenerationConfig``.
        max_new_tokens: Upper bound on generated tokens.
        seed: Seed forwarded to ``GenerationConfig``.

    Yields:
        The response accumulated so far, once per streamed chunk, so the UI
        can re-render the growing text.
    """
    # Section headers sit on their own lines, as in the instruct prompt
    # format. NOTE(review): line breaks were reconstructed -- confirm against
    # the model card if output quality looks off.
    prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n"
        "### Instruction:\n"
        f"{instruction}\n"
        "### Response:\n"
        "Answer:"
    )
    generation_config = GenerationConfig(
        seed=seed,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        max_new_tokens=max_new_tokens,
    )
    response = ""
    streamer = model.stream(prompt=prompt, generation_config=generation_config)
    for new_text in streamer:
        response += new_text
        # Yield the full text each time: the Markdown output is replaced,
        # not appended to, on every update.
        yield response


with gr.Blocks(
    theme=gr.themes.Soft(),
    css=".disclaimer {font-variant-caps: all-small-caps;}",
) as demo:
    with gr.Row():
        with gr.Column():
            with gr.Row():
                instruction = gr.Textbox(
                    placeholder="Enter your question or instruction here",
                    label="Question/Instruction",
                    elem_id="q-input",
                )
            with gr.Accordion("Advanced Options:", open=False):
                with gr.Row():
                    with gr.Column():
                        with gr.Row():
                            temperature = gr.Slider(
                                label="Temperature",
                                value=0.8,
                                minimum=0.1,
                                maximum=1.0,
                                step=0.1,
                                interactive=True,
                                info="Higher values produce more diverse outputs",
                            )
                    with gr.Column():
                        with gr.Row():
                            top_p = gr.Slider(
                                label="Top-p (nucleus sampling)",
                                value=0.95,
                                minimum=0.0,
                                maximum=1.0,
                                step=0.01,
                                interactive=True,
                                info=(
                                    "Sample from the smallest possible set of tokens whose cumulative probability "
                                    "exceeds top_p. Set to 1 to disable and sample from all tokens."
                                ),
                            )
                    with gr.Column():
                        with gr.Row():
                            # NOTE(review): the info text mentions 0 to
                            # disable, but the slider minimum is 5 -- confirm
                            # which one is intended.
                            top_k = gr.Slider(
                                label="Top-k",
                                value=40,
                                minimum=5,
                                maximum=80,
                                step=1,
                                interactive=True,
                                info="Sample from a shortlist of top-k tokens — 0 to disable and sample from all tokens.",
                            )
                    with gr.Column():
                        with gr.Row():
                            max_new_tokens = gr.Slider(
                                label="Maximum new tokens",
                                value=256,
                                minimum=0,
                                maximum=1024,
                                step=5,
                                interactive=True,
                                info="The maximum number of new tokens to generate",
                            )
                    with gr.Column():
                        with gr.Row():
                            seed = gr.Number(
                                label="Seed",
                                value=42,
                                interactive=True,
                                info="The seed to use for the generation",
                                precision=0,
                            )
            with gr.Row():
                submit = gr.Button("Submit")
            with gr.Row():
                with gr.Box():
                    gr.Markdown("**MPT-7B-Instruct**")
                    output_7b = gr.Markdown()
            with gr.Row():
                gr.Examples(
                    examples=examples,
                    inputs=[instruction],
                    cache_examples=False,
                    fn=process_stream,
                    outputs=output_7b,
                )

    # Both triggers (button click and Enter in the textbox) run the same
    # generation with the same inputs.
    generation_inputs = [instruction, temperature, top_p, top_k, max_new_tokens, seed]
    submit.click(
        process_stream,
        inputs=generation_inputs,
        outputs=output_7b,
    )
    instruction.submit(
        process_stream,
        inputs=generation_inputs,
        outputs=output_7b,
    )

# One generation at a time; at most four requests may wait in the queue.
demo.queue(max_size=4, concurrency_count=1).launch(debug=True)