# chat-hf / app.py
import os
import gradio as gr
from langchain.llms import CTransformers
#checkpoint = "bigscience/bloomz" # english
#checkpoint = "cmarkea/bloomz-3b-sft-chat"
#checkpoint = "bigscience/bloomz-7b1-mt" # non english
#checkpoint = os.getenv('HF_BLOOM_MODEL')
llm_config = {
    'max_new_tokens': 256,      # cap on tokens generated per call
    'temperature': 0.8,         # sampling temperature
    'top_p': 0.5,               # nucleus-sampling cutoff
    'repetition_penalty': 1.1,  # penalize repeated tokens
    # 'num_beams' was dropped: ctransformers samples token by token and has
    # no beam-search option, so the key is not part of its config.
}
# Set gpu_layers in the config to the number of layers to offload to GPU;
# leave it unset (or 0) if no GPU acceleration is available on your system.
llm = CTransformers(
    model="TheBloke/WizardLM-1.0-Uncensored-Llama2-13B-GGUF",
    model_file="wizardlm-1.0-uncensored-llama2-13b.Q4_0.gguf",
    config=llm_config,
)
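
# Optional GPU offload, a sketch: 'gpu_layers' is a standard ctransformers
# config key, but the layer count of 40 below is an assumed value, not one
# tuned for this model.
#
#   llm_config['gpu_layers'] = 40
#   llm = CTransformers(
#       model="TheBloke/WizardLM-1.0-Uncensored-Llama2-13B-GGUF",
#       model_file="wizardlm-1.0-uncensored-llama2-13b.Q4_0.gguf",
#       config=llm_config,
#   )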
def response(prompt):
    # Single blocking generation pass over the GGUF model.
    txt = llm(prompt)
    return txt
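
# Quick sanity check outside the Gradio UI (a sketch; any prompt works):
#
#   print(response("What is the capital of France?"))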
if __name__ == '__main__':
    title = "Chat"
    demo_status = "Demo is running on CPU"
    gr.Interface(
        response,
        inputs="text",
        outputs="text",
        title=title,
        description=demo_status,  # previously unused; now shown under the title
    ).launch()
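
# Streaming variant, a sketch assuming langchain's callback API: tokens are
# printed to stdout as they are generated instead of returned in one block.
#
#   from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
#   llm_stream = CTransformers(
#       model="TheBloke/WizardLM-1.0-Uncensored-Llama2-13B-GGUF",
#       model_file="wizardlm-1.0-uncensored-llama2-13b.Q4_0.gguf",
#       config=llm_config,
#       callbacks=[StreamingStdOutCallbackHandler()],
#   )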