"""Minimal Gradio chat demo backed by a local GGUF model via CTransformers."""
import os
import time

import gradio as gr
from langchain.llms import CTransformers

# Alternative checkpoints kept for reference:
# checkpoint = "bigscience/bloomz"               # english
# checkpoint = "cmarkea/bloomz-3b-sft-chat"
# checkpoint = "bigscience/bloomz-7b1-mt"        # non english
# checkpoint = os.getenv('HF_BLOOM_MODEL')

# Generation settings handed to the CTransformers backend.
# BUGFIX: the original used '=' instead of ':' for three of these keys,
# which is a SyntaxError inside a dict literal — the script could not run.
llm_config = {
    'max_new_tokens': 256,
    'temperature': 0.8,
    'top_p': 0.5,
    'num_beams': 2,
    'repetition_penalty': 1.1,
}

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no
# GPU acceleration is available on your system.
# NOTE(review): model weights are downloaded/loaded at import time, so
# importing this module is expensive and requires network/disk access.
llm = CTransformers(
    model="TheBloke/WizardLM-1.0-Uncensored-Llama2-13B-GGUF",
    model_file="wizardlm-1.0-uncensored-llama2-13b.Q4_0.gguf",
    config=llm_config,
)


def response(prompt):
    """Return the model's completion for *prompt* as plain text."""
    txt = llm(prompt)
    return txt


if __name__ == '__main__':
    title = "Chat"
    demo_status = "Demo is running on CPU"
    gr.Interface(
        response,
        inputs="text",
        outputs="text",
        title=title,
    ).launch()