"""Minimal Gradio chat demo backed by a local GGUF model via CTransformers."""
import os
import time

import gradio as gr
from langchain.llms import CTransformers

# Alternative checkpoints kept for reference:
# checkpoint = "bigscience/bloomz"               # english
# checkpoint = "cmarkea/bloomz-3b-sft-chat"
# checkpoint = "bigscience/bloomz-7b1-mt"        # non english
# checkpoint = os.getenv('HF_BLOOM_MODEL')

# Generation settings handed to the CTransformers backend.
# BUGFIX: the original used '=' instead of ':' for three of these keys,
# which is a SyntaxError inside a dict literal — the script could not run.
llm_config = {
    'max_new_tokens': 256,
    'temperature': 0.8,
    'top_p': 0.5,
    'num_beams': 2,
    'repetition_penalty': 1.1,
}

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no
# GPU acceleration is available on your system.
# NOTE(review): model weights are downloaded/loaded at import time, so
# importing this module is expensive and requires network/disk access.
llm = CTransformers(
    model="TheBloke/WizardLM-1.0-Uncensored-Llama2-13B-GGUF",
    model_file="wizardlm-1.0-uncensored-llama2-13b.Q4_0.gguf",
    config=llm_config,
)


def response(prompt):
    """Return the model's completion for *prompt* as plain text."""
    txt = llm(prompt)
    return txt


if __name__ == '__main__':
    title = "Chat"
    demo_status = "Demo is running on CPU"
    gr.Interface(
        response,
        inputs="text",
        outputs="text",
        title=title,
    ).launch()