|
import os |
|
import gradio as gr |
|
import time |
|
from langchain.llms import CTransformers |
|
|
|
|
|
|
|
|
|
|
|
# Generation settings passed to CTransformers.
# NOTE: the original used '=' instead of ':' for three keys, which is a
# SyntaxError in a dict literal — fixed here.
llm_config = {
    'max_new_tokens': 256,       # cap on tokens generated per response
    'temperature': 0.8,          # sampling temperature (higher = more random)
    'top_p': 0.5,                # nucleus-sampling probability cutoff
    'num_beams': 2,              # beam-search width
    'repetition_penalty': 1.1,   # >1.0 discourages repeated tokens
}
|
|
|
|
|
|
|
# Load the quantized GGUF WizardLM model via CTransformers with the settings above.
# NOTE(review): CTransformers runs on CPU by default — matches the demo banner below.
llm = CTransformers(model="TheBloke/WizardLM-1.0-Uncensored-Llama2-13B-GGUF", model_file="wizardlm-1.0-uncensored-llama2-13b.Q4_0.gguf", config=llm_config)
|
|
|
def response(prompt):
    """Return the LLM's completion for the given prompt string."""
    return llm(prompt)
|
|
|
if __name__ == '__main__':

    title = "Chat"

    demo_status = "Demo is running on CPU"

    # Minimal text-in/text-out UI around response(); launch() blocks until
    # the server is stopped. demo_status was previously assigned but never
    # used — surface it as the interface description, its evident purpose.
    gr.Interface(
        response,
        inputs="text",
        outputs="text",
        title=title,
        description=demo_status,
    ).launch()