import gradio as gr
from llama_cpp import Llama
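
# Load the quantized YugoGPT model with llama-cpp-python. Assumes the GGUF
# file sits in the working directory; adjust model_path otherwise.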
llm = Llama(
    model_path="yugogpt-q4_0.gguf",
    n_ctx=2048,         # context window in tokens
    n_batch=512,        # batch size for prompt processing
    n_threads=4,        # CPU threads used for inference
    n_gpu_layers=0      # 0 = run entirely on the CPU
)


def chat(message, history):
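    # Called by gr.ChatInterface with the newest user message plus the
    # conversation so far.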
    # Fold earlier turns into the prompt so the model keeps context; the
    # original prompt dropped history entirely. Assumes tuple-style history
    # of (user, assistant) pairs, the classic ChatInterface format.
    full_prompt = ""
    for user_msg, assistant_msg in history:
        full_prompt += f"USER: {user_msg}\nASSISTANT: {assistant_msg}\n"
    full_prompt += f"USER: {message}\nASSISTANT:"

    response = llm.create_completion(
        full_prompt,
        max_tokens=512,
        temperature=0.7,
        stop=["USER:"],  # stop only at the next user turn; stopping on "\n"
                         # as well cut every reply off at its first line break
        stream=False
    )
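
    # create_completion returns an OpenAI-style dict:
    # {"choices": [{"text": ...}], ...}; strip the leading space the model
    # usually emits.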
    return response['choices'][0]['text'].strip()


# Hook the chat function into Gradio's ready-made chat UI.
demo = gr.ChatInterface(
    chat,
    title="YugoGPT Chat",
    examples=["Hello, how are you?", "What's the weather like?"],
    cache_examples=True   # runs each example through the model at launch
)
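

# Bind to 0.0.0.0 so the app is reachable from other machines on the
# network; flip share=True for a temporary public Gradio link.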
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=False
)