"""Gradio chat UI for a local YugoGPT model served via llama-cpp-python."""

import gradio as gr
from llama_cpp import Llama

# Initialize with optimized settings.
llm = Llama(
    model_path="yugogpt-q4_0.gguf",
    n_ctx=2048,
    n_batch=512,     # larger batch speeds up prompt ingestion
    n_threads=4,     # adjust to the number of physical CPU cores
    n_gpu_layers=0,  # raise if a GPU build of llama-cpp is available
)


def _history_to_lines(history):
    """Flatten the Gradio chat history into "USER:"/"ASSISTANT:" prompt lines.

    Handles both history formats Gradio may pass, depending on version:
    legacy ``[user, assistant]`` pairs and ``{"role": ..., "content": ...}``
    message dicts.  NOTE(review): verify against the installed Gradio version.
    """
    lines = []
    for turn in history or []:
        if isinstance(turn, dict):  # "messages" format
            role = "USER" if turn.get("role") == "user" else "ASSISTANT"
            lines.append(f"{role}: {turn.get('content', '')}")
        else:  # legacy [user_message, assistant_message] pairs
            user_msg, bot_msg = turn
            if user_msg:
                lines.append(f"USER: {user_msg}")
            if bot_msg:
                lines.append(f"ASSISTANT: {bot_msg}")
    return lines


def chat(message, history):
    """Generate one assistant reply for the Gradio ChatInterface.

    Fixes vs. the original:
    - ``history`` was ignored, so the model never saw earlier turns;
      the prompt now replays the full conversation.
    - ``"\\n"`` was a stop token, which truncated every answer at its
      first newline; only the ``"USER:"`` turn boundary stops generation.
    - The completion text is stripped (llama-cpp completions usually
      begin with a leading space).

    Parameters:
        message: the latest user message (str).
        history: prior turns as supplied by Gradio (pairs or message dicts).
    Returns:
        The assistant's reply text (str).
    """
    prompt_lines = _history_to_lines(history)
    prompt_lines.append(f"USER: {message}")
    prompt_lines.append("ASSISTANT:")
    full_prompt = "\n".join(prompt_lines)

    response = llm.create_completion(
        full_prompt,
        max_tokens=512,
        temperature=0.7,
        stop=["USER:"],  # stop when the model starts a new user turn
        stream=False,
    )
    return response['choices'][0]['text'].strip()


demo = gr.ChatInterface(
    chat,
    title="YugoGPT Chat",
    examples=["Hello, how are you?", "What's the weather like?"],
    cache_examples=True,  # NOTE: examples are generated once at startup
)

if __name__ == "__main__":
    # Guarded so importing this module doesn't start the server.
    # Binds on all interfaces; share=False keeps it off public tunnels.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )