import gradio as gr
from llama_cpp import Llama

llm = Llama(
    model_path="yugogpt-q4_0.gguf",
    n_ctx=2048,
    n_threads=4
)

def chat(message, history):
    # Enhanced prompt template for more detailed responses
    prompt = f"""USER: {message}
ASSISTANT: Let me provide you with a comprehensive and thoughtful response.
"""
    response = llm(
        prompt,
        max_tokens=1024,      # Increased token limit
        temperature=0.8,      # Slightly increased creativity
        stop=["USER:"],       # Only stop at new user input
        repeat_penalty=1.2,   # Reduce repetition
        top_p=0.95            # Maintain focus while allowing creativity
    )
    return response['choices'][0]['text']

demo = gr.ChatInterface(
    fn=chat,
    title="YugoGPT Chat",
    description="Ask me anything - I'll provide detailed and thoughtful responses."
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
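
# Note: chat() above ignores `history`, so the model only ever sees the latest
# message. Below is a minimal sketch of folding prior turns into the prompt,
# assuming Gradio's default history format (a list of
# [user_message, assistant_message] pairs); `chat_with_history` is a
# hypothetical name, and truncating old turns to stay within n_ctx is left
# out for brevity. To try it, pass fn=chat_with_history to gr.ChatInterface.
def chat_with_history(message, history):
    # Replay earlier turns in the same USER:/ASSISTANT: format the prompt uses
    prompt = ""
    for user_msg, assistant_msg in history:
        prompt += f"USER: {user_msg}\nASSISTANT: {assistant_msg}\n"
    # Append the new message and leave the assistant turn open for generation
    prompt += f"USER: {message}\nASSISTANT:"
    response = llm(
        prompt,
        max_tokens=1024,
        temperature=0.8,
        stop=["USER:"],
        repeat_penalty=1.2,
        top_p=0.95
    )
    return response['choices'][0]['text']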