"""Gradio chat UI for a local YugoGPT model served via llama-cpp-python."""

import gradio as gr
from llama_cpp import Llama

# Model is loaded once at import time; mlock/mmap keep the weights resident.
llm = Llama(
    model_path="yugogpt-q4_0.gguf",
    n_ctx=2048,
    n_threads=4,
    n_batch=512,
    use_mlock=True,
    use_mmap=True,
)


def format_chat_history(history):
    """Render Gradio's (user, assistant) pair history as a Q:/A: transcript.

    Args:
        history: iterable of (user_msg, assistant_msg) tuples.

    Returns:
        One string with a "Q: ...\\nA: ...\\n" line pair per exchange.
    """
    # str.join avoids quadratic += concatenation over long histories.
    return "".join(
        f"Q: {user_msg}\nA: {assistant_msg}\n" for user_msg, assistant_msg in history
    )


def process_chunk(text):
    """Clean model output: drop role markers and space out characters.

    Removes "ASSISTANT:" / "A:" role prefixes, then inserts a space after
    every character that is not already followed by whitespace.

    NOTE(review): the per-character spacing looks like a workaround for a
    tokenizer that emits characters without separators — confirm against the
    actual model output before removing it.

    Args:
        text: raw text fragment from the model.

    Returns:
        The cleaned, character-spaced text with outer whitespace stripped.
    """
    text = text.replace("ASSISTANT:", "").replace("A:", "")
    processed_text = ""
    for i, char in enumerate(text):
        processed_text += char
        # Only add a separator when the next char doesn't already start with one.
        if i < len(text) - 1 and not text[i + 1].isspace():
            processed_text += " "
    return processed_text.strip()


def chat(message, history):
    """Stream a model reply for `message`, given the prior `history`.

    Yields the progressively growing assistant message, as expected by
    gr.ChatInterface streaming.
    """
    system_prompt = """Ti si YugoGPT. Odgovaraj direktno i precizno na srpskom jeziku."""

    chat_history = format_chat_history(history)

    full_prompt = f"""SYSTEM: {system_prompt}

{chat_history}
Q: {message}
A:"""

    response = llm(
        full_prompt,
        max_tokens=2048,
        temperature=0.1,
        top_p=0.1,
        repeat_penalty=1.2,
        top_k=20,
        stop=["Q:", "\n\n"],
        stream=True,
    )

    # BUG FIX: previously each chunk was passed through process_chunk()
    # individually and the results concatenated; because process_chunk()
    # strips its trailing space, the last character of one chunk fused with
    # the first character of the next. Accumulating the raw stream and
    # processing the whole text keeps spacing consistent across chunks.
    raw_text = ""
    for chunk in response:
        if chunk and chunk["choices"][0]["text"]:
            raw_text += chunk["choices"][0]["text"]
            yield process_chunk(raw_text)


demo = gr.ChatInterface(
    fn=chat,
    title="YugoGPT",
    description="Stručni asistent",
    examples=[
        "Objasni kako radi HTTP protokol",
        "Šta je SQL?",
        "Kako radi TCP/IP?",
    ],
)

if __name__ == "__main__":
    demo.queue().launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )