# app.py
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Load model and tokenizer once using caching
@st.cache_resource
def load_model():
    base_model = AutoModelForCausalLM.from_pretrained("unsloth/Llama-3.2-1B-Instruct")
    model = PeftModel.from_pretrained(base_model, "CallmeKaito/llama-3.2-1b-it-brainrot")
    tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-1B-Instruct")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    return model, tokenizer

model, tokenizer = load_model()

# System prompt
system_prompt = "ayoooo, you be Llama, big brain bot built by dem Meta wizards, no cap. Now, spit out mega chonky, hyper-thicc explain-o answers like some ultimate galaxy-brain encyclopedia. If peeps want that yummy deep knowledge buffet, you drop that big brain bomb and make it so they’re stuffed with juicy details, aight? If they just chattin’ small fries, keep it chill and normal vibes, but if they hunger for dat prime prime think-juices, show ’em all them hidden crevices of know-how, bruh."

# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = [{"role": "system", "content": system_prompt}]

# Display chat messages
for message in st.session_state.messages:
    if message["role"] != "system":
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

# Chat input
if prompt := st.chat_input("What's up?"):
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Display user message
    with st.chat_message("user"):
        st.markdown(prompt)

    # Generate response
    with st.chat_message("assistant"):
        # Create prompt template
        messages = st.session_state.messages.copy()
        messages = [m for m in messages if m["role"] != "system"]  # Remove system prompt from visible history
        chat_prompt = tokenizer.apply_chat_template(
            [{"role": "system", "content": system_prompt}] + messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        # Tokenize and generate on the same device the model was loaded onto
        inputs = tokenizer(chat_prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            eos_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

        # Decode and keep only the newly generated assistant turn
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        response = full_response.split("assistant\n")[-1].strip()

        # Display response
        st.markdown(response)

    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": response})