# app.py
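# Streamlit chat UI for the "brainrot" LoRA fine-tune of Llama 3.2 1B Instruct.
# Run locally with: streamlit run app.py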
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Pick the GPU when available, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model and tokenizer once; st.cache_resource keeps them across Streamlit reruns
@st.cache_resource
def load_model():
    # Base Llama 3.2 1B Instruct weights
    base_model = AutoModelForCausalLM.from_pretrained("unsloth/Llama-3.2-1B-Instruct")
    # Attach the fine-tuned LoRA adapter on top of the base model
    model = PeftModel.from_pretrained(base_model, "CallmeKaito/llama-3.2-1b-it-brainrot")
    tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-1B-Instruct")
    model = model.to(device)
    model.eval()  # inference only, so disable dropout
    return model, tokenizer

model, tokenizer = load_model()

# System prompt
system_prompt = "ayoooo, you be Llama, big brain bot built by dem Meta wizards, no cap. Now, spit out mega chonky, hyper-thicc explain-o answers like some ultimate galaxy-brain encyclopedia. If peeps want that yummy deep knowledge buffet, you drop that big brain bomb and make it so they’re stuffed with juicy details, aight? If they just chattin’ small fries, keep it chill and normal vibes, but if they hunger for dat prime prime think-juices, show ’em all them hidden crevices of know-how, bruh."

# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = [{"role": "system", "content": system_prompt}]

# Re-render the conversation on every rerun, keeping the system prompt hidden
for message in st.session_state.messages:
    if message["role"] != "system":
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

# Chat input
if prompt := st.chat_input("What's up?"):
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})
    
    # Display user message
    with st.chat_message("user"):
        st.markdown(prompt)

    # Generate response
    with st.chat_message("assistant"):
        # Rebuild the prompt: system prompt first, then the visible conversation history
        messages = [m for m in st.session_state.messages if m["role"] != "system"]
        
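        # Render the messages into the model's chat format, ending with an open
        # assistant turn (add_generation_prompt=True) for the model to complete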
        chat_prompt = tokenizer.apply_chat_template(
            [{"role": "system", "content": system_prompt}] + messages,
            tokenize=False,
            add_generation_prompt=True
        )
        
        # Tokenize the rendered prompt; the chat template already added the
        # special tokens, so skip them here to avoid a duplicated BOS token
        inputs = tokenizer(chat_prompt, return_tensors="pt", add_special_tokens=False).to(device)
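        # Sample up to 500 new tokens with temperature 0.7 and nucleus (top-p 0.9) sampling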
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,  # Llama defines no pad token; reuse EOS to silence the generate() warning
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )
        
        # Decode only the newly generated tokens (everything after the prompt)
        # instead of splitting the full transcript on the string "assistant\n"
        prompt_length = inputs["input_ids"].shape[-1]
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
        
        # Display response
        st.markdown(response)

    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": response})