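"""Streamlit chat interface for Qwen/Qwen2.5-Coder-3B-Instruct.

The app loads the model once via st.cache_resource, keeps the conversation in
st.session_state, and exposes generation settings (maximum length, temperature,
top-p) in the sidebar.
"""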
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import datetime
# Set page configuration
st.set_page_config(
    page_title="Qwen2.5-Coder Chat",
    page_icon="💬",
    layout="wide"
)
# Initialize session state
if 'messages' not in st.session_state:
    st.session_state.messages = []
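
# st.session_state persists across Streamlit reruns, so the chat history above
# survives every interaction; st.cache_resource below ensures the model and
# tokenizer are loaded only once per process rather than on every rerun.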
@st.cache_resource
def load_model_and_tokenizer():
    try:
        # Display loading message
        with st.spinner("🚀 Loading model and tokenizer... This might take a few minutes..."):
            model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"

            # Load tokenizer first
            tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                trust_remote_code=True
            )

            # Determine device and display info
            device = "cuda" if torch.cuda.is_available() else "cpu"
            st.info(f"💻 Using device: {device}")

            # Load model with appropriate settings
            if device == "cuda":
                model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    torch_dtype=torch.float16,  # Use float16 for GPU
                    device_map="auto",
                    trust_remote_code=True
                ).eval()  # Set to evaluation mode
            else:
                model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    device_map={"": device},
                    trust_remote_code=True,
                    low_cpu_mem_usage=True
                ).eval()  # Set to evaluation mode

            return tokenizer, model
    except Exception as e:
        st.error(f"❌ Error loading model: {str(e)}")
        raise e
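
# Rough sizing note (approximate, not measured here): a 3B-parameter model is
# about 6 GB of weights in float16 on GPU, and roughly double that when it
# falls back to float32 on CPU, so memory-constrained hosts may load slowly.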
def generate_response(prompt, model, tokenizer, max_new_tokens=512, temperature=0.7, top_p=0.9):
"""Generate response from the model with better error handling"""
try:
# Tokenize input
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# Generate response with progress bar
with torch.no_grad(), st.spinner("π€ Thinking..."):
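            # Sampling settings: temperature/top_p control randomness, while
            # repetition_penalty and no_repeat_ngram_size discourage the model
            # from looping over the same phrases in longer completions.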
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
                no_repeat_ngram_size=3
            )

        # Decode only the newly generated tokens (everything after the prompt),
        # which is more robust than slicing the decoded string by prompt length
        response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
        return response.strip()
    except torch.cuda.OutOfMemoryError:
        st.error("💾 GPU memory exceeded. Try reducing the maximum length or clearing the conversation.")
        return None
    except Exception as e:
        st.error(f"❌ Error generating response: {str(e)}")
        return None
# Main UI
st.title("💬 Qwen2.5-Coder Chat")
# Sidebar settings
with st.sidebar:
    st.header("⚙️ Settings")

    # Model settings
    max_length = st.slider(
        "Maximum Length 📏",
        min_value=64,
        max_value=2048,
        value=512,
        step=64
    )

    temperature = st.slider(
        "Temperature 🌡️",
        min_value=0.1,
        max_value=2.0,
        value=0.7,
        step=0.1
    )

    top_p = st.slider(
        "Top P 🎯",
        min_value=0.1,
        max_value=1.0,
        value=0.9,
        step=0.1
    )

    # Clear conversation button
    if st.button("🗑️ Clear Conversation"):
        st.session_state.messages = []
        st.rerun()
# Load model
try:
    tokenizer, model = load_model_and_tokenizer()
except Exception:
    st.error("❌ Failed to load model. Please check the logs and refresh the page.")
    st.stop()
# Display conversation history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(f"{message['content']}\n\n_{message['timestamp']}_")
# Chat input
if prompt := st.chat_input("💭 Ask me anything about coding..."):
    # Add user message
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    st.session_state.messages.append({
        "role": "user",
        "content": prompt,
        "timestamp": timestamp
    })

    # Display user message
    with st.chat_message("user"):
        st.markdown(f"{prompt}\n\n_{timestamp}_")

    # Generate and display response
    with st.chat_message("assistant"):
        # Prepare conversation context (limit to last 3 messages to prevent context overflow)
        conversation = "\n".join(
            f"{'Human' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}"
            for msg in st.session_state.messages[-3:]
        ) + "\nAssistant:"
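        # Note: the plain "Human:/Assistant:" transcript above is the prompt format
        # this app uses; an alternative sketch (not used here) would be the
        # tokenizer's built-in chat template, e.g.
        #   conversation = tokenizer.apply_chat_template(
        #       [{"role": m["role"], "content": m["content"]}
        #        for m in st.session_state.messages[-3:]],
        #       tokenize=False, add_generation_prompt=True)
        # which matches how the Instruct model was trained.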
        response = generate_response(
            conversation,
            model,
            tokenizer,
            max_new_tokens=max_length,
            temperature=temperature,
            top_p=top_p
        )

        if response:
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            st.markdown(f"{response}\n\n_{timestamp}_")

            # Add response to chat history
            st.session_state.messages.append({
                "role": "assistant",
                "content": response,
                "timestamp": timestamp
            })
        else:
            st.error("❌ Failed to generate response. Please try again with different settings.")