Eiad Gomaa committed
Commit 5ab0078 · 1 Parent(s): 04b4d4a
Files changed (2)
  1. app.py +124 -39
  2. oldapp.py +16 -3
app.py CHANGED
@@ -1,38 +1,59 @@
 import streamlit as st
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
+import time
+from concurrent.futures import ThreadPoolExecutor, TimeoutError
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 @st.cache_resource
 def load_model():
     """Load model and tokenizer with caching"""
     try:
+        st.spinner("Loading model... This may take a few minutes")
+        logger.info("Starting model loading...")
+
+        # Load with 8-bit quantization for CPU
+        model = AutoModelForCausalLM.from_pretrained(
+            "NousResearch/Llama-3.2-1B",
+            load_in_8bit=True,  # Use 8-bit quantization
+            device_map="auto",  # Automatically handle device placement
+            low_cpu_mem_usage=True,
+            torch_dtype=torch.float32 if not torch.cuda.is_available() else torch.float16
+        )
+
         tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-3.2-1B")
-        model = AutoModelForCausalLM.from_pretrained("NousResearch/Llama-3.2-1B")
 
         # Set up padding token
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
             model.config.pad_token_id = model.config.eos_token_id
-
+
+        logger.info("Model loaded successfully")
         return model, tokenizer
     except Exception as e:
+        logger.error(f"Error loading model: {str(e)}")
         st.error(f"Error loading model: {str(e)}")
         return None, None
 
-# Page config
-st.set_page_config(page_title="Chat with Quasar-32B", layout="wide")
-st.title("Chat with Quasar-32B")
-
-# Initialize session state for chat history
-if 'messages' not in st.session_state:
-    st.session_state.messages = []
-
-# Load model and tokenizer
-model, tokenizer = load_model()
+def check_for_repetition(text, threshold=3):
+    """Check if the generated text has too many repetitions"""
+    words = text.split()
+    if len(words) < threshold:
+        return False
+
+    # Check for repeated phrases
+    for i in range(len(words) - threshold):
+        phrase = ' '.join(words[i:i+threshold])
+        if text.count(phrase) > 2:  # If phrase appears more than twice
+            return True
+    return False
 
-# Chat interface
-def generate_response(prompt):
-    """Generate response from the model"""
+def generate_response_with_timeout(model, tokenizer, prompt, timeout_seconds=30):
+    """Generate response with timeout and repetition checking"""
     try:
         # Prepare the input
         inputs = tokenizer(
@@ -40,26 +61,85 @@ def generate_response(prompt):
             return_tensors="pt",
             padding=True,
             truncation=True,
-            max_length=512  # Add max length for input
-        )
+            max_length=256  # Reduced for CPU
+        ).to(model.device)
 
-        # Generate response
+        start_time = time.time()
+
+        # Generate response with stricter parameters
         with torch.no_grad():
             outputs = model.generate(
                 inputs["input_ids"],
-                max_length=200,
+                max_length=100,  # Shorter responses
+                min_length=20,   # Ensure some minimum content
                 num_return_sequences=1,
-                temperature=0.7,
+                temperature=0.8,  # Slightly higher temperature
                 pad_token_id=tokenizer.pad_token_id,
-                attention_mask=inputs["attention_mask"]  # Add attention mask
+                attention_mask=inputs["attention_mask"],
+                do_sample=True,
+                top_p=0.92,
+                top_k=40,
+                repetition_penalty=1.5,  # Increased repetition penalty
+                no_repeat_ngram_size=3,  # Prevent 3-gram repetitions
+                early_stopping=True,
+                length_penalty=1.0
             )
 
-        # Decode and return the response
+        generation_time = time.time() - start_time
+        logger.info(f"Response generated in {generation_time:.2f} seconds")
+
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return response.replace(prompt, "").strip()  # Remove the input prompt from response
+        response = response.replace(prompt, "").strip()
+
+        # Check for repetitions and retry if necessary
+        if check_for_repetition(response):
+            logger.warning("Detected repetition, retrying with stricter parameters")
+            return "I apologize, but I'm having trouble generating a coherent response. Could you try rephrasing your question?"
+
+        return response
+
     except Exception as e:
+        logger.error(f"Error in generation: {str(e)}")
         return f"Error generating response: {str(e)}"
 
+# Page config
+st.set_page_config(page_title="Chat with Quasar-32B", layout="wide")
+
+# Add debug information in sidebar
+with st.sidebar:
+    st.write("### System Information")
+    st.write("Model: Quasar-32B")
+
+    # Device and memory information
+    device = "GPU" if torch.cuda.is_available() else "CPU"
+    st.write(f"Running on: {device}")
+    if torch.cuda.is_available():
+        st.write(f"GPU: {torch.cuda.get_device_name(0)}")
+        st.write(f"Memory Usage: {torch.cuda.memory_allocated()/1024**2:.2f} MB")
+    else:
+        import psutil
+        st.write(f"CPU Memory Usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")
+        st.write("⚠️ Running on CPU - Responses may be slow")
+
+    # Model settings
+    st.write("### Model Settings")
+    if 'temperature' not in st.session_state:
+        st.session_state.temperature = 0.8
+    if 'max_length' not in st.session_state:
+        st.session_state.max_length = 100
+
+    st.session_state.temperature = st.slider("Temperature", 0.1, 1.0, st.session_state.temperature)
+    st.session_state.max_length = st.slider("Max Length", 50, 200, st.session_state.max_length)
+
+st.title("Chat with Quasar-32B")
+
+# Initialize session state for chat history
+if 'messages' not in st.session_state:
+    st.session_state.messages = []
+
+# Load model and tokenizer
+model, tokenizer = load_model()
+
 # Chat interface
 st.write("### Chat")
 chat_container = st.container()
@@ -83,27 +163,32 @@ if prompt := st.chat_input("Type your message here"):
     # Generate and display assistant response
     if model and tokenizer:
         with st.chat_message("assistant"):
-            with st.spinner("Thinking..."):
-                response = generate_response(prompt)
+            try:
+                with st.spinner("Generating response... (timeout: 30s)"):
+                    with ThreadPoolExecutor() as executor:
+                        future = executor.submit(
+                            generate_response_with_timeout,
+                            model,
+                            tokenizer,
+                            prompt
+                        )
+                        response = future.result(timeout=30)
+
                 st.write(response)
                 st.session_state.messages.append({"role": "assistant", "content": response})
+
+            except TimeoutError:
+                error_msg = "Response generation timed out. The model might be overloaded."
+                st.error(error_msg)
+                logger.error(error_msg)
+            except Exception as e:
+                error_msg = f"Error generating response: {str(e)}"
+                st.error(error_msg)
+                logger.error(error_msg)
     else:
        st.error("Model failed to load. Please check your configuration.")
 
 # Add a button to clear chat history
 if st.button("Clear Chat History"):
     st.session_state.messages = []
-    st.experimental_rerun()
-
-# Display system information
-with st.sidebar:
-    st.write("### System Information")
-    st.write("Model: Quasar-32B")
-    st.write("Status: Running" if model and tokenizer else "Status: Not loaded")
-
-    # Add some helpful instructions
-    st.write("### Instructions")
-    st.write("1. Type your message in the chat input")
-    st.write("2. Press Enter or click Send")
-    st.write("3. Wait for the AI to respond")
-    st.write("4. Use 'Clear Chat History' to start fresh")
+    st.experimental_rerun()
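The new generation path bounds how long the UI waits by submitting the work to a ThreadPoolExecutor and calling future.result(timeout=30). Below is a minimal, self-contained sketch of that pattern; the slow_generate stand-in is illustrative, not part of the commit. Note that the timeout only abandons the wait: the worker thread keeps running its task in the background, and leaving the `with` block still waits for it to finish.

import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError

def slow_generate(prompt: str) -> str:
    """Hypothetical stand-in for generate_response_with_timeout."""
    time.sleep(5)  # simulate a slow model.generate() call on CPU
    return f"echo: {prompt}"

with ThreadPoolExecutor() as executor:
    future = executor.submit(slow_generate, "hello")
    try:
        print(future.result(timeout=2))  # give up waiting after 2 seconds
    except TimeoutError:
        # The wait is abandoned, but the worker thread is not cancelled;
        # exiting the `with` block still blocks until it completes.
        print("Response generation timed out.")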
oldapp.py CHANGED
@@ -8,6 +8,12 @@ def load_model():
     try:
         tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-3.2-1B")
         model = AutoModelForCausalLM.from_pretrained("NousResearch/Llama-3.2-1B")
+
+        # Set up padding token
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+            model.config.pad_token_id = model.config.eos_token_id
+
         return model, tokenizer
     except Exception as e:
         st.error(f"Error loading model: {str(e)}")
@@ -29,7 +35,13 @@ def generate_response(prompt):
     """Generate response from the model"""
     try:
         # Prepare the input
-        inputs = tokenizer(prompt, return_tensors="pt", padding=True)
+        inputs = tokenizer(
+            prompt,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=512  # Add max length for input
+        )
 
         # Generate response
         with torch.no_grad():
@@ -38,12 +50,13 @@ def generate_response(prompt):
                 max_length=200,
                 num_return_sequences=1,
                 temperature=0.7,
-                pad_token_id=tokenizer.eos_token_id
+                pad_token_id=tokenizer.pad_token_id,
+                attention_mask=inputs["attention_mask"]  # Add attention mask
             )
 
         # Decode and return the response
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return response
+        return response.replace(prompt, "").strip()  # Remove the input prompt from response
     except Exception as e:
         return f"Error generating response: {str(e)}"
 
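One detail worth illustrating about the generation settings in both files: in transformers, generate(max_length=N) counts the prompt tokens plus the newly generated ones, while max_new_tokens bounds only the continuation. A small sketch of the difference, reusing the model name from the diff; the prompt text and limits below are illustrative assumptions, not values from the commit.

# Sketch: max_length vs max_new_tokens in transformers' generate().
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("NousResearch/Llama-3.2-1B")

inputs = tokenizer("Tell me a short story about a robot.", return_tensors="pt")
prompt_len = inputs["input_ids"].shape[1]

with torch.no_grad():
    # max_length=100 leaves only (100 - prompt_len) tokens for the reply;
    # with a long prompt this can cut the answer down to almost nothing.
    out_a = model.generate(inputs["input_ids"], max_length=100,
                           pad_token_id=tokenizer.eos_token_id)
    # max_new_tokens bounds just the continuation, independent of prompt length.
    out_b = model.generate(inputs["input_ids"], max_new_tokens=100,
                           pad_token_id=tokenizer.eos_token_id)

print(out_a.shape[1] - prompt_len, "new tokens with max_length=100")
print(out_b.shape[1] - prompt_len, "new tokens with max_new_tokens=100")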