Eiad Gomaa committed
Commit f488be3 · 1 Parent(s): 6f6da11
Update app.py
app.py CHANGED
@@ -5,6 +5,9 @@ import time
 from concurrent.futures import ThreadPoolExecutor, TimeoutError
 import logging
 
+# Page config - this must be the first Streamlit command
+st.set_page_config(page_title="Chat with Quasar-32B", layout="wide")
+
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -21,20 +24,20 @@ def load_model():
     try:
         st.spinner("Loading model... This may take a few minutes")
         logger.info("Starting model loading...")
-        
+
         # Basic model loading without device map
         model = AutoModelForCausalLM.from_pretrained(
             "NousResearch/Llama-3.2-1B",
             torch_dtype=torch.float32 # Use float32 for CPU
         )
-        
+
         tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-3.2-1B")
-        
+
         # Set up padding token
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
             model.config.pad_token_id = model.config.eos_token_id
-        
+
         logger.info("Model loaded successfully")
         return model, tokenizer
     except Exception as e:
@@ -47,7 +50,7 @@ def check_for_repetition(text, threshold=3):
     words = text.split()
     if len(words) < threshold:
         return False
-    
+
     # Check for repeated phrases
     for i in range(len(words) - threshold):
         phrase = ' '.join(words[i:i+threshold])
@@ -66,9 +69,9 @@ def generate_response_with_timeout(model, tokenizer, prompt, timeout_seconds=30)
             truncation=True,
             max_length=256 # Reduced for CPU
         )
-        
+
         start_time = time.time()
-        
+
         # Generate response with stricter parameters
         with torch.no_grad():
             outputs = model.generate(
@@ -86,47 +89,44 @@ def generate_response_with_timeout(model, tokenizer, prompt, timeout_seconds=30)
                 no_repeat_ngram_size=3, # Prevent 3-gram repetitions
                 early_stopping=True
             )
-        
+
         generation_time = time.time() - start_time
         logger.info(f"Response generated in {generation_time:.2f} seconds")
-        
+
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         response = response.replace(prompt, "").strip()
-        
+
         # Check for repetitions and retry if necessary
         if check_for_repetition(response):
             logger.warning("Detected repetition, retrying with stricter parameters")
             return "I apologize, but I'm having trouble generating a coherent response. Could you try rephrasing your question?"
-        
+
         return response
-        
+
     except Exception as e:
         logger.error(f"Error in generation: {str(e)}")
         return f"Error generating response: {str(e)}"
 
-# Page config
-st.set_page_config(page_title="Chat with Quasar-32B", layout="wide")
-
 # Add debug information in sidebar
 with st.sidebar:
     st.write("### System Information")
     st.write("Model: Quasar-32B")
-    
+
     # Device and memory information
     device = "GPU" if torch.cuda.is_available() else "CPU"
     st.write(f"Running on: {device}")
-    
+
     # Warning for CPU usage
     if not torch.cuda.is_available():
         st.warning("⚠️ Running on CPU - Responses may be very slow. Consider using a GPU or a smaller model.")
-    
+
     # Model settings
     st.write("### Model Settings")
     if 'temperature' not in st.session_state:
         st.session_state.temperature = 0.8
    if 'max_length' not in st.session_state:
         st.session_state.max_length = 100
-    
+
     st.session_state.temperature = st.slider("Temperature", 0.1, 1.0, st.session_state.temperature)
     st.session_state.max_length = st.slider("Max Length", 50, 200, st.session_state.max_length)
 
@@ -153,12 +153,12 @@ with chat_container:
 if prompt := st.chat_input("Type your message here"):
     # Add user message to chat history
     st.session_state.messages.append({"role": "user", "content": prompt})
-    
+
     # Display user message
     with chat_container:
         with st.chat_message("user"):
             st.write(prompt)
-    
+
     # Generate and display assistant response
     if model and tokenizer:
         with st.chat_message("assistant"):
@@ -172,10 +172,10 @@ if prompt := st.chat_input("Type your message here"):
                         prompt
                     )
                     response = future.result(timeout=30)
-                    
+
                     st.write(response)
                     st.session_state.messages.append({"role": "assistant", "content": response})
-                    
+
             except TimeoutError:
                 error_msg = "Response generation timed out. The model might be overloaded."
                 st.error(error_msg)
@@ -190,4 +190,4 @@ if prompt := st.chat_input("Type your message here"):
 # Add a button to clear chat history
 if st.button("Clear Chat History"):
     st.session_state.messages = []
-    st.experimental_rerun()
+    st.experimental_rerun()
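
The substantive change in this commit hoists st.set_page_config to the top of app.py, directly after the imports, and deletes the later call (the remaining hunks only strip trailing whitespace from blank lines). Streamlit expects set_page_config to be the first Streamlit command a script executes and has historically raised a StreamlitAPIException when another st.* call runs first. A minimal sketch of the resulting ordering is below; the page title and layout come from the diff, while the st.title line and the sidebar snippet are illustrative stand-ins for the rest of the app, not code from this commit.

import logging

import streamlit as st

# set_page_config has to be the first Streamlit command executed on the page;
# if another st.* call runs first, Streamlit raises a StreamlitAPIException.
st.set_page_config(page_title="Chat with Quasar-32B", layout="wide")

# Non-Streamlit setup (logging, helper definitions) can sit before or after,
# since it emits no Streamlit commands.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Every remaining Streamlit call comes after set_page_config.
st.title("Chat with Quasar-32B")        # illustrative, not part of the diff
with st.sidebar:
    st.write("### System Information")  # mirrors the app's sidebar section

Keeping the call at the top also means no future top-level Streamlit call can accidentally run ahead of it, which was possible while it sat at lines 107-108 of the previous revision.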