Eiad Gomaa committed
Commit 5ab0078 · 1 Parent(s): 04b4d4a
Files changed (2)
  1. app.py +124 -39
  2. oldapp.py +16 -3
app.py CHANGED
@@ -1,38 +1,59 @@
 import streamlit as st
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
+import time
+from concurrent.futures import ThreadPoolExecutor, TimeoutError
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 @st.cache_resource
 def load_model():
     """Load model and tokenizer with caching"""
     try:
+        st.spinner("Loading model... This may take a few minutes")
+        logger.info("Starting model loading...")
+
+        # Load with 8-bit quantization for CPU
+        model = AutoModelForCausalLM.from_pretrained(
+            "NousResearch/Llama-3.2-1B",
+            load_in_8bit=True,  # Use 8-bit quantization
+            device_map="auto",  # Automatically handle device placement
+            low_cpu_mem_usage=True,
+            torch_dtype=torch.float32 if not torch.cuda.is_available() else torch.float16
+        )
+
         tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-3.2-1B")
-        model = AutoModelForCausalLM.from_pretrained("NousResearch/Llama-3.2-1B")
 
         # Set up padding token
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
             model.config.pad_token_id = model.config.eos_token_id
-
+
+        logger.info("Model loaded successfully")
         return model, tokenizer
     except Exception as e:
+        logger.error(f"Error loading model: {str(e)}")
         st.error(f"Error loading model: {str(e)}")
         return None, None
 
-# Page config
-st.set_page_config(page_title="Chat with Quasar-32B", layout="wide")
-st.title("Chat with Quasar-32B")
-
-# Initialize session state for chat history
-if 'messages' not in st.session_state:
-    st.session_state.messages = []
-
-# Load model and tokenizer
-model, tokenizer = load_model()
+def check_for_repetition(text, threshold=3):
+    """Check if the generated text has too many repetitions"""
+    words = text.split()
+    if len(words) < threshold:
+        return False
+
+    # Check for repeated phrases
+    for i in range(len(words) - threshold):
+        phrase = ' '.join(words[i:i+threshold])
+        if text.count(phrase) > 2:  # If phrase appears more than twice
+            return True
+    return False
 
-# Chat interface
-def generate_response(prompt):
-    """Generate response from the model"""
+def generate_response_with_timeout(model, tokenizer, prompt, timeout_seconds=30):
+    """Generate response with timeout and repetition checking"""
     try:
         # Prepare the input
         inputs = tokenizer(
@@ -40,26 +61,85 @@ def generate_response(prompt):
             return_tensors="pt",
             padding=True,
             truncation=True,
-            max_length=512  # Add max length for input
-        )
+            max_length=256  # Reduced for CPU
+        ).to(model.device)
 
-        # Generate response
+        start_time = time.time()
+
+        # Generate response with stricter parameters
         with torch.no_grad():
             outputs = model.generate(
                 inputs["input_ids"],
-                max_length=200,
+                max_length=100,  # Shorter responses
+                min_length=20,   # Ensure some minimum content
                 num_return_sequences=1,
-                temperature=0.7,
+                temperature=0.8,  # Slightly higher temperature
                 pad_token_id=tokenizer.pad_token_id,
-                attention_mask=inputs["attention_mask"]  # Add attention mask
+                attention_mask=inputs["attention_mask"],
+                do_sample=True,
+                top_p=0.92,
+                top_k=40,
+                repetition_penalty=1.5,  # Increased repetition penalty
+                no_repeat_ngram_size=3,  # Prevent 3-gram repetitions
+                early_stopping=True,
+                length_penalty=1.0
             )
 
-        # Decode and return the response
+        generation_time = time.time() - start_time
+        logger.info(f"Response generated in {generation_time:.2f} seconds")
+
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return response.replace(prompt, "").strip()  # Remove the input prompt from response
+        response = response.replace(prompt, "").strip()
+
+        # Check for repetitions and retry if necessary
+        if check_for_repetition(response):
+            logger.warning("Detected repetition, retrying with stricter parameters")
+            return "I apologize, but I'm having trouble generating a coherent response. Could you try rephrasing your question?"
+
+        return response
+
     except Exception as e:
+        logger.error(f"Error in generation: {str(e)}")
         return f"Error generating response: {str(e)}"
 
+# Page config
+st.set_page_config(page_title="Chat with Quasar-32B", layout="wide")
+
+# Add debug information in sidebar
+with st.sidebar:
+    st.write("### System Information")
+    st.write("Model: Quasar-32B")
+
+    # Device and memory information
+    device = "GPU" if torch.cuda.is_available() else "CPU"
+    st.write(f"Running on: {device}")
+    if torch.cuda.is_available():
+        st.write(f"GPU: {torch.cuda.get_device_name(0)}")
+        st.write(f"Memory Usage: {torch.cuda.memory_allocated()/1024**2:.2f} MB")
+    else:
+        import psutil
+        st.write(f"CPU Memory Usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")
+        st.write("⚠️ Running on CPU - Responses may be slow")
+
+    # Model settings
+    st.write("### Model Settings")
+    if 'temperature' not in st.session_state:
+        st.session_state.temperature = 0.8
+    if 'max_length' not in st.session_state:
+        st.session_state.max_length = 100
+
+    st.session_state.temperature = st.slider("Temperature", 0.1, 1.0, st.session_state.temperature)
+    st.session_state.max_length = st.slider("Max Length", 50, 200, st.session_state.max_length)
+
+st.title("Chat with Quasar-32B")
+
+# Initialize session state for chat history
+if 'messages' not in st.session_state:
+    st.session_state.messages = []
+
+# Load model and tokenizer
+model, tokenizer = load_model()
+
 # Chat interface
 st.write("### Chat")
 chat_container = st.container()
@@ -83,27 +163,32 @@ if prompt := st.chat_input("Type your message here"):
     # Generate and display assistant response
     if model and tokenizer:
         with st.chat_message("assistant"):
-            with st.spinner("Thinking..."):
-                response = generate_response(prompt)
+            try:
+                with st.spinner("Generating response... (timeout: 30s)"):
+                    with ThreadPoolExecutor() as executor:
+                        future = executor.submit(
+                            generate_response_with_timeout,
+                            model,
+                            tokenizer,
+                            prompt
+                        )
+                        response = future.result(timeout=30)
+
                 st.write(response)
                 st.session_state.messages.append({"role": "assistant", "content": response})
+
+            except TimeoutError:
+                error_msg = "Response generation timed out. The model might be overloaded."
+                st.error(error_msg)
+                logger.error(error_msg)
+            except Exception as e:
+                error_msg = f"Error generating response: {str(e)}"
+                st.error(error_msg)
+                logger.error(error_msg)
     else:
        st.error("Model failed to load. Please check your configuration.")
 
 # Add a button to clear chat history
 if st.button("Clear Chat History"):
     st.session_state.messages = []
-    st.experimental_rerun()
-
-# Display system information
-with st.sidebar:
-    st.write("### System Information")
-    st.write("Model: Quasar-32B")
-    st.write("Status: Running" if model and tokenizer else "Status: Not loaded")
-
-    # Add some helpful instructions
-    st.write("### Instructions")
-    st.write("1. Type your message in the chat input")
-    st.write("2. Press Enter or click Send")
-    st.write("3. Wait for the AI to respond")
-    st.write("4. Use 'Clear Chat History' to start fresh")
+    st.experimental_rerun()
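The new generation path bounds how long the UI waits by submitting the work to a ThreadPoolExecutor and calling future.result(timeout=30). Below is a minimal, self-contained sketch of that pattern; the slow_generate stand-in is illustrative, not part of the commit. Note that the timeout only abandons the wait: the worker thread keeps running its task in the background, and leaving the `with` block still waits for it to finish.

import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError

def slow_generate(prompt: str) -> str:
    """Hypothetical stand-in for generate_response_with_timeout."""
    time.sleep(5)  # simulate a slow model.generate() call on CPU
    return f"echo: {prompt}"

with ThreadPoolExecutor() as executor:
    future = executor.submit(slow_generate, "hello")
    try:
        print(future.result(timeout=2))  # give up waiting after 2 seconds
    except TimeoutError:
        # The wait is abandoned, but the worker thread is not cancelled;
        # exiting the `with` block still blocks until it completes.
        print("Response generation timed out.")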
oldapp.py CHANGED
@@ -8,6 +8,12 @@ def load_model():
     try:
         tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-3.2-1B")
         model = AutoModelForCausalLM.from_pretrained("NousResearch/Llama-3.2-1B")
+
+        # Set up padding token
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+            model.config.pad_token_id = model.config.eos_token_id
+
         return model, tokenizer
     except Exception as e:
         st.error(f"Error loading model: {str(e)}")
@@ -29,7 +35,13 @@ def generate_response(prompt):
     """Generate response from the model"""
     try:
         # Prepare the input
-        inputs = tokenizer(prompt, return_tensors="pt", padding=True)
+        inputs = tokenizer(
+            prompt,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=512  # Add max length for input
+        )
 
         # Generate response
         with torch.no_grad():
@@ -38,12 +50,13 @@ def generate_response(prompt):
                 max_length=200,
                 num_return_sequences=1,
                 temperature=0.7,
-                pad_token_id=tokenizer.eos_token_id
+                pad_token_id=tokenizer.pad_token_id,
+                attention_mask=inputs["attention_mask"]  # Add attention mask
             )
 
         # Decode and return the response
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return response
+        return response.replace(prompt, "").strip()  # Remove the input prompt from response
     except Exception as e:
         return f"Error generating response: {str(e)}"
 
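One detail worth illustrating about the generation settings in both files: in transformers, generate(max_length=N) counts the prompt tokens plus the newly generated ones, while max_new_tokens bounds only the continuation. A small sketch of the difference, reusing the model name from the diff; the prompt text and limits below are illustrative assumptions, not values from the commit.

# Sketch: max_length vs max_new_tokens in transformers' generate().
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("NousResearch/Llama-3.2-1B")

inputs = tokenizer("Tell me a short story about a robot.", return_tensors="pt")
prompt_len = inputs["input_ids"].shape[1]

with torch.no_grad():
    # max_length=100 leaves only (100 - prompt_len) tokens for the reply;
    # with a long prompt this can cut the answer down to almost nothing.
    out_a = model.generate(inputs["input_ids"], max_length=100,
                           pad_token_id=tokenizer.eos_token_id)
    # max_new_tokens bounds just the continuation, independent of prompt length.
    out_b = model.generate(inputs["input_ids"], max_new_tokens=100,
                           pad_token_id=tokenizer.eos_token_id)

print(out_a.shape[1] - prompt_len, "new tokens with max_length=100")
print(out_b.shape[1] - prompt_len, "new tokens with max_new_tokens=100")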