Spaces:

Shilpaj
/

SmoLLMv2

Sleeping

App Files Files Community

Shilpaj committed 20 days ago

Commit

baa4d1d

verified ·

1 Parent(s): ea06c18

Upload app.py

Browse files

Files changed (1) hide show

app.py +70 -53

app.py CHANGED Viewed

@@ -11,12 +11,16 @@ from transformers import GPT2Tokenizer
 import spaces
 import os
 from pathlib import Path
 # Local imports
 from smollmv2 import SmollmV2
 from config import SmollmConfig, DataConfig
 from smollv2_lightning import LitSmollmv2
 def combine_model_parts(model_dir="split_models", output_file="checkpoints/last.ckpt"):
     """
@@ -56,7 +60,7 @@ def load_model():
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     # Load model directly from checkpoint
-    checkpoint_path = "last.ckpt"  # Assuming the checkpoint is in the root directory
     if not os.path.exists(checkpoint_path):
         raise FileNotFoundError(
@@ -64,21 +68,25 @@ def load_model():
             "Please ensure the model checkpoint file 'last.ckpt' is present in the root directory."
         )
-    # Load the model from checkpoint using Lightning module
-    model = LitSmollmv2.load_from_checkpoint(
-        checkpoint_path,
-        model_config=SmollmConfig,
-        strict=False
-    )
-    model.to(device)
-    model.eval()
-    # Initialize tokenizer
-    tokenizer = GPT2Tokenizer.from_pretrained(DataConfig.tokenizer_path)
-    tokenizer.pad_token = tokenizer.eos_token
-    return model, tokenizer, device
 @spaces.GPU(enable_queue=True)
@@ -86,50 +94,59 @@ def generate_text(prompt, num_tokens, temperature=0.8, top_p=0.9):
     """
     Generate text using the SmollmV2 model.
     """
-    # Ensure num_tokens doesn't exceed model's block size
-    num_tokens = min(num_tokens, SmollmConfig.block_size)
-    # Tokenize input prompt
-    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
-    # Generate tokens one at a time
-    for _ in range(num_tokens):
-        # Get the model's predictions
-        with torch.no_grad():
-            with torch.autocast(device_type=device, dtype=torch.bfloat16):
-                logits, _ = model.model(input_ids)
-        # Get the next token probabilities
-        logits = logits[:, -1, :] / temperature
-        probs = F.softmax(logits, dim=-1)
-        # Apply top-p sampling
-        if top_p > 0:
-            sorted_probs, sorted_indices = torch.sort(probs, descending=True)
-            cumsum_probs = torch.cumsum(sorted_probs, dim=-1)
-            sorted_indices_to_keep = cumsum_probs <= top_p
-            sorted_indices_to_keep[..., 1:] = sorted_indices_to_keep[..., :-1].clone()
-            sorted_indices_to_keep[..., 0] = 1
-            indices_to_keep = torch.zeros_like(probs, dtype=torch.bool).scatter_(-1, sorted_indices, sorted_indices_to_keep)
-            probs = torch.where(indices_to_keep, probs, torch.zeros_like(probs))
-            probs = probs / probs.sum(dim=-1, keepdim=True)
-        # Sample next token
-        next_token = torch.multinomial(probs, num_samples=1)
-        # Append to input_ids
-        input_ids = torch.cat([input_ids, next_token], dim=-1)
-        # Stop if we generate an EOS token
-        if next_token.item() == tokenizer.eos_token_id:
-            break
-    # Decode and return the generated text
-    generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
-    return generated_text
 # Load the model globally
-model, tokenizer, device = load_model()
 # Create the Gradio interface
 demo = gr.Interface(

 import spaces
 import os
 from pathlib import Path
+import warnings
 # Local imports
 from smollmv2 import SmollmV2
 from config import SmollmConfig, DataConfig
 from smollv2_lightning import LitSmollmv2
+# Work around the device properties issue: let TorchDynamo suppress compile errors (falling back to eager execution) and silence UserWarnings
+torch._dynamo.config.suppress_errors = True
+warnings.filterwarnings('ignore', category=UserWarning)
 def combine_model_parts(model_dir="split_models", output_file="checkpoints/last.ckpt"):
     """
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     # Load model directly from checkpoint
+    checkpoint_path = "last.ckpt"
     if not os.path.exists(checkpoint_path):
         raise FileNotFoundError(
             "Please ensure the model checkpoint file 'last.ckpt' is present in the root directory."
         )
+    try:
+        # Load the model from checkpoint using Lightning module
+        model = LitSmollmv2.load_from_checkpoint(
+            checkpoint_path,
+            model_config=SmollmConfig,
+            strict=False
+        )
+        model.to(device)
+        model.eval()
+        # Initialize tokenizer
+        tokenizer = GPT2Tokenizer.from_pretrained(DataConfig.tokenizer_path)
+        tokenizer.pad_token = tokenizer.eos_token
+        return model, tokenizer, device
+    except Exception as e:
+        raise RuntimeError(f"Error loading model: {str(e)}")
 @spaces.GPU(enable_queue=True)
     """
     Generate text using the SmollmV2 model.
     """
+    try:
+        # Ensure num_tokens doesn't exceed model's block size
+        num_tokens = min(num_tokens, SmollmConfig.block_size)
+        # Tokenize input prompt
+        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
+        # Generate tokens one at a time
+        with torch.inference_mode():  # Use inference_mode instead of no_grad
+            for _ in range(num_tokens):
+                # Get the model's predictions
+                with torch.autocast(device_type=device, dtype=torch.float16):  # Changed to float16
+                    outputs = model(input_ids)
+                    logits = outputs[0] if isinstance(outputs, tuple) else outputs
+                # Get the next token probabilities
+                logits = logits[:, -1, :] / temperature
+                probs = F.softmax(logits, dim=-1)
+                # Apply top-p sampling
+                if top_p > 0:
+                    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
+                    cumsum_probs = torch.cumsum(sorted_probs, dim=-1)
+                    sorted_indices_to_keep = cumsum_probs <= top_p
+                    sorted_indices_to_keep[..., 1:] = sorted_indices_to_keep[..., :-1].clone()
+                    sorted_indices_to_keep[..., 0] = 1
+                    indices_to_keep = torch.zeros_like(probs, dtype=torch.bool).scatter_(-1, sorted_indices, sorted_indices_to_keep)
+                    probs = torch.where(indices_to_keep, probs, torch.zeros_like(probs))
+                    probs = probs / probs.sum(dim=-1, keepdim=True)
+                # Sample next token
+                next_token = torch.multinomial(probs, num_samples=1)
+                # Append to input_ids
+                input_ids = torch.cat([input_ids, next_token], dim=-1)
+                # Stop if we generate an EOS token
+                if next_token.item() == tokenizer.eos_token_id:
+                    break
+        # Decode and return the generated text
+        generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
+        return generated_text
+    except Exception as e:
+        return f"Error during text generation: {str(e)}"
 # Load the model globally
+try:
+    model, tokenizer, device = load_model()
+except Exception as e:
+    print(f"Error initializing model: {str(e)}")
+    raise
 # Create the Gradio interface
 demo = gr.Interface(