anemll committed
Commit f32a712 · verified · 1 Parent(s): 86baea7

Fixed GIL issue


Race condition between CoreML inference and the causal_mask update.

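For context, the pattern this commit settles on is to build the causal mask once, before any CoreML predict() call, and to share it read-only: run_prefill and generate_next_token only take slices of the pre-built tensor instead of rebuilding or updating the mask while a prediction may still be in flight on another thread. A minimal sketch of that setup follows; make_causal_mask, initialize_causal_mask, run_prefill and generate_next_token are the names used in chat.py, while the -inf fill inside make_causal_mask and the demo values under __main__ are illustrative assumptions.

```python
import numpy as np
import torch

def make_causal_mask(length, start):
    """Additive attention mask: 0 where attention is allowed, -inf elsewhere."""
    # The -inf fill is an assumption; only the final masking line appears in this diff.
    mask = np.full((1, 1, length, length), -np.inf, dtype=np.float16)
    row_indices = np.arange(length).reshape(length, 1)
    col_indices = np.arange(length).reshape(1, length)
    mask[:, :, col_indices <= (row_indices + start)] = 0
    return mask

def initialize_causal_mask(context_length):
    """Build the mask once, up front; it is never mutated afterwards."""
    return torch.tensor(make_causal_mask(context_length, 0), dtype=torch.float16)

if __name__ == "__main__":
    context_length, batch_size, pos = 512, 64, 10    # illustrative values
    causal_mask = initialize_causal_mask(context_length)
    prefill_mask = causal_mask[:, :, :batch_size, :]  # slice passed during prefill batches
    decode_mask = causal_mask[:, :, pos - 1:pos, :]   # slice passed for single-token decode
    print(prefill_mask.shape, decode_mask.shape)      # [1, 1, 64, 512] and [1, 1, 1, 512]
```

The slices shown above mirror the ones visible in the diff: causal_mask[:, :, :batch_size, :] in run_prefill and causal_mask[:, :, pos-1:pos, :] in generate_next_token.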
Files changed (1)
  1. chat.py +214 -313
chat.py CHANGED
@@ -26,10 +26,8 @@ DARK_BLUE = "\033[34m"
26
  LIGHT_GREEN = "\033[92m"
27
  RESET_COLOR = "\033[0m"
28
 
29
- # Add at the top with other constants
30
  WARMUP_TOKEN_LIMIT = 10 # Maximum tokens to generate during warmup
31
- THINKING_MODE = False
32
- THINKING_PROMPT = """You are a deep thinking AI, you may use extremely long chains of thought to deeply consider the problem and deliberate with yourself via systematic reasoning processes to help come to a correct solution prior to answering. You should enclose your thoughts and internal monologue inside <think> </think> tags, and then provide your solution or response to the problem."""
33
 
34
  class TokenPrinter:
35
  """Handles background printing of generated tokens."""
@@ -42,12 +40,9 @@ class TokenPrinter:
42
  self.lock = threading.Lock()
43
  self.thinking = True # Track if we're still in thinking mode
44
  self.decoding_buffer = [] # Buffer for token IDs
45
- # Timing and stats tracking
46
  self.start_time = time.time()
47
  self.token_count = 0
48
- self.prefill_time = 0
49
- self.inference_time = 0
50
- self.context_pos = 0
51
  self.start()
52
 
53
  def start(self):
@@ -108,15 +103,15 @@ class TokenPrinter:
108
  self.thread.join(timeout=1.0)
109
  except Exception:
110
  pass
111
- print(RESET_COLOR) # Reset color at the end
112
  return self.buffer
113
 
114
- def set_timing(self, prefill_time, inference_time, context_pos):
115
- """Set timing information."""
116
- self.prefill_time = prefill_time
117
- self.inference_time = inference_time
118
- self.context_pos = context_pos
119
-
120
  def parse_model_path(path):
121
  """Parse model path and return full path with .mlmodelc or .mlpackage extension."""
122
  path = Path(path)
@@ -193,89 +188,6 @@ def load_model(path, function_name=None):
193
  print("\nTry using the .mlpackage version instead, or recompile the model.")
194
  raise
195
 
196
- def parse_args():
197
- parser = argparse.ArgumentParser(description='Full Chat with CoreML LLaMA with context window shifting, gil resolved (c) 2025 Anemll')
198
-
199
- # Add meta.yaml option
200
- parser.add_argument('--meta', type=str, help='Path to meta.yaml to load all parameters')
201
-
202
- # Add existing arguments
203
- parser.add_argument('--d', '--dir', type=str, default='.',
204
- help='Directory containing model files (default: current directory)')
205
- parser.add_argument('--embed', type=str, required=False,
206
- help='Path to embeddings model (relative to --dir)')
207
- parser.add_argument('--ffn', type=str, required=False,
208
- help='Path to FFN model (can be chunked, relative to --dir)')
209
- parser.add_argument('--lmhead', type=str, required=False,
210
- help='Path to LM head model (relative to --dir)')
211
- parser.add_argument('--tokenizer', type=str, required=False,
212
- help='Path to tokenizer')
213
-
214
- # Add new argument for auto-generation
215
- parser.add_argument('--prompt', type=str,
216
- help='If specified, run once with this prompt and exit')
217
-
218
- # Add no-warmup flag
219
- parser.add_argument('--nw', action='store_true',
220
- help='Skip warmup phase')
221
-
222
- # Model configuration
223
- parser.add_argument('--context-length', type=int,
224
- help='Context length for the model (default: 512), if not provided, it will be detected from the model directory name ctxNUMBER')
225
- parser.add_argument('--batch-size', type=int,
226
- help='Batch size for prefill (default: 64)')
227
-
228
- args = parser.parse_args()
229
-
230
- # If meta.yaml is provided, load parameters from it
231
- if args.meta:
232
- try:
233
- with open(args.meta, 'r') as f:
234
- meta = yaml.safe_load(f)
235
- params = meta['model_info']['parameters']
236
-
237
- # Set model directory to meta.yaml directory if not specified
238
- if not args.d or args.d == '.':
239
- args.d = str(Path(args.meta).parent)
240
-
241
- # Build model paths based on parameters
242
- prefix = params.get('model_prefix', 'llama') # Default to 'llama' if not specified
243
- lut_ffn = f"_lut{params['lut_ffn']}" if params['lut_ffn'] != 'none' else ''
244
- lut_lmhead = f"_lut{params['lut_lmhead']}" if params['lut_lmhead'] != 'none' else ''
245
- num_chunks = int(params['num_chunks'])
246
-
247
- # Set model paths if not specified
248
- if not args.embed:
249
- args.embed = f'{prefix}_embeddings'
250
- if not args.lmhead:
251
- args.lmhead = f'{prefix}_lm_head{lut_lmhead}'
252
- if not args.ffn:
253
- args.ffn = f'{prefix}_FFN_PF{lut_ffn}_chunk_01of{num_chunks:02d}'
254
- if not args.tokenizer:
255
- args.tokenizer = args.d
256
-
257
- # Set other parameters if not overridden by command line
258
- if args.context_length is None:
259
- args.context_length = int(params['context_length'])
260
- if args.batch_size is None:
261
- args.batch_size = int(params['batch_size'])
262
- args.num_chunks = num_chunks
263
-
264
- print(f"\nLoaded parameters from {args.meta}:")
265
- print(f" Context Length: {args.context_length}")
266
- print(f" Batch Size: {args.batch_size}")
267
- print(f" Num Chunks: {args.num_chunks}")
268
- print(f" Models Directory: {args.d}")
269
- print(f" Embeddings: {args.embed}")
270
- print(f" LM Head: {args.lmhead}")
271
- print(f" FFN: {args.ffn}")
272
-
273
- except Exception as e:
274
- print(f"\nError loading meta.yaml: {str(e)}")
275
- sys.exit(1)
276
-
277
- return args
278
-
279
  def load_metadata(model,args):
280
  # Extract metadata and config parameters
281
  metadata = {}
@@ -474,74 +386,84 @@ def make_causal_mask(length, start):
474
  mask[:, :, col_indices <= (row_indices + start)] = 0
475
  return mask
476
 
477
- def run_prefill(embed_model, ffn_models, input_ids, current_pos, context_length, batch_size, state, causal_mask):
478
  """Run prefill on the input sequence."""
479
- #print(f"[DEBUG] Running prefill from 0 to {current_pos}")
 
 
 
480
 
481
  # Process in batches
482
  batch_pos = 0
483
- while batch_pos < current_pos:
484
- batch_end = min(batch_pos + batch_size, current_pos)
485
  current_batch_size = batch_end - batch_pos
486
 
487
- #print(f"[DEBUG] Prefill batch {batch_pos}-{batch_end} (size={current_batch_size})")
488
-
489
  # Get current batch
490
  batch_input = input_ids[:, batch_pos:batch_end]
491
 
492
- # Pad to full batch size
493
  batch_input = F.pad(
494
  batch_input,
495
  (0, batch_size - current_batch_size),
496
  value=0
497
  )
498
 
499
- # Generate position IDs for this batch
500
- position_ids = torch.arange(batch_pos, batch_pos + batch_size, dtype=torch.int32)
501
-
502
- # Use the pre-initialized causal mask and extract the batch portion
503
- batch_causal_mask = causal_mask[:, :, batch_pos:batch_pos + batch_size, :]
504
 
505
  # Run embeddings
506
  hidden_states = torch.from_numpy(
507
  embed_model.predict({'input_ids': batch_input.numpy()})['hidden_states']
508
  )
509
 
510
- # Run through FFN chunks
511
  for ffn_model in ffn_models:
512
  if isinstance(ffn_model, dict):
513
  inputs = {
514
- 'hidden_states': hidden_states.numpy(),
515
- 'position_ids': position_ids.numpy(),
516
- 'causal_mask': batch_causal_mask.numpy(),
517
- 'current_pos': np.array([batch_pos], dtype=np.int32)
518
  }
519
  output = ffn_model['prefill'].predict(inputs, state)
520
  hidden_states = torch.from_numpy(output['output_hidden_states'])
521
 
522
  batch_pos = batch_end
523
 
524
- return torch.tensor([current_pos], dtype=torch.int32)
525
 
526
- def generate_next_token(embed_model, ffn_models, lmhead_model, input_ids, pos, context_length, state, causal_mask, temperature=0.0):
527
  """Generate the next token."""
528
  # Get current token
529
- current_token = input_ids[:, pos-1:pos]
530
 
531
  # Run embeddings
532
  hidden_states = torch.from_numpy(
533
  embed_model.predict({'input_ids': current_token.numpy()})['hidden_states']
534
- )
535
 
536
  # Create masks
537
  update_mask = torch.zeros((1, 1, context_length, 1), dtype=torch.float16)
538
  update_mask[0, 0, pos-1, 0] = 1.0
539
- position_ids = torch.tensor([pos-1], dtype=torch.int32)
540
 
541
- # Use the pre-initialized causal mask and extract the single position portion
542
- single_causal_mask = causal_mask[:, :, pos-1:pos, :]
 
 
 
 
543
 
544
- # Run through FFN chunks
545
  for ffn_model in ffn_models:
546
  if isinstance(ffn_model, dict):
547
  inputs = {
@@ -554,19 +476,25 @@ def generate_next_token(embed_model, ffn_models, lmhead_model, input_ids, pos, c
554
  output = ffn_model['infer'].predict(inputs, state)
555
  hidden_states = torch.from_numpy(output['output_hidden_states'])
556
 
557
- # Run LM head and get next token
558
  lm_output = lmhead_model.predict({'hidden_states': hidden_states.numpy()})
 
 
559
 
 
560
  if 'logits1' in lm_output:
 
561
  logits_parts = []
562
  for i in range(1, 9):
563
  key = f'logits{i}'
564
  if key in lm_output:
565
  logits_parts.append(torch.from_numpy(lm_output[key]))
566
- logits = torch.cat(logits_parts, dim=-1)
567
  else:
 
568
  logits = torch.from_numpy(lm_output['output_logits'])
569
 
 
570
  if temperature > 0:
571
  logits = logits / temperature
572
  probs = F.softmax(logits[0, -1, :], dim=-1)
@@ -588,93 +516,36 @@ def create_unified_state(ffn_models, context_length):
588
  print("\nCreated unified transformer state")
589
  return state
590
 
591
- def initialize_causal_mask(context_length):
592
- """Initialize causal mask for transformer attention."""
593
- causal_mask = make_causal_mask(context_length, 0)
594
- causal_mask = torch.tensor(causal_mask, dtype=torch.float16)
595
- print(f"\nInitialized causal mask for context length {context_length}")
596
- return causal_mask
597
-
598
- def get_user_input():
599
- """Get input from user, handling special key combinations."""
600
- global THINKING_MODE
601
- try:
602
- import termios
603
- import tty
604
- import sys
605
-
606
- def _getch():
607
- fd = sys.stdin.fileno()
608
- old_settings = termios.tcgetattr(fd)
609
- try:
610
- tty.setraw(sys.stdin.fileno())
611
- ch = sys.stdin.read(1)
612
- finally:
613
- termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
614
- return ch
615
-
616
- buffer = []
617
- while True:
618
- char = _getch()
619
-
620
- # Debug: print the character code
621
- print(f"\nKey pressed: {repr(char)} (hex: {hex(ord(char))})")
622
-
623
- # Check for Enter key
624
- if char == '\r' or char == '\n':
625
- print() # Move to next line
626
- input_text = ''.join(buffer)
627
- # Check if the command is /t
628
- if input_text == '/t':
629
- THINKING_MODE = not THINKING_MODE
630
- print(f"Thinking mode {'ON' if THINKING_MODE else 'OFF'}")
631
- buffer = [] # Clear buffer
632
- print(f"\n{LIGHT_GREEN}You{' (thinking)' if THINKING_MODE else ''}:{RESET_COLOR}", end=' ', flush=True)
633
- continue
634
- return input_text
635
-
636
- # Handle backspace
637
- if char == '\x7f': # backspace
638
- if buffer:
639
- buffer.pop()
640
- sys.stdout.write('\b \b') # Erase character
641
- sys.stdout.flush()
642
- continue
643
-
644
- # Handle Ctrl-C
645
- if char == '\x03': # Ctrl-C
646
- print("^C")
647
- raise KeyboardInterrupt
648
-
649
- # Print character and add to buffer
650
- sys.stdout.write(char)
651
- sys.stdout.flush()
652
- buffer.append(char)
653
-
654
- except ImportError:
655
- # Fallback for systems without termios
656
- return input("> ")
657
-
658
- def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state, causal_mask, auto_prompt=None, warmup=False):
659
  """Interactive chat loop."""
660
- global THINKING_MODE
661
  context_length = metadata.get('context_length')
662
  batch_size = metadata.get('batch_size', 64)
663
 
664
  if not warmup:
665
  print(f"\nUsing context length: {context_length}")
666
  print("\nStarting chat session. Press Ctrl+D to exit.")
667
- print("Type your message and press Enter to chat. Use /t to toggle thinking mode.")
668
- print(f"Thinking mode is {'ON' if THINKING_MODE else 'OFF'}")
669
 
670
- # Keep track of conversation history
671
  conversation = []
672
 
673
  try:
674
  while True:
675
  try:
676
  if not warmup:
677
- print(f"\n{LIGHT_GREEN}You{' (thinking)' if THINKING_MODE else ''}:{RESET_COLOR}", end=' ', flush=True)
678
  if auto_prompt is not None:
679
  user_input = auto_prompt
680
  if not warmup:
@@ -685,69 +556,41 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
685
  if not warmup:
686
  print("\nExiting chat...")
687
  break
688
-
689
  if not user_input:
690
  continue
691
-
692
- # Handle /t command
693
- if user_input == "/t":
694
- THINKING_MODE = not THINKING_MODE
695
- print(f"Thinking mode {'ON' if THINKING_MODE else 'OFF'}")
696
- continue
697
-
698
- # Add user message to conversation
699
- conversation.append({"role": "user", "content": user_input})
700
 
701
- # Format using chat template with full history
702
- if THINKING_MODE:
703
- # Add thinking prompt to system message
704
- conversation_with_thinking = [{"role": "system", "content": THINKING_PROMPT}] + conversation
705
- base_input_ids = tokenizer.apply_chat_template(
706
- conversation_with_thinking,
707
  return_tensors="pt",
708
  add_generation_prompt=True
709
  ).to(torch.int32)
710
  else:
711
- base_input_ids = tokenizer.apply_chat_template(
712
- conversation,
 
 
713
  return_tensors="pt",
714
- add_generation_prompt=True
715
- ).to(torch.int32)
716
 
717
- # Check if we need to trim history
718
- while base_input_ids.size(1) > context_length - 100: # Leave room for response
719
- # Remove oldest message pair (user + assistant)
720
- if len(conversation) > 2:
721
- conversation = conversation[2:] # Remove oldest pair
722
- base_input_ids = tokenizer.apply_chat_template(
723
- conversation,
724
- return_tensors="pt",
725
- add_generation_prompt=True
726
- ).to(torch.int32)
727
- else:
728
- # If only current message remains and still too long, truncate
729
- base_input_ids = base_input_ids[:, -context_length//2:]
730
- break
731
-
732
- context_pos = base_input_ids.size(1)
733
-
734
- # Pad sequence to context_size
735
- input_ids = F.pad(
736
- base_input_ids,
737
- (0, context_length - context_pos),
738
- value=0
739
- )
740
 
741
  if not warmup:
742
  print(f"\n{LIGHT_BLUE}Assistant:{RESET_COLOR}", end=' ', flush=True)
743
 
744
- # Initialize token printer and collect response
745
  token_printer = TokenPrinter(tokenizer)
746
- response_tokens = []
747
- generation_start_time = time.time()
748
 
749
  try:
750
- # Run prefill on entire context
 
 
 
751
  current_pos = run_prefill(
752
  embed_model,
753
  ffn_models,
@@ -758,51 +601,20 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
758
  state,
759
  causal_mask
760
  )
761
- #print(f"\n[DEBUG] After initial prefill - current_pos: {current_pos}")
762
 
763
- # Generation loop
764
  pos = context_pos
765
- tokens_generated = 0
766
- inference_start = time.time() # Start inference timing
767
 
768
- while True:
769
- # Check if we need to shift window
770
- if pos >= context_length - 2:
771
- # Calculate shift to maintain full batches
772
- batch_size = metadata.get('batch_size', 64)
773
- # Calculate max batches that fit in context
774
- max_batches = context_length // batch_size
775
- desired_batches = max(1, max_batches - 2) # Leave room for new tokens
776
- new_size = min(desired_batches * batch_size, context_length - batch_size)
777
-
778
- # Create shifted input_ids
779
- tmp = torch.zeros((1, context_length), dtype=torch.int32)
780
- tmp[:,0:new_size] = input_ids[:,pos-new_size:pos]
781
- input_ids = tmp
782
-
783
- # Reset state and run prefill
784
- # keep the same state
785
- #state = create_unified_state(ffn_models, context_length)
786
- current_pos = run_prefill(
787
- embed_model,
788
- ffn_models,
789
- input_ids,
790
- new_size, # Prefill the entire shifted content
791
- context_length,
792
- batch_size,
793
- state,
794
- causal_mask
795
- )
796
-
797
- # Start generating from the next position
798
- pos = new_size # Don't back up, continue from where we left off
799
-
800
- #print(f"\n[DEBUG] After shift - next token will be at pos {pos}")
801
- #print(f"[DEBUG] Context before next token: {tokenizer.decode(input_ids[0, pos-40:pos])}")
802
-
803
- window_shifted = True
804
-
805
- # Generate next token
806
  next_token = generate_next_token(
807
  embed_model,
808
  ffn_models,
@@ -814,54 +626,143 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
814
  causal_mask
815
  )
816
 
817
- # Add token
818
- input_ids[0, pos] = next_token
819
  if not warmup:
820
  token_printer.add_token(next_token)
821
  token_printer.drain_buffer()
822
- response_tokens.append(next_token)
823
 
824
  pos += 1
825
  tokens_generated += 1
 
826
 
827
- # In warmup mode, limit tokens
828
  if warmup and tokens_generated >= WARMUP_TOKEN_LIMIT:
829
  break
830
-
831
  if next_token == tokenizer.eos_token_id:
832
  break
833
 
834
- inference_time = time.time() - inference_start # Calculate inference time
835
-
836
- # Add assistant response to conversation
837
- response_text = token_printer.stop()
838
- conversation.append({"role": "assistant", "content": response_text})
839
 
840
- # Print stats only if not in warmup
841
  if not warmup:
842
- total_time = time.time() - generation_start_time
843
- prefill_time = total_time - inference_time
844
- inference_tokens_per_sec = len(response_tokens) / inference_time if inference_time > 0 else 0
845
- prefill_ms = prefill_time * 1000
846
- prefill_tokens_per_sec = context_pos / prefill_time if prefill_time > 0 else 0
847
- print(f"{DARK_BLUE}{inference_tokens_per_sec:.1f} t/s, "
848
- f"TTFT: {prefill_ms:.1f}ms ({prefill_tokens_per_sec:.1f} t/s), "
849
- f"{len(response_tokens)} tokens{RESET_COLOR}")
 
850
 
 
851
  if auto_prompt is not None:
852
  break
853
 
854
  except KeyboardInterrupt:
855
- if not warmup:
856
- print("\nGeneration interrupted")
857
  token_printer.stop()
858
  continue
859
 
860
  except Exception as e:
861
- if not warmup:
862
- print(f"\nError in chat loop: {str(e)}")
863
- import traceback
864
- traceback.print_exc()
865
 
866
  def main():
867
  args = parse_args()
@@ -926,7 +827,7 @@ def main():
926
  lmhead_model=lmhead_model,
927
  tokenizer=tokenizer,
928
  metadata=metadata,
929
- state=state, # Pass the state
930
  causal_mask=causal_mask, # Pass the causal mask
931
  warmup=True,
932
  auto_prompt="who are you?"
@@ -939,7 +840,7 @@ def main():
939
  lmhead_model=lmhead_model,
940
  tokenizer=tokenizer,
941
  metadata=metadata,
942
- state=state, # Pass the state
943
  causal_mask=causal_mask, # Pass the causal mask
944
  warmup=False,
945
  auto_prompt=args.prompt
 
26
  LIGHT_GREEN = "\033[92m"
27
  RESET_COLOR = "\033[0m"
28
 
29
+ # Add at top with other constants
30
  WARMUP_TOKEN_LIMIT = 10 # Maximum tokens to generate during warmup
 
 
31
 
32
  class TokenPrinter:
33
  """Handles background printing of generated tokens."""
 
40
  self.lock = threading.Lock()
41
  self.thinking = True # Track if we're still in thinking mode
42
  self.decoding_buffer = [] # Buffer for token IDs
43
+ # Add token counting and timing
44
  self.start_time = time.time()
45
  self.token_count = 0
 
 
 
46
  self.start()
47
 
48
  def start(self):
 
103
  self.thread.join(timeout=1.0)
104
  except Exception:
105
  pass
106
+ # Calculate and print tokens/s with shorter format in blue
107
+ elapsed = time.time() - self.start_time
108
+ if elapsed > 0 and self.token_count > 0:
109
+ tokens_per_sec = self.token_count / elapsed
110
+ print(f"\n{DARK_BLUE}{tokens_per_sec:.1f} t/s{RESET_COLOR}")
111
+ else:
112
+ print(RESET_COLOR) # Reset color at the end
113
  return self.buffer
114
 
 
 
 
 
 
 
115
  def parse_model_path(path):
116
  """Parse model path and return full path with .mlmodelc or .mlpackage extension."""
117
  path = Path(path)
 
188
  print("\nTry using the .mlpackage version instead, or recompile the model.")
189
  raise
190
 
191
  def load_metadata(model,args):
192
  # Extract metadata and config parameters
193
  metadata = {}
 
386
  mask[:, :, col_indices <= (row_indices + start)] = 0
387
  return mask
388
 
389
+ def initialize_causal_mask(context_length):
390
+ """Initialize causal mask for transformer attention."""
391
+ causal_mask = make_causal_mask(context_length, 0)
392
+ causal_mask = torch.tensor(causal_mask, dtype=torch.float16)
393
+ print(f"\nInitialized causal mask for context length {context_length}")
394
+ return causal_mask
395
+
396
+ def run_prefill(embed_model, ffn_models, input_ids, context_pos, context_length, batch_size=64, state=None, causal_mask=None):
397
  """Run prefill on the input sequence."""
398
+ # Use provided causal mask or create one if not provided
399
+ if causal_mask is None:
400
+ causal_mask = make_causal_mask(context_length, 0)
401
+ causal_mask = torch.tensor(causal_mask, dtype=torch.float16)
402
 
403
  # Process in batches
404
  batch_pos = 0
405
+ while batch_pos < context_pos:
406
+ batch_end = min(batch_pos + batch_size, context_pos)
407
  current_batch_size = batch_end - batch_pos
408
 
 
 
409
  # Get current batch
410
  batch_input = input_ids[:, batch_pos:batch_end]
411
 
412
+ # Always pad to full batch size for prefill
413
  batch_input = F.pad(
414
  batch_input,
415
  (0, batch_size - current_batch_size),
416
  value=0
417
  )
418
 
419
+ # Generate position IDs for full batch size
420
+ position_ids = torch.arange(batch_size, dtype=torch.int32) # Changed: Always use full batch size
421
+ batch_causal_mask = causal_mask[:, :, :batch_size, :] # Changed: Use full batch size
 
 
422
 
423
  # Run embeddings
424
  hidden_states = torch.from_numpy(
425
  embed_model.predict({'input_ids': batch_input.numpy()})['hidden_states']
426
  )
427
 
428
+ # Run through FFN chunks with state
429
  for ffn_model in ffn_models:
430
  if isinstance(ffn_model, dict):
431
  inputs = {
432
+ 'hidden_states': hidden_states.numpy(), # [1, 64, hidden_size]
433
+ 'position_ids': position_ids.numpy(), # [64]
434
+ 'causal_mask': batch_causal_mask.numpy(), # [1, 1, 64, context_length]
435
+ 'current_pos': np.array([batch_pos], dtype=np.int32) # [1]
436
  }
437
  output = ffn_model['prefill'].predict(inputs, state)
438
  hidden_states = torch.from_numpy(output['output_hidden_states'])
439
 
440
  batch_pos = batch_end
441
 
442
+ return torch.tensor([context_pos], dtype=torch.int32)
443
 
444
+ def generate_next_token(embed_model, ffn_models, lmhead_model, input_ids, pos, context_length, state=None, causal_mask=None, temperature=0.0):
445
  """Generate the next token."""
446
  # Get current token
447
+ current_token = input_ids[:, pos-1:pos] # [1, 1]
448
 
449
  # Run embeddings
450
  hidden_states = torch.from_numpy(
451
  embed_model.predict({'input_ids': current_token.numpy()})['hidden_states']
452
+ ) # [1, 1, hidden_size]
453
 
454
  # Create masks
455
  update_mask = torch.zeros((1, 1, context_length, 1), dtype=torch.float16)
456
  update_mask[0, 0, pos-1, 0] = 1.0
457
+ position_ids = torch.tensor([pos-1], dtype=torch.int32) # [1]
458
 
459
+ # Use provided causal mask or create one if not provided
460
+ if causal_mask is None:
461
+ causal_mask_data = make_causal_mask(context_length, 0)
462
+ single_causal_mask = torch.tensor(causal_mask_data[:, :, pos-1:pos, :], dtype=torch.float16) # [1, 1, 1, context_length]
463
+ else:
464
+ single_causal_mask = causal_mask[:, :, pos-1:pos, :]
465
 
466
+ # Run through FFN chunks with state
467
  for ffn_model in ffn_models:
468
  if isinstance(ffn_model, dict):
469
  inputs = {
 
476
  output = ffn_model['infer'].predict(inputs, state)
477
  hidden_states = torch.from_numpy(output['output_hidden_states'])
478
 
479
+ # Run LM head
480
  lm_output = lmhead_model.predict({'hidden_states': hidden_states.numpy()})
481
+ # Debug print
482
+ #print("\nLM Head output keys:", list(lm_output.keys()))
483
 
484
+ # Combine logits1-8 if they exist
485
  if 'logits1' in lm_output:
486
+ # Concatenate all logits parts
487
  logits_parts = []
488
  for i in range(1, 9):
489
  key = f'logits{i}'
490
  if key in lm_output:
491
  logits_parts.append(torch.from_numpy(lm_output[key]))
492
+ logits = torch.cat(logits_parts, dim=-1) # Concatenate along vocab dimension
493
  else:
494
+ # Try output_logits as fallback
495
  logits = torch.from_numpy(lm_output['output_logits'])
496
 
497
+ # Apply temperature and sample
498
  if temperature > 0:
499
  logits = logits / temperature
500
  probs = F.softmax(logits[0, -1, :], dim=-1)
 
516
  print("\nCreated unified transformer state")
517
  return state
518
 
519
+ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state, causal_mask=None, auto_prompt=None, warmup=False):
520
  """Interactive chat loop."""
 
521
  context_length = metadata.get('context_length')
522
  batch_size = metadata.get('batch_size', 64)
523
 
524
  if not warmup:
525
  print(f"\nUsing context length: {context_length}")
526
  print("\nStarting chat session. Press Ctrl+D to exit.")
527
+ print("Type your message and press Enter to chat.")
528
+
529
+ # Check if tokenizer has chat template and if it works
530
+ has_chat_template = False
531
+ try:
532
+ # Test if chat template works
533
+ test_messages = [{"role": "user", "content": "test"}]
534
+ tokenizer.apply_chat_template(test_messages, return_tensors="pt")
535
+ has_chat_template = True
536
+ if not warmup:
537
+ print("\nUsing chat template for prompts")
538
+ except:
539
+ if not warmup:
540
+ print("\nUsing manual formatting for prompts")
541
 
 
542
  conversation = []
543
 
544
  try:
545
  while True:
546
  try:
547
  if not warmup:
548
+ print(f"\n{LIGHT_GREEN}You:{RESET_COLOR}", end=' ', flush=True)
549
  if auto_prompt is not None:
550
  user_input = auto_prompt
551
  if not warmup:
 
556
  if not warmup:
557
  print("\nExiting chat...")
558
  break
559
+
560
  if not user_input:
561
  continue
562
 
563
+ # Format prompt based on tokenizer capabilities
564
+ if has_chat_template:
565
+ messages = [{"role": "user", "content": user_input}]
566
+ input_ids = tokenizer.apply_chat_template(
567
+ messages,
 
568
  return_tensors="pt",
569
  add_generation_prompt=True
570
  ).to(torch.int32)
571
  else:
572
+ # Manual formatting for Llama models without chat template
573
+ formatted_prompt = f"[INST] {user_input} [/INST]"
574
+ input_ids = tokenizer(
575
+ formatted_prompt,
576
  return_tensors="pt",
577
+ add_special_tokens=True
578
+ ).input_ids.to(torch.int32)
579
 
580
+ context_pos = input_ids.size(1)
581
 
582
  if not warmup:
583
  print(f"\n{LIGHT_BLUE}Assistant:{RESET_COLOR}", end=' ', flush=True)
584
 
585
+ # Initialize token printer
586
  token_printer = TokenPrinter(tokenizer)
587
+ tokens_generated = 0 # Track number of tokens
 
588
 
589
  try:
590
+ # Start prefill timing
591
+ prefill_start = time.time()
592
+
593
+ # Run prefill with state and causal mask
594
  current_pos = run_prefill(
595
  embed_model,
596
  ffn_models,
 
601
  state,
602
  causal_mask
603
  )
 
604
 
605
+ # Calculate prefill timing
606
+ prefill_time = time.time() - prefill_start
607
+ prefill_tokens = context_pos # Number of tokens in input
608
+ prefill_tokens_per_sec = prefill_tokens / prefill_time if prefill_time > 0 else 0
609
+
610
+ # Generation loop with state
611
+ input_ids = input_ids
612
  pos = context_pos
613
+ inference_start = time.time()
614
+ inference_tokens = 0
615
 
616
+ while pos < context_length - 1:
617
+ # Generate next token with causal mask
618
  next_token = generate_next_token(
619
  embed_model,
620
  ffn_models,
 
626
  causal_mask
627
  )
628
 
629
+ # Add token to sequence
630
+ if pos < input_ids.size(1):
631
+ input_ids[0, pos] = next_token
632
+ else:
633
+ input_ids = torch.cat([
634
+ input_ids,
635
+ torch.tensor([[next_token]], dtype=torch.int32)
636
+ ], dim=1)
637
+
638
+ # Add to printer only if not in warmup
639
  if not warmup:
640
  token_printer.add_token(next_token)
641
  token_printer.drain_buffer()
 
642
 
643
  pos += 1
644
  tokens_generated += 1
645
+ inference_tokens += 1
646
 
647
+ # Check limits
648
  if warmup and tokens_generated >= WARMUP_TOKEN_LIMIT:
649
  break
650
+
651
  if next_token == tokenizer.eos_token_id:
652
  break
653
 
654
+ # Calculate inference timing
655
+ inference_time = time.time() - inference_start
656
+ inference_tokens_per_sec = inference_tokens / inference_time if inference_time > 0 else 0
 
 
657
 
658
+ # Get final response and add to conversation
659
  if not warmup:
660
+ response = token_printer.stop()
661
+ # Print timing stats
662
+ prefill_ms = prefill_time * 1000 # Convert to milliseconds
663
+ print(f"\nPrefill: {prefill_ms:.1f}ms ({prefill_tokens_per_sec:.1f} t/s)")
664
+ print(f"Inference: {inference_tokens_per_sec:.1f} t/s")
665
+ print(f"Total: Generated {tokens_generated} tokens in {prefill_time + inference_time:.2f}s")
666
+ conversation.append({"role": "assistant", "content": response})
667
+ else:
668
+ token_printer.stop() # Clean up without printing stats
669
 
670
+ # Exit after one response in auto_prompt mode
671
  if auto_prompt is not None:
672
  break
673
 
674
  except KeyboardInterrupt:
675
+ print("\nGeneration interrupted")
 
676
  token_printer.stop()
677
  continue
678
 
679
  except Exception as e:
680
+ print(f"\nError in chat loop: {str(e)}")
681
+ import traceback
682
+ traceback.print_exc()
683
+
684
+ def parse_args():
685
+ parser = argparse.ArgumentParser(description='Chat with CoreML LLaMA, gil resolved (c) 2025 Anemll')
686
+
687
+ # Add meta.yaml option
688
+ parser.add_argument('--meta', type=str, help='Path to meta.yaml to load all parameters')
689
+
690
+ # Model paths
691
+ parser.add_argument('--d', '--dir', type=str, default='.',
692
+ help='Directory containing model files (default: current directory)')
693
+ parser.add_argument('--embed', type=str, required=False,
694
+ help='Path to embeddings model (relative to --dir)')
695
+ parser.add_argument('--ffn', type=str, required=False,
696
+ help='Path to FFN model (can be chunked, relative to --dir)')
697
+ parser.add_argument('--lmhead', type=str, required=False,
698
+ help='Path to LM head model (relative to --dir)')
699
+ parser.add_argument('--tokenizer', type=str, required=False,
700
+ help='Path to tokenizer')
701
+
702
+ # Add new argument for auto-generation
703
+ parser.add_argument('--prompt', type=str,
704
+ help='If specified, run once with this prompt and exit')
705
+
706
+ # Add no-warmup flag
707
+ parser.add_argument('--nw', action='store_true',
708
+ help='Skip warmup phase')
709
+
710
+ # Model configuration
711
+ parser.add_argument('--context-length', type=int,
712
+ help='Context length for the model (default: 512), if not provided, it will be detected from the model directory name ctxNUMBER')
713
+ parser.add_argument('--batch-size', type=int,
714
+ help='Batch size for prefill (default: 64)')
715
+
716
+ args = parser.parse_args()
717
+
718
+ # If meta.yaml is provided, load parameters from it
719
+ if args.meta:
720
+ try:
721
+ with open(args.meta, 'r') as f:
722
+ meta = yaml.safe_load(f)
723
+ params = meta['model_info']['parameters']
724
+
725
+ # Set model directory to meta.yaml directory if not specified
726
+ if not args.d or args.d == '.':
727
+ args.d = str(Path(args.meta).parent)
728
+
729
+ # Build model paths based on parameters
730
+ prefix = params.get('model_prefix', 'llama') # Default to 'llama' if not specified
731
+ lut_ffn = f"_lut{params['lut_ffn']}" if params['lut_ffn'] != 'none' else ''
732
+ lut_lmhead = f"_lut{params['lut_lmhead']}" if params['lut_lmhead'] != 'none' else ''
733
+ num_chunks = int(params['num_chunks'])
734
+
735
+ # Set model paths if not specified
736
+ if not args.embed:
737
+ args.embed = f'{prefix}_embeddings'
738
+ if not args.lmhead:
739
+ args.lmhead = f'{prefix}_lm_head{lut_lmhead}'
740
+ if not args.ffn:
741
+ args.ffn = f'{prefix}_FFN_PF{lut_ffn}_chunk_01of{num_chunks:02d}'
742
+ if not args.tokenizer:
743
+ args.tokenizer = args.d
744
+
745
+ # Set other parameters if not overridden by command line
746
+ if args.context_length is None:
747
+ args.context_length = int(params['context_length'])
748
+ if args.batch_size is None:
749
+ args.batch_size = int(params['batch_size'])
750
+ args.num_chunks = num_chunks
751
+
752
+ print(f"\nLoaded parameters from {args.meta}:")
753
+ print(f" Context Length: {args.context_length}")
754
+ print(f" Batch Size: {args.batch_size}")
755
+ print(f" Num Chunks: {args.num_chunks}")
756
+ print(f" Models Directory: {args.d}")
757
+ print(f" Embeddings: {args.embed}")
758
+ print(f" LM Head: {args.lmhead}")
759
+ print(f" FFN: {args.ffn}")
760
+
761
+ except Exception as e:
762
+ print(f"\nError loading meta.yaml: {str(e)}")
763
+ sys.exit(1)
764
+
765
+ return args
766
 
767
  def main():
768
  args = parse_args()
 
827
  lmhead_model=lmhead_model,
828
  tokenizer=tokenizer,
829
  metadata=metadata,
830
+ state=state,
831
  causal_mask=causal_mask, # Pass the causal mask
832
  warmup=True,
833
  auto_prompt="who are you?"
 
840
  lmhead_model=lmhead_model,
841
  tokenizer=tokenizer,
842
  metadata=metadata,
843
+ state=state,
844
  causal_mask=causal_mask, # Pass the causal mask
845
  warmup=False,
846
  auto_prompt=args.prompt