"""Print the most likely first tokens predicted by a trained GPT checkpoint.

The script loads a checkpoint, feeds the model a single end-of-text token
(i.e. an "empty" context), and prints two rankings of the next-token
distribution:

1. the raw top-20 tokens, and
2. the top-20 tokens that contain at least one letter, with leading
   whitespace stripped from the decoded text.
"""
import tiktoken
import torch

from model import GPT, GPTConfig

CKPT_PATH = "out-shiffman/ckpt.pt"
TOP_K = 20             # rows printed in each ranking
CANDIDATE_POOL = 200   # how deep to search for word-like tokens
COMPILE_PREFIX = "_orig_mod."


def is_wordlike(s: str) -> bool:
    """Return True if *s* contains at least one alphabetic character."""
    # Whitespace is never alphabetic, so no stripping is needed here.
    return any(ch.isalpha() for ch in s)


# Load checkpoint and rebuild the model configuration stored alongside it.
ckpt = torch.load(CKPT_PATH, map_location="cpu")
cfg = GPTConfig(**ckpt["model_args"])

# Checkpoints saved from a torch.compile-wrapped model prefix every key with
# "_orig_mod."; strip it so the keys match a freshly built model. This is a
# no-op for checkpoints saved without compilation.
state_dict = {
    (key[len(COMPILE_PREFIX):] if key.startswith(COMPILE_PREFIX) else key): value
    for key, value in ckpt["model"].items()
}

# Build model and load weights.
model = GPT(cfg)
model.load_state_dict(state_dict)
model.eval()

# GPT-2 tokenizer and its end-of-text id, used here as the start-of-sequence
# marker.
enc = tiktoken.get_encoding("gpt2")
eot_id = getattr(enc, "eot_token", None)
if eot_id is None:
    # tiktoken refuses to encode special tokens unless explicitly allowed.
    eot_id = enc.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})[0]

# Single-token context = "silence".
idx = torch.tensor([[eot_id]], dtype=torch.long)

# Forward pass to get the next-token distribution. no_grad avoids building an
# autograd graph for what is pure inference.
with torch.no_grad():
    out = model(idx)
# The model may return either logits alone or a tuple whose first item is logits.
logits = out[0] if isinstance(out, tuple) else out
probs = torch.softmax(logits[0, -1], dim=0)
vocab_size = probs.numel()

# Show the top candidates (clamped so a small vocabulary cannot break topk).
top_p, top_i = torch.topk(probs, k=min(TOP_K, vocab_size))
for p, i in zip(top_p.tolist(), top_i.tolist()):
    print(f"{p:.4f} {i:>6} {repr(enc.decode([i]))}")

print("-------")

# Same ranking, but filtered to word-like pieces with leading spaces stripped.
# A deeper pool is fetched because many high-probability tokens are
# punctuation or whitespace and get filtered out.
top_p, top_i = torch.topk(probs, k=min(CANDIDATE_POOL, vocab_size))
shown = 0
for p, i in zip(top_p.tolist(), top_i.tolist()):
    tok = enc.decode([i]).lstrip()
    if is_wordlike(tok):
        print(f"{p:.4f} {i:>6} {repr(tok)}")
        shown += 1
        if shown == TOP_K:
            break