# Loads your checkpoint and prints the top 20 most likely first tokens
import torch, tiktoken
from model import GPT, GPTConfig

# Load the checkpoint onto the CPU and rebuild the model config from the
# constructor arguments stored alongside the weights.
ckpt = torch.load("out-shiffman/ckpt.pt", map_location="cpu")
cfg = GPTConfig(**ckpt["model_args"])

# Build model and load weights.
# A checkpoint saved from a torch.compile()-wrapped module prefixes every
# state-dict key with "_orig_mod."; strip that prefix so the weights load into
# a plain (uncompiled) module. Keys without the prefix pass through unchanged.
model = GPT(cfg)
compile_prefix = "_orig_mod."
state_dict = {
    (key[len(compile_prefix):] if key.startswith(compile_prefix) else key): value
    for key, value in ckpt["model"].items()
}
model.load_state_dict(state_dict)
model.eval()

# GPT-2 tokenizer and BOS/EOT id
enc = tiktoken.get_encoding("gpt2")
eot_id = getattr(enc, "eot_token", None)
if eot_id is None:
    # encode() rejects text that matches a special token unless it is
    # explicitly allowed, so opt in for the end-of-text marker.
    eot_id = enc.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})[0]

# Single-token context = "silence": a batch of one sequence holding only EOT
idx = torch.tensor([[eot_id]], dtype=torch.long)

# Forward to get next-token distribution. Inference only, so skip autograd
# bookkeeping.
with torch.no_grad():
    out = model(idx)
    # The model may return a tuple whose first element is the logits.
    logits = out[0] if isinstance(out, tuple) else out
    # First batch row, last position -> probabilities over the vocabulary.
    probs = torch.softmax(logits[0, -1], dim=0)

# Show top-20 candidates as: probability, token id, decoded text
top_p, top_i = torch.topk(probs, k=20)
for p, i in zip(top_p.tolist(), top_i.tolist()):
    print(f"{p:.4f}  {i:>6}  {repr(enc.decode([i]))}")


print("-------")

# Second listing: take a wider pool of the 200 most probable tokens; it is
# filtered down to word-like pieces (leading spaces stripped) further below.
wide_pool = torch.topk(probs, k=200)
top_p, top_i = wide_pool.values, wide_pool.indices

# Helper: does a decoded token contain anything that looks like a word?
def is_wordlike(s: str) -> bool:
    """Return True if *s* contains at least one alphabetic character."""
    for ch in s.lstrip():
        if ch.isalpha():
            return True
    return False

# Walk the candidates in descending probability and print the first 20 whose
# decoded text (leading whitespace removed) is word-like.
shown = 0
for p, i in zip(top_p.tolist(), top_i.tolist()):
    tok = enc.decode([i]).lstrip()
    if not is_wordlike(tok):
        continue  # punctuation / digits / whitespace-only piece: skip it
    print(f"{p:.4f}  {i:>6}  {repr(tok)}")
    shown += 1
    if shown == 20:
        break