In [1]:
!nvidia-smi

Sun May 11 04:25:41 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   64C    P8             19W /   72W |       0MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
import torch
import torch.nn as nn
from transformers import GPT2Model, GPT2Config, GPT2Tokenizer
import time
import copy

# 0. Define the Tanh "Normalization" Layer
class TanhReplacement(nn.Module):
    def __init__(self, original_ln_config=None): # original_ln_config is for API compatibility if needed by some architectures
        super().__init__()
        # Tanh has no learnable parameters or specific configuration like hidden_size or eps
        # If the original LayerNorm had learnable parameters, they are now gone.
        # The shape of the output will be the same as the input, just like LayerNorm.
        if original_ln_config:
            self.normalized_shape = original_ln_config.normalized_shape
            self.eps = original_ln_config.eps
            # We don't actually use these, but store them for potential inspection

    def forward(self, x):
        return torch.tanh(x)

# 1. Setup: Model and Tokenizer
model_name = "gpt2" # Smallest GPT-2 for quicker testing
config = GPT2Config.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# 2. Load Original GPT-2 Model
model_orig = GPT2Model.from_pretrained(model_name, config=config)
model_orig.eval() # Set to evaluation mode

# 3. Create Modified Model (DyT-like)
model_dyt = copy.deepcopy(model_orig) # Deep copy to modify independently

# Replace LayerNorms in the transformer blocks
for i, block in enumerate(model_dyt.h):
    # LayerNorm before MultiHeadAttention
    block.ln_1 = TanhReplacement(original_ln_config=block.ln_1) # Pass original LN config for potential reference
    # LayerNorm before MLP
    block.ln_2 = TanhReplacement(original_ln_config=block.ln_2)
    # print(f"Replaced LayerNorms in block {i}")

# Replace final LayerNorm (if present in GPT2Model's main structure, GPT2Model has ln_f)
if hasattr(model_dyt, 'ln_f') and isinstance(model_dyt.ln_f, nn.LayerNorm):
    model_dyt.ln_f = TanhReplacement(original_ln_config=model_dyt.ln_f)
    # print("Replaced final LayerNorm (ln_f)")

model_dyt.eval()

# --- Sanity check: Print model structures (optional) ---
# print("Original Model Structure (relevant parts):")
# for i, block in enumerate(model_orig.h):
#   print(f"Block {i}: ln_1={block.ln_1}, ln_2={block.ln_2}")
#   print(f"Final ln_f={model_orig.ln_f}")

# print("\nModified Model Structure (relevant parts):")
# for i, block in enumerate(model_dyt.h):
#   print(f"Block {i}: ln_1={block.ln_1}, ln_2={block.ln_2}")
#   print(f"Final ln_f={model_dyt.ln_f}")
# --- End Sanity check ---


# 4. Prepare Input Data
text = "Remember that William Gibson quote The future is already here, it's just not evenly distributed? Surprise - the future is already here, and it is shockingly distributed. Power to the people. Personally, I love it."
# Using a moderately long sequence for better timing
text = " ".join(["test"] * 100)

text = """
Transformative technologies usually follow a top-down diffusion path: originating in government or military contexts, passing through corporations, and eventually reaching individuals - think electricity, cryptography, computers, flight, the internet, or GPS. This progression feels intuitive, new and powerful technologies are usually scarce, capital-intensive, and their use requires specialized technical expertise in the early stages.

So it strikes me as quite unique and remarkable that LLMs display a dramatic reversal of this pattern - they generate disproportionate benefit for regular people, while their impact is a lot more muted and lagging in corporations and governments. ChatGPT is the fastest growing consumer application in history, with 400 million weekly active users who use it for writing, coding, translation, tutoring, summarization, deep research, brainstorming, etc. This isn't a minor upgrade to what existed before, it is a major multiplier to an individual's power level across a broad range of capabilities. And the barrier to use is incredibly low - the models are cheap (free, even), fast, available to anyone on demand behind a url (or even local machine), and they speak anyone's native language, including tone, slang or emoji. This is insane. As far as I can tell, the average person has never experienced a technological unlock this dramatic, this fast.

Why then are the benefits a lot more muted in the corporate and government realms? I think the first reason is that LLMs offer a very specific profile of capability - that of merely quasi-expert knowledge/performance, but simultaneously across a very wide variety of domains. In other words, they are simultaneously versatile but also shallow and fallible. Meanwhile, an organization's unique superpower is the ability to concentrate diverse expertise into a single entity by employing engineers, researchers, analysts, lawyers, marketers, etc. While LLMs can certainly make these experts more efficient individually (e.g. drafting initial legal clauses, generating boilerplate code, etc.), the improvement to the organization takes the form of becoming a bit better at the things it could already do. In contrast, an individual will usually only be an expert in at most one thing, so the broad quasi-expertise offered by the LLM fundamentally allows them to do things they couldn't do before. People can now vibe code apps. They can approach legal documents. They can grok esoteric research papers. They can do data analytics. They can generate multimodal content for branding and marketing. They can do all of this at an adequate capability without involving an additional expert.

Second, organizations deal with problems of a lot greater complexity and necessary coordination, think: various integrations, legacy systems, corporate brand or style guides, stringent security protocols, privacy considerations, internationalization, regulatory compliance and legal risk. There are a lot more variables, a lot more constraints, a lot more considerations, and a lot lower margin for error. It's not so easy to put all of it into a context window. You can't just vibe code something. You might be one disastrous hallucination away from losing your job. And third, there is the well-documented inertia of a larger organization, featuring culture, historical precedents, political turf wars that escalate in periods of rapid change, communication overhead, re-training challenges of a distributed workforce and good old-fashioned bureaucracy. These are major headwinds when it comes to rapid adoption of a sparkling new, versatile-but-shallow-and-fallible tool. I don't wish to downplay the impacts of LLMs in corporations or governments, but at least for the moment and in aggregate across society, they have been significantly more life altering for individuals than they have been for organizations. Mary, Jim and Joes are experiencing the majority of the benefit, not Google or the government of the United States.

Looking forward, the continued diffusion of LLMs of course depends on continued performance improvement and its capability profile. The "benefit distribution" overall is particularly interesting to chart, and depends heavily on the dynamic range of the performance as a function of capital expenditure. Today, frontier-grade LLM performance is very accessible and cheap. Beyond this point, you cannot spend a marginal dollar to get better performance, reliability or autonomy. Money can't buy better ChatGPT. Bill Gates talks to GPT 4o just like you do. But can this be expected to last? Train-time scaling (increase parameters, data), test-time scaling (increase time) and model ensembles (increase batch) are forces increasing the dynamic range. On the other hand, model distillation (the ability to train disproportionately powerful small models by training to mimic the big model) has been a force decreasing dynamic range. Certainly, the moment money can buy dramatically better ChatGPT, things change. Large organizations get to concentrate their vast resources to buy more intelligence. And within the category of "individual" too, the elite may once again split away from the rest of society. Their child will be tutored by GPT-8-pro-max-high, yours by GPT-6 mini.

But at least at this moment in time, we find ourselves in a unique and unprecedented situation in the history of technology. If you go back through various sci-fi you'll see that very few would have predicted that the AI revolution would feature this progression. It was supposed to be a top secret government megabrain project wielded by the generals, not ChatGPT appearing basically overnight and for free on a device already in everyone's pocket. Remember that William Gibson quote "The future is already here, it's just not evenly distributed"? Surprise - the future is already here, and it is shockingly distributed. Power to the people. Personally, I love it.
"""

inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# 5. Move models and data to device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_orig.to(device)
model_dyt.to(device)
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)

# 6. Inference Time Measurement Function
def measure_inference_time(model, input_ids_tensor, attention_mask_tensor, num_runs=100, warmup_runs=10):
    # Warmup runs
    for _ in range(warmup_runs):
        with torch.no_grad():
            _ = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)

    if device.type == 'cuda':
        torch.cuda.synchronize() # Ensure warmup is complete

    total_time = 0
    start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_runs)] if device.type == 'cuda' else None
    end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_runs)] if device.type == 'cuda' else None

    for i in range(num_runs):
        if device.type == 'cuda':
            start_events[i].record()
        else:
            start_time = time.perf_counter()

        with torch.no_grad():
            _ = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)

        if device.type == 'cuda':
            end_events[i].record()
        else:
            end_time = time.perf_counter()
            total_time += (end_time - start_time)

    if device.type == 'cuda':
        torch.cuda.synchronize() # Wait for all runs to complete
        for i in range(num_runs):
            total_time += start_events[i].elapsed_time(end_events[i]) / 1000.0 # elapsed_time is in ms

    avg_time = total_time / num_runs
    return avg_time

# 7. Run and Compare
print(f"\nBenchmarking with sequence length: {input_ids.shape[1]}")
num_runs = 200
warmup_runs = 20

# --- Test outputs to ensure they are different (as expected) ---
# with torch.no_grad():
#     out_orig = model_orig(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
#     out_dyt = model_dyt(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
# print(f"Output norm difference: {torch.norm(out_orig - out_dyt)}")
# assert not torch.allclose(out_orig, out_dyt), "Outputs should be different!"
# ---

avg_time_orig = measure_inference_time(model_orig, input_ids, attention_mask, num_runs, warmup_runs)
print(f"Original GPT-2 avg inference time: {avg_time_orig*1000:.4f} ms")

avg_time_dyt = measure_inference_time(model_dyt, input_ids, attention_mask, num_runs, warmup_runs)
print(f"DyT-like (Tanh) GPT-2 avg inference time: {avg_time_dyt*1000:.4f} ms")

if avg_time_dyt < avg_time_orig:
    speedup_percentage = ((avg_time_orig - avg_time_dyt) / avg_time_orig) * 100
    print(f"Speedup with Tanh replacement: {speedup_percentage:.2f}%")
else:
    slowdown_percentage = ((avg_time_dyt - avg_time_orig) / avg_time_orig) * 100
    print(f"Slowdown with Tanh replacement: {slowdown_percentage:.2f}% (This is unexpected if Tanh is truly faster)")

Token indices sequence length is longer than the specified maximum sequence length for this model (1171 > 1024). Running this sequence through the model will result in indexing errors


Using device: cuda

Benchmarking with sequence length: 1171


RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [3]:
import torch
import torch.nn as nn
from transformers import GPT2Model, GPT2Config, GPT2Tokenizer
import time
import copy

# 0. Define the Tanh "Normalization" Layer
class TanhReplacement(nn.Module):
    def __init__(self, original_ln_config=None): # original_ln_config is for API compatibility if needed by some architectures
        super().__init__()
        # Tanh has no learnable parameters or specific configuration like hidden_size or eps
        # If the original LayerNorm had learnable parameters, they are now gone.
        # The shape of the output will be the same as the input, just like LayerNorm.
        if original_ln_config:
            self.normalized_shape = original_ln_config.normalized_shape
            self.eps = original_ln_config.eps
            # We don't actually use these, but store them for potential inspection

    def forward(self, x):
        return torch.tanh(x)

# 1. Setup: Model and Tokenizer
model_name = "gpt2" # Smallest GPT-2 for quicker testing
config = GPT2Config.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# 2. Load Original GPT-2 Model
model_orig = GPT2Model.from_pretrained(model_name, config=config)
model_orig.eval() # Set to evaluation mode

# 3. Create Modified Model (DyT-like)
model_dyt = copy.deepcopy(model_orig) # Deep copy to modify independently

# Replace LayerNorms in the transformer blocks
for i, block in enumerate(model_dyt.h):
    # LayerNorm before MultiHeadAttention
    block.ln_1 = TanhReplacement(original_ln_config=block.ln_1) # Pass original LN config for potential reference
    # LayerNorm before MLP
    block.ln_2 = TanhReplacement(original_ln_config=block.ln_2)
    # print(f"Replaced LayerNorms in block {i}")

# Replace final LayerNorm (if present in GPT2Model's main structure, GPT2Model has ln_f)
if hasattr(model_dyt, 'ln_f') and isinstance(model_dyt.ln_f, nn.LayerNorm):
    model_dyt.ln_f = TanhReplacement(original_ln_config=model_dyt.ln_f)
    # print("Replaced final LayerNorm (ln_f)")

model_dyt.eval()

# --- Sanity check: Print model structures (optional) ---
# print("Original Model Structure (relevant parts):")
# for i, block in enumerate(model_orig.h):
# print(f"Block {i}: ln_1={block.ln_1}, ln_2={block.ln_2}")
# print(f"Final ln_f={model_orig.ln_f}")

# print("\nModified Model Structure (relevant parts):")
# for i, block in enumerate(model_dyt.h):
# print(f"Block {i}: ln_1={block.ln_1}, ln_2={block.ln_2}")
# print(f"Final ln_f={model_dyt.ln_f}")
# --- End Sanity check ---


# 4. Prepare Input Data
text = "Replace this with your desired input text for benchmarking. Longer sequences might show more difference."
# Using a moderately long sequence for better timing
text = " ".join(["test"] * 100)
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# 5. Move models and data to device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_orig.to(device)
model_dyt.to(device)
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)

# 6. Inference Time Measurement Function
def measure_inference_time(model, input_ids_tensor, attention_mask_tensor, num_runs=100, warmup_runs=10):
    # Warmup runs
    for _ in range(warmup_runs):
        with torch.no_grad():
            _ = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)

    if device.type == 'cuda':
        torch.cuda.synchronize() # Ensure warmup is complete

    total_time = 0
    start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_runs)] if device.type == 'cuda' else None
    end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_runs)] if device.type == 'cuda' else None

    for i in range(num_runs):
        if device.type == 'cuda':
            start_events[i].record()
        else:
            start_time = time.perf_counter()

        with torch.no_grad():
            _ = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)

        if device.type == 'cuda':
            end_events[i].record()
        else:
            end_time = time.perf_counter()
            total_time += (end_time - start_time)

    if device.type == 'cuda':
        torch.cuda.synchronize() # Wait for all runs to complete
        for i in range(num_runs):
            total_time += start_events[i].elapsed_time(end_events[i]) / 1000.0 # elapsed_time is in ms

    avg_time = total_time / num_runs
    return avg_time

# 7. Run and Compare
print(f"\nBenchmarking with sequence length: {input_ids.shape[1]}")
num_runs = 200
warmup_runs = 20

# --- Test outputs to ensure they are different (as expected) ---
# with torch.no_grad():
#     out_orig = model_orig(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
#     out_dyt = model_dyt(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
# print(f"Output norm difference: {torch.norm(out_orig - out_dyt)}")
# assert not torch.allclose(out_orig, out_dyt), "Outputs should be different!"
# ---

avg_time_orig = measure_inference_time(model_orig, input_ids, attention_mask, num_runs, warmup_runs)
print(f"Original GPT-2 avg inference time: {avg_time_orig*1000:.4f} ms")

avg_time_dyt = measure_inference_time(model_dyt, input_ids, attention_mask, num_runs, warmup_runs)
print(f"DyT-like (Tanh) GPT-2 avg inference time: {avg_time_dyt*1000:.4f} ms")

if avg_time_dyt < avg_time_orig:
    speedup_percentage = ((avg_time_orig - avg_time_dyt) / avg_time_orig) * 100
    print(f"Speedup with Tanh replacement: {speedup_percentage:.2f}%")
else:
    slowdown_percentage = ((avg_time_dyt - avg_time_orig) / avg_time_orig) * 100
    print(f"Slowdown with Tanh replacement: {slowdown_percentage:.2f}% (This is unexpected if Tanh is truly faster)")

Using device: cuda

Benchmarking with sequence length: 100
Original GPT-2 avg inference time: 13.5885 ms
DyT-like (Tanh) GPT-2 avg inference time: 12.9903 ms
Speedup with Tanh replacement: 4.40%


In [2]:
import torch
import torch.nn as nn
from transformers import GPT2Model, GPT2Config, GPT2Tokenizer
import time
import copy
import os

# IMPORTANT FOR DEBUGGING CUDA ERRORS:
# This makes CUDA kernel launches synchronous. If a kernel errors,
# the Python stack trace will now point to the exact line that launched the faulty kernel.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# 0. Define the Tanh "Normalization" Layer
class TanhReplacement(nn.Module):
    def __init__(self, original_ln_config=None): # original_ln_config is for API compatibility if needed
        super().__init__()
        # Store original config for potential inspection, though tanh doesn't use it
        if original_ln_config:
            self.normalized_shape = original_ln_config.normalized_shape
            self.eps = original_ln_config.eps
        # Tanh has no learnable parameters or specific configuration like hidden_size or eps
        # If the original LayerNorm had learnable parameters (gamma, beta), they are now gone.

    def forward(self, x):
        # --- Debugging: Check for NaNs/Infs in input to this specific Tanh layer ---
        # if torch.isnan(x).any() or torch.isinf(x).any():
        #     print(f"WARNING: NaN or Inf detected in input to TanhReplacement! Min: {x.min().item()}, Max: {x.max().item()}")

        output = torch.tanh(x)

        # --- Debugging: Check for NaNs/Infs in output of this Tanh layer ---
        # (tanh(nan) -> nan; tanh(inf) -> 1.0; tanh(-inf) -> -1.0)
        # if torch.isnan(output).any():
        #     print(f"WARNING: NaN detected in TanhReplacement output! Input was likely NaN.")
        # if torch.isinf(output).any(): # Should not happen for tanh if input is not inf
        #     print(f"WARNING: Inf detected in TanhReplacement output! This is unexpected for tanh.")
        return output

# Function to recursively replace LayerNorm modules
def replace_layernorm_with_tanh_recursive(module):
    has_replaced = False
    for name, child_module in module.named_children():
        if isinstance(child_module, nn.LayerNorm):
            # print(f"Replacing LayerNorm '{name}' in {module.__class__.__name__} with TanhReplacement")
            setattr(module, name, TanhReplacement(original_ln_config=child_module))
            has_replaced = True
        else:
            has_replaced = replace_layernorm_with_tanh_recursive(child_module) or has_replaced # Recurse
    return has_replaced

# 1. Setup: Model and Tokenizer
model_name = "gpt2" # Smallest GPT-2 for quicker testing
config = GPT2Config.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None: # GPT-2 often doesn't have a pad token; add if missing for batching
    tokenizer.pad_token = tokenizer.eos_token


# 2. Load Original GPT-2 Model
model_orig = GPT2Model.from_pretrained(model_name, config=config)
model_orig.eval() # Set to evaluation mode

# 3. Create Modified Model (DyT-like)
model_dyt = copy.deepcopy(model_orig) # Deep copy to modify independently
replaced_in_dyt = replace_layernorm_with_tanh_recursive(model_dyt)
# print(f"LayerNorms replaced in DyT model: {replaced_in_dyt}")
model_dyt.eval()


# --- Sanity check: Print model structures (optional) ---
# print("Original Model Structure (example LayerNorm):")
# if len(model_orig.h) > 0:
#     print(f"Block 0 ln_1: {model_orig.h[0].ln_1}")
# if hasattr(model_orig, 'ln_f'):
#     print(f"Final ln_f: {model_orig.ln_f}")

# print("\nModified Model Structure (example TanhReplacement):")
# if len(model_dyt.h) > 0:
#     print(f"Block 0 ln_1: {model_dyt.h[0].ln_1}")
# if hasattr(model_dyt, 'ln_f'):
#     print(f"Final ln_f: {model_dyt.ln_f}")
# --- End Sanity check ---


# 4. Prepare Input Data
# text = "Replace this with your desired input text for benchmarking. Longer sequences might show more difference."
text = " ".join(["test"] * 50) # Using a shorter sequence for faster debugging if issues persist
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# 5. Move models and data to device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_orig.to(device)
model_dyt.to(device)
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)

# 6. Inference Time Measurement Function
def measure_inference_time(model, model_desc, input_ids_tensor, attention_mask_tensor, num_runs=100, warmup_runs=10):
    # Warmup runs
    for _ in range(warmup_runs):
        with torch.no_grad():
            _ = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)

    if device.type == 'cuda':
        torch.cuda.synchronize() # Ensure warmup is complete

    total_time = 0
    start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_runs)] if device.type == 'cuda' else None
    end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_runs)] if device.type == 'cuda' else None

    for i in range(num_runs):
        if device.type == 'cuda':
            start_events[i].record()
        else:
            start_time = time.perf_counter()

        with torch.no_grad():
            _ = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)

        if device.type == 'cuda':
            end_events[i].record()
        else:
            end_time = time.perf_counter()
            total_time += (end_time - start_time)

    if device.type == 'cuda':
        torch.cuda.synchronize() # Wait for all runs to complete
        for i in range(num_runs):
            total_time += start_events[i].elapsed_time(end_events[i]) / 1000.0 # elapsed_time is in ms

    avg_time = total_time / num_runs
    return avg_time

# 7. Run and Compare
print(f"\nBenchmarking with sequence length: {input_ids.shape[1]}")
num_runs = 50 # Reduced for potentially faster debugging cycles, increase for stable benchmarks
warmup_runs = 5

avg_time_orig = float('nan')
avg_time_dyt = float('nan')

try:
    avg_time_orig = measure_inference_time(model_orig, "Original GPT-2", input_ids, attention_mask, num_runs, warmup_runs)
    print(f"Original GPT-2 avg inference time: {avg_time_orig*1000:.4f} ms")
except Exception as e:
    print(f"Error benchmarking original model: {e}")
    # This would be very unexpected

try:
    avg_time_dyt = measure_inference_time(model_dyt, "DyT-like (Tanh) GPT-2", input_ids, attention_mask, num_runs, warmup_runs)
    print(f"DyT-like (Tanh) GPT-2 avg inference time: {avg_time_dyt*1000:.4f} ms")
except RuntimeError as e:
    print(f"\nERROR during benchmarking of DyT-like model: {e}")
    if "CUDA error: device-side assert triggered" in str(e):
        print("\n" + "="*50)
        print("A 'device-side assert' was triggered in the DyT-like model.")
        print("This LIKELY means that replacing LayerNorm with tanh in the pre-trained GPT-2")
        print("caused numerical instability (e.g., NaNs or Infs), which then made a GPU kernel fail.")
        print("The Python stack trace (above, with CUDA_LAUNCH_BLOCKING=1) should now pinpoint")
        print("the exact operation in the Hugging Face code where the problem occurred.")
        print("Common culprits are operations like softmax in attention if inputs become too large/NaN.")
        print("This experiment highlights that LayerNorm is critical for the stability of pre-trained models;")
        print("simply swapping it out without retraining is generally not viable for correct model output.")
        print("The DyT paper's hypothesis is about training models with tanh units from scratch.")
        print("="*50 + "\n")
    # No further comparison if DyT model failed
    avg_time_dyt = float('nan') # Ensure it's NaN if it failed

if not (torch.isnan(torch.tensor(avg_time_orig)) or torch.isnan(torch.tensor(avg_time_dyt))):
    if avg_time_dyt < avg_time_orig:
        speedup_percentage = ((avg_time_orig - avg_time_dyt) / avg_time_orig) * 100
        print(f"Speedup with Tanh replacement: {speedup_percentage:.2f}%")
    else:
        slowdown_percentage = ((avg_time_dyt - avg_time_orig) / avg_time_orig) * 100
        print(f"Slowdown with Tanh replacement: {slowdown_percentage:.2f}% (This is unexpected if Tanh is truly faster and model runs)")
else:
    print("\nCould not complete the performance comparison due to errors or incomplete runs.")

Using device: cuda

Benchmarking with sequence length: 50
Original GPT-2 avg inference time: 12.8963 ms
DyT-like (Tanh) GPT-2 avg inference time: 12.3657 ms
Speedup with Tanh replacement: 4.11%
