eyad-silx committed
Commit f03ee14 · verified · 1 Parent(s): 8d93384

Upload 5 files

Files changed (5)
  1. __init__.py +0 -0
  2. lnn.py +511 -0
  3. model.py +114 -0
  4. moe.py +88 -0
  5. pmb.py +210 -0
__init__.py ADDED
File without changes
lnn.py ADDED
@@ -0,0 +1,511 @@
# Copyright 2024 Quasar AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.nn as nn
import math
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from transformers import PreTrainedModel, PretrainedConfig
from transformers.generation.utils import GenerationMixin
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.utils.generic import ModelOutput
from typing import Optional, Tuple, List
from dataclasses import dataclass
from .pmb import ParameterMemoryBank
from .moe import MoELayer, Expert

from tqdm import tqdm

try:
    from torchdiffeq import odeint
except ImportError:
    raise ImportError("torchdiffeq is not installed. Please install it with `pip install torchdiffeq`")

# --- 1. Configuration Class ---
class LNNConfig(PretrainedConfig):
    """
    Configuration class for the Liquid Neural Network (LNN) model.
    Inherits from HuggingFace's PretrainedConfig.
    """
    model_type = "quasar"

    def __init__(
        self,
        vocab_size=151552,
        hidden_size=8192,
        num_hidden_layers=96,  # 96 layers to keep the active parameter count manageable
        activation='gelu',
        lambda_res=0.0,
        dt=0.2,  # Step size for the fixed-step Euler solver.
        initializer_range=0.02,
        dropout=0.1,
        use_pmb=False,
        pmb_num_blocks=1024,
        pmb_slots_per_block=4096,
        pmb_top_k=1,
        # MoE parameters
        use_moe: bool = False,
        num_experts: int = 407,  # 407 experts to reach 440B total parameters
        num_experts_per_tok: int = 4,  # 4 active experts per token to maintain 25B active params
        expert_dim: int = 32768,  # 32K expert dimension for capacity
        moe_load_balance_loss_weight: float = 0.01,
        **kwargs
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.lambda_res = lambda_res
        self.dt = dt
        self.activation = activation
        self.initializer_range = initializer_range
        self.dropout = dropout
        self.use_pmb = use_pmb
        self.pmb_num_blocks = pmb_num_blocks
        self.pmb_slots_per_block = pmb_slots_per_block
        self.pmb_top_k = pmb_top_k
        # MoE
        self.use_moe = use_moe
        self.num_experts = num_experts
        self.num_experts_per_tok = num_experts_per_tok
        self.expert_dim = expert_dim
        self.moe_load_balance_loss_weight = moe_load_balance_loss_weight
        super().__init__(**kwargs)

# --- 2. Custom Model Output ---
@dataclass
class LNNModelOutput(ModelOutput):
    """
    Base class for the LNN model's outputs, ensuring compatibility with HuggingFace.
    """
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    load_balancing_loss: Optional[torch.FloatTensor] = None


# --- 3. Core LNN Cell ---
class LNNCell(nn.Module):
    """A single Liquid Neural Network cell with continuous-time dynamics."""
    def __init__(self, config: LNNConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.lambda_res = config.lambda_res

        # Core LNN parameters
        self.W = nn.Parameter(torch.empty(config.hidden_size, config.hidden_size))
        self.U = nn.Parameter(torch.empty(config.hidden_size, config.hidden_size))
        self.b = nn.Parameter(torch.empty(config.hidden_size))

        # Input-dependent dynamics
        self.tau_w_h = nn.Linear(config.hidden_size, config.hidden_size)
        self.tau_w_u = nn.Linear(config.hidden_size, config.hidden_size)
        self.tau_b = nn.Parameter(torch.empty(config.hidden_size))

        # Initialize weights
        nn.init.orthogonal_(self.W)  # Orthogonal init for recurrent weights
        nn.init.xavier_uniform_(self.U)
        nn.init.zeros_(self.b)
        self.tau_b.data.uniform_(-2, 2)

        self.sigma = nn.Tanh()  # Tanh for bounded output and stability

    def forward(self, h, u):
        """Core ODE dynamics calculation for a single discrete step."""
        # 1. Compute the input-dependent time constant (tau)
        tau_control = self.tau_w_h(h) + self.tau_w_u(u) + self.tau_b
        # The floor is raised from 0.01 to 1.0 to prevent division by a near-zero
        # number, which is a common cause of NaN in bf16.
        tau_positive = F.softplus(tau_control) + 1.0

        # 2. Compute the state update: dx/dt = -h / tau + sigma(W h + U u + b)
        decay_term = -h / tau_positive
        activation_input = F.linear(h, self.W) + F.linear(u, self.U) + self.b
        activation_output = self.sigma(activation_input)
        dx_dt = decay_term + activation_output

        if self.lambda_res > 0:
            dx_dt = dx_dt + self.lambda_res * u

        # 3. Stability: clip the derivative
        dx_dt = torch.clamp(dx_dt, -10, 10)
        return dx_dt

# --- 4. LNN Block (Layer + Residual) ---
class LNNBlock(nn.Module):
    """A single block of the LNN, using a fixed-step Euler loop."""
    def __init__(self, config: LNNConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.dt = config.dt
        self.cell = LNNCell(config)
        self.ln = nn.LayerNorm(config.hidden_size)

    def forward(self, x: torch.Tensor, h: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Processes the entire sequence with a fixed-step Euler integration loop,
        starting from a given hidden state h.
        This version is optimized to be JIT-friendly by pre-allocating the output tensor.
        """
        seq_len = x.size(1)
        # Pre-allocate the output tensor (matching x's dtype) to avoid slow list appends.
        outputs = torch.empty(x.size(0), seq_len, self.hidden_size, device=x.device, dtype=x.dtype)

        for t in range(seq_len):
            u = x[:, t, :]
            dx_dt = self.cell(h, u)
            h = h + self.dt * dx_dt
            # Clamp the hidden state to prevent runaway values, a common
            # source of instability in recurrent models.
            h = torch.clamp(h, -100, 100)
            outputs[:, t, :] = h

        # Add residual connection and layer norm
        output = self.ln(outputs + x)
        return output, h

# --- 5. Full LNN Model ---
class LNNModel(PreTrainedModel, GenerationMixin):
    """
    The Liquid Neural Network model.
    This version restores the architecture from the high-performing `old_lnn.py`,
    using stacked LNNBlocks to process the sequence. The Transformer-based
    attention readout has been removed (see the comment in __init__).
    """
    config_class = LNNConfig

    def __init__(self, config: LNNConfig):
        super().__init__(config)
        self.config = config

        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.blocks = nn.ModuleList([LNNBlock(config) for _ in range(config.num_hidden_layers)])

        # JIT compilation of the LNNBlocks is disabled for now: torch.jit.script
        # can cause unexpected memory allocation issues with recurrent loops.
        # for i in range(len(self.blocks)):
        #     self.blocks[i] = torch.jit.script(self.blocks[i])

        self.ln_final = nn.LayerNorm(config.hidden_size, eps=1e-5)

        # The attention-based readout is removed to prevent the model from "cheating"
        # by using self-attention over the whole sequence instead of relying on its
        # recurrent state. This forces the LNN to learn more robust representations.
        # self.readout = nn.TransformerEncoderLayer(...)

        self.proj_out = nn.Linear(config.hidden_size, config.vocab_size)

    def get_input_embeddings(self):
        return self.embedding

    def set_input_embeddings(self, value):
        self.embedding = value

    def forward(
        self,
        input_ids: torch.LongTensor,
        labels: Optional[torch.LongTensor] = None,
        hidden_states: Optional[List[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,  # Accept attention_mask
        **kwargs,  # Accept other arguments
    ) -> LNNModelOutput:
        """
        Processes a sequence, calculates the loss, and tolerates unexpected arguments.
        The `attention_mask` is accepted but not used, since the LNN processes
        the sequence recurrently.
        """
        # 1. Get embeddings
        x = self.embedding(input_ids)
        batch_size = input_ids.shape[0]

        # 2. Initialize hidden states if not provided
        if hidden_states is None:
            hidden_states = [
                torch.zeros(batch_size, self.config.hidden_size, device=x.device)
                for _ in range(self.config.num_hidden_layers)
            ]

        # 3. Process the sequence through the LNN blocks
        new_hidden_states = []
        layer_output = x
        for i, block in enumerate(self.blocks):
            h_initial = hidden_states[i]
            layer_output, h_final = block(layer_output, h_initial)
            new_hidden_states.append(h_final)

        # 4. Final projection (without attention readout)
        final_output = self.ln_final(layer_output)
        logits = self.proj_out(final_output)

        # 5. Calculate the loss if labels are provided
        loss = None
        if labels is not None:
            # Shift so that logits at time t predict the token at time t+1,
            # the standard procedure for training causal language models.
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()
            # Flatten the tokens and compute the loss
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))

        return LNNModelOutput(
            loss=loss,
            logits=logits,
            last_hidden_state=final_output,
            hidden_states=tuple(new_hidden_states),
        )

    def generate(
        self,
        input_ids: torch.LongTensor,
        max_length: int = 100,
        max_new_tokens: int = None,
        temperature: float = 1.0,
        top_k: int = 50,
        top_p: float = 0.9,
        do_sample: bool = True,
        pad_token_id: int = None,
        eos_token_id: int = None,
        repetition_penalty: float = 1.0,
        **kwargs
    ) -> torch.LongTensor:
        """
        Generate text with the LNN model, with improved repetition handling.
        """
        batch_size = input_ids.shape[0]
        device = input_ids.device

        # Determine the actual maximum length
        if max_new_tokens is not None:
            max_length = input_ids.shape[1] + max_new_tokens

        # Initialize hidden states
        hidden_states = [
            torch.zeros(batch_size, self.config.hidden_size, device=device)
            for _ in range(self.config.num_hidden_layers)
        ]

        # Initialize the output with input_ids
        generated = input_ids.clone()

        # Set the model to evaluation mode
        self.eval()

        for step in range(max_length - input_ids.shape[1]):
            # Only pass the most recent tokens to avoid recomputing everything
            context_length = min(generated.shape[1], 512)  # Limit the context to prevent memory issues
            context_ids = generated[:, -context_length:]

            with torch.no_grad():
                outputs = self.forward(
                    input_ids=context_ids,
                    hidden_states=hidden_states if step == 0 else None  # Only use the initial hidden states
                )

            # Get logits for the last token
            logits = outputs.logits[:, -1, :]  # Shape: [batch_size, vocab_size]

            # Apply the repetition penalty
            if repetition_penalty != 1.0:
                for i in range(batch_size):
                    for token_id in set(generated[i].tolist()):
                        # If the logit is positive, divide by the penalty; otherwise multiply
                        if logits[i, token_id] > 0:
                            logits[i, token_id] /= repetition_penalty
                        else:
                            logits[i, token_id] *= repetition_penalty

            # Apply temperature
            if temperature != 1.0:
                logits = logits / temperature

            # Apply top-k filtering
            if top_k > 0:
                top_k_values, _ = torch.topk(logits, min(top_k, logits.size(-1)), dim=-1)
                indices_to_remove = logits < top_k_values[..., -1, None]
                logits[indices_to_remove] = -float('inf')

            # Apply top-p (nucleus) filtering
            if top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Remove tokens with cumulative probability above the threshold
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0

                # Convert back to the original indices
                indices_to_remove = sorted_indices_to_remove.gather(dim=-1, index=sorted_indices.argsort(dim=-1))
                logits[indices_to_remove] = -float('inf')

            # Sample the next token
            if do_sample:
                probs = F.softmax(logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                next_token = torch.argmax(logits, dim=-1, keepdim=True)

            # Append to the generated sequence
            generated = torch.cat([generated, next_token], dim=-1)

            # Check for the EOS token
            if eos_token_id is not None and (next_token == eos_token_id).all():
                break

        return generated

    def generate_simple(
        self,
        input_ids: torch.LongTensor,
        max_length: int = 100,
        temperature: float = 1.0,
        do_sample: bool = True,
        pad_token_id: int = None,
        eos_token_id: int = None,
        hidden_states: Optional[List[torch.Tensor]] = None,
        **kwargs
    ) -> torch.LongTensor:
        """
        Simple generation without top-k/top-p sampling, to avoid dimension issues.
        """
        batch_size = input_ids.shape[0]
        device = input_ids.device

        # Initialize hidden states if not provided
        if hidden_states is None:
            hidden_states = [
                torch.zeros(batch_size, self.config.hidden_size, device=device)
                for _ in range(self.config.num_hidden_layers)
            ]

        # Initialize the output with input_ids
        generated = input_ids.clone()

        # Set the model to evaluation mode
        self.eval()

        for _ in range(max_length - input_ids.shape[1]):
            # Get the model output
            with torch.no_grad():
                outputs = self.forward(
                    input_ids=generated,
                    hidden_states=hidden_states
                )

            # Get logits for the last token
            logits = outputs.logits[:, -1, :]  # Shape: [batch_size, vocab_size]
            hidden_states = list(outputs.hidden_states)

            # Apply temperature
            if temperature != 1.0:
                logits = logits / temperature

            # Sample the next token
            if do_sample:
                probs = F.softmax(logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                next_token = torch.argmax(logits, dim=-1, keepdim=True)

            # Append to the generated sequence
            generated = torch.cat([generated, next_token], dim=-1)

            # Check for the EOS token
            if eos_token_id is not None and (next_token == eos_token_id).all():
                break

        return generated

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[List[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        use_cache: bool = True,
        **kwargs
    ) -> dict:
        """
        Prepare inputs for generation. The LNN uses hidden_states instead of past_key_values.
        """
        # The LNN does not use past_key_values in the traditional sense;
        # it relies on the recurrent nature of the model instead.
        model_inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "use_cache": use_cache,
        }
        return model_inputs

    def _reorder_cache(self, past_key_values: List[torch.Tensor], beam_idx: torch.Tensor) -> List[torch.Tensor]:
        """
        Reorder hidden states for beam search.
        """
        if past_key_values is None:
            return None

        reordered_past = []
        for hidden_state in past_key_values:
            reordered_past.append(hidden_state.index_select(0, beam_idx))
        return reordered_past

# --- 6. For Causal LM compatibility ---
class LNNForCausalLM(LNNModel):
    """
    Wrapper class for compatibility with HuggingFace's CausalLM interface.
    """
    def __init__(self, config: LNNConfig):
        super().__init__(config)
        self.lm_head = self.proj_out  # Alias for compatibility

    @property
    def model(self):
        """Return self for compatibility with some HF utilities."""
        return self

    def get_output_embeddings(self):
        return self.proj_out

    def set_output_embeddings(self, new_embeddings):
        self.proj_out = new_embeddings

    def forward(
        self,
        input_ids: torch.LongTensor,
        labels: Optional[torch.LongTensor] = None,
        hidden_states: Optional[List[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.Tensor]] = None,
        use_cache: bool = True,
        **kwargs,
    ) -> LNNModelOutput:
        """Forward pass compatible with the CausalLM interface."""
        return super().forward(
            input_ids=input_ids,
            labels=labels,
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            **kwargs
        )

# --- 7. Model registration ---
# Register the model with transformers
try:
    from transformers import AutoModel, AutoModelForCausalLM
    AutoModel.register(LNNConfig, LNNModel)
    AutoModelForCausalLM.register(LNNConfig, LNNForCausalLM)
except ImportError:
    pass  # transformers is not available or the version does not support registration
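
A minimal smoke-test sketch for the LNN stack above. It assumes the uploaded files are importable as a package (the package name `quasar` is an assumption, not confirmed by this commit) and that `torchdiffeq` is installed, since the module raises on import otherwise; the tiny configuration is chosen only so the check runs on CPU:

import torch
from quasar.lnn import LNNConfig, LNNForCausalLM  # package name is an assumption

# Deliberately tiny configuration (the real defaults are far larger).
config = LNNConfig(vocab_size=128, hidden_size=32, num_hidden_layers=2, dt=0.2)
model = LNNForCausalLM(config)

# One forward pass with labels to confirm the shifted causal LM loss is produced.
input_ids = torch.randint(0, config.vocab_size, (2, 16))
out = model(input_ids=input_ids, labels=input_ids)
print(out.loss, out.logits.shape)  # scalar loss, logits of shape (2, 16, 128)

# Greedy decoding via the simple recurrent generation loop.
generated = model.generate_simple(input_ids[:, :4], max_length=12, do_sample=False)
print(generated.shape)  # (2, 12)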
model.py ADDED
@@ -0,0 +1,114 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from transformers import PreTrainedModel, PretrainedConfig
from tqdm import tqdm
from .moe import MoELayer

class QuasarConfig(PretrainedConfig):
    model_type = "quasar"

    def __init__(
        self,
        vocab_size=129280,
        embedding_dim=8192,
        num_hidden_layers=96,  # 96 layers to keep the active parameter count manageable
        num_attention_heads=64,
        num_experts=407,  # 407 experts to reach 440B total parameters
        expert_dim=32768,  # 32K expert dimension for capacity
        top_k=4,  # 4 active experts per token to maintain 25B active params
        **kwargs
    ):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_experts = num_experts
        self.expert_dim = expert_dim
        self.top_k = top_k
        super().__init__(**kwargs)

class SelfAttention(nn.Module):
    def __init__(self, config: QuasarConfig):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.head_dim = config.embedding_dim // self.num_heads
        self.q_proj = nn.Linear(config.embedding_dim, config.embedding_dim, bias=False)
        self.k_proj = nn.Linear(config.embedding_dim, config.embedding_dim, bias=False)
        self.v_proj = nn.Linear(config.embedding_dim, config.embedding_dim, bias=False)
        self.out_proj = nn.Linear(config.embedding_dim, config.embedding_dim, bias=False)

    def forward(self, x):
        batch_size, seq_len, _ = x.shape
        q = self.q_proj(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        attn_output = F.scaled_dot_product_attention(q, k, v)

        output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
        return self.out_proj(output)

class QuasarBlock(nn.Module):
    def __init__(self, config: QuasarConfig):
        super().__init__()
        self.attention = SelfAttention(config)
        self.moe_layer = MoELayer(
            embedding_dim=config.embedding_dim,
            num_experts=config.num_experts,
            expert_dim=config.expert_dim,
            top_k=config.top_k
        )
        self.ln1 = nn.LayerNorm(config.embedding_dim)
        self.ln2 = nn.LayerNorm(config.embedding_dim)

    def forward(self, x):
        x = x + self.attention(self.ln1(x))
        moe_out, lb_loss = self.moe_layer(self.ln2(x))
        x = x + moe_out
        return x, lb_loss

class Quasar(PreTrainedModel):
    config_class = QuasarConfig
    _supports_gradient_checkpointing = True

    def __init__(self, config: QuasarConfig):
        super().__init__(config)
        self.config = config
        self.embedding = nn.Embedding(config.vocab_size, config.embedding_dim)
        print(f"\nInitializing {config.num_hidden_layers} Quasar layers...")
        self.layers = nn.ModuleList([QuasarBlock(config) for _ in tqdm(range(config.num_hidden_layers), desc="Creating Quasar Layers")])
        self.final_ln = nn.LayerNorm(config.embedding_dim)
        self.output_head = nn.Linear(config.embedding_dim, config.vocab_size, bias=False)

    def forward(self, input_ids, labels=None, **kwargs):
        x = self.embedding(input_ids)
        total_lb_loss = 0.0

        # Keep the config available via kwargs (currently not consumed by the layers).
        kwargs['config'] = self.config

        for layer in self.layers:
            if self.is_gradient_checkpointing and self.training:
                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs)
                    return custom_forward
                x, lb_loss = torch.utils.checkpoint.checkpoint(create_custom_forward(layer), x, use_reentrant=False)
            else:
                x, lb_loss = layer(x)
            total_lb_loss += lb_loss

        x = self.final_ln(x)
        logits = self.output_head(x)

        loss = None
        if labels is not None:
            main_loss = F.cross_entropy(logits.view(-1, self.config.vocab_size), labels.view(-1))
            loss = main_loss + total_lb_loss

        return {
            'loss': loss,
            'logits': logits,
            'lb_loss': total_lb_loss
        }
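
The same kind of hedged smoke test for the attention + MoE variant in model.py, again with a deliberately tiny configuration so the 96-layer, 407-expert defaults are never instantiated (the `quasar` package name remains an assumption):

import torch
from quasar.model import QuasarConfig, Quasar  # package name is an assumption

config = QuasarConfig(vocab_size=128, embedding_dim=64, num_hidden_layers=2,
                      num_attention_heads=4, num_experts=8, expert_dim=128, top_k=2)
model = Quasar(config)

input_ids = torch.randint(0, config.vocab_size, (2, 16))
out = model(input_ids, labels=input_ids)

# The returned loss is the cross-entropy term plus the summed MoE load-balancing losses.
print(out['loss'], out['lb_loss'], out['logits'].shape)  # logits: (2, 16, 128)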
moe.py ADDED
@@ -0,0 +1,88 @@
# c:\quasarv4\quasar\moe.py

import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

class Expert(nn.Module):
    """An expert network. For Quasar, this could be an LNN layer followed by a feed-forward network."""
    def __init__(self, embedding_dim, expert_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embedding_dim, expert_dim),
            nn.GELU(),
            nn.Linear(expert_dim, embedding_dim)
        )

    def forward(self, x):
        return self.net(x)

class MoERouter(nn.Module):
    """A simple router that learns to dispatch tokens to experts."""
    def __init__(self, embedding_dim, num_experts, top_k=2):
        super().__init__()
        self.top_k = top_k
        self.gate = nn.Linear(embedding_dim, num_experts)

    def forward(self, x):
        """Returns the top-k weights and indices for each token."""
        gate_logits = self.gate(x.reshape(-1, x.shape[-1]))
        top_k_logits, top_k_indices = torch.topk(gate_logits, self.top_k, dim=-1)
        top_k_weights = F.softmax(top_k_logits, dim=-1, dtype=torch.float).to(x.dtype)
        return top_k_weights, top_k_indices

class MoELayer(nn.Module):
    """A Mixture-of-Experts layer."""
    def __init__(self, embedding_dim, num_experts, expert_dim, top_k=2):
        super().__init__()
        self.router = MoERouter(embedding_dim, num_experts, top_k)
        self.num_experts = num_experts

        # Create the experts.
        # A generator expression avoids building a temporary list of all experts in memory.
        self.experts = nn.ModuleList(Expert(embedding_dim, expert_dim) for _ in range(self.num_experts))

    def forward(self, x):
        """Forward pass for the MoE layer."""
        original_shape = x.shape
        flat_x = x.reshape(-1, x.shape[-1])

        # Create the final output tensor on the correct device, avoiding meta-device issues.
        final_output = torch.zeros(flat_x.shape, dtype=x.dtype, device=self.router.gate.weight.device)

        # Get routing decisions from the router
        top_k_weights, top_k_indices = self.router(x)

        # Calculate the load-balancing loss, using one_hot to stay meta-tensor compatible.
        # Note: the router-probability term is approximated from the gate weights rather
        # than from the per-token router distribution.
        num_tokens = top_k_indices.size(0)
        one_hot_indices = F.one_hot(top_k_indices, num_classes=self.num_experts).float()
        tokens_per_expert = one_hot_indices.sum(dim=[0, 1])
        router_probs_per_expert = torch.mean(F.softmax(self.router.gate.weight, dim=0), dim=1)
        load_balancing_loss = self.num_experts * torch.dot(tokens_per_expert / num_tokens, router_probs_per_expert)

        # Dispatch tokens to experts and aggregate the outputs
        for i in range(self.num_experts):
            # Find which tokens are routed to this expert
            expert_mask = (top_k_indices == i).any(dim=1)
            expert_indices_for_expert = torch.where(expert_mask)[0]

            if expert_indices_for_expert.numel() == 0:
                continue

            # Get the tokens for this expert
            expert_tokens = flat_x[expert_indices_for_expert]

            # Find this expert's routing weight for each of its tokens
            top_k_weights_for_expert = top_k_weights[expert_indices_for_expert]
            is_expert_in_top_k = (top_k_indices[expert_indices_for_expert] == i)
            weights_for_expert = torch.sum(top_k_weights_for_expert * is_expert_in_top_k, dim=1, keepdim=True)

            # Process with the expert and apply the routing weight
            expert_output = self.experts[i](expert_tokens)
            weighted_output = expert_output * weights_for_expert

            # Add the weighted output to the final output tensor at the correct positions
            final_output.index_add_(0, expert_indices_for_expert, weighted_output)

        return final_output.reshape(original_shape), load_balancing_loss
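
A small standalone check of the routing path in MoELayer; the sizes are arbitrary and chosen only to keep it fast (package name assumed as before):

import torch
from quasar.moe import MoELayer  # package name is an assumption

layer = MoELayer(embedding_dim=32, num_experts=4, expert_dim=64, top_k=2)
x = torch.randn(2, 10, 32)  # (batch, seq_len, embedding_dim)

out, lb_loss = layer(x)
print(out.shape)       # torch.Size([2, 10, 32]) -- same shape as the input
print(float(lb_loss))  # scalar load-balancing penalty to add to the training loss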
pmb.py ADDED
@@ -0,0 +1,210 @@
import torch
import hashlib
import numpy as np

class ParameterMemoryBank:
    """
    Parameter Memory Bank (PMB) for effectively unbounded, queryable memory.

    This implementation uses a two-level hashing scheme for constant-time
    direct access and supports semantic similarity search.

    - Level 1: a list of 'blocks'.
    - Level 2: each block is a dictionary-like structure mapping slots to items.

    For simplicity, Python lists and dictionaries are used. A production system
    would use a more optimized backend (e.g., Redis or a custom memory store).
    """
    def __init__(self, num_blocks=1024, slots_per_block=4096, embedding_dim=None):
        self.num_blocks = num_blocks
        self.slots_per_block = slots_per_block
        self.embedding_dim = embedding_dim

        # The PMB is a list of blocks, where each block is a list of slots.
        # Each slot can hold a tuple: (id, key_embedding, value)
        self.pmb = [[None] * slots_per_block for _ in range(num_blocks)]

        # For semantic search, a separate structure holds all keys.
        # This is a trade-off for efficient similarity search.
        self.all_keys = []
        self.key_locations = []  # Stores (block_idx, slot_idx) for each key

    def _hash_fn(self, s, salt=""):
        """A simple, salted hash function."""
        return int(hashlib.sha256((str(s) + salt).encode()).hexdigest(), 16)

    def _get_hash_indices(self, item_id):
        """
        Calculates the block and slot indices for a given item ID using
        the two-level hashing scheme.
        """
        block_hash = self._hash_fn(item_id, salt="block")
        block_idx = block_hash % self.num_blocks

        slot_hash = self._hash_fn(item_id, salt=f"slot_{block_idx}")
        slot_idx = slot_hash % self.slots_per_block

        return block_idx, slot_idx

    def store(self, item_id, key_embedding, value):
        """
        Stores a key-value pair in the PMB using its ID.

        Args:
            item_id (str or int): A unique identifier for the data.
            key_embedding (torch.Tensor): The embedding vector (k_i,j).
            value (any): The data to store (v_i,j), e.g., text or metadata.
        """
        if not isinstance(key_embedding, torch.Tensor):
            raise TypeError("key_embedding must be a torch.Tensor")

        block_idx, slot_idx = self._get_hash_indices(item_id)

        # Store the item in the hash-based location.
        # Note: this simple implementation does not handle hash collisions.
        # A real system would need a collision resolution strategy (e.g., cuckoo hashing or chaining).
        if self.pmb[block_idx][slot_idx] is not None:
            # Handle the collision by updating the existing entry or finding an empty slot
            pass  # For now, just overwrite

        self.pmb[block_idx][slot_idx] = (item_id, key_embedding.detach().cpu(), value.detach().cpu() if isinstance(value, torch.Tensor) else value)

        # Also store the key for semantic search
        self.all_keys.append(key_embedding.detach().cpu())
        self.key_locations.append((block_idx, slot_idx))

    def retrieve_direct(self, item_id):
        """
        Retrieves a value directly by its ID in O(1) time.

        Args:
            item_id (str or int): The unique identifier of the item.

        Returns:
            The stored value, or None if not found.
        """
        block_idx, slot_idx = self._get_hash_indices(item_id)
        item = self.pmb[block_idx][slot_idx]

        # Check that the found item ID matches, since collisions are not handled
        if item and item[0] == item_id:
            return item[2]  # Return the value
        return None

    def retrieve_by_indices(self, indices):
        """
        Retrieves items by their indices in the `all_keys` list.

        Args:
            indices (list or torch.Tensor): A list of indices.

        Returns:
            A list of the retrieved values.
        """
        results = []
        for idx in indices:
            if idx < len(self.key_locations):
                block_idx, slot_idx = self.key_locations[idx]
                item = self.pmb[block_idx][slot_idx]
                if item:
                    value = item[2]  # Get the value
                    # Return the value directly if it was stored as a tensor
                    if isinstance(value, torch.Tensor):
                        results.append(value)
                    else:
                        # If the value is not a tensor, create a zero tensor of appropriate size
                        if self.embedding_dim:
                            results.append(torch.zeros(self.embedding_dim))
                        else:
                            # Fallback: use the key embedding as the value
                            results.append(item[1])
                else:
                    # No item found, append a zero tensor
                    if self.embedding_dim:
                        results.append(torch.zeros(self.embedding_dim))
                    else:
                        results.append(torch.zeros_like(self.all_keys[0]) if self.all_keys else torch.zeros(1))
            else:
                # Index out of range
                if self.embedding_dim:
                    results.append(torch.zeros(self.embedding_dim))
                else:
                    results.append(torch.zeros_like(self.all_keys[0]) if self.all_keys else torch.zeros(1))
        return results

    def retrieve_semantic(self, query_embeddings, top_k=1):
        """
        Retrieves the top_k most semantically similar items for a batch of query embeddings.

        Args:
            query_embeddings (torch.Tensor): Query vectors of shape (batch_size, embedding_dim) or (batch_size, seq_len, embedding_dim).
            top_k (int): The number of similar items to return for each query.

        Returns:
            A tensor of the aggregated retrieved values with the same shape as query_embeddings.
        """
        if not self.all_keys or top_k == 0:
            return torch.zeros_like(query_embeddings)

        if not isinstance(query_embeddings, torch.Tensor):
            raise TypeError("query_embeddings must be a torch.Tensor")

        # Store the original shape and device
        original_shape = query_embeddings.shape
        device = query_embeddings.device

        # Flatten the query embeddings to 2D for processing
        if query_embeddings.dim() > 2:
            query_flat = query_embeddings.view(-1, original_shape[-1])
        else:
            query_flat = query_embeddings

        # Handle an empty memory bank
        if not self.all_keys:
            return torch.zeros_like(query_embeddings)

        try:
            # Stack all keys into a single tensor
            all_keys_tensor = torch.stack(self.all_keys, dim=0).to(device)

            # Compute cosine similarity
            query_norm = torch.nn.functional.normalize(query_flat, p=2, dim=-1)
            keys_norm = torch.nn.functional.normalize(all_keys_tensor, p=2, dim=-1)

            # Compute similarities: (batch_size, num_keys)
            similarities = torch.mm(query_norm, keys_norm.T)

            # Get the top_k results for each query
            k = min(top_k, len(self.all_keys))
            if k > 0:
                top_k_scores, top_k_indices = torch.topk(similarities, k=k, dim=1)

                # Retrieve the corresponding values
                batch_results = []
                for i in range(query_flat.size(0)):
                    retrieved_values = self.retrieve_by_indices(top_k_indices[i].cpu().tolist())

                    if retrieved_values:
                        # Stack and move to the correct device
                        stacked_values = torch.stack(retrieved_values, dim=0).to(device)
                        # Average the top_k retrieved values
                        aggregated_value = torch.mean(stacked_values, dim=0)
                        batch_results.append(aggregated_value)
                    else:
                        # No valid retrievals, use a zero tensor
                        batch_results.append(torch.zeros(original_shape[-1], device=device))

                # Stack all batch results
                if batch_results:
                    result = torch.stack(batch_results, dim=0)
                    # Reshape back to the original shape
                    return result.view(original_shape)
                else:
                    return torch.zeros_like(query_embeddings)
            else:
                return torch.zeros_like(query_embeddings)

        except Exception as e:
            print(f"Error in PMB retrieve_semantic: {e}")
            return torch.zeros_like(query_embeddings)

    def __len__(self):
        return len(self.all_keys)
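
Finally, a hedged sketch of the ParameterMemoryBank API as defined above: hashed direct lookup by ID plus cosine-similarity retrieval over the stored keys (package name assumed as before):

import torch
from quasar.pmb import ParameterMemoryBank  # package name is an assumption

bank = ParameterMemoryBank(num_blocks=64, slots_per_block=128, embedding_dim=16)

# Store a few items whose key embedding and value are the same vector.
for i in range(10):
    vec = torch.randn(16)
    bank.store(item_id=f"doc-{i}", key_embedding=vec, value=vec)

print(len(bank))                      # 10 stored keys
print(bank.retrieve_direct("doc-3"))  # O(1) lookup by ID

# Semantic retrieval: averages the values of the top_k nearest keys per query.
queries = torch.randn(4, 16)
retrieved = bank.retrieve_semantic(queries, top_k=2)
print(retrieved.shape)                # torch.Size([4, 16])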