# submission-template/tasks/custom_classifiers.py
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

from huggingface_hub import PyTorchModelHubMixin
from transformers import AutoTokenizer, RobertaModel
from transformers.modeling_outputs import SequenceClassifierOutput

class SentenceBERTClassifier(nn.Module, PyTorchModelHubMixin):
    """RoBERTa-based sentence encoder with a linear classification head."""

    def __init__(self, model_name="sentence-transformers/all-distilroberta-v1", num_labels=8):
        super().__init__()
        self.sbert = RobertaModel.from_pretrained(model_name)
        self.config = self.sbert.config
        self.config.num_labels = num_labels
        self.config.classifier_dropout = 0.05
        self.dropout = nn.Dropout(self.config.classifier_dropout)
        self.classifier = nn.Linear(self.config.hidden_size, self.config.num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.sbert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the [CLS] pooler output as the sentence representation.
        pooled_output = outputs.pooler_output
        logits = self.classifier(self.dropout(pooled_output))
        # Compute the loss only when labels are provided (training mode).
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
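
# --- Usage sketch (illustrative, not from the original file) ---
# A minimal example of how the classifier might be driven end to end.
# `classify_sentences` is a hypothetical helper name, and the tokenizer
# checkpoint is assumed to match the default `model_name` above.
def classify_sentences(model, sentences, tokenizer_name="sentence-transformers/all-distilroberta-v1"):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    batch = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    model.eval()
    with torch.no_grad():
        out = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
    # Return the predicted label index for each sentence.
    return out.logits.argmax(dim=-1)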

class DenseBlock(nn.Module):
    """Linear -> BatchNorm -> ReLU -> Dropout building block."""

    def __init__(self, input_size, output_size, dropout_rate):
        super().__init__()
        self.linear = nn.Linear(input_size, output_size)
        self.batch_norm = nn.BatchNorm1d(output_size)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        x = self.linear(x)
        x = self.batch_norm(x)
        x = self.activation(x)
        return self.dropout(x)

class FeedForwardExpert(nn.Module):
    """Two dense blocks followed by a linear projection to the label space."""

    def __init__(self, dropout_rate, num_labels=8):
        super().__init__()
        # 768 matches the hidden size of the distilroberta encoder above.
        self.block_1 = DenseBlock(768, 400, dropout_rate)
        self.block_2 = DenseBlock(400, 200, dropout_rate)
        self.final_layer = nn.Linear(200, num_labels)
        self.initialize_weights()

    def forward(self, x):
        x = self.block_1(x)
        x = self.block_2(x)
        return self.final_layer(x)

    def initialize_weights(self):
        # Xavier-uniform init for every linear layer; zero the biases.
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

class MoEClassifier(nn.Module):
    """Mixture-of-experts head: a soft gating network over feedforward experts."""

    def __init__(self, num_experts, dropout_rate=0.1, gate_hidden_size=128):
        super().__init__()
        self.dropout_rate = dropout_rate
        self.num_experts = num_experts
        self.gate_hidden_size = gate_hidden_size
        # One feedforward expert per mixture component.
        self.experts = nn.ModuleList(
            [FeedForwardExpert(self.dropout_rate) for _ in range(self.num_experts)]
        )
        # Two-layer gating network mapping a 768-dim input to expert weights.
        self.gate_fc1 = nn.Linear(768, self.gate_hidden_size)
        self.gate_fc2 = nn.Linear(self.gate_hidden_size, self.num_experts)

    def forward(self, x):
        # Gating weights, shape (batch, num_experts, 1).
        gate_hidden = F.relu(self.gate_fc1(x))
        weights = F.softmax(self.gate_fc2(gate_hidden), dim=1).unsqueeze(2)
        # Expert logits stacked along the last dim: (batch, num_labels, num_experts).
        outputs = torch.stack([expert(x) for expert in self.experts], dim=2)
        # Weighted sum via batched matmul: (batch, num_labels, 1) -> (batch, num_labels).
        weighted_outputs = torch.bmm(outputs, weights).squeeze(2)
        return weighted_outputs
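
# --- Smoke test (illustrative, not from the original file) ---
# Sanity-checks the MoE head on random features with the 768-dim input size
# the experts and gate assume. Batch size must be > 1 while training because
# DenseBlock uses BatchNorm1d.
if __name__ == "__main__":
    moe = MoEClassifier(num_experts=4)
    features = torch.randn(16, 768)  # stand-in for pooled SBERT embeddings
    logits = moe(features)
    print(logits.shape)  # expected: torch.Size([16, 8]) -> (batch, num_labels)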