# seminar-demo/sstvit.py
import torch
from torch import nn, einsum
import torch.nn.functional as F
from einops import rearrange, repeat
from einops.layers.torch import Rearrange
from module import Attention, PreNorm, FeedForward, CrossAttention, SSTransformer
import numpy as np


class SSTTransformerEncoder(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, b_dim, b_depth, b_heads, b_dim_head, b_mlp_head, num_patches, cross_attn_depth=3, cross_attn_heads=8, dropout=0.):
        super().__init__()
        # spectral-spatial transformer shared by both temporal branches
        self.transformer = SSTransformer(dim, depth, heads, dim_head, mlp_dim, b_dim, b_depth, b_heads, b_dim_head, b_mlp_head, num_patches, dropout)
        # stack of cross-attention layers that exchange cls-token information between the two branches
        self.cross_attn_layers = nn.ModuleList([])
        for _ in range(cross_attn_depth):
            self.cross_attn_layers.append(PreNorm(b_dim, CrossAttention(b_dim, heads=cross_attn_heads, dim_head=dim_head, dropout=0.)))

    def forward(self, x1, x2):
        # encode each branch independently with the shared transformer
        x1 = self.transformer(x1)
        x2 = self.transformer(x2)
        for cross_attn in self.cross_attn_layers:
            # split each sequence into its cls token and patch tokens
            x1_class = x1[:, 0]
            x1 = x1[:, 1:]
            x2_class = x2[:, 0]
            x2 = x2[:, 1:]

            # cross attention: the cls token of branch 1 attends over the patch tokens of branch 2
            cat1_q = x1_class.unsqueeze(1)
            cat1_qkv = torch.cat((cat1_q, x2), dim=1)
            cat1_out = cat1_q + cross_attn(cat1_qkv)
            x1 = torch.cat((cat1_out, x1), dim=1)

            # and symmetrically, the cls token of branch 2 attends over the tokens of branch 1
            cat2_q = x2_class.unsqueeze(1)
            cat2_qkv = torch.cat((cat2_q, x1), dim=1)
            cat2_out = cat2_q + cross_attn(cat2_qkv)
            x2 = torch.cat((cat2_out, x2), dim=1)
        # return only the fused cls tokens of the two branches, each of shape [b, 1, dim]
        return cat1_out, cat2_out
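

# --- Illustration only (not used by the model) ---
# A minimal sketch of the cls-token cross-attention exchange performed in
# SSTTransformerEncoder.forward, using torch.nn.MultiheadAttention as a
# stand-in for the CrossAttention module imported from module.py (whose exact
# implementation is not shown in this file). It assumes CrossAttention maps
# the concatenated [cls, patch tokens] sequence to an update for the cls token.
def _cross_token_exchange_sketch():
    b, n, d = 2, 4, 8
    attn = nn.MultiheadAttention(embed_dim=d, num_heads=2, batch_first=True)
    x1_cls, x1_patches = torch.randn(b, 1, d), torch.randn(b, n, d)
    x2_cls, x2_patches = torch.randn(b, 1, d), torch.randn(b, n, d)
    # branch-1 cls token attends over [its own cls, branch-2 patch tokens],
    # mirroring cat1_q / cat1_qkv above
    kv1 = torch.cat((x1_cls, x2_patches), dim=1)
    upd1, _ = attn(x1_cls, kv1, kv1)
    x1_cls = x1_cls + upd1
    # symmetric update for the branch-2 cls token
    kv2 = torch.cat((x2_cls, x1_patches), dim=1)
    upd2, _ = attn(x2_cls, kv2, kv2)
    x2_cls = x2_cls + upd2
    return x1_cls.shape, x2_cls.shape  # both torch.Size([b, 1, d])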


class SSTViT(nn.Module):
    def __init__(self, image_size, near_band, num_patches, num_classes, dim, depth, heads, mlp_dim, b_dim, b_depth, b_heads, b_dim_head, b_mlp_head, pool='cls', channels=1, dim_head=16, dropout=0., emb_dropout=0., multi_scale_enc_depth=1):
        super().__init__()

        # each patch is flattened to image_size * image_size * near_band values
        patch_dim = image_size ** 2 * near_band
        self.num_patches = num_patches + 1  # +1 for the cls token

        self.pos_embedding = nn.Parameter(torch.randn(1, self.num_patches, dim))
        self.patch_to_embedding = nn.Linear(patch_dim, dim)
        # one learnable cls token per temporal branch
        self.cls_token_t1 = nn.Parameter(torch.randn(1, 1, dim))
        self.cls_token_t2 = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.multi_scale_transformers = nn.ModuleList([])
        for _ in range(multi_scale_enc_depth):
            self.multi_scale_transformers.append(SSTTransformerEncoder(dim, depth, heads, dim_head, mlp_dim, b_dim, b_depth, b_heads, b_dim_head, b_mlp_head, self.num_patches,
                                                                       dropout=0.))

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(b_dim),
            nn.Linear(b_dim, num_classes)
        )

    def forward(self, x1, x2):
        # input patches: [batch, patch_num, patch_size*patch_size*c], e.g. [batch, 200, 145*145]
        # x = rearrange(x, 'b c h w -> b c (h w)')
        # embed every patch vector to the embedding size: [batch, patch_num, embedding_size]
        x1 = self.patch_to_embedding(x1)  # [b, n, dim]
        x2 = self.patch_to_embedding(x2)
        b, n, _ = x1.shape

        # prepend the cls token and add the position embedding
        cls_tokens_t1 = repeat(self.cls_token_t1, '() n d -> b n d', b=b)  # [b, 1, dim]
        cls_tokens_t2 = repeat(self.cls_token_t2, '() n d -> b n d', b=b)

        x1 = torch.cat((cls_tokens_t1, x1), dim=1)  # [b, n+1, dim]
        x1 += self.pos_embedding[:, :(n + 1)]
        x1 = self.dropout(x1)

        x2 = torch.cat((cls_tokens_t2, x2), dim=1)  # [b, n+1, dim]
        x2 += self.pos_embedding[:, :(n + 1)]
        x2 = self.dropout(x2)

        # transformer encoder(s): x[b, n+1, dim] -> fused cls tokens of both branches
        for multi_scale_transformer in self.multi_scale_transformers:
            out1, out2 = multi_scale_transformer(x1, x2)

        # classification: use the cls-token output of each branch and sum them
        out1 = self.to_latent(out1[:, 0])
        out2 = self.to_latent(out2[:, 0])
        out = out1 + out2

        # MLP classification head
        return self.mlp_head(out)
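

# --- Usage sketch ---
# A dummy bi-temporal forward pass. The hyperparameter values below are
# illustrative assumptions, not values from the original repo, and running
# this requires module.py to provide SSTransformer / CrossAttention with the
# interfaces used above (dim is set equal to b_dim here to keep the cls-token
# dimension consistent with the LayerNorm in mlp_head).
if __name__ == "__main__":
    model = SSTViT(
        image_size=5,           # patch spatial size -> patch_dim = 5 * 5 * near_band
        near_band=3,
        num_patches=10,         # tokens per input sequence (before the cls token)
        num_classes=2,
        dim=64, depth=2, heads=4, mlp_dim=8,
        b_dim=64, b_depth=2, b_heads=4, b_dim_head=16, b_mlp_head=8,
    )
    x_t1 = torch.randn(4, 10, 5 * 5 * 3)   # [batch, num_patches, patch_dim]
    x_t2 = torch.randn(4, 10, 5 * 5 * 3)
    logits = model(x_t1, x_t2)             # expected shape: [4, num_classes]
    print(logits.shape)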