# seminar-demo/sstvit.py
import torch
from torch import nn, einsum
import torch.nn.functional as F
from einops import rearrange, repeat
from einops.layers.torch import Rearrange
from module import Attention, PreNorm, FeedForward, CrossAttention, SSTransformer
import numpy as np


class SSTTransformerEncoder(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, b_dim, b_depth, b_heads, b_dim_head, b_mlp_head, num_patches, cross_attn_depth=3, cross_attn_heads=8, dropout=0.):
        super().__init__()
        # spectral-spatial transformer shared by both temporal branches
        self.transformer = SSTransformer(dim, depth, heads, dim_head, mlp_dim, b_dim, b_depth, b_heads, b_dim_head, b_mlp_head, num_patches, dropout)
        # stack of cross-attention layers that exchange cls-token information between the two branches
        self.cross_attn_layers = nn.ModuleList([])
        for _ in range(cross_attn_depth):
            self.cross_attn_layers.append(PreNorm(b_dim, CrossAttention(b_dim, heads=cross_attn_heads, dim_head=dim_head, dropout=0.)))

    def forward(self, x1, x2):
        # encode each branch independently with the shared transformer
        x1 = self.transformer(x1)
        x2 = self.transformer(x2)
        for cross_attn in self.cross_attn_layers:
            # split each sequence into its cls token and patch tokens
            x1_class = x1[:, 0]
            x1 = x1[:, 1:]
            x2_class = x2[:, 0]
            x2 = x2[:, 1:]

            # cross attention: the cls token of branch 1 attends over the patch tokens of branch 2
            cat1_q = x1_class.unsqueeze(1)
            cat1_qkv = torch.cat((cat1_q, x2), dim=1)
            cat1_out = cat1_q + cross_attn(cat1_qkv)
            x1 = torch.cat((cat1_out, x1), dim=1)

            # and symmetrically, the cls token of branch 2 attends over the tokens of branch 1
            cat2_q = x2_class.unsqueeze(1)
            cat2_qkv = torch.cat((cat2_q, x1), dim=1)
            cat2_out = cat2_q + cross_attn(cat2_qkv)
            x2 = torch.cat((cat2_out, x2), dim=1)
        # return only the fused cls tokens of the two branches, each of shape [b, 1, dim]
        return cat1_out, cat2_out
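

# --- Illustration only (not used by the model) ---
# A minimal sketch of the cls-token cross-attention exchange performed in
# SSTTransformerEncoder.forward, using torch.nn.MultiheadAttention as a
# stand-in for the CrossAttention module imported from module.py (whose exact
# implementation is not shown in this file). It assumes CrossAttention maps
# the concatenated [cls, patch tokens] sequence to an update for the cls token.
def _cross_token_exchange_sketch():
    b, n, d = 2, 4, 8
    attn = nn.MultiheadAttention(embed_dim=d, num_heads=2, batch_first=True)
    x1_cls, x1_patches = torch.randn(b, 1, d), torch.randn(b, n, d)
    x2_cls, x2_patches = torch.randn(b, 1, d), torch.randn(b, n, d)
    # branch-1 cls token attends over [its own cls, branch-2 patch tokens],
    # mirroring cat1_q / cat1_qkv above
    kv1 = torch.cat((x1_cls, x2_patches), dim=1)
    upd1, _ = attn(x1_cls, kv1, kv1)
    x1_cls = x1_cls + upd1
    # symmetric update for the branch-2 cls token
    kv2 = torch.cat((x2_cls, x1_patches), dim=1)
    upd2, _ = attn(x2_cls, kv2, kv2)
    x2_cls = x2_cls + upd2
    return x1_cls.shape, x2_cls.shape  # both torch.Size([b, 1, d])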


class SSTViT(nn.Module):
    def __init__(self, image_size, near_band, num_patches, num_classes, dim, depth, heads, mlp_dim, b_dim, b_depth, b_heads, b_dim_head, b_mlp_head, pool='cls', channels=1, dim_head=16, dropout=0., emb_dropout=0., multi_scale_enc_depth=1):
        super().__init__()

        # each patch is flattened to image_size * image_size * near_band values
        patch_dim = image_size ** 2 * near_band
        self.num_patches = num_patches + 1  # +1 for the cls token

        self.pos_embedding = nn.Parameter(torch.randn(1, self.num_patches, dim))
        self.patch_to_embedding = nn.Linear(patch_dim, dim)
        # one learnable cls token per temporal branch
        self.cls_token_t1 = nn.Parameter(torch.randn(1, 1, dim))
        self.cls_token_t2 = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.multi_scale_transformers = nn.ModuleList([])
        for _ in range(multi_scale_enc_depth):
            self.multi_scale_transformers.append(SSTTransformerEncoder(dim, depth, heads, dim_head, mlp_dim, b_dim, b_depth, b_heads, b_dim_head, b_mlp_head, self.num_patches,
                                                                       dropout=0.))

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(b_dim),
            nn.Linear(b_dim, num_classes)
        )

    def forward(self, x1, x2):
        # input patches: [batch, patch_num, patch_size*patch_size*c], e.g. [batch, 200, 145*145]
        # x = rearrange(x, 'b c h w -> b c (h w)')
        # embed every patch vector to the embedding size: [batch, patch_num, embedding_size]
        x1 = self.patch_to_embedding(x1)  # [b, n, dim]
        x2 = self.patch_to_embedding(x2)
        b, n, _ = x1.shape

        # prepend the cls token and add the position embedding
        cls_tokens_t1 = repeat(self.cls_token_t1, '() n d -> b n d', b=b)  # [b, 1, dim]
        cls_tokens_t2 = repeat(self.cls_token_t2, '() n d -> b n d', b=b)

        x1 = torch.cat((cls_tokens_t1, x1), dim=1)  # [b, n+1, dim]
        x1 += self.pos_embedding[:, :(n + 1)]
        x1 = self.dropout(x1)

        x2 = torch.cat((cls_tokens_t2, x2), dim=1)  # [b, n+1, dim]
        x2 += self.pos_embedding[:, :(n + 1)]
        x2 = self.dropout(x2)

        # transformer encoder(s): x[b, n+1, dim] -> fused cls tokens of both branches
        for multi_scale_transformer in self.multi_scale_transformers:
            out1, out2 = multi_scale_transformer(x1, x2)

        # classification: use the cls-token output of each branch and sum them
        out1 = self.to_latent(out1[:, 0])
        out2 = self.to_latent(out2[:, 0])
        out = out1 + out2

        # MLP classification head
        return self.mlp_head(out)
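

# --- Usage sketch ---
# A dummy bi-temporal forward pass. The hyperparameter values below are
# illustrative assumptions, not values from the original repo, and running
# this requires module.py to provide SSTransformer / CrossAttention with the
# interfaces used above (dim is set equal to b_dim here to keep the cls-token
# dimension consistent with the LayerNorm in mlp_head).
if __name__ == "__main__":
    model = SSTViT(
        image_size=5,           # patch spatial size -> patch_dim = 5 * 5 * near_band
        near_band=3,
        num_patches=10,         # tokens per input sequence (before the cls token)
        num_classes=2,
        dim=64, depth=2, heads=4, mlp_dim=8,
        b_dim=64, b_depth=2, b_heads=4, b_dim_head=16, b_mlp_head=8,
    )
    x_t1 = torch.randn(4, 10, 5 * 5 * 3)   # [batch, num_patches, patch_dim]
    x_t2 = torch.randn(4, 10, 5 * 5 * 3)
    logits = model(x_t1, x_t2)             # expected shape: [4, num_classes]
    print(logits.shape)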