Update model.py
model.py CHANGED
@@ -42,6 +42,27 @@ class ImageEncoder(nn.Module):
 
         return l2_norm
 
+class ImageEncoder_attn(nn.Module):
+    def __init__(self, embed_dim):
+        super(ImageEncoder_attn, self).__init__()
+        # Load a pretrained VGG19 model
+        self.model = models.vgg19(pretrained=True).features
+        # Adding a 1x1 convolutional layer to map features to the desired embedding dimension
+        self.conv = nn.Conv2d(512, embed_dim, kernel_size=1)
+
+    def forward(self, image):
+        # Extracting spatial features of the image using the modified VGG19 model
+        with torch.no_grad():  # Freezing the weights of the pretrained model during this pass
+            img_features = self.model(image)  # Shape: (batch_size, 512, H, W)
+
+        # Map features to the desired embedding dimension
+        img_features = self.conv(img_features)  # Shape: (batch_size, embed_dim, H, W)
+
+        # Flatten spatial dimensions to get per-region features
+        img_features = img_features.flatten(2).permute(0, 2, 1)  # Shape: (batch_size, num_regions, embed_dim)
+
+        return img_features
+
 class QuesEncoder(nn.Module):
     def __init__(self, ques_vocab_size, word_embed, hidden_size, num_hidden, qu_feature_size):
         super(QuesEncoder, self).__init__()
@@ -137,3 +158,63 @@ class VQAModel(nn.Module):
         logits = self.fc2(combined_feature)
 
         return logits
+
+class VQAModel_attn(nn.Module):
+    def __init__(self, feature_size, ques_vocab_size, ans_vocab_size, word_embed, hidden_size, num_hidden):
+        super(VQAModel_attn, self).__init__()
+
+        # Encoder to extract image features
+        self.img_encoder = ImageEncoder_attn(feature_size)
+
+        # Encoder to extract question features
+        self.ques_encoder = QuesEncoder(ques_vocab_size, word_embed, hidden_size, num_hidden, feature_size)
+
+        # Attention mechanism layers
+        self.attention_fc = nn.Linear(2 * feature_size, 1)  # For compatibility scoring
+
+        # Dropout layer
+        self.dropout = nn.Dropout(0.5)
+
+        # Fully connected layers for answer prediction
+        self.fc1 = nn.Linear(feature_size, ans_vocab_size)
+        self.fc2 = nn.Linear(ans_vocab_size, ans_vocab_size)
+
+    def forward(self, image, question):
+        # Extract image features (batch_size, num_regions, feature_size)
+        img_features = self.img_encoder(image)
+
+        # Extract question features (batch_size, feature_size)
+        qst_feature = self.ques_encoder(question)
+
+        # Ensure qst_feature has the correct dimensions
+        # Expand to (batch_size, 1, feature_size), then repeat to match num_regions
+        qst_feature_exp = qst_feature.unsqueeze(1).expand(-1, img_features.size(1), -1)
+
+        #print(f"img_features shape: {img_features.shape}")
+        #print(f"qst_feature shape: {qst_feature.shape}")
+        #print(f"qst_feature_exp shape: {qst_feature_exp.shape}")
+        # Concatenate image and question features along the last dimension
+        # Shape: (batch_size, num_regions, 2 * feature_size)
+        combined_features = torch.cat([img_features, qst_feature_exp], dim=-1)
+
+        # Compute attention scores for each region
+        # Shape: (batch_size, num_regions, 1)
+        attention_scores = self.attention_fc(combined_features)
+
+        # Apply softmax to get attention weights
+        # Shape: (batch_size, num_regions)
+        attention_weights = F.softmax(attention_scores.squeeze(-1), dim=1)
+
+        # Compute the weighted sum of image features
+        # Shape: (batch_size, feature_size)
+        attended_img_feature = torch.sum(img_features * attention_weights.unsqueeze(-1), dim=1)
+
+        # Combine attended image features with question features
+        combined_feature = attended_img_feature + qst_feature
+
+        # Dropout and fully connected layers for answer prediction
+        combined_feature = self.dropout(combined_feature)
+        combined_feature = F.relu(self.fc1(combined_feature))
+        logits = self.fc2(combined_feature)
+
+        return logits
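
The new classes wire a simple concatenation-based attention over VGG19 feature-map regions: each region vector is concatenated with the question vector, scored by a single linear layer, softmax-normalized across regions, and used to form a weighted sum that is added back to the question feature before the classifier. As a quick smoke test of this path, the sketch below runs VQAModel_attn on dummy tensors and checks the output shape. It is a minimal sketch, not part of the commit: all sizes (feature_size=1024, vocab sizes, 224x224 images, 14-token questions) are assumed values, and it assumes QuesEncoder takes a LongTensor of token ids and returns a (batch_size, feature_size) vector, as the existing VQAModel already expects.

    # Hypothetical smoke test for the attention-based model (illustrative values only)
    import torch
    from model import VQAModel_attn  # assumes this file is importable as model.py

    model = VQAModel_attn(
        feature_size=1024,      # assumed shared embedding size for image regions and question
        ques_vocab_size=10000,  # assumed question vocabulary size
        ans_vocab_size=1000,    # assumed number of answer classes
        word_embed=300,         # assumed word-embedding dimension
        hidden_size=512,        # assumed LSTM hidden size
        num_hidden=2,           # assumed number of LSTM layers
    )
    model.eval()

    images = torch.randn(2, 3, 224, 224)          # VGG19 features give a 7x7 grid here -> 49 regions
    questions = torch.randint(0, 10000, (2, 14))  # batch of 2 questions, 14 token ids each (length assumed)

    with torch.no_grad():
        logits = model(images, questions)

    print(logits.shape)  # expected: torch.Size([2, 1000]), one score per answer class

Note that because ImageEncoder_attn wraps its VGG19 pass in torch.no_grad(), only the 1x1 conv, the question encoder, the attention scorer, and the classifier heads receive gradients during training.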