ZubairAhmed777 committed
Commit be4c742 (verified), 1 parent: f0bd440

Create model.py

Files changed (1): model.py (+139, -0)
model.py ADDED
@@ -0,0 +1,139 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
class ImageEncoder(nn.Module):
    def __init__(self, embed_dim):
        super(ImageEncoder, self).__init__()
        # Load a pretrained VGG19 model
        # (on newer torchvision, use weights=models.VGG19_Weights.DEFAULT instead of pretrained=True)
        self.model = models.vgg19(pretrained=True)
        # Number of input features of VGG19's final classification layer
        in_features = self.model.classifier[-1].in_features
        # Drop the final fully connected layer (the 1000-way classification head)
        self.model.classifier = nn.Sequential(*list(self.model.classifier.children())[:-1])
        # New fully connected layer mapping the VGG features to the desired embedding dimension
        self.fc = nn.Linear(in_features, embed_dim)

    def forward(self, image):
        # Extract image features with the truncated VGG19; no_grad keeps the
        # pretrained backbone frozen during this pass
        with torch.no_grad():
            img_feature = self.model(image)  # (batch_size, in_features)

        # Project the features to the embedding dimension
        img_feature = self.fc(img_feature)  # (batch_size, embed_dim)

        # L2-normalize along the feature dimension for better similarity comparisons.
        # Note: detach() also stops gradients from flowing back into self.fc here.
        l2_norm = F.normalize(img_feature, p=2, dim=1).detach()

        return l2_norm

class QuesEncoder(nn.Module):
    def __init__(self, ques_vocab_size, word_embed, hidden_size, num_hidden, qu_feature_size):
        super(QuesEncoder, self).__init__()
        # Embedding layer mapping question word indices to word embeddings
        self.word_embedding = nn.Embedding(ques_vocab_size, word_embed)
        # Activation to add non-linearity to the embeddings
        self.tanh = nn.Tanh()
        # LSTM for sequential processing of the question embeddings
        self.lstm = nn.LSTM(word_embed, hidden_size, num_hidden)  # (input_dim, hidden_dim, num_layers)
        # Fully connected layer transforming the concatenated LSTM states to the desired feature size
        self.fc = nn.Linear(2 * num_hidden * hidden_size, qu_feature_size)

    def forward(self, question):
        # Map question word indices to embeddings: (batch_size, question_length, word_embed)
        ques_embedding = self.word_embedding(question)
        # Apply Tanh to the embeddings
        ques_embedding = self.tanh(ques_embedding)
        # Transpose for the LSTM, which expects (question_length, batch_size, word_embed)
        ques_embedding = ques_embedding.transpose(0, 1)
        # Run the LSTM; hidden and cell have shape (num_layers, batch_size, hidden_size)
        _, (hidden, cell) = self.lstm(ques_embedding)
        # Concatenate hidden and cell states along the feature dimension:
        # (num_layers, batch_size, 2 * hidden_size)
        ques_feature = torch.cat((hidden, cell), dim=2)
        # Batch-first format: (batch_size, num_layers, 2 * hidden_size)
        ques_feature = ques_feature.transpose(0, 1)
        # Flatten: (batch_size, num_layers * 2 * hidden_size)
        ques_feature = ques_feature.reshape(ques_feature.size(0), -1)
        # Tanh non-linearity on the flattened features
        ques_feature = self.tanh(ques_feature)
        # Project to the desired output size: (batch_size, qu_feature_size)
        ques_feature = self.fc(ques_feature)

        return ques_feature

class VQAModel(nn.Module):
    def __init__(self, feature_size, ques_vocab_size, ans_vocab_size, word_embed, hidden_size, num_hidden):
        super(VQAModel, self).__init__()

        # Encoder extracting image features
        self.img_encoder = ImageEncoder(feature_size)

        # Encoder extracting question features
        self.ques_encoder = QuesEncoder(ques_vocab_size, word_embed, hidden_size, num_hidden, feature_size)

        # Dropout to prevent overfitting
        self.dropout = nn.Dropout(0.5)

        # Tanh activation for non-linearity
        self.tanh = nn.Tanh()

        # Fully connected layer mapping the combined features to the answer space
        self.fc1 = nn.Linear(feature_size, ans_vocab_size)

        # Second fully connected layer refining the logits in the answer space
        self.fc2 = nn.Linear(ans_vocab_size, ans_vocab_size)

    def forward(self, image, question):
        # Image features: (batch_size, feature_size)
        img_feature = self.img_encoder(image)

        # Question features: (batch_size, feature_size)
        qst_feature = self.ques_encoder(question)

        # Fuse image and question features element-wise (Hadamard product): (batch_size, feature_size)
        combined_feature = img_feature * qst_feature

        # Dropout for regularization
        combined_feature = self.dropout(combined_feature)

        # Tanh non-linearity
        combined_feature = self.tanh(combined_feature)

        # Map the combined features to the answer space: (batch_size, ans_vocab_size)
        combined_feature = self.fc1(combined_feature)

        # Second round of dropout
        combined_feature = self.dropout(combined_feature)

        # Tanh non-linearity again
        combined_feature = self.tanh(combined_feature)

        # Refine the logits with the second fully connected layer: (batch_size, ans_vocab_size)
        logits = self.fc2(combined_feature)

        return logits
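
For reference, a minimal smoke test of the forward pass might look like the sketch below. The hyperparameter values (feature size, vocabulary sizes, hidden size, sequence length) are illustrative assumptions, not values taken from this repository.

import torch
from model import VQAModel

# Illustrative hyperparameters (assumed values, not from this repo)
feature_size = 1024      # joint image/question embedding size
ques_vocab_size = 10000  # question vocabulary size
ans_vocab_size = 1000    # number of candidate answers
word_embed = 300         # word embedding dimension
hidden_size = 512        # LSTM hidden size
num_hidden = 2           # number of LSTM layers

model = VQAModel(feature_size, ques_vocab_size, ans_vocab_size,
                 word_embed, hidden_size, num_hidden)
model.eval()  # disable dropout for the smoke test

# Dummy batch: 2 RGB images at 224x224 and 2 questions of 14 token indices each
images = torch.randn(2, 3, 224, 224)
questions = torch.randint(0, ques_vocab_size, (2, 14))

with torch.no_grad():
    logits = model(images, questions)

print(logits.shape)  # expected: torch.Size([2, 1000])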