morpheushoc committed on
Commit 34975af · verified · 1 Parent(s): 0248af2

Upload modeling_qformer.py with huggingface_hub

Files changed (1):
  1. modeling_qformer.py +1265 -0

modeling_qformer.py ADDED
@@ -0,0 +1,1265 @@
"""
 * Copyright (c) 2023, salesforce.com, inc.
 * All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
 * By Junnan Li
 * Based on huggingface code base
 * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert
"""
import logging
import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Dict, Any

import torch
import torch.utils.checkpoint
import torch.nn.functional as F
from torch import Tensor, device, dtype, nn
from torch.nn import CrossEntropyLoss

from timm.models.layers import drop_path
from transformers.activations import ACT2FN
from transformers.file_utils import (
    ModelOutput,
)
from transformers.modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    NextSentencePredictorOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from transformers.modeling_utils import (
    PreTrainedModel,
    apply_chunking_to_forward,
    find_pruneable_heads_and_indices,
    prune_linear_layer,
)
from transformers.models.bert.configuration_bert import BertConfig

logger = logging.getLogger(__name__)


class BertEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
        )
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size
        )

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))
        )
        self.position_embedding_type = getattr(
            config, "position_embedding_type", "absolute"
        )

        self.config = config

    def forward(
        self,
        input_ids=None,
        position_ids=None,
        query_embeds=None,
        past_key_values_length=0,
    ):
        if input_ids is not None:
            seq_length = input_ids.size()[1]
        else:
            seq_length = 0

        if position_ids is None:
            position_ids = self.position_ids[
                :, past_key_values_length : seq_length + past_key_values_length
            ].clone()

        if input_ids is not None:
            embeddings = self.word_embeddings(input_ids)
            if self.position_embedding_type == "absolute":
                position_embeddings = self.position_embeddings(position_ids)
                embeddings = embeddings + position_embeddings

            if query_embeds is not None:
                embeddings = torch.cat((query_embeds, embeddings), dim=1)
        else:
            embeddings = query_embeds

        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
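
# Sequence layout produced above: when both `query_embeds` and `input_ids` are given, the
# learned query tokens are prepended to the text embeddings, so the encoder sees
#   [query_0 ... query_{Q-1}, tok_0 ... tok_{T-1}]  with shape (batch, Q + T, hidden_size);
# when `input_ids` is None, the Q-Former runs on the query tokens alone.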


class BertSelfAttention(nn.Module):
    def __init__(self, config, is_cross_attention):
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
            config, "embedding_size"
        ):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        if is_cross_attention:
            self.key = nn.Linear(config.encoder_width, self.all_head_size)
            self.value = nn.Linear(config.encoder_width, self.all_head_size)
        else:
            self.key = nn.Linear(config.hidden_size, self.all_head_size)
            self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(
            config, "position_embedding_type", "absolute"
        )
        if (
            self.position_embedding_type == "relative_key"
            or self.position_embedding_type == "relative_key_query"
        ):
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(
                2 * config.max_position_embeddings - 1, self.attention_head_size
            )
        self.save_attention = False

    def save_attn_gradients(self, attn_gradients):
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        self.attention_map = attention_map

    def get_attention_map(self):
        return self.attention_map

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (
            self.num_attention_heads,
            self.attention_head_size,
        )
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):

        # If this is instantiated as a cross-attention module, the keys
        # and values come from an encoder; the attention mask needs to be
        # such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        mixed_query_layer = self.query(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)

        past_key_value = (key_layer, value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if (
            self.position_embedding_type == "relative_key"
            or self.position_embedding_type == "relative_key_query"
        ):
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(
                seq_length, dtype=torch.long, device=hidden_states.device
            ).view(-1, 1)
            position_ids_r = torch.arange(
                seq_length, dtype=torch.long, device=hidden_states.device
            ).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(
                distance + self.max_position_embeddings - 1
            )
            positional_embedding = positional_embedding.to(
                dtype=query_layer.dtype
            )  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum(
                    "bhld,lrd->bhlr", query_layer, positional_embedding
                )
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum(
                    "bhld,lrd->bhlr", query_layer, positional_embedding
                )
                relative_position_scores_key = torch.einsum(
                    "bhrd,lrd->bhlr", key_layer, positional_embedding
                )
                attention_scores = (
                    attention_scores
                    + relative_position_scores_query
                    + relative_position_scores_key
                )

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in BertModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        if is_cross_attention and self.save_attention:
            self.save_attention_map(attention_probs)
            attention_probs.register_hook(self.save_attn_gradients)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs_dropped = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs_dropped = attention_probs_dropped * head_mask

        context_layer = torch.matmul(attention_probs_dropped, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (
            (context_layer, attention_probs) if output_attentions else (context_layer,)
        )

        outputs = outputs + (past_key_value,)
        return outputs
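
# In the Q-Former, the cross-attention variant of BertSelfAttention projects keys and values
# from the frozen vision encoder output (width `config.encoder_width`), while the queries come
# from the query/text hidden states (width `config.hidden_size`); the self-attention variant
# projects all three from `hidden_size`, and only that path concatenates `past_key_value`
# caches when decoding.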


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return 'p={}'.format(self.drop_prob)


class BertSelfOutput(nn.Module):
    def __init__(self, config, drop_path=0.):
        super().__init__()
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.drop_path(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BertAttention(nn.Module):
    def __init__(self, config, is_cross_attention=False, drop_path=0.):
        super().__init__()
        self.self = BertSelfAttention(config, is_cross_attention)
        self.output = BertSelfOutput(config, drop_path=drop_path)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads,
            self.self.num_attention_heads,
            self.self.attention_head_size,
            self.pruned_heads,
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = (
            self.self.attention_head_size * self.self.num_attention_heads
        )
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[
            1:
        ]  # add attentions if we output them
        return outputs


class BertIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class BertOutput(nn.Module):
    def __init__(self, config, drop_path=0.):
        super().__init__()
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.drop_path(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BertLayer(nn.Module):
    def __init__(self, config, layer_num):
        super().__init__()
        self.config = config
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        drop_path = config.drop_path_list[layer_num]
        self.attention = BertAttention(config, drop_path=drop_path)
        self.layer_num = layer_num
        if (
            self.config.add_cross_attention
            and layer_num % self.config.cross_attention_freq == 0
        ):
            self.crossattention = BertAttention(
                config, is_cross_attention=self.config.add_cross_attention,
                drop_path=drop_path
            )
            self.has_cross_attention = True
        else:
            self.has_cross_attention = False
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config, drop_path=drop_path)

        self.intermediate_query = BertIntermediate(config)
        self.output_query = BertOutput(config, drop_path=drop_path)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        query_length=0,
    ):
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = (
            past_key_value[:2] if past_key_value is not None else None
        )
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:-1]

        present_key_value = self_attention_outputs[-1]

        if query_length > 0:
            query_attention_output = attention_output[:, :query_length, :]

            if self.has_cross_attention:
                assert (
                    encoder_hidden_states is not None
                ), "encoder_hidden_states must be given for cross-attention layers"
                cross_attention_outputs = self.crossattention(
                    query_attention_output,
                    attention_mask,
                    head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    output_attentions=output_attentions,
                )
                query_attention_output = cross_attention_outputs[0]
                outputs = (
                    outputs + cross_attention_outputs[1:-1]
                )  # add cross attentions if we output attention weights

            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk_query,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                query_attention_output,
            )
            if attention_output.shape[1] > query_length:
                layer_output_text = apply_chunking_to_forward(
                    self.feed_forward_chunk,
                    self.chunk_size_feed_forward,
                    self.seq_len_dim,
                    attention_output[:, query_length:, :],
                )
                layer_output = torch.cat([layer_output, layer_output_text], dim=1)
        else:
            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                attention_output,
            )
        outputs = (layer_output,) + outputs

        outputs = outputs + (present_key_value,)

        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output

    def feed_forward_chunk_query(self, attention_output):
        intermediate_output = self.intermediate_query(attention_output)
        layer_output = self.output_query(intermediate_output, attention_output)
        return layer_output
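
# Layer wiring, as implemented above: every layer self-attends over the full [query | text]
# sequence; cross-attention into `encoder_hidden_states` is inserted only every
# `config.cross_attention_freq` layers and only for the first `query_length` positions; the
# query positions then pass through their own FFN (`intermediate_query`/`output_query`) while
# any text positions use the standard FFN, and the two halves are re-concatenated along the
# sequence dimension.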


class BertEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
            [BertLayer(config, i) for i in range(config.num_hidden_layers)]
        )

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        query_length=0,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = (
            () if output_attentions and self.config.add_cross_attention else None
        )

        next_decoder_cache = () if use_cache else None

        for i in range(self.config.num_hidden_layers):
            layer_module = self.layer[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if getattr(self.config, "gradient_checkpointing", False) and self.training:

                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(
                            *inputs, past_key_value, output_attentions, query_length
                        )

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                    query_length,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class BertPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class BertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class BertLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states


class BertOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output):
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


class BertPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BertConfig
    base_model_prefix = "bert"
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


class BertModel(BertPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in `Attention is
    all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To be used as a decoder, the model needs the
    :obj:`is_decoder` argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is
    then expected as an input to the forward pass.
    """

    def __init__(self, config, add_pooling_layer=False):
        super().__init__(config)
        self.config = config

        self.embeddings = BertEmbeddings(config)

        self.encoder = BertEncoder(config)

        self.pooler = BertPooler(config) if add_pooling_layer else None

        self.init_weights()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def get_extended_attention_mask(
        self,
        attention_mask: Tensor,
        input_shape: Tuple[int],
        device: device,
        is_decoder: bool,
        has_query: bool = False,
    ) -> Tensor:
        """
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (:obj:`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (:obj:`Tuple[int]`):
                The shape of the input to the model.
            device: (:obj:`torch.device`):
                The device of the input to the model.

        Returns:
            :obj:`torch.Tensor` The extended attention mask, with the same dtype as :obj:`attention_mask.dtype`.
        """
        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length]
            # - if the model is a decoder, apply a causal mask in addition to the padding mask
            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
            if is_decoder:
                batch_size, seq_length = input_shape

                seq_ids = torch.arange(seq_length, device=device)
                causal_mask = (
                    seq_ids[None, None, :].repeat(batch_size, seq_length, 1)
                    <= seq_ids[None, :, None]
                )

                # add a prefix ones mask to the causal mask
                # causal and attention masks must have same type with pytorch version < 1.3
                causal_mask = causal_mask.to(attention_mask.dtype)

                if causal_mask.shape[1] < attention_mask.shape[1]:
                    prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
                    if has_query:  # UniLM style attention mask
                        causal_mask = torch.cat(
                            [
                                torch.zeros(
                                    (batch_size, prefix_seq_len, seq_length),
                                    device=device,
                                    dtype=causal_mask.dtype,
                                ),
                                causal_mask,
                            ],
                            axis=1,
                        )
                    causal_mask = torch.cat(
                        [
                            torch.ones(
                                (batch_size, causal_mask.shape[1], prefix_seq_len),
                                device=device,
                                dtype=causal_mask.dtype,
                            ),
                            causal_mask,
                        ],
                        axis=-1,
                    )
                extended_attention_mask = (
                    causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
                )
            else:
                extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
                    input_shape, attention_mask.shape
                )
            )

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(
            dtype=self.dtype
        )  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask
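
    # Shape sketch for get_extended_attention_mask: a (batch, seq_len) padding mask is
    # broadcast to (batch, 1, 1, seq_len); in decoder mode it becomes a
    # (batch, 1, tgt_len, src_len) causal mask in which the prefix/query positions stay
    # fully visible (the "UniLM style" branch above). Values are 0.0 where attention is
    # allowed and -10000.0 where it is blocked, so the mask can simply be added to the
    # raw attention scores.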

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        head_mask=None,
        query_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        is_decoder=False,
    ):
        r"""
        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        """
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # use_cache = use_cache if use_cache is not None else self.config.use_cache

        if input_ids is None:
            assert (
                query_embeds is not None
            ), "You have to specify query_embeds when input_ids is None"

        # past_key_values_length
        past_key_values_length = (
            past_key_values[0][0].shape[2] - self.config.query_length
            if past_key_values is not None
            else 0
        )

        query_length = query_embeds.shape[1] if query_embeds is not None else 0

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            query_embeds=query_embeds,
            past_key_values_length=past_key_values_length,
        )

        input_shape = embedding_output.size()[:-1]
        batch_size, seq_length = input_shape
        device = embedding_output.device

        if attention_mask is None:
            attention_mask = torch.ones(
                ((batch_size, seq_length + past_key_values_length)), device=device
            )

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        if is_decoder:
            extended_attention_mask = self.get_extended_attention_mask(
                attention_mask,
                input_ids.shape,
                device,
                is_decoder,
                has_query=(query_embeds is not None),
            )
        else:
            extended_attention_mask = self.get_extended_attention_mask(
                attention_mask, input_shape, device, is_decoder
            )

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if encoder_hidden_states is not None:
            if type(encoder_hidden_states) == list:
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[
                    0
                ].size()
            else:
                (
                    encoder_batch_size,
                    encoder_sequence_length,
                    _,
                ) = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

            if type(encoder_attention_mask) == list:
                encoder_extended_attention_mask = [
                    self.invert_attention_mask(mask) for mask in encoder_attention_mask
                ]
            elif encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
                encoder_extended_attention_mask = self.invert_attention_mask(
                    encoder_attention_mask
                )
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(
                    encoder_attention_mask
                )
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            query_length=query_length,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = (
            self.pooler(sequence_output) if self.pooler is not None else None
        )

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )


class BertLMHeadModel(BertPreTrainedModel):

    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        head_mask=None,
        query_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        labels=None,
        past_key_values=None,
        use_cache=True,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        return_logits=False,
        is_decoder=True,
        reduction="mean",
    ):
        r"""
        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
            ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        Returns:
        Example::
            >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
            >>> import torch
            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
            >>> config = BertConfig.from_pretrained("bert-base-cased")
            >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)
            >>> prediction_logits = outputs.logits
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        if labels is not None:
            use_cache = False
        if past_key_values is not None:
            query_embeds = None

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            query_embeds=query_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            is_decoder=is_decoder,
        )

        sequence_output = outputs[0]
        if query_embeds is not None:
            sequence_output = outputs[0][:, query_embeds.shape[1] :, :]

        prediction_scores = self.cls(sequence_output)

        if return_logits:
            return prediction_scores[:, :-1, :].contiguous()

        lm_loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
            lm_loss = loss_fct(
                shifted_prediction_scores.view(-1, self.config.vocab_size),
                labels.view(-1),
            )
            if reduction == "none":
                lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, query_embeds, past=None, attention_mask=None, **model_kwargs
    ):
        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_ids.shape)
        query_mask = input_ids.new_ones(query_embeds.shape[:-1])
        attention_mask = torch.cat([query_mask, attention_mask], dim=-1)

        # cut decoder_input_ids if past is used
        if past is not None:
            input_ids = input_ids[:, -1:]

        return {
            "input_ids": input_ids,
            "query_embeds": query_embeds,
            "attention_mask": attention_mask,
            "past_key_values": past,
            "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
            "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
            "is_decoder": True,
        }

    def _reorder_cache(self, past, beam_idx):
        reordered_past = ()
        for layer_past in past:
            reordered_past += (
                tuple(
                    past_state.index_select(0, beam_idx) for past_state in layer_past
                ),
            )
        return reordered_past


class BertForMaskedLM(BertPreTrainedModel):

    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        head_mask=None,
        query_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        return_logits=False,
        is_decoder=False,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
            config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
        """

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            query_embeds=query_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            is_decoder=is_decoder,
        )

        sequence_output = outputs[0]
        if query_embeds is not None:
            sequence_output = sequence_output[:, query_embeds.shape[1] :, :]
        prediction_scores = self.cls(sequence_output)

        if return_logits:
            return prediction_scores

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(
                prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)
            )

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return (
                ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
            )

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


def build_qformer(num_query_token, vision_width,
                  qformer_hidden_dropout_prob=0.1,
                  qformer_attention_probs_dropout_prob=0.1,
                  qformer_drop_path_rate=0.,
                  bert_type="bert-base-uncased"):
    encoder_config = BertConfig.from_pretrained(bert_type)
    encoder_config.encoder_width = vision_width
    # insert cross-attention layer every other block
    encoder_config.add_cross_attention = True
    encoder_config.cross_attention_freq = 2
    encoder_config.query_length = num_query_token
    encoder_config.hidden_dropout_prob = qformer_hidden_dropout_prob
    encoder_config.attention_probs_dropout_prob = qformer_attention_probs_dropout_prob
    encoder_config.drop_path_list = [
        x.item() for x in torch.linspace(0, qformer_drop_path_rate, encoder_config.num_hidden_layers)
    ]
    logger.info(f"Drop_path: {encoder_config.drop_path_list}")
    logger.info(encoder_config)
    Qformer = BertLMHeadModel(encoder_config)
    query_tokens = nn.Parameter(
        torch.zeros(1, num_query_token, encoder_config.hidden_size)
    )
    query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range)
    return Qformer, query_tokens
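

if __name__ == "__main__":
    # Minimal usage sketch for build_qformer. The vision width (1408) and patch count (257)
    # below are assumed placeholder values for a frozen ViT-style encoder; the Q-Former only
    # requires `vision_width` to match the width of the real visual features.
    qformer, query_tokens = build_qformer(
        num_query_token=32,
        vision_width=1408,
        qformer_drop_path_rate=0.0,
        bert_type="bert-base-uncased",
    )
    image_embeds = torch.randn(2, 257, 1408)  # (batch, num_patches, vision_width), dummy features
    image_atts = torch.ones(image_embeds.shape[:-1], dtype=torch.long)
    query = query_tokens.expand(image_embeds.shape[0], -1, -1)
    out = qformer.bert(
        query_embeds=query,
        encoder_hidden_states=image_embeds,
        encoder_attention_mask=image_atts,
        return_dict=True,
    )
    # One hidden state per query token: torch.Size([2, 32, 768]) with bert-base-uncased.
    print(out.last_hidden_state.shape)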