silk-road committed on
Commit 3fdabda · 1 Parent(s): 89f47a8

Update models.py

Files changed (1):
  1. models.py +13 -107
models.py CHANGED
@@ -3,32 +3,10 @@ import torch.nn as nn
  import torch.nn.functional as F
  import torch.distributed as dist

- from simcse.modeling_glm import GLMModel, GLMPreTrainedModel
- import simcse.mse_loss
-
- import transformers
- from transformers import RobertaTokenizer, AutoModel, PreTrainedModel
  from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel, RobertaModel, RobertaLMHead
  from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel, BertLMPredictionHead
- from transformers.activations import gelu
- from transformers.file_utils import (
-     add_code_sample_docstrings,
-     add_start_docstrings,
-     add_start_docstrings_to_model_forward,
-     replace_return_docstrings,
- )
  from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutputWithPoolingAndCrossAttentions

- glm_model = None
-
- def init_glm(path):
-     global glm_model
-     glm_model = GLMModel.from_pretrained(path, trust_remote_code=True).to("cuda:0")
-     for param in glm_model.parameters():
-         param.requires_grad = False
-
-
-
  class MLPLayer(nn.Module):
      """
      Head for getting sentence representations over RoBERTa/BERT's CLS representation.
@@ -37,7 +15,6 @@ class MLPLayer(nn.Module):
      def __init__(self, config):
          super().__init__()
          self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-         # 1536
          self.fc = nn.Linear(config.hidden_size, 1536)
          self.activation = nn.Tanh()

@@ -45,14 +22,12 @@ class MLPLayer(nn.Module):
          x = self.dense(features)
          x = self.fc(x)
          x = self.activation(x)
-
          return x

  class Similarity(nn.Module):
      """
      Dot product or cosine similarity
      """
-
      def __init__(self, temp):
          super().__init__()
          self.temp = temp
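The MLP head above projects the encoder's CLS representation from `hidden_size` to a fixed 1536 dimensions, which lines up with the 1536-dimensional `left_emb`/`right_emb` teacher vectors used in `cl_forward` below (the variable names suggest OpenAI embeddings; the exact teacher model is not named in the diff). A minimal, self-contained sketch of the head's shape behavior, assuming `hidden_size=768` (the class name and sizes here are illustrative):

    import torch
    import torch.nn as nn

    class MLPHeadSketch(nn.Module):
        """Mirrors MLPLayer above: hidden_size -> hidden_size -> 1536, then Tanh."""
        def __init__(self, hidden_size=768):
            super().__init__()
            self.dense = nn.Linear(hidden_size, hidden_size)
            self.fc = nn.Linear(hidden_size, 1536)
            self.activation = nn.Tanh()

        def forward(self, features):
            return self.activation(self.fc(self.dense(features)))

    cls_vec = torch.randn(8, 768)            # a batch of [CLS] representations
    print(MLPHeadSketch()(cls_vec).shape)    # torch.Size([8, 1536])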
@@ -80,7 +55,7 @@ class Pooler(nn.Module):

      def forward(self, attention_mask, outputs):
          last_hidden = outputs.last_hidden_state
-         # pooler_output = outputs.pooler_output
+         pooler_output = outputs.pooler_output
          hidden_states = outputs.hidden_states

          if self.pooler_type in ['cls_before_pooler', 'cls']:
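The one-line change above re-enables `pooler_output = outputs.pooler_output`; the pooling branches themselves fall outside this diff. As a rough, SimCSE-style sketch of what the 'cls' and 'avg' pooler types typically compute (assumed, since that code is not shown here):

    import torch

    # assumed shapes: last_hidden (bs, len, hidden), attention_mask (bs, len)
    last_hidden = torch.randn(4, 16, 768)
    attention_mask = torch.ones(4, 16)

    cls_vec = last_hidden[:, 0]  # 'cls' / 'cls_before_pooler': first-token representation
    avg_vec = ((last_hidden * attention_mask.unsqueeze(-1)).sum(1)
               / attention_mask.sum(-1, keepdim=True))  # 'avg': mask-weighted mean
    print(cls_vec.shape, avg_vec.shape)  # torch.Size([4, 768]) torch.Size([4, 768])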
@@ -103,6 +78,11 @@ class Pooler(nn.Module):
          raise NotImplementedError


+ def mse_loss_mat(tensor_left, tensor_right):
+     cos_sim_matrix = torch.matmul(tensor_left, tensor_right.t())
+     cos_sim_matrix /= torch.matmul(torch.norm(tensor_left, dim=1, keepdim=True), torch.norm(tensor_right, dim=1, keepdim=True).t())
+     return cos_sim_matrix
+
  def cl_init(cls, config):
      """
      Contrastive learning class init function.
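The newly added `mse_loss_mat` replaces the old `simcse.mse_loss.giveMeMatrix` dependency: it divides the raw dot-product matrix by the outer product of the row norms, i.e. a pairwise cosine-similarity matrix. A quick equivalence sketch (the `F.normalize` reference here is only for comparison, not part of the commit):

    import torch
    import torch.nn.functional as F

    def mse_loss_mat(tensor_left, tensor_right):
        # as added in the diff above: dot products divided by the outer product of row norms
        cos_sim_matrix = torch.matmul(tensor_left, tensor_right.t())
        cos_sim_matrix /= torch.matmul(torch.norm(tensor_left, dim=1, keepdim=True),
                                       torch.norm(tensor_right, dim=1, keepdim=True).t())
        return cos_sim_matrix

    a, b = torch.randn(8, 1536), torch.randn(8, 1536)
    reference = F.normalize(a, dim=1) @ F.normalize(b, dim=1).t()  # (8, 8) cosine similarities
    assert torch.allclose(mse_loss_mat(a, b), reference, atol=1e-6)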
@@ -125,27 +105,21 @@ def cl_forward(cls,
      inputs_embeds=None,
      labels=None,
      output_attentions=None,
-     output_hidden_states=None,
      return_dict=None,
      mlm_input_ids=None,
      mlm_labels=None,
      left_emb=None,
-     right_emb=None,
-     kl_loss=False
+     right_emb=None
  ):
      return_dict = return_dict if return_dict is not None else cls.config.use_return_dict
-     ori_input_ids = input_ids
      batch_size = input_ids.size(0)
-     # Number of sentences in one instance
-     # 2: pair instance; 3: pair instance with a hard negative
      num_sent = input_ids.size(1)

      mlm_outputs = None
-     # Flatten input for encoding
-     input_ids = input_ids.view((-1, input_ids.size(-1))) # (bs * num_sent, len)
-     attention_mask = attention_mask.view((-1, attention_mask.size(-1))) # (bs * num_sent len)
+     input_ids = input_ids.view((-1, input_ids.size(-1)))
+     attention_mask = attention_mask.view((-1, attention_mask.size(-1)))
      if token_type_ids is not None:
-         token_type_ids = token_type_ids.view((-1, token_type_ids.size(-1))) # (bs * num_sent, len)
+         token_type_ids = token_type_ids.view((-1, token_type_ids.size(-1)))

      if inputs_embeds is not None:
          input_ids = None
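The `view` calls above flatten the (bs, num_sent, len) batch so both sentences of every pair pass through the encoder in a single forward call; downstream, the pooled output is regrouped and split into z1/z2. A small shape walkthrough with made-up sizes:

    import torch

    bs, num_sent, seq_len, hidden = 4, 2, 32, 768
    input_ids = torch.randint(0, 30522, (bs, num_sent, seq_len))

    flat = input_ids.view(-1, input_ids.size(-1))    # (bs * num_sent, len) fed to the encoder
    pooled = torch.randn(bs * num_sent, hidden)      # stand-in for the encoder's pooled output
    pooled = pooled.view(bs, num_sent, hidden)       # regroup per instance
    z1, z2 = pooled[:, 0], pooled[:, 1]              # the two views of each pair
    print(flat.shape, z1.shape, z2.shape)            # (8, 32) (4, 768) (4, 768)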
@@ -187,13 +161,11 @@ def cl_forward(cls,
      # (same as BERT's original implementation) over the representation.
      if cls.pooler_type == "cls":
          pooler_output = cls.mlp(pooler_output)
-         # print("QAQ")

      # Separate representation
      z1, z2 = pooler_output[:, 0], pooler_output[:, 1]

-     tensor_left = left_emb
-     tensor_right = right_emb
+     tensor_left, tensor_right = left_emb, right_emb

      # Hard negative
      if num_sent == 3:
@@ -224,10 +196,6 @@ def cl_forward(cls,
          z2 = torch.cat(z2_list, 0)

      mse_loss = F.mse_loss(z1, tensor_left) + F.mse_loss(z2, tensor_right)
-
-     # softmax_row, softmax_col = simcse.mse_loss.giveMeMatrix(tensor_left, tensor_right)
-     # softmax_row_model, softmax_col_model = simcse.mse_loss.giveMeMatrix(z1,z2)
-     # ziang_labels = torch.tensor([i for i in range(8)], device='cuda:0')

      """
      this is KL div loss
@@ -236,12 +204,10 @@ def cl_forward(cls,
      KL_loss = nn.KLDivLoss(reduction="batchmean")
      beta = 5

-     # OpenAI embeddings; giveMeMatrix returns the matrix product of the normalized left/right vectors
-     cos_sim_matrix_openai = simcse.mse_loss.giveMeMatrix(tensor_left, tensor_right)
+     cos_sim_matrix_openai = mse_loss_mat(tensor_left, tensor_right)
      beta_scaled_cos_sim_matrix_openai = beta * cos_sim_matrix_openai

-     # our embeddings; giveMeMatrix returns the matrix product of the normalized left/right vectors
-     cos_sim_matrix_data = simcse.mse_loss.giveMeMatrix(z1, z2)
+     cos_sim_matrix_data = mse_loss_mat(z1, z2)
      beta_scaled_cos_sim_matrix_data = beta * cos_sim_matrix_data

      beta_scaled_cos_sim_matrix_openai_vertical = beta_scaled_cos_sim_matrix_openai.softmax(dim=1)
@@ -250,15 +216,10 @@ def cl_forward(cls,
      beta_scaled_cos_sim_matrix_data_vertical = beta_scaled_cos_sim_matrix_data.softmax(dim=1)
      beta_scaled_cos_sim_matrix_data_horizontal = beta_scaled_cos_sim_matrix_data.softmax(dim=0)

-     # remove reduction="batchmean"
      KL_vertical_loss = KL_loss(beta_scaled_cos_sim_matrix_data_vertical.log(), beta_scaled_cos_sim_matrix_openai_vertical)
      KL_horizontal_loss = KL_loss(beta_scaled_cos_sim_matrix_data_horizontal.log(), beta_scaled_cos_sim_matrix_openai_horizontal)

      KL_loss = (KL_vertical_loss + KL_horizontal_loss) / 2
-
-     # KL_row_loss = F.kl_div(softmax_row_model.log(), softmax_row, reduction='batchmean')
-     # KL_col_loss = F.kl_div(softmax_col_model.log(), softmax_col, reduction='batchmean')
-     # KL_loss = (KL_row_loss + KL_col_loss) / 2

      ziang_loss = KL_loss + mse_loss

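Putting the last few hunks together, the objective after this commit is an MSE term pulling z1/z2 toward the teacher vectors left_emb/right_emb, plus a symmetric KL term that matches the row- and column-softmaxed, beta-scaled cosine-similarity matrices of student and teacher. A self-contained sketch of that combined loss (function and helper names here are illustrative, not from the repo):

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    def cosine_matrix(a, b):
        return F.normalize(a, dim=1) @ F.normalize(b, dim=1).t()

    def distill_loss(z1, z2, left_emb, right_emb, beta=5.0):
        # MSE between student embeddings and teacher embeddings
        mse = F.mse_loss(z1, left_emb) + F.mse_loss(z2, right_emb)

        kl = nn.KLDivLoss(reduction="batchmean")
        teacher = beta * cosine_matrix(left_emb, right_emb)
        student = beta * cosine_matrix(z1, z2)

        # symmetric KL over row-wise (dim=1) and column-wise (dim=0) softmax distributions
        kl_vertical = kl(student.softmax(dim=1).log(), teacher.softmax(dim=1))
        kl_horizontal = kl(student.softmax(dim=0).log(), teacher.softmax(dim=0))

        return (kl_vertical + kl_horizontal) / 2 + mse

    z1, z2 = torch.randn(8, 1536), torch.randn(8, 1536)
    left_emb, right_emb = torch.randn(8, 1536), torch.randn(8, 1536)
    print(distill_loss(z1, z2, left_emb, right_emb))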
@@ -358,13 +319,6 @@ class BertForCL(BertPreTrainedModel):
          if self.model_args.do_mlm:
              self.lm_head = BertLMPredictionHead(config)

-         if self.model_args.init_embeddings_model:
-             if "glm" in self.model_args.init_embeddings_model:
-                 init_glm(self.model_args.init_embeddings_model)
-                 self.fc = nn.Linear(glm_model.config.hidden_size, config.hidden_size)
-             else:
-                 raise NotImplementedError
-
          cl_init(self, config)

      def forward(self,
@@ -384,26 +338,6 @@ class BertForCL(BertPreTrainedModel):
          left_emb=None,
          right_emb=None,
      ):
-         if self.model_args.init_embeddings_model:
-             input_ids_for_glm = input_ids.view((-1, input_ids.size(-1))) # (bs * num_sent, len)
-             attention_mask_for_glm = attention_mask.view((-1, attention_mask.size(-1))) # (bs * num_sent len)
-             if token_type_ids is not None:
-                 token_type_ids_for_glm = token_type_ids.view((-1, token_type_ids.size(-1))) # (bs * num_sent, len)
-
-             outputs_from_glm = glm_model(input_ids_for_glm,
-                 attention_mask=attention_mask_for_glm,
-                 token_type_ids=token_type_ids_for_glm,
-                 position_ids=position_ids,
-                 head_mask=head_mask,
-                 inputs_embeds=inputs_embeds,
-                 labels=labels,
-                 output_attentions=output_attentions,
-                 output_hidden_states=output_hidden_states,
-                 return_dict=return_dict,
-             )
-
-             inputs_embeds = self.fc(outputs_from_glm.last_hidden_state)
-
          if sent_emb:
              return sentemb_forward(self, self.bert,
                  input_ids=input_ids,
@@ -447,13 +381,6 @@ class RobertaForCL(RobertaPreTrainedModel):
          if self.model_args.do_mlm:
              self.lm_head = RobertaLMHead(config)

-         if self.model_args.init_embeddings_model:
-             if "glm" in self.model_args.init_embeddings_model:
-                 init_glm(self.model_args.init_embeddings_model)
-                 self.fc = nn.Linear(glm_model.config.hidden_size, config.hidden_size)
-             else:
-                 raise NotImplementedError
-
          cl_init(self, config)

      def forward(self,
@@ -473,27 +400,6 @@ class RobertaForCL(RobertaPreTrainedModel):
          left_emb=None,
          right_emb=None,
      ):
-
-         if self.model_args.init_embeddings_model and not sent_emb:
-             input_ids_for_glm = input_ids.view((-1, input_ids.size(-1))) # (bs * num_sent, len)
-             attention_mask_for_glm = attention_mask.view((-1, attention_mask.size(-1))) # (bs * num_sent len)
-             if token_type_ids is not None:
-                 token_type_ids_for_glm = token_type_ids.view((-1, token_type_ids.size(-1))) # (bs * num_sent, len)
-
-             outputs_from_glm = glm_model(input_ids_for_glm,
-                 attention_mask=attention_mask_for_glm,
-                 token_type_ids=token_type_ids_for_glm,
-                 position_ids=position_ids,
-                 head_mask=head_mask,
-                 inputs_embeds=inputs_embeds,
-                 labels=labels,
-                 output_attentions=output_attentions,
-                 output_hidden_states=output_hidden_states,
-                 return_dict=return_dict,
-             )
-
-             inputs_embeds = self.fc(outputs_from_glm.last_hidden_state)
-
          if sent_emb:
              return sentemb_forward(self, self.roberta,
                  input_ids=input_ids,
 