""" PyTorch Della model. """ |
|
|
|
import logging
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple

import torch
import torch.nn as nn
import torch.utils.checkpoint  # needed when gradient checkpointing is enabled
from torch.nn import CrossEntropyLoss

from transformers.file_utils import ModelOutput
from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
from transformers.models.gpt2.modeling_gpt2 import GPT2Block, GPT2Model, GPT2PreTrainedModel
|
|
|
|
|
@dataclass |
|
class DeepVAEDecoderOutput(ModelOutput): |
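    """Output of the latent-conditioned GPT-2 decoder: language-modeling ``logits`` and, when labels
    are provided, a per-sequence ``loss``, plus optional ``hidden_states`` and ``attentions``."""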
|
logits: torch.FloatTensor = None |
|
loss: Optional[torch.FloatTensor] = None |
|
hidden_states: Optional[Tuple[torch.FloatTensor]] = None |
|
attentions: Optional[Tuple[torch.FloatTensor]] = None |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
class GPT2LatentDecoderModel(GPT2Model): |
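    """
    GPT-2 transformer stack in which every layer's input is shifted by a linear projection of a
    layer-specific latent vector (``layer_latent_vecs[i]``), as used by the Della deep-VAE decoder.
    """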
|
_keys_to_ignore_on_load_missing = ["attn.masked_bias"] |
|
|
|
def __init__(self, config, latent_dim=32): |
|
super().__init__(config) |
|
|
|
self.embed_dim = config.hidden_size |
|
|
|
self.wte = nn.Embedding(config.vocab_size, self.embed_dim) |
|
self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) |
|
|
|
self.drop = nn.Dropout(config.embd_pdrop) |
|
self.h = nn.ModuleList([GPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)]) |
|
self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) |
|
|
|
|
|
self.model_parallel = False |
|
self.device_map = None |
|
self.gradient_checkpointing = False |
|
|
|
|
|
        # One linear projection per transformer layer, mapping that layer's latent vector
        # (of size ``latent_dim``) into the hidden size before it is added to the hidden states.
        self.linear_emb_layers = nn.ModuleList(
            [nn.Linear(latent_dim, config.hidden_size, bias=False) for _ in range(config.num_hidden_layers)]
        )
|
|
|
|
|
self.post_init() |
|
|
|
def forward( |
|
self, |
|
input_ids=None, |
|
layer_latent_vecs=None, |
|
past_key_values=None, |
|
attention_mask=None, |
|
token_type_ids=None, |
|
position_ids=None, |
|
head_mask=None, |
|
inputs_embeds=None, |
|
encoder_hidden_states=None, |
|
encoder_attention_mask=None, |
|
use_cache=None, |
|
output_attentions=None, |
|
output_hidden_states=None, |
|
return_dict=None, |
|
): |
|
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions |
|
output_hidden_states = ( |
|
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states |
|
) |
|
use_cache = use_cache if use_cache is not None else self.config.use_cache |
|
return_dict = return_dict if return_dict is not None else self.config.use_return_dict |
|
|
|
if input_ids is not None and inputs_embeds is not None: |
|
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") |
|
elif input_ids is not None: |
|
input_shape = input_ids.size() |
|
input_ids = input_ids.view(-1, input_shape[-1]) |
|
batch_size = input_ids.shape[0] |
|
elif inputs_embeds is not None: |
|
input_shape = inputs_embeds.size()[:-1] |
|
batch_size = inputs_embeds.shape[0] |
|
else: |
|
raise ValueError("You have to specify either input_ids or inputs_embeds") |
|
|
|
device = input_ids.device if input_ids is not None else inputs_embeds.device |
|
|
|
if token_type_ids is not None: |
|
token_type_ids = token_type_ids.view(-1, input_shape[-1]) |
|
if position_ids is not None: |
|
position_ids = position_ids.view(-1, input_shape[-1]) |
|
|
|
if past_key_values is None: |
|
past_length = 0 |
|
past_key_values = tuple([None] * len(self.h)) |
|
else: |
|
past_length = past_key_values[0][0].size(-2) |
|
if position_ids is None: |
|
position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) |
|
position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) |
|
|
|
|
|
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            # Make the 2D mask broadcastable to [batch_size, num_heads, from_seq_length, to_seq_length].
            attention_mask = attention_mask[:, None, None, :]
            # The mask is 1.0 for tokens to attend to and 0.0 for masked tokens; turn it into an
            # additive mask that is 0.0 for kept positions and -10000.0 for masked positions, so that
            # adding it to the raw attention scores effectively removes the masked positions.
            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * -10000.0
|
|
|
|
|
|
|
        # If cross-attention is used, build a broadcastable (inverted, additive) mask for the
        # encoder hidden states; otherwise no encoder attention mask is needed.
        if self.config.add_cross_attention and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_attention_mask = None

        # Prepare the head mask if needed: 1.0 in head_mask indicates the head is kept.
        # get_head_mask returns one (possibly None) broadcastable mask per layer.
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
|
|
|
if inputs_embeds is None: |
|
inputs_embeds = self.wte(input_ids) |
|
position_embeds = self.wpe(position_ids) |
|
hidden_states = inputs_embeds + position_embeds |
|
|
|
if token_type_ids is not None: |
|
token_type_embeds = self.wte(token_type_ids) |
|
hidden_states = hidden_states + token_type_embeds |
|
|
|
hidden_states = self.drop(hidden_states) |
|
|
|
output_shape = input_shape + (hidden_states.size(-1),) |
|
|
|
presents = () if use_cache else None |
|
all_self_attentions = () if output_attentions else None |
|
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None |
|
all_hidden_states = () if output_hidden_states else None |
|
for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): |
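            # Condition this layer on its latent code: project the layer's latent vector to the
            # hidden size and add it (broadcast over the sequence) to the hidden states.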
|
|
|
|
|
|
|
latent_repr = self.linear_emb_layers[i](layer_latent_vecs[i]) |
|
|
|
|
|
hidden_states += latent_repr.unsqueeze(dim=1) |
|
|
|
|
|
if self.model_parallel: |
|
torch.cuda.set_device(hidden_states.device) |
|
|
|
if layer_past is not None: |
|
layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past) |
|
|
|
if attention_mask is not None: |
|
attention_mask = attention_mask.to(hidden_states.device) |
|
if isinstance(head_mask, torch.Tensor): |
|
head_mask = head_mask.to(hidden_states.device) |
|
if output_hidden_states: |
|
all_hidden_states = all_hidden_states + (hidden_states,) |
|
|
|
if self.gradient_checkpointing and self.training: |
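                # Recompute this block's activations during the backward pass to save memory;
                # checkpointing cannot be combined with caching past key/values.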
|
|
|
if use_cache: |
|
logger.warning( |
|
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." |
|
) |
|
use_cache = False |
|
|
|
def create_custom_forward(module): |
|
def custom_forward(*inputs): |
|
|
|
return module(*inputs, use_cache, output_attentions) |
|
|
|
return custom_forward |
|
|
|
outputs = torch.utils.checkpoint.checkpoint( |
|
create_custom_forward(block), |
|
hidden_states, |
|
None, |
|
attention_mask, |
|
head_mask[i], |
|
encoder_hidden_states, |
|
encoder_attention_mask, |
|
) |
|
else: |
|
outputs = block( |
|
hidden_states, |
|
layer_past=layer_past, |
|
attention_mask=attention_mask, |
|
head_mask=head_mask[i], |
|
encoder_hidden_states=encoder_hidden_states, |
|
encoder_attention_mask=encoder_attention_mask, |
|
use_cache=use_cache, |
|
output_attentions=output_attentions, |
|
) |
|
|
|
hidden_states = outputs[0] |
|
|
|
if use_cache is True: |
|
presents = presents + (outputs[1],) |
|
|
|
if output_attentions: |
|
all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) |
|
if self.config.add_cross_attention: |
|
all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],) |
|
|
|
|
|
if self.model_parallel: |
|
for k, v in self.device_map.items(): |
|
if i == v[-1] and "cuda:" + str(k) != self.last_device: |
|
hidden_states = hidden_states.to("cuda:" + str(k + 1)) |
|
|
|
hidden_states = self.ln_f(hidden_states) |
|
|
|
hidden_states = hidden_states.view(*output_shape) |
|
|
|
if output_hidden_states: |
|
all_hidden_states = all_hidden_states + (hidden_states,) |
|
|
|
if not return_dict: |
|
return tuple( |
|
v |
|
for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions] |
|
if v is not None |
|
) |
|
|
|
return BaseModelOutputWithPastAndCrossAttentions( |
|
last_hidden_state=hidden_states, |
|
past_key_values=presents, |
|
hidden_states=all_hidden_states, |
|
attentions=all_self_attentions, |
|
cross_attentions=all_cross_attentions, |
|
) |
|
|
|
|
|
class GPT2ForDecoderLatentConnector(GPT2PreTrainedModel): |
|
r""" |
|
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: |
|
Labels for language modeling. |
|
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` |
|
Indices are selected in ``[-1, 0, ..., config.vocab_size]`` |
|
All labels set to ``-1`` are ignored (masked), the loss is only |
|
computed for labels in ``[0, ..., config.vocab_size]`` |
|
|
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: |
|
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: |
|
Language modeling loss. |
|
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` |
|
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). |
|
**past**: |
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: |
|
that contains pre-computed hidden-states (key and values in the attention blocks). |
|
Can be used (see `past` input) to speed up sequential decoding. |
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) |
|
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) |
|
of shape ``(batch_size, sequence_length, hidden_size)``: |
|
Hidden-states of the model at the output of each layer plus the initial embedding outputs. |
|
**attentions**: (`optional`, returned when ``config.output_attentions=True``) |
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: |
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. |
|
|
|
Examples:: |
|
|
|
import torch |
|
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel |
|
|
|
tokenizer = GPT2Tokenizer.from_pretrained('gpt2') |
|
model = GPT2LMHeadModel.from_pretrained('gpt2') |
|
|
|
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 |
|
outputs = model(input_ids, labels=input_ids) |
|
loss, logits = outputs[:2] |
|
|
|
""" |
|
|
|
def __init__(self, config, latent_dim=32): |
|
|
|
super(GPT2ForDecoderLatentConnector, self).__init__(config) |
|
self.transformer = GPT2LatentDecoderModel(config, latent_dim=latent_dim) |
|
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) |
|
self.init_weights() |
|
self.tie_weights() |
|
|
|
def tie_weights(self): |
|
""" Make sure we are sharing the input and output embeddings. |
|
Export to TorchScript can't handle parameter sharing so we are cloning them instead. |
|
""" |
|
self._tie_or_clone_weights(self.lm_head, |
|
self.transformer.wte) |
|
|
|
def forward(self, input_ids, layer_latent_vecs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, |
|
labels=None, label_ignore=None, loss_mask=None, return_dict=False, |
|
output_attentions=None, output_hidden_states=None, use_cache=None): |
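        """
        Run the latent-conditioned decoder. ``layer_latent_vecs`` is a sequence with one latent
        vector of shape ``(batch_size, latent_dim)`` per transformer layer. When ``labels`` is
        given, a per-sequence cross-entropy loss is returned: positions whose label equals
        ``label_ignore`` are excluded, and ``loss_mask`` (if given) additionally weights the
        per-token losses before they are summed over the sequence.
        """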
|
|
|
        transformer_outputs = self.transformer(
            input_ids,
            layer_latent_vecs,
            past_key_values=past,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            # A ModelOutput is required below, where ``.hidden_states`` and ``.attentions`` are read.
            return_dict=True,
        )
|
hidden_states = transformer_outputs[0] |
|
|
|
lm_logits = self.lm_head(hidden_states) |
|
|
|
        if labels is not None:
            # Shift so that tokens < n predict token n.
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            # Per-token loss; positions labelled with ``label_ignore`` (default -100) are excluded.
            loss_fct = CrossEntropyLoss(ignore_index=label_ignore if label_ignore is not None else -100,
                                        reduction='none')
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                            shift_labels.view(-1))

            # Sum the per-token losses over the sequence, optionally weighting positions via ``loss_mask``.
            if loss_mask is not None:
                loss = loss.view(-1, shift_labels.shape[-1]) * loss_mask[:, :-1]
                loss = torch.sum(loss, -1)
            else:
                loss = torch.sum(loss.view(-1, shift_labels.shape[-1]), -1)
        else:
            loss = None
|
outputs = DeepVAEDecoderOutput(loss=loss, logits=lm_logits, hidden_states=transformer_outputs.hidden_states, |
|
attentions=transformer_outputs.attentions) |
|
return outputs |
|
|
|
    def prepare_inputs_for_generation(self, input_ids: torch.LongTensor, **kwargs) -> Dict[str, Any]:
        """
        Prepare inputs for ``generate()``: forward the current ``input_ids`` together with the
        per-layer latent vectors, which must be supplied via the generation keyword arguments.
        """
        return {"input_ids": input_ids, "layer_latent_vecs": kwargs["layer_latent_vecs"]}
|
|
|
|
|
class GPT2ForEncoderLatentConnector(GPT2PreTrainedModel): |
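    """
    Thin wrapper around :class:`GPT2Model` intended as the encoder side of the latent connector;
    ``output_hidden_states`` defaults to ``True`` so that the hidden states of every layer are
    returned and can be used to build the per-layer latent variables.
    """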
|
|
|
def __init__(self, config): |
|
|
|
super(GPT2ForEncoderLatentConnector, self).__init__(config) |
|
self.transformer = GPT2Model(config) |
|
self.init_weights() |
|
|
|
def forward( |
|
self, |
|
input_ids=None, |
|
past_key_values=None, |
|
attention_mask=None, |
|
token_type_ids=None, |
|
position_ids=None, |
|
head_mask=None, |
|
inputs_embeds=None, |
|
encoder_hidden_states=None, |
|
encoder_attention_mask=None, |
|
use_cache=None, |
|
output_attentions=None, |
|
output_hidden_states=True, |
|
return_dict=None, |
|
): |
|
|
|
transformer_outputs = self.transformer( |
|
input_ids, |
|
past_key_values=past_key_values, |
|
attention_mask=attention_mask, |
|
token_type_ids=token_type_ids, |
|
position_ids=position_ids, |
|
head_mask=head_mask, |
|
inputs_embeds=inputs_embeds, |
|
encoder_hidden_states=encoder_hidden_states, |
|
encoder_attention_mask=encoder_attention_mask, |
|
use_cache=use_cache, |
|
output_attentions=output_attentions, |
|
output_hidden_states=output_hidden_states, |
|
return_dict=return_dict, |
|
) |
|
return transformer_outputs |
|
|