"""Functions and classes related to optimization (weight updates).""" |
|
|
|
from __future__ import absolute_import |
|
from __future__ import division |
|
from __future__ import print_function |
|
|
|
import re |
|
import tensorflow as tf |
|
|
|
|
|
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
  """Creates an optimizer training op."""
  global_step = tf.train.get_or_create_global_step()

  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

  # Implements linear decay of the learning rate from `init_lr` to 0.0 over
  # `num_train_steps` steps.
  learning_rate = tf.train.polynomial_decay(
      learning_rate,
      global_step,
      num_train_steps,
      end_learning_rate=0.0,
      power=1.0,
      cycle=False)

  # Implements linear warmup: while global_step < num_warmup_steps, the
  # learning rate is `global_step / num_warmup_steps * init_lr`.
  if num_warmup_steps:
    global_steps_int = tf.cast(global_step, tf.int32)
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = init_lr * warmup_percent_done

    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    learning_rate = (
        (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)

  # Adam with weight decay decoupled from the moment estimates. Using this
  # optimizer is recommended for fine-tuning, since this is how the model
  # was pre-trained.
  optimizer = AdamWeightDecayOptimizer(
      learning_rate=learning_rate,
      weight_decay_rate=0.01,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=1e-6,
      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

  if use_tpu:
    optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

  tvars = tf.trainable_variables()
  grads = tf.gradients(loss, tvars)

  # Clip gradients to a global norm of 1.0, as was done during pre-training.
  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

  train_op = optimizer.apply_gradients(
      zip(grads, tvars), global_step=global_step)

  # Normally `apply_gradients` increments the global step, but
  # `AdamWeightDecayOptimizer` does not, so it is updated explicitly here.
  # If you swap in a different optimizer, this extra increment should be
  # removed.
  new_global_step = global_step + 1
  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
  return train_op
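
# Example usage (a minimal sketch; `total_loss` and the hyperparameter values
# below are illustrative placeholders, not anything defined in this module):
#
#   train_op = create_optimizer(
#       loss=total_loss,
#       init_lr=5e-5,
#       num_train_steps=10000,
#       num_warmup_steps=1000,
#       use_tpu=False)
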
class AdamWeightDecayOptimizer(tf.train.Optimizer):
  """A basic Adam optimizer that includes "correct" L2 weight decay."""

  def __init__(self,
               learning_rate,
               weight_decay_rate=0.0,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=1e-6,
               exclude_from_weight_decay=None,
               name="AdamWeightDecayOptimizer"):
    """Constructs an AdamWeightDecayOptimizer."""
    super(AdamWeightDecayOptimizer, self).__init__(False, name)

    self.learning_rate = learning_rate
    self.weight_decay_rate = weight_decay_rate
    self.beta_1 = beta_1
    self.beta_2 = beta_2
    self.epsilon = epsilon
    self.exclude_from_weight_decay = exclude_from_weight_decay

  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """See base class."""
    assignments = []
    for (grad, param) in grads_and_vars:
      if grad is None or param is None:
        continue

      param_name = self._get_variable_name(param.name)

      # First and second moment accumulators, one pair per parameter.
      m = tf.get_variable(
          name=param_name + "/adam_m",
          shape=param.shape.as_list(),
          dtype=tf.float32,
          trainable=False,
          initializer=tf.zeros_initializer())
      v = tf.get_variable(
          name=param_name + "/adam_v",
          shape=param.shape.as_list(),
          dtype=tf.float32,
          trainable=False,
          initializer=tf.zeros_initializer())

      # Standard Adam moment updates (note that no bias correction is applied
      # to m and v).
      next_m = (
          tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
      next_v = (
          tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
                                                    tf.square(grad)))

      update = next_m / (tf.sqrt(next_v) + self.epsilon)

      # Just adding the square of the weights to the loss function is *not*
      # the correct way of using L2 regularization/weight decay with Adam,
      # since that will interact with the m and v parameters in strange ways.
      #
      # Instead we want to decay the weights in a manner that doesn't interact
      # with the m/v parameters. This is equivalent to adding the square of
      # the weights to the loss with plain (non-momentum) SGD.
      if self._do_use_weight_decay(param_name):
        update += self.weight_decay_rate * param

      update_with_lr = self.learning_rate * update

      next_param = param - update_with_lr

      assignments.extend(
          [param.assign(next_param),
           m.assign(next_m),
           v.assign(next_v)])
    return tf.group(*assignments, name=name)

  def _do_use_weight_decay(self, param_name):
    """Whether to use L2 weight decay for `param_name`."""
    if not self.weight_decay_rate:
      return False
    if self.exclude_from_weight_decay:
      for r in self.exclude_from_weight_decay:
        if re.search(r, param_name) is not None:
          return False
    return True

  def _get_variable_name(self, param_name):
    """Get the variable name from the tensor name."""
    m = re.match("^(.*):\\d+$", param_name)
    if m is not None:
      param_name = m.group(1)
    return param_name
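
# Standalone usage (a minimal sketch with hypothetical names; assumes a
# TF1-style graph in which a scalar `loss` tensor is already defined). Note
# that, unlike the base `tf.train.Optimizer`, the `apply_gradients` above
# never increments `global_step`, so callers must update it themselves as
# `create_optimizer` does:
#
#   tvars = tf.trainable_variables()
#   grads = tf.gradients(loss, tvars)
#   optimizer = AdamWeightDecayOptimizer(
#       learning_rate=1e-4,
#       weight_decay_rate=0.01,
#       exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
#   train_op = optimizer.apply_gradients(zip(grads, tvars))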