# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from paddle import optimizer as optim


class Momentum(object):
    """
    Simple Momentum optimizer with velocity state.

    Args:
        learning_rate (float|Variable) - The learning rate used to update parameters.
            Can be a float value or a Variable with one float value as data element.
        momentum (float) - Momentum factor.
        weight_decay (float|WeightDecayRegularizer, optional) - The strategy of regularization.
        grad_clip (GradientClipBase, optional) - The strategy of gradient clipping.
    """

    def __init__(self,
                 learning_rate,
                 momentum,
                 weight_decay=None,
                 grad_clip=None,
                 **args):
        super(Momentum, self).__init__()
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip

    def __call__(self, model):
        # Only trainable parameters are handed to the optimizer.
        train_params = [
            param for param in model.parameters() if param.trainable is True
        ]
        opt = optim.Momentum(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            parameters=train_params)
        return opt
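
# Example usage (a minimal sketch; the Linear model and hyper-parameters below
# are illustrative assumptions, not part of this module):
#
#     import paddle
#     model = paddle.nn.Linear(10, 2)
#     optimizer = Momentum(learning_rate=0.01, momentum=0.9)(model)
#     # inside the training loop: loss.backward(); optimizer.step(); optimizer.clear_grad()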


class Adam(object):
    def __init__(self,
                 learning_rate=0.001,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-08,
                 parameter_list=None,
                 weight_decay=None,
                 grad_clip=None,
                 name=None,
                 lazy_mode=False,
                 **kwargs):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.parameter_list = parameter_list
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip
        self.name = name
        self.lazy_mode = lazy_mode
        self.group_lr = kwargs.get('group_lr', False)
        self.training_step = kwargs.get('training_step', None)

    def __call__(self, model):
        if self.group_lr:
            if self.training_step == 'LF_2':
                import paddle
                if isinstance(model, paddle.DataParallel):  # multi gpu
                    mlm = model._layers.head.MLM_VRM.MLM.parameters()
                    pre_mlm_pp = model._layers.head.MLM_VRM.Prediction.pp_share.parameters(
                    )
                    pre_mlm_w = model._layers.head.MLM_VRM.Prediction.w_share.parameters(
                    )
                else:  # single gpu
                    mlm = model.head.MLM_VRM.MLM.parameters()
                    pre_mlm_pp = model.head.MLM_VRM.Prediction.pp_share.parameters(
                    )
                    pre_mlm_w = model.head.MLM_VRM.Prediction.w_share.parameters(
                    )

                # Collect the ids of the MLM and shared prediction parameters,
                # which keep the base learning rate.
                total = []
                for param in mlm:
                    total.append(id(param))
                for param in pre_mlm_pp:
                    total.append(id(param))
                for param in pre_mlm_w:
                    total.append(id(param))

                group_base_params = [
                    param for param in model.parameters() if id(param) in total
                ]
                group_small_params = [
                    param for param in model.parameters()
                    if id(param) not in total
                ]
                # Remaining parameters are trained with 10% of the base learning rate.
                train_params = [{
                    'params': group_base_params
                }, {
                    'params': group_small_params,
                    'learning_rate': self.learning_rate.values[0] * 0.1
                }]
            else:
                print(
                    'group_lr currently only supports VisionLAN in the LF_2 training step'
                )
                train_params = [
                    param for param in model.parameters()
                    if param.trainable is True
                ]
        else:
            train_params = [
                param for param in model.parameters() if param.trainable is True
            ]

        opt = optim.Adam(
            learning_rate=self.learning_rate,
            beta1=self.beta1,
            beta2=self.beta2,
            epsilon=self.epsilon,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            name=self.name,
            lazy_mode=self.lazy_mode,
            parameters=train_params)
        return opt
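
# Example of the group_lr path (a sketch; it assumes learning_rate is a scheduler
# exposing a `values` list, e.g. paddle.optimizer.lr.PiecewiseDecay, and a
# VisionLAN-style model whose head provides the MLM_VRM branches checked above):
#
#     import paddle
#     lr = paddle.optimizer.lr.PiecewiseDecay(boundaries=[6], values=[1e-4, 1e-5])
#     optimizer = Adam(learning_rate=lr, group_lr=True, training_step='LF_2')(model)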


class RMSProp(object):
    """
    Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method.

    Args:
        learning_rate (float|Variable) - The learning rate used to update parameters.
            Can be a float value or a Variable with one float value as data element.
        momentum (float) - Momentum factor.
        rho (float) - Smoothing constant in the RMSProp update equation.
        epsilon (float) - Small constant added to avoid division by zero. Default is 1e-6.
        weight_decay (float|WeightDecayRegularizer, optional) - The strategy of regularization.
        grad_clip (GradientClipBase, optional) - The strategy of gradient clipping.
    """

    def __init__(self,
                 learning_rate,
                 momentum=0.0,
                 rho=0.95,
                 epsilon=1e-6,
                 weight_decay=None,
                 grad_clip=None,
                 **args):
        super(RMSProp, self).__init__()
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.rho = rho
        self.epsilon = epsilon
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip

    def __call__(self, model):
        train_params = [
            param for param in model.parameters() if param.trainable is True
        ]
        opt = optim.RMSProp(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            rho=self.rho,
            epsilon=self.epsilon,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            parameters=train_params)
        return opt
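
# Example (a minimal sketch; the L2Decay coefficient is an illustrative
# assumption, since weight_decay accepts either a float or a paddle regularizer):
#
#     import paddle
#     optimizer = RMSProp(learning_rate=0.001,
#                         momentum=0.9,
#                         weight_decay=paddle.regularizer.L2Decay(1e-4))(model)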


class Adadelta(object):
    def __init__(self,
                 learning_rate=0.001,
                 epsilon=1e-08,
                 rho=0.95,
                 parameter_list=None,
                 weight_decay=None,
                 grad_clip=None,
                 name=None,
                 **kwargs):
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.rho = rho
        self.parameter_list = parameter_list
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip
        self.name = name

    def __call__(self, model):
        train_params = [
            param for param in model.parameters() if param.trainable is True
        ]
        opt = optim.Adadelta(
            learning_rate=self.learning_rate,
            epsilon=self.epsilon,
            rho=self.rho,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            name=self.name,
            parameters=train_params)
        return opt


class AdamW(object):
    def __init__(self,
                 learning_rate=0.001,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-8,
                 weight_decay=0.01,
                 multi_precision=False,
                 grad_clip=None,
                 no_weight_decay_name=None,
                 one_dim_param_no_weight_decay=False,
                 name=None,
                 lazy_mode=False,
                 **args):
        super().__init__()
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.grad_clip = grad_clip
        self.weight_decay = 0.01 if weight_decay is None else weight_decay
        self.name = name
        self.lazy_mode = lazy_mode
        self.multi_precision = multi_precision
        # Space-separated substrings; any parameter whose name contains one of
        # them is excluded from weight decay.
        self.no_weight_decay_name_list = no_weight_decay_name.split(
        ) if no_weight_decay_name else []
        self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay

    def __call__(self, model):
        parameters = [
            param for param in model.parameters() if param.trainable is True
        ]
        self.no_weight_decay_param_name_list = [
            p.name for n, p in model.named_parameters()
            if any(nd in n for nd in self.no_weight_decay_name_list)
        ]
        if self.one_dim_param_no_weight_decay:
            # 1-D parameters (biases, norm scales) also skip weight decay.
            self.no_weight_decay_param_name_list += [
                p.name for n, p in model.named_parameters()
                if len(p.shape) == 1
            ]
        opt = optim.AdamW(
            learning_rate=self.learning_rate,
            beta1=self.beta1,
            beta2=self.beta2,
            epsilon=self.epsilon,
            parameters=parameters,
            weight_decay=self.weight_decay,
            multi_precision=self.multi_precision,
            grad_clip=self.grad_clip,
            name=self.name,
            lazy_mode=self.lazy_mode,
            apply_decay_param_fun=self._apply_decay_param_fun)
        return opt

    def _apply_decay_param_fun(self, name):
        # Return True only for parameters that should receive weight decay.
        return name not in self.no_weight_decay_param_name_list
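
# Example (a sketch; the substrings in no_weight_decay_name are matched against
# the names returned by model.named_parameters(), and the model and values below
# are illustrative assumptions, not defaults of this module):
#
#     import paddle
#     model = paddle.nn.Sequential(paddle.nn.Linear(10, 10), paddle.nn.LayerNorm(10))
#     optimizer = AdamW(learning_rate=1e-3,
#                       weight_decay=0.05,
#                       no_weight_decay_name="norm bias",
#                       one_dim_param_no_weight_decay=True)(model)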