Source code for tensorlayerx.optimizers.tensorflow_optimizers

#! /usr/bin/python
# -*- coding: utf-8 -*-

from __future__ import absolute_import, division, print_function
import tensorflow as tf
import tensorlayerx as tlx

__all__ = ['Adadelta', 'Adagrad', 'Adam', 'Adamax', 'Ftrl', 'Nadam', 'RMSprop', 'SGD', 'Momentum', 'Lamb', 'LARS']


class Adadelta(object):
    """Optimizer that implements the Adadelta algorithm. Equivalent to tf.optimizers.Adadelta.

    References
    ----------
    - https://tensorflow.google.cn/api_docs/python/tf/keras/optimizers/Adadelta?hl=en

    Parameters
    ----------
    lr : A Tensor, floating point value
        The learning rate. Defaults to 0.001.
    rho : float or constant float tensor
        A Tensor or a floating point value. The decay rate. Defaults to 0.95.
    eps : float
        A small constant for numerical stability. Defaults to 1e-7.
    weight_decay : float
        weight decay (L2 penalty) (default: 0.0)
    grad_clip : GradientClip or None
        Gradient clipping strategy. There are three clipping strategies
        (`tlx.ops.ClipGradByValue`, `tlx.ops.ClipGradByNorm`, `tlx.ops.ClipByGlobalNorm`).
        Default None, meaning there is no gradient clipping.

    Examples
    --------
    With TensorLayerX

    >>> import tensorlayerx as tlx
    >>> optimizer = tlx.optimizers.Adadelta(0.001)
    >>> optimizer.apply_gradients(zip(grad, train_weights))

    """

    def __init__(self, lr=0.001, rho=0.95, eps=1e-07, weight_decay=0.0, grad_clip=None):
        self.lr = lr
        self.rho = rho
        self.eps = eps
        if weight_decay < 0.0:
            raise ValueError("weight_decay should not be smaller than 0.0, but got {}".format(weight_decay))
        self.weight_decay = tf.convert_to_tensor(float(weight_decay))
        self.grad_clip = grad_clip
        self.adadelta = tf.optimizers.Adadelta(learning_rate=self.lr, rho=self.rho, epsilon=self.eps)

    def apply_gradients(self, grads_and_vars):
        if grads_and_vars is None:
            raise ValueError('grads_and_vars is not set.')
        if self.weight_decay != 0.0 or self.grad_clip is not None:
            grads, vars = zip(*grads_and_vars)
            if self.weight_decay != 0.0:
                # Fold the L2 penalty into each gradient: g <- g + weight_decay * w.
                new_grads = []
                for grad, var in zip(grads, vars):
                    grad = grad + self.weight_decay * var
                    new_grads.append(grad)
                grads = new_grads
            if self.grad_clip is not None:
                if isinstance(self.grad_clip, tlx.ops.ClipByGlobalNorm):
                    # Global-norm clipping rescales the whole gradient list at once
                    # and also returns the global norm, which is discarded here.
                    new_grads, _ = self.grad_clip(grads)
                else:
                    # Element-wise strategies clip each gradient independently.
                    new_grads = []
                    for g in grads:
                        new_grads.append(self.grad_clip(g))
                grads = new_grads
            grads_and_vars = zip(grads, vars)
        self.adadelta.apply_gradients(grads_and_vars)
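
# A minimal usage sketch for the wrapper pattern above, assuming the TensorFlow
# backend. `model`, `loss_fn`, `x`, and `y` are hypothetical stand-ins, not part
# of this module; any TLX/Keras-style model exposing `trainable_weights` fits.
def _example_train_step(model, loss_fn, x, y):
    with tf.GradientTape() as tape:
        loss = loss_fn(model(x), y)
    # Compute gradients with the backend, then let the wrapper apply
    # weight decay / clipping before the underlying tf optimizer update.
    grads = tape.gradient(loss, model.trainable_weights)
    optimizer = Adadelta(lr=0.001, weight_decay=1e-4)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    return loss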

class Adagrad(object):
    """Optimizer that implements the Adagrad algorithm. Equivalent to tf.optimizers.Adagrad.

    References
    ----------
    - https://tensorflow.google.cn/api_docs/python/tf/keras/optimizers/Adagrad?hl=en

    Parameters
    ----------
    lr : A Tensor, floating point value
        The learning rate. Defaults to 0.001.
    initial_accumulator : float
        Floating point value. Starting value for the accumulators (per-parameter momentum values).
        Must be non-negative. Defaults to 0.1.
    eps : float
        A small constant for numerical stability. Defaults to 1e-7.
    weight_decay : float
        weight decay (L2 penalty) (default: 0.0)
    grad_clip : GradientClip or None
        Gradient clipping strategy. There are three clipping strategies
        (`tlx.ops.ClipGradByValue`, `tlx.ops.ClipGradByNorm`, `tlx.ops.ClipByGlobalNorm`).
        Default None, meaning there is no gradient clipping.

    Examples
    --------
    With TensorLayerX

    >>> import tensorlayerx as tlx
    >>> optimizer = tlx.optimizers.Adagrad(0.001)
    >>> optimizer.apply_gradients(zip(grad, train_weights))

    """

    def __init__(self, lr=0.001, initial_accumulator=0.1, eps=1e-07, weight_decay=0.0, grad_clip=None):
        self.lr = lr
        self.initial_accumulator = initial_accumulator
        self.eps = eps
        if weight_decay < 0.0:
            raise ValueError("weight_decay should not be smaller than 0.0, but got {}".format(weight_decay))
        self.weight_decay = tf.convert_to_tensor(float(weight_decay))
        self.grad_clip = grad_clip
        self.adagrad = tf.optimizers.Adagrad(
            learning_rate=self.lr, initial_accumulator_value=self.initial_accumulator, epsilon=self.eps
        )

    def apply_gradients(self, grads_and_vars):
        if grads_and_vars is None:
            raise ValueError('grads_and_vars is not set.')
        if self.weight_decay != 0.0 or self.grad_clip is not None:
            grads, vars = zip(*grads_and_vars)
            if self.weight_decay != 0.0:
                new_grads = []
                for grad, var in zip(grads, vars):
                    grad = grad + self.weight_decay * var
                    new_grads.append(grad)
                grads = new_grads
            if self.grad_clip is not None:
                if isinstance(self.grad_clip, tlx.ops.ClipByGlobalNorm):
                    new_grads, _ = self.grad_clip(grads)
                else:
                    new_grads = []
                    for g in grads:
                        new_grads.append(self.grad_clip(g))
                grads = new_grads
            grads_and_vars = zip(grads, vars)
        self.adagrad.apply_gradients(grads_and_vars)

class Adam(object):
    """Optimizer that implements the Adam algorithm. Equivalent to tf.optimizers.Adam.

    References
    ----------
    - https://tensorflow.google.cn/api_docs/python/tf/keras/optimizers/Adam?hl=en

    Parameters
    ----------
    lr : A Tensor, floating point value
        The learning rate. Defaults to 0.001.
    beta_1 : float or constant float tensor
        The exponential decay rate for the 1st moment estimates. Defaults to 0.9.
    beta_2 : float or constant float tensor
        The exponential decay rate for the 2nd moment estimates. Defaults to 0.999.
    eps : float
        A small constant for numerical stability. Defaults to 1e-7.
    weight_decay : float
        weight decay (L2 penalty) (default: 0.0)
    grad_clip : GradientClip or None
        Gradient clipping strategy. There are three clipping strategies
        (`tlx.ops.ClipGradByValue`, `tlx.ops.ClipGradByNorm`, `tlx.ops.ClipByGlobalNorm`).
        Default None, meaning there is no gradient clipping.

    Examples
    --------
    With TensorLayerX

    >>> import tensorlayerx as tlx
    >>> optimizer = tlx.optimizers.Adam(0.001)
    >>> optimizer.apply_gradients(zip(grad, train_weights))

    """

    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, eps=1e-07, weight_decay=0.0, grad_clip=None):
        self.lr = lr
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.eps = eps
        if weight_decay < 0.0:
            raise ValueError("weight_decay should not be smaller than 0.0, but got {}".format(weight_decay))
        self.weight_decay = tf.convert_to_tensor(float(weight_decay))
        self.grad_clip = grad_clip
        self.adam = tf.optimizers.Adam(
            learning_rate=self.lr, beta_1=self.beta_1, beta_2=self.beta_2, epsilon=self.eps
        )

    def apply_gradients(self, grads_and_vars):
        if grads_and_vars is None:
            raise ValueError('grads_and_vars is not set.')
        if self.weight_decay != 0.0 or self.grad_clip is not None:
            grads, vars = zip(*grads_and_vars)
            if self.weight_decay != 0.0:
                new_grads = []
                for grad, var in zip(grads, vars):
                    grad = grad + self.weight_decay * var
                    new_grads.append(grad)
                grads = new_grads
            if self.grad_clip is not None:
                if isinstance(self.grad_clip, tlx.ops.ClipByGlobalNorm):
                    new_grads, _ = self.grad_clip(grads)
                else:
                    new_grads = []
                    for g in grads:
                        new_grads.append(self.grad_clip(g))
                grads = new_grads
            grads_and_vars = zip(grads, vars)
        self.adam.apply_gradients(grads_and_vars)
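
# A sketch of the three `grad_clip` strategies the wrappers accept. The
# constructor arguments shown (clip_min, clip_max, clip_norm) are assumptions
# about the tlx.ops clipping classes; treat them as illustrative only.
def _example_grad_clip(grads, weights):
    # Element-wise strategies: each gradient tensor is clipped on its own.
    by_value = Adam(lr=0.001, grad_clip=tlx.ops.ClipGradByValue(clip_min=-1.0, clip_max=1.0))
    by_norm = Adam(lr=0.001, grad_clip=tlx.ops.ClipGradByNorm(clip_norm=1.0))
    # Global-norm strategy: the whole gradient list is rescaled together; the
    # clip object returns (clipped_grads, global_norm), which apply_gradients
    # unpacks in its isinstance branch.
    by_global_norm = Adam(lr=0.001, grad_clip=tlx.ops.ClipByGlobalNorm(clip_norm=1.0))
    by_global_norm.apply_gradients(zip(grads, weights))
    return by_value, by_norm, by_global_norm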

class Adamax(object):
    """Optimizer that implements the Adamax algorithm. Equivalent to tf.optimizers.Adamax.

    References
    ----------
    - https://tensorflow.google.cn/api_docs/python/tf/keras/optimizers/Adamax?hl=en

    Parameters
    ----------
    lr : A Tensor, floating point value
        The learning rate. Defaults to 0.001.
    beta_1 : float or constant float tensor
        The exponential decay rate for the 1st moment estimates. Defaults to 0.9.
    beta_2 : float or constant float tensor
        The exponential decay rate for the exponentially weighted infinity norm. Defaults to 0.999.
    eps : float
        A small constant for numerical stability. Defaults to 1e-7.
    weight_decay : float
        weight decay (L2 penalty) (default: 0.0)
    grad_clip : GradientClip or None
        Gradient clipping strategy. There are three clipping strategies
        (`tlx.ops.ClipGradByValue`, `tlx.ops.ClipGradByNorm`, `tlx.ops.ClipByGlobalNorm`).
        Default None, meaning there is no gradient clipping.

    Examples
    --------
    With TensorLayerX

    >>> import tensorlayerx as tlx
    >>> optimizer = tlx.optimizers.Adamax(0.001)
    >>> optimizer.apply_gradients(zip(grad, train_weights))

    """

    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, eps=1e-07, weight_decay=0.0, grad_clip=None):
        self.lr = lr
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.eps = eps
        if weight_decay < 0.0:
            raise ValueError("weight_decay should not be smaller than 0.0, but got {}".format(weight_decay))
        self.weight_decay = tf.convert_to_tensor(float(weight_decay))
        self.grad_clip = grad_clip
        self.adamax = tf.optimizers.Adamax(
            learning_rate=self.lr, beta_1=self.beta_1, beta_2=self.beta_2, epsilon=self.eps
        )

    def apply_gradients(self, grads_and_vars):
        if grads_and_vars is None:
            raise ValueError('grads_and_vars is not set.')
        if self.weight_decay != 0.0 or self.grad_clip is not None:
            grads, vars = zip(*grads_and_vars)
            if self.weight_decay != 0.0:
                new_grads = []
                for grad, var in zip(grads, vars):
                    grad = grad + self.weight_decay * var
                    new_grads.append(grad)
                grads = new_grads
            if self.grad_clip is not None:
                if isinstance(self.grad_clip, tlx.ops.ClipByGlobalNorm):
                    new_grads, _ = self.grad_clip(grads)
                else:
                    new_grads = []
                    for g in grads:
                        new_grads.append(self.grad_clip(g))
                grads = new_grads
            grads_and_vars = zip(grads, vars)
        self.adamax.apply_gradients(grads_and_vars)

class Ftrl(object):
    """Optimizer that implements the FTRL algorithm. Equivalent to tf.optimizers.Ftrl.

    References
    ----------
    - https://tensorflow.google.cn/api_docs/python/tf/keras/optimizers/Ftrl?hl=en

    Parameters
    ----------
    lr : A Tensor, floating point value
        The learning rate. Defaults to 0.001.
    lr_power : float
        Controls how the learning rate decreases during training. Use zero for a fixed learning rate.
    initial_accumulator_value : float
        The starting value for accumulators. Only zero or positive values are allowed.
    l1_regularization_strength : float
        A float value, must be greater than or equal to zero. Defaults to 0.0.
    l2_regularization_strength : float
        A float value, must be greater than or equal to zero. Defaults to 0.0.
    l2_shrinkage_regularization_strength : float
        This differs from L2 above in that the L2 above is a stabilization penalty, whereas this
        L2 shrinkage is a magnitude penalty. When input is sparse, shrinkage will only happen on
        the active weights.
    beta : float
        A float value, representing the beta value from the paper. Defaults to 0.0.
    weight_decay : float
        weight decay (L2 penalty) (default: 0.0)
    grad_clip : GradientClip or None
        Gradient clipping strategy. There are three clipping strategies
        (`tlx.ops.ClipGradByValue`, `tlx.ops.ClipGradByNorm`, `tlx.ops.ClipByGlobalNorm`).
        Default None, meaning there is no gradient clipping.

    Examples
    --------
    With TensorLayerX

    >>> import tensorlayerx as tlx
    >>> optimizer = tlx.optimizers.Ftrl(0.001)
    >>> optimizer.apply_gradients(zip(grad, train_weights))

    """

    def __init__(
        self, lr=0.001, lr_power=-0.5, initial_accumulator_value=0.1, l1_regularization_strength=0.0,
        l2_regularization_strength=0.0, beta=0.0, l2_shrinkage_regularization_strength=0.0, weight_decay=0.0,
        grad_clip=None
    ):
        self.lr = lr
        self.lr_power = lr_power
        self.initial_accumulator_value = initial_accumulator_value
        self.l1_regularization_strength = l1_regularization_strength
        self.l2_regularization_strength = l2_regularization_strength
        self.beta = beta
        self.l2_shrinkage_regularization_strength = l2_shrinkage_regularization_strength
        if weight_decay < 0.0:
            raise ValueError("weight_decay should not be smaller than 0.0, but got {}".format(weight_decay))
        self.weight_decay = tf.convert_to_tensor(float(weight_decay))
        self.grad_clip = grad_clip
        self.ftrl = tf.optimizers.Ftrl(
            learning_rate=self.lr, learning_rate_power=self.lr_power,
            initial_accumulator_value=self.initial_accumulator_value,
            l1_regularization_strength=self.l1_regularization_strength,
            l2_regularization_strength=self.l2_regularization_strength, beta=self.beta,
            l2_shrinkage_regularization_strength=self.l2_shrinkage_regularization_strength
        )

    def apply_gradients(self, grads_and_vars):
        if grads_and_vars is None:
            raise ValueError('grads_and_vars is not set.')
        if self.weight_decay != 0.0 or self.grad_clip is not None:
            grads, vars = zip(*grads_and_vars)
            if self.weight_decay != 0.0:
                new_grads = []
                for grad, var in zip(grads, vars):
                    grad = grad + self.weight_decay * var
                    new_grads.append(grad)
                grads = new_grads
            if self.grad_clip is not None:
                if isinstance(self.grad_clip, tlx.ops.ClipByGlobalNorm):
                    new_grads, _ = self.grad_clip(grads)
                else:
                    new_grads = []
                    for g in grads:
                        new_grads.append(self.grad_clip(g))
                grads = new_grads
            grads_and_vars = zip(grads, vars)
        self.ftrl.apply_gradients(grads_and_vars)

class Nadam(object):
    """Optimizer that implements the NAdam algorithm. Equivalent to tf.optimizers.Nadam.

    References
    ----------
    - https://tensorflow.google.cn/api_docs/python/tf/keras/optimizers/Nadam?hl=en

    Parameters
    ----------
    lr : A Tensor, floating point value
        The learning rate. Defaults to 0.001.
    beta_1 : float or constant float tensor
        The exponential decay rate for the 1st moment estimates. Defaults to 0.9.
    beta_2 : float or constant float tensor
        The exponential decay rate for the 2nd moment estimates. Defaults to 0.999.
    eps : float
        A small constant for numerical stability. Defaults to 1e-7.
    weight_decay : float
        weight decay (L2 penalty) (default: 0.0)
    grad_clip : GradientClip or None
        Gradient clipping strategy. There are three clipping strategies
        (`tlx.ops.ClipGradByValue`, `tlx.ops.ClipGradByNorm`, `tlx.ops.ClipByGlobalNorm`).
        Default None, meaning there is no gradient clipping.

    Examples
    --------
    With TensorLayerX

    >>> import tensorlayerx as tlx
    >>> optimizer = tlx.optimizers.Nadam(0.001)
    >>> optimizer.apply_gradients(zip(grad, train_weights))

    """

    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, eps=1e-07, weight_decay=0.0, grad_clip=None):
        self.lr = lr
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.eps = eps
        if weight_decay < 0.0:
            raise ValueError("weight_decay should not be smaller than 0.0, but got {}".format(weight_decay))
        self.weight_decay = tf.convert_to_tensor(float(weight_decay))
        self.grad_clip = grad_clip
        self.nadam = tf.optimizers.Nadam(
            learning_rate=self.lr, beta_1=self.beta_1, beta_2=self.beta_2, epsilon=self.eps
        )

    def apply_gradients(self, grads_and_vars):
        if grads_and_vars is None:
            raise ValueError('grads_and_vars is not set.')
        if self.weight_decay != 0.0 or self.grad_clip is not None:
            grads, vars = zip(*grads_and_vars)
            if self.weight_decay != 0.0:
                new_grads = []
                for grad, var in zip(grads, vars):
                    grad = grad + self.weight_decay * var
                    new_grads.append(grad)
                grads = new_grads
            if self.grad_clip is not None:
                if isinstance(self.grad_clip, tlx.ops.ClipByGlobalNorm):
                    new_grads, _ = self.grad_clip(grads)
                else:
                    new_grads = []
                    for g in grads:
                        new_grads.append(self.grad_clip(g))
                grads = new_grads
            grads_and_vars = zip(grads, vars)
        self.nadam.apply_gradients(grads_and_vars)

class RMSprop(object):
    """Optimizer that implements the RMSprop algorithm. Equivalent to tf.optimizers.RMSprop.

    References
    ----------
    - https://tensorflow.google.cn/api_docs/python/tf/keras/optimizers/RMSprop?hl=en

    Parameters
    ----------
    lr : A Tensor, floating point value
        The learning rate. Defaults to 0.001.
    rho : float
        Discounting factor for the history/coming gradient. Defaults to 0.9.
    momentum : float
        A scalar or a scalar Tensor. Defaults to 0.0.
    eps : float
        A small constant for numerical stability. Defaults to 1e-7.
    centered : bool
        If True, gradients are normalized by the estimated variance of the gradient; if False,
        by the uncentered second moment. Setting this to True may help with training, but is
        slightly more expensive in terms of computation and memory. Defaults to False.
    weight_decay : float
        weight decay (L2 penalty) (default: 0.0)
    grad_clip : GradientClip or None
        Gradient clipping strategy. There are three clipping strategies
        (`tlx.ops.ClipGradByValue`, `tlx.ops.ClipGradByNorm`, `tlx.ops.ClipByGlobalNorm`).
        Default None, meaning there is no gradient clipping.

    Examples
    --------
    With TensorLayerX

    >>> import tensorlayerx as tlx
    >>> optimizer = tlx.optimizers.RMSprop(0.001)
    >>> optimizer.apply_gradients(zip(grad, train_weights))

    """

    def __init__(self, lr=0.001, rho=0.9, momentum=0.0, eps=1e-07, centered=False, weight_decay=0.0, grad_clip=None):
        self.lr = lr
        self.rho = rho
        self.momentum = momentum
        self.eps = eps
        self.centered = centered
        if weight_decay < 0.0:
            raise ValueError("weight_decay should not be smaller than 0.0, but got {}".format(weight_decay))
        self.weight_decay = tf.convert_to_tensor(float(weight_decay))
        self.grad_clip = grad_clip
        self.rmsprop = tf.optimizers.RMSprop(
            learning_rate=self.lr, rho=self.rho, momentum=self.momentum, epsilon=self.eps, centered=self.centered
        )

    def apply_gradients(self, grads_and_vars):
        if grads_and_vars is None:
            raise ValueError('grads_and_vars is not set.')
        if self.weight_decay != 0.0 or self.grad_clip is not None:
            grads, vars = zip(*grads_and_vars)
            if self.weight_decay != 0.0:
                new_grads = []
                for grad, var in zip(grads, vars):
                    grad = grad + self.weight_decay * var
                    new_grads.append(grad)
                grads = new_grads
            if self.grad_clip is not None:
                if isinstance(self.grad_clip, tlx.ops.ClipByGlobalNorm):
                    new_grads, _ = self.grad_clip(grads)
                else:
                    new_grads = []
                    for g in grads:
                        new_grads.append(self.grad_clip(g))
                grads = new_grads
            grads_and_vars = zip(grads, vars)
        self.rmsprop.apply_gradients(grads_and_vars)
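
# A brief sketch of the `centered` flag: the centered variant additionally
# tracks a running mean of the gradient and divides by the estimated variance
# rather than the uncentered second moment, costing one extra slot variable
# per parameter. Values below are illustrative.
def _example_rmsprop_centered():
    w = tf.Variable([1.0])
    opt = RMSprop(lr=0.01, momentum=0.9, centered=True)
    opt.apply_gradients(zip([tf.constant([0.2])], [w]))
    return w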

class SGD(object):
    """Gradient descent (with momentum) optimizer. Equivalent to tf.optimizers.SGD.

    References
    ----------
    - https://tensorflow.google.cn/api_docs/python/tf/keras/optimizers/SGD?hl=en

    Parameters
    ----------
    lr : A Tensor, floating point value
        The learning rate. Defaults to 0.01.
    momentum : float
        float hyperparameter >= 0 that accelerates gradient descent in the relevant direction
        and dampens oscillations. Defaults to 0, i.e., vanilla gradient descent.
    weight_decay : float
        weight decay (L2 penalty) (default: 0.0)
    grad_clip : GradientClip or None
        Gradient clipping strategy. There are three clipping strategies
        (`tlx.ops.ClipGradByValue`, `tlx.ops.ClipGradByNorm`, `tlx.ops.ClipByGlobalNorm`).
        Default None, meaning there is no gradient clipping.

    Examples
    --------
    With TensorLayerX

    >>> import tensorlayerx as tlx
    >>> optimizer = tlx.optimizers.SGD(0.01)
    >>> optimizer.apply_gradients(zip(grad, train_weights))

    """

    def __init__(self, lr=0.01, momentum=0.0, weight_decay=0.0, grad_clip=None):
        self.lr = lr
        self.momentum = momentum
        if weight_decay < 0.0:
            raise ValueError("weight_decay should not be smaller than 0.0, but got {}".format(weight_decay))
        self.weight_decay = tf.convert_to_tensor(float(weight_decay))
        self.grad_clip = grad_clip
        self.sgd = tf.optimizers.SGD(learning_rate=self.lr, momentum=self.momentum, nesterov=False)

    def apply_gradients(self, grads_and_vars):
        if grads_and_vars is None:
            raise ValueError('grads_and_vars is not set.')
        if self.weight_decay != 0.0 or self.grad_clip is not None:
            grads, vars = zip(*grads_and_vars)
            if self.weight_decay != 0.0:
                new_grads = []
                for grad, var in zip(grads, vars):
                    grad = grad + self.weight_decay * var
                    new_grads.append(grad)
                grads = new_grads
            if self.grad_clip is not None:
                if isinstance(self.grad_clip, tlx.ops.ClipByGlobalNorm):
                    new_grads, _ = self.grad_clip(grads)
                else:
                    new_grads = []
                    for g in grads:
                        new_grads.append(self.grad_clip(g))
                grads = new_grads
            grads_and_vars = zip(grads, vars)
        self.sgd.apply_gradients(grads_and_vars)
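
# The `weight_decay` handling in these wrappers is a coupled L2 penalty: each
# gradient becomes g + weight_decay * w before the update, which is equivalent
# to adding (weight_decay / 2) * ||w||^2 to the loss. A small numeric sketch
# (variable and values are illustrative):
def _example_weight_decay():
    w = tf.Variable([2.0])
    opt = SGD(lr=0.1, weight_decay=0.01)
    opt.apply_gradients(zip([tf.constant([0.5])], [w]))
    # Plain SGD step with the decayed gradient:
    # w - lr * (g + wd * w) = 2.0 - 0.1 * (0.5 + 0.01 * 2.0) = 1.948
    return w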

class Momentum(object):
    """Optimizer that implements the Momentum algorithm. Equivalent to tf.compat.v1.train.MomentumOptimizer.

    References
    ----------
    - https://tensorflow.google.cn/api_docs/python/tf/compat/v1/train/MomentumOptimizer?hl=en&version=nightly

    Parameters
    ----------
    lr : A Tensor, floating point value
        The learning rate. Defaults to 0.01.
    momentum : float
        A Tensor or a floating point value. The momentum. Defaults to 0.
    nesterov : bool
        If True use Nesterov Momentum. See (Sutskever et al., 2013). This implementation always
        computes gradients at the value of the variable(s) passed to the optimizer. Using Nesterov
        Momentum makes the variable(s) track the values called theta_t + mu*v_t in the paper.
        This implementation is an approximation of the original formula, valid for high values of
        momentum. It will compute the "adjusted gradient" in NAG by assuming that the new gradient
        will be estimated by the current average gradient plus the product of momentum and the
        change in the average gradient.
    weight_decay : float
        weight decay (L2 penalty) (default: 0.0)
    grad_clip : GradientClip or None
        Gradient clipping strategy. There are three clipping strategies
        (`tlx.ops.ClipGradByValue`, `tlx.ops.ClipGradByNorm`, `tlx.ops.ClipByGlobalNorm`).
        Default None, meaning there is no gradient clipping.

    Examples
    --------
    With TensorLayerX

    >>> import tensorlayerx as tlx
    >>> optimizer = tlx.optimizers.Momentum(0.01, momentum=0.9)
    >>> optimizer.apply_gradients(zip(grad, train_weights))

    """

    def __init__(self, lr=0.01, momentum=0.0, nesterov=False, weight_decay=0.0, grad_clip=None):
        self.lr = lr
        self.momentum = momentum
        if weight_decay < 0.0:
            raise ValueError("weight_decay should not be smaller than 0.0, but got {}".format(weight_decay))
        self.weight_decay = tf.convert_to_tensor(float(weight_decay))
        self.grad_clip = grad_clip
        self.nesterov = nesterov
        self.sgd = tf.optimizers.SGD(learning_rate=self.lr, momentum=self.momentum, nesterov=self.nesterov)

    def apply_gradients(self, grads_and_vars):
        if grads_and_vars is None:
            raise ValueError('grads_and_vars is not set.')
        if self.weight_decay != 0.0 or self.grad_clip is not None:
            grads, vars = zip(*grads_and_vars)
            if self.weight_decay != 0.0:
                new_grads = []
                for grad, var in zip(grads, vars):
                    grad = grad + self.weight_decay * var
                    new_grads.append(grad)
                grads = new_grads
            if self.grad_clip is not None:
                if isinstance(self.grad_clip, tlx.ops.ClipByGlobalNorm):
                    new_grads, _ = self.grad_clip(grads)
                else:
                    new_grads = []
                    for g in grads:
                        new_grads.append(self.grad_clip(g))
                grads = new_grads
            grads_and_vars = zip(grads, vars)
        self.sgd.apply_gradients(grads_and_vars)

class Lamb(object):
    """Optimizer that implements the Layer-wise Adaptive Moments (LAMB) algorithm.

    References
    ----------
    - https://tensorflow.google.cn/addons/api_docs/python/tfa/optimizers/LAMB?hl=en

    """

    def __init__(self):
        raise NotImplementedError(
            'The Layer-wise Adaptive Moments (LAMB) optimizer is not implemented for the TensorFlow backend.'
        )

class LARS(object):
    """LARS is an optimization algorithm employing a large-batch optimization technique.
    Refer to the paper LARGE BATCH TRAINING OF CONVOLUTIONAL NETWORKS.

    References
    ----------
    - https://www.mindspore.cn/docs/api/zh-CN/r1.5/api_python/nn/mindspore.nn.LARS.html?highlight=lars#mindspore.nn.LARS

    """

    def __init__(self):
        raise NotImplementedError('The LARS optimizer is not implemented for the TensorFlow backend.')