Module keras.optimizer_v2.legacy_learning_rate_decay
Various learning rate decay functions.
Expand source code
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Various learning rate decay functions."""
import tensorflow.compat.v2 as tf
import functools
from keras.optimizer_v2 import learning_rate_schedule
from tensorflow.python.util.tf_export import tf_export
@tf_export(v1=["train.exponential_decay"])
def exponential_decay(learning_rate,
global_step,
decay_steps,
decay_rate,
staircase=False,
name=None):
"""Applies exponential decay to the learning rate.
When training a model, it is often recommended to lower the learning rate as
the training progresses. This function applies an exponential decay function
to a provided initial learning rate. It requires a `global_step` value to
compute the decayed learning rate. You can just pass a TensorFlow variable
that you increment at each training step.
The function returns the decayed learning rate. It is computed as:
```python
decayed_learning_rate = learning_rate *
decay_rate ^ (global_step / decay_steps)
```
If the argument `staircase` is `True`, then `global_step / decay_steps` is an
integer division and the decayed learning rate follows a staircase function.
Example: decay every 100000 steps with a base of 0.96:
```python
...
global_step = tf.Variable(0, trainable=False)
starter_learning_rate = 0.1
learning_rate = tf.compat.v1.train.exponential_decay(starter_learning_rate,
global_step,
100000, 0.96, staircase=True)
# Passing global_step to minimize() will increment it at each step.
learning_step = (
tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
.minimize(...my loss..., global_step=global_step)
)
```
Args:
learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number.
The initial learning rate.
global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global
step to use for the decay computation. Must not be negative.
decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Must
be positive. See the decay computation above.
decay_rate: A scalar `float32` or `float64` `Tensor` or a Python number.
The decay rate.
staircase: Boolean. If `True`, decay the learning rate at discrete intervals.
name: String. Optional name of the operation. Defaults to
'ExponentialDecay'.
Returns:
A scalar `Tensor` of the same type as `learning_rate`. The decayed
learning rate.
Raises:
ValueError: if `global_step` is not supplied.
@compatibility(eager)
When eager execution is enabled, this function returns a function which in
turn returns the decayed learning rate Tensor. This can be useful for changing
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
decayed_lr = learning_rate_schedule.ExponentialDecay(
learning_rate, decay_steps, decay_rate, staircase=staircase, name=name)
if not tf.executing_eagerly():
decayed_lr = decayed_lr(global_step)
else:
decayed_lr = functools.partial(decayed_lr, global_step)
return decayed_lr
@tf_export(v1=["train.piecewise_constant_decay", "train.piecewise_constant"])
def piecewise_constant(x, boundaries, values, name=None):
"""Piecewise constant from boundaries and interval values.
Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5
for the next 10000 steps, and 0.1 for any additional steps.
```python
global_step = tf.Variable(0, trainable=False)
boundaries = [100000, 110000]
values = [1.0, 0.5, 0.1]
learning_rate = tf.compat.v1.train.piecewise_constant(global_step, boundaries,
values)
# Later, whenever we perform an optimization step, we increment global_step.
```
Args:
x: A 0-D scalar `Tensor`. Must be one of the following types: `float32`,
`float64`, `uint8`, `int8`, `int16`, `int32`, `int64`.
boundaries: A list of `Tensor`s or `int`s or `float`s with strictly
increasing entries, and with all elements having the same type as `x`.
values: A list of `Tensor`s or `float`s or `int`s that specifies the values
for the intervals defined by `boundaries`. It should have one more element
than `boundaries`, and all elements should have the same type.
name: A string. Optional name of the operation. Defaults to
'PiecewiseConstant'.
Returns:
A 0-D Tensor. Its value is `values[0]` when `x <= boundaries[0]`,
`values[1]` when `x > boundaries[0]` and `x <= boundaries[1]`, ...,
and values[-1] when `x > boundaries[-1]`.
Raises:
ValueError: if types of `x` and `boundaries` do not match, or types of all
`values` do not match or
the number of elements in the lists does not match.
@compatibility(eager)
When eager execution is enabled, this function returns a function which in
turn returns the decayed learning rate Tensor. This can be useful for changing
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
boundaries = tf.nest.map_structure(tf.convert_to_tensor,
tf.nest.flatten(boundaries))
values = tf.nest.map_structure(tf.convert_to_tensor,
tf.nest.flatten(values))
x_recomp = tf.convert_to_tensor(x)
# Avoid explicit conversion to x's dtype. This could result in faulty
# comparisons, for example if floats are converted to integers.
for i, b in enumerate(boundaries):
if b.dtype.base_dtype != x_recomp.dtype.base_dtype:
# We can promote int32 boundaries to int64 without loss of precision.
# This covers the most common case where the user passes in boundaries
# as an array of Python integers.
if (b.dtype.base_dtype == tf.int32 and
x_recomp.dtype.base_dtype == tf.int64):
b = tf.cast(b, x_recomp.dtype.base_dtype)
boundaries[i] = b
else:
raise ValueError(
"Boundaries (%s) must have the same dtype as x (%s)." %
(b.dtype.base_dtype, x_recomp.dtype.base_dtype))
for v in values[1:]:
if v.dtype.base_dtype != values[0].dtype.base_dtype:
raise ValueError(
"Values must have elements all with the same dtype (%s vs %s)." %
(values[0].dtype.base_dtype, v.dtype.base_dtype))
decayed_lr = learning_rate_schedule.PiecewiseConstantDecay(
boundaries, values, name=name)
if not tf.executing_eagerly():
decayed_lr = decayed_lr(x)
else:
decayed_lr = functools.partial(decayed_lr, x)
return decayed_lr
@tf_export(v1=["train.polynomial_decay"])
def polynomial_decay(learning_rate,
global_step,
decay_steps,
end_learning_rate=0.0001,
power=1.0,
cycle=False,
name=None):
"""Applies a polynomial decay to the learning rate.
It is commonly observed that a monotonically decreasing learning rate, whose
degree of change is carefully chosen, results in a better performing model.
This function applies a polynomial decay function to a provided initial
`learning_rate` to reach an `end_learning_rate` in the given `decay_steps`.
It requires a `global_step` value to compute the decayed learning rate. You
can just pass a TensorFlow variable that you increment at each training step.
The function returns the decayed learning rate. It is computed as:
```python
global_step = min(global_step, decay_steps)
decayed_learning_rate = (learning_rate - end_learning_rate) *
(1 - global_step / decay_steps) ^ (power) +
end_learning_rate
```
If `cycle` is True, then a multiple of `decay_steps` is used, the first one
that is bigger than `global_step`.
```python
decay_steps = decay_steps * ceil(global_step / decay_steps)
decayed_learning_rate = (learning_rate - end_learning_rate) *
(1 - global_step / decay_steps) ^ (power) +
end_learning_rate
```
Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5):
```python
...
global_step = tf.Variable(0, trainable=False)
starter_learning_rate = 0.1
end_learning_rate = 0.01
decay_steps = 10000
learning_rate = tf.compat.v1.train.polynomial_decay(starter_learning_rate,
global_step,
decay_steps, end_learning_rate,
power=0.5)
# Passing global_step to minimize() will increment it at each step.
learning_step = (
tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
.minimize(...my loss..., global_step=global_step)
)
```
Args:
learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number.
The initial learning rate.
global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global
step to use for the decay computation. Must not be negative.
decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Must
be positive. See the decay computation above.
end_learning_rate: A scalar `float32` or `float64` `Tensor` or a Python
number. The minimal end learning rate.
power: A scalar `float32` or `float64` `Tensor` or a Python number. The
power of the polynomial. Defaults to linear, 1.0.
cycle: A boolean, whether or not it should cycle beyond decay_steps.
name: String. Optional name of the operation. Defaults to
'PolynomialDecay'.
Returns:
A scalar `Tensor` of the same type as `learning_rate`. The decayed
learning rate.
Raises:
ValueError: if `global_step` is not supplied.
@compatibility(eager)
When eager execution is enabled, this function returns a function which in
turn returns the decayed learning rate Tensor. This can be useful for changing
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
decayed_lr = learning_rate_schedule.PolynomialDecay(
learning_rate,
decay_steps,
end_learning_rate=end_learning_rate,
power=power,
cycle=cycle,
name=name)
if not tf.executing_eagerly():
decayed_lr = decayed_lr(global_step)
else:
decayed_lr = functools.partial(decayed_lr, global_step)
return decayed_lr
@tf_export(v1=["train.natural_exp_decay"])
def natural_exp_decay(learning_rate,
global_step,
decay_steps,
decay_rate,
staircase=False,
name=None):
"""Applies natural exponential decay to the initial learning rate.
When training a model, it is often recommended to lower the learning rate as
the training progresses. This function applies an exponential decay function
to a provided initial learning rate. It requires a `global_step` value to
compute the decayed learning rate. You can just pass a TensorFlow variable
that you increment at each training step.
The function returns the decayed learning rate. It is computed as:
```python
decayed_learning_rate = learning_rate * exp(-decay_rate * global_step /
decay_step)
```
or, if `staircase` is `True`, as:
```python
decayed_learning_rate = learning_rate * exp(-decay_rate * floor(global_step /
decay_step))
```
Example: decay exponentially with a base of 0.96:
```python
...
global_step = tf.Variable(0, trainable=False)
learning_rate = 0.1
decay_steps = 5
k = 0.5
learning_rate = tf.compat.v1.train.natural_exp_decay(learning_rate,
global_step,
decay_steps, k)
# Passing global_step to minimize() will increment it at each step.
learning_step = (
tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
.minimize(...my loss..., global_step=global_step)
)
```
Args:
learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number.
The initial learning rate.
global_step: A Python number. Global step to use for the decay computation.
Must not be negative.
decay_steps: How often to apply decay.
decay_rate: A Python number. The decay rate.
staircase: Whether to apply decay in a discrete staircase, as opposed to
continuous, fashion.
name: String. Optional name of the operation. Defaults to
'ExponentialTimeDecay'.
Returns:
A scalar `Tensor` of the same type as `learning_rate`. The decayed
learning rate.
Raises:
ValueError: if `global_step` is not supplied.
@compatibility(eager)
When eager execution is enabled, this function returns a function which in
turn returns the decayed learning rate Tensor. This can be useful for changing
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
natural_exp_rate = tf.exp(tf.negative(decay_rate))
decayed_lr = learning_rate_schedule.ExponentialDecay(
learning_rate,
decay_steps,
natural_exp_rate,
staircase=staircase,
name=name)
if not tf.executing_eagerly():
decayed_lr = decayed_lr(global_step)
else:
decayed_lr = functools.partial(decayed_lr, global_step)
return decayed_lr
@tf_export(v1=["train.inverse_time_decay"])
def inverse_time_decay(learning_rate,
global_step,
decay_steps,
decay_rate,
staircase=False,
name=None):
"""Applies inverse time decay to the initial learning rate.
When training a model, it is often recommended to lower the learning rate as
the training progresses. This function applies an inverse decay function
to a provided initial learning rate. It requires a `global_step` value to
compute the decayed learning rate. You can just pass a TensorFlow variable
that you increment at each training step.
The function returns the decayed learning rate. It is computed as:
```python
decayed_learning_rate = learning_rate / (1 + decay_rate * global_step /
decay_step)
```
or, if `staircase` is `True`, as:
```python
decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step /
decay_step))
```
Example: decay 1/t with a rate of 0.5:
```python
...
global_step = tf.Variable(0, trainable=False)
learning_rate = 0.1
decay_steps = 1.0
decay_rate = 0.5
learning_rate = tf.compat.v1.train.inverse_time_decay(learning_rate,
global_step,
decay_steps, decay_rate)
# Passing global_step to minimize() will increment it at each step.
learning_step = (
tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
.minimize(...my loss..., global_step=global_step)
)
```
Args:
learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number.
The initial learning rate.
global_step: A Python number. Global step to use for the decay computation.
Must not be negative.
decay_steps: How often to apply decay.
decay_rate: A Python number. The decay rate.
staircase: Whether to apply decay in a discrete staircase, as opposed to
continuous, fashion.
name: String. Optional name of the operation. Defaults to
'InverseTimeDecay'.
Returns:
A scalar `Tensor` of the same type as `learning_rate`. The decayed
learning rate.
Raises:
ValueError: if `global_step` is not supplied.
@compatibility(eager)
When eager execution is enabled, this function returns a function which in
turn returns the decayed learning rate Tensor. This can be useful for changing
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
decayed_lr = learning_rate_schedule.InverseTimeDecay(
learning_rate, decay_steps, decay_rate, staircase=staircase, name=name)
if not tf.executing_eagerly():
decayed_lr = decayed_lr(global_step)
else:
decayed_lr = functools.partial(decayed_lr, global_step)
return decayed_lr
@tf_export(v1=["train.cosine_decay"])
def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
"""Applies cosine decay to the learning rate.
When training a model, it is often recommended to lower the learning rate as
the training progresses. This function applies a cosine decay function
to a provided initial learning rate. It requires a `global_step` value to
compute the decayed learning rate. You can just pass a TensorFlow variable
that you increment at each training step.
The function returns the decayed learning rate. It is computed as:
```python
global_step = min(global_step, decay_steps)
cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps))
decayed = (1 - alpha) * cosine_decay + alpha
decayed_learning_rate = learning_rate * decayed
```
Example usage:
```python
decay_steps = 1000
lr_decayed = cosine_decay(learning_rate, global_step, decay_steps)
```
Args:
learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
The initial learning rate.
global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global
step to use for the decay computation.
decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number
of steps to decay over.
alpha: A scalar `float32` or `float64` Tensor or a Python number. Minimum
learning rate value as a fraction of learning_rate.
name: String. Optional name of the operation. Defaults to 'CosineDecay'.
Returns:
A scalar `Tensor` of the same type as `learning_rate`. The decayed
learning rate.
Raises:
ValueError: if `global_step` is not supplied.
References:
Stochastic Gradient Descent with Warm Restarts:
[Loshchilov et al., 2017]
(https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx)
([pdf](https://openreview.net/pdf?id=Skq89Scxx))
@compatibility(eager)
When eager execution is enabled, this function returns a function which in
turn returns the decayed learning rate Tensor. This can be useful for changing
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
decayed_lr = learning_rate_schedule.CosineDecay(
learning_rate, decay_steps, alpha=alpha, name=name)
if not tf.executing_eagerly():
decayed_lr = decayed_lr(global_step)
else:
decayed_lr = functools.partial(decayed_lr, global_step)
return decayed_lr
@tf_export(v1=["train.cosine_decay_restarts"])
def cosine_decay_restarts(learning_rate,
global_step,
first_decay_steps,
t_mul=2.0,
m_mul=1.0,
alpha=0.0,
name=None):
"""Applies cosine decay with restarts to the learning rate.
When training a model, it is often recommended to lower the learning rate as
the training progresses. This function applies a cosine decay function with
restarts to a provided initial learning rate. It requires a `global_step`
value to compute the decayed learning rate. You can just pass a TensorFlow
variable that you increment at each training step.
The function returns the decayed learning rate while taking into account
possible warm restarts. The learning rate multiplier first decays
from 1 to `alpha` for `first_decay_steps` steps. Then, a warm
restart is performed. Each new warm restart runs for `t_mul` times more steps
and with `m_mul` times smaller initial learning rate.
Example usage:
```python
first_decay_steps = 1000
lr_decayed = cosine_decay_restarts(learning_rate, global_step,
first_decay_steps)
```
Args:
learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
The initial learning rate.
global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global
step to use for the decay computation.
first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
Number of steps to decay over.
t_mul: A scalar `float32` or `float64` `Tensor` or a Python number. Used to
derive the number of iterations in the i-th period.
m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
Used to derive the initial learning rate of the i-th period.
alpha: A scalar `float32` or `float64` Tensor or a Python number. Minimum
learning rate value as a fraction of the learning_rate.
name: String. Optional name of the operation. Defaults to 'SGDRDecay'.
Returns:
A scalar `Tensor` of the same type as `learning_rate`. The decayed
learning rate.
Raises:
ValueError: if `global_step` is not supplied.
References:
Stochastic Gradient Descent with Warm Restarts:
[Loshchilov et al., 2017]
(https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx)
([pdf](https://openreview.net/pdf?id=Skq89Scxx))
@compatibility(eager)
When eager execution is enabled, this function returns a function which in
turn returns the decayed learning rate Tensor. This can be useful for changing
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
decayed_lr = learning_rate_schedule.CosineDecayRestarts(
learning_rate,
first_decay_steps,
t_mul=t_mul,
m_mul=m_mul,
alpha=alpha,
name=name)
if not tf.executing_eagerly():
decayed_lr = decayed_lr(global_step)
else:
decayed_lr = functools.partial(decayed_lr, global_step)
return decayed_lr
@tf_export(v1=["train.linear_cosine_decay"])
def linear_cosine_decay(learning_rate,
global_step,
decay_steps,
num_periods=0.5,
alpha=0.0,
beta=0.001,
name=None):
"""Applies linear cosine decay to the learning rate.
Note that linear cosine decay is more aggressive than cosine decay and
larger initial learning rates can typically be used.
When training a model, it is often recommended to lower the learning rate as
the training progresses. This function applies a linear cosine decay function
to a provided initial learning rate. It requires a `global_step` value to
compute the decayed learning rate. You can just pass a TensorFlow variable
that you increment at each training step.
The function returns the decayed learning rate. It is computed as:
```python
global_step = min(global_step, decay_steps)
linear_decay = (decay_steps - global_step) / decay_steps
cosine_decay = 0.5 * (
1 + cos(pi * 2 * num_periods * global_step / decay_steps))
decayed = (alpha + linear_decay) * cosine_decay + beta
decayed_learning_rate = learning_rate * decayed
```
Example usage:
```python
decay_steps = 1000
lr_decayed = linear_cosine_decay(learning_rate, global_step, decay_steps)
```
Args:
learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
The initial learning rate.
global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global
step to use for the decay computation.
decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number
of steps to decay over.
num_periods: Number of periods in the cosine part of the decay. See
computation above.
alpha: See computation above.
beta: See computation above.
name: String. Optional name of the operation. Defaults to
'LinearCosineDecay'.
Returns:
A scalar `Tensor` of the same type as `learning_rate`. The decayed
learning rate.
Raises:
ValueError: if `global_step` is not supplied.
References:
Neural Optimizer Search with Reinforcement Learning:
[Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html)
([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf))
Stochastic Gradient Descent with Warm Restarts:
[Loshchilov et al., 2017]
(https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx)
([pdf](https://openreview.net/pdf?id=Skq89Scxx))
@compatibility(eager)
When eager execution is enabled, this function returns a function which in
turn returns the decayed learning rate Tensor. This can be useful for changing
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
decayed_lr = learning_rate_schedule.LinearCosineDecay(
learning_rate,
decay_steps,
num_periods=num_periods,
alpha=alpha,
beta=beta,
name=name)
if not tf.executing_eagerly():
decayed_lr = decayed_lr(global_step)
else:
decayed_lr = functools.partial(decayed_lr, global_step)
return decayed_lr
@tf_export(v1=["train.noisy_linear_cosine_decay"])
def noisy_linear_cosine_decay(learning_rate,
global_step,
decay_steps,
initial_variance=1.0,
variance_decay=0.55,
num_periods=0.5,
alpha=0.0,
beta=0.001,
name=None):
"""Applies noisy linear cosine decay to the learning rate.
Note that linear cosine decay is more aggressive than cosine decay and
larger initial learning rates can typically be used.
When training a model, it is often recommended to lower the learning rate as
the training progresses. This function applies a noisy linear
cosine decay function to a provided initial learning rate.
It requires a `global_step` value to compute the decayed learning rate.
You can just pass a TensorFlow variable that you increment at each
training step.
The function returns the decayed learning rate. It is computed as:
```python
global_step = min(global_step, decay_steps)
linear_decay = (decay_steps - global_step) / decay_steps
cosine_decay = 0.5 * (
1 + cos(pi * 2 * num_periods * global_step / decay_steps))
decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
decayed_learning_rate = learning_rate * decayed
```
where eps_t is 0-centered Gaussian noise with variance
initial_variance / (1 + global_step) ** variance_decay
Example usage:
```python
decay_steps = 1000
lr_decayed = noisy_linear_cosine_decay(
learning_rate, global_step, decay_steps)
```
Args:
learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
The initial learning rate.
global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global
step to use for the decay computation.
decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number
of steps to decay over.
initial_variance: initial variance for the noise. See computation above.
variance_decay: decay for the noise's variance. See computation above.
num_periods: Number of periods in the cosine part of the decay. See
computation above.
alpha: See computation above.
beta: See computation above.
name: String. Optional name of the operation. Defaults to
'NoisyLinearCosineDecay'.
Returns:
A scalar `Tensor` of the same type as `learning_rate`. The decayed
learning rate.
Raises:
ValueError: if `global_step` is not supplied.
References:
Neural Optimizer Search with Reinforcement Learning:
[Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html)
([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf))
Stochastic Gradient Descent with Warm Restarts:
[Loshchilov et al., 2017]
(https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx)
([pdf](https://openreview.net/pdf?id=Skq89Scxx))
@compatibility(eager)
When eager execution is enabled, this function returns a function which in
turn returns the decayed learning rate Tensor. This can be useful for changing
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
decayed_lr = learning_rate_schedule.NoisyLinearCosineDecay(
learning_rate,
decay_steps,
initial_variance=initial_variance,
variance_decay=variance_decay,
num_periods=num_periods,
alpha=alpha,
beta=beta,
name=name)
if not tf.executing_eagerly():
decayed_lr = decayed_lr(global_step)
else:
decayed_lr = functools.partial(decayed_lr, global_step)
return decayed_lr
Functions
def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None)
-
Applies cosine decay to the learning rate.
When training a model, it is often recommended to lower the learning rate as the training progresses. This function applies a cosine decay function to a provided initial learning rate. It requires a `global_step` value to compute the decayed learning rate. You can just pass a TensorFlow variable that you increment at each training step.
The function returns the decayed learning rate. It is computed as:
global_step = min(global_step, decay_steps)
cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps))
decayed = (1 - alpha) * cosine_decay + alpha
decayed_learning_rate = learning_rate * decayed
Example usage:
decay_steps = 1000
lr_decayed = cosine_decay(learning_rate, global_step, decay_steps)
Args
learning_rate
- A scalar `float32` or `float64` Tensor or a Python number. The initial learning rate.
global_step
- A scalar `int32` or `int64` `Tensor` or a Python number. Global step to use for the decay computation.
decay_steps
- A scalar `int32` or `int64` `Tensor` or a Python number. Number of steps to decay over.
alpha
- A scalar `float32` or `float64` Tensor or a Python number. Minimum learning rate value as a fraction of learning_rate.
name
- String. Optional name of the operation. Defaults to 'CosineDecay'.
Returns
A scalar `Tensor` of the same type as `learning_rate`. The decayed learning rate.
Raises
ValueError
- if `global_step` is not supplied.
References
Stochastic Gradient Descent with Warm Restarts: [Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) ([pdf](https://openreview.net/pdf?id=Skq89Scxx))
@compatibility(eager) When eager execution is enabled, this function returns a function which in turn returns the decayed learning rate Tensor. This can be useful for changing the learning rate value across different invocations of optimizer functions. @end_compatibility
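For illustration, a minimal plain-Python sketch of the multiplier this schedule applies (the starter rate of 0.1 and the probe steps are assumptions for the example, not part of the API):
import math

learning_rate = 0.1   # assumed starter rate
decay_steps = 1000
alpha = 0.0           # default from the signature

def cosine_multiplier(step):
    # Mirrors the formula above: clamp the step, then anneal from 1 down to alpha.
    step = min(step, decay_steps)
    cosine = 0.5 * (1 + math.cos(math.pi * step / decay_steps))
    return (1 - alpha) * cosine + alpha

for step in (0, 250, 500, 1000):
    print(step, learning_rate * cosine_multiplier(step))  # 0.1 at step 0, 0.05 at 500, 0.0 at 1000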
def cosine_decay_restarts(learning_rate, global_step, first_decay_steps, t_mul=2.0, m_mul=1.0, alpha=0.0, name=None)
-
Applies cosine decay with restarts to the learning rate.
When training a model, it is often recommended to lower the learning rate as the training progresses. This function applies a cosine decay function with restarts to a provided initial learning rate. It requires a `global_step` value to compute the decayed learning rate. You can just pass a TensorFlow variable that you increment at each training step.
The function returns the decayed learning rate while taking into account possible warm restarts. The learning rate multiplier first decays from 1 to `alpha` for `first_decay_steps` steps. Then, a warm restart is performed. Each new warm restart runs for `t_mul` times more steps and with `m_mul` times smaller initial learning rate.
Example usage:
first_decay_steps = 1000
lr_decayed = cosine_decay_restarts(learning_rate, global_step, first_decay_steps)
Args
learning_rate
- A scalar `float32` or `float64` Tensor or a Python number. The initial learning rate.
global_step
- A scalar `int32` or `int64` `Tensor` or a Python number. Global step to use for the decay computation.
first_decay_steps
- A scalar `int32` or `int64` `Tensor` or a Python number. Number of steps to decay over.
t_mul
- A scalar `float32` or `float64` `Tensor` or a Python number. Used to derive the number of iterations in the i-th period.
m_mul
- A scalar `float32` or `float64` `Tensor` or a Python number. Used to derive the initial learning rate of the i-th period.
alpha
- A scalar `float32` or `float64` Tensor or a Python number. Minimum learning rate value as a fraction of the learning_rate.
name
- String. Optional name of the operation. Defaults to 'SGDRDecay'.
Returns
A scalar `Tensor` of the same type as `learning_rate`. The decayed learning rate.
Raises
ValueError
- if `global_step` is not supplied.
References
Stochastic Gradient Descent with Warm Restarts: [Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) ([pdf](https://openreview.net/pdf?id=Skq89Scxx))
@compatibility(eager) When eager execution is enabled, this function returns a function which in turn returns the decayed learning rate Tensor. This can be useful for changing the learning rate value across different invocations of optimizer functions. @end_compatibility
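As a rough sketch of the restart schedule described above (the 0.1 starter rate is an assumed value): with `first_decay_steps=1000` and the default `t_mul=2.0`, `m_mul=1.0`, successive decay periods run 1000, 2000, 4000, ... steps, each restarting from the full initial rate:
first_decay_steps = 1000
t_mul, m_mul = 2.0, 1.0   # defaults from the signature
period, restart_lr = first_decay_steps, 0.1

for i in range(4):
    print(f"period {i}: {int(period)} steps, restarts at lr={restart_lr}")
    period *= t_mul      # each new period runs t_mul times more steps
    restart_lr *= m_mul  # and starts with m_mul times smaller initial rate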
def exponential_decay(learning_rate, global_step, decay_steps, decay_rate, staircase=False, name=None)
-
Applies exponential decay to the learning rate.
When training a model, it is often recommended to lower the learning rate as the training progresses. This function applies an exponential decay function to a provided initial learning rate. It requires a `global_step` value to compute the decayed learning rate. You can just pass a TensorFlow variable that you increment at each training step.
The function returns the decayed learning rate. It is computed as:
decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
If the argument `staircase` is `True`, then `global_step / decay_steps` is an integer division and the decayed learning rate follows a staircase function.
Example: decay every 100000 steps with a base of 0.96:
...
global_step = tf.Variable(0, trainable=False)
starter_learning_rate = 0.1
learning_rate = tf.compat.v1.train.exponential_decay(starter_learning_rate,
                                                     global_step,
                                                     100000, 0.96, staircase=True)
# Passing global_step to minimize() will increment it at each step.
learning_step = (
    tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
    .minimize(...my loss..., global_step=global_step)
)
Args
learning_rate
- A scalar `float32` or `float64` `Tensor` or a Python number. The initial learning rate.
global_step
- A scalar `int32` or `int64` `Tensor` or a Python number. Global step to use for the decay computation. Must not be negative.
decay_steps
- A scalar `int32` or `int64` `Tensor` or a Python number. Must be positive. See the decay computation above.
decay_rate
- A scalar `float32` or `float64` `Tensor` or a Python number. The decay rate.
staircase
- Boolean. If `True`, decay the learning rate at discrete intervals.
name
- String. Optional name of the operation. Defaults to 'ExponentialDecay'.
Returns
A scalar `Tensor` of the same type as `learning_rate`. The decayed learning rate.
Raises
ValueError
- if `global_step` is not supplied.
@compatibility(eager) When eager execution is enabled, this function returns a function which in turn returns the decayed learning rate Tensor. This can be useful for changing the learning rate value across different invocations of optimizer functions. @end_compatibility
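A minimal sketch of that eager-mode behavior, assuming TensorFlow 2.x where eager execution is on by default (the numeric arguments are illustrative):
import tensorflow as tf

global_step = tf.Variable(0, trainable=False)
lr = tf.compat.v1.train.exponential_decay(0.1, global_step, 100000, 0.96)

if tf.executing_eagerly():
    # Under eager execution the wrapper returns a callable; invoke it to get
    # the decayed learning rate for the current global_step.
    print(float(lr()))
else:
    # In graph mode the wrapper already returns a scalar Tensor.
    print(lr)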
def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate, staircase=False, name=None)
-
Applies inverse time decay to the initial learning rate.
When training a model, it is often recommended to lower the learning rate as the training progresses. This function applies an inverse decay function to a provided initial learning rate. It requires a `global_step` value to compute the decayed learning rate. You can just pass a TensorFlow variable that you increment at each training step.
The function returns the decayed learning rate. It is computed as:
decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
or, if `staircase` is `True`, as:
decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
Example: decay 1/t with a rate of 0.5:
...
global_step = tf.Variable(0, trainable=False)
learning_rate = 0.1
decay_steps = 1.0
decay_rate = 0.5
learning_rate = tf.compat.v1.train.inverse_time_decay(learning_rate, global_step,
                                                      decay_steps, decay_rate)
# Passing global_step to minimize() will increment it at each step.
learning_step = (
    tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
    .minimize(...my loss..., global_step=global_step)
)
Args
learning_rate
- A scalar `float32` or `float64` `Tensor` or a Python number. The initial learning rate.
global_step
- A Python number. Global step to use for the decay computation. Must not be negative.
decay_steps
- How often to apply decay.
decay_rate
- A Python number. The decay rate.
staircase
- Whether to apply decay in a discrete staircase, as opposed to continuous, fashion.
name
- String. Optional name of the operation. Defaults to 'InverseTimeDecay'.
Returns
A scalar `Tensor` of the same type as `learning_rate`. The decayed learning rate.
Raises
ValueError
- if `global_step` is not supplied.
@compatibility(eager) When eager execution is enabled, this function returns a function which in turn returns the decayed learning rate Tensor. This can be useful for changing the learning rate value across different invocations of optimizer functions. @end_compatibility
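To make the 1/t behaviour concrete, a small plain-Python sketch of the continuous (non-staircase) formula above, reusing the example's assumed values:
learning_rate = 0.1
decay_steps, decay_rate = 1.0, 0.5

def inverse_time(step):
    # decayed = learning_rate / (1 + decay_rate * step / decay_steps)
    return learning_rate / (1 + decay_rate * step / decay_steps)

for step in (0, 1, 2, 10):
    print(step, round(inverse_time(step), 4))  # 0.1, 0.0667, 0.05, 0.0167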
def linear_cosine_decay(learning_rate, global_step, decay_steps, num_periods=0.5, alpha=0.0, beta=0.001, name=None)
-
Applies linear cosine decay to the learning rate.
Note that linear cosine decay is more aggressive than cosine decay and larger initial learning rates can typically be used.
When training a model, it is often recommended to lower the learning rate as the training progresses. This function applies a linear cosine decay function to a provided initial learning rate. It requires a `global_step` value to compute the decayed learning rate. You can just pass a TensorFlow variable that you increment at each training step.
The function returns the decayed learning rate. It is computed as:
global_step = min(global_step, decay_steps)
linear_decay = (decay_steps - global_step) / decay_steps
cosine_decay = 0.5 * (
    1 + cos(pi * 2 * num_periods * global_step / decay_steps))
decayed = (alpha + linear_decay) * cosine_decay + beta
decayed_learning_rate = learning_rate * decayed
Example usage:
decay_steps = 1000
lr_decayed = linear_cosine_decay(learning_rate, global_step, decay_steps)
Args
learning_rate
- A scalar `float32` or `float64` Tensor or a Python number. The initial learning rate.
global_step
- A scalar `int32` or `int64` `Tensor` or a Python number. Global step to use for the decay computation.
decay_steps
- A scalar `int32` or `int64` `Tensor` or a Python number. Number of steps to decay over.
num_periods
- Number of periods in the cosine part of the decay. See computation above.
alpha
- See computation above.
beta
- See computation above.
name
- String. Optional name of the operation. Defaults to 'LinearCosineDecay'.
Returns
A scalar `Tensor` of the same type as `learning_rate`. The decayed learning rate.
Raises
ValueError
- if `global_step` is not supplied.
References
Neural Optimizer Search with Reinforcement Learning: [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html) ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf))
Stochastic Gradient Descent with Warm Restarts: [Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) ([pdf](https://openreview.net/pdf?id=Skq89Scxx))
@compatibility(eager) When eager execution is enabled, this function returns a function which in turn returns the decayed learning rate Tensor. This can be useful for changing the learning rate value across different invocations of optimizer functions. @end_compatibility
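A minimal sketch of the combined linear and cosine terms with the signature's default `num_periods`, `alpha`, and `beta` (the 0.1 starter rate is an assumption):
import math

learning_rate, decay_steps = 0.1, 1000
num_periods, alpha, beta = 0.5, 0.0, 0.001  # defaults from the signature

def linear_cosine(step):
    # Mirrors the formula above.
    step = min(step, decay_steps)
    linear = (decay_steps - step) / decay_steps
    cosine = 0.5 * (1 + math.cos(math.pi * 2 * num_periods * step / decay_steps))
    return learning_rate * ((alpha + linear) * cosine + beta)

print([round(linear_cosine(s), 4) for s in (0, 500, 1000)])  # [0.1001, 0.0251, 0.0001]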
def natural_exp_decay(learning_rate, global_step, decay_steps, decay_rate, staircase=False, name=None)
-
Applies natural exponential decay to the initial learning rate.
When training a model, it is often recommended to lower the learning rate as the training progresses. This function applies an exponential decay function to a provided initial learning rate. It requires a `global_step` value to compute the decayed learning rate. You can just pass a TensorFlow variable that you increment at each training step.
The function returns the decayed learning rate. It is computed as:
decayed_learning_rate = learning_rate * exp(-decay_rate * global_step / decay_step)
or, if `staircase` is `True`, as:
decayed_learning_rate = learning_rate * exp(-decay_rate * floor(global_step / decay_step))
Example: decay exponentially with a base of 0.96:
...
global_step = tf.Variable(0, trainable=False)
learning_rate = 0.1
decay_steps = 5
k = 0.5
learning_rate = tf.compat.v1.train.natural_exp_decay(learning_rate, global_step,
                                                     decay_steps, k)
# Passing global_step to minimize() will increment it at each step.
learning_step = (
    tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
    .minimize(...my loss..., global_step=global_step)
)
Args
learning_rate
- A scalar `float32` or `float64` `Tensor` or a Python number. The initial learning rate.
global_step
- A Python number. Global step to use for the decay computation. Must not be negative.
decay_steps
- How often to apply decay.
decay_rate
- A Python number. The decay rate.
staircase
- Whether to apply decay in a discrete staircase, as opposed to continuous, fashion.
name
- String. Optional name of the operation. Defaults to 'ExponentialTimeDecay'.
Returns
A scalar `Tensor` of the same type as `learning_rate`. The decayed learning rate.
Raises
ValueError
- if `global_step` is not supplied.
@compatibility(eager) When eager execution is enabled, this function returns a function which in turn returns the decayed learning rate Tensor. This can be useful for changing the learning rate value across different invocations of optimizer functions. @end_compatibility
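As the wrapper's source above shows, this schedule is implemented as an `ExponentialDecay` whose rate is `exp(-decay_rate)`; a short plain-Python sketch of that equivalence (the numbers are the example's assumed values):
import math

learning_rate, decay_rate, decay_steps = 0.1, 0.5, 5

def natural_exp(step):
    return learning_rate * math.exp(-decay_rate * step / decay_steps)

def exponential_with_base(step):
    # Same curve written as ExponentialDecay with base exp(-decay_rate).
    return learning_rate * math.exp(-decay_rate) ** (step / decay_steps)

assert abs(natural_exp(7) - exponential_with_base(7)) < 1e-12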
def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps, initial_variance=1.0, variance_decay=0.55, num_periods=0.5, alpha=0.0, beta=0.001, name=None)
-
Applies noisy linear cosine decay to the learning rate.
Note that linear cosine decay is more aggressive than cosine decay and larger initial learning rates can typically be used.
When training a model, it is often recommended to lower the learning rate as the training progresses. This function applies a noisy linear cosine decay function to a provided initial learning rate. It requires a `global_step` value to compute the decayed learning rate. You can just pass a TensorFlow variable that you increment at each training step.
The function returns the decayed learning rate. It is computed as:
global_step = min(global_step, decay_steps)
linear_decay = (decay_steps - global_step) / decay_steps
cosine_decay = 0.5 * (
    1 + cos(pi * 2 * num_periods * global_step / decay_steps))
decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
decayed_learning_rate = learning_rate * decayed
where eps_t is 0-centered Gaussian noise with variance initial_variance / (1 + global_step) ** variance_decay.
Example usage:
decay_steps = 1000
lr_decayed = noisy_linear_cosine_decay(learning_rate, global_step, decay_steps)
Args
learning_rate
- A scalar `float32` or `float64` Tensor or a Python number. The initial learning rate.
global_step
- A scalar `int32` or `int64` `Tensor` or a Python number. Global step to use for the decay computation.
decay_steps
- A scalar `int32` or `int64` `Tensor` or a Python number. Number of steps to decay over.
initial_variance
- Initial variance for the noise. See computation above.
variance_decay
- Decay for the noise's variance. See computation above.
num_periods
- Number of periods in the cosine part of the decay. See computation above.
alpha
- See computation above.
beta
- See computation above.
name
- String. Optional name of the operation. Defaults to 'NoisyLinearCosineDecay'.
Returns
A scalar `Tensor` of the same type as `learning_rate`. The decayed learning rate.
Raises
ValueError
- if `global_step` is not supplied.
References
Neural Optimizer Search with Reinforcement Learning: [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html) ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf))
Stochastic Gradient Descent with Warm Restarts: [Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx) ([pdf](https://openreview.net/pdf?id=Skq89Scxx))
@compatibility(eager) When eager execution is enabled, this function returns a function which in turn returns the decayed learning rate Tensor. This can be useful for changing the learning rate value across different invocations of optimizer functions. @end_compatibility
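A small sketch of how the noise term's spread shrinks as training progresses, following the variance formula above (the probe steps are illustrative):
initial_variance, variance_decay = 1.0, 0.55  # defaults from the signature

def noise_stddev(step):
    # eps_t is drawn from N(0, initial_variance / (1 + step) ** variance_decay)
    return (initial_variance / (1 + step) ** variance_decay) ** 0.5

print([round(noise_stddev(s), 3) for s in (0, 100, 1000)])  # 1.0 at step 0, then steadily smaller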
Expand source code
@tf_export(v1=["train.noisy_linear_cosine_decay"]) def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps, initial_variance=1.0, variance_decay=0.55, num_periods=0.5, alpha=0.0, beta=0.001, name=None): """Applies noisy linear cosine decay to the learning rate. Note that linear cosine decay is more aggressive than cosine decay and larger initial learning rates can typically be used. When training a model, it is often recommended to lower the learning rate as the training progresses. This function applies a noisy linear cosine decay function to a provided initial learning rate. It requires a `global_step` value to compute the decayed learning rate. You can just pass a TensorFlow variable that you increment at each training step. The function returns the decayed learning rate. It is computed as: ```python global_step = min(global_step, decay_steps) linear_decay = (decay_steps - global_step) / decay_steps) cosine_decay = 0.5 * ( 1 + cos(pi * 2 * num_periods * global_step / decay_steps)) decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta decayed_learning_rate = learning_rate * decayed ``` where eps_t is 0-centered gaussian noise with variance initial_variance / (1 + global_step) ** variance_decay Example usage: ```python decay_steps = 1000 lr_decayed = noisy_linear_cosine_decay( learning_rate, global_step, decay_steps) ``` Args: learning_rate: A scalar `float32` or `float64` Tensor or a Python number. The initial learning rate. global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global step to use for the decay computation. decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number of steps to decay over. initial_variance: initial variance for the noise. See computation above. variance_decay: decay for the noise's variance. See computation above. num_periods: Number of periods in the cosine part of the decay. See computation above. alpha: See computation above. beta: See computation above. name: String. Optional name of the operation. Defaults to 'NoisyLinearCosineDecay'. Returns: A scalar `Tensor` of the same type as `learning_rate`. The decayed learning rate. Raises: ValueError: if `global_step` is not supplied. References: Neural Optimizer Search with Reinforcement Learning: [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html) ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf)) Stochastic Gradient Descent with Warm Restarts: [Loshchilov et al., 2017] (https://openreview.net/forum?id=Skq89Scxx¬eId=Skq89Scxx) ([pdf](https://openreview.net/pdf?id=Skq89Scxx)) @compatibility(eager) When eager execution is enabled, this function returns a function which in turn returns the decayed learning rate Tensor. This can be useful for changing the learning rate value across different invocations of optimizer functions. @end_compatibility """ decayed_lr = learning_rate_schedule.NoisyLinearCosineDecay( learning_rate, decay_steps, initial_variance=initial_variance, variance_decay=variance_decay, num_periods=num_periods, alpha=alpha, beta=beta, name=name) if not tf.executing_eagerly(): decayed_lr = decayed_lr(global_step) else: decayed_lr = functools.partial(decayed_lr, global_step) return decayed_lr
def piecewise_constant(x, boundaries, values, name=None)
-
Piecewise constant from boundaries and interval values.
Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5 for the next 10000 steps, and 0.1 for any additional steps.
global_step = tf.Variable(0, trainable=False)
boundaries = [100000, 110000]
values = [1.0, 0.5, 0.1]
learning_rate = tf.compat.v1.train.piecewise_constant(global_step, boundaries,
                                                      values)

# Later, whenever we perform an optimization step, we increment global_step.
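The interval rule described under Returns below can be mirrored with a small pure-Python helper. This is an illustrative sketch of the lookup semantics, not the library implementation:

```python
def piecewise_constant_value(x, boundaries, values):
    """Reference lookup: the value for the interval that contains x."""
    assert len(values) == len(boundaries) + 1
    for boundary, value in zip(boundaries, values):
        if x <= boundary:
            return value
    return values[-1]  # x > boundaries[-1]

boundaries = [100000, 110000]
values = [1.0, 0.5, 0.1]
print(piecewise_constant_value(50000, boundaries, values))   # 1.0
print(piecewise_constant_value(105000, boundaries, values))  # 0.5
print(piecewise_constant_value(200000, boundaries, values))  # 0.1
```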
Args
x
- A 0-D scalar `Tensor`. Must be one of the following types: `float32`, `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`.
boundaries
- A list of `Tensor`s or `int`s or `float`s with strictly increasing entries, and with all elements having the same type as `x`.
values
- A list of `Tensor`s or `float`s or `int`s that specifies the values for the intervals defined by `boundaries`. It should have one more element than `boundaries`, and all elements should have the same type.
name
- A string. Optional name of the operation. Defaults to 'PiecewiseConstant'.
Returns
A 0-D Tensor. Its value is `values[0]` when `x <= boundaries[0]`, `values[1]` when `x > boundaries[0]` and `x <= boundaries[1]`, ..., and `values[-1]` when `x > boundaries[-1]`.
Raises
ValueError
- if types of `x` and `boundaries` do not match, or types of all `values` do not match, or the number of elements in the lists does not match.
@compatibility(eager) When eager execution is enabled, this function returns a function which in turn returns the decayed learning rate Tensor. This can be useful for changing the learning rate value across different invocations of optimizer functions. @end_compatibility
Expand source code
@tf_export(v1=["train.piecewise_constant_decay", "train.piecewise_constant"]) def piecewise_constant(x, boundaries, values, name=None): """Piecewise constant from boundaries and interval values. Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5 for the next 10000 steps, and 0.1 for any additional steps. ```python global_step = tf.Variable(0, trainable=False) boundaries = [100000, 110000] values = [1.0, 0.5, 0.1] learning_rate = tf.compat.v1.train.piecewise_constant(global_step, boundaries, values) # Later, whenever we perform an optimization step, we increment global_step. ``` Args: x: A 0-D scalar `Tensor`. Must be one of the following types: `float32`, `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`. boundaries: A list of `Tensor`s or `int`s or `float`s with strictly increasing entries, and with all elements having the same type as `x`. values: A list of `Tensor`s or `float`s or `int`s that specifies the values for the intervals defined by `boundaries`. It should have one more element than `boundaries`, and all elements should have the same type. name: A string. Optional name of the operation. Defaults to 'PiecewiseConstant'. Returns: A 0-D Tensor. Its value is `values[0]` when `x <= boundaries[0]`, `values[1]` when `x > boundaries[0]` and `x <= boundaries[1]`, ..., and values[-1] when `x > boundaries[-1]`. Raises: ValueError: if types of `x` and `boundaries` do not match, or types of all `values` do not match or the number of elements in the lists does not match. @compatibility(eager) When eager execution is enabled, this function returns a function which in turn returns the decayed learning rate Tensor. This can be useful for changing the learning rate value across different invocations of optimizer functions. @end_compatibility """ boundaries = tf.nest.map_structure(tf.convert_to_tensor, tf.nest.flatten(boundaries)) values = tf.nest.map_structure(tf.convert_to_tensor, tf.nest.flatten(values)) x_recomp = tf.convert_to_tensor(x) # Avoid explicit conversion to x's dtype. This could result in faulty # comparisons, for example if floats are converted to integers. for i, b in enumerate(boundaries): if b.dtype.base_dtype != x_recomp.dtype.base_dtype: # We can promote int32 boundaries to int64 without loss of precision. # This covers the most common case where the user passes in boundaries # as an array of Python integers. if (b.dtype.base_dtype == tf.int32 and x_recomp.dtype.base_dtype == tf.int64): b = tf.cast(b, x_recomp.dtype.base_dtype) boundaries[i] = b else: raise ValueError( "Boundaries (%s) must have the same dtype as x (%s)." % (b.dtype.base_dtype, x_recomp.dtype.base_dtype)) for v in values[1:]: if v.dtype.base_dtype != values[0].dtype.base_dtype: raise ValueError( "Values must have elements all with the same dtype (%s vs %s)." % (values[0].dtype.base_dtype, v.dtype.base_dtype)) decayed_lr = learning_rate_schedule.PiecewiseConstantDecay( boundaries, values, name=name) if not tf.executing_eagerly(): decayed_lr = decayed_lr(x) else: decayed_lr = functools.partial(decayed_lr, x) return decayed_lr
def polynomial_decay(learning_rate, global_step, decay_steps, end_learning_rate=0.0001, power=1.0, cycle=False, name=None)
-
Applies a polynomial decay to the learning rate.
It is commonly observed that a monotonically decreasing learning rate, whose degree of change is carefully chosen, results in a better performing model. This function applies a polynomial decay function to a provided initial `learning_rate` to reach an `end_learning_rate` in the given `decay_steps`.
It requires a `global_step` value to compute the decayed learning rate. You can just pass a TensorFlow variable that you increment at each training step. The function returns the decayed learning rate. It is computed as:
global_step = min(global_step, decay_steps)
decayed_learning_rate = (learning_rate - end_learning_rate) *
                        (1 - global_step / decay_steps) ^ (power) +
                        end_learning_rate
If `cycle` is True then a multiple of `decay_steps` is used, the first one that is bigger than `global_step`:
decay_steps = decay_steps * ceil(global_step / decay_steps)
decayed_learning_rate = (learning_rate - end_learning_rate) *
                        (1 - global_step / decay_steps) ^ (power) +
                        end_learning_rate
Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5):
...
global_step = tf.Variable(0, trainable=False)
starter_learning_rate = 0.1
end_learning_rate = 0.01
decay_steps = 10000
learning_rate = tf.compat.v1.train.polynomial_decay(starter_learning_rate,
                                                    global_step, decay_steps,
                                                    end_learning_rate,
                                                    power=0.5)
# Passing global_step to minimize() will increment it at each step.
learning_step = (
    tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
    .minimize(...my loss..., global_step=global_step)
)
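A plain-Python rendering of the two formulas above (with and without `cycle`) may help when checking a schedule by hand. It is an illustrative sketch under those formulas, not the library implementation:

```python
import math

def polynomial_decay_value(learning_rate, global_step, decay_steps,
                           end_learning_rate=0.0001, power=1.0, cycle=False):
    """Reference computation of the polynomial decay formula."""
    if cycle:
        # Use the first multiple of decay_steps that is >= global_step
        # (guarding against global_step == 0 for this sketch).
        decay_steps = decay_steps * max(1, math.ceil(global_step / decay_steps))
    else:
        global_step = min(global_step, decay_steps)
    fraction = 1 - global_step / decay_steps
    return ((learning_rate - end_learning_rate) * fraction ** power
            + end_learning_rate)

# Docstring example: decay from 0.1 to 0.01 over 10000 steps with power=0.5.
print(polynomial_decay_value(0.1, 0, 10000, 0.01, power=0.5))      # 0.1
print(polynomial_decay_value(0.1, 10000, 10000, 0.01, power=0.5))  # 0.01
```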
Args
learning_rate
- A scalar `float32` or `float64` `Tensor` or a Python number. The initial learning rate.
global_step
- A scalar `int32` or `int64` `Tensor` or a Python number. Global step to use for the decay computation. Must not be negative.
decay_steps
- A scalar `int32` or `int64` `Tensor` or a Python number. Must be positive. See the decay computation above.
end_learning_rate
- A scalar `float32` or `float64` `Tensor` or a Python number. The minimal end learning rate.
power
- A scalar `float32` or `float64` `Tensor` or a Python number. The power of the polynomial. Defaults to linear, 1.0.
cycle
- A boolean, whether or not it should cycle beyond decay_steps.
name
- String. Optional name of the operation. Defaults to 'PolynomialDecay'.
Returns
A scalar `Tensor` of the same type as `learning_rate`. The decayed learning rate.
Raises
ValueError
- if `global_step` is not supplied.
@compatibility(eager) When eager execution is enabled, this function returns a function which in turn returns the decayed learning rate Tensor. This can be useful for changing the learning rate value across different invocations of optimizer functions. @end_compatibility
Expand source code
@tf_export(v1=["train.polynomial_decay"]) def polynomial_decay(learning_rate, global_step, decay_steps, end_learning_rate=0.0001, power=1.0, cycle=False, name=None): """Applies a polynomial decay to the learning rate. It is commonly observed that a monotonically decreasing learning rate, whose degree of change is carefully chosen, results in a better performing model. This function applies a polynomial decay function to a provided initial `learning_rate` to reach an `end_learning_rate` in the given `decay_steps`. It requires a `global_step` value to compute the decayed learning rate. You can just pass a TensorFlow variable that you increment at each training step. The function returns the decayed learning rate. It is computed as: ```python global_step = min(global_step, decay_steps) decayed_learning_rate = (learning_rate - end_learning_rate) * (1 - global_step / decay_steps) ^ (power) + end_learning_rate ``` If `cycle` is True then a multiple of `decay_steps` is used, the first one that is bigger than `global_steps`. ```python decay_steps = decay_steps * ceil(global_step / decay_steps) decayed_learning_rate = (learning_rate - end_learning_rate) * (1 - global_step / decay_steps) ^ (power) + end_learning_rate ``` Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5): ```python ... global_step = tf.Variable(0, trainable=False) starter_learning_rate = 0.1 end_learning_rate = 0.01 decay_steps = 10000 learning_rate = tf.compat.v1.train.polynomial_decay(starter_learning_rate, global_step, decay_steps, end_learning_rate, power=0.5) # Passing global_step to minimize() will increment it at each step. learning_step = ( tf.compat.v1.train.GradientDescentOptimizer(learning_rate) .minimize(...my loss..., global_step=global_step) ) ``` Args: learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number. The initial learning rate. global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global step to use for the decay computation. Must not be negative. decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Must be positive. See the decay computation above. end_learning_rate: A scalar `float32` or `float64` `Tensor` or a Python number. The minimal end learning rate. power: A scalar `float32` or `float64` `Tensor` or a Python number. The power of the polynomial. Defaults to linear, 1.0. cycle: A boolean, whether or not it should cycle beyond decay_steps. name: String. Optional name of the operation. Defaults to 'PolynomialDecay'. Returns: A scalar `Tensor` of the same type as `learning_rate`. The decayed learning rate. Raises: ValueError: if `global_step` is not supplied. @compatibility(eager) When eager execution is enabled, this function returns a function which in turn returns the decayed learning rate Tensor. This can be useful for changing the learning rate value across different invocations of optimizer functions. @end_compatibility """ decayed_lr = learning_rate_schedule.PolynomialDecay( learning_rate, decay_steps, end_learning_rate=end_learning_rate, power=power, cycle=cycle, name=name) if not tf.executing_eagerly(): decayed_lr = decayed_lr(global_step) else: decayed_lr = functools.partial(decayed_lr, global_step) return decayed_lr