Module keras.engine.training_distributed_v1
Part of the Keras training engine related to distributed training.
Expand source code
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Part of the Keras training engine related to distributed training."""
import tensorflow.compat.v2 as tf
# pylint: disable=protected-access
import numpy as np
from tensorflow.python.distribute import input_lib
from keras import backend
from keras import callbacks as cbks
from keras.distribute import distribute_coordinator_utils as dc
from keras.distribute import distributed_training_utils_v1 as dist_utils
from keras.engine import partial_batch_padding_handler as padding_util
from keras.engine import training_arrays_v1
from keras.engine import training_utils_v1
from keras.utils.generic_utils import Progbar
from keras.utils.mode_keys import ModeKeys
from tensorflow.python.platform import tf_logging as logging
def _per_replica_execution_function(model, mode):
exec_func = model._make_execution_function(mode)
return (exec_func.inputs, exec_func.outputs, exec_func.updates_op,
exec_func.session_kwargs)
def _build_model(strategy, model, mode, inputs, targets=None):
  """Build the distributed network for `model` under `strategy` in `mode`.

  Uses per-replica cloning when the model was compiled with distribution,
  and the in-place distributed-network builder otherwise.
  """
  if not model._compile_distribution:
    dist_utils._build_distributed_network(model, strategy, mode, inputs,
                                          targets)
  else:
    dist_utils.clone_model_on_replicas(
        model, strategy, mode, inputs=inputs, targets=targets)
def _make_train_step_fn(model, mode, strategy, output_labels):
  """Create step fn.

  Args:
    model: a Keras Model instance.
    mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
    strategy: a `tf.distribute.Strategy` instance.
    output_labels: the output labels for the step function.

  Returns:
    A step function to run by `tf.distribute.Strategy`.
  """

  def _step_fn(ctx, inputs):
    """A step fn that returns update ops."""
    if isinstance(inputs, (tuple, list)) and len(inputs) == 2:
      inputs, targets = inputs
    else:
      targets = None

    # When the input feature is a dictionary of tensors, the dictionary is
    # flattened to an array before being passed as a model input. Since
    # `nest.flatten()` sorts dictionary elements by key, this mismatches
    # models whose input layer names are not in alphabetical order. So turn
    # the dict into a list ordered along `model._feed_input_names`.
    if isinstance(inputs, dict):
      inputs = [inputs[name] for name in model._feed_input_names]

    _build_model(strategy, model, mode, inputs, targets)

    (grouped_inputs, grouped_outputs, grouped_updates,
     grouped_session_args) = strategy.extended.call_for_each_replica(
         _per_replica_execution_function,
         args=(dist_utils.get_distributed_model(model, mode), mode))
    (all_inputs, all_outputs, all_updates,
     all_session_args) = dist_utils.unwrap_values(
         strategy, grouped_inputs, grouped_outputs, grouped_updates,
         grouped_session_args)
    combined_fn = backend.function(
        all_inputs,
        all_outputs,
        updates=all_updates,
        name='distributed_' + str(mode) + '_function',
        **all_session_args)

    for label, output in zip(output_labels, combined_fn.outputs):
      # Loss values are summed across replicas; everything else is averaged
      # as a temporary workaround until the new metrics are in place.
      reduce_op = (tf.distribute.ReduceOp.SUM if label == 'loss'
                   else tf.distribute.ReduceOp.MEAN)
      ctx.set_last_step_output(label, output, reduce_op)

    # TODO(priyag, sourabhbajaj): feed_dict, session kwargs, run options and
    # run_metadata from `combined_fn` are ignored for now. These should be
    # handled appropriately.
    return combined_fn.updates_op

  return _step_fn
def experimental_tpu_fit_loop(model,
                              dataset,
                              epochs=100,
                              verbose=1,
                              callbacks=None,
                              initial_epoch=0,
                              steps_per_epoch=None,
                              val_dataset=None,
                              validation_steps=None,
                              validation_freq=1):
  """Fit loop for training with TPU tf.distribute.Strategy.

  Args:
    model: Keras Model instance.
    dataset: Dataset that returns inputs and targets
    epochs: Number of times to iterate over the data
    verbose: Integer, Verbosity mode, 0, 1 or 2
    callbacks: List of callbacks to be called during training
    initial_epoch: Epoch at which to start training
        (useful for resuming a previous training run)
    steps_per_epoch: Total number of steps (batches of samples)
        before declaring one epoch finished and starting the
        next epoch. Ignored with the default value of `None`.
    val_dataset: Dataset for validation data.
    validation_steps: Number of steps to run validation for
        (only if doing validation from data tensors).
        Ignored with the default value of `None`.
    validation_freq: Only relevant if validation data is provided. Integer or
        `collections.abc.Container` instance (e.g. list, tuple, etc.). If an
        integer, specifies how many training epochs to run before a new
        validation run is performed, e.g. `validation_freq=2` runs
        validation every 2 epochs. If a Container, specifies the epochs on
        which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
        validation at the end of the 1st, 2nd, and 10th epochs.

  Returns:
    The model's training `History` object.

  Raises:
    ValueError: in case of invalid arguments.
  """
  mode = ModeKeys.TRAIN

  current_strategy = model._distribution_strategy
  # Run at most `steps_per_run` steps per host->device round trip, but never
  # more than the epoch actually has.
  iteration_value = min(steps_per_epoch,
                        current_strategy.extended.steps_per_run)
  steps_per_run = backend.variable(
      value=iteration_value,
      dtype='int32',
      name='steps_per_run')

  # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
  iterator = dist_utils.get_iterator(dataset, current_strategy)

  scope = dist_utils.distributed_scope(
      strategy=current_strategy, learning_phase=1)
  scope.__enter__()

  out_labels = model.metrics_names or []

  step_fn = _make_train_step_fn(model, ModeKeys.TRAIN, current_strategy,
                                out_labels)

  # Add initial dummy values for loss and other metric tensors.
  initial_loop_values = {}
  initial_loop_values['loss'] = tf.constant(1e7)
  for m in model._get_training_eval_metrics():
    tensor = m.result()
    initial_loop_values[m.name] = tf.zeros(tensor.shape, tensor.dtype)

  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
      step_fn, iterator, iterations=steps_per_run,
      initial_loop_values=initial_loop_values)
  train_op = ctx.run_op
  output_tensors = ctx.last_step_outputs

  do_validation = bool(validation_steps)

  if model._compile_distribution:
    dist_utils._copy_weights_to_distributed_model(model, mode)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=do_validation,
      epochs=epochs,
      steps_per_epoch=steps_per_epoch,
      verbose=verbose,
      count_mode='steps',
      mode=mode)

  # Calculate the steps each time on the device: full `steps_per_run` chunks
  # plus one final partial chunk if the epoch length is not a multiple.
  steps_to_run = ([current_strategy.extended.steps_per_run] *
                  (steps_per_epoch //
                   current_strategy.extended.steps_per_run))
  if steps_per_epoch % current_strategy.extended.steps_per_run:
    steps_to_run.append(
        steps_per_epoch % current_strategy.extended.steps_per_run)
  target_steps = len(steps_to_run)

  callbacks._call_begin_hook(mode)

  initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode)

  for epoch in range(initial_epoch, epochs):
    dist_utils._reset_metrics(model)
    callbacks.on_epoch_begin(epoch)
    epoch_logs = {}
    step_index = 0
    prev_step_count = None
    current_step = 0
    while current_step < target_steps:
      step_count = steps_to_run[current_step]
      batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
      callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
      if prev_step_count is None or step_count != prev_step_count:
        # Only re-assign the loop-length variable when the chunk size changes
        # (i.e. entering the final partial chunk).
        backend.get_session().run(steps_per_run.assign(step_count))
        prev_step_count = step_count
      try:
        _, outputs = backend.batch_get_value([train_op, output_tensors])
      except tf.errors.OutOfRangeError:
        # BUGFIX: the product must be parenthesized. `'%d' % a * b` formats
        # the message with `a` and then string-repeats it `b` times due to
        # left-to-right evaluation of `%` and `*`.
        logging.warning('Your dataset iterator ran out of data; '
                        'interrupting training. Make sure that your dataset '
                        'can generate at least `steps_per_epoch * epochs` '
                        'batches (in this case, %d batches).' %
                        (steps_per_epoch * epochs))
        break
      batch_logs.update(outputs)
      callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
      step_index = step_index + step_count
      current_step += 1

      if callbacks.model.stop_training:
        break

    if (do_validation and
        training_utils_v1.should_run_validation(validation_freq, epoch)):
      logging.info('Running validation at fit epoch: %s', epoch)

      if model._compile_distribution:
        # Since we create a new clone from the original model we need to copy
        # the weights back to the original model before we can run validation.
        dist_utils._copy_weights_to_original_model(model, ModeKeys.TRAIN)

      val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
          model,
          val_dataset,
          steps=validation_steps,
          verbose=verbose,
          callbacks=callbacks)
      if not isinstance(val_outs, list):
        val_outs = [val_outs]
      # Same labels assumed.
      for label, val_out in zip(out_labels, val_outs):
        epoch_logs['val_' + label] = val_out

    callbacks.on_epoch_end(epoch, epoch_logs)
    if callbacks.model.stop_training:
      break
  model._successful_loop_finish = True
  callbacks._call_end_hook(mode)

  if model._compile_distribution:
    # Copy the weights back from the replicated model to the original model.
    dist_utils._copy_weights_to_original_model(model, ModeKeys.TRAIN)
  scope.__exit__(None, None, None)
  return model.history
def experimental_tpu_test_loop(model,
                               dataset,
                               verbose=0,
                               steps=None,
                               callbacks=None):
  """Test loop for evaluating with TPU tf.distribute.Strategy.

  Args:
    model: Keras Model instance.
    dataset: Dataset for input data.
    verbose: Integer, Verbosity mode 0 or 1.
    steps: Total number of steps (batches of samples)
        before declaring predictions finished.
        Ignored with the default value of `None`.
    callbacks: List of callbacks to be called during training

  Returns:
    Scalar loss (if the model has a single output and no metrics)
    or list of scalars (if the model has multiple outputs
    and/or metrics). The attribute `model.metrics_names` will give you
    the display labels for the outputs.

  Raises:
    ValueError: if `steps` is `None` (it cannot be inferred here).
  """
  mode = ModeKeys.TEST
  current_strategy = model._distribution_strategy
  iterator = dist_utils.get_iterator(dataset, current_strategy)

  scope = dist_utils.distributed_scope(
      strategy=current_strategy, learning_phase=0)
  scope.__enter__()

  out_labels = model.metrics_names

  def _test_step_fn(inputs):
    """A fn that returns output of single test step."""
    if isinstance(inputs, (tuple, list)) and len(inputs) == 2:
      inputs, targets = inputs
    else:
      targets = None

    # Building must happen in cross-replica context, hence the merge_call.
    (tf.distribute.get_replica_context().merge_call(
        _build_model, args=(model, mode, inputs, targets)))

    (_, outputs, updates, _) = _per_replica_execution_function(
        dist_utils.get_distributed_model(model, mode), mode)
    with tf.control_dependencies([updates]):
      return [tf.identity(out) for out in outputs]

  test_input_data = iterator.get_next()
  per_replica_outputs = current_strategy.run(
      _test_step_fn, args=(test_input_data,))
  output_tensors = {}
  for label, output in zip(out_labels, per_replica_outputs):
    if label == 'loss':
      reduce_op = tf.distribute.ReduceOp.SUM
    else:
      # We reduce all other metrics using mean for now. This is temporary
      # workaround until new metrics are in place.
      reduce_op = tf.distribute.ReduceOp.MEAN
    output_tensors[label] = current_strategy.reduce(reduce_op, output,
                                                    axis=None)
  test_op = tf.group(list(output_tensors.values()))

  if verbose >= 1:
    progbar = Progbar(target=steps)

  if model._compile_distribution:
    dist_utils._copy_weights_to_distributed_model(model, mode)

  dist_utils._reset_metrics(model)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=False,
      epochs=1,
      steps_per_epoch=steps,
      verbose=verbose,
      count_mode='steps',
      mode=ModeKeys.TEST)
  callbacks._call_begin_hook(mode)

  outs = [0.] * len(model.metrics_names)
  if steps is not None:
    target_steps = steps
  else:
    raise ValueError('Number of steps could not be inferred from the data, '
                     'please pass the steps argument.')

  current_step = 0
  while current_step < target_steps:
    batch_logs = {'batch': current_step, 'size': 1}
    callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs)
    try:
      _, batch_outs = backend.batch_get_value([test_op, output_tensors])
    except tf.errors.OutOfRangeError:
      warning_msg = (
          'Make sure that your dataset can generate at least '
          '`steps` batches (in this case, {} batches).'.format(steps))
      logging.warning('Your dataset iterator ran out of data; '
                      'interrupting evaluation. ' + warning_msg)
      target_steps = current_step
      break
    for i, label in enumerate(model.metrics_names):
      if i == 0:
        # Loss is stateless metrics.
        outs[i] += batch_outs[label]
      else:
        # For all stateful metrics, the aggregation is handled by mirrored
        # vars.
        outs[i] = batch_outs[label]

    batch_logs = cbks.make_logs(model, batch_logs, outs, mode)
    callbacks._call_batch_hook(mode, 'end', current_step, batch_logs)
    if verbose == 1:
      progbar.update(current_step + 1)
    current_step += 1

  if verbose >= 1:
    # Progress bar finishes at the end.
    progbar.update(target_steps)
  callbacks._call_end_hook(mode)

  scope.__exit__(None, None, None)
  # BUGFIX: this guard was `len(outs) >= 0`, which is always true and would
  # raise IndexError on `outs[0]` for a model with no metrics; use `> 0`.
  if len(outs) > 0:
    outs[0] /= (target_steps)

  if len(outs) == 1:
    return outs[0]
  return outs
def experimental_tpu_predict_loop(model,
                                  dataset,
                                  verbose=0,
                                  steps=None,
                                  callbacks=None):
  """Predict loop for predicting with TPU tf.distribute.Strategy.

  Args:
    model: Keras Model instance.
    dataset: Dataset for input data.
    verbose: Integer, Verbosity mode 0 or 1.
    steps: Total number of steps (batches of samples)
        before declaring `_predict_loop` finished.
        Ignored with the default value of `None`.
    callbacks: List of callbacks to be called during training

  Returns:
    Array of predictions (if the model has a single output)
    or list of arrays of predictions
    (if the model has multiple outputs).

  Raises:
    ValueError: if `steps` is `None` (it cannot be inferred here).
  """
  mode = ModeKeys.PREDICT
  dataset_fully_shaped = dist_utils.is_dataset_shape_fully_defined(dataset)
  padding_handler = None
  if not dataset_fully_shaped:
    # TODO(hongjunchoi): Investigate whether operations from
    # PartialBatchPaddingHandler are unnecessarily pruned out
    # during graph optimization.
    padding_handler = padding_util.PartialBatchPaddingHandler(
        model._feed_output_shapes)
    batch_size, _, prefetch_buffer = input_lib._get_dataset_attributes(dataset)
    padding_handler.padded_batch_size = batch_size
    padding_handler.padding_mask = dataset.reduce(padding_handler.padding_mask,
                                                  padding_handler.update_mask)

    dataset = dataset.map(padding_handler.pad_batch)
    dataset = dataset.unbatch()
    # Upon this point, it is guaranteed that the dataset does not
    # have partial batches. Thus, we set `drop_remainder=True` to
    # get static shape information about the elements in the dataset.
    dataset = dataset.batch(batch_size, drop_remainder=True)

    if prefetch_buffer is not None:
      dataset = dataset.prefetch(prefetch_buffer)

  current_strategy = model._distribution_strategy
  iterator = dist_utils.get_iterator(dataset, current_strategy)

  scope = dist_utils.distributed_scope(
      strategy=current_strategy, learning_phase=0)
  scope.__enter__()

  def _predict_step_fn(inputs):
    """A fn that returns output of single prediction step."""

    # Building must happen in cross-replica context, hence the merge_call.
    (tf.distribute.get_replica_context().merge_call(
        _build_model, args=(model, mode, inputs)))

    (_, outputs, updates, _) = _per_replica_execution_function(
        dist_utils.get_distributed_model(model, mode), mode)

    with tf.control_dependencies([updates]):
      return [tf.identity(out) for out in outputs]

  # TODO(hongjunchoi): When numpy array is passed as an input to `predict()`
  # use numpy arrays directly to avoid cumulating unnecessary input pipeline
  # ops.
  predict_input_data = iterator.get_next()
  per_replica_outputs = current_strategy.run(
      _predict_step_fn, args=(predict_input_data,))
  output_tensors = dist_utils.flatten_per_replica_values(
      current_strategy, per_replica_outputs)
  # PERF FIX: build the grouping op once. The original created
  # `tf.group(output_tensors)` inside the fetch loop, adding a new op to the
  # graph on every step and bloating it linearly with `steps`.
  predict_ops = tf.group(output_tensors)

  if verbose >= 1:
    progbar = Progbar(target=steps)

  if model._compile_distribution:
    dist_utils._copy_weights_to_distributed_model(model, mode)

  dist_utils._reset_metrics(model)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=False,
      epochs=1,
      steps_per_epoch=steps,
      verbose=verbose,
      count_mode='steps',
      mode=mode)
  callbacks._call_begin_hook(mode)

  # Since we do not know how many samples we will see, we cannot pre-allocate
  # the returned Numpy arrays. Instead, we store one array per batch seen
  # and concatenate them upon returning.
  num_model_outputs = len(model.output_names)
  unconcatenated_outs = [[] for _ in range(num_model_outputs)]
  if steps is not None:
    target_steps = steps
  else:
    raise ValueError('Number of steps could not be inferred from the data, '
                     'please pass the steps argument.')

  current_step = 0
  while current_step < target_steps:
    batch_logs = {'batch': current_step, 'size': 1}
    callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs)
    try:
      _, batch_outs = backend.batch_get_value([predict_ops, output_tensors])
    except tf.errors.OutOfRangeError:
      warning_msg = (
          'Make sure that your dataset can generate at least '
          '`steps` batches (in this case, {} batches).'.format(steps))
      logging.warning('Your dataset iterator ran out of data; '
                      'interrupting evaluation. ' + warning_msg)
      break

    # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
    for i in range(num_model_outputs):
      # Each model output yields one flattened slice per replica.
      output_start_index = i * current_strategy.num_replicas_in_sync
      output_end_index = (
          output_start_index + current_strategy.num_replicas_in_sync)
      single_model_output = batch_outs[output_start_index:output_end_index]
      unconcatenated_outs[i].extend(single_model_output)

    batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
    callbacks._call_batch_hook(mode, 'end', current_step, batch_logs)
    if verbose == 1:
      progbar.update(current_step + 1)
    current_step += 1

  if verbose >= 1:
    # Progress bar finishes at the end.
    progbar.update(current_step)
  callbacks._call_end_hook(mode)

  scope.__exit__(None, None, None)

  if len(unconcatenated_outs) == 1:
    prediction_result = np.concatenate(unconcatenated_outs[0], axis=0)
  else:
    prediction_result = [
        np.concatenate(out, axis=0) for out in unconcatenated_outs
    ]

  if padding_handler:
    prediction_result = padding_handler.apply_mask(prediction_result)

  return prediction_result
class DistributionSingleWorkerTrainingLoop(training_utils_v1.TrainingLoop):
  """Training loop for distribution strategy with single worker.

  Each method standardizes the user data into a dataset, then dispatches
  either to a graph-mode TPU custom loop (`experimental_tpu_*_loop`) or to
  the generic array-based loops in `training_arrays_v1`.
  """

  def fit(self,
          model,
          x=None,
          y=None,
          batch_size=None,
          epochs=1,
          verbose=1,
          callbacks=None,
          validation_split=0.,
          validation_data=None,
          shuffle=True,
          class_weight=None,
          sample_weight=None,
          initial_epoch=0,
          steps_per_epoch=None,
          validation_steps=None,
          validation_freq=1,
          **kwargs):
    """Fit loop for Distribution Strategies.

    Mirrors `Model.fit`'s signature; `**kwargs` absorbs and ignores any
    extra arguments passed through by the caller.
    """
    dist_utils.validate_callbacks(input_callbacks=callbacks,
                                  optimizer=model.optimizer)
    dist_utils.validate_inputs(x, y)

    # Resolve batch size / step count against the strategy's replica count.
    batch_size, steps_per_epoch = dist_utils.process_batch_and_step_size(
        model._distribution_strategy,
        x,
        batch_size,
        steps_per_epoch,
        ModeKeys.TRAIN,
        validation_split=validation_split)
    batch_size = model._validate_or_infer_batch_size(
        batch_size, steps_per_epoch, x)
    dataset = model._distribution_standardize_user_data(
        x, y,
        sample_weight=sample_weight,
        class_weight=class_weight,
        batch_size=batch_size,
        validation_split=validation_split,
        shuffle=shuffle,
        epochs=epochs)
    if not dist_utils.is_distributing_by_cloning(model):
      # Non-cloning path: standardize again inside the strategy scope so the
      # model's input structure is built against the distributed dataset.
      with model._distribution_strategy.scope():
        (dataset, _, _) = model._standardize_user_data(
            dataset,
            sample_weight=sample_weight,
            class_weight=class_weight,
            batch_size=batch_size,
            validation_split=validation_split,
            shuffle=shuffle)

    val_dataset = None
    if validation_data:
      val_x, val_y, val_sample_weights = (
          training_utils_v1.unpack_validation_data(validation_data))
      dist_utils.validate_inputs(val_x, val_y)
      _, validation_steps = dist_utils.process_batch_and_step_size(
          model._distribution_strategy, val_x, batch_size, validation_steps,
          ModeKeys.TEST)

      val_dataset = model._distribution_standardize_user_data(
          val_x, val_y,
          sample_weight=val_sample_weights,
          class_weight=None,
          batch_size=batch_size,
          validation_split=validation_split,
          shuffle=shuffle,
          allow_partial_batch=True)
    elif validation_split:
      # Splitting a distributed dataset is not supported; only explicit
      # `validation_data` works with distribution strategies.
      raise ValueError('validation_split argument is not supported with '
                       'distribution strategies.')

    if backend.is_tpu_strategy(model._distribution_strategy):
      # TPU custom loops require a known, finite number of steps per epoch.
      steps_per_epoch = training_utils_v1.infer_steps_for_dataset(
          model, dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch')
      if steps_per_epoch is None:
        raise ValueError('Number of steps could not be inferred from the data, '
                         'please pass the steps_per_epoch argument.')

      if not tf.executing_eagerly():
        # Run TPU training in a custom loop in graph mode.
        return experimental_tpu_fit_loop(
            model,
            dataset,
            epochs=epochs,
            verbose=verbose,
            callbacks=callbacks,
            val_dataset=val_dataset,
            initial_epoch=initial_epoch,
            steps_per_epoch=steps_per_epoch,
            validation_steps=validation_steps,
            validation_freq=validation_freq)

    # Non-TPU (or eager) path: fall back to the generic array fit loop.
    return training_arrays_v1.fit_loop(
        model,
        dataset,
        batch_size=batch_size,
        epochs=epochs,
        verbose=verbose,
        callbacks=callbacks,
        val_inputs=val_dataset,
        shuffle=shuffle,
        initial_epoch=initial_epoch,
        steps_per_epoch=steps_per_epoch,
        validation_steps=validation_steps,
        validation_freq=validation_freq,
        steps_name='steps_per_epoch')

  def evaluate(self,
               model,
               x=None,
               y=None,
               batch_size=None,
               verbose=1,
               sample_weight=None,
               steps=None,
               callbacks=None,
               **kwargs):
    """Evaluate loop for Distribution Strategies."""
    dist_utils.validate_inputs(x, y)
    batch_size, steps = dist_utils.process_batch_and_step_size(
        model._distribution_strategy, x, batch_size, steps, ModeKeys.TEST)
    batch_size = model._validate_or_infer_batch_size(batch_size, steps, x)
    dataset = model._distribution_standardize_user_data(
        x, y,
        sample_weight=sample_weight,
        batch_size=batch_size,
        allow_partial_batch=True)

    if backend.is_tpu_strategy(model._distribution_strategy):
      # TPU custom loops require a known, finite number of steps.
      steps = training_utils_v1.infer_steps_for_dataset(
          model, dataset, steps, steps_name='steps')
      if steps is None:
        raise ValueError('Number of steps could not be inferred from the data, '
                         'please pass the steps argument.')

      if not tf.executing_eagerly():
        # Run TPU evaluation in a custom loop in graph mode.
        return experimental_tpu_test_loop(
            model, dataset, verbose=verbose, steps=steps, callbacks=callbacks)

    return training_arrays_v1.test_loop(
        model,
        inputs=dataset,
        batch_size=batch_size,
        verbose=verbose,
        steps=steps,
        callbacks=callbacks)

  def predict(self,
              model,
              x,
              batch_size=None,
              verbose=0,
              steps=None,
              callbacks=None,
              **kwargs):
    """Predict loop for Distribution Strategies."""
    dist_utils.validate_inputs(x=x, y=None)
    batch_size, steps = dist_utils.process_batch_and_step_size(
        model._distribution_strategy, x, batch_size, steps, ModeKeys.PREDICT)
    batch_size = model._validate_or_infer_batch_size(batch_size, steps, x)
    dataset = model._distribution_standardize_user_data(
        x,
        batch_size=batch_size,
        allow_partial_batch=True)
    if backend.is_tpu_strategy(model._distribution_strategy):
      # TPU custom loops require a known, finite number of steps.
      steps = training_utils_v1.infer_steps_for_dataset(
          model, dataset, steps, steps_name='steps')
      if steps is None:
        raise ValueError('Number of steps could not be inferred from the data, '
                         'please pass the steps argument.')

      if not tf.executing_eagerly():
        return experimental_tpu_predict_loop(
            model, dataset, verbose=verbose, steps=steps, callbacks=callbacks)
    return training_arrays_v1.predict_loop(
        model,
        dataset,
        batch_size=batch_size,
        verbose=verbose,
        steps=steps,
        callbacks=callbacks)
def _train_with_multi_worker(method):
  """Decorator that handles multi worker training with distribution strategy."""

  def wrapper(model, **kwargs):

    def _worker_fn(_):
      # Replace the user's callbacks with the subset that is valid to run on
      # each worker replica before invoking the wrapped training method.
      original_callbacks = kwargs.pop('callbacks', None)
      kwargs['callbacks'] = dist_utils.filter_distributed_callbacks(
          original_callbacks, model)
      return method(model, **kwargs)

    return dc.run_distribute_coordinator(_worker_fn,
                                         model._distribution_strategy)

  return wrapper
class DistributionMultiWorkerTrainingLoop(training_utils_v1.TrainingLoop):
  """Training loop for distribution strategy with multiple worker."""

  def __init__(self, single_worker_loop):
    # Delegate the real work to a single-worker loop; fit/evaluate are
    # wrapped with the multi-worker coordinator on each call.
    self._single_worker_loop = single_worker_loop

  def fit(self, *args, **kwargs):
    multi_worker_fit = _train_with_multi_worker(self._single_worker_loop.fit)
    return multi_worker_fit(*args, **kwargs)

  def evaluate(self, *args, **kwargs):
    multi_worker_evaluate = _train_with_multi_worker(
        self._single_worker_loop.evaluate)
    return multi_worker_evaluate(*args, **kwargs)

  def predict(self, *args, **kwargs):
    # Currently predict is still using the single worker implementation.
    return self._single_worker_loop.predict(*args, **kwargs)
Functions
def experimental_tpu_fit_loop(model, dataset, epochs=100, verbose=1, callbacks=None, initial_epoch=0, steps_per_epoch=None, val_dataset=None, validation_steps=None, validation_freq=1)
-
Fit loop for training with TPU tf.distribute.Strategy.
Args
model
- Keras Model instance.
dataset
- Dataset that returns inputs and targets
epochs
- Number of times to iterate over the data
verbose
- Integer, Verbosity mode, 0, 1 or 2
callbacks
- List of callbacks to be called during training
initial_epoch
- Epoch at which to start training (useful for resuming a previous training run)
steps_per_epoch
- Total number of steps (batches of samples)
before declaring one epoch finished and starting the
next epoch. Ignored with the default value of `None`.
val_dataset
- Dataset for validation data.
validation_steps
- Number of steps to run validation for
(only if doing validation from data tensors).
Ignored with the default value of
None
. validation_freq
- Only relevant if validation data is provided. Integer or
`collections.abc.Container` instance (e.g. list, tuple, etc.). If an
integer, specifies how many training epochs to run before a new validation
run is performed, e.g. `validation_freq=2` runs validation every 2 epochs.
If a Container, specifies the epochs on which to run validation, e.g.
`validation_freq=[1, 2, 10]` runs validation at the end of the 1st, 2nd,
and 10th epochs.
Returns
Returns
None
.Raises
ValueError
- in case of invalid arguments.
Expand source code
def experimental_tpu_fit_loop(model, dataset, epochs=100, verbose=1, callbacks=None, initial_epoch=0, steps_per_epoch=None, val_dataset=None, validation_steps=None, validation_freq=1): """Fit loop for training with TPU tf.distribute.Strategy. Args: model: Keras Model instance. dataset: Dataset that returns inputs and targets epochs: Number of times to iterate over the data verbose: Integer, Verbosity mode, 0, 1 or 2 callbacks: List of callbacks to be called during training initial_epoch: Epoch at which to start training (useful for resuming a previous training run) steps_per_epoch: Total number of steps (batches of samples) before declaring one epoch finished and starting the next epoch. Ignored with the default value of `None`. val_dataset: Dataset for validation data. validation_steps: Number of steps to run validation for (only if doing validation from data tensors). Ignored with the default value of `None`. validation_freq: Only relevant if validation data is provided. Integer or `collections.abc.Container` instance (e.g. list, tuple, etc.). If an integer, specifies how many training epochs to run before a new validation run is performed, e.g. `validation_freq=2` runs validation every 2 epochs. If a Container, specifies the epochs on which to run validation, e.g. `validation_freq=[1, 2, 10]` runs validation at the end of the 1st, 2nd, and 10th epochs. Returns: Returns `None`. Raises: ValueError: in case of invalid arguments. """ mode = ModeKeys.TRAIN current_strategy = model._distribution_strategy iteration_value = min(steps_per_epoch, current_strategy.extended.steps_per_run) steps_per_run = backend.variable( value=iteration_value, dtype='int32', name='steps_per_run') # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops. 
iterator = dist_utils.get_iterator(dataset, current_strategy) scope = dist_utils.distributed_scope( strategy=current_strategy, learning_phase=1) scope.__enter__() out_labels = model.metrics_names or [] step_fn = _make_train_step_fn(model, ModeKeys.TRAIN, current_strategy, out_labels) # Add initial dummy values for loss and other metric tensors. initial_loop_values = {} initial_loop_values['loss'] = tf.constant(1e7) for m in model._get_training_eval_metrics(): tensor = m.result() initial_loop_values[m.name] = tf.zeros(tensor.shape, tensor.dtype) ctx = current_strategy.extended.experimental_run_steps_on_iterator( step_fn, iterator, iterations=steps_per_run, initial_loop_values=initial_loop_values) train_op = ctx.run_op output_tensors = ctx.last_step_outputs do_validation = bool(validation_steps) if model._compile_distribution: dist_utils._copy_weights_to_distributed_model(model, mode) callbacks = cbks.configure_callbacks( callbacks, model, do_validation=do_validation, epochs=epochs, steps_per_epoch=steps_per_epoch, verbose=verbose, count_mode='steps', mode=mode) # Calculate the steps each time on the device. 
steps_to_run = ([current_strategy.extended.steps_per_run] * (steps_per_epoch // current_strategy.extended.steps_per_run)) if steps_per_epoch % current_strategy.extended.steps_per_run: steps_to_run.append( steps_per_epoch % current_strategy.extended.steps_per_run) target_steps = len(steps_to_run) callbacks._call_begin_hook(mode) initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode) for epoch in range(initial_epoch, epochs): dist_utils._reset_metrics(model) callbacks.on_epoch_begin(epoch) epoch_logs = {} step_index = 0 prev_step_count = None current_step = 0 while current_step < target_steps: step_count = steps_to_run[current_step] batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count} callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs) if prev_step_count is None or step_count != prev_step_count: backend.get_session().run(steps_per_run.assign(step_count)) prev_step_count = step_count try: _, outputs = backend.batch_get_value([train_op, output_tensors]) except tf.errors.OutOfRangeError: logging.warning('Your dataset iterator ran out of data; ' 'interrupting training. Make sure that your dataset ' 'can generate at least `steps_per_epoch * epochs` ' 'batches (in this case, %d batches).' % steps_per_epoch * epochs) break batch_logs.update(outputs) callbacks._call_batch_hook(mode, 'end', step_index, batch_logs) step_index = step_index + step_count current_step += 1 if callbacks.model.stop_training: break if (do_validation and training_utils_v1.should_run_validation(validation_freq, epoch)): logging.info('Running validation at fit epoch: %s', epoch) if model._compile_distribution: # Since we create a new clone from the original model we need to copy # the weights back to the original model before we can run validation. 
dist_utils._copy_weights_to_original_model(model, ModeKeys.TRAIN) val_outs = experimental_tpu_test_loop( # pylint: disable=undefined-variable model, val_dataset, steps=validation_steps, verbose=verbose, callbacks=callbacks) if not isinstance(val_outs, list): val_outs = [val_outs] # Same labels assumed. for label, val_out in zip(out_labels, val_outs): epoch_logs['val_' + label] = val_out callbacks.on_epoch_end(epoch, epoch_logs) if callbacks.model.stop_training: break model._successful_loop_finish = True callbacks._call_end_hook(mode) if model._compile_distribution: # Copy the weights back from the replicated model to the original model. dist_utils._copy_weights_to_original_model(model, ModeKeys.TRAIN) scope.__exit__(None, None, None) return model.history
def experimental_tpu_predict_loop(model, dataset, verbose=0, steps=None, callbacks=None)
-
Predict loop for predicting with TPU tf.distribute.Strategy.
Args
model
- Keras Model instance.
dataset
- Dataset for input data.
verbose
- Integer, Verbosity mode 0 or 1.
steps
- Total number of steps (batches of samples) before declaring
`_predict_loop` finished. Ignored with the default value of `None`.
callbacks
- List of callbacks to be called during training
Returns
Array of predictions (if the model has a single output) or list of arrays of predictions (if the model has multiple outputs).
Expand source code
def experimental_tpu_predict_loop(model,
                                  dataset,
                                  verbose=0,
                                  steps=None,
                                  callbacks=None):
  """Predict loop for predicting with TPU tf.distribute.Strategy.

  Runs a graph-mode (session-based) prediction loop: builds a per-replica
  predict step, fetches outputs batch-by-batch via `backend.batch_get_value`,
  and concatenates the per-batch results on the host.

  Args:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, Verbosity mode 0 or 1.
      steps: Total number of steps (batches of samples)
          before declaring `_predict_loop` finished.
          Ignored with the default value of `None`.
      callbacks: List of callbacks to be called during training

  Returns:
      Array of predictions (if the model has a single output)
      or list of arrays of predictions
      (if the model has multiple outputs).
  """
  mode = ModeKeys.PREDICT
  dataset_fully_shaped = dist_utils.is_dataset_shape_fully_defined(dataset)
  padding_handler = None
  if not dataset_fully_shaped:
    # TODO(hongjunchoi): Investigate whether operations from
    # PartialBatchPaddingHandler are unnecessarily pruned out
    # during graph optimization.
    padding_handler = padding_util.PartialBatchPaddingHandler(
        model._feed_output_shapes)
    batch_size, _, prefetch_buffer = input_lib._get_dataset_attributes(dataset)
    padding_handler.padded_batch_size = batch_size
    padding_handler.padding_mask = dataset.reduce(padding_handler.padding_mask,
                                                  padding_handler.update_mask)

    dataset = dataset.map(padding_handler.pad_batch)
    dataset = dataset.unbatch()
    # Upon this point, it is guaranteed that the dataset does not
    # have partial batches. Thus, we set `drop_remainder=True` to
    # get static shape information about the elements in the dataset.
    dataset = dataset.batch(batch_size, drop_remainder=True)

    if prefetch_buffer is not None:
      dataset = dataset.prefetch(prefetch_buffer)

  current_strategy = model._distribution_strategy
  iterator = dist_utils.get_iterator(dataset, current_strategy)

  # Enter the distribution scope manually; it is exited explicitly below so
  # that the whole loop body runs inside it.
  scope = dist_utils.distributed_scope(
      strategy=current_strategy, learning_phase=0)
  scope.__enter__()

  def _predict_step_fn(inputs):
    """A fn that returns output of single prediction step."""

    (tf.distribute.get_replica_context().merge_call(
        _build_model, args=(model, mode, inputs)))

    (_, outputs, updates, _) = _per_replica_execution_function(
        dist_utils.get_distributed_model(model, mode), mode)

    with tf.control_dependencies([updates]):
      return [tf.identity(out) for out in outputs]

  # TODO(hongjunchoi): When numpy array is passed as an input to `predict()`
  # use numpy arrays directly to avoid cumulating unnecessary input pipeline
  # ops.
  predict_input_data = iterator.get_next()
  per_replica_outputs = current_strategy.run(
      _predict_step_fn, args=(predict_input_data,))
  output_tensors = dist_utils.flatten_per_replica_values(
      current_strategy, per_replica_outputs)

  if verbose >= 1:
    progbar = Progbar(target=steps)

  if model._compile_distribution:
    dist_utils._copy_weights_to_distributed_model(model, mode)

  dist_utils._reset_metrics(model)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=False,
      epochs=1,
      steps_per_epoch=steps,
      verbose=verbose,
      count_mode='steps',
      mode=mode)
  callbacks._call_begin_hook(mode)

  # Since we do not know how many samples we will see, we cannot pre-allocate
  # the returned Numpy arrays. Instead, we store one array per batch seen
  # and concatenate them upon returning.
  num_model_outputs = len(model.output_names)
  unconcatenated_outs = [[] for _ in range(num_model_outputs)]
  if steps is not None:
    target_steps = steps
  else:
    raise ValueError('Number of steps could not be inferred from the data, '
                     'please pass the steps argument.')

  current_step = 0
  while current_step < target_steps:
    batch_logs = {'batch': current_step, 'size': 1}
    callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs)
    try:
      predict_ops = tf.group(output_tensors)
      _, batch_outs = backend.batch_get_value([predict_ops, output_tensors])

    except tf.errors.OutOfRangeError:
      warning_msg = (
          'Make sure that your dataset can generate at least '
          '`steps` batches (in this case, {} batches).'.format(steps))
      logging.warning('Your dataset iterator ran out of data; '
                      'interrupting evaluation. ' + warning_msg)
      break

    # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
    # `batch_outs` holds num_replicas values per model output, laid out
    # contiguously per output; slice them back apart by output index.
    for i in range(num_model_outputs):
      output_start_index = i * current_strategy.num_replicas_in_sync
      output_end_index = (
          output_start_index + current_strategy.num_replicas_in_sync)
      single_model_output = batch_outs[output_start_index:output_end_index]
      unconcatenated_outs[i].extend(single_model_output)

    batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
    callbacks._call_batch_hook(mode, 'end', current_step, batch_logs)
    if verbose == 1:
      progbar.update(current_step + 1)
    current_step += 1

  if verbose >= 1:
    # Progress bar finishes at the end.
    progbar.update(current_step)
  callbacks._call_end_hook(mode)

  scope.__exit__(None, None, None)

  if len(unconcatenated_outs) == 1:
    prediction_result = np.concatenate(unconcatenated_outs[0], axis=0)
  else:
    prediction_result = [
        np.concatenate(out, axis=0) for out in unconcatenated_outs
    ]

  if padding_handler:
    # Strip rows that were synthesized to pad out the final partial batch.
    prediction_result = padding_handler.apply_mask(prediction_result)
  return prediction_result
def experimental_tpu_test_loop(model, dataset, verbose=0, steps=None, callbacks=None)
-
Test loop for evaluating with TPU tf.distribute.Strategy.
Args
model
- Keras Model instance.
dataset
- Dataset for input data.
verbose
- Integer, Verbosity mode 0 or 1.
steps
- Total number of steps (batches of samples)
before declaring predictions finished.
Ignored with the default value of `None`.
callbacks
- List of callbacks to be called during training
Returns
Scalar loss (if the model has a single output and no metrics) or list of scalars (if the model has multiple outputs and/or metrics). The attribute
model.metrics_names
will give you the display labels for the outputs.
Expand source code
def experimental_tpu_test_loop(model,
                               dataset,
                               verbose=0,
                               steps=None,
                               callbacks=None):
  """Test loop for evaluating with TPU tf.distribute.Strategy.

  Runs a graph-mode (session-based) evaluation loop: builds a per-replica
  test step, reduces per-replica outputs (loss summed, other metrics
  averaged), and fetches results batch-by-batch.

  Args:
      model: Keras Model instance.
      dataset: Dataset for input data.
      verbose: Integer, Verbosity mode 0 or 1.
      steps: Total number of steps (batches of samples)
          before declaring predictions finished.
          Ignored with the default value of `None`.
      callbacks: List of callbacks to be called during training

  Returns:
      Scalar loss (if the model has a single output and no metrics)
      or list of scalars (if the model has multiple outputs
      and/or metrics). The attribute `model.metrics_names` will give you
      the display labels for the outputs.

  Raises:
      ValueError: If `steps` is `None` (it cannot be inferred here).
  """
  mode = ModeKeys.TEST
  current_strategy = model._distribution_strategy
  iterator = dist_utils.get_iterator(dataset, current_strategy)

  # Enter the distribution scope manually; it is exited explicitly below so
  # that the whole loop body runs inside it.
  scope = dist_utils.distributed_scope(
      strategy=current_strategy, learning_phase=0)
  scope.__enter__()

  out_labels = model.metrics_names

  def _test_step_fn(inputs):
    """A fn that returns output of single test step."""
    if isinstance(inputs, (tuple, list)) and len(inputs) == 2:
      inputs, targets = inputs
    else:
      targets = None

    (tf.distribute.get_replica_context().merge_call(
        _build_model, args=(model, mode, inputs, targets)))

    (_, outputs, updates, _) = _per_replica_execution_function(
        dist_utils.get_distributed_model(model, mode), mode)
    with tf.control_dependencies([updates]):
      return [tf.identity(out) for out in outputs]

  test_input_data = iterator.get_next()
  per_replica_outputs = current_strategy.run(
      _test_step_fn, args=(test_input_data,))
  output_tensors = {}
  for label, output in zip(out_labels, per_replica_outputs):
    if label == 'loss':
      reduce_op = tf.distribute.ReduceOp.SUM
    else:
      # We reduce all other metrics using mean for now. This is temporary
      # workaround until new metrics are in place.
      reduce_op = tf.distribute.ReduceOp.MEAN
    output_tensors[label] = current_strategy.reduce(reduce_op, output,
                                                    axis=None)
  test_op = tf.group(list(output_tensors.values()))

  if verbose >= 1:
    progbar = Progbar(target=steps)

  if model._compile_distribution:
    dist_utils._copy_weights_to_distributed_model(model, mode)

  dist_utils._reset_metrics(model)

  callbacks = cbks.configure_callbacks(
      callbacks,
      model,
      do_validation=False,
      epochs=1,
      steps_per_epoch=steps,
      verbose=verbose,
      count_mode='steps',
      mode=ModeKeys.TEST)
  callbacks._call_begin_hook(mode)

  outs = [0.] * len(model.metrics_names)
  if steps is not None:
    target_steps = steps
  else:
    raise ValueError('Number of steps could not be inferred from the data, '
                     'please pass the steps argument.')

  current_step = 0
  while current_step < target_steps:
    batch_logs = {'batch': current_step, 'size': 1}
    callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs)
    try:
      _, batch_outs = backend.batch_get_value([test_op, output_tensors])
    except tf.errors.OutOfRangeError:
      warning_msg = (
          'Make sure that your dataset can generate at least '
          '`steps` batches (in this case, {} batches).'.format(steps))
      logging.warning('Your dataset iterator ran out of data; '
                      'interrupting evaluation. ' + warning_msg)
      target_steps = current_step
      break

    for i, label in enumerate(model.metrics_names):
      if i == 0:
        # Loss is stateless metrics.
        outs[i] += batch_outs[label]
      else:
        # For all stateful metrics, the aggregation is handled by mirrored vars.
        outs[i] = batch_outs[label]

    batch_logs = cbks.make_logs(model, batch_logs, outs, mode)
    callbacks._call_batch_hook(mode, 'end', current_step, batch_logs)
    if verbose == 1:
      progbar.update(current_step + 1)
    current_step += 1

  if verbose >= 1:
    # Progress bar finishes at the end.
    progbar.update(target_steps)
  callbacks._call_end_hook(mode)

  scope.__exit__(None, None, None)
  # BUGFIX: the original guard was `if len(outs) >= 0:`, which is always
  # true and would raise IndexError on an empty `outs`. Average the summed
  # loss over the steps actually run only when there is a loss entry.
  if outs:
    outs[0] /= (target_steps)

  if len(outs) == 1:
    return outs[0]
  return outs
Classes
class DistributionMultiWorkerTrainingLoop (single_worker_loop)
-
Training loop for distribution strategy with multiple worker.
Expand source code
class DistributionMultiWorkerTrainingLoop(training_utils_v1.TrainingLoop): """Training loop for distribution strategy with multiple worker.""" def __init__(self, single_worker_loop): self._single_worker_loop = single_worker_loop def fit(self, *args, **kwargs): return _train_with_multi_worker(self._single_worker_loop.fit)( *args, **kwargs) def evaluate(self, *args, **kwargs): return _train_with_multi_worker(self._single_worker_loop.evaluate)( *args, **kwargs) def predict(self, *args, **kwargs): # Currently predict is still using the single worker implementation. return self._single_worker_loop.predict(*args, **kwargs)
Ancestors
Methods
def predict(self, *args, **kwargs)
-
Expand source code
def predict(self, *args, **kwargs): # Currently predict is still using the single worker implementation. return self._single_worker_loop.predict(*args, **kwargs)
Inherited members
class DistributionSingleWorkerTrainingLoop
-
Training loop for distribution strategy with single worker.
Expand source code
class DistributionSingleWorkerTrainingLoop(training_utils_v1.TrainingLoop):
  """Training loop for distribution strategy with single worker."""

  def fit(self,
          model,
          x=None,
          y=None,
          batch_size=None,
          epochs=1,
          verbose=1,
          callbacks=None,
          validation_split=0.,
          validation_data=None,
          shuffle=True,
          class_weight=None,
          sample_weight=None,
          initial_epoch=0,
          steps_per_epoch=None,
          validation_steps=None,
          validation_freq=1,
          **kwargs):
    """Fit loop for Distribution Strategies."""
    dist_utils.validate_callbacks(input_callbacks=callbacks,
                                  optimizer=model.optimizer)
    dist_utils.validate_inputs(x, y)

    batch_size, steps_per_epoch = dist_utils.process_batch_and_step_size(
        model._distribution_strategy, x, batch_size, steps_per_epoch,
        ModeKeys.TRAIN, validation_split=validation_split)
    batch_size = model._validate_or_infer_batch_size(
        batch_size, steps_per_epoch, x)
    dataset = model._distribution_standardize_user_data(
        x, y,
        sample_weight=sample_weight,
        class_weight=class_weight,
        batch_size=batch_size,
        validation_split=validation_split,
        shuffle=shuffle,
        epochs=epochs)
    if not dist_utils.is_distributing_by_cloning(model):
      with model._distribution_strategy.scope():
        (dataset, _, _) = model._standardize_user_data(
            dataset,
            sample_weight=sample_weight,
            class_weight=class_weight,
            batch_size=batch_size,
            validation_split=validation_split,
            shuffle=shuffle)

    val_dataset = None
    if validation_data:
      val_x, val_y, val_sample_weights = (
          training_utils_v1.unpack_validation_data(validation_data))
      dist_utils.validate_inputs(val_x, val_y)
      _, validation_steps = dist_utils.process_batch_and_step_size(
          model._distribution_strategy, val_x, batch_size, validation_steps,
          ModeKeys.TEST)

      val_dataset = model._distribution_standardize_user_data(
          val_x, val_y,
          sample_weight=val_sample_weights,
          class_weight=None,
          batch_size=batch_size,
          validation_split=validation_split,
          shuffle=shuffle,
          allow_partial_batch=True)
    elif validation_split:
      raise ValueError('validation_split argument is not supported with '
                       'distribution strategies.')

    if backend.is_tpu_strategy(model._distribution_strategy):
      steps_per_epoch = training_utils_v1.infer_steps_for_dataset(
          model, dataset, steps_per_epoch, epochs,
          steps_name='steps_per_epoch')
      if steps_per_epoch is None:
        raise ValueError('Number of steps could not be inferred from the '
                         'data, please pass the steps_per_epoch argument.')

      if not tf.executing_eagerly():
        # Run TPU training in a custom loop in graph mode.
        return experimental_tpu_fit_loop(
            model,
            dataset,
            epochs=epochs,
            verbose=verbose,
            callbacks=callbacks,
            val_dataset=val_dataset,
            initial_epoch=initial_epoch,
            steps_per_epoch=steps_per_epoch,
            validation_steps=validation_steps,
            validation_freq=validation_freq)

    return training_arrays_v1.fit_loop(
        model,
        dataset,
        batch_size=batch_size,
        epochs=epochs,
        verbose=verbose,
        callbacks=callbacks,
        val_inputs=val_dataset,
        shuffle=shuffle,
        initial_epoch=initial_epoch,
        steps_per_epoch=steps_per_epoch,
        validation_steps=validation_steps,
        validation_freq=validation_freq,
        steps_name='steps_per_epoch')

  def evaluate(self,
               model,
               x=None,
               y=None,
               batch_size=None,
               verbose=1,
               sample_weight=None,
               steps=None,
               callbacks=None,
               **kwargs):
    """Evaluate loop for Distribution Strategies."""
    dist_utils.validate_inputs(x, y)
    batch_size, steps = dist_utils.process_batch_and_step_size(
        model._distribution_strategy, x, batch_size, steps, ModeKeys.TEST)
    batch_size = model._validate_or_infer_batch_size(batch_size, steps, x)
    dataset = model._distribution_standardize_user_data(
        x, y,
        sample_weight=sample_weight,
        batch_size=batch_size,
        allow_partial_batch=True)
    if backend.is_tpu_strategy(model._distribution_strategy):
      steps = training_utils_v1.infer_steps_for_dataset(
          model, dataset, steps, steps_name='steps')
      if steps is None:
        raise ValueError('Number of steps could not be inferred from the '
                         'data, please pass the steps argument.')

      if not tf.executing_eagerly():
        # Run TPU evaluation in a custom loop in graph mode.
        return experimental_tpu_test_loop(
            model, dataset, verbose=verbose, steps=steps, callbacks=callbacks)
    return training_arrays_v1.test_loop(
        model,
        inputs=dataset,
        batch_size=batch_size,
        verbose=verbose,
        steps=steps,
        callbacks=callbacks)

  def predict(self,
              model,
              x,
              batch_size=None,
              verbose=0,
              steps=None,
              callbacks=None,
              **kwargs):
    """Predict loop for Distribution Strategies."""
    dist_utils.validate_inputs(x=x, y=None)
    batch_size, steps = dist_utils.process_batch_and_step_size(
        model._distribution_strategy, x, batch_size, steps, ModeKeys.PREDICT)
    batch_size = model._validate_or_infer_batch_size(batch_size, steps, x)
    dataset = model._distribution_standardize_user_data(
        x, batch_size=batch_size, allow_partial_batch=True)
    if backend.is_tpu_strategy(model._distribution_strategy):
      steps = training_utils_v1.infer_steps_for_dataset(
          model, dataset, steps, steps_name='steps')
      if steps is None:
        raise ValueError('Number of steps could not be inferred from the '
                         'data, please pass the steps argument.')

      if not tf.executing_eagerly():
        return experimental_tpu_predict_loop(
            model, dataset, verbose=verbose, steps=steps, callbacks=callbacks)
    return training_arrays_v1.predict_loop(
        model,
        dataset,
        batch_size=batch_size,
        verbose=verbose,
        steps=steps,
        callbacks=callbacks)
Ancestors
Methods
def evaluate(self, model, x=None, y=None, batch_size=None, verbose=1, sample_weight=None, steps=None, callbacks=None, **kwargs)
-
Evaluate loop for Distribution Strategies.
Expand source code
def evaluate(self, model, x=None, y=None, batch_size=None, verbose=1, sample_weight=None, steps=None, callbacks=None, **kwargs): """Evaluate loop for Distribution Strategies.""" dist_utils.validate_inputs(x, y) batch_size, steps = dist_utils.process_batch_and_step_size( model._distribution_strategy, x, batch_size, steps, ModeKeys.TEST) batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) dataset = model._distribution_standardize_user_data( x, y, sample_weight=sample_weight, batch_size=batch_size, allow_partial_batch=True) if backend.is_tpu_strategy(model._distribution_strategy): steps = training_utils_v1.infer_steps_for_dataset( model, dataset, steps, steps_name='steps') if steps is None: raise ValueError('Number of steps could not be inferred from the data, ' 'please pass the steps argument.') if not tf.executing_eagerly(): # Run TPU evaluation in a custom loop in graph mode. return experimental_tpu_test_loop( model, dataset, verbose=verbose, steps=steps, callbacks=callbacks) return training_arrays_v1.test_loop( model, inputs=dataset, batch_size=batch_size, verbose=verbose, steps=steps, callbacks=callbacks)
def fit(self, model, x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None, validation_split=0.0, validation_data=None, shuffle=True, class_weight=None, sample_weight=None, initial_epoch=0, steps_per_epoch=None, validation_steps=None, validation_freq=1, **kwargs)
-
Fit loop for Distribution Strategies.
Expand source code
def fit(self,
        model,
        x=None,
        y=None,
        batch_size=None,
        epochs=1,
        verbose=1,
        callbacks=None,
        validation_split=0.,
        validation_data=None,
        shuffle=True,
        class_weight=None,
        sample_weight=None,
        initial_epoch=0,
        steps_per_epoch=None,
        validation_steps=None,
        validation_freq=1,
        **kwargs):
  """Fit loop for Distribution Strategies."""
  dist_utils.validate_callbacks(input_callbacks=callbacks,
                                optimizer=model.optimizer)
  dist_utils.validate_inputs(x, y)

  # Resolve batch size and steps for the strategy before standardizing data.
  batch_size, steps_per_epoch = dist_utils.process_batch_and_step_size(
      model._distribution_strategy, x, batch_size, steps_per_epoch,
      ModeKeys.TRAIN, validation_split=validation_split)
  batch_size = model._validate_or_infer_batch_size(
      batch_size, steps_per_epoch, x)
  dataset = model._distribution_standardize_user_data(
      x, y,
      sample_weight=sample_weight,
      class_weight=class_weight,
      batch_size=batch_size,
      validation_split=validation_split,
      shuffle=shuffle,
      epochs=epochs)
  if not dist_utils.is_distributing_by_cloning(model):
    with model._distribution_strategy.scope():
      (dataset, _, _) = model._standardize_user_data(
          dataset,
          sample_weight=sample_weight,
          class_weight=class_weight,
          batch_size=batch_size,
          validation_split=validation_split,
          shuffle=shuffle)

  val_dataset = None
  if validation_data:
    val_x, val_y, val_sample_weights = (
        training_utils_v1.unpack_validation_data(validation_data))
    dist_utils.validate_inputs(val_x, val_y)
    _, validation_steps = dist_utils.process_batch_and_step_size(
        model._distribution_strategy, val_x, batch_size, validation_steps,
        ModeKeys.TEST)

    val_dataset = model._distribution_standardize_user_data(
        val_x, val_y,
        sample_weight=val_sample_weights,
        class_weight=None,
        batch_size=batch_size,
        validation_split=validation_split,
        shuffle=shuffle,
        allow_partial_batch=True)
  elif validation_split:
    raise ValueError('validation_split argument is not supported with '
                     'distribution strategies.')

  if backend.is_tpu_strategy(model._distribution_strategy):
    steps_per_epoch = training_utils_v1.infer_steps_for_dataset(
        model, dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch')
    if steps_per_epoch is None:
      raise ValueError('Number of steps could not be inferred from the data, '
                       'please pass the steps_per_epoch argument.')

    if not tf.executing_eagerly():
      # Run TPU training in a custom loop in graph mode.
      return experimental_tpu_fit_loop(
          model,
          dataset,
          epochs=epochs,
          verbose=verbose,
          callbacks=callbacks,
          val_dataset=val_dataset,
          initial_epoch=initial_epoch,
          steps_per_epoch=steps_per_epoch,
          validation_steps=validation_steps,
          validation_freq=validation_freq)

  return training_arrays_v1.fit_loop(
      model,
      dataset,
      batch_size=batch_size,
      epochs=epochs,
      verbose=verbose,
      callbacks=callbacks,
      val_inputs=val_dataset,
      shuffle=shuffle,
      initial_epoch=initial_epoch,
      steps_per_epoch=steps_per_epoch,
      validation_steps=validation_steps,
      validation_freq=validation_freq,
      steps_name='steps_per_epoch')
def predict(self, model, x, batch_size=None, verbose=0, steps=None, callbacks=None, **kwargs)
-
Predict loop for Distribution Strategies.
Expand source code
def predict(self, model, x, batch_size=None, verbose=0, steps=None, callbacks=None, **kwargs): """Predict loop for Distribution Strategies.""" dist_utils.validate_inputs(x=x, y=None) batch_size, steps = dist_utils.process_batch_and_step_size( model._distribution_strategy, x, batch_size, steps, ModeKeys.PREDICT) batch_size = model._validate_or_infer_batch_size(batch_size, steps, x) dataset = model._distribution_standardize_user_data( x, batch_size=batch_size, allow_partial_batch=True) if backend.is_tpu_strategy(model._distribution_strategy): steps = training_utils_v1.infer_steps_for_dataset( model, dataset, steps, steps_name='steps') if steps is None: raise ValueError('Number of steps could not be inferred from the data, ' 'please pass the steps argument.') if not tf.executing_eagerly(): return experimental_tpu_predict_loop( model, dataset, verbose=verbose, steps=steps, callbacks=callbacks) return training_arrays_v1.predict_loop( model, dataset, batch_size=batch_size, verbose=verbose, steps=steps, callbacks=callbacks)