Module keras.preprocessing.sequence
Utilities for preprocessing sequence data.
Expand source code
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for preprocessing sequence data."""
# pylint: disable=invalid-name
from keras_preprocessing import sequence
from keras.utils import data_utils
from tensorflow.python.util.tf_export import keras_export
make_sampling_table = sequence.make_sampling_table
skipgrams = sequence.skipgrams
# TODO(fchollet): consider making `_remove_long_seq` public.
_remove_long_seq = sequence._remove_long_seq # pylint: disable=protected-access
@keras_export('keras.preprocessing.sequence.TimeseriesGenerator')
class TimeseriesGenerator(sequence.TimeseriesGenerator, data_utils.Sequence):
"""Utility class for generating batches of temporal data.
This class takes in a sequence of data-points gathered at
equal intervals, along with time series parameters such as
stride, length of history, etc., to produce batches for
training/validation.
# Arguments
data: Indexable generator (such as list or Numpy array)
containing consecutive data points (timesteps).
The data should be at 2D, and axis 0 is expected
to be the time dimension.
targets: Targets corresponding to timesteps in `data`.
It should have same length as `data`.
length: Length of the output sequences (in number of timesteps).
sampling_rate: Period between successive individual timesteps
within sequences. For rate `r`, timesteps
`data[i]`, `data[i-r]`, ... `data[i - length]`
are used for create a sample sequence.
stride: Period between successive output sequences.
For stride `s`, consecutive output samples would
be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc.
start_index: Data points earlier than `start_index` will not be used
in the output sequences. This is useful to reserve part of the
data for test or validation.
end_index: Data points later than `end_index` will not be used
in the output sequences. This is useful to reserve part of the
data for test or validation.
shuffle: Whether to shuffle output samples,
or instead draw them in chronological order.
reverse: Boolean: if `true`, timesteps in each output sample will be
in reverse chronological order.
batch_size: Number of timeseries samples in each batch
(except maybe the last one).
# Returns
A [Sequence](https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence) instance.
# Examples
```python
from keras.preprocessing.sequence import TimeseriesGenerator
import numpy as np
data = np.array([[i] for i in range(50)])
targets = np.array([[i] for i in range(50)])
data_gen = TimeseriesGenerator(data, targets,
length=10, sampling_rate=2,
batch_size=2)
assert len(data_gen) == 20
batch_0 = data_gen[0]
x, y = batch_0
assert np.array_equal(x,
np.array([[[0], [2], [4], [6], [8]],
[[1], [3], [5], [7], [9]]]))
assert np.array_equal(y,
np.array([[10], [11]]))
```
"""
pass
@keras_export('keras.preprocessing.sequence.pad_sequences')
def pad_sequences(sequences, maxlen=None, dtype='int32',
padding='pre', truncating='pre', value=0.):
"""Pads sequences to the same length.
This function transforms a list (of length `num_samples`)
of sequences (lists of integers)
into a 2D Numpy array of shape `(num_samples, num_timesteps)`.
`num_timesteps` is either the `maxlen` argument if provided,
or the length of the longest sequence in the list.
Sequences that are shorter than `num_timesteps`
are padded with `value` until they are `num_timesteps` long.
Sequences longer than `num_timesteps` are truncated
so that they fit the desired length.
The position where padding or truncation happens is determined by
the arguments `padding` and `truncating`, respectively.
Pre-padding or removing values from the beginning of the sequence is the
default.
>>> sequence = [[1], [2, 3], [4, 5, 6]]
>>> tf.keras.preprocessing.sequence.pad_sequences(sequence)
array([[0, 0, 1],
[0, 2, 3],
[4, 5, 6]], dtype=int32)
>>> tf.keras.preprocessing.sequence.pad_sequences(sequence, value=-1)
array([[-1, -1, 1],
[-1, 2, 3],
[ 4, 5, 6]], dtype=int32)
>>> tf.keras.preprocessing.sequence.pad_sequences(sequence, padding='post')
array([[1, 0, 0],
[2, 3, 0],
[4, 5, 6]], dtype=int32)
>>> tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=2)
array([[0, 1],
[2, 3],
[5, 6]], dtype=int32)
Args:
sequences: List of sequences (each sequence is a list of integers).
maxlen: Optional Int, maximum length of all sequences. If not provided,
sequences will be padded to the length of the longest individual
sequence.
dtype: (Optional, defaults to int32). Type of the output sequences.
To pad sequences with variable length strings, you can use `object`.
padding: String, 'pre' or 'post' (optional, defaults to 'pre'):
pad either before or after each sequence.
truncating: String, 'pre' or 'post' (optional, defaults to 'pre'):
remove values from sequences larger than
`maxlen`, either at the beginning or at the end of the sequences.
value: Float or String, padding value. (Optional, defaults to 0.)
Returns:
Numpy array with shape `(len(sequences), maxlen)`
Raises:
ValueError: In case of invalid values for `truncating` or `padding`,
or in case of invalid shape for a `sequences` entry.
"""
return sequence.pad_sequences(
sequences, maxlen=maxlen, dtype=dtype,
padding=padding, truncating=truncating, value=value)
keras_export(
'keras.preprocessing.sequence.make_sampling_table', allow_multiple_exports=True)(make_sampling_table)
keras_export('keras.preprocessing.sequence.skipgrams', allow_multiple_exports=True)(skipgrams)
Functions
def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.0)
-
Pads sequences to the same length.
This function transforms a list (of length
num_samples
) of sequences (lists of integers) into a 2D Numpy array of shape(num_samples, num_timesteps)
.num_timesteps
is either themaxlen
argument if provided, or the length of the longest sequence in the list.Sequences that are shorter than
num_timesteps
are padded withvalue
until they arenum_timesteps
long.Sequences longer than
num_timesteps
are truncated so that they fit the desired length.The position where padding or truncation happens is determined by the arguments
padding
andtruncating
, respectively. Pre-padding or removing values from the beginning of the sequence is the default.>>> sequence = [[1], [2, 3], [4, 5, 6]] >>> tf.keras.preprocessing.sequence.pad_sequences(sequence) array([[0, 0, 1], [0, 2, 3], [4, 5, 6]], dtype=int32)
>>> tf.keras.preprocessing.sequence.pad_sequences(sequence, value=-1) array([[-1, -1, 1], [-1, 2, 3], [ 4, 5, 6]], dtype=int32)
>>> tf.keras.preprocessing.sequence.pad_sequences(sequence, padding='post') array([[1, 0, 0], [2, 3, 0], [4, 5, 6]], dtype=int32)
>>> tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=2) array([[0, 1], [2, 3], [5, 6]], dtype=int32)
Args
sequences
- List of sequences (each sequence is a list of integers).
maxlen
- Optional Int, maximum length of all sequences. If not provided, sequences will be padded to the length of the longest individual sequence.
dtype
- (Optional, defaults to int32). Type of the output sequences.
To pad sequences with variable length strings, you can use
object
. padding
- String, 'pre' or 'post' (optional, defaults to 'pre'): pad either before or after each sequence.
truncating
- String, 'pre' or 'post' (optional, defaults to 'pre'):
remove values from sequences larger than
maxlen
, either at the beginning or at the end of the sequences. value
- Float or String, padding value. (Optional, defaults to 0.)
Returns
Numpy array with shape
(len(sequences), maxlen)
Raises
ValueError
- In case of invalid values for
truncating
orpadding
, or in case of invalid shape for asequences
entry.
Expand source code
@keras_export('keras.preprocessing.sequence.pad_sequences') def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.): """Pads sequences to the same length. This function transforms a list (of length `num_samples`) of sequences (lists of integers) into a 2D Numpy array of shape `(num_samples, num_timesteps)`. `num_timesteps` is either the `maxlen` argument if provided, or the length of the longest sequence in the list. Sequences that are shorter than `num_timesteps` are padded with `value` until they are `num_timesteps` long. Sequences longer than `num_timesteps` are truncated so that they fit the desired length. The position where padding or truncation happens is determined by the arguments `padding` and `truncating`, respectively. Pre-padding or removing values from the beginning of the sequence is the default. >>> sequence = [[1], [2, 3], [4, 5, 6]] >>> tf.keras.preprocessing.sequence.pad_sequences(sequence) array([[0, 0, 1], [0, 2, 3], [4, 5, 6]], dtype=int32) >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, value=-1) array([[-1, -1, 1], [-1, 2, 3], [ 4, 5, 6]], dtype=int32) >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, padding='post') array([[1, 0, 0], [2, 3, 0], [4, 5, 6]], dtype=int32) >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=2) array([[0, 1], [2, 3], [5, 6]], dtype=int32) Args: sequences: List of sequences (each sequence is a list of integers). maxlen: Optional Int, maximum length of all sequences. If not provided, sequences will be padded to the length of the longest individual sequence. dtype: (Optional, defaults to int32). Type of the output sequences. To pad sequences with variable length strings, you can use `object`. padding: String, 'pre' or 'post' (optional, defaults to 'pre'): pad either before or after each sequence. truncating: String, 'pre' or 'post' (optional, defaults to 'pre'): remove values from sequences larger than `maxlen`, either at the beginning or at the end of the sequences. value: Float or String, padding value. (Optional, defaults to 0.) Returns: Numpy array with shape `(len(sequences), maxlen)` Raises: ValueError: In case of invalid values for `truncating` or `padding`, or in case of invalid shape for a `sequences` entry. """ return sequence.pad_sequences( sequences, maxlen=maxlen, dtype=dtype, padding=padding, truncating=truncating, value=value)
Classes
class TimeseriesGenerator (data, targets, length, sampling_rate=1, stride=1, start_index=0, end_index=None, shuffle=False, reverse=False, batch_size=128)
-
Utility class for generating batches of temporal data.
This class takes in a sequence of data-points gathered at equal intervals, along with time series parameters such as stride, length of history, etc., to produce batches for training/validation.
Arguments
data: Indexable generator (such as list or Numpy array) containing consecutive data points (timesteps). The data should be at 2D, and axis 0 is expected to be the time dimension. targets: Targets corresponding to timesteps in <code>data</code>. It should have same length as <code>data</code>. length: Length of the output sequences (in number of timesteps). sampling_rate: Period between successive individual timesteps within sequences. For rate <code>r</code>, timesteps <code>data\[i]</code>, `data[i-r]`, ... `data[i - length]` are used for create a sample sequence. stride: Period between successive output sequences. For stride <code>s</code>, consecutive output samples would be centered around <code>data\[i]</code>, `data[i+s]`, `data[i+2*s]`, etc. start_index: Data points earlier than <code>start\_index</code> will not be used in the output sequences. This is useful to reserve part of the data for test or validation. end_index: Data points later than <code>end\_index</code> will not be used in the output sequences. This is useful to reserve part of the data for test or validation. shuffle: Whether to shuffle output samples, or instead draw them in chronological order. reverse: Boolean: if <code>true</code>, timesteps in each output sample will be in reverse chronological order. batch_size: Number of timeseries samples in each batch (except maybe the last one).
Returns
A [Sequence](https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence) instance.
Examples
from keras.preprocessing.sequence import TimeseriesGenerator import numpy as np data = np.array([[i] for i in range(50)]) targets = np.array([[i] for i in range(50)]) data_gen = TimeseriesGenerator(data, targets, length=10, sampling_rate=2, batch_size=2) assert len(data_gen) == 20 batch_0 = data_gen[0] x, y = batch_0 assert np.array_equal(x, np.array([[[0], [2], [4], [6], [8]], [[1], [3], [5], [7], [9]]])) assert np.array_equal(y, np.array([[10], [11]]))
Expand source code
class TimeseriesGenerator(sequence.TimeseriesGenerator, data_utils.Sequence): """Utility class for generating batches of temporal data. This class takes in a sequence of data-points gathered at equal intervals, along with time series parameters such as stride, length of history, etc., to produce batches for training/validation. # Arguments data: Indexable generator (such as list or Numpy array) containing consecutive data points (timesteps). The data should be at 2D, and axis 0 is expected to be the time dimension. targets: Targets corresponding to timesteps in `data`. It should have same length as `data`. length: Length of the output sequences (in number of timesteps). sampling_rate: Period between successive individual timesteps within sequences. For rate `r`, timesteps `data[i]`, `data[i-r]`, ... `data[i - length]` are used for create a sample sequence. stride: Period between successive output sequences. For stride `s`, consecutive output samples would be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc. start_index: Data points earlier than `start_index` will not be used in the output sequences. This is useful to reserve part of the data for test or validation. end_index: Data points later than `end_index` will not be used in the output sequences. This is useful to reserve part of the data for test or validation. shuffle: Whether to shuffle output samples, or instead draw them in chronological order. reverse: Boolean: if `true`, timesteps in each output sample will be in reverse chronological order. batch_size: Number of timeseries samples in each batch (except maybe the last one). # Returns A [Sequence](https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence) instance. # Examples ```python from keras.preprocessing.sequence import TimeseriesGenerator import numpy as np data = np.array([[i] for i in range(50)]) targets = np.array([[i] for i in range(50)]) data_gen = TimeseriesGenerator(data, targets, length=10, sampling_rate=2, batch_size=2) assert len(data_gen) == 20 batch_0 = data_gen[0] x, y = batch_0 assert np.array_equal(x, np.array([[[0], [2], [4], [6], [8]], [[1], [3], [5], [7], [9]]])) assert np.array_equal(y, np.array([[10], [11]])) ``` """ pass
Ancestors
- keras_preprocessing.sequence.TimeseriesGenerator
- Sequence
Inherited members