Module keras.layers.preprocessing.category_encoding
Keras CategoryEncoding preprocessing layer.
Expand source code
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras CategoryEncoding preprocessing layer."""
import tensorflow.compat.v2 as tf
# pylint: disable=g-classes-have-attributes
import numpy as np
from keras import backend
from keras.engine import base_layer
from keras.engine import base_preprocessing_layer
from keras.utils import layer_utils
from keras.utils import tf_utils
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util.tf_export import keras_export
# String identifiers for output modes. `CategoryEncoding.__init__` in this file
# accepts only ONE_HOT, MULTI_HOT and COUNT; INT is defined here but is not
# among the values the layer validates against.
INT = "int"
ONE_HOT = "one_hot"
MULTI_HOT = "multi_hot"
COUNT = "count"
@keras_export("keras.layers.CategoryEncoding",
              "keras.layers.experimental.preprocessing.CategoryEncoding")
class CategoryEncoding(base_layer.Layer):
  """Category encoding layer.

  This layer provides options for condensing data into a categorical encoding
  when the total number of tokens is known in advance. It accepts integer
  values as inputs, and it outputs a dense representation of those
  inputs. For integer inputs where the total number of tokens is not known,
  use instead `tf.keras.layers.IntegerLookup`.

  Examples:

  **One-hot encoding data**

  >>> layer = tf.keras.layers.CategoryEncoding(
  ...           num_tokens=4, output_mode="one_hot")
  >>> layer([3, 2, 0, 1])
  <tf.Tensor: shape=(4, 4), dtype=float32, numpy=
    array([[0., 0., 0., 1.],
           [0., 0., 1., 0.],
           [1., 0., 0., 0.],
           [0., 1., 0., 0.]], dtype=float32)>

  **Multi-hot encoding data**

  >>> layer = tf.keras.layers.CategoryEncoding(
  ...           num_tokens=4, output_mode="multi_hot")
  >>> layer([[0, 1], [0, 0], [1, 2], [3, 1]])
  <tf.Tensor: shape=(4, 4), dtype=float32, numpy=
    array([[1., 1., 0., 0.],
           [1., 0., 0., 0.],
           [0., 1., 1., 0.],
           [0., 1., 0., 1.]], dtype=float32)>

  **Using weighted inputs in `"count"` mode**

  >>> layer = tf.keras.layers.CategoryEncoding(
  ...           num_tokens=4, output_mode="count")
  >>> count_weights = np.array([[.1, .2], [.1, .1], [.2, .3], [.4, .2]])
  >>> layer([[0, 1], [0, 0], [1, 2], [3, 1]], count_weights=count_weights)
  <tf.Tensor: shape=(4, 4), dtype=float64, numpy=
    array([[0.1, 0.2, 0. , 0. ],
           [0.2, 0. , 0. , 0. ],
           [0. , 0.2, 0.3, 0. ],
           [0. , 0.2, 0. , 0.4]])>

  Args:
    num_tokens: The total number of tokens the layer should support. All inputs
      to the layer must be integers in the range `0 <= value < num_tokens`, or
      an error will be thrown.
    output_mode: Specification for the output of the layer.
      Defaults to `"multi_hot"`. Values can be `"one_hot"`, `"multi_hot"` or
      `"count"`, configuring the layer as follows:
        - `"one_hot"`: Encodes each individual element in the input into an
          array of `num_tokens` size, containing a 1 at the element index. If
          the last dimension is size 1, will encode on that dimension. If the
          last dimension is not size 1, will append a new dimension for the
          encoded output.
        - `"multi_hot"`: Encodes each sample in the input into a single array
          of `num_tokens` size, containing a 1 for each vocabulary term present
          in the sample. Treats the last dimension as the sample dimension, if
          input shape is `(..., sample_length)`, output shape will be
          `(..., num_tokens)`.
        - `"count"`: Like `"multi_hot"`, but the array contains a count of
          the number of times the token at that index appeared in the sample.
      For all output modes, currently only output up to rank 2 is supported.
    sparse: Boolean. If true, returns a `SparseTensor` instead of a dense
      `Tensor`. Defaults to `False`.

  Call arguments:
    inputs: A 1D or 2D tensor of integer inputs.
    count_weights: A tensor in the same shape as `inputs` indicating the
      weight for each sample value when summing up in `count` mode. Not used in
      `"multi_hot"` or `"one_hot"` modes.
  """

  def __init__(self,
               num_tokens=None,
               output_mode="multi_hot",
               sparse=False,
               **kwargs):
    """Validates and stores the layer configuration.

    Raises:
      ValueError: If `num_tokens` is None or smaller than 1. `output_mode` is
        checked by `layer_utils.validate_string_arg`.
    """
    # max_tokens is an old name for the num_tokens arg we continue to support
    # because of usage.
    if "max_tokens" in kwargs:
      logging.warning(
          "max_tokens is deprecated, please use num_tokens instead.")
      num_tokens = kwargs.pop("max_tokens")

    super(CategoryEncoding, self).__init__(**kwargs)
    base_preprocessing_layer.keras_kpl_gauge.get_cell("CategoryEncoding").set(
        True)

    # Support deprecated names for output_modes.
    if output_mode == "binary":
      output_mode = MULTI_HOT
    # 'output_mode' must be one of (COUNT, ONE_HOT, MULTI_HOT)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(COUNT, ONE_HOT, MULTI_HOT),
        layer_name="CategoryEncoding",
        arg_name="output_mode")

    if num_tokens is None:
      raise ValueError("num_tokens must be set to use this layer. If the "
                       "number of tokens is not known beforehand, use the "
                       "IntegerLookup layer instead.")
    if num_tokens < 1:
      raise ValueError("num_tokens must be >= 1.")

    self.num_tokens = num_tokens
    self.output_mode = output_mode
    self.sparse = sparse

  def compute_output_shape(self, input_shape):
    """Returns the output shape for a given input shape.

    Args:
      input_shape: Anything `tf.TensorShape` accepts: a list, a tuple, a
        `tf.TensorShape`, or None.

    Returns:
      A `tf.TensorShape` whose last dimension is `num_tokens`.
    """
    # Normalize first: concatenating a tuple with a list raises TypeError, so
    # the arithmetic below must not be applied to the caller's raw object.
    input_shape = tf.TensorShape(input_shape)
    # `rank` is None for an unknown shape and 0 for a scalar; both are encoded
    # as a single sample.
    if not input_shape.rank:
      return tf.TensorShape([self.num_tokens])
    dims = input_shape.as_list()
    if self.output_mode == ONE_HOT and dims[-1] != 1:
      return tf.TensorShape(dims + [self.num_tokens])
    else:
      return tf.TensorShape(dims[:-1] + [self.num_tokens])

  def compute_output_signature(self, input_spec):
    """Returns a dense or sparse spec shaped by `compute_output_shape`."""
    output_shape = self.compute_output_shape(input_spec.shape.as_list())
    # NOTE(review): the spec advertises int64, while `dense_bincount` in this
    # file requests `backend.floatx()` output. Confirm which dtype is intended
    # before relying on this signature.
    if self.sparse:
      return tf.SparseTensorSpec(
          shape=output_shape, dtype=tf.int64)
    else:
      return tf.TensorSpec(shape=output_shape, dtype=tf.int64)

  def get_config(self):
    """Returns the base layer config extended with this layer's arguments."""
    config = {
        "num_tokens": self.num_tokens,
        "output_mode": self.output_mode,
        "sparse": self.sparse,
    }
    base_config = super(CategoryEncoding, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs, count_weights=None):
    """Encodes `inputs` according to `output_mode`.

    Args:
      inputs: Integer values to encode. Lists and NumPy arrays are converted
        to tensors; sparse inputs are handled separately where needed.
      count_weights: Optional weights forwarded to the bincount op. Only
        allowed when `output_mode` is `"count"`.

    Returns:
      The result of `sparse_bincount` when `self.sparse` is true, otherwise
      the result of `dense_bincount`.

    Raises:
      ValueError: If the (possibly upranked) input has rank greater than 2, or
        if `count_weights` is passed while `output_mode` is not `"count"`.
    """
    if isinstance(inputs, (list, np.ndarray)):
      inputs = tf.convert_to_tensor(inputs)

    def expand_dims(inputs, axis):
      # Sparse tensors need the sparse-specific op.
      if tf_utils.is_sparse(inputs):
        return tf.sparse.expand_dims(inputs, axis)
      else:
        return tf.expand_dims(inputs, axis)

    # Kept for the error message below, which reports the shape as received.
    original_shape = inputs.shape
    # In all cases, we should uprank scalar input to a single sample.
    if inputs.shape.rank == 0:
      inputs = expand_dims(inputs, -1)
    # One hot will uprank only if the final output dimension is not already 1.
    if self.output_mode == ONE_HOT:
      if inputs.shape[-1] != 1:
        inputs = expand_dims(inputs, -1)

    # TODO(b/190445202): remove output rank restriction.
    if inputs.shape.rank > 2:
      raise ValueError(
          "Received input shape {}, which would result in output rank {}. "
          "Currently only outputs up to rank 2 are supported.".format(
              original_shape, inputs.shape.rank))

    if count_weights is not None and self.output_mode != COUNT:
      raise ValueError(
          "`count_weights` is not used when `output_mode` is not `'count'`. "
          "Received `count_weights={}`.".format(count_weights))

    out_depth = self.num_tokens
    binary_output = self.output_mode in (MULTI_HOT, ONE_HOT)
    # Reduce over the stored values only when the input is sparse.
    if isinstance(inputs, tf.SparseTensor):
      max_value = tf.reduce_max(inputs.values)
      min_value = tf.reduce_min(inputs.values)
    else:
      max_value = tf.reduce_max(inputs)
      min_value = tf.reduce_min(inputs)
    # Holds when 0 <= min_value and max_value < num_tokens.
    condition = tf.logical_and(
        tf.greater(
            tf.cast(out_depth, max_value.dtype), max_value),
        tf.greater_equal(
            min_value, tf.cast(0, min_value.dtype)))
    assertion = tf.Assert(condition, [
        "Input values must be in the range 0 <= values < num_tokens"
        " with num_tokens={}".format(out_depth)
    ])
    # Make the encoding op depend on the range check.
    with tf.control_dependencies([assertion]):
      if self.sparse:
        return sparse_bincount(inputs, out_depth, binary_output,
                               count_weights)
      else:
        return dense_bincount(inputs, out_depth, binary_output,
                              count_weights)
def sparse_bincount(inputs, out_depth, binary_output, count_weights=None):
  """Apply binary or count encoding to an input and return a sparse tensor.

  Args:
    inputs: Values to count, forwarded to `tf.sparse.bincount`. The code
      distinguishes rank-1 inputs from everything else.
    out_depth: Used as both `minlength` and `maxlength` of the bincount and as
      the size of the last output dimension.
    binary_output: Forwarded to `tf.sparse.bincount`.
    count_weights: Optional weights forwarded to `tf.sparse.bincount`.

  Returns:
    A `tf.SparseTensor` with values cast to `backend.floatx()`. Its dense
    shape is `(out_depth,)` for rank-1 inputs and `(batch_size, out_depth)`
    otherwise.
  """
  result = tf.sparse.bincount(
      inputs,
      weights=count_weights,
      minlength=out_depth,
      maxlength=out_depth,
      axis=-1,
      binary_output=binary_output)
  # Cast before branching so rank-1 and batched inputs share one output dtype;
  # previously only the batched branch was cast.
  result = tf.cast(result, backend.floatx())
  if inputs.shape.rank == 1:
    output_shape = (out_depth,)
  else:
    # The batch size is read at run time from the bincount result.
    batch_size = tf.shape(result)[0]
    output_shape = (batch_size, out_depth)
  # Rebuild the tensor so its dense shape ends in exactly `out_depth`.
  result = tf.SparseTensor(
      indices=result.indices,
      values=result.values,
      dense_shape=output_shape)
  return result
def dense_bincount(inputs, out_depth, binary_output, count_weights=None):
  """Apply binary or count encoding to an input.

  Args:
    inputs: Values to count, forwarded to `tf.math.bincount`.
    out_depth: Used as both `minlength` and `maxlength` of the bincount and as
      the size of the last output dimension.
    binary_output: Forwarded to `tf.math.bincount`.
    count_weights: Optional weights forwarded to `tf.math.bincount`.

  Returns:
    The bincount result with its static shape set to `(out_depth,)` for
    rank-1 inputs, or `(batch_size, out_depth)` otherwise.
  """
  counts = tf.math.bincount(
      inputs,
      weights=count_weights,
      minlength=out_depth,
      maxlength=out_depth,
      dtype=backend.floatx(),
      axis=-1,
      binary_output=binary_output)
  if inputs.shape.rank == 1:
    static_dims = (out_depth,)
  else:
    # Carry over the statically known leading dimension of the input.
    static_dims = (inputs.shape.as_list()[0], out_depth)
  counts.set_shape(tf.TensorShape(static_dims))
  return counts
Functions
def dense_bincount(inputs, out_depth, binary_output, count_weights=None)
-
Apply binary or count encoding to an input.
Expand source code
def dense_bincount(inputs, out_depth, binary_output, count_weights=None):
  """Apply binary or count encoding to an input.

  Args:
    inputs: Values to count, forwarded to `tf.math.bincount`.
    out_depth: Used as both `minlength` and `maxlength` of the bincount and as
      the size of the last output dimension.
    binary_output: Forwarded to `tf.math.bincount`.
    count_weights: Optional weights forwarded to `tf.math.bincount`.

  Returns:
    The bincount result with its static shape set to `(out_depth,)` for
    rank-1 inputs, or `(batch_size, out_depth)` otherwise.
  """
  counts = tf.math.bincount(
      inputs,
      weights=count_weights,
      minlength=out_depth,
      maxlength=out_depth,
      dtype=backend.floatx(),
      axis=-1,
      binary_output=binary_output)
  if inputs.shape.rank == 1:
    static_dims = (out_depth,)
  else:
    # Carry over the statically known leading dimension of the input.
    static_dims = (inputs.shape.as_list()[0], out_depth)
  counts.set_shape(tf.TensorShape(static_dims))
  return counts
def sparse_bincount(inputs, out_depth, binary_output, count_weights=None)
-
Apply binary or count encoding to an input and return a sparse tensor.
Expand source code
def sparse_bincount(inputs, out_depth, binary_output, count_weights=None):
  """Apply binary or count encoding to an input and return a sparse tensor.

  Args:
    inputs: Values to count, forwarded to `tf.sparse.bincount`. The code
      distinguishes rank-1 inputs from everything else.
    out_depth: Used as both `minlength` and `maxlength` of the bincount and as
      the size of the last output dimension.
    binary_output: Forwarded to `tf.sparse.bincount`.
    count_weights: Optional weights forwarded to `tf.sparse.bincount`.

  Returns:
    A `tf.SparseTensor` with values cast to `backend.floatx()`. Its dense
    shape is `(out_depth,)` for rank-1 inputs and `(batch_size, out_depth)`
    otherwise.
  """
  result = tf.sparse.bincount(
      inputs,
      weights=count_weights,
      minlength=out_depth,
      maxlength=out_depth,
      axis=-1,
      binary_output=binary_output)
  # Cast before branching so rank-1 and batched inputs share one output dtype;
  # previously only the batched branch was cast.
  result = tf.cast(result, backend.floatx())
  if inputs.shape.rank == 1:
    output_shape = (out_depth,)
  else:
    # The batch size is read at run time from the bincount result.
    batch_size = tf.shape(result)[0]
    output_shape = (batch_size, out_depth)
  # Rebuild the tensor so its dense shape ends in exactly `out_depth`.
  result = tf.SparseTensor(
      indices=result.indices,
      values=result.values,
      dense_shape=output_shape)
  return result
Classes
class CategoryEncoding (num_tokens=None, output_mode='multi_hot', sparse=False, **kwargs)
-
Category encoding layer.
This layer provides options for condensing data into a categorical encoding when the total number of tokens is known in advance. It accepts integer values as inputs, and it outputs a dense representation of those inputs. For integer inputs where the total number of tokens is not known, use `tf.keras.layers.IntegerLookup` instead.
Examples:
One-hot encoding data
>>> layer = tf.keras.layers.CategoryEncoding( ... num_tokens=4, output_mode="one_hot") >>> layer([3, 2, 0, 1]) <tf.Tensor: shape=(4, 4), dtype=float32, numpy= array([[0., 0., 0., 1.], [0., 0., 1., 0.], [1., 0., 0., 0.], [0., 1., 0., 0.]], dtype=float32)>
Multi-hot encoding data
>>> layer = tf.keras.layers.CategoryEncoding( ... num_tokens=4, output_mode="multi_hot") >>> layer([[0, 1], [0, 0], [1, 2], [3, 1]]) <tf.Tensor: shape=(4, 4), dtype=float32, numpy= array([[1., 1., 0., 0.], [1., 0., 0., 0.], [0., 1., 1., 0.], [0., 1., 0., 1.]], dtype=float32)>
Using weighted inputs in "count" mode
>>> layer = tf.keras.layers.CategoryEncoding(
...           num_tokens=4, output_mode="count")
>>> count_weights = np.array([[.1, .2], [.1, .1], [.2, .3], [.4, .2]])
>>> layer([[0, 1], [0, 0], [1, 2], [3, 1]], count_weights=count_weights)
<tf.Tensor: shape=(4, 4), dtype=float64, numpy=
  array([[0.1, 0.2, 0. , 0. ],
         [0.2, 0. , 0. , 0. ],
         [0. , 0.2, 0.3, 0. ],
         [0. , 0.2, 0. , 0.4]])>
Args
num_tokens
- The total number of tokens the layer should support. All inputs to the layer must be integers in the range `0 <= value < num_tokens`, or an error will be thrown.
output_mode
- Specification for the output of the layer. Defaults to `"multi_hot"`. Values can be `"one_hot"`, `"multi_hot"` or `"count"`, configuring the layer as follows:
  - `"one_hot"`: Encodes each individual element in the input into an array of `num_tokens` size, containing a 1 at the element index. If the last dimension is size 1, will encode on that dimension. If the last dimension is not size 1, will append a new dimension for the encoded output.
  - `"multi_hot"`: Encodes each sample in the input into a single array of `num_tokens` size, containing a 1 for each vocabulary term present in the sample. Treats the last dimension as the sample dimension, if input shape is `(..., sample_length)`, output shape will be `(..., num_tokens)`.
  - `"count"`: Like `"multi_hot"`, but the int array contains a count of the number of times the token at that index appeared in the sample.
  For all output modes, currently only output up to rank 2 is supported.
sparse
- Boolean. If true, returns a `SparseTensor` instead of a dense `Tensor`. Defaults to `False`.
Call arguments:
inputs: A 1D or 2D tensor of integer inputs.
count_weights: A tensor in the same shape as `inputs` indicating the weight for each sample value when summing up in `count` mode. Not used in `"multi_hot"` or `"one_hot"` modes.
Expand source code
class CategoryEncoding(base_layer.Layer):
  """Category encoding layer.

  This layer provides options for condensing data into a categorical encoding
  when the total number of tokens is known in advance. It accepts integer
  values as inputs, and it outputs a dense representation of those
  inputs. For integer inputs where the total number of tokens is not known,
  use instead `tf.keras.layers.IntegerLookup`.

  Examples:

  **One-hot encoding data**

  >>> layer = tf.keras.layers.CategoryEncoding(
  ...           num_tokens=4, output_mode="one_hot")
  >>> layer([3, 2, 0, 1])
  <tf.Tensor: shape=(4, 4), dtype=float32, numpy=
    array([[0., 0., 0., 1.],
           [0., 0., 1., 0.],
           [1., 0., 0., 0.],
           [0., 1., 0., 0.]], dtype=float32)>

  **Multi-hot encoding data**

  >>> layer = tf.keras.layers.CategoryEncoding(
  ...           num_tokens=4, output_mode="multi_hot")
  >>> layer([[0, 1], [0, 0], [1, 2], [3, 1]])
  <tf.Tensor: shape=(4, 4), dtype=float32, numpy=
    array([[1., 1., 0., 0.],
           [1., 0., 0., 0.],
           [0., 1., 1., 0.],
           [0., 1., 0., 1.]], dtype=float32)>

  **Using weighted inputs in `"count"` mode**

  >>> layer = tf.keras.layers.CategoryEncoding(
  ...           num_tokens=4, output_mode="count")
  >>> count_weights = np.array([[.1, .2], [.1, .1], [.2, .3], [.4, .2]])
  >>> layer([[0, 1], [0, 0], [1, 2], [3, 1]], count_weights=count_weights)
  <tf.Tensor: shape=(4, 4), dtype=float64, numpy=
    array([[0.1, 0.2, 0. , 0. ],
           [0.2, 0. , 0. , 0. ],
           [0. , 0.2, 0.3, 0. ],
           [0. , 0.2, 0. , 0.4]])>

  Args:
    num_tokens: The total number of tokens the layer should support. All inputs
      to the layer must be integers in the range `0 <= value < num_tokens`, or
      an error will be thrown.
    output_mode: Specification for the output of the layer.
      Defaults to `"multi_hot"`. Values can be `"one_hot"`, `"multi_hot"` or
      `"count"`, configuring the layer as follows:
        - `"one_hot"`: Encodes each individual element in the input into an
          array of `num_tokens` size, containing a 1 at the element index. If
          the last dimension is size 1, will encode on that dimension. If the
          last dimension is not size 1, will append a new dimension for the
          encoded output.
        - `"multi_hot"`: Encodes each sample in the input into a single array
          of `num_tokens` size, containing a 1 for each vocabulary term present
          in the sample. Treats the last dimension as the sample dimension, if
          input shape is `(..., sample_length)`, output shape will be
          `(..., num_tokens)`.
        - `"count"`: Like `"multi_hot"`, but the array contains a count of
          the number of times the token at that index appeared in the sample.
      For all output modes, currently only output up to rank 2 is supported.
    sparse: Boolean. If true, returns a `SparseTensor` instead of a dense
      `Tensor`. Defaults to `False`.

  Call arguments:
    inputs: A 1D or 2D tensor of integer inputs.
    count_weights: A tensor in the same shape as `inputs` indicating the
      weight for each sample value when summing up in `count` mode. Not used in
      `"multi_hot"` or `"one_hot"` modes.
  """

  def __init__(self,
               num_tokens=None,
               output_mode="multi_hot",
               sparse=False,
               **kwargs):
    """Validates and stores the layer configuration.

    Raises:
      ValueError: If `num_tokens` is None or smaller than 1. `output_mode` is
        checked by `layer_utils.validate_string_arg`.
    """
    # max_tokens is an old name for the num_tokens arg we continue to support
    # because of usage.
    if "max_tokens" in kwargs:
      logging.warning(
          "max_tokens is deprecated, please use num_tokens instead.")
      num_tokens = kwargs.pop("max_tokens")

    super(CategoryEncoding, self).__init__(**kwargs)
    base_preprocessing_layer.keras_kpl_gauge.get_cell("CategoryEncoding").set(
        True)

    # Support deprecated names for output_modes.
    if output_mode == "binary":
      output_mode = MULTI_HOT
    # 'output_mode' must be one of (COUNT, ONE_HOT, MULTI_HOT)
    layer_utils.validate_string_arg(
        output_mode,
        allowable_strings=(COUNT, ONE_HOT, MULTI_HOT),
        layer_name="CategoryEncoding",
        arg_name="output_mode")

    if num_tokens is None:
      raise ValueError("num_tokens must be set to use this layer. If the "
                       "number of tokens is not known beforehand, use the "
                       "IntegerLookup layer instead.")
    if num_tokens < 1:
      raise ValueError("num_tokens must be >= 1.")

    self.num_tokens = num_tokens
    self.output_mode = output_mode
    self.sparse = sparse

  def compute_output_shape(self, input_shape):
    """Returns the output shape for a given input shape.

    Args:
      input_shape: Anything `tf.TensorShape` accepts: a list, a tuple, a
        `tf.TensorShape`, or None.

    Returns:
      A `tf.TensorShape` whose last dimension is `num_tokens`.
    """
    # Normalize first: concatenating a tuple with a list raises TypeError, so
    # the arithmetic below must not be applied to the caller's raw object.
    input_shape = tf.TensorShape(input_shape)
    # `rank` is None for an unknown shape and 0 for a scalar; both are encoded
    # as a single sample.
    if not input_shape.rank:
      return tf.TensorShape([self.num_tokens])
    dims = input_shape.as_list()
    if self.output_mode == ONE_HOT and dims[-1] != 1:
      return tf.TensorShape(dims + [self.num_tokens])
    else:
      return tf.TensorShape(dims[:-1] + [self.num_tokens])

  def compute_output_signature(self, input_spec):
    """Returns a dense or sparse spec shaped by `compute_output_shape`."""
    output_shape = self.compute_output_shape(input_spec.shape.as_list())
    # NOTE(review): the spec advertises int64, while `dense_bincount` in this
    # file requests `backend.floatx()` output. Confirm which dtype is intended
    # before relying on this signature.
    if self.sparse:
      return tf.SparseTensorSpec(
          shape=output_shape, dtype=tf.int64)
    else:
      return tf.TensorSpec(shape=output_shape, dtype=tf.int64)

  def get_config(self):
    """Returns the base layer config extended with this layer's arguments."""
    config = {
        "num_tokens": self.num_tokens,
        "output_mode": self.output_mode,
        "sparse": self.sparse,
    }
    base_config = super(CategoryEncoding, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs, count_weights=None):
    """Encodes `inputs` according to `output_mode`.

    Args:
      inputs: Integer values to encode. Lists and NumPy arrays are converted
        to tensors; sparse inputs are handled separately where needed.
      count_weights: Optional weights forwarded to the bincount op. Only
        allowed when `output_mode` is `"count"`.

    Returns:
      The result of `sparse_bincount` when `self.sparse` is true, otherwise
      the result of `dense_bincount`.

    Raises:
      ValueError: If the (possibly upranked) input has rank greater than 2, or
        if `count_weights` is passed while `output_mode` is not `"count"`.
    """
    if isinstance(inputs, (list, np.ndarray)):
      inputs = tf.convert_to_tensor(inputs)

    def expand_dims(inputs, axis):
      # Sparse tensors need the sparse-specific op.
      if tf_utils.is_sparse(inputs):
        return tf.sparse.expand_dims(inputs, axis)
      else:
        return tf.expand_dims(inputs, axis)

    # Kept for the error message below, which reports the shape as received.
    original_shape = inputs.shape
    # In all cases, we should uprank scalar input to a single sample.
    if inputs.shape.rank == 0:
      inputs = expand_dims(inputs, -1)
    # One hot will uprank only if the final output dimension is not already 1.
    if self.output_mode == ONE_HOT:
      if inputs.shape[-1] != 1:
        inputs = expand_dims(inputs, -1)

    # TODO(b/190445202): remove output rank restriction.
    if inputs.shape.rank > 2:
      raise ValueError(
          "Received input shape {}, which would result in output rank {}. "
          "Currently only outputs up to rank 2 are supported.".format(
              original_shape, inputs.shape.rank))

    if count_weights is not None and self.output_mode != COUNT:
      raise ValueError(
          "`count_weights` is not used when `output_mode` is not `'count'`. "
          "Received `count_weights={}`.".format(count_weights))

    out_depth = self.num_tokens
    binary_output = self.output_mode in (MULTI_HOT, ONE_HOT)
    # Reduce over the stored values only when the input is sparse.
    if isinstance(inputs, tf.SparseTensor):
      max_value = tf.reduce_max(inputs.values)
      min_value = tf.reduce_min(inputs.values)
    else:
      max_value = tf.reduce_max(inputs)
      min_value = tf.reduce_min(inputs)
    # Holds when 0 <= min_value and max_value < num_tokens.
    condition = tf.logical_and(
        tf.greater(
            tf.cast(out_depth, max_value.dtype), max_value),
        tf.greater_equal(
            min_value, tf.cast(0, min_value.dtype)))
    assertion = tf.Assert(condition, [
        "Input values must be in the range 0 <= values < num_tokens"
        " with num_tokens={}".format(out_depth)
    ])
    # Make the encoding op depend on the range check.
    with tf.control_dependencies([assertion]):
      if self.sparse:
        return sparse_bincount(inputs, out_depth, binary_output,
                               count_weights)
      else:
        return dense_bincount(inputs, out_depth, binary_output,
                              count_weights)
Ancestors
- Layer
- tensorflow.python.module.module.Module
- tensorflow.python.training.tracking.tracking.AutoTrackable
- tensorflow.python.training.tracking.base.Trackable
- LayerVersionSelector
Inherited members
Layer
:activity_regularizer
add_loss
add_metric
add_update
add_variable
add_weight
apply
build
call
compute_dtype
compute_mask
compute_output_shape
compute_output_signature
count_params
dtype
dtype_policy
dynamic
finalize_state
from_config
get_config
get_input_at
get_input_mask_at
get_input_shape_at
get_losses_for
get_output_at
get_output_mask_at
get_output_shape_at
get_updates_for
get_weights
inbound_nodes
input
input_mask
input_shape
input_spec
losses
metrics
name
non_trainable_variables
non_trainable_weights
outbound_nodes
output
output_mask
output_shape
set_weights
supports_masking
trainable_variables
trainable_weights
variable_dtype
variables
weights