23 sigma_arr = var_arr +
static_cast<T>(eps);
45 if (
gamma !=
nullptr && beta !=
nullptr) {
48 Y_arr = (((X_arr.rowwise() * scale_arr.transpose()).rowwise() +
57 Y_arr = (X_arr.rowwise() * scale_arr.transpose()).rowwise() +
78 if (
gamma !=
nullptr) {
80 for (
int i = 0; i <
M; ++i) {
81 ds[i] = (dYxX_arr.col(i) * gamma_arr).
sum();
82 db[i] = (dY_arr.col(i) * gamma_arr).
sum();
103 const T scale =
T(1) /
static_cast<T>(
N);
110 X_scale_arr = (db_arr * mean_arr - ds_arr) * rstd_arr.cube() *
scale;
112 -X_scale_arr * mean_arr - db_arr * rstd_arr *
scale;
113 if (g_scale !=
nullptr) {
133 if (
gamma !=
nullptr) {
135 for (
int i = 0; i <
M; ++i) {
136 dX_arr.col(i) = dY_arr.col(i) * gamma_arr * dY_scale[i] +
137 X_arr.col(i) * X_scale[i] +
bias[i];
143 dX_arr = (dY_arr.rowwise() * dY_scale_arr.transpose() +
144 X_arr.rowwise() * X_scale_arr.transpose())
146 bias_arr.transpose();
161 math::Set<T, CPUContext>(
N,
T(0), dgamma, &
context_);
162 math::Set<T, CPUContext>(
N,
T(0), dbeta, &
context_);
167 for (
int i = 0; i <
M; ++i) {
168 dgamma_arr += dYxX_arr.col(i) * rstd[i] + dY_arr.col(i) * g_scale[i];
169 dbeta_arr += dY_arr.col(i);
181 std::vector<OperatorDef> GetGradientDefs()
override {
187 return SingleGradientDef(
190 std::vector<std::string>{
GO(0), O(0), O(1), O(2), I(0), I(1)},
191 std::vector<std::string>{GI(0), GI(1), GI(2)});
193 return SingleGradientDef(
196 std::vector<std::string>{
GO(0), O(0), O(1), O(2), I(0)},
197 std::vector<std::string>{GI(0)});
209 .TensorInferenceFunction([](
const OperatorDef& def,
230Computes layer normalization as described in https://arxiv.org/pdf/1607.06450.pdf.
231Given an input tensor x of shape [a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}],
232this op treats dimensions a_k through a_{n-1} as feature vectors. For each
233feature vector, the op computes the mean and standard deviation. Then,
234it returns the normalized values (with respect to the feature vector).
236Note that this op does not contain the scale and bias terms described in the
237paper. Simply follow this op with an FC op to add those. Concretely, this op
240h = \frac{1}{\sigma}(a - \mu)
241where \mu = \frac{1}{H}\sum_{i=1}^{H} a_i
242and \sigma = \sqrt{\frac{1}{H}\sum_{i=1}^{H}(a_i - \mu)^2}
243where H is the number of hidden units (i.e. product of dimensions from 'axis'
248 "(int) default to 1; Describes axis of the inputs. Defaults to one "
249 "because the 0th axis most likely describes the batch size")
252 "(float) default to 0.001. Small value to be added to the stdev when"
253 " dividing out by that value. This prevents division by zero.")
255 "elementwise_affine",
256 "(bool) default to False; If true, this op will do affine "
257 "transformation after normalization.")
261 "Input tensor which layer normalization will be applied to")
265 "scale tensor for elementwise_affine, the shape should be the same as "
266 "the dimensions of X begin from axis")
270 "bias tensor for elementwise_affine, the shape should be the same as "
271 "the dimensions of X begin from axis")
272 .Output(0,
"output",
"Normalized values")
273 .Output(1,
"mean",
"Mean values for each feature vector")
274 .Output(2,
"stddev",
"Standard deviations for each feature vector");
280 "_caffe2::LayerNorm("
285 " float epsilon = 1e-5,"
286 " bool elementwise_affine = False"
287 ") -> (Tensor Y, Tensor mean, Tensor std)",
293 "_caffe2::LayerNorm",
294 C10LayerNorm_DontUseThisOpYet);
A helper class to index into arguments.
static bool HasArgument(const Def &def, const string &name)
static T GetSingleArgument(const Def &def, const string &name, const T &default_value)
GradientMakerBase(const OperatorDef &def, const vector< GradientWrapper > &g_output)
void ComputeFusedParams(const int M, const int N, const T *mean, const T *sigma, const T *ds, const T *db, T *rstd, T *X_scale, T *bias, T *g_scale)
void ComputeInternalGradients(const int M, const int N, const T *dY, const T *X, const T *gamma, T *dYxX, T *ds, T *db)
void LayerNormBackward(const int M, const int N, const T *dY, const T *X, const T *gamma, const T *dY_scale, const T *X_scale, const T *bias, T *dX)
void GammaBetaBackward(const int M, const int N, const T *dYxX, const T *dY, const T *rstd, const T *g_scale, T *dgamma, T *dbeta)
void ComputeSigmaAndFusedParams(const int N, const float eps, const T *mean, const T *var, T *stddev, T *scale, T *bias)
void LayerNormForward(const int M, const int N, const T *X, const T *scale, const T *bias, const T *gamma, const T *beta, T *Y)
#define C10_EXPORT_C10_OP_TO_CAFFE2_CPU( OperatorName, Name)
C10_EXPORT_CAFFE2_OP_TO_C10_CPU(LayerNorm, "_caffe2::LayerNorm(" " Tensor X," " Tensor? gamma," " Tensor? beta," " int axis = 1," " float epsilon = 1e-5," " bool elementwise_affine = False" ") -> (Tensor Y, Tensor mean, Tensor std)", caffe2::LayerNormOp< caffe2::CPUContext >) namespace caffe2
int canonical_axis_index_(int axis_index, int ndims)
Copyright (c) 2016-present, Facebook, Inc.
Eigen::Map< const Eigen::Array< T, Eigen::Dynamic, Eigen::Dynamic > > ConstEigenArrayMap
const auto canonical_axis
REGISTER_CPU_OPERATOR(ATen, ATenOp< CPUContext >)
parameter efficient embedding termed TT which can be plugged in into any model and trained end to end The benefits of our compressed TT layer are twofold instead of storing huge embedding it stores a sequence of much smaller dimensional and dimensional necessary for reconstructing the required which allows compressing the model significantly at the cost of a negligible performance drop the overall number of parameters can be relatively which allows to use larger batches or train efficiently in a case of limited resources DOC vector< int >
C10_EXPORT const Argument & GetArgument(const OperatorDef &def, const string &name)
If this op will do affine transformation after normalization scale tensor for elementwise_affine
return vector< TensorShape >
ArgumentHelper helper(def)
std::vector< int > input_dims(input_dims_long.begin(), input_dims_long.end())
std::vector< int > stat_dims(input_dims.begin(), input_dims.begin()+canonical_axis)
we first initialize the output tensor to all and then do accumulation Any further calls to the The input tensor that has to be accumulated to the output tensor If the output size is not the same as input size
const vector< TensorShape > & in
Eigen::Map< Eigen::Array< T, Eigen::Dynamic, Eigen::Dynamic > > EigenArrayMap
Eigen::Map< Eigen::Array< T, Eigen::Dynamic, 1 > > EigenVectorArrayMap
TensorShape CreateTensorShape(vector< T_I > dims, ::caffe2::TensorProto_DataType dt)
INT_MAX NumOutputs(1, INT_MAX)
vector< int64_t > GetDimsVector(const TensorShape &shape)
default The input dimensional tensor of shape $NCHW$ or $NHWC$ depending on the order parameter The bias as a dimensional tensor of size $C$ to be applied to the output var
Eigen::Map< const Eigen::Array< T, Eigen::Dynamic, 1 > > ConstEigenVectorArrayMap
returns a tensor containing the indices of the largest element along the given axis If the keepdims arg is *True the shape of the output tensor matches the input tensor except the axis dimension equals the axis dimension of the output tensor is removed Github axis
Feature map input with order NCHW or NHWC bias
Output tensor quantization scale X
*and produces a single output tensor *expanded *The op also takes an argument *dims *with a list of dimensions for where to add the single dimensional entries If the same blob is provided as input and the operation is copy free This is the exact inverse operation of *Squeeze *Github dims
See RoIPoolF Gradient of forward dX
SparseLengths8BitsRowwiseOp< CPUContext, 0, 1 >::LENGTHS SetDoc(R"DOC(
Variation of SparseLengthsMean operator, where DATA is
stored using 8bits. DATA was quantized with 8Bit row-wise
quantization (see doc to FloatToRowwiseQuantized8Bits operator). To
restore DATA from 8Bit, we use additional input that stores scales
and biases.
)DOC") .Input(0
REGISTER_GRADIENT(CTC, GetCTCGradient)
the gradient for the output of SpatialBN and the per channel mean and inverse std var vectors for the computes the per channel bias and scale gradient to be used during the backward pass for subsequent spatial batch normalization gradient calculation the results of this op are subsequently reduced over multiple devices to obtain statistics over a larger batch size in cases where the batch size for a single model copy is too low to yield the full benefit of batch normalization The resulting bias and scale can then be plugged back into SpatialBNGradient to get results over the larger batch size DOC mean
computes the sum of all elements per channel and the sum of all elements squared per channel These values can be reduced across multiple batches and used to obtain the mean and variance across the full set of batches Using the new mean and variance as input to SpatialBN has the effect of changing the batch size over which SpatialBN is applied DOC sum
If this op will do affine transformation after normalization gamma
INT_MAX batch_size images will be processed GPUs can optionally be used for part of the processing The following transformations are applied to the image A bounding box is applied to the initial Number of images to output for each run of the Whether or not to do color jitter Defaults to Image brightness scale used in color jittering Defaults to Whether or not to do color lighting Defaults to Scale the size of the smallest dimension of the image to this Scale and minsize are mutually exclusive Must be larger than crop both dimensions of the image will be set to minsize or scale
CAFFE_ENFORCE(dims.front() >=0, "Dimension ids must be non-negative.")