pytorch  1.8.2
About: PyTorch provides Tensor computation (like NumPy) with strong GPU acceleration and Deep Neural Networks (in Python) built on a tape-based autograd system. LTS (Long Term Support) release.

layer_norm_op.cc
#include "caffe2/operators/layer_norm_op.h"

#include <array>

#include "caffe2/core/export_c10_op_to_caffe2.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"

namespace caffe2 {

template <>
template <typename T>
void LayerNormOp<CPUContext>::ComputeSigmaAndFusedParams(
    const int N,
    const float eps,
    const T* mean,
    const T* var,
    T* sigma,
    T* scale,
    T* bias) {
  // Fuses the normalization constants: sigma = sqrt(var + eps),
  // scale = 1 / sigma and bias = -mean / sigma, so the forward pass
  // reduces to Y = X * scale + bias.
  ConstEigenVectorArrayMap<T> var_arr(var, N);
  EigenVectorArrayMap<T> sigma_arr(sigma, N);
  sigma_arr = var_arr + static_cast<T>(eps);
  math::Rsqrt<T, CPUContext>(N, sigma, scale, &context_);
  math::Mul<T, CPUContext>(N, scale, sigma, sigma, &context_);
  EigenVectorArrayMap<T>(bias, N) = -ConstEigenVectorArrayMap<T>(scale, N) *
      ConstEigenVectorArrayMap<T>(mean, N);
}

template <>
template <typename T>
void LayerNormOp<CPUContext>::LayerNormForward(
    const int M,
    const int N,
    const T* X,
    const T* scale,
    const T* bias,
    const T* gamma,
    const T* beta,
    T* Y) {
  // Y[i][j] = X[i][j] * scale[i] + bias[i], then optionally
  // Y[i][j] = Y[i][j] * gamma[j] + beta[j] when elementwise_affine is enabled.
  ConstEigenArrayMap<T> X_arr(X, N, M);
  ConstEigenVectorArrayMap<T> scale_arr(scale, M);
  ConstEigenVectorArrayMap<T> bias_arr(bias, M);
  EigenArrayMap<T> Y_arr(Y, N, M);
  if (gamma != nullptr && beta != nullptr) {
    ConstEigenVectorArrayMap<T> gamma_arr(gamma, N);
    ConstEigenVectorArrayMap<T> beta_arr(beta, N);
    Y_arr = (((X_arr.rowwise() * scale_arr.transpose()).rowwise() +
              bias_arr.transpose())
                 .colwise() *
             gamma_arr)
                .colwise() +
        beta_arr;
  } else {
    CAFFE_ENFORCE(gamma == nullptr);
    CAFFE_ENFORCE(beta == nullptr);
    Y_arr = (X_arr.rowwise() * scale_arr.transpose()).rowwise() +
        bias_arr.transpose();
  }
}

REGISTER_CPU_OPERATOR(LayerNorm, LayerNormOp<CPUContext>);

template <>
template <typename T>
void LayerNormGradientOp<CPUContext>::ComputeInternalGradients(
    const int M,
    const int N,
    const T* dY,
    const T* X,
    const T* gamma,
    T* dYxX,
    T* ds,
    T* db) {
  // Per-sample reductions used by the backward pass:
  // ds[i] = sum_j dY[i][j] * X[i][j] * gamma[j], db[i] = sum_j dY[i][j] * gamma[j]
  // (gamma is treated as all ones when it is not provided).
  math::Mul<T, CPUContext>(M * N, dY, X, dYxX, &context_);
  ConstEigenArrayMap<T> dYxX_arr(dYxX, N, M);
  ConstEigenArrayMap<T> dY_arr(dY, N, M);
  if (gamma != nullptr) {
    ConstEigenVectorArrayMap<T> gamma_arr(gamma, N);
    for (int i = 0; i < M; ++i) {
      ds[i] = (dYxX_arr.col(i) * gamma_arr).sum();
      db[i] = (dY_arr.col(i) * gamma_arr).sum();
    }
  } else {
    EigenVectorArrayMap<T>(ds, M) = dYxX_arr.colwise().sum();
    EigenVectorArrayMap<T>(db, M) = dY_arr.colwise().sum();
  }
}

template <>
template <typename T>
void LayerNormGradientOp<CPUContext>::ComputeFusedParams(
    const int M,
    const int N,
    const T* mean,
    const T* sigma,
    const T* ds,
    const T* db,
    T* rstd,
    T* X_scale,
    T* bias,
    T* g_scale) {
  // Folds the per-sample statistics (mean, sigma, ds, db) into the
  // coefficients rstd, X_scale and bias consumed by LayerNormBackward.
  const T scale = T(1) / static_cast<T>(N);
  ConstEigenVectorArrayMap<T> mean_arr(mean, M);
  ConstEigenVectorArrayMap<T> ds_arr(ds, M);
  ConstEigenVectorArrayMap<T> db_arr(db, M);
  EigenVectorArrayMap<T> rstd_arr(rstd, M);
  EigenVectorArrayMap<T> X_scale_arr(X_scale, M);
  rstd_arr = ConstEigenVectorArrayMap<T>(sigma, M).inverse();
  X_scale_arr = (db_arr * mean_arr - ds_arr) * rstd_arr.cube() * scale;
  EigenVectorArrayMap<T>(bias, M) =
      -X_scale_arr * mean_arr - db_arr * rstd_arr * scale;
  if (g_scale != nullptr) {
    EigenVectorArrayMap<T>(g_scale, M) = -rstd_arr * mean_arr;
  }
}

template <>
template <typename T>
void LayerNormGradientOp<CPUContext>::LayerNormBackward(
    const int M,
    const int N,
    const T* dY,
    const T* X,
    const T* gamma,
    const T* dY_scale,
    const T* X_scale,
    const T* bias,
    T* dX) {
  // dX[i][j] = dY[i][j] * gamma[j] * dY_scale[i] + X[i][j] * X_scale[i] + bias[i]
  // (gamma is treated as all ones when it is not provided).
  ConstEigenArrayMap<T> dY_arr(dY, N, M);
  ConstEigenArrayMap<T> X_arr(X, N, M);
  EigenArrayMap<T> dX_arr(dX, N, M);
  if (gamma != nullptr) {
    ConstEigenVectorArrayMap<T> gamma_arr(gamma, N);
    for (int i = 0; i < M; ++i) {
      dX_arr.col(i) = dY_arr.col(i) * gamma_arr * dY_scale[i] +
          X_arr.col(i) * X_scale[i] + bias[i];
    }
  } else {
    ConstEigenVectorArrayMap<T> dY_scale_arr(dY_scale, M);
    ConstEigenVectorArrayMap<T> X_scale_arr(X_scale, M);
    ConstEigenVectorArrayMap<T> bias_arr(bias, M);
    dX_arr = (dY_arr.rowwise() * dY_scale_arr.transpose() +
              X_arr.rowwise() * X_scale_arr.transpose())
                 .rowwise() +
        bias_arr.transpose();
  }
}

template <>
template <typename T>
void LayerNormGradientOp<CPUContext>::GammaBetaBackward(
    const int M,
    const int N,
    const T* dYxX,
    const T* dY,
    const T* rstd,
    const T* g_scale,
    T* dgamma,
    T* dbeta) {
  // Accumulates the affine-parameter gradients over the M samples:
  // dgamma[j] = sum_i (dY[i][j] * X[i][j] * rstd[i] + dY[i][j] * g_scale[i]),
  // dbeta[j]  = sum_i dY[i][j].
  math::Set<T, CPUContext>(N, T(0), dgamma, &context_);
  math::Set<T, CPUContext>(N, T(0), dbeta, &context_);
  ConstEigenArrayMap<T> dYxX_arr(dYxX, N, M);
  ConstEigenArrayMap<T> dY_arr(dY, N, M);
  EigenVectorArrayMap<T> dgamma_arr(dgamma, N);
  EigenVectorArrayMap<T> dbeta_arr(dbeta, N);
  for (int i = 0; i < M; ++i) {
    dgamma_arr += dYxX_arr.col(i) * rstd[i] + dY_arr.col(i) * g_scale[i];
    dbeta_arr += dY_arr.col(i);
  }
}

OPERATOR_SCHEMA(LayerNormGradient).NumInputs({5, 6}).NumOutputs({1, 3});

REGISTER_CPU_OPERATOR(LayerNormGradient, LayerNormGradientOp<CPUContext>);

namespace {

class GetLayerNormGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  std::vector<OperatorDef> GetGradientDefs() override {
    bool elementwise_affine = false;
    if (ArgumentHelper::HasArgument(Def(), "elementwise_affine")) {
      elementwise_affine = GetArgument(Def(), "elementwise_affine").i();
    }
    // With elementwise_affine the gradient op also consumes gamma (I(1)) and
    // produces gradients for gamma and beta (GI(1), GI(2)).
    if (elementwise_affine) {
      return SingleGradientDef(
          "LayerNormGradient",
          "",
          std::vector<std::string>{GO(0), O(0), O(1), O(2), I(0), I(1)},
          std::vector<std::string>{GI(0), GI(1), GI(2)});
    } else {
      return SingleGradientDef(
          "LayerNormGradient",
          "",
          std::vector<std::string>{GO(0), O(0), O(1), O(2), I(0)},
          std::vector<std::string>{GI(0)});
    }
  }
};

} // namespace

REGISTER_GRADIENT(LayerNorm, GetLayerNormGradient);

OPERATOR_SCHEMA(LayerNorm)
    .NumInputs({1, 3})
    .NumOutputs(3)
    .TensorInferenceFunction([](const OperatorDef& def,
                                const vector<TensorShape>& in) {
      // Output 0 keeps the input shape; outputs 1 and 2 (mean and stddev)
      // collapse the dimensions from `axis` onward into a single 1.
      std::vector<TensorShape> out(3);
      auto input_dims_long = GetDimsVector(in[0]);
      std::vector<int> input_dims(
          input_dims_long.begin(), input_dims_long.end());
      ArgumentHelper helper(def);

      auto axis = helper.GetSingleArgument<int32_t>("axis", 1);
      const auto canonical_axis =
          canonical_axis_index_(axis, in[0].dims().size());
      std::vector<int> stat_dims(
          input_dims.begin(), input_dims.begin() + canonical_axis);
      stat_dims.push_back(1);
      out[0] = CreateTensorShape(input_dims, in[0].data_type());
      out[1] = CreateTensorShape(stat_dims, TensorProto::FLOAT);
      out[2] = CreateTensorShape(stat_dims, TensorProto::FLOAT);
      return out;
    })
    .SetDoc(R"DOC(
Computes layer normalization as described in https://arxiv.org/pdf/1607.06450.pdf.
Given an input vector x \in [a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}],
this op treats dimensions a_k through a_{n-1} as feature vectors. For each
feature vector, the op computes the mean and standard deviation. Then,
it returns the normalized values (with respect to the feature vector).

Note that this op does not apply the scale and bias terms described in the
paper unless elementwise_affine is set and gamma/beta are given as inputs;
otherwise, simply follow this op with an FC op to add them. Concretely, this
op implements:

h = \frac{1}{\sigma}(a - \mu)
where \mu = \frac{1}{H}\sum_{i=1}^{H} a_i
and \sigma = \sqrt{\frac{1}{H}\sum_{i=1}^{H}(a_i - \mu)^2 + \epsilon}
where H is the number of hidden units (i.e. the product of dimensions from
'axis' to the end) and \epsilon is the small constant given by the 'epsilon'
argument.
)DOC")
    .Arg(
        "axis",
        "(int) default to 1; Describes axis of the inputs. Defaults to one "
        "because the 0th axis most likely describes the batch size")
    .Arg(
        "epsilon",
        "(float) default to 0.001. Small value to be added to the stdev when"
        " dividing out by that value. This prevents division by zero.")
    .Arg(
        "elementwise_affine",
        "(bool) default to False; If true, this op will do affine "
        "transformation after normalization.")
    .Input(
        0,
        "input",
        "Input tensor which layer normalization will be applied to")
    .Input(
        1,
        "gamma",
        "scale tensor for elementwise_affine, the shape should be the same as "
        "the dimensions of X begin from axis")
    .Input(
        2,
        "beta",
        "bias tensor for elementwise_affine, the shape should be the same as "
        "the dimensions of X begin from axis")
    .Output(0, "output", "Normalized values")
    .Output(1, "mean", "Mean values for each feature vector")
    .Output(2, "stddev", "Standard deviations for each feature vector");

} // namespace caffe2

C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
    LayerNorm,
    "_caffe2::LayerNorm("
    " Tensor X,"
    " Tensor? gamma,"
    " Tensor? beta,"
    " int axis = 1,"
    " float epsilon = 1e-5,"
    " bool elementwise_affine = False"
    ") -> (Tensor Y, Tensor mean, Tensor std)",
    caffe2::LayerNormOp<caffe2::CPUContext>);

namespace caffe2 {

C10_EXPORT_C10_OP_TO_CAFFE2_CPU(
    "_caffe2::LayerNorm",
    C10LayerNorm_DontUseThisOpYet);

} // namespace caffe2
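
As a worked illustration of the formula in the LayerNorm schema documentation above, the following standalone sketch (not part of layer_norm_op.cc; the helper name NaiveLayerNorm is hypothetical) normalizes each row of a row-major M x N matrix the way the operator does for the default axis = 1 with no gamma/beta, adding epsilon to the variance before the square root just as ComputeSigmaAndFusedParams does:

#include <cmath>
#include <cstdio>
#include <vector>

// Normalizes each length-N row of a row-major M x N matrix:
// mu = mean(row), sigma = sqrt(var(row) + epsilon), y = (x - mu) / sigma.
std::vector<float> NaiveLayerNorm(
    const std::vector<float>& X, int M, int N, float epsilon = 1e-3f) {
  std::vector<float> Y(X.size());
  for (int i = 0; i < M; ++i) {
    const float* a = X.data() + i * N;
    float mu = 0.0f;
    for (int j = 0; j < N; ++j) {
      mu += a[j];
    }
    mu /= N;
    float var = 0.0f;
    for (int j = 0; j < N; ++j) {
      var += (a[j] - mu) * (a[j] - mu);
    }
    var /= N;
    // epsilon (default 0.001 in the schema above) guards against division by zero.
    const float sigma = std::sqrt(var + epsilon);
    for (int j = 0; j < N; ++j) {
      Y[i * N + j] = (a[j] - mu) / sigma;
    }
  }
  return Y;
}

int main() {
  // Row 0 has mean 2.5 and variance 1.25; row 1 is constant, so its
  // normalized values are exactly 0 (epsilon keeps sigma finite).
  const std::vector<float> X = {1.0f, 2.0f, 3.0f, 4.0f,
                                10.0f, 10.0f, 10.0f, 10.0f};
  const auto Y = NaiveLayerNorm(X, /*M=*/2, /*N=*/4);
  for (float y : Y) {
    std::printf("%.4f ", y);
  }
  std::printf("\n");
  return 0;
}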