pytorch  1.8.2
About: PyTorch provides Tensor computation (like NumPy) with strong GPU acceleration and Deep Neural Networks (in Python) built on a tape-based autograd system. LTS (Long Term Support) release.

CPUApplyUtils.h
#pragma once

#include <ATen/Parallel.h>
#include <ATen/TensorUtils.h>
#include <limits>
#include <utility>
#include <cstring>

namespace at {

/*
[collapse dims] Updates sizes, and strides to reflect a "collapse" of
the info, possibly excluding the optional excludeDim. A "collapsed" version
of the info is the fewest dims that order the tensor's elements in the same
way as the original info. If excludeDim is specified, the collapse is the
fewest dims that order the tensor's elements as the original and preserve the
excluded dimension, unless the tensor collapses to a point.

This function returns a pair of values.

1) The (new) index of the preserved dimension if excludeDim is
specified. 0 if the tensor is collapsed to a point. -1
otherwise.

2) The new number of dimensions.
*/
template <typename T>
inline std::pair<int64_t, int64_t> collapse_dims(
    T* sizes,
    T* strides,
    int64_t dims,
    const int excludeDim = -1) {
  TORCH_CHECK(
      excludeDim >= -1 && excludeDim < dims,
      "expected excluded dim between -1 and dims - 1");

  int64_t stopDim = (excludeDim == -1) ? dims : excludeDim;
  int64_t newIndex = -1;
  int64_t oldIndex = 0;
  int64_t remappedExcludedDim = -1;

  while (oldIndex < dims) {
    // Finds a dimension to collapse into
    for (; oldIndex < stopDim; ++oldIndex) {
      if (sizes[oldIndex] == 1) {
        continue;
      }

      ++newIndex;
      sizes[newIndex] = sizes[oldIndex];
      strides[newIndex] = strides[oldIndex];
      ++oldIndex;
      break;
    }

    // Collapses dims
    for (; oldIndex < stopDim; ++oldIndex) {
      if (sizes[oldIndex] == 1) {
        continue;
      }

      if (strides[newIndex] == sizes[oldIndex] * strides[oldIndex]) {
        sizes[newIndex] *= sizes[oldIndex];
        strides[newIndex] = strides[oldIndex];
      } else {
        ++newIndex;
        sizes[newIndex] = sizes[oldIndex];
        strides[newIndex] = strides[oldIndex];
      }
    }

    // Handles excludeDim being set (oldIndex == excludeDim)
    if (oldIndex != dims) {
      // Preserves excluded dimension
      ++newIndex;
      sizes[newIndex] = sizes[oldIndex];
      strides[newIndex] = strides[oldIndex];
      remappedExcludedDim = newIndex;

      // Restarts iteration after excludeDim
      ++oldIndex;
      stopDim = dims;
    }
  }

  // Handles special case of all dims size 1
  if (newIndex == -1 || (newIndex == 0 && sizes[0] == 1)) {
    dims = 1;
    sizes[0] = 1;
    strides[0] = 1;

    return std::pair<int64_t, int64_t>(0, 1);
  }

  dims = newIndex + 1;
  return std::pair<int64_t, int64_t>(remappedExcludedDim, dims);
}
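
// [Editor's illustrative note, not part of the upstream header] For a
// contiguous 2x3x4 tensor, sizes = {2, 3, 4} and strides = {12, 4, 1};
// every adjacent pair satisfies strides[i] == sizes[i + 1] * strides[i + 1],
// so all three dims collapse into one:
//
//   int64_t sizes[] = {2, 3, 4};
//   int64_t strides[] = {12, 4, 1};
//   auto res = at::collapse_dims(sizes, strides, 3);
//   // res == {-1, 1}; sizes[0] == 24, strides[0] == 1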

/*
 * The basic strategy for apply is as follows:
 *
 * 1. Starting with the outermost index, loop until we reach a dimension where
 * the data is no longer contiguous, i.e. the stride at that dimension is not
 * equal to the size of the tensor defined by the outer dimensions. Let's call
 * this outer (contiguous) tensor A. Note that if the Tensor is contiguous, then
 * A is equal to the entire Tensor. Let's call the inner tensor B.
 *
 * 2. We loop through the indices in B, starting at its outermost dimension. For
 * example, if B is a 2x2 matrix, then we do:
 *
 * B[0][0]
 * B[0][1]
 * B[1][0]
 * B[1][1]
 *
 * We set the offset into the underlying storage as (storageOffset + stride_B *
 * index_B), i.e. basically we compute the offset into the storage as we would
 * normally for a Tensor. But because we are guaranteed the subsequent data is
 * contiguous in memory, we can simply loop for sizeof(A) iterations and perform
 * the operation, without having to follow the order described by the strides of
 * A.
 *
 * 3. As an optimization, we merge dimensions of A that are contiguous in
 * memory. For example, if A is a 3x3x3x3 tensor narrowed from a 3x3x4x3 tensor,
 * then the first two dimensions can be merged for the purposes of APPLY,
 * reducing the number of nested loops.
 */

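// [Editor's illustrative note, not part of the upstream header] A worked
// instance of the merge in step 3: the contiguous 3x3x4x3 tensor has element
// strides {36, 12, 3, 1}, and narrowing dim 2 to size 3 keeps those strides.
// Dims 0 and 1 remain mergeable (36 == 3 * 12), dims 1 and 2 are not
// (12 != 3 * 3), and dims 2 and 3 are (3 == 3 * 1), so collapse_dims turns
// sizes {3, 3, 3, 3} / strides {36, 12, 3, 1} into sizes {9, 9} /
// strides {12, 1}.
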
inline Tensor sort_strides(Tensor& tensor_) {
  IntArrayRef strides = tensor_.strides();
  std::vector<int64_t> indices;
  indices.reserve(tensor_.ndimension());
  for (int64_t i = 0; i < tensor_.ndimension(); i++) {
    indices.push_back(i);
  }
  std::sort(indices.begin(), indices.end(), [&strides](int64_t i1, int64_t i2) {
    return strides[i1] > strides[i2];
  });
  Tensor tensor = tensor_.permute(indices);
  return tensor;
}

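// [Editor's illustrative note, not part of the upstream header] sort_strides
// permutes the dimensions into decreasing-stride order. A transposed 2D
// tensor illustrates the reordering, assuming the usual ATen factory
// functions are available:
//
//   at::Tensor t = at::rand({2, 3}).t();   // sizes {3, 2}, strides {1, 3}
//   at::Tensor s = at::sort_strides(t);    // sizes {2, 3}, strides {3, 1}
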
template <typename T, int N>
struct strided_tensor_iter_fixed {
 public:
  T* data_ = NULL;
  int64_t dim_ = 0;

  int64_t counter_[N];
  int64_t sizes_[N];
  int64_t strides_[N];

  strided_tensor_iter_fixed(strided_tensor_iter_fixed const&) = delete;
  void operator=(strided_tensor_iter_fixed const& x) = delete;
  strided_tensor_iter_fixed(strided_tensor_iter_fixed&&) = default;
  strided_tensor_iter_fixed(Tensor& tensor, bool sort_strides = false)
      : data_(tensor.data_ptr<T>()) {
    std::memset(counter_, 0, sizeof(int64_t) * N);
    if (tensor.dim() > 0) {
      std::memcpy(
          sizes_, tensor.sizes().data(), tensor.dim() * sizeof(int64_t));
      std::memcpy(
          strides_,
          tensor.strides().data(),
          tensor.dim() * sizeof(int64_t));
    }
    dim_ = std::get<1>(collapse_dims(sizes_, strides_, tensor.ndimension()));
  }
};

template <typename T>
struct strided_tensor_iter {
 private:
 public:
  T* data_ = NULL;
  int64_t dim_;

  std::vector<int64_t> counter_;
  std::vector<int64_t> sizes_;
  std::vector<int64_t> strides_;

  strided_tensor_iter(strided_tensor_iter const&) = delete;
  void operator=(strided_tensor_iter const& x) = delete;
  strided_tensor_iter(strided_tensor_iter&&) = default;
  strided_tensor_iter(Tensor& tensor)
      : data_(tensor.data_ptr<T>()),
        dim_(tensor.ndimension()),
        counter_(dim_, 0),
        sizes_(tensor.sizes().vec()),
        strides_(tensor.strides().vec()) {
    dim_ = std::get<1>(collapse_dims(sizes_.data(), strides_.data(), dim_));
  }
};

inline bool _all_equal_numel(at::ArrayRef<Tensor> tensors) {
  if (tensors.size() == 0)
    return true;
  int64_t all_numel = tensors[0].numel();
  for (size_t i = 1; i < tensors.size(); i++) {
    if (tensors[i].numel() != all_numel)
      return false;
  }
  return true;
}

inline std::string _all_equal_numel_error(at::ArrayRef<Tensor> tensors) {
  std::ostringstream oss;
  oss << "inconsistent tensor size, expected ";
  for (size_t i = 0; i < tensors.size() - 1; i++) {
    oss << tensors[i].sizes() << ", ";
  }
  oss << "and " << tensors[tensors.size() - 1].sizes()
      << " to have the same number of elements, but got ";
  for (size_t i = 0; i < tensors.size() - 1; i++) {
    oss << tensors[i].numel() << ", ";
  }
  oss << "and " << tensors[tensors.size() - 1].numel()
      << " elements respectively";
  return oss.str();
}

inline bool _apply_preamble(ArrayRef<Tensor> tensors) {
  checkDeviceType("CPU_tensor_apply", tensors, kCPU);
  checkLayout("CPU_tensor_apply", tensors, kStrided);
  if (!_all_equal_numel(tensors))
    AT_ERROR(_all_equal_numel_error(tensors));
  // An empty tensor has no elements
  for (auto& t : tensors)
    if (t.numel() == 0)
      return false;
  return true;
}

inline int64_t _max_dim_tensors(ArrayRef<Tensor> tensors) {
  int64_t dim = 0;
  for (auto& t : tensors)
    dim = std::max(dim, t.ndimension());
  return dim;
}

inline void iterate(int64_t size){};

template <typename Arg, typename... Args>
inline void iterate(int64_t size, Arg& iter, Args&... iter_tail) {
  iter.counter_[iter.dim_ - 1] += size;
  iter.data_ = iter.data_ + size * iter.strides_[iter.dim_ - 1];
  iterate(size, iter_tail...);
}

inline bool iterate_continue() {
  return true;
};

template <typename Arg, typename... Args>
inline bool iterate_continue(Arg& iter, Args&... iter_tail) {
  return iter.counter_[iter.dim_ - 1] < iter.sizes_[iter.dim_ - 1] &&
      iterate_continue(iter_tail...);
}

inline int64_t max_iterate_size() {
  return std::numeric_limits<int64_t>::max();
};

template <typename Arg, typename... Args>
inline int64_t max_iterate_size(Arg& iter, Args&... iter_tail) {
  return std::min(
      (iter.sizes_[iter.dim_ - 1] - iter.counter_[iter.dim_ - 1]),
      max_iterate_size(iter_tail...));
}

inline void iterate_overflow(){};

template <typename Arg, typename... Args>
inline void iterate_overflow(Arg& iter, Args&... iter_tail) {
  if (iter.counter_[iter.dim_ - 1] == iter.sizes_[iter.dim_ - 1]) {
    for (int64_t i = iter.dim_ - 1; i > 0; i--) {
      if (iter.counter_[i] == iter.sizes_[i]) {
        iter.counter_[i] = 0;
        iter.counter_[i - 1]++;
        iter.data_ = iter.data_ - (iter.sizes_[i] * iter.strides_[i]) +
            iter.strides_[i - 1];
      }
    }
  }
  iterate_overflow(iter_tail...);
}

inline void forward(int64_t offset){};

template <typename Arg, typename... Args>
inline void forward(int64_t offset, Arg& iter, Args&... iter_tail) {
  int64_t multi = offset;
  for (int64_t i = iter.dim_ - 1; i >= 0; i--) {
    int64_t inc = multi % iter.sizes_[i];
    multi = multi / iter.sizes_[i];
    iter.data_ = iter.data_ + inc * iter.strides_[i];
    iter.counter_[i] += inc;
  }
  forward(offset, iter_tail...);
}
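
// [Editor's illustrative note, not part of the upstream header] forward()
// converts a flat element offset into per-dimension counters and a data
// pointer advance. For an iterator with sizes_ = {2, 3, 4} and strides_ =
// {12, 4, 1}, offset 17 decomposes (innermost dimension first) as
// 17 % 4 = 1, (17 / 4) % 3 = 1, (17 / 4 / 3) % 2 = 1, so counter_ becomes
// {1, 1, 1} and data_ advances by 1*12 + 1*4 + 1*1 = 17 elements, matching
// the flat offset for this contiguous layout.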

inline int64_t max_dim() {
  return 0;
}

template <typename Arg, typename... Args>
inline int64_t max_dim(Arg& iter, Args&... iter_tail) {
  return std::max(iter.dim_, max_dim(iter_tail...));
}

inline void apply_op(){};

template <typename Op, typename... Args>
inline void
apply_op(int64_t numel, int64_t offset, const Op& op, Args... iters) {
  // For 0-dim tensors
  if (numel == 1 && max_dim(iters...) == 0) {
    op(*iters.data_...);
    return;
  }
  if (offset > 0)
    forward(offset, iters...);
  // Splitting this into chunks helps the compiler create faster assembly
  for (int64_t i = 0; i < numel;) {
    for (; iterate_continue(iters...) && i < numel;) {
      op(*iters.data_...);
      iterate(1, iters...);
      i++;
    }
    iterate_overflow(iters...);
  }
}

/*
  Apply a pointwise operator to a sequence of tensors.

  The calling convention for op is a function/functor that takes the same
  number of scalar references as the number of given tensors (apply_op
  dereferences each iterator's data pointer before calling op). For example,
  to compute a = b * c, op would be of the form:
  [](scalar& a_val, const scalar& b_val, const scalar& c_val) { a_val =
  b_val * c_val; };
*/

template <typename scalar1, typename scalar2, typename Op>
inline void CPU_tensor_apply2(Tensor tensor1, Tensor tensor2, const Op op) {
  if (!_apply_preamble({tensor1, tensor2}))
    return;
  if (_max_dim_tensors({tensor1, tensor2}) <= 8) {
    apply_op(
        tensor1.numel(),
        0,
        op,
        strided_tensor_iter_fixed<scalar1, 8>(tensor1),
        strided_tensor_iter_fixed<scalar2, 8>(tensor2));
  } else {
    apply_op(
        tensor1.numel(),
        0,
        op,
        strided_tensor_iter<scalar1>(tensor1),
        strided_tensor_iter<scalar2>(tensor2));
  }
}
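
// [Editor's illustrative note, not part of the upstream header] A minimal
// usage sketch, assuming the usual ATen factory functions are available in
// the translation unit; the lambda receives one dereferenced element per
// tensor:
//
//   at::Tensor dst = at::empty({2, 3}, at::kFloat);
//   at::Tensor src = at::rand({2, 3}, at::kFloat);
//   at::CPU_tensor_apply2<float, float>(
//       dst, src, [](float& d, const float& s) { d = s * 2.0f; });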

template <typename scalar1, typename scalar2, typename scalar3, typename Op>
inline void
CPU_tensor_apply3(Tensor tensor1, Tensor tensor2, Tensor tensor3, const Op op) {
  if (!_apply_preamble({tensor1, tensor2, tensor3}))
    return;
  if (_max_dim_tensors({tensor1, tensor2, tensor3}) <= 8) {
    apply_op(
        tensor1.numel(),
        0,
        op,
        strided_tensor_iter_fixed<scalar1, 8>(tensor1),
        strided_tensor_iter_fixed<scalar2, 8>(tensor2),
        strided_tensor_iter_fixed<scalar3, 8>(tensor3));
  } else {
    apply_op(
        tensor1.numel(),
        0,
        op,
        strided_tensor_iter<scalar1>(tensor1),
        strided_tensor_iter<scalar2>(tensor2),
        strided_tensor_iter<scalar3>(tensor3));
  }
}

template <
    typename scalar1,
    typename scalar2,
    typename scalar3,
    typename scalar4,
    typename Op>
inline void CPU_tensor_apply4(
    Tensor tensor1,
    Tensor tensor2,
    Tensor tensor3,
    Tensor tensor4,
    const Op op) {
  if (!_apply_preamble({tensor1, tensor2, tensor3, tensor4}))
    return;
  if (_max_dim_tensors({tensor1, tensor2, tensor3, tensor4}) <= 8) {
    apply_op(
        tensor1.numel(),
        0,
        op,
        strided_tensor_iter_fixed<scalar1, 8>(tensor1),
        strided_tensor_iter_fixed<scalar2, 8>(tensor2),
        strided_tensor_iter_fixed<scalar3, 8>(tensor3),
        strided_tensor_iter_fixed<scalar4, 8>(tensor4));
  } else {
    apply_op(
        tensor1.numel(),
        0,
        op,
        strided_tensor_iter<scalar1>(tensor1),
        strided_tensor_iter<scalar2>(tensor2),
        strided_tensor_iter<scalar3>(tensor3),
        strided_tensor_iter<scalar4>(tensor4));
  }
}

} // namespace at