pytorch 1.8.2
About: PyTorch provides Tensor computation (like NumPy) with strong GPU acceleration and Deep Neural Networks (in Python) built on a tape-based autograd system. LTS (Long Term Support) release.

MPSCNNOps.mm
#import <ATen/native/metal/MetalCommandBuffer.h>
#import <ATen/native/metal/MetalTensorImpl.h>
#import <ATen/native/metal/mpscnn/MPSCNN.h>
#import <ATen/native/metal/mpscnn/MPSCNNClampOp.h>
#import <ATen/native/metal/mpscnn/MPSCNNContext.h>
#import <ATen/native/metal/mpscnn/MPSCNNConvOp.h>
#import <ATen/native/metal/mpscnn/MPSCNNNeuronOp.h>
#import <ATen/native/metal/mpscnn/MPSCNNOps.h>
#import <ATen/native/metal/mpscnn/MPSImage+Tensor.h>

#include <ATen/InferSize.h>
#include <ATen/native/Pool.h>
#include <ATen/native/UpSample.h>

namespace at {
namespace native {
namespace metal {
namespace mpscnn {

using MetalTensor = at::native::metal::MetalTensor;
using MetalTensorImpl = at::MetalTensorImpl<MetalTensor>;

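// The ops below share a common structure: unwrap the input tensor's MPSImage,
// allocate a temporary output texture on the input's command buffer, encode
// an MPSCNN kernel (or a custom Metal compute shader), and rewrap the result
// as a Tensor. In-place variants copy the result texture back into the input.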
API_AVAILABLE(ios(10.0), macos(10.13))
static inline MPSImage* imageFromMetalTensor(const MetalTensor& tensor) {
  return tensor.texture()->image();
}

API_AVAILABLE(ios(10.0), macos(10.13))
static inline MPSImage* imageFromTensor(const Tensor& tensor) {
  TORCH_CHECK(tensor.is_metal());
  MetalTensorImpl* impl = (MetalTensorImpl*)tensor.unsafeGetTensorImpl();
  MetalTensor& metalTensor = impl->unsafe_opaque_handle();
  return imageFromMetalTensor(metalTensor);
}

API_AVAILABLE(ios(10.0), macos(10.13))
static inline MetalCommandBuffer* commandBufferFromInputTensor(
    const Tensor& tensor) {
  TORCH_CHECK(tensor.is_metal());
  MetalTensorImpl* impl = (MetalTensorImpl*)tensor.unsafeGetTensorImpl();
  MetalTensor& metalTensor = impl->unsafe_opaque_handle();
  MetalCommandBuffer* cmdBuffer = metalTensor.texture()->commandBuffer();
  TORCH_CHECK(cmdBuffer, @"Command Buffer can't be nil!");
  return cmdBuffer;
}

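// conv2d: permutes the OIHW weights into the layout MPSCNNConvolution
// expects, builds an MPSCNNConvOp from the packed weights and optional bias,
// and encodes it into a temporary output image sized by Conv2DParams.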
API_AVAILABLE(ios(10.0), macos(10.13))
Tensor conv2d(
    const Tensor& input,
    const Tensor& weight,
    const c10::optional<at::Tensor>& bias,
    const Conv2DParams& params,
    NeuronType t) {
  MPSImage* X = imageFromTensor(input);
  const int64_t oC = weight.sizes()[0];
  const int64_t iC = weight.sizes()[1];
  const int64_t kH = weight.sizes()[2];
  const int64_t kW = weight.sizes()[3];
  auto packedWeights = permuteWeights(
      weight.data_ptr<float>(), {oC, iC, kH, kW});
  // MPSCNN Convolution
  float* w = packedWeights.data();
  float* b = bias.has_value() ? bias->data_ptr<float>() : nullptr;
  MPSCNNConvOp* op = [MPSCNNConvOp conv2d:params
                                  weights:w
                                     bias:b
                             neuronFilter:t];
  auto outputSize = params.output_sizes();
  MetalTensor mt{outputSize};
  MetalCommandBuffer* commandBuffer = commandBufferFromInputTensor(input);
  mt.texture()->allocateTemporaryTextureStorage(outputSize, commandBuffer);
  MPSImage* Y = imageFromMetalTensor(mt);
  [op encode:commandBuffer.buffer sourceImage:X destinationImage:Y];
  auto output = MetalTensor::toTensor(std::move(mt), input.options());
  return output;
}

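// Prepacked conv2d: the Conv2dOpContext caches the MPSCNNConvOp across calls
// (retained via CFBridgingRetain and freed by the release callback). If the
// context carries an output_min/output_max pair, a clamp pass is fused in
// after the convolution to emulate hardtanh.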
API_AVAILABLE(ios(10.0), macos(10.13))
Tensor conv2d(const Tensor& input, Conv2dOpContext& context) {
  MPSImage* X = imageFromTensor(input);
  Conv2DParams params{input.sizes(),
                      context.weight.sizes(),
                      context.padding,
                      context.stride,
                      context.dilation,
                      context.groups};
  MPSCNNConvOp* op = (__bridge MPSCNNConvOp*)(context.conv2dOp);
  NeuronType nt = neuronType(context);
  if (!op) {
    float* w = context.weight.data_ptr<float>();
    float* b = context.bias.has_value() ? ((*context.bias).data_ptr<float>())
                                        : nullptr;
    op = [MPSCNNConvOp conv2d:params weights:w bias:b neuronFilter:nt];
    context.conv2dOp = (void*)CFBridgingRetain(op);
    context.releaseCallback = ^(void* res) {
      if (res) {
        CFBridgingRelease(res);
      }
    };
  }

  auto outputSize = params.output_sizes();
  MetalTensor mt{outputSize};
  MetalCommandBuffer* commandBuffer = commandBufferFromInputTensor(input);
  mt.texture()->allocateTemporaryTextureStorage(outputSize, commandBuffer);
  MPSImage* Y1 = imageFromMetalTensor(mt);
  [op encode:commandBuffer.buffer sourceImage:X destinationImage:Y1];
  // fuse hardtanh with convolution
  if (nt == NeuronType::Clamp) {
    MPSImage* Y2 = [MPSImage temporaryImageFromSize:[Y1 sizes]
                                      commandBuffer:commandBuffer];
    float min = context.output_min.value().toFloat();
    float max = context.output_max.value().toFloat();
    MPSCNNClampOp* clampOp =
        [MPSCNNClampOp newWithTextures:@[ Y1, Y2 ] Args:@[ @(min), @(max) ]];
    [clampOp encode:commandBuffer.buffer];
    mt.texture()->copyFromTexture(Y2);
  }
  auto output = MetalTensor::toTensor(std::move(mt), input.options());
  return output;
}

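// max_pool2d: maps onto MPSCNNPoolingMax. MPSCNN centers the pooling window,
// so the offset is shifted by half the kernel size to line up with PyTorch's
// window placement; the output shape comes from the shared
// pooling_output_shape helper in ATen/native/Pool.h.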
API_AVAILABLE(ios(10.0), macos(10.13))
Tensor max_pool2d(
    const Tensor& input,
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation,
    bool ceil_mode) {
  const int64_t iN = input.sizes()[0];
  const int64_t iC = input.sizes()[1];
  const int64_t iH = input.sizes()[2];
  const int64_t iW = input.sizes()[3];
  const int64_t kH = kernel_size[0];
  const int64_t kW = kernel_size[1];
  const int64_t sH = stride[0];
  const int64_t sW = stride[1];
  const int64_t pH = padding[0];
  const int64_t pW = padding[1];
  const int64_t dH = dilation[0];
  const int64_t dW = dilation[1];
  MPSImage* X = imageFromTensor(input);
  MPSCNNPoolingMax* pool = [[MPSCNNPoolingMax alloc]
      initWithDevice:[MPSCNNContext sharedInstance].device
         kernelWidth:kernel_size[0]
        kernelHeight:kernel_size[1]
     strideInPixelsX:stride[0]
     strideInPixelsY:stride[1]];
  [pool setEdgeMode:MPSImageEdgeModeClamp];
  [pool setOffset:{.x = static_cast<NSInteger>(kernel_size[0] / 2),
                   .y = static_cast<NSInteger>(kernel_size[1] / 2),
                   .z = 0}];

  int64_t oN = iN;
  int64_t oC = iC;
  int64_t oH = pooling_output_shape(iH, kH, pH, sH, dH, ceil_mode);
  int64_t oW = pooling_output_shape(iW, kW, pW, sW, dW, ceil_mode);

  std::vector<int64_t> outputSize{oN, oC, oH, oW};
  MetalTensor mt{outputSize};
  MetalCommandBuffer* commandBuffer = commandBufferFromInputTensor(input);
  mt.texture()->allocateTemporaryTextureStorage(outputSize, commandBuffer);
  MPSImage* Y = imageFromMetalTensor(mt);
  [pool encodeToCommandBuffer:commandBuffer.buffer
                  sourceImage:X
             destinationImage:Y];
  auto output = MetalTensor::toTensor(std::move(mt), input.options());
  return output;
}

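// global_avg_pool2d: an MPSCNNPoolingAverage whose kernel size and stride
// both equal the full input extent, producing a single value per channel.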
API_AVAILABLE(ios(10.0), macos(10.13))
Tensor global_avg_pool2d(const Tensor& input, IntArrayRef output_size) {
  MPSImage* X = imageFromTensor(input);
  MPSCNNPoolingAverage* pool = [[MPSCNNPoolingAverage alloc]
      initWithDevice:[MPSCNNContext sharedInstance].device
         kernelWidth:X.width
        kernelHeight:X.height
     strideInPixelsX:X.width
     strideInPixelsY:X.height];
  [pool setEdgeMode:MPSImageEdgeModeClamp];
  [pool setOffset:{.x = static_cast<NSInteger>(X.width / 2),
                   .y = static_cast<NSInteger>(X.height / 2),
                   .z = 0}];
  std::vector<int64_t> outputSize{
      input.sizes()[0], input.sizes()[1], output_size[0], output_size[1]};
  MetalTensor mt{outputSize};
  MetalCommandBuffer* commandBuffer = commandBufferFromInputTensor(input);
  mt.texture()->allocateTemporaryTextureStorage(outputSize, commandBuffer);
  MPSImage* Y = imageFromMetalTensor(mt);
  [pool encodeToCommandBuffer:commandBuffer.buffer
                  sourceImage:X
             destinationImage:Y];
  auto output = MetalTensor::toTensor(std::move(mt), input.options());
  return output;
}

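// neuronKernel/neuronKernel_: out-of-place and in-place drivers for the unary
// activations below (relu, sigmoid, tanh), each backed by a cached
// MPSCNNNeuron. 2-d inputs are laid out as an N x C x 1 x 1 texture.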
API_AVAILABLE(ios(10.0), macos(10.13))
Tensor neuronKernel(const Tensor& input, MPSCNNNeuron* neuron) {
  MPSImage* X = imageFromTensor(input);
  std::vector<int64_t> outputSize = input.sizes().vec();
  std::vector<int64_t> textureSize = outputSize;
  if (input.dim() == 2) {
    textureSize = {outputSize[0], outputSize[1], 1, 1};
  }
  MetalTensor mt{outputSize};
  MetalCommandBuffer* commandBuffer = commandBufferFromInputTensor(input);
  mt.texture()->allocateTemporaryTextureStorage(textureSize, commandBuffer);
  MPSImage* Y = imageFromMetalTensor(mt);
  [neuron encodeToCommandBuffer:commandBuffer.buffer
                    sourceImage:X
               destinationImage:Y];
  auto output = MetalTensor::toTensor(std::move(mt), input.options());
  return output;
}

API_AVAILABLE(ios(10.0), macos(10.13))
Tensor& neuronKernel_(Tensor& input, MPSCNNNeuron* neuron) {
  MPSImage* X = imageFromTensor(input);
  std::vector<int64_t> outputSize = input.sizes().vec();
  std::vector<int64_t> textureSize = outputSize;
  if (input.dim() == 2) {
    textureSize = {outputSize[0], outputSize[1], 1, 1};
  }
  MetalCommandBuffer* commandBuffer = commandBufferFromInputTensor(input);
  MPSImage* Y = [MPSImage temporaryImageFromSize:input.sizes().vec()
                                   commandBuffer:commandBuffer];
  [neuron encodeToCommandBuffer:commandBuffer.buffer
                    sourceImage:X
               destinationImage:Y];
  MetalTensorImpl* impl = (MetalTensorImpl*)input.unsafeGetTensorImpl();
  MetalTensor& metalTensor = impl->unsafe_opaque_handle();
  metalTensor.texture()->copyFromTexture(Y);
  return input;
}

API_AVAILABLE(ios(10.0), macos(10.13))
Tensor relu(const Tensor& input) {
  return neuronKernel(input, [MPSCNNNeuronOp relu]);
}

API_AVAILABLE(ios(10.0), macos(10.13))
Tensor& relu_(Tensor& input) {
  return neuronKernel_(input, [MPSCNNNeuronOp relu]);
}

API_AVAILABLE(ios(10.0), macos(10.13))
Tensor sigmoid(const Tensor& input) {
  return neuronKernel(input, [MPSCNNNeuronOp sigmoid]);
}

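// hardsigmoid_: MPSCNNNeuronHardSigmoid evaluates clamp(a * x + b, 0, 1), so
// a = 1/6 and b = 0.5 reproduce PyTorch's hardsigmoid, clamp(x / 6 + 0.5, 0, 1).
// The neuron is created once and reused via dispatch_once.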
API_AVAILABLE(ios(11.0), macos(10.13))
Tensor& hardsigmoid_(Tensor& input) {
  MPSImage* X = imageFromTensor(input);
  std::vector<int64_t> outputSize = input.sizes().vec();
  std::vector<int64_t> textureSize = outputSize;
  if (input.dim() == 2) {
    textureSize = {outputSize[0], outputSize[1], 1, 1};
  }
  MetalTensor mt{outputSize};
  MetalCommandBuffer* commandBuffer = commandBufferFromInputTensor(input);
  mt.texture()->allocateTemporaryTextureStorage(textureSize, commandBuffer);
  MPSImage* Y = imageFromMetalTensor(mt);
  static dispatch_once_t onceToken;
  static MPSCNNNeuronHardSigmoid* neuron = nil;
  dispatch_once(&onceToken, ^{
    neuron = [[MPSCNNNeuronHardSigmoid alloc]
        initWithDevice:[MPSCNNContext sharedInstance].device
                     a:1.0 / 6.0
                     b:0.5];
  });
  [neuron encodeToCommandBuffer:commandBuffer.buffer
                    sourceImage:X
               destinationImage:Y];
  MetalTensorImpl* impl = (MetalTensorImpl*)input.unsafeGetTensorImpl();
  MetalTensor& metalTensor = impl->unsafe_opaque_handle();
  metalTensor.texture()->copyFromTexture(Y);
  return input;
}

API_AVAILABLE(ios(10.0), macos(10.13))
Tensor tanh(const Tensor& input) {
  return neuronKernel(input, [MPSCNNNeuronOp tanh]);
}

API_AVAILABLE(ios(10.0), macos(10.13))
Tensor& hardtanh_(Tensor& input, Scalar min_val, Scalar max_val) {
  MPSImage* X = imageFromTensor(input);
  MetalCommandBuffer* commandBuffer = commandBufferFromInputTensor(input);
  MPSImage* Y = [MPSImage temporaryImageFromSize:input.sizes().vec()
                                   commandBuffer:commandBuffer];
  float min = min_val.toFloat();
  float max = max_val.toFloat();
  MPSCNNClampOp* clampOp = [MPSCNNClampOp newWithTextures:@[ X, Y ]
                                                     Args:@[ @(min), @(max) ]];
  [clampOp encode:commandBuffer.buffer];
  MetalTensorImpl* impl = (MetalTensorImpl*)input.unsafeGetTensorImpl();
  MetalTensor& metalTensor = impl->unsafe_opaque_handle();
  metalTensor.texture()->copyFromTexture(Y);
  return input;
}

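// hardswish_: MPSCNN provides no hardswish neuron, so this encodes a custom
// Metal compute shader ("hardswish" / "hardswish_nonarray", chosen by
// kernelFor based on whether the image is backed by a texture array), with
// the image extents passed in as function constants.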
Tensor& hardswish_(Tensor& input) {
  MPSImage* X = imageFromTensor(input);
  MetalCommandBuffer* commandBuffer = commandBufferFromInputTensor(input);
  std::vector<int64_t> outputSize = input.sizes().vec();
  std::vector<int64_t> textureSize = outputSize;
  if (input.dim() == 2) {
    textureSize = {outputSize[0], outputSize[1], 1, 1};
  }
  MPSImage* Y = [MPSImage temporaryImageFromSize:textureSize
                                   commandBuffer:commandBuffer];
  id<MTLComputeCommandEncoder> encoder =
      [commandBuffer.buffer computeCommandEncoder];
  id<MTLComputePipelineState> state = [[MPSCNNContext sharedInstance]
      specializedPipelineState:kernelFor(
                                   X, @"hardswish", @"hardswish_nonarray")
                     Constants:@[
                       @(X.featureChannels),
                       @(X.height),
                       @(X.width)
                     ]];

  [encoder setComputePipelineState:state];
  [encoder setTexture:[X texture] atIndex:0];
  [encoder setTexture:[Y texture] atIndex:1];

  const auto& launchParams =
      spatialPointwiseKernelLaunchParams(state, Y);
  [encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid
          threadsPerThreadgroup:launchParams.threadsPerThreadgroup];
  [encoder endEncoding];
  [X markRead];
  MetalTensorImpl* impl = (MetalTensorImpl*)input.unsafeGetTensorImpl();
  MetalTensor& metalTensor = impl->unsafe_opaque_handle();
  metalTensor.texture()->copyFromTexture(Y);
  return input;
}

/*
 A fully connected layer takes an MPSImage object with dimensions
 source.width x source.height x Ni, convolves it with
 Weights[No][source.width][source.height][Ni], and produces a 1 x 1 x No
 output.

 Thus, the following conditions must be true:
 kernelWidth == source.width
 kernelHeight == source.height
 clipRect.size.width == 1
 clipRect.size.height == 1

 You can think of a fully connected layer as a matrix multiplication
 where the image is flattened into a vector of length
 source.width*source.height*Ni, and the weights are arranged in a matrix of
 dimension No x (source.width*source.height*Ni) to produce an output vector
 of length No.

 The value of the strideInPixelsX, strideInPixelsY, and groups properties
 must be 1. The offset property is not applicable and it is ignored. Because
 the clip rectangle is clamped to the destination image bounds, if the
 destination is 1 x 1, you do not need to set the clipRect property.
 */
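/*
 Worked example (illustrative numbers only): for an 8 x 8 x 64 source image,
 kernelWidth and kernelHeight must both be 8; with No = 10 output channels
 the weights form a 10 x (8*8*64) = 10 x 4096 matrix, and the layer yields
 an output vector of length 10.
 */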
API_AVAILABLE(ios(10.0), macos(10.13))
Tensor addmm(const Tensor& bias, const Tensor& input, const Tensor& weight) {
  MPSImage* X = imageFromTensor(input);
  const int64_t N = X.numberOfImages;
  const int64_t oC = weight.sizes()[0];
  const int64_t kH = X.height;
  const int64_t kW = X.width;
  const int64_t iC = weight.sizes()[1] / kH / kW;
  auto packedWeights = permuteWeights(
      weight.data_ptr<float>(), {oC, iC, kH, kW});
  MPSCNNConvolutionDescriptor* desc =
      [MPSCNNConvolutionDescriptor cnnConvolutionDescriptorWithKernelWidth:kW
                                                              kernelHeight:kH
                                                      inputFeatureChannels:iC
                                                     outputFeatureChannels:oC
                                                              neuronFilter:nil];
  desc.strideInPixelsX = 1;
  desc.strideInPixelsY = 1;
  MPSCNNConvDataSource* ds = [[MPSCNNConvDataSource alloc]
      initWithWeights:(float*)packedWeights.data()
                 Bias:bias.defined() ? bias.data_ptr<float>() : nil
                 Desc:desc];
  MPSCNNFullyConnected* fc = nil;
  if (@available(iOS 11.0, *)) {
    fc = [[MPSCNNFullyConnected alloc]
        initWithDevice:[MPSCNNContext sharedInstance].device
               weights:ds];
  } else {
#if TARGET_OS_IPHONE
    fc = [[MPSCNNFullyConnected alloc]
           initWithDevice:[MPSCNNContext sharedInstance].device
    convolutionDescriptor:desc
            kernelWeights:(float*)packedWeights.data()
                biasTerms:bias.defined() ? bias.data_ptr<float>() : nil
                    flags:MPSCNNConvolutionFlagsNone];
#endif
  }
  [fc setClipRect:MTLRegionMake3D(0, 0, 0, 1, 1, N)];
  [fc setOffset:{.x = static_cast<NSInteger>(X.width / 2),
                 .y = static_cast<NSInteger>(X.height / 2),
                 .z = 0}];
  std::vector<int64_t> outputSize = {N, oC, 1, 1};
  MetalTensor mt{{N, oC}};

  MetalCommandBuffer* commandBuffer = commandBufferFromInputTensor(input);
  mt.texture()->allocateTemporaryTextureStorage(outputSize, commandBuffer);
  MPSImage* Y = imageFromMetalTensor(mt);
  [fc encodeToCommandBuffer:commandBuffer.buffer
                sourceImage:X
           destinationImage:Y];
  auto output = MetalTensor::toTensor(std::move(mt), input.options());
  return output;
}

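// Binary elementwise ops. broadCastFirstInput detects the limited spatial
// broadcasting supported here: if input1 has a singleton H or W dimension
// where input2 does not, the output takes input2's shape. Both inputs must
// already be queued on the same Metal command buffer.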
bool broadCastFirstInput(const Tensor& input1, const Tensor& input2) {
  if ((input2.sizes()[2] > 1 && input1.sizes()[2] == 1) ||
      (input2.sizes()[3] > 1 && input1.sizes()[3] == 1)) {
    return true;
  }
  return false;
}

API_AVAILABLE(ios(10.0), macos(10.13))
Tensor binaryElementwiseShaderKernel(
    const Tensor& input1,
    const Tensor& input2,
    NSString* arrayKernel,
    NSString* nonarrayKernel) {
  MPSImage* X1 = imageFromTensor(input1);
  MPSImage* X2 = imageFromTensor(input2);
  std::vector<int64_t> outputSize = input1.sizes().vec();
  if (broadCastFirstInput(input1, input2)) {
    outputSize = input2.sizes().vec();
  }
  MetalTensor mt{outputSize};
  MetalCommandBuffer* cb1 = commandBufferFromInputTensor(input1);
  MetalCommandBuffer* cb2 = commandBufferFromInputTensor(input2);
  TORCH_CHECK([cb1 isEqual:cb2], @"inputs have different command buffer");
  mt.texture()->allocateTemporaryTextureStorage(outputSize, cb1);
  MPSImage* Y = imageFromMetalTensor(mt);
  id<MTLComputePipelineState> state = [[MPSCNNContext sharedInstance]
      pipelineState:kernelFor(X1, arrayKernel, nonarrayKernel)];
  id<MTLComputeCommandEncoder> encoder = [cb1.buffer computeCommandEncoder];
  [encoder setComputePipelineState:state];
  [encoder setTexture:[X1 texture] atIndex:0];
  [encoder setTexture:[X2 texture] atIndex:1];
  [encoder setTexture:[Y texture] atIndex:2];
  const auto& launchParams = spatialPointwiseKernelLaunchParams(state, Y);
  [encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid
          threadsPerThreadgroup:launchParams.threadsPerThreadgroup];
  [encoder endEncoding];
  [X1 markRead];
  [X2 markRead];
  auto output = MetalTensor::toTensor(std::move(mt), input1.options());
  return output;
}

API_AVAILABLE(ios(10.0), macos(10.13))
Tensor& binaryElementwiseShaderKernel_(
    Tensor& input1,
    const Tensor& input2,
    NSString* arrayKernel,
    NSString* nonarrayKernel) {
  MPSImage* X1 = imageFromTensor(input1);
  MPSImage* X2 = imageFromTensor(input2);
  std::vector<int64_t> outputSize = input1.sizes().vec();
  if (broadCastFirstInput(input1, input2)) {
    outputSize = input2.sizes().vec();
  }
  MetalCommandBuffer* cb1 = commandBufferFromInputTensor(input1);
  MetalCommandBuffer* cb2 = commandBufferFromInputTensor(input2);
  TORCH_CHECK([cb1 isEqual:cb2], @"inputs have different command buffer");
  MPSImage* Y = [MPSImage temporaryImageFromSize:outputSize commandBuffer:cb1];
  id<MTLComputePipelineState> state = [[MPSCNNContext sharedInstance]
      pipelineState:kernelFor(X1, arrayKernel, nonarrayKernel)];
  id<MTLComputeCommandEncoder> encoder = [cb1.buffer computeCommandEncoder];
  [encoder setComputePipelineState:state];
  [encoder setTexture:[X1 texture] atIndex:0];
  [encoder setTexture:[X2 texture] atIndex:1];
  [encoder setTexture:[Y texture] atIndex:2];
  const auto& launchParams = spatialPointwiseKernelLaunchParams(state, Y);
  [encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid
          threadsPerThreadgroup:launchParams.threadsPerThreadgroup];
  [encoder endEncoding];
  [X1 markRead];
  [X2 markRead];
  MetalTensorImpl* impl = (MetalTensorImpl*)input1.unsafeGetTensorImpl();
  MetalTensor& metalTensor = impl->unsafe_opaque_handle();
  metalTensor.texture()->copyFromTexture(Y);
  return input1;
}

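// On iOS 11.3+ the arithmetic is delegated to MPSCNN binary kernels.
// Broadcasting along H or W is expressed by zeroing the corresponding
// primary/secondary strideInPixels, which makes the kernel re-read the same
// texel across the broadcast dimension.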
template <typename T>
API_AVAILABLE(ios(11.3), macos(10.13))
Tensor binaryElementwiseMPSCNNKernel(
    const Tensor& input1,
    const Tensor& input2) {
  MPSImage* X1 = imageFromTensor(input1);
  MPSImage* X2 = imageFromTensor(input2);
  std::vector<int64_t> outputSize = input1.sizes().vec();
  if (broadCastFirstInput(input1, input2)) {
    outputSize = input2.sizes().vec();
  }
  MetalTensor mt{outputSize};
  MetalCommandBuffer* cb1 = commandBufferFromInputTensor(input1);
  MetalCommandBuffer* cb2 = commandBufferFromInputTensor(input2);
  TORCH_CHECK([cb1 isEqual:cb2], @"inputs have different command buffer");
  mt.texture()->allocateTemporaryTextureStorage(outputSize, cb1);
  MPSImage* Y = imageFromMetalTensor(mt);
  T* kernel = [[T alloc]
      initWithDevice:[MPSCNNContext sharedInstance].device];
  kernel.primaryStrideInPixelsY = (NSUInteger)(input1.sizes()[2] == 1 ? 0 : 1);
  kernel.primaryStrideInPixelsX = (NSUInteger)(input1.sizes()[3] == 1 ? 0 : 1);
  kernel.secondaryStrideInPixelsY = (NSUInteger)(input2.sizes()[2] == 1 ? 0 : 1);
  kernel.secondaryStrideInPixelsX = (NSUInteger)(input2.sizes()[3] == 1 ? 0 : 1);
  [kernel encodeToCommandBuffer:cb1.buffer
                   primaryImage:X1
                 secondaryImage:X2
               destinationImage:Y];
  auto output = MetalTensor::toTensor(std::move(mt), input1.options());
  return output;
}

template <typename T>
API_AVAILABLE(ios(11.3), macos(10.13))
Tensor& binaryElementwiseMPSCNNKernel_(
    Tensor& input1,
    const Tensor& input2) {
  MPSImage* X1 = imageFromTensor(input1);
  MPSImage* X2 = imageFromTensor(input2);
  std::vector<int64_t> outputSize = input1.sizes().vec();
  if (broadCastFirstInput(input1, input2)) {
    outputSize = input2.sizes().vec();
  }
  MetalTensor mt{outputSize};
  MetalCommandBuffer* cb1 = commandBufferFromInputTensor(input1);
  MetalCommandBuffer* cb2 = commandBufferFromInputTensor(input2);
  TORCH_CHECK([cb1 isEqual:cb2], @"inputs have different command buffer");
  mt.texture()->allocateTemporaryTextureStorage(outputSize, cb1);
  MPSImage* Y = imageFromMetalTensor(mt);
  T* kernel = [[T alloc]
      initWithDevice:[MPSCNNContext sharedInstance].device];
  [kernel encodeToCommandBuffer:cb1.buffer
                   primaryImage:X1
                 secondaryImage:X2
               destinationImage:Y];
  MetalTensorImpl* impl = (MetalTensorImpl*)input1.unsafeGetTensorImpl();
  MetalTensor& metalTensor = impl->unsafe_opaque_handle();
  metalTensor.texture()->copyFromTexture(Y);
  return input1;
}

API_AVAILABLE(ios(10.0), macos(10.13))
Tensor add(const Tensor& input1, const Tensor& input2) {
  if (@available(iOS 11.3, *)) {
    return binaryElementwiseMPSCNNKernel<MPSCNNAdd>(input1, input2);
  }
  return binaryElementwiseShaderKernel(
      input1, input2, @"elementwise_add", @"elementwise_add_nonarray");
}

API_AVAILABLE(ios(10.0), macos(10.13))
Tensor& add_(Tensor& input1, const Tensor& input2) {
  if (@available(iOS 11.3, *)) {
    return binaryElementwiseMPSCNNKernel_<MPSCNNAdd>(input1, input2);
  }
  return binaryElementwiseShaderKernel_(
      input1, input2, @"elementwise_add", @"elementwise_add_nonarray");
}

API_AVAILABLE(ios(10.0), macos(10.13))
Tensor sub(const Tensor& input1, const Tensor& input2) {
  if (@available(iOS 11.3, *)) {
    return binaryElementwiseMPSCNNKernel<MPSCNNSubtract>(input1, input2);
  }
  return binaryElementwiseShaderKernel(
      input1, input2, @"elementwise_sub", @"elementwise_sub_nonarray");
}

API_AVAILABLE(ios(10.0), macos(10.13))
Tensor mul(const Tensor& input1, const Tensor& input2) {
  if (@available(iOS 11.3, *)) {
    return binaryElementwiseMPSCNNKernel<MPSCNNMultiply>(input1, input2);
  }
  return binaryElementwiseShaderKernel(
      input1, input2, @"elementwise_mul", @"elementwise_mul_nonarray");
}

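// t: 2-d matrix transpose via MPSImageTranspose, restricted to a single
// image with one feature channel (a plain H x W matrix). view/reshape below
// copy into a texture whose sizes/strides are computed the same way as the
// CPU implementation (infer_size + computeStride).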
API_AVAILABLE(ios(10.0), macos(10.13))
Tensor t(const Tensor& input) {
  auto strides = input.strides().vec();
  auto sizes = input.sizes().vec();
  MPSImage* X = imageFromTensor(input);
  TORCH_CHECK(X.numberOfImages == 1);
  TORCH_CHECK(X.featureChannels == 1);
  MetalTensor mt({sizes[1], sizes[0]});
  MetalCommandBuffer* commandBuffer = commandBufferFromInputTensor(input);
  mt.texture()->allocateTemporaryTextureStorage(
      {1, 1, sizes[1], sizes[0]}, commandBuffer);
  MPSImage* Y = imageFromMetalTensor(mt);
  MPSImageTranspose* transpose = [[MPSImageTranspose alloc]
      initWithDevice:[MPSCNNContext sharedInstance].device];
  [transpose encodeToCommandBuffer:commandBuffer.buffer
                       sourceImage:X
                  destinationImage:Y];
  auto output = MetalTensor::toTensor(std::move(mt), input.options());
  return output;
}

API_AVAILABLE(ios(10.0), macos(10.13))
Tensor view(const Tensor& input, IntArrayRef size) {
  auto inferred_size = at::infer_size(size, input.numel());
  auto stride =
      at::detail::computeStride(input.sizes(), input.strides(), inferred_size);
  TORCH_CHECK(
      stride.has_value(),
      "view size is "
      "not compatible with input tensor's size and stride (at least one dimension"
      " spans across two contiguous subspaces). Use .reshape(...) instead.");
  auto stride_value = *stride;

  MPSImage* X = imageFromTensor(input);
  MetalCommandBuffer* commandBuffer = commandBufferFromInputTensor(input);
  MetalTensor mt{inferred_size, stride_value};
  mt.texture()->setCommandBuffer(commandBuffer);
  mt.texture()->copyFromTexture(X);
  auto output = MetalTensor::toTensor(std::move(mt), input.options());
  return output;
}

Tensor reshape(const Tensor& input, IntArrayRef shape) {
  return view(input, shape);
}

API_AVAILABLE(ios(10.0), macos(10.13))
Tensor log_softmax_int(const Tensor& input) {
  MPSImage* X = imageFromTensor(input);
  TORCH_CHECK(X.height == 1 && X.width == 1);
  std::vector<int64_t> outputSize = input.sizes().vec();
  MPSCNNLogSoftMax* logSoftmax = [[MPSCNNLogSoftMax alloc]
      initWithDevice:[MPSCNNContext sharedInstance].device];

  MetalTensor mt{outputSize};
  MetalCommandBuffer* commandBuffer = commandBufferFromInputTensor(input);
  mt.texture()->allocateTemporaryTextureStorage(
      {outputSize[0], outputSize[1], 1, 1}, commandBuffer);
  MPSImage* Y = imageFromMetalTensor(mt);
  [logSoftmax encodeToCommandBuffer:commandBuffer.buffer
                        sourceImage:X
                   destinationImage:Y];
  auto output = MetalTensor::toTensor(std::move(mt), input.options());
  return output;
}

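// upsample_nearest2d_vec: on iOS 11+ this uses MPSCNNUpsamplingNearest, which
// takes integer scale factors; older targets fall back to a custom
// "resize_nearest" shader whose scale factors are passed as fixed-point
// constants (scaled by 10000).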
API_AVAILABLE(ios(10.0), macos(10.13))
Tensor upsample_nearest2d_vec(
    const Tensor& input,
    c10::optional<IntArrayRef> output_size,
    c10::optional<ArrayRef<double>> scale_factors) {
  auto osize =
      upsample::compute_output_size(input.sizes(), output_size, scale_factors);
  auto scale_h = upsample::get_scale_value(scale_factors, 0);
  auto scale_w = upsample::get_scale_value(scale_factors, 1);
  int64_t output_height = osize[0];
  int64_t output_width = osize[1];
  int64_t nbatch = input.size(0);
  int64_t channels = input.size(1);
  int64_t input_height = input.size(2);
  int64_t input_width = input.size(3);
  upsample_2d_shape_check(
      input,
      Tensor(),
      nbatch,
      channels,
      input_height,
      input_width,
      output_height,
      output_width);
  std::vector<int64_t> outputSizes{
      nbatch, channels, output_height, output_width};
  MPSImage* X = imageFromTensor(input);
  MetalTensor mt{outputSizes};
  MetalCommandBuffer* commandBuffer = commandBufferFromInputTensor(input);
  mt.texture()->allocateTemporaryTextureStorage(outputSizes, commandBuffer);
  MPSImage* Y = imageFromMetalTensor(mt);
  if (@available(iOS 11.0, *)) {
    MPSCNNUpsamplingNearest* kernel = [[MPSCNNUpsamplingNearest alloc]
            initWithDevice:[MPSCNNContext sharedInstance].device
       integerScaleFactorX:(NSUInteger)scale_w.value()
       integerScaleFactorY:(NSUInteger)scale_h.value()];
    [kernel encodeToCommandBuffer:commandBuffer.buffer
                      sourceImage:X
                 destinationImage:Y];
  } else {
    NSUInteger sh = scale_h.value() * 10000;
    NSUInteger sw = scale_w.value() * 10000;
    id<MTLComputePipelineState> state = [[MPSCNNContext sharedInstance]
        specializedPipelineState:kernelFor(
                                     Y,
                                     @"resize_nearest",
                                     @"resize_nearest_nonarray")
                       Constants:@[
                         @(output_height),
                         @(output_width),
                         @(sh),
                         @(sw)
                       ]];
    id<MTLComputeCommandEncoder> encoder =
        [commandBuffer.buffer computeCommandEncoder];
    [encoder setComputePipelineState:state];
    [encoder setTexture:[X texture] atIndex:0];
    [encoder setTexture:[Y texture] atIndex:1];
    const auto& launchParams = spatialPointwiseKernelLaunchParams(state, Y);
    [encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid
            threadsPerThreadgroup:launchParams.threadsPerThreadgroup];
    [encoder endEncoding];
    [X markRead];
    [Y markRead];
  }
  auto output = MetalTensor::toTensor(std::move(mt), input.options());
  return output;
}

Tensor flatten_using_ints(
    const Tensor& input,
    int64_t start_dim,
    int64_t end_dim) {
  start_dim = maybe_wrap_dim(start_dim, input.dim());
  end_dim = maybe_wrap_dim(end_dim, input.dim());
  TORCH_CHECK(
      start_dim <= end_dim,
      "flatten() has invalid args: start_dim cannot come after end_dim");
  std::vector<int64_t> shape;
  if (input.dim() == 0) {
    return input.reshape({1});
  }
  if (start_dim == end_dim) {
    return input;
  }
  auto slice_numel =
      prod_intlist(input.sizes().slice(start_dim, end_dim - start_dim + 1));
  shape.reserve(input.dim() - end_dim + start_dim);
  for (int64_t i = 0; i < start_dim; i++) {
    shape.push_back(input.size(i));
  }
  shape.push_back(slice_numel);
  for (int64_t i = end_dim + 1; i < input.dim(); i++) {
    shape.push_back(input.size(i));
  }
  return input.reshape(shape);
}

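// copy_to_host: materializes the tensor into regular (non-temporary) texture
// storage via a copy shader so the data can subsequently be read back on the
// CPU.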
Tensor copy_to_host(const Tensor& input) {
  MPSImage* X = imageFromTensor(input);
  MetalCommandBuffer* commandBuffer = commandBufferFromInputTensor(input);
  auto&& sizes = [X sizes];
  MetalTensor mt{sizes};
  mt.texture()->setCommandBuffer(commandBuffer);
  mt.texture()->allocateTextureStorage(sizes);
  MPSImage* Y = imageFromMetalTensor(mt);
  id<MTLComputeCommandEncoder> encoder =
      [commandBuffer.buffer computeCommandEncoder];
  id<MTLComputePipelineState> state = [[MPSCNNContext sharedInstance]
      specializedPipelineState:kernelFor(
                                   X, @"copy", @"copy_nonarray")
                     Constants:@[
                       @(X.featureChannels),
                       @(X.height),
                       @(X.width)
                     ]];

  [encoder setComputePipelineState:state];
  [encoder setTexture:[X texture] atIndex:0];
  [encoder setTexture:[Y texture] atIndex:1];

  const auto& launchParams =
      spatialPointwiseKernelLaunchParams(state, Y);
  [encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid
          threadsPerThreadgroup:launchParams.threadsPerThreadgroup];
  [encoder endEncoding];
  [X markRead];
  auto output = MetalTensor::toTensor(std::move(mt), input.options());
  return output;
}

} // namespace mpscnn
} // namespace metal
} // namespace native
} // namespace at