return tensor.texture()->image();
return imageFromMetalTensor(metalTensor);
TORCH_CHECK(cmdBuffer, @"Command Buffer can't be nil!");
MPSImage* X = imageFromTensor(input);
MPSImage* Y = imageFromMetalTensor(mt);
MPSImage* X = imageFromTensor(input);
float* w = context.weight.data_ptr<float>();
float* b = context.bias.has_value() ? ((*context.bias).data_ptr<float>())
                                    : nullptr;
// Cache the MPSCNNConvolution kernel in the op context; a later CFBridgingRelease balances the retain.
context.conv2dOp = (void*)CFBridgingRetain(op);
CFBridgingRelease(res);
MPSImage* Y1 = imageFromMetalTensor(mt);
MPSImage* Y2 = [MPSImage temporaryImageFromSize:[Y1 sizes]
                                  commandBuffer:commandBuffer];
float min = context.output_min.value().toFloat();
float max = context.output_max.value().toFloat();
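
// The min/max pulled from context.output_min / context.output_max implement a fused clamp
// (hardtanh) on the convolution output. A minimal CPU reference of that clamp, assuming a
// plain float buffer rather than the Metal texture path:
#include <algorithm>
#include <vector>

// Reference for the fused output clamp: y = min(max(x, output_min), output_max).
std::vector<float> clampOutput(const std::vector<float>& x, float min, float max) {
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = std::min(std::max(x[i], min), max);
  }
  return y;
}
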
MPSImage* X = imageFromTensor(input);
MPSCNNPoolingMax* pool = [[MPSCNNPoolingMax alloc]
     initWithDevice:[MPSCNNContext sharedInstance].device
        kernelWidth:kernel_size[0]
       kernelHeight:kernel_size[1]
    strideInPixelsX:stride[0]
    strideInPixelsY:stride[1]];
[pool setEdgeMode:MPSImageEdgeModeClamp];
MPSImage* Y = imageFromMetalTensor(mt);
MPSImage* X = imageFromTensor(input);
MPSCNNPoolingAverage* pool = [[MPSCNNPoolingAverage alloc]
     initWithDevice:[MPSCNNContext sharedInstance].device
        kernelWidth:X.width
       kernelHeight:X.height
    strideInPixelsX:X.width
    strideInPixelsY:X.height];
[pool setEdgeMode:MPSImageEdgeModeClamp];
[pool setOffset:{.x = static_cast<NSInteger>(X.width / 2),
                 .y = static_cast<NSInteger>(X.height / 2),
                 .z = 0}];
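
// With the kernel and stride set to the full texture extent and the offset at the image
// center, MPSCNNPoolingAverage produces one pooled value per channel, i.e. a global average
// pool. A CPU sketch over an NCHW buffer (names here are illustrative, not from the file):
#include <cstdint>
#include <vector>

// Global average pool: one mean per (n, c) over the full H x W plane.
std::vector<float> globalAvgPool(
    const std::vector<float>& x, int64_t N, int64_t C, int64_t H, int64_t W) {
  std::vector<float> y(N * C, 0.f);
  for (int64_t n = 0; n < N; ++n) {
    for (int64_t c = 0; c < C; ++c) {
      float sum = 0.f;
      for (int64_t i = 0; i < H * W; ++i) {
        sum += x[(n * C + c) * H * W + i];
      }
      y[n * C + c] = sum / static_cast<float>(H * W);
    }
  }
  return y;
}
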
MPSImage* Y = imageFromMetalTensor(mt);
MPSImage* X = imageFromTensor(input);
std::vector<int64_t> textureSize = outputSize;
if (input.dim() == 2) {
  textureSize = {outputSize[0], outputSize[1], 1, 1};
}
MPSImage* Y = imageFromMetalTensor(mt);
Tensor& neuronKernel_(Tensor& input, MPSCNNNeuron* neuron) {
  MPSImage* X = imageFromTensor(input);
  std::vector<int64_t> textureSize = outputSize;
  if (input.dim() == 2) {
    textureSize = {outputSize[0], outputSize[1], 1, 1};
  }
  MPSImage* Y = [MPSImage temporaryImageFromSize:input.sizes().vec()
                                   commandBuffer:commandBuffer];
  metalTensor.texture()->copyFromTexture(Y);
MPSImage* X = imageFromTensor(input);
std::vector<int64_t> textureSize = outputSize;
if (input.dim() == 2) {
  textureSize = {outputSize[0], outputSize[1], 1, 1};
}
MPSImage* Y = imageFromMetalTensor(mt);
static dispatch_once_t onceToken;
static MPSCNNNeuronHardSigmoid* neuron = nil;
dispatch_once(&onceToken, ^{
  neuron = [[MPSCNNNeuronHardSigmoid alloc]
      initWithDevice:[MPSCNNContext sharedInstance].device
                   a:1.0f / 6.0f
                   b:0.5f];
});
metalTensor.texture()->copyFromTexture(Y);
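
// MPSCNNNeuronHardSigmoid computes y = clamp(a*x + b, 0, 1); with a = 1/6 and b = 0.5
// (the values assumed in the reconstruction above) that matches hardsigmoid. Scalar reference:
#include <algorithm>

// hardsigmoid(x) = clamp(x / 6 + 0.5, 0, 1)
inline float hardsigmoid(float x) {
  return std::min(std::max(x / 6.f + 0.5f, 0.f), 1.f);
}
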
MPSImage* X = imageFromTensor(input);
MPSImage* Y = [MPSImage temporaryImageFromSize:input.sizes().vec()
                                 commandBuffer:commandBuffer];
float min = min_val.toFloat();
float max = max_val.toFloat();
metalTensor.texture()->copyFromTexture(Y);
MPSImage* X = imageFromTensor(input);
std::vector<int64_t> textureSize = outputSize;
if (input.dim() == 2) {
  textureSize = {outputSize[0], outputSize[1], 1, 1};
}
id<MTLComputeCommandEncoder> encoder = [commandBuffer.buffer computeCommandEncoder];
id<MTLComputePipelineState> state = [[MPSCNNContext sharedInstance]
    specializedPipelineState:mpscnn::kernelFor(
                                 X, @"hardswish", @"hardswish_nonarray")
                   Constants:@[
                     @(X.featureChannels),
                     @(X.numberOfImages)
                   ]];
[encoder setComputePipelineState:state];
[encoder setTexture:[X texture] atIndex:0];
[encoder setTexture:[Y texture] atIndex:1];
const auto& launchParams =
    mpscnn::spatialPointwiseKernelLaunchParams(state, Y);
[encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid
        threadsPerThreadgroup:launchParams.threadsPerThreadgroup];
[encoder endEncoding];
metalTensor.texture()->copyFromTexture(Y);
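
// For reference, the hardswish shader dispatched above computes y = x * clamp(x + 3, 0, 6) / 6
// per element; a scalar sketch of the same formula:
#include <algorithm>

// hardswish(x) = x * relu6(x + 3) / 6
inline float hardswish(float x) {
  return x * std::min(std::max(x + 3.f, 0.f), 6.f) / 6.f;
}
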
MPSImage* X = imageFromTensor(input);
MPSCNNConvolutionDescriptor* desc =
    [MPSCNNConvolutionDescriptor cnnConvolutionDescriptorWithKernelWidth:kW
                                                            kernelHeight:kH
                                                    inputFeatureChannels:iC
                                                   outputFeatureChannels:oC];
desc.strideInPixelsX = 1;
desc.strideInPixelsY = 1;
MPSCNNFullyConnected* fc = nil;
if (@available(iOS 11.0, *)) {
  // ds: an MPSCNNConvolutionDataSource wrapping w and b
  fc = [[MPSCNNFullyConnected alloc]
      initWithDevice:[MPSCNNContext sharedInstance].device
             weights:ds];
} else {
  fc = [[MPSCNNFullyConnected alloc]
         initWithDevice:[MPSCNNContext sharedInstance].device
  convolutionDescriptor:desc
          kernelWeights:w
              biasTerms:b
                  flags:MPSCNNConvolutionFlagsNone];
}
[fc setClipRect:MTLRegionMake3D(0, 0, 0, 1, 1, N)];
[fc setOffset:{.x = static_cast<NSInteger>(X.width / 2),
               .y = static_cast<NSInteger>(X.height / 2),
               .z = 0}];
MPSImage* Y = imageFromMetalTensor(mt);
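
// The fully connected kernel is configured as a convolution whose window covers the whole
// input plane: the offset centers the window and the 1x1xN clip rect leaves one output pixel
// per batch image. What it evaluates is ordinary addmm, sketched here on plain buffers
// (function and parameter names are illustrative, not from the file):
#include <cstdint>
#include <vector>

// addmm reference: Y[n][o] = b[o] + sum_i X[n][i] * W[o][i]
std::vector<float> addmm(
    const std::vector<float>& X, // N x iC
    const std::vector<float>& W, // oC x iC
    const std::vector<float>& b, // oC (may be empty)
    int64_t N, int64_t iC, int64_t oC) {
  std::vector<float> Y(N * oC);
  for (int64_t n = 0; n < N; ++n) {
    for (int64_t o = 0; o < oC; ++o) {
      float acc = b.empty() ? 0.f : b[o];
      for (int64_t i = 0; i < iC; ++i) {
        acc += X[n * iC + i] * W[o * iC + i];
      }
      Y[n * oC + o] = acc;
    }
  }
  return Y;
}
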
bool broadCastFirstInput(const Tensor& input1, const Tensor& input2) {
  return (input2.sizes()[2] > 1 && input1.sizes()[2] == 1) ||
      (input2.sizes()[3] > 1 && input1.sizes()[3] == 1);
}
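
// broadCastFirstInput returns true when input1 has a singleton H or W dimension that input2
// expands, i.e. it is input1 that gets broadcast and the output takes input2's spatial size.
// A small self-contained check of the same rule (sketch):
#include <array>
#include <cassert>
#include <cstdint>

// NCHW sizes; true means input1 is the broadcast (singleton) side.
bool broadcastFirst(const std::array<int64_t, 4>& s1, const std::array<int64_t, 4>& s2) {
  return (s2[2] > 1 && s1[2] == 1) || (s2[3] > 1 && s1[3] == 1);
}

void broadcastExample() {
  assert(broadcastFirst({1, 8, 1, 1}, {1, 8, 16, 16}));   // input1 broadcast over H and W
  assert(!broadcastFirst({1, 8, 16, 16}, {1, 8, 1, 1}));  // input2 is the singleton side
}
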
Tensor binaryElementwiseShaderKernel(
    const Tensor& input1,
    const Tensor& input2,
    NSString* arrayKernel,
    NSString* nonarrayKernel) {
  MPSImage* X1 = imageFromTensor(input1);
  MPSImage* X2 = imageFromTensor(input2);
  std::vector<int64_t> outputSize = input1.sizes().vec();
  if (broadCastFirstInput(input1, input2)) {
    outputSize = input2.sizes().vec();
  }
  TORCH_CHECK([cb1 isEqual:cb2], @"inputs have different command buffer");
  MPSImage* Y = imageFromMetalTensor(mt);
  id<MTLComputeCommandEncoder> encoder = [cb1.buffer computeCommandEncoder];
  [encoder setComputePipelineState:state];
  [encoder setTexture:[X1 texture] atIndex:0];
  [encoder setTexture:[X2 texture] atIndex:1];
  [encoder setTexture:[Y texture] atIndex:2];
  [encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid
          threadsPerThreadgroup:launchParams.threadsPerThreadgroup];
  [encoder endEncoding];
Tensor& binaryElementwiseShaderKernel_(
    Tensor& input1,
    const Tensor& input2,
    NSString* arrayKernel,
    NSString* nonarrayKernel) {
  MPSImage* X1 = imageFromTensor(input1);
  MPSImage* X2 = imageFromTensor(input2);
  std::vector<int64_t> outputSize = input1.sizes().vec();
  if (broadCastFirstInput(input1, input2)) {
    outputSize = input2.sizes().vec();
  }
  TORCH_CHECK([cb1 isEqual:cb2], @"inputs have different command buffer");
  id<MTLComputeCommandEncoder> encoder = [cb1.buffer computeCommandEncoder];
  [encoder setComputePipelineState:state];
  [encoder setTexture:[X1 texture] atIndex:0];
  [encoder setTexture:[X2 texture] atIndex:1];
  [encoder setTexture:[Y texture] atIndex:2];
  [encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid
          threadsPerThreadgroup:launchParams.threadsPerThreadgroup];
  [encoder endEncoding];
  metalTensor.texture()->copyFromTexture(Y);
template <typename T>
Tensor binaryElementwiseMPSCNNKernel(
    const Tensor& input1,
    const Tensor& input2) {
  MPSImage* X1 = imageFromTensor(input1);
  MPSImage* X2 = imageFromTensor(input2);
  std::vector<int64_t> outputSize = input1.sizes().vec();
  if (broadCastFirstInput(input1, input2)) {
    outputSize = input2.sizes().vec();
  }
  TORCH_CHECK([cb1 isEqual:cb2], @"inputs have different command buffer");
  MPSImage* Y = imageFromMetalTensor(mt);
  kernel.primaryStrideInPixelsY = (NSUInteger)(input1.sizes()[2] == 1 ? 0 : 1);
  kernel.primaryStrideInPixelsX = (NSUInteger)(input1.sizes()[3] == 1 ? 0 : 1);
  kernel.secondaryStrideInPixelsY = (NSUInteger)(input2.sizes()[2] == 1 ? 0 : 1);
  kernel.secondaryStrideInPixelsX = (NSUInteger)(input2.sizes()[3] == 1 ? 0 : 1);
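
// Setting primaryStrideInPixelsX/Y (or the secondary ones) to 0 makes the MPSCNNArithmetic
// kernel re-read the same source pixel for every output position, which is how a 1x1 spatial
// tensor is broadcast against a full H x W tensor. The indexing trick, sketched on the CPU:
#include <cstdint>
#include <vector>

// Element-wise add where each input advances by its own stride; a stride of 0
// pins the read to a single (broadcast) element.
std::vector<float> addWithStrides(
    const std::vector<float>& a, int64_t strideA,
    const std::vector<float>& b, int64_t strideB,
    int64_t numPixels) {
  std::vector<float> out(numPixels);
  for (int64_t i = 0; i < numPixels; ++i) {
    out[i] = a[i * strideA] + b[i * strideB];
  }
  return out;
}
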
template <typename T>
Tensor& binaryElementwiseMPSCNNKernel_(
    Tensor& input1,
    const Tensor& input2) {
  MPSImage* X1 = imageFromTensor(input1);
  MPSImage* X2 = imageFromTensor(input2);
  std::vector<int64_t> outputSize = input1.sizes().vec();
  if (broadCastFirstInput(input1, input2)) {
    outputSize = input2.sizes().vec();
  }
  TORCH_CHECK([cb1 isEqual:cb2], @"inputs have different command buffer");
  MPSImage* Y = imageFromMetalTensor(mt);
  metalTensor.texture()->copyFromTexture(Y);
Tensor add(const Tensor& input1, const Tensor& input2) {
  if (@available(iOS 11.3, *)) {
    return binaryElementwiseMPSCNNKernel<MPSCNNAdd>(input1, input2);
  }
  return binaryElementwiseShaderKernel(
      input1, input2, @"elementwise_add", @"elementwise_add_nonarray");
}

Tensor& add_(Tensor& input1, const Tensor& input2) {
  if (@available(iOS 11.3, *)) {
    return binaryElementwiseMPSCNNKernel_<MPSCNNAdd>(input1, input2);
  }
  return binaryElementwiseShaderKernel_(
      input1, input2, @"elementwise_add", @"elementwise_add_nonarray");
}

Tensor sub(const Tensor& input1, const Tensor& input2) {
  if (@available(iOS 11.3, *)) {
    return binaryElementwiseMPSCNNKernel<MPSCNNSubtract>(input1, input2);
  }
  return binaryElementwiseShaderKernel(
      input1, input2, @"elementwise_sub", @"elementwise_sub_nonarray");
}

Tensor mul(const Tensor& input1, const Tensor& input2) {
  if (@available(iOS 11.3, *)) {
    return binaryElementwiseMPSCNNKernel<MPSCNNMultiply>(input1, input2);
  }
  return binaryElementwiseShaderKernel(
      input1, input2, @"elementwise_mul", @"elementwise_mul_nonarray");
}
MPSImage* X = imageFromTensor(input);
// newSizes: the transposed texture size (assumed)
mt.texture()->allocateTemporaryTextureStorage(newSizes, commandBuffer);
MPSImage* Y = imageFromMetalTensor(mt);
MPSImageTranspose* transpose = [[MPSImageTranspose alloc]
    initWithDevice:[MPSCNNContext sharedInstance].device];
624 "not compatible with input tensor's size and stride (at least one dimension"
625 " spans across two contiguous subspaces). Use .reshape(...) instead.");
626 auto stride_value = *
stride;
628 MPSImage*
X = imageFromTensor(
input);
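
// The check above relies on computeStride: a view is only legal when new strides exist that
// address the same memory; otherwise the caller is told to use reshape. For a contiguous
// tensor the new strides are just running products of the trailing sizes, as in this sketch:
#include <cstdint>
#include <vector>

// Contiguous (row-major) strides for a shape: stride[i] = product of sizes after i.
std::vector<int64_t> contiguousStrides(const std::vector<int64_t>& sizes) {
  std::vector<int64_t> strides(sizes.size(), 1);
  for (int64_t i = static_cast<int64_t>(sizes.size()) - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * sizes[i + 1];
  }
  return strides;
}
// e.g. {2, 3, 4} -> {12, 4, 1}; computeStride generalizes this to non-contiguous inputs
// and signals failure when no valid view strides exist.
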
MPSImage* X = imageFromTensor(input);
MPSCNNLogSoftMax* logSoftmax = [[MPSCNNLogSoftMax alloc]
    initWithDevice:[MPSCNNContext sharedInstance].device];
// output texture assumed to match the input size
mt.texture()->allocateTemporaryTextureStorage(input.sizes().vec(), commandBuffer);
MPSImage* Y = imageFromMetalTensor(mt);
auto scale_h = upsample::get_scale_value(scale_factors, 0);
auto scale_w = upsample::get_scale_value(scale_factors, 1);
int64_t output_height = osize[0];
int64_t output_width = osize[1];
upsample_2d_shape_check(
    input,
    Tensor(),
    nbatch,
    channels,
    input_height,
    input_width,
    output_height,
    output_width);
std::vector<int64_t> outputSizes{
    nbatch, channels, output_height, output_width};
MPSImage* X = imageFromTensor(input);
MPSImage* Y = imageFromMetalTensor(mt);
MPSCNNUpsamplingNearest* kernel = [[MPSCNNUpsamplingNearest alloc]
         initWithDevice:[MPSCNNContext sharedInstance].device
    integerScaleFactorX:(NSUInteger)scale_w.value()
    integerScaleFactorY:(NSUInteger)scale_h.value()];
// shader fallback path: scale factors encoded as fixed-point integers (x10000)
NSUInteger sh = scale_h.value() * 10000;
NSUInteger sw = scale_w.value() * 10000;
707 @"resize_nearest_nonarray")
714 id<MTLComputeCommandEncoder> encoder =
716 [encoder setComputePipelineState:
state];
717 [encoder setTexture:[
X texture] atIndex:0];
718 [encoder setTexture:[
Y texture] atIndex:1];
720 [encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid
721 threadsPerThreadgroup:launchParams.threadsPerThreadgroup];
722 [encoder endEncoding];
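
// Whichever path runs (MPSCNNUpsamplingNearest or the resize_nearest shader), nearest-neighbor
// upsampling maps every output pixel back to a source pixel by dividing out the scale factor.
// A sketch of the index math for integer scales:
#include <cstdint>

// For an integer scale factor, output pixel (oy, ox) samples input pixel (oy / scale_h, ox / scale_w).
struct SrcIndex { int64_t y; int64_t x; };

inline SrcIndex nearestSource(int64_t oy, int64_t ox, int64_t scale_h, int64_t scale_w) {
  return {oy / scale_h, ox / scale_w};
}
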
TORCH_CHECK(
    start_dim <= end_dim,
    "flatten() has invalid args: start_dim cannot come after end_dim");
std::vector<int64_t> shape;
if (input.dim() == 0) {
  return input.reshape({1});
}
if (start_dim == end_dim) {
  return input;
}
const auto slice_numel =
    prod_intlist(input.sizes().slice(start_dim, end_dim - start_dim + 1));
shape.reserve(input.dim() - end_dim + start_dim);
for (int64_t i = 0; i < start_dim; i++) {
  shape.push_back(input.size(i));
}
shape.push_back(slice_numel);
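
// The shape vector built above collapses dims [start_dim, end_dim] into their product and
// keeps the remaining dims. A self-contained sketch of the same computation plus an example:
#include <cstdint>
#include <vector>

// Collapse dims [start_dim, end_dim] of `sizes` into a single dim holding their product.
std::vector<int64_t> flattenShape(
    const std::vector<int64_t>& sizes, int64_t start_dim, int64_t end_dim) {
  std::vector<int64_t> shape;
  int64_t slice_numel = 1;
  for (int64_t i = start_dim; i <= end_dim; ++i) {
    slice_numel *= sizes[i];
  }
  for (int64_t i = 0; i < start_dim; ++i) {
    shape.push_back(sizes[i]);
  }
  shape.push_back(slice_numel);
  for (int64_t i = end_dim + 1; i < static_cast<int64_t>(sizes.size()); ++i) {
    shape.push_back(sizes[i]);
  }
  return shape;
}
// flattenShape({2, 3, 4, 5}, 1, 2) -> {2, 12, 5}
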
MPSImage* X = imageFromTensor(input);
MPSImage* Y = imageFromMetalTensor(mt);
id<MTLComputeCommandEncoder> encoder = [commandBuffer.buffer computeCommandEncoder];
id<MTLComputePipelineState> state = [[MPSCNNContext sharedInstance]
    specializedPipelineState:mpscnn::kernelFor(X, @"copy", @"copy_nonarray")
                   Constants:@[
                     @(X.featureChannels),
                     @(X.numberOfImages)
                   ]];
[encoder setComputePipelineState:state];
[encoder setTexture:[X texture] atIndex:0];
[encoder setTexture:[Y texture] atIndex:1];
const auto& launchParams =
    mpscnn::spatialPointwiseKernelLaunchParams(state, Y);
[encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid
        threadsPerThreadgroup:launchParams.threadsPerThreadgroup];
[encoder endEncoding];
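
// launchParams pairs a threadgroup size with a grid size that covers the destination texture.
// A typical computation for a pointwise kernel, assuming an 8x8x1 threadgroup and one z-slice
// per group of 4 feature channels; this is a sketch of the usual pattern, not necessarily the
// exact helper used in this file:
#include <cstdint>

struct ThreadgroupCounts {
  int64_t x, y, z;
};

// Grid size that covers a W x H texture with numberOfImages * ceil(C / 4) array slices,
// given an 8 x 8 x 1 threadgroup.
ThreadgroupCounts threadgroupsPerGrid(int64_t W, int64_t H, int64_t N, int64_t C) {
  const int64_t tgW = 8, tgH = 8;
  return {(W + tgW - 1) / tgW,
          (H + tgH - 1) / tgH,
          N * ((C + 3) / 4)};
}
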