pytorch  1.8.2
About: PyTorch provides Tensor computation (like NumPy) with strong GPU acceleration and Deep Neural Networks (in Python) built on a tape-based autograd system. LTS (Long Term Support) release.
  Fossies Dox: pytorch-1.8.2.tar.gz  (unofficial and still experimental doxygen-generated source code documentation)

caffe2.perfkernels.hp_emblookup_codegen Namespace Reference

Functions

def unroll (uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets)
 
def generic (IndexType, InType, OutType, use_weights, isa, fused, use_offsets)
 

Variables

dictionary sizeof = {"float": 4, "at::Half": 2, "uint8_t": 1}
 
 parser = argparse.ArgumentParser()
 
 help
 
 action
 
 opts = parser.parse_args()
 
 filename = opts.filename
 
list options
 
list code = []
 
string prefix = "Fused8BitRowwise" if opts.fused else ""
 
string fn_base
 
string suffix = "__avx2_fma"
 
string fn = "static bool " + fn_base + suffix
 
list args = []
 
int offset = (8 // sizeof[InType]) if opts.fused else 0
 
string extra_space = "\n "
 
string ret_string = " return " + fn_base + suffix + "<" + is_weight_positional + ">("
 

Function Documentation

◆ generic()

def caffe2.perfkernels.hp_emblookup_codegen.generic (   IndexType,
  InType,
  OutType,
  use_weights,
  isa,
  fused,
  use_offsets 
)

◆ unroll()

def caffe2.perfkernels.hp_emblookup_codegen.unroll (   uf,
  IndexType,
  InType,
  OutType,
  use_weights,
  isa,
  fused,
  use_offsets 
)

Variable Documentation

◆ action

caffe2.perfkernels.hp_emblookup_codegen.action

Definition at line 390 of file hp_emblookup_codegen.py.

◆ args

list caffe2.perfkernels.hp_emblookup_codegen.args = []

Definition at line 445 of file hp_emblookup_codegen.py.

◆ code

◆ extra_space

string caffe2.perfkernels.hp_emblookup_codegen.extra_space = "\n "

Definition at line 497 of file hp_emblookup_codegen.py.

◆ filename

string caffe2.perfkernels.hp_emblookup_codegen.filename = opts.filename

Definition at line 394 of file hp_emblookup_codegen.py.

◆ fn

string caffe2.perfkernels.hp_emblookup_codegen.fn = "static bool " + fn_base + suffix

Definition at line 442 of file hp_emblookup_codegen.py.

Referenced by torch._utils._accumulate(), torch.onnx.symbolic_opset9._adaptive_pool(), torch.nn.cpp.ModuleWrapper._apply(), torch.nn.modules.module.Module._apply(), torch.utils.hooks.BackwardHook._apply_on_tensors(), torch.jit.supported_ops._list_supported_ops(), torch.autograd.function._nested_map(), torch.jit._trace._script_if_tracing(), torch.multiprocessing.spawn._wrap(), torch::jit::mobile::Function.append_operator(), torch::jit::Module.apply(), torch.nn.modules.module.Module.apply(), torch::autograd::CopySlices.apply(), torch::utils.apply_(), c10d.assertCPU(), c10d.assertDense(), c10d.assertLayoutMatch(), c10d.assertNonEmpty(), c10d.assertRootRank(), c10d.assertRootTensor(), c10d.assertSameDevice(), c10d.assertSingleElement(), c10d.assertSingleElementInput(), c10d.assertSingleElementOutput(), c10d.assertSizesMatch(), c10d.assertTypeAndSizesMatch(), c10d.assertTypeMatch(), torch.distributed.rpc.functions.async_execution(), torch::jit::ClassNamespaceValue.attr(), at::RecordFunction.before(), torch::nn::utils::rnn.bind(), torch._jit_internal.boolean_dispatch(), torch::jit::SimpleValue.call(), torch::autograd.call_function(), torch::autograd.call_post_hooks(), torch::autograd.call_pre_hooks(), at::native.check_1d(), at::native.check_t(), at.checkAllSame(), torch::jit::Module.clone_impl(), c10d::ProcessGroupNCCL.collective(), c10::TupleType.compare(), torch::autograd::Engine.compute_dependencies(), torch::distributed::autograd::DistEngine.computeDependencies(), torch::autograd::impl.create_cpp_hook(), torch::jit::CompilationUnit.create_function(), at::cuda::detail::_stubs.cuLaunchKernel(), at::cuda::detail::_stubs.cuLinkAddData(), at::cuda::detail::_stubs.cuModuleLoadDataEx(), torch.customClassSchemasForBCCheck(), torch.serialization.default_restore_location(), torch::jit::CompilationUnit.define(), torch::jit::CompilationUnit.define_hooks(), torch::jit.emitBuiltinCall(), torch::autograd::Engine.evaluate_function(), torch::jit.extractClosure(), c10.filter(), 
torch::jit::mobile::CompilationUnit.find_function(), torch::jit::Object.find_method(), torch::jit::mobile::Module.find_method(), c10.fmap(), c10d.fmap(), ONNX_NAMESPACE::OpSet_PyTorch_ver1.ForEachSchema(), torch::autograd.functionToPyObject(), microbenchmarks.gen_binary_nnc_fun(), microbenchmarks.gen_binary_torch_fun(), microbenchmarks.gen_custom_torch_fun(), torch::jit::CompilationUnit.get_functions(), torch::jit::mobile::Module.get_methods(), torch.autograd.gradcheck.get_numerical_jacobian(), torch::autograd::VariableHooks.grad_fn(), torch.autograd.gradcheck.gradcheck(), freeze.indent_msg(), torch::autograd::GraphTask.init_to_execute(), torch::jit.initJitScriptBindings(), torch::jit.initPythonIRBindings(), torch.autograd.profiler.inputSizes(), torch::jit.jit_log_prefix(), caffe2::detail::ScopeGuardImpl< FunctionType >.makeFailsafe(), torch::jit::List< T >.map(), torch::jit::Maybe< T >.map(), torch::jit::Tree.map(), torch::jit::Compound.map(), torch::utils.map2_(), torch::utils.map_(), at::cuda::detail::_stubs.nvrtcCreateProgram(), torch.autograd.function.once_differentiable(), torch.onnx.symbolic_opset9.overload_by_arg_count(), torch.onnx.symbolic_helper.parse_args(), c10d::ProcessGroupNCCL.pointToPoint(), torch::autograd.profiler::ProfilerThreadLocalState.popRange(), c10d::Reducer.prepare_for_backward(), torch::jit::PythonPrintImpl.printRHS(), torch::autograd.profiler::ProfilerThreadLocalState.pushRange(), torch::utils.recursive_apply(), torch::jit::CompilationUnit.register_function(), torch::jit::mobile::CompilationUnit.register_function(), at::Tensor.register_hook(), torch.registerCustomClassMethod(), Notifier< T >.registerDestructorCallback(), torch::autograd.registerFunctionHook(), torch::jit::tensorexpr::RegisterNNCExternalFunction.RegisterNNCExternalFunction(), Notifier< T >.registerNotificationCallback(), caffe2.RepeatedMaskWithFunctor(), caffe2::ThreadPool.run(), microbenchmarks.run_benchmarks(), 
caffe2.contrib.gloo.gloo_test.TestCase.run_test_distributed(), caffe2.contrib.gloo.gloo_test.TestCase.run_test_locally(), caffe2.python.data_parallel_model_test.DataParallelModelTest.run_test_locally(), torch::jit::InterpreterStateImpl.runBuiltinFunction(), torch::jit::InterpreterStateImpl.runGraphFunction(), torch::jit::InterpreterStateImpl.runImpl(), torch.autograd.profiler.saveExtraArgs(), torch::autograd::GraphTask.set_exception(), torch::autograd::GraphTask.set_exception_without_signal(), torch.utils.hooks.BackwardHook.setup_input_hook(), torch.utils.hooks.BackwardHook.setup_output_hook(), torch::jit::tracer.setWarn(), torch.multiprocessing.spawn.spawn(), torch::autograd.THPCppFunction_name(), torch::autograd.THPCppFunction_register_hook(), torch::autograd.THPCppFunction_register_hook_dict(), THPFunction_next_functions(), torch::autograd::Engine.thread_on_exception(), torch::autograd::python::PythonEngine.thread_on_exception(), torch::jit.toBackendSelectiveImpl(), torch::detail::TorchLibraryInit.TorchLibraryInit(), torch::jit.tryToInferType(), torch.onnx.symbolic_opset9.wrap_logical_op_with_cast_to(), and wrap_tuple_fn().

◆ fn_base

string caffe2.perfkernels.hp_emblookup_codegen.fn_base
Initial value:
= "{}EmbeddingLookupIdx_{}_{}_{}".format(
    prefix, IndexTypeName, InTypeName, OutTypeName
)
constexpr Symbol format(static_cast< unique_t >(_keys::aten_format))

Definition at line 434 of file hp_emblookup_codegen.py.

◆ help

caffe2.perfkernels.hp_emblookup_codegen.help

Definition at line 389 of file hp_emblookup_codegen.py.

◆ offset

int caffe2.perfkernels.hp_emblookup_codegen.offset = (8 // sizeof[InType]) if opts.fused else 0

Definition at line 466 of file hp_emblookup_codegen.py.

Referenced by at.__printFormat(), torch::optim::LBFGS._add_grad(), at::native._cat_out_cpu(), at::native._pack_padded_sequence_backward(), at::native::vulkan::detail::VBuffer.addBufferMemoryBarrier(), dnnlowp.adjust_hist_to_include_zero(), caffe2::dataset_ops::TreeIterator.advance(), torch::jit::MemoryPlanner.allocate(), torch::autograd::CopySlices.apply(), at.apply_op(), caffe2::ATenOp< Context >.assignListStartingAt(), torch::jit::ConstantTableValue.attr(), at::native.batch_norm_cpu_inference_channels_last(), C10_DEFINE_bool(), at::native.canUse32BitIndexMath(), check_Execution_setInputFromMemory(), check_Execution_setOutputFromMemory(), check_Memory_createFromFd(), check_Model_setOperandValueFromMemory(), caffe2::BoundShapeInferencer.CheckAndSetTensorBoundShape(), clEnqueueFillBuffer(), clEnqueueMapBuffer(), clEnqueueReadBuffer(), clEnqueueWriteBuffer(), caffe2.compare(), caffe2::SumReduceDimsOp< Context, FIRSTDIMS, NORMALIZE >.Compute(), compute_clamp_contiguous(), compute_lut_contiguous(), compute_q8add_contiguous(), at::TensorIteratorBase.compute_strides(), caffe2::LarsOp< T, Context >.ComputeLearningRate(), c10d.computeLengthsAndOffsets(), caffe2::internal.ComputeQuantizedFusedParamsAVX2(), torch::jit::tensorexpr::TensorExprKernel.computeValue(), caffe2::RecurrentNetworkOp< Context >.constructAliases(), caffe2::RecurrentNetworkGradientOp< Context >.constructRecurrentGradients(), caffe2.convertToVector(), caffe2::math.CopyMatrix< std::uint16_t, CPUContext >(), at::native::vulkan::detail::ComputeUnit.createComputePipeline(), at::native.diag_embed(), at::native.diagflat(), at::native.diagonal(), at::native.diagonal_backward(), at.diagonal_backward_batching_rule(), at.diagonal_batching_rule(), caffe2::BoxWithNMSLimitOp< Context >.DoRunWithType(), caffe2::WhereOp< Context >.DoRunWithType(), caffe2::ExpandGradientOp< InputTypes, Context >.DoRunWithType(), caffe2::MergeDenseFeatureTensorsOp< Context >.DoRunWithType(), caffe2::PackRNNSequenceOpBase< Context, Forward 
>.DoRunWithType(), caffe2::SparseToDenseMaskOp< Context >.DoRunWithType(), caffe2::SparseToDenseMaskGradientOp< Context >.DoRunWithType(), caffe2::StringJoinOp< Context >.DoRunWithType(), cl::CommandQueue.enqueueFillBuffer(), at::native.fill_diagonal_(), caffe2::SumReducerGradient< T, Context >.fillGrad(), caffe2::MeanReducerGradient< T, Context >.fillGrad(), caffe2::WeightedSumReducerGradient< T, Context >.fillGrad(), caffe2::WeightedSumReducerGradient< T, Context >.fillGradWithMainInput(), c10d::Reducer.finalize_bucket_dense(), fake_fp16.fma_fp16(), at.forward(), caffe2.fp32_to_bfp16_round(), fromBuffer(), torch::jit::AliasDb.functionalNonEscapingListUse(), torch::FunctionSignature.FunctionSignature(), at::native.get_tril_size(), caffe2::math.Im2ColNdNCHW(), at.infer_size(), caffe2::BoundShapeInferencer.InferCommonOp(), at.inferExpandGeometry(), caffe2::BoundShapeInferencer.InferFC(), caffe2::ConvPoolOpBase< Context >.InferOutputSize(), caffe2::ConvPoolOpBase< Context >.InferOutputSize64(), c10d::Reducer.initialize_bucket_views(), c10d::Reducer.initialize_buckets(), torch::jit.initPythonIRBindings(), torch::jit.inlineConsecutiveIfs(), torch::jit::Node.insertOutput(), torch::jit.insertProfileNodesForSpecializeAutogradZero(), torch::jit::ProfilingRecord.insertShapeProfile(), torch::jit::ProfilingRecord.instrumentBlock(), torch::jit.lambdaLiftReverse(), torch::jit::fuser.launchFusion(), torch::jit::Source.lineno_for_offset(), caffe2::Int8ConvDNNLowpPackedWeightBlobShapeFunctions.LoadInfoOfBlob(), caffe2::Int8FCDNNLowpPackedWeightBlobShapeFunctions.LoadInfoOfBlob(), caffe2.LoadInt8TensorInfoOfBlob(), torch::jit.lower_graph(), c10d::Reducer.mark_variable_ready_dense(), torch::jit::tensorexpr::CacheReplacer.mutate(), caffe2::LogSumExpRangeReducerGradient< T, Context >.operator()(), caffe2::LogMeanExpRangeReducerGradient< T, Context >.operator()(), at::native::CompositeRandomAccessor< KeyAccessor, ValueAccessor, TupleInfo >.operator+(), 
at::native::ConstStridedRandomAccessor< T, index_t, PtrTraits >.operator+(), c10::impl::ListIterator< T, Iterator >.operator+(), at::native::CompositeRandomAccessor< KeyAccessor, ValueAccessor, TupleInfo >.operator+=(), at::native::ConstStridedRandomAccessor< T, index_t, PtrTraits >.operator+=(), c10::impl::ListIterator< T, Iterator >.operator+=(), at::native::CompositeRandomAccessor< KeyAccessor, ValueAccessor, TupleInfo >.operator-(), at::native::ConstStridedRandomAccessor< T, index_t, PtrTraits >.operator-(), c10::impl::ListIterator< T, Iterator >.operator-(), at::native::CompositeRandomAccessor< KeyAccessor, ValueAccessor, TupleInfo >.operator-=(), at::native::ConstStridedRandomAccessor< T, index_t, PtrTraits >.operator-=(), c10::impl::ListIterator< T, Iterator >.operator-=(), c10::impl::ListIterator< T, Iterator >.operator[](), c10d::Reducer.populate_bucket_views_out(), at::native.prelu_cpu_kernel_multi_weights(), torch::jit::PythonPrintImpl.printLoop(), caffe2::WeightedSumReducer< T, CPUContext >.process(), caffe2::QShapeInfo.QShapeInfo(), caffe2.QuantizeWeight(), caffe2::PatternNetTransform.ReplaceRule(), caffe2::BBoxTransformOp< T, Context >.RunOnDevice(), caffe2::L1DistanceGradientOp< T, Context >.RunOnDevice(), caffe2::CosineSimilarityOp< T, Context >.RunOnDevice(), caffe2::CosineSimilarityGradientOp< T, Context >.RunOnDevice(), caffe2::DotProductOp< T, Context >.RunOnDevice(), caffe2::DotProductGradientOp< T, Context >.RunOnDevice(), caffe2::WeightedSampleOp< T, Context >.RunOnDevice(), caffe2::TensorRTOp.RunOnDevice(), caffe2::SparseMatrixReshapeOp< Context >.RunOnDevice(), caffe2::LengthsRangeFillOp< Context >.RunOnDevice(), caffe2::PairWiseLossOp< T, Context >.RunOnDevice(), caffe2::PairWiseLossGradientOp< T, Context >.RunOnDevice(), caffe2::RowMulOp< T, Context >.RunOnDevice(), caffe2::ReduceTailSumOp< T, Context >.RunOnDevice(), caffe2::LengthsToRangesOp< Context >.RunOnDevice(), caffe2::LengthsToOffsetsOp< Context >.RunOnDevice(), 
caffe2::LRNGradientOp< T, Context >.RunOnDeviceWithOrderNCHW(), caffe2::DeformConvOp< T, Context >.RunOnDeviceWithOrderNCHW(), caffe2::DeformConvGradientOp< T, Context >.RunOnDeviceWithOrderNCHW(), caffe2::LRNGradientOp< T, Context >.RunOnDeviceWithOrderNHWC(), torch::jit::PythonPrintImpl.scanLongInlines(), caffe2::VideoIOContext.seek(), caffe2::VideoIOContext.seekFile(), caffe2::VideoIOContext.seekMemory(), setFromFile(), THCTensor_canUse32BitIndexMath(), at::native.tril_indices_cpu(), at::native.triu_indices_cpu(), torch::jit::MutationRemover.tryMakeUnaliasedIfOutputAndMutationAtomic(), torch::utils.unflatten_dense_tensors(), caffe2.uniformQuantize2b1b(), torch::jit::ConcreteSourceRangeUnpickler.unpickle(), at::native.view_tensor(), and caffe2.WriteToDB().

◆ options

list caffe2.perfkernels.hp_emblookup_codegen.options
Initial value:
= [
    ["int32_t", "int", "float", "float", "float", "float"],
    ["int64_t", "int64_t", "float", "float", "float", "float"],
    ["int32_t", "int", "half", "at::Half", "float", "float"],
    ["int64_t", "int64_t", "half", "at::Half", "float", "float"],
    ["int32_t", "int", "uint8_t", "uint8_t", "float", "float"],
    ["int64_t", "int64_t", "uint8_t", "uint8_t", "float", "float"],
]

Definition at line 406 of file hp_emblookup_codegen.py.

Referenced by at::native._allocate_or_resize_output_with_indices(), at::native._allreduce_return_trivial(), at::native._linalg_cond_empty_matrix(), at::native._linalg_cond_exception_helper(), at::native._linalg_qr_helper_cpu(), at::native._lu_with_info_cpu(), torch::nn::ConvTransposeNdImpl< D, Derived >._output_padding(), at::native._reduction_with_indices_allocate_or_resize_output(), at::native._s_where(), at::native._shape_as_tensor(), at::native._standard_gamma_grad_cpu(), at::native._syevd_helper_cpu(), at::native._symeig_helper_cpu(), at::native::legacy::cpu._th_std(), at::native::legacy::cpu._th_var(), at::native._triangular_solve_helper_cpu(), torch::nn::functional.adaptive_avg_pool1d(), torch::nn::functional.adaptive_avg_pool2d(), torch::nn::functional.adaptive_avg_pool3d(), torch::nn::functional.adaptive_max_pool1d(), torch::nn::functional.adaptive_max_pool2d(), torch::nn::functional.adaptive_max_pool2d_with_indices(), torch::nn::functional.adaptive_max_pool3d(), torch::nn::functional.adaptive_max_pool3d_with_indices(), at::native::vulkan::aten.add(), torch::autograd::Node.add_input_metadata(), at::native.add_out_sparse_contiguous(), at::native::vulkan::aten.add_scalar(), at::native.addcdiv_out(), at::native.addcmul_out(), at::native::vulkan::aten.addmm(), c10d::ProcessGroupGloo.allgather(), c10d::ProcessGroupGloo.allgather_coalesced(), at::native.allocate_reduction_result(), torch::utils.apply_(), at::native.arange(), at::native.argmax_out(), at::native.argmin_out(), c10d.assertSameSizeAndType(), c10d.assertSameType(), c10d.assertTypeAndSizesMatch(), c10d.assertTypeMatch(), torch::nn::functional.avg_pool1d(), torch::nn::functional.avg_pool2d(), at::native::vulkan::aten.avg_pool2d(), torch::nn::functional.avg_pool3d(), at::native.bartlett_window(), at::native.blackman_window(), caffe2.BlobGetMutableTensor(), at::native.bucketize_cpu(), caffe2::OpenCLContext.BuildKernel(), at::native.cat_sparse(), c10::impl.check_tensor_options_and_extract_memory_format(), 
at::native::vulkan::aten.clamp(), clBuildProgram(), clCompileProgram(), clLinkProgram(), at::native.clone_sparse(), at::native.coalesce_sparse_cpu(), at::native.combinations(), torch::jit::fuser::cuda::FusionExecutor.compileFusion(), at::native.complex(), torch::distributed::rpc::RpcAgent.computeNewRpcRetryTime(), at::native.constant_pad_nd(), torch::jit::tensorexpr.constructTensors(), at::native.contiguous(), torch::detail::TensorDataContainer.convert_to_tensor(), caffe2.ConvertToRawDataset(), caffe2::ImageInputOp< Context >.CopyPrefetched(), at::cpp_custom_type_hack.create(), at::cuda::detail::_stubs.cuLinkAddData(), at::native.cummax(), at::native.cummin(), at::cuda::detail::_stubs.cuModuleLoadDataEx(), torch::jit::fuser::cuda::FusionExecutor.debugCompileFusionFromStr(), at::native.diag_embed(), torch::autograd.dispatch_arange(), torch::autograd.dispatch_full(), torch::autograd.dispatch_randint(), torch::autograd.dispatch_range(), torch::autograd.dispatch_to(), caffe2.empty(), at::native.empty(), at::native.empty_affine_quantized(), at::native.empty_like(), at::native.empty_per_channel_affine_quantized(), at::native.eye(), at::native.fake_quantize_per_channel_affine_cachemask(), at::native.fake_quantize_per_tensor_affine_cachemask(), at::native.fft_fftfreq(), at::native.fft_rfftfreq(), torch::nn::functional.fold(), torch.format_invalid_args(), torch::nn::ReflectionPadImpl< D, Derived >.forward(), torch::nn::ReplicationPadImpl< D, Derived >.forward(), torch::nn::ConstantPadImpl< D, Derived >.forward(), torch::nn::RNNCellImpl.forward(), torch::nn::GRUCellImpl.forward(), torch::nn::LSTMCellImpl.forward(), torch::nn::TransformerEncoderLayerImpl.forward(), torch::nn::TransformerImpl.forward(), torch::nn::functional.fractional_max_pool2d(), torch::nn::functional.fractional_max_pool2d_with_indices(), torch::nn::functional.fractional_max_pool3d(), torch::nn::functional.fractional_max_pool3d_with_indices(), at::native.frobenius_norm(), torch.from_blob(), 
at::native.from_file(), at::native.full(), at::native.full_like(), c10d::ProcessGroupGloo.gather(), torch::nn::RNNCellImpl.get_nonlinearity_str(), torch::PythonArgParser.get_signatures(), caffe2.GetAsyncTaskGraph(), c10.getDefaultTensorOptions(), caffe2.GetSizedTensorWithOptions(), torch::nn::functional.group_norm(), at::native.hamming_window(), at::native.hann_window(), at::native.index_select_sparse(), c10d::Reducer.initialize_buckets(), at::native::DropoutDescriptor.initialize_rng(), at::native.int_repr_quantized_cpu(), at::native.istft(), at::native.kaiser_window(), torch::nn::functional.layer_norm(), torch::utils.legacy_tensor_ctor(), torch::utils.legacy_tensor_new(), caffe2::db::LevelDB.LevelDB(), at::native.linalg_norm(), at::native.linspace(), torch::nn::functional.local_response_norm(), torch::nn::functional.log_softmax(), at::native.logspace(), torch::nn::functional.lp_pool1d(), torch::nn::functional.lp_pool2d(), torch::data.make_data_loader(), at::native.make_per_channel_quantized_tensor_cpu(), at::native.make_per_tensor_quantized_tensor_cpu(), torch::utils.map2_(), torch::utils.map_(), at::native.masked_select_out_impl_cpu(), torch::nn::functional.max_pool1d(), torch::nn::functional.max_pool1d_with_indices(), at::native::vulkan::aten.max_pool2d(), torch::nn::functional.max_pool2d(), torch::nn::functional.max_pool2d_with_indices(), torch::nn::functional.max_pool3d(), torch::nn::functional.max_pool3d_with_indices(), torch::nn::functional.max_unpool1d(), torch::nn::functional.max_unpool2d(), torch::nn::functional.max_unpool3d(), torch::utils.maybe_initialize_cuda(), at::native::vulkan::aten.mean(), c10::TensorOptions.merge_in(), at::native::vulkan::aten.mm(), at::native::vulkan::aten.mul_scalar(), torch::nn::functional.multi_head_attention_forward(), at::native.mvlgamma(), at::native.mvlgamma_(), torch::autograd::generated::details.mvlgamma_backward(), at::native.nanquantile(), at::native.nanquantile_out(), at::native.new_empty(), 
at::native.new_empty_strided(), at::native.new_full(), at.new_qtensor(), at::native::vulkan::aten.new_with_vtensor_vulkan(), at::native.new_zeros(), at.new_zeros_batching_rule(), c10d::DistributedC10d.newProcessGroupHelper(), at::native.normal(), torch::jit::fuser::cuda::executor_utils.nvrtcCompile(), at::native.one_hot(), at::native.ones(), at::native.ones_like(), c10::RegisterOperators.op(), at::DeprecatedTypeProperties.operator TensorOptions(), c10.operator<<(), at::DeprecatedTypeProperties.options(), torch::utils.options_to_string(), caffe2::OperatorBase.OutputTensor(), caffe2::OperatorBase.OutputTensorCopyFrom(), caffe2::ConvPoolDNNLowPOpBase< T, FP32_OP >.OutputTensorCPU_(), caffe2::DNNLowPOp< T, FP32_OP >.OutputTensorCPU_(), torch::nn::functional.pad(), at::native.pin_memory(), torch::nn::functional.pixel_shuffle(), torch::nn::functional.pixel_unshuffle(), at::native.polar(), torch::nn::BatchNormImplBase< D, Derived >.pretty_print(), torch::nn::InstanceNormImpl< D, Derived >.pretty_print(), torch::nn::ReplicationPadImpl< D, Derived >.pretty_print(), torch::nn::ConstantPadImpl< D, Derived >.pretty_print(), torch::nn::MaxPoolImpl< D, Derived >.pretty_print(), torch::nn::AdaptiveMaxPoolImpl< D, output_size_t, Derived >.pretty_print(), torch::nn::AdaptiveAvgPoolImpl< D, output_size_t, Derived >.pretty_print(), torch::nn::MaxUnpoolImpl< D, Derived >.pretty_print(), torch::nn::LPPoolImpl< D, Derived >.pretty_print(), torch::PythonArgParser.print_error(), c10d::ProcessGroupGloo.ProcessGroupGloo(), torch::jit.push_one(), at::native.quantile(), at::native.quantile_out(), at::native.quantized_clone(), at::native.rand(), at::native.rand_like(), at::native.randint(), at::native.randint_like(), at::native.randn(), at::native.randn_like(), at::native.randperm(), at::native.range(), torch::jit::Unpickler.readInstruction(), c10d::Reducer.Reducer(), c10::RegisterOperators.RegisterOperators(), caffe2.ReinitializeAndCopyFrom(), caffe2.ReinitializeTensor(), at::native.repeat(), 
torch::nn::TransformerImpl.reset(), torch::nn::TransformerEncoderImpl.reset(), torch::nn::TransformerEncoderLayerImpl.reset(), torch::nn::TransformerEncoderImpl.reset_parameters(), at::native.reshape_out(), at::SparseTensorImpl.resize_and_clear_(), torch::jit.resizeConstantScalarOrTensorToShape(), torch::fft.rfftfreq(), caffe2::db::RocksDB.RocksDB(), caffe2::AsyncTask.Run(), torch::jit::fuser::cuda::FusionExecutorCache.runFusionWithInputs(), at::native.same_stride_to(), at::native.scalar_tensor(), at::indexing.scalarToTensor(), at::indexing::impl.scalarToTensorNonNativeDeviceType(), c10d::ProcessGroupGloo.scatter(), at::native.searchsorted_cpu(), at::native.select_sparse(), torch::distributed::autograd::DistAutogradContainer.sendReleaseContextRpc(), at::indexing.set_item(), at::TensorIteratorBase.set_output(), at::TensorIterator.set_output(), at::impl::MetaBase.set_output(), at::native::vulkan::aten.slice(), at::native.slow_conv_dilated2d_backward_cpu(), at::native.slow_conv_dilated2d_cpu(), at::native.slow_conv_dilated3d_backward_cpu(), at::native.slow_conv_dilated3d_cpu(), torch::nn::functional.softmax(), torch::nn::functional.softmin(), at::native.sparse_coo_tensor(), torch::utils.sparse_coo_tensor_ctor(), at::native.sparse_to_dense(), torch::autograd::generated::details.split_backward(), torch::autograd::generated::details.split_with_sizes_backward(), torch::jit::mobile::SGD.step(), torch::optim::LBFGS.step(), torch::optim::Adagrad.step(), torch::optim::Adam.step(), torch::optim::AdamW.step(), torch::optim::RMSprop.step(), torch::optim::SGD.step(), c10d::Reducer.sync_bucket_indices(), at::native.tensor_backend(), at::detail.tensor_backend(), at::native.tensor_complex_backend(), at::detail.tensor_complex_backend(), at::native.tensor_complex_cpu(), at::detail.tensor_complex_cpu(), at::native.tensor_cpu(), at::detail.tensor_cpu(), torch::autograd.THPVariable_arange(), torch::autograd.THPVariable_full(), torch::autograd.THPVariable_pynew(), 
torch::autograd.THPVariable_randint(), torch::autograd.THPVariable_range(), torch::autograd.THPVariable_type(), torch::nn::functional.threshold(), at::native.to(), torch::nn::utils::rnn::PackedSequence.to(), at.to_dtype_layout_batching_rule(), at::native.to_impl(), c10.toString(), at::native::metal::MetalTensor.toTensor(), at::native::vulkan::aten.transpose(), torch::autograd::generated::details.unbind_backward(), torch::nn::functional.unfold(), at::native.unsqueeze_sparse(), torch::autograd.valueToTensor(), at::native::vulkan::ops.verify(), c10d::Reducer.verify_replica0_across_processes(), at::native::vulkan::aten.view(), caffe2.XBlobGetMutableTensor(), caffe2::Operator< Context >.XOutput(), caffe2::OperatorBase.XOutputTensor(), at::native.zeros(), and at::native.zeros_like().

◆ opts

caffe2.perfkernels.hp_emblookup_codegen.opts = parser.parse_args()

Definition at line 392 of file hp_emblookup_codegen.py.

◆ parser

caffe2.perfkernels.hp_emblookup_codegen.parser = argparse.ArgumentParser()

Definition at line 388 of file hp_emblookup_codegen.py.

◆ prefix

◆ ret_string

string caffe2.perfkernels.hp_emblookup_codegen.ret_string = " return " + fn_base + suffix + "<" + is_weight_positional + ">("

Definition at line 498 of file hp_emblookup_codegen.py.

◆ sizeof

dictionary caffe2.perfkernels.hp_emblookup_codegen.sizeof = {"float": 4, "at::Half": 2, "uint8_t": 1}

Definition at line 7 of file hp_emblookup_codegen.py.

◆ suffix