"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/amd/common/amd_kernel_code_t.h" (16 Sep 2020, 25650 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "amd_kernel_code_t.h" see the Fossies "Dox" file reference documentation.

/*
 * Copyright 2015,2016 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#ifndef AMDKERNELCODET_H
#define AMDKERNELCODET_H

#include <stdint.h> /* fixed-width integer types (uint32_t etc.) used below */

//---------------------------------------------------------------------------//
// AMD Kernel Code, and its dependencies                                     //
//---------------------------------------------------------------------------//

// Sets val bits for the specified mask in the specified dst packed instance.
#define AMD_HSA_BITS_SET(dst, mask, val)                                       \
  dst &= (~(1 << mask ## _SHIFT) & ~mask);                                     \
  dst |= (((val) << mask ## _SHIFT) & mask)

// Gets bits for the specified mask from the specified src packed instance.
#define AMD_HSA_BITS_GET(src, mask)                                            \
  ((src & mask) >> mask ## _SHIFT)

/* Every amd_*_code_t has the following properties, which are composed of
 * a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*),
 * bit width (AMD_CODE_PROPERTY_*_WIDTH), and bit shift amount
 * (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0.
 *
 * (Note that C bit fields cannot be used here: their layout is
 * implementation defined by the C standard, so they cannot be used to
 * specify an ABI.)
 */
enum amd_code_property_mask_t {

  /* Enable the setup of the SGPR user data registers
   * (AMD_CODE_PROPERTY_ENABLE_SGPR_*); see the documentation of
   * amd_kernel_code_t below for the initial register state.
   *
   * The total number of SGPR user data registers requested must not
   * exceed 16. Any requests beyond 16 will be ignored.
   *
   * Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to the total count of
   * SGPR user data registers enabled, up to 16).
   */

  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0,
  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2,
  AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3,
  AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4,
  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5,
  AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6,
  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7,
  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8,
  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT,

  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9,
  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT,

  AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10,
  AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6,
  AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED1_SHIFT,

  /* Control wave ID base counter for GDS ordered-append. Used to set
   * COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if
   * ORDERED_APPEND_MODE also needs to be settable)
   */
  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16,
  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT,

  /* The interleave (swizzle) element size in bytes required by the
   * code for private memory. This must be 2, 4, 8 or 16. This value
   * is provided to the finalizer when it is invoked and is recorded
   * here. The hardware will interleave the memory requests of each
   * lane of a wavefront by this element size to ensure each
   * work-item gets a distinct memory location. Therefore, the
   * finalizer ensures that all load and store operations done to
   * private memory do not exceed this size. For example, if the
   * element size is 4 (32-bits or dword) and a 64-bit value must be
   * loaded, the finalizer will generate two 32-bit loads. This
   * ensures that the interleaving will get the work-item
   * specific dword for both halves of the 64-bit value. If it just
   * did a 64-bit load then it would get one dword which belonged to
   * its own work-item, but the second dword would belong to the
   * adjacent lane work-item since the interleaving is in dwords.
   *
   * The value used must match the value the runtime uses to configure
   * GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This
   * is generally DWORD.
   *
   * USE VALUES FROM THE AMD_ELEMENT_BYTE_SIZE_T ENUM.
   */
  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 17,
  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2,
  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT,

  /* Indicates whether global memory addresses are 64 bits. Must match
   * amd_kernel_code_t.hsail_machine_model ==
   * HSA_MACHINE_LARGE. Must also match
   * SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)),
   * SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+).
   */
  AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 19,
  AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1,
  AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT,

  /* Indicates whether the generated ISA uses a dynamically sized call
   * stack. This can happen if calls are implemented using a call
   * stack and recursion, alloca, or calls to indirect functions are
   * present. In these cases the Finalizer cannot compute the total
   * private segment size at compile time, so
   * workitem_private_segment_byte_size only specifies the statically
   * known private segment size, and additional space must be added
   * for the call stack.
   */
  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 20,
  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1,
  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT,

  /* Indicates whether the generated code has support for debugging. */
  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 21,
  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1,
  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT,

  AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 22,
  AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1,
  AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT,

  AMD_CODE_PROPERTY_RESERVED2_SHIFT = 23,
  AMD_CODE_PROPERTY_RESERVED2_WIDTH = 9,
  AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED2_SHIFT
};
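
/* Usage sketch (illustrative, not part of the original header): packing and
 * reading one of the properties above with the AMD_HSA_BITS_SET/GET helper
 * macros. Note that AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, for example,
 * expands to ((1 << 2) - 1) << 17 == 0x60000.
 *
 *   uint32_t props = 0;
 *   AMD_HSA_BITS_SET(props, AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR, 1);
 *   AMD_HSA_BITS_SET(props, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, 1);
 *   unsigned elem_size = AMD_HSA_BITS_GET(props, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE);
 */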

/* AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel
 * Code Object to set up the hardware to execute the kernel dispatch.
 *
 * Initial Kernel Register State.
 *
 * Initial kernel register state will be set up by CP/SPI prior to the start
 * of execution of every wavefront. This is limited by the constraints of the
 * current hardware.
 *
 * The order of the SGPR registers is defined, but the Finalizer can specify
 * which ones are actually set up in the amd_kernel_code_t object using the
 * enable_sgpr_* bit fields. The register numbers used for enabled registers
 * are dense starting at SGPR0: the first enabled register is SGPR0, the next
 * enabled register is SGPR1 etc.; disabled registers do not have an SGPR
 * number.
 *
 * The initial SGPRs comprise up to 16 User SGPRs that are set up by CP and
 * apply to all waves of the grid. It is possible to specify more than 16 User
 * SGPRs using the enable_sgpr_* bit fields, in which case only the first 16
 * are actually initialized. These are then immediately followed by the System
 * SGPRs that are set up by ADC/SPI and can have different values for each wave
 * of the grid dispatch.
 *
 * SGPR register initial state is defined as follows:
 *
 * Private Segment Buffer (enable_sgpr_private_segment_buffer):
 *   Number of User SGPR registers: 4. V# that can be used, together with
 *   Scratch Wave Offset as an offset, to access the Private/Spill/Arg
 *   segments using a segment address. It must be set as follows:
 *     - Base address: of the scratch memory area used by the dispatch. It
 *       does not include the scratch wave offset. It will be the per process
 *       SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for
 *       example there may be a per pipe offset, or per AQL Queue offset).
 *     - Stride + data_format: Element Size * Index Stride (???)
 *     - Cache swizzle: ???
 *     - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for
 *       scratch)
 *     - Num records: Flat Scratch Work Item Size / Element Size (???)
 *     - Dst_sel_*: ???
 *     - Num_format: ???
 *     - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must
 *       agree with amd_kernel_code_t.privateElementSize)
 *     - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64, as it
 *       must equal the number of wavefront lanes for scratch; must agree with
 *       amd_kernel_code_t.wavefrontSize)
 *     - Add tid enable: 1
 *     - ATC: from SH_MEM_CONFIG.PRIVATE_ATC,
 *     - Hash_enable: ???
 *     - Heap: ???
 *     - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE
 *     - Type: 0 (a buffer) (???)
 *
 * Dispatch Ptr (enable_sgpr_dispatch_ptr):
 *   Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet
 *   for kernel actually executing.
 *
 * Queue Ptr (enable_sgpr_queue_ptr):
 *   Number of User SGPR registers: 2. 64 bit address of AmdQueue object for
 *   AQL queue on which the dispatch packet was queued.
 *
 * Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr):
 *   Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This
 *   is directly copied from the kernargPtr in the dispatch packet. Having CP
 *   load it once avoids loading it at the beginning of every wavefront.
 *
 * Dispatch Id (enable_sgpr_dispatch_id):
 *   Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch
 *   packet being executed.
 *
 * Flat Scratch Init (enable_sgpr_flat_scratch_init):
 *   Number of User SGPR registers: 2. This is 2 SGPRs.
 *
 *   For CI/VI:
 *     The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE
 *     to base of memory for scratch for this dispatch. This is the same offset
 *     used in computing the Scratch Segment Buffer base address. The value of
 *     Scratch Wave Offset must be added by the kernel code and moved to
 *     SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions.
 *
 *     The second SGPR is the 32 bit byte size of a single work-item's scratch
 *     memory usage. This is directly loaded from the dispatch packet Private
 *     Segment Byte Size and rounded up to a multiple of DWORD.
 *
 *     \todo [Does CP need to round this to >4 byte alignment?]
 *
 *     The kernel code must move it to SGPRn-3 for use as the FLAT SCRATCH SIZE
 *     in flat memory instructions. Having CP load it once avoids loading it at
 *     the beginning of every wavefront.
 *
 * Private Segment Size (enable_sgpr_private_segment_size):
 *   Number of User SGPR registers: 1. The 32 bit byte size of a single
 *   work-item's scratch memory allocation. This is the value from the dispatch
 *   packet Private Segment Byte Size, rounded up by CP to a multiple of DWORD.
 *
 *   \todo [Does CP need to round this to >4 byte alignment?]
 *
 *   Having CP load it once avoids loading it at the beginning of every
 *   wavefront.
 *
 *   \todo [This will not be used for CI/VI since it is the same value as
 *   the second SGPR of Flat Scratch Init.]
 *
 * Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x):
 *   Number of User SGPR registers: 1. 32 bit count of the number of
 *   work-groups in the X dimension for the grid being executed. Computed from
 *   the fields in the HsaDispatchPacket as
 *   ((gridSize.x+workgroupSize.x-1)/workgroupSize.x).
 *
 * Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y):
 *   Number of User SGPR registers: 1. 32 bit count of the number of
 *   work-groups in the Y dimension for the grid being executed. Computed from
 *   the fields in the HsaDispatchPacket as
 *   ((gridSize.y+workgroupSize.y-1)/workgroupSize.y).
 *
 *   Only initialized if <16 previous SGPRs initialized.
 *
 * Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z):
 *   Number of User SGPR registers: 1. 32 bit count of the number of
 *   work-groups in the Z dimension for the grid being executed. Computed
 *   from the fields in the HsaDispatchPacket as
 *   ((gridSize.z+workgroupSize.z-1)/workgroupSize.z).
 *
 *   Only initialized if <16 previous SGPRs initialized.
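 *
 *   A worked example (illustrative, not from the original text): for a grid
 *   size of 1000 and a work-group size of 256 in some dimension, the count is
 *   (1000 + 256 - 1) / 256 = 4 in integer arithmetic, i.e. the integer
 *   ceiling of gridSize / workgroupSize.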
 *
 * Work-Group Id X (enable_sgpr_workgroup_id_x):
 *   Number of System SGPR registers: 1. 32 bit work group id in X dimension
 *   of grid for wavefront. Always present.
 *
 * Work-Group Id Y (enable_sgpr_workgroup_id_y):
 *   Number of System SGPR registers: 1. 32 bit work group id in Y dimension
 *   of grid for wavefront.
 *
 * Work-Group Id Z (enable_sgpr_workgroup_id_z):
 *   Number of System SGPR registers: 1. 32 bit work group id in Z dimension
 *   of grid for wavefront. If present then Work-group Id Y will also be
 *   present.
 *
 * Work-Group Info (enable_sgpr_workgroup_info):
 *   Number of System SGPR registers: 1. {first_wave, 14'b0000,
 *   ordered_append_term[10:0], threadgroup_size_in_waves[5:0]}
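 *
 *   A decoding sketch (an interpretation, not stated explicitly above,
 *   assuming the concatenation lists the most significant field first):
 *   given the 32 bit register value info,
 *     threadgroup_size_in_waves =  info        & 0x3f;
 *     ordered_append_term       = (info >> 6)  & 0x7ff;
 *     first_wave                = (info >> 31) & 0x1;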
 *
 * Private Segment Wave Byte Offset
 * (enable_sgpr_private_segment_wave_byte_offset):
 *   Number of System SGPR registers: 1. 32 bit byte offset from the base of
 *   the dispatch scratch memory. Must be used as an offset with
 *   Private/Spill/Arg segment address when using Scratch Segment Buffer. It
 *   must be added to Flat Scratch Offset if setting up FLAT SCRATCH for flat
 *   addressing.
 *
 *
 * The order of the VGPR registers is defined, but the Finalizer can specify
 * which ones are actually set up in the amd_kernel_code_t object using the
 * enableVgpr* bit fields. The register numbers used for enabled registers
 * are dense starting at VGPR0: the first enabled register is VGPR0, the next
 * enabled register is VGPR1 etc.; disabled registers do not have a VGPR
 * number.
 *
 * VGPR register initial state is defined as follows:
 *
 * Work-Item Id X (always initialized):
 *   Number of registers: 1. 32 bit work item id in X dimension of work-group
 *   for wavefront lane.
 *
 * Work-Item Id Y (enable_vgpr_workitem_id > 0):
 *   Number of registers: 1. 32 bit work item id in Y dimension of work-group
 *   for wavefront lane.
 *
 * Work-Item Id Z (enable_vgpr_workitem_id > 1):
 *   Number of registers: 1. 32 bit work item id in Z dimension of work-group
 *   for wavefront lane.
 *
 *
 * The setting of registers is done by existing GPU hardware as follows:
 *   1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data
 *      registers.
 *   2) Work-group Id registers X, Y, Z are set by SPI which supports any
 *      combination including none.
 *   3) Scratch Wave Offset is also set by SPI which is why its value cannot
 *      be added into the Flat Scratch Offset value, which would avoid the
 *      Finalizer generated prolog having to do the add.
 *   4) The VGPRs are set by SPI which only supports specifying either (X),
 *      (X, Y) or (X, Y, Z).
 *
 * Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGPRs so
 * they can be moved as a 64 bit value to the hardware required SGPRn-3 and
 * SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? Register.
 *
 * The global segment can be accessed either using flat operations or buffer
 * operations. If buffer operations are used then the Global Buffer used to
 * access HSAIL Global/Readonly/Kernarg (which are combined) segments using a
 * segment address is not passed into the kernel code by CP since its base
 * address is always 0. Instead the Finalizer generates prolog code to
 * initialize 4 SGPRs with a V# that has the following properties, and then
 * uses that in the buffer instructions:
 *   - base address of 0
 *   - no swizzle
 *   - ATC=1
 *   - MTYPE set to support memory coherence specified in
 *     amd_kernel_code_t.globalMemoryCoherence
 *
 * When the Global Buffer is used to access the Kernarg segment, the dispatch
 * packet kernArgPtr must be added to the kernarg segment address before using
 * this V#. Alternatively scalar loads can be used if the kernarg offset is
 * uniform, as the kernarg segment is constant for the duration of the kernel
 * execution.
 */

typedef struct amd_kernel_code_s {
  uint32_t amd_kernel_code_version_major;
  uint32_t amd_kernel_code_version_minor;
  uint16_t amd_machine_kind;
  uint16_t amd_machine_version_major;
  uint16_t amd_machine_version_minor;
  uint16_t amd_machine_version_stepping;

  /* Byte offset (possibly negative) from start of amd_kernel_code_t
   * object to kernel's entry point instruction. The actual code for
   * the kernel is required to be 256 byte aligned to match hardware
   * requirements (SQ cache line is 16). The code must be position
   * independent code (PIC) for AMD devices to give the runtime the
   * option of copying code to discrete GPU memory or APU L2
   * cache. The Finalizer should endeavour to allocate all kernel
   * machine code in contiguous memory pages so that a device
   * pre-fetcher will tend to only pre-fetch Kernel Code objects,
   * improving cache performance.
   */
  int64_t kernel_code_entry_byte_offset;

  /* Range of bytes to consider prefetching expressed as an offset
   * and size. The offset (possibly negative) is from the start of the
   * amd_kernel_code_t object. Set both to 0 if no prefetch
   * information is available.
   */
  int64_t kernel_code_prefetch_byte_offset;
  uint64_t kernel_code_prefetch_byte_size;

  /* Number of bytes of scratch backing memory required for full
   * occupancy of the target chip. This takes into account the number of
   * bytes of scratch per work-item, the wavefront size, the maximum
   * number of wavefronts per CU, and the number of CUs. This is an
   * upper limit on scratch. If the grid being dispatched is small, it
   * may need less than this. If the kernel uses no scratch, or
   * the Finalizer has not computed this value, it must be 0.
   */
  uint64_t max_scratch_backing_memory_byte_size;
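
  /* Illustrative sketch (not part of the original header) of the upper bound
   * described for max_scratch_backing_memory_byte_size above. All names other
   * than the field itself are hypothetical device properties known to the
   * runtime, not fields of this structure:
   *
   *   max_scratch_backing_memory_byte_size =
   *       (uint64_t)bytes_of_scratch_per_work_item *
   *       lanes_per_wavefront * max_waves_per_cu * num_cus;
   */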

  /* Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
   * COMPUTE_PGM_RSRC2 registers.
   */
  uint64_t compute_pgm_resource_registers;
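
  /* A packing sketch (an assumption, not stated in this header): tools
   * commonly place COMPUTE_PGM_RSRC1 in the low 32 bits and
   * COMPUTE_PGM_RSRC2 in the high 32 bits, e.g.
   *
   *   akc.compute_pgm_resource_registers =
   *       (uint64_t)compute_pgm_rsrc1 | ((uint64_t)compute_pgm_rsrc2 << 32);
   */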

  /* Code properties. See amd_code_property_mask_t for a full list of
   * properties.
   */
  uint32_t code_properties;

  /* The amount of memory required for the combined private, spill
   * and arg segments for a work-item in bytes. If
   * is_dynamic_callstack is 1 then additional space must be added to
   * this value for the call stack.
   */
  uint32_t workitem_private_segment_byte_size;

  /* The amount of group segment memory required by a work-group in
   * bytes. This does not include any dynamically allocated group
   * segment memory that may be added when the kernel is
   * dispatched.
   */
  uint32_t workgroup_group_segment_byte_size;

  /* Number of bytes of GDS required by the kernel dispatch. Must be 0 if
   * not using GDS.
   */
  uint32_t gds_segment_byte_size;

  /* The size in bytes of the kernarg segment that holds the values
   * of the arguments to the kernel. This could be used by CP to
   * prefetch the kernarg segment pointed to by the dispatch packet.
   */
  uint64_t kernarg_segment_byte_size;

  /* Number of fbarriers used in the kernel and all functions it
   * calls. If the implementation uses group memory to allocate the
   * fbarriers then that amount must already be included in the
   * workgroup_group_segment_byte_size total.
   */
  uint32_t workgroup_fbarrier_count;

  /* Number of scalar registers used by a wavefront. This includes
   * the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size
   * and XNACK (for GFX8 (VI)). It does not include the 16 SGPRs added if a
   * trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS.
   */
  uint16_t wavefront_sgpr_count;

  /* Number of vector registers used by each work-item. Used to set
   * COMPUTE_PGM_RSRC1.VGPRS.
   */
  uint16_t workitem_vgpr_count;

  /* If reserved_vgpr_count is 0 then this must be 0. Otherwise, this is the
   * first fixed VGPR number reserved.
   */
  uint16_t reserved_vgpr_first;

  /* The number of consecutive VGPRs reserved by the client. If
   * is_debug_supported then this count includes VGPRs reserved
   * for debugger use.
   */
  uint16_t reserved_vgpr_count;

  /* If reserved_sgpr_count is 0 then this must be 0. Otherwise, this is the
   * first fixed SGPR number reserved.
   */
  uint16_t reserved_sgpr_first;

  /* The number of consecutive SGPRs reserved by the client. If
   * is_debug_supported then this count includes SGPRs reserved
   * for debugger use.
   */
  uint16_t reserved_sgpr_count;

  /* If is_debug_supported is 0 then this must be 0. Otherwise, this is the
   * fixed SGPR number used to hold the wave scratch offset for the
   * entire kernel execution, or uint16_t(-1) if the register is not
   * used or not known.
   */
  uint16_t debug_wavefront_private_segment_offset_sgpr;

  /* If is_debug_supported is 0 then this must be 0. Otherwise, this is the
   * fixed SGPR number of the first of 4 SGPRs used to hold the
   * scratch V# used for the entire kernel execution, or uint16_t(-1)
   * if the registers are not used or not known.
   */
  uint16_t debug_private_segment_buffer_sgpr;

  /* The maximum byte alignment of variables used by the kernel in
   * the specified memory segment. Expressed as a power of two. Must
   * be at least HSA_POWERTWO_16.
   */
  uint8_t kernarg_segment_alignment;
  uint8_t group_segment_alignment;
  uint8_t private_segment_alignment;
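
  /* For example (illustrative; this reads "expressed as a power of two" as
   * storing the exponent, consistent with the HSA_POWERTWO_* naming): a
   * kernarg_segment_alignment value of 4 would denote a byte alignment of
   * (1u << 4) == 16 bytes.
   */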

  /* Wavefront size expressed as a power of two. Must be a power of 2
   * in range 1..64 inclusive. Used to support a runtime query that
   * obtains the wavefront size, which may be used by the application to
   * allocate dynamic group memory and set the dispatch work-group
   * size.
   */
  uint8_t wavefront_size;

  int32_t call_convention;
  uint8_t reserved3[12];
  uint64_t runtime_loader_kernel_symbol;
  uint64_t control_directives[16];
} amd_kernel_code_t;
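
/* Usage sketch (illustrative, not part of the original header): a loader that
 * has located an amd_kernel_code_t object inside a code object can derive the
 * kernel entry point from kernel_code_entry_byte_offset, which is relative to
 * the start of the object itself:
 *
 *   const amd_kernel_code_t *akc = ...;  // start of the kernel code object
 *   const void *entry =
 *       (const char *)akc + akc->kernel_code_entry_byte_offset;
 *
 * With the layout above and natural member alignment, the structure occupies
 * 256 bytes, which a consumer of this header could check with a static
 * assertion.
 */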

#endif // AMDKERNELCODET_H