"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/gallium/drivers/swr/rasterizer/core/backend_impl.h" (16 Sep 2020, 58719 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "backend_impl.h", see the Fossies "Dox" file reference documentation.

    1 /****************************************************************************
    2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
    3  *
    4  * Permission is hereby granted, free of charge, to any person obtaining a
    5  * copy of this software and associated documentation files (the "Software"),
    6  * to deal in the Software without restriction, including without limitation
    7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    8  * and/or sell copies of the Software, and to permit persons to whom the
    9  * Software is furnished to do so, subject to the following conditions:
   10  *
   11  * The above copyright notice and this permission notice (including the next
   12  * paragraph) shall be included in all copies or substantial portions of the
   13  * Software.
   14  *
   15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
   18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
   20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
   21  * IN THE SOFTWARE.
   22  *
   23  * @file backend_impl.h
   24  *
   25  * @brief Backend handles rasterization, pixel shading and output merger
   26  *        operations.
   27  *
   28  ******************************************************************************/
   29 #pragma once
   30 
   31 #include "tilemgr.h"
   32 #include "state.h"
   33 #include "context.h"
   34 
   35 
   36 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2]);
   37 void InitBackendSampleFuncTable(
   38     PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]);
   39 
   40 static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs,
   41                                           SWR_PS_CONTEXT&          psContext);
   42 
   43 
   // Identifiers for the backend variants; SWR_BACKEND_FUNCS_MAX is the number
   // of variants and must stay last.
   enum SWR_BACKEND_FUNCS
   {
       SWR_BACKEND_SINGLE_SAMPLE    = 0,
       SWR_BACKEND_MSAA_PIXEL_RATE  = 1,
       SWR_BACKEND_MSAA_SAMPLE_RATE = 2,
       SWR_BACKEND_FUNCS_MAX        = 3,
   };
   51 
   #if KNOB_SIMD_WIDTH == 8
   // Per-lane pixel offsets inside the 4x2 SIMD tile.  Lanes 0-3 form the left
   // 2x2 quad and lanes 4-7 the right 2x2 quad:
   //
   //     lane:  0      1      2      3      4      5      6      7
   //     (x,y): (0,0)  (1,0)  (0,1)  (1,1)  (2,0)  (3,0)  (2,1)  (3,1)
   //
   // The "Center" tables add half a pixel to reach the pixel center; the "UL"
   // tables address the upper-left pixel corner.
   static const __m256 vCenterOffsetsX =
       __m256{0.5f, 1.5f, 0.5f, 1.5f, 2.5f, 3.5f, 2.5f, 3.5f};
   static const __m256 vCenterOffsetsY =
       __m256{0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f};
   static const __m256 vULOffsetsX =
       __m256{0.0f, 1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 2.0f, 3.0f};
   static const __m256 vULOffsetsY =
       __m256{0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f};
   // one coverage bit per SIMD lane
   #define MASK 0xff
   #endif
   59 
   60 static INLINE simdmask ComputeUserClipMask(uint8_t           clipMask,
   61                                            float*            pUserClipBuffer,
   62                                            simdscalar const& vI,
   63                                            simdscalar const& vJ)
   64 {
   65     simdscalar vClipMask       = _simd_setzero_ps();
   66     uint32_t   numClipDistance = _mm_popcnt_u32(clipMask);
   67 
   68     for (uint32_t i = 0; i < numClipDistance; ++i)
   69     {
   70         // pull triangle clip distance values from clip buffer
   71         simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
   72         simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
   73         simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
   74 
   75         // interpolate
   76         simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
   77 
   78         // clip if interpolated clip distance is < 0 || NAN
   79         simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
   80 
   81         vClipMask = _simd_or_ps(vClipMask, vCull);
   82     }
   83 
   84     return _simd_movemask_ps(vClipMask);
   85 }
   86 
   87 INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
   88 {
   89     static const uint32_t RasterTileColorOffsets[16]{
   90         0,
   91         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
   92         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
   93         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
   94         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
   95         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
   96         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
   97         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
   98         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
   99         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
  100         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
  101             10,
  102         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
  103             11,
  104         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
  105             12,
  106         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
  107             13,
  108         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
  109             14,
  110         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
  111             15,
  112     };
  113     assert(sampleNum < 16);
  114     return RasterTileColorOffsets[sampleNum];
  115 }
  116 
  117 INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
  118 {
  119     static const uint32_t RasterTileDepthOffsets[16]{
  120         0,
  121         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
  122         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
  123         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
  124         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
  125         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
  126         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
  127         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
  128         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
  129         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
  130         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
  131             10,
  132         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
  133             11,
  134         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
  135             12,
  136         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
  137             13,
  138         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
  139             14,
  140         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
  141             15,
  142     };
  143     assert(sampleNum < 16);
  144     return RasterTileDepthOffsets[sampleNum];
  145 }
  146 
  147 INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
  148 {
  149     static const uint32_t RasterTileStencilOffsets[16]{
  150         0,
  151         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
  152         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
  153             2,
  154         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
  155             3,
  156         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
  157             4,
  158         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
  159             5,
  160         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
  161             6,
  162         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
  163             7,
  164         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
  165             8,
  166         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
  167             9,
  168         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
  169             10,
  170         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
  171             11,
  172         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
  173             12,
  174         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
  175             13,
  176         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
  177             14,
  178         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
  179             15,
  180     };
  181     assert(sampleNum < 16);
  182     return RasterTileStencilOffsets[sampleNum];
  183 }
  184 
  185 template <typename T, uint32_t InputCoverage>
  186 struct generateInputCoverage
  187 {
  188     INLINE generateInputCoverage(const uint64_t* const coverageMask,
  189                                  uint32_t (&inputMask)[KNOB_SIMD_WIDTH],
  190                                  const uint32_t sampleMask)
  191     {
  192         // will need to update for avx512
  193         assert(KNOB_SIMD_WIDTH == 8);
  194 
  195         simdscalari mask[2];
  196         simdscalari sampleCoverage[2];
  197 
  198         if (T::bIsCenterPattern)
  199         {
  200             // center coverage is the same for all samples; just broadcast to the sample slots
  201             uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
  202             if (T::MultisampleT::numSamples == 1)
  203             {
  204                 sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
  205             }
  206             else if (T::MultisampleT::numSamples == 2)
  207             {
  208                 sampleCoverage[0] =
  209                     _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
  210             }
  211             else if (T::MultisampleT::numSamples == 4)
  212             {
  213                 sampleCoverage[0] = _simd_set_epi32(
  214                     0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
  215             }
  216             else if (T::MultisampleT::numSamples == 8)
  217             {
  218                 sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
  219             }
  220             else if (T::MultisampleT::numSamples == 16)
  221             {
  222                 sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
  223                 sampleCoverage[1] = _simd_set1_epi32(centerCoverage);
  224             }
  225         }
  226         else
  227         {
  228             simdscalari src    = _simd_set1_epi32(0);
  229             simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
  230 
  231             if (T::MultisampleT::numSamples == 1)
  232             {
  233                 mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
  234             }
  235             else if (T::MultisampleT::numSamples == 2)
  236             {
  237                 mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
  238             }
  239             else if (T::MultisampleT::numSamples == 4)
  240             {
  241                 mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
  242             }
  243             else if (T::MultisampleT::numSamples == 8)
  244             {
  245                 mask[0] = _simd_set1_epi32(-1);
  246             }
  247             else if (T::MultisampleT::numSamples == 16)
  248             {
  249                 mask[0] = _simd_set1_epi32(-1);
  250                 mask[1] = _simd_set1_epi32(-1);
  251                 index1  = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
  252             }
  253 
  254             // gather coverage for samples 0-7
  255             sampleCoverage[0] =
  256                 _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src),
  257                                                             (const float*)coverageMask,
  258                                                             index0,
  259                                                             _mm256_castsi256_ps(mask[0]),
  260                                                             8));
  261             if (T::MultisampleT::numSamples > 8)
  262             {
  263                 // gather coverage for samples 8-15
  264                 sampleCoverage[1] =
  265                     _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src),
  266                                                                 (const float*)coverageMask,
  267                                                                 index1,
  268                                                                 _mm256_castsi256_ps(mask[1]),
  269                                                                 8));
  270             }
  271         }
  272 
  273         mask[0] = _mm256_set_epi8(-1,
  274                                   -1,
  275                                   -1,
  276                                   -1,
  277                                   -1,
  278                                   -1,
  279                                   -1,
  280                                   -1,
  281                                   -1,
  282                                   -1,
  283                                   -1,
  284                                   -1,
  285                                   0xC,
  286                                   0x8,
  287                                   0x4,
  288                                   0x0,
  289                                   -1,
  290                                   -1,
  291                                   -1,
  292                                   -1,
  293                                   -1,
  294                                   -1,
  295                                   -1,
  296                                   -1,
  297                                   -1,
  298                                   -1,
  299                                   -1,
  300                                   -1,
  301                                   0xC,
  302                                   0x8,
  303                                   0x4,
  304                                   0x0);
  305         // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
  306         simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
  307 
  308         simdscalari packedCoverage1;
  309         if (T::MultisampleT::numSamples > 8)
  310         {
  311             // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit
  312             // lane
  313             packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
  314         }
  315 
  316 #if (KNOB_ARCH == KNOB_ARCH_AVX)
  317         // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
  318         simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
  319         simdscalar  shufRes = _mm256_shuffle_ps(
  320             _mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
  321         packedCoverage0 = _mm256_castps_si256(
  322             _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
  323 
  324         simdscalari packedSampleCoverage;
  325         if (T::MultisampleT::numSamples > 8)
  326         {
  327             // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
  328             hiToLow         = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
  329             shufRes         = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow),
  330                                         _mm256_castsi256_ps(hiToLow),
  331                                         _MM_SHUFFLE(1, 1, 0, 1));
  332             shufRes         = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
  333             packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(
  334                 _mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
  335             packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(
  336                 _mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
  337         }
  338         else
  339         {
  340             packedSampleCoverage = packedCoverage0;
  341         }
  342 #else
  343         simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
  344         // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
  345         packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
  346 
  347         simdscalari packedSampleCoverage;
  348         if (T::MultisampleT::numSamples > 8)
  349         {
  350             permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
  351             // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
  352             packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
  353 
  354             // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
  355             packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
  356         }
  357         else
  358         {
  359             packedSampleCoverage = packedCoverage0;
  360         }
  361 #endif
  362 
  363         for (int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
  364         {
  365             // convert packed sample coverage masks into single coverage masks for all samples for
  366             // each pixel in the 4x2
  367             inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
  368 
  369             if (!T::bForcedSampleCount)
  370             {
  371                 // input coverage has to be anded with sample mask if MSAA isn't forced on
  372                 inputMask[i] &= sampleMask;
  373             }
  374 
  375             // shift to the next pixel in the 4x2
  376             packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
  377         }
  378     }
  379 
  380     INLINE generateInputCoverage(const uint64_t* const coverageMask,
  381                                  simdscalar&           inputCoverage,
  382                                  const uint32_t        sampleMask)
  383     {
  384         uint32_t inputMask[KNOB_SIMD_WIDTH];
  385         generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
  386         inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7],
  387                                                         inputMask[6],
  388                                                         inputMask[5],
  389                                                         inputMask[4],
  390                                                         inputMask[3],
  391                                                         inputMask[2],
  392                                                         inputMask[1],
  393                                                         inputMask[0]));
  394     }
  395 };
  396 
  397 template <typename T>
  398 struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
  399 {
  400     INLINE generateInputCoverage(const uint64_t* const coverageMask,
  401                                  simdscalar&           inputCoverage,
  402                                  const uint32_t        sampleMask)
  403     {
  404         // will need to update for avx512
  405         assert(KNOB_SIMD_WIDTH == 8);
  406         simdscalari       vec = _simd_set1_epi32(coverageMask[0]);
  407         const simdscalari bit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
  408         vec                   = _simd_and_si(vec, bit);
  409         vec                   = _simd_cmplt_epi32(_simd_setzero_si(), vec);
  410         vec                   = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
  411         inputCoverage         = _simd_castsi_ps(vec);
  412     }
  413 
  414     INLINE generateInputCoverage(const uint64_t* const coverageMask,
  415                                  uint32_t (&inputMask)[KNOB_SIMD_WIDTH],
  416                                  const uint32_t sampleMask)
  417     {
  418         uint32_t              simdCoverage     = (coverageMask[0] & MASK);
  419         static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
  420         for (int i = 0; i < KNOB_SIMD_WIDTH; i++)
  421         {
  422             // set all samples to covered if conservative coverage mask is set for that pixel
  423             inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0;
  424         }
  425     }
  426 };
  427 
  428 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  429 // Centroid behaves exactly as follows :
  430 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center
  431 // (even if the sample pattern does not happen to
  432 //     have a sample location there).
  433 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample
  434 // index, where sample coverage is after ANDing the
  435 //     coverage with the SampleMask Rasterizer State.
  436 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to
  437 // fill out 2x2 pixel stamps, the attribute is
  438 //     evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the
  439 //     pixel, then the first sample covered by the SampleMask Rasterizer State is the evaluation
  440 //     point.Otherwise (full SampleMask), the pixel center is the evaluation point.
  441 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  442 template <typename T>
  443 INLINE void CalcCentroidPos(SWR_PS_CONTEXT&            psContext,
  444                             const SWR_MULTISAMPLE_POS& samplePos,
  445                             const uint64_t* const      coverageMask,
  446                             const uint32_t             sampleMask,
  447                             simdscalar const&          vXSamplePosUL,
  448                             simdscalar const&          vYSamplePosUL)
  449 {
  450     uint32_t inputMask[KNOB_SIMD_WIDTH];
  451     generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
  452 
  453     // Case (2) - partially covered pixel
  454 
  455     // scan for first covered sample per pixel in the 4x2 span
  456     unsigned long sampleNum[KNOB_SIMD_WIDTH];
  457     (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
  458     (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
  459     (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
  460     (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
  461     (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
  462     (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
  463     (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
  464     (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
  465 
  466     // look up and set the sample offsets from UL pixel corner for first covered sample
  467     simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]),
  468                                        samplePos.X(sampleNum[6]),
  469                                        samplePos.X(sampleNum[5]),
  470                                        samplePos.X(sampleNum[4]),
  471                                        samplePos.X(sampleNum[3]),
  472                                        samplePos.X(sampleNum[2]),
  473                                        samplePos.X(sampleNum[1]),
  474                                        samplePos.X(sampleNum[0]));
  475 
  476     simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]),
  477                                        samplePos.Y(sampleNum[6]),
  478                                        samplePos.Y(sampleNum[5]),
  479                                        samplePos.Y(sampleNum[4]),
  480                                        samplePos.Y(sampleNum[3]),
  481                                        samplePos.Y(sampleNum[2]),
  482                                        samplePos.Y(sampleNum[1]),
  483                                        samplePos.Y(sampleNum[0]));
  484     // add sample offset to UL pixel corner
  485     vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
  486     vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
  487 
  488     // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
  489     static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
  490     simdscalari              vInputCoveragei   = _simd_set_epi32(inputMask[7],
  491                                                   inputMask[6],
  492                                                   inputMask[5],
  493                                                   inputMask[4],
  494                                                   inputMask[3],
  495                                                   inputMask[2],
  496                                                   inputMask[1],
  497                                                   inputMask[0]);
  498     simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
  499 
  500     static const simdscalari vZero = _simd_setzero_si();
  501     const simdscalari vSampleMask  = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
  502     simdscalari       vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
  503     simdscalari       vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
  504     simdscalari       vCase3b           = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
  505 
  506     simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
  507 
  508     // set the centroid position based on results from above
  509     psContext.vX.centroid =
  510         _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
  511     psContext.vY.centroid =
  512         _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
  513 
  514     // Case (3a) No samples covered and partial sample mask
  515     simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
  516     // sample mask should never be all 0's for this case, but handle it anyways
  517     unsigned long firstCoveredSampleMaskSample = 0;
  518     (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask))
  519                      : (firstCoveredSampleMaskSample = 0);
  520 
  521     simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
  522 
  523     vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample));
  524     vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample));
  525 
  526     // blend in case 3a pixel locations
  527     psContext.vX.centroid =
  528         _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
  529     psContext.vY.centroid =
  530         _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
  531 }
  532 
  533 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs,
  534                                      SWR_PS_CONTEXT&          psContext,
  535                                      const simdscalar&        vXSamplePosUL,
  536                                      const simdscalar&        vYSamplePosUL)
  537 {
  538     // evaluate I,J
  539     psContext.vI.centroid =
  540         vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
  541     psContext.vJ.centroid =
  542         vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
  543     psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
  544     psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
  545 
  546     // interpolate 1/w
  547     psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW,
  548                                             coeffs.vBOneOverW,
  549                                             coeffs.vCOneOverW,
  550                                             psContext.vI.centroid,
  551                                             psContext.vJ.centroid);
  552 }
  553 
  554 INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar const& z, float minz, float maxz)
  555 {
  556     const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
  557     const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
  558 
  559     return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask));
  560 }
  561 
  562 template <typename T>
  563 INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
  564 {
  565     // RT has to be single sample if we're in forcedMSAA mode
  566     if (T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
  567     {
  568         return 1;
  569     }
  570     // unless we're forced to single sample, in which case we run the OM at the sample count of the
  571     // RT
  572     else if (T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
  573     {
  574         return GetNumSamples(blendSampleCount);
  575     }
  576     // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
  577     else
  578     {
  579         return T::MultisampleT::numSamples;
  580     }
  581 }
  582 
  583 inline void SetupBarycentricCoeffs(BarycentricCoeffs* coeffs, const SWR_TRIANGLE_DESC& work)
  584 {
  585     // broadcast scalars
  586 
  587     coeffs->vIa = _simd_broadcast_ss(&work.I[0]);
  588     coeffs->vIb = _simd_broadcast_ss(&work.I[1]);
  589     coeffs->vIc = _simd_broadcast_ss(&work.I[2]);
  590 
  591     coeffs->vJa = _simd_broadcast_ss(&work.J[0]);
  592     coeffs->vJb = _simd_broadcast_ss(&work.J[1]);
  593     coeffs->vJc = _simd_broadcast_ss(&work.J[2]);
  594 
  595     coeffs->vZa = _simd_broadcast_ss(&work.Z[0]);
  596     coeffs->vZb = _simd_broadcast_ss(&work.Z[1]);
  597     coeffs->vZc = _simd_broadcast_ss(&work.Z[2]);
  598 
  599     coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet);
  600 
  601     coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
  602     coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
  603     coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
  604 }
  605 
  606 inline void SetupRenderBuffers(uint8_t*             pColorBuffer[SWR_NUM_RENDERTARGETS],
  607                                uint8_t**            pDepthBuffer,
  608                                uint8_t**            pStencilBuffer,
  609                                uint32_t             colorHotTileMask,
  610                                RenderOutputBuffers& renderBuffers)
  611 {
  612     DWORD index;
  613     while (_BitScanForward(&index, colorHotTileMask))
  614     {
  615         assert(index < SWR_NUM_RENDERTARGETS);
  616         colorHotTileMask &= ~(1 << index);
  617         pColorBuffer[index] = renderBuffers.pColor[index];
  618     }
  619 
  620     if (pDepthBuffer)
  621     {
  622         *pDepthBuffer = renderBuffers.pDepth;
  623     }
  624 
  625     if (pStencilBuffer)
  626     {
  627         *pStencilBuffer = renderBuffers.pStencil;
  628         ;
  629     }
  630 }
  631 
  632 INLINE void SetRenderHotTilesDirty(DRAW_CONTEXT* pDC, RenderOutputBuffers& renderBuffers)
  633 {
  634     const API_STATE& state = GetApiState(pDC);
  635 
  636     unsigned long rtSlot                 = 0;
  637     uint32_t      colorHottileEnableMask = state.colorHottileEnable;
  638     while (_BitScanForward(&rtSlot, colorHottileEnableMask))
  639     {
  640         colorHottileEnableMask &= ~(1 << rtSlot);
  641         renderBuffers.pColorHotTile[rtSlot]->state = HOTTILE_DIRTY;
  642     }
  643 }
  644 
  645 template <typename T>
  646 void SetupPixelShaderContext(SWR_PS_CONTEXT*            psContext,
  647                              const SWR_MULTISAMPLE_POS& samplePos,
  648                              SWR_TRIANGLE_DESC&         work)
  649 {
  650     psContext->pAttribs               = work.pAttribs;
  651     psContext->pPerspAttribs          = work.pPerspAttribs;
  652     psContext->frontFace              = work.triFlags.frontFacing;
  653     psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex;
  654     psContext->viewportIndex          = work.triFlags.viewportIndex;
  655 
  656     // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull
  657     // attribs
  658     psContext->I = work.I;
  659     psContext->J = work.J;
  660 
  661     psContext->recipDet = work.recipDet;
  662     psContext->pRecipW  = work.pRecipW;
  663     psContext->pSamplePosX =
  664         samplePos.X(); // reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
  665     psContext->pSamplePosY =
  666         samplePos.Y(); // reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
  667     psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
  668     psContext->sampleIndex           = 0;
  669 }
  670 
  671 template <typename T, bool IsSingleSample>
  672 void CalcCentroid(SWR_PS_CONTEXT*            psContext,
  673                   const SWR_MULTISAMPLE_POS& samplePos,
  674                   const BarycentricCoeffs&   coeffs,
  675                   const uint64_t* const      coverageMask,
  676                   uint32_t                   sampleMask)
  677 {
  678     if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid
  679                         // positions are still different
  680     {
  681         // for 1x case, centroid is pixel center
  682         psContext->vX.centroid        = psContext->vX.center;
  683         psContext->vY.centroid        = psContext->vY.center;
  684         psContext->vI.centroid        = psContext->vI.center;
  685         psContext->vJ.centroid        = psContext->vJ.center;
  686         psContext->vOneOverW.centroid = psContext->vOneOverW.center;
  687     }
  688     else
  689     {
  690         if (T::bCentroidPos)
  691         {
  692             ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
  693             if (T::bIsCenterPattern)
  694             {
  695                 psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
  696                 psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
  697             }
  698             else
  699             {
  700                 // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate
  701                 // coverage 2X'..
  702                 CalcCentroidPos<T>(*psContext,
  703                                    samplePos,
  704                                    coverageMask,
  705                                    sampleMask,
  706                                    psContext->vX.UL,
  707                                    psContext->vY.UL);
  708             }
  709 
  710             CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
  711         }
  712         else
  713         {
  714             psContext->vX.centroid = psContext->vX.sample;
  715             psContext->vY.centroid = psContext->vY.sample;
  716         }
  717     }
  718 }
  719 
  720 template <typename T>
  721 struct PixelRateZTestLoop
  722 {
  723     PixelRateZTestLoop(DRAW_CONTEXT*            DC,
  724                        uint32_t                 _workerId,
  725                        const SWR_TRIANGLE_DESC& Work,
  726                        const BarycentricCoeffs& Coeffs,
  727                        const API_STATE&         apiState,
  728                        uint8_t*&                depthBuffer,
  729                        uint8_t*&                stencilBuffer,
  730                        const uint8_t            ClipDistanceMask) :
  731         pDC(DC),
  732         workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
  733         samplePos(state.rastState.samplePositions), clipDistanceMask(ClipDistanceMask),
  734         pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){};
  735 
  736     INLINE
  737     uint32_t operator()(simdscalar&        activeLanes,
  738                         SWR_PS_CONTEXT&    psContext,
  739                         const CORE_BUCKETS BEDepthBucket,
  740                         uint32_t           currentSimdIn8x8 = 0)
  741     {
  742 
  743         uint32_t   statCount            = 0;
  744         simdscalar anyDepthSamplePassed = _simd_setzero_ps();
  745         for (uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
  746         {
  747             const uint8_t* pCoverageMask = (uint8_t*)&work.coverageMask[sample];
  748             vCoverageMask[sample] =
  749                 _simd_and_ps(activeLanes, _simd_vmask_ps(pCoverageMask[currentSimdIn8x8] & MASK));
  750 
  751             if (!_simd_movemask_ps(vCoverageMask[sample]))
  752             {
  753                 vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] =
  754                     _simd_setzero_ps();
  755                 continue;
  756             }
  757 
  758             // offset depth/stencil buffers current sample
  759             uint8_t* pDepthSample   = pDepthBuffer + RasterTileDepthOffset(sample);
  760             uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
  761 
  762             if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
  763             {
  764                 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
  765                               "Unsupported depth hot tile format");
  766 
  767                 const simdscalar z = _simd_load_ps(reinterpret_cast<const float*>(pDepthSample));
  768 
  769                 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
  770                 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
  771 
  772                 vCoverageMask[sample] =
  773                     _simd_and_ps(vCoverageMask[sample],
  774                                  _simd_vmask_ps(CalcDepthBoundsAcceptMask(z, minz, maxz)));
  775             }
  776 
  777             RDTSC_BEGIN(psContext.pBucketManager, BEBarycentric, pDC->drawId);
  778 
  779             // calculate per sample positions
  780             psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
  781             psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
  782 
  783             // calc I & J per sample
  784             CalcSampleBarycentrics(coeffs, psContext);
  785 
  786             if (psState.writesODepth)
  787             {
  788                 {
  789                     // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
  790                     vZ[sample] = psContext.vZ;
  791                 }
  792             }
  793             else
  794             {
  795                 vZ[sample] = vplaneps(
  796                     coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
  797                 vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
  798             }
  799 
  800             RDTSC_END(psContext.pBucketManager, BEBarycentric, 0);
  801 
  802             ///@todo: perspective correct vs non-perspective correct clipping?
  803             // if clip distances are enabled, we need to interpolate for each sample
  804             if (clipDistanceMask)
  805             {
  806                 uint8_t clipMask = ComputeUserClipMask(clipDistanceMask,
  807                                                        work.pUserClipBuffer,
  808                                                        psContext.vI.sample,
  809                                                        psContext.vJ.sample);
  810 
  811                 vCoverageMask[sample] =
  812                     _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(~clipMask));
  813             }
  814 
  815             // ZTest for this sample
  816             ///@todo Need to uncomment out this bucket.
  817             // RDTSC_BEGIN(psContext.pBucketManager, BEDepthBucket, pDC->drawId);
  818             depthPassMask[sample]   = vCoverageMask[sample];
  819             stencilPassMask[sample] = vCoverageMask[sample];
  820             depthPassMask[sample]   = DepthStencilTest(&state,
  821                                                      work.triFlags.frontFacing,
  822                                                      work.triFlags.viewportIndex,
  823                                                      vZ[sample],
  824                                                      pDepthSample,
  825                                                      vCoverageMask[sample],
  826                                                      pStencilSample,
  827                                                      &stencilPassMask[sample]);
  828             // RDTSC_END(psContext.pBucketManager, BEDepthBucket, 0);
  829 
  830             // early-exit if no pixels passed depth or earlyZ is forced on
  831             if (psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
  832             {
  833                 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
  834                                   &state.depthStencilState,
  835                                   work.triFlags.frontFacing,
  836                                   vZ[sample],
  837                                   pDepthSample,
  838                                   depthPassMask[sample],
  839                                   vCoverageMask[sample],
  840                                   pStencilSample,
  841                                   stencilPassMask[sample]);
  842 
  843                 if (!_simd_movemask_ps(depthPassMask[sample]))
  844                 {
  845                     continue;
  846                 }
  847             }
  848             anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
  849             uint32_t statMask    = _simd_movemask_ps(depthPassMask[sample]);
  850             statCount += _mm_popcnt_u32(statMask);
  851         }
  852 
  853         activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
  854         // return number of samples that passed depth and coverage
  855         return statCount;
  856     }
  857 
  858     // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
  859     simdscalar vZ[T::MultisampleT::numCoverageSamples];
  860     simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
  861     simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
  862     simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
  863 
  864 private:
  865     // functor inputs
  866     DRAW_CONTEXT* pDC;
  867     uint32_t      workerId;
  868 
  869     const SWR_TRIANGLE_DESC&   work;
  870     const BarycentricCoeffs&   coeffs;
  871     const API_STATE&           state;
  872     const SWR_PS_STATE&        psState;
  873     const SWR_MULTISAMPLE_POS& samplePos;
  874     const uint8_t              clipDistanceMask;
  875     uint8_t*&                  pDepthBuffer;
  876     uint8_t*&                  pStencilBuffer;
  877 };
  878 
  879 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT& psContext)
  880 {
  881     // evaluate I,J
  882     psContext.vI.center =
  883         vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
  884     psContext.vJ.center =
  885         vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
  886     psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
  887     psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
  888 
  889     // interpolate 1/w
  890     psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW,
  891                                           coeffs.vBOneOverW,
  892                                           coeffs.vCOneOverW,
  893                                           psContext.vI.center,
  894                                           psContext.vJ.center);
  895 }
  896 
  897 static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs,
  898                                           SWR_PS_CONTEXT&          psContext)
  899 {
  900     // evaluate I,J
  901     psContext.vI.sample =
  902         vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
  903     psContext.vJ.sample =
  904         vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
  905     psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
  906     psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
  907 
  908     // interpolate 1/w
  909     psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW,
  910                                           coeffs.vBOneOverW,
  911                                           coeffs.vCOneOverW,
  912                                           psContext.vI.sample,
  913                                           psContext.vJ.sample);
  914 }
  915 
  916 // Merge Output to 8x2 SIMD16 Tile Format
  917 INLINE void OutputMerger8x2(DRAW_CONTEXT*   pDC,
  918                             SWR_PS_CONTEXT& psContext,
  919                             uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS],
  920                             uint32_t               sample,
  921                             const SWR_BLEND_STATE* pBlendState,
  922                             const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS],
  923                             simdscalar&       coverageMask,
  924                             simdscalar const& depthPassMask,
  925                             uint32_t          renderTargetMask,
  926                             bool              useAlternateOffset,
  927                             uint32_t          workerId)
  928 {
  929     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
  930     uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
  931 
  932     if (useAlternateOffset)
  933     {
  934         rasterTileColorOffset += sizeof(simdscalar);
  935     }
  936 
  937     simdvector blendSrc;
  938     simdvector blendOut;
  939 
  940     DWORD rt;
  941     while (_BitScanForward(&rt, renderTargetMask))
  942     {
  943         renderTargetMask &= ~(1 << rt);
  944 
  945         const SWR_RENDER_TARGET_BLEND_STATE* pRTBlend = &pBlendState->renderTarget[rt];
  946 
  947         simdscalar* pColorSample;
  948         bool        hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed ||
  949                              !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue;
  950         if (hotTileEnable)
  951         {
  952             pColorSample = reinterpret_cast<simdscalar*>(pColorBase[rt] + rasterTileColorOffset);
  953             blendSrc[0]  = pColorSample[0];
  954             blendSrc[1]  = pColorSample[2];
  955             blendSrc[2]  = pColorSample[4];
  956             blendSrc[3]  = pColorSample[6];
  957         }
  958         else
  959         {
  960             pColorSample = nullptr;
  961         }
  962 
  963         SWR_BLEND_CONTEXT blendContext = {0};
  964         {
  965             // pfnBlendFunc may not update all channels.  Initialize with PS output.
  966             /// TODO: move this into the blend JIT.
  967             blendOut = psContext.shaded[rt];
  968 
  969             blendContext.pBlendState = pBlendState;
  970             blendContext.src         = &psContext.shaded[rt];
  971             blendContext.src1        = &psContext.shaded[1];
  972             blendContext.src0alpha   = reinterpret_cast<simdvector*>(&psContext.shaded[0].w);
  973             blendContext.sampleNum   = sample;
  974             blendContext.pDst        = &blendSrc;
  975             blendContext.result      = &blendOut;
  976             blendContext.oMask       = &psContext.oMask;
  977             blendContext.pMask       = reinterpret_cast<simdscalari*>(&coverageMask);
  978 
  979             // Blend outputs and update coverage mask for alpha test
  980             if (pfnBlendFunc[rt] != nullptr)
  981             {
  982                 pfnBlendFunc[rt](&blendContext);
  983             }
  984         }
  985 
  986         // Track alpha events
  987         AR_EVENT(
  988             AlphaInfoEvent(pDC->drawId, blendContext.isAlphaTested, blendContext.isAlphaBlended));
  989 
  990         // final write mask
  991         simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
  992 
  993         ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
  994         static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT,
  995                       "Unsupported hot tile format");
  996 
  997         // store with color mask
  998         if (!pRTBlend->writeDisableRed)
  999         {
 1000             _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[0]), outputMask, blendOut.x);
 1001         }
 1002         if (!pRTBlend->writeDisableGreen)
 1003         {
 1004             _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[2]), outputMask, blendOut.y);
 1005         }
 1006         if (!pRTBlend->writeDisableBlue)
 1007         {
 1008             _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[4]), outputMask, blendOut.z);
 1009         }
 1010         if (!pRTBlend->writeDisableAlpha)
 1011         {
 1012             _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[6]), outputMask, blendOut.w);
 1013         }
 1014     }
 1015 }
 1016 
 1017 template <typename T>
 1018 void BackendPixelRate(DRAW_CONTEXT*        pDC,
 1019                       uint32_t             workerId,
 1020                       uint32_t             x,
 1021                       uint32_t             y,
 1022                       SWR_TRIANGLE_DESC&   work,
 1023                       RenderOutputBuffers& renderBuffers)
 1024 {
 1025     ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the
 1026     /// backend
 1027 
 1028 
 1029     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelRateBackend, pDC->drawId);
 1030     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
 1031 
 1032     const API_STATE& state = GetApiState(pDC);
 1033 
 1034     BarycentricCoeffs coeffs;
 1035     SetupBarycentricCoeffs(&coeffs, work);
 1036 
 1037     SWR_CONTEXT* pContext    = pDC->pContext;
 1038     void*        pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
 1039 
 1040     SWR_PS_CONTEXT             psContext;
 1041     const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
 1042     SetupPixelShaderContext<T>(&psContext, samplePos, work);
 1043 
 1044     uint8_t *pDepthBuffer, *pStencilBuffer;
 1045     SetupRenderBuffers(psContext.pColorBuffer,
 1046                        &pDepthBuffer,
 1047                        &pStencilBuffer,
 1048                        state.colorHottileEnable,
 1049                        renderBuffers);
 1050 
 1051     bool isTileDirty = false;
 1052 
 1053     RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0);
 1054 
 1055     PixelRateZTestLoop<T> PixelRateZTest(pDC,
 1056                                          workerId,
 1057                                          work,
 1058                                          coeffs,
 1059                                          state,
 1060                                          pDepthBuffer,
 1061                                          pStencilBuffer,
 1062                                          state.backendState.clipDistanceMask);
 1063 
 1064     psContext.vY.UL     = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
 1065     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
 1066 
 1067     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
 1068 
 1069     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
 1070     {
 1071         psContext.vX.UL     = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
 1072         psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
 1073 
 1074         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
 1075 
 1076         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
 1077         {
 1078             const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
 1079 
 1080 
 1081             simdscalar activeLanes;
 1082             if (!(work.anyCoveredSamples & MASK))
 1083             {
 1084                 goto Endtile;
 1085             };
 1086             activeLanes = _simd_vmask_ps(work.anyCoveredSamples & MASK);
 1087 
 1088             if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
 1089             {
 1090                 const uint64_t* pCoverageMask =
 1091                     (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
 1092                         ? &work.innerCoverageMask
 1093                         : &work.coverageMask[0];
 1094 
 1095                 generateInputCoverage<T, T::InputCoverage>(
 1096                     pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
 1097             }
 1098 
 1099             RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
 1100 
 1101             CalcPixelBarycentrics(coeffs, psContext);
 1102 
 1103             CalcCentroid<T, false>(
 1104                 &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
 1105 
 1106             RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
 1107 
 1108             if (T::bForcedSampleCount)
 1109             {
 1110                 // candidate pixels (that passed coverage) will cause shader invocation if any bits
 1111                 // in the samplemask are set
 1112                 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(
 1113                     _simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
 1114                 activeLanes                  = _simd_and_ps(activeLanes, vSampleMask);
 1115             }
 1116 
 1117             // Early-Z?
 1118             if (T::bCanEarlyZ && !T::bForcedSampleCount)
 1119             {
 1120                 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
 1121                 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
 1122                 AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
 1123             }
 1124 
 1125             // if we have no covered samples that passed depth at this point, go to next tile
 1126             if (!_simd_movemask_ps(activeLanes))
 1127             {
 1128                 goto Endtile;
 1129             };
 1130 
 1131             if (state.psState.usesSourceDepth)
 1132             {
 1133                 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
 1134                 // interpolate and quantize z
 1135                 psContext.vZ = vplaneps(
 1136                     coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
 1137                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
 1138                 RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
 1139             }
 1140 
 1141             // pixels that are currently active
 1142             psContext.activeMask = _simd_castps_si(activeLanes);
 1143             psContext.oMask      = T::MultisampleT::FullSampleMask();
 1144 
 1145             // execute pixel shader
 1146             RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
 1147             state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
 1148             RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);
 1149 
 1150             // update stats
 1151             UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
 1152             AR_EVENT(PSStats((HANDLE)&psContext.stats));
 1153 
 1154             // update active lanes to remove any discarded or oMask'd pixels
 1155             activeLanes = _simd_castsi_ps(_simd_and_si(
 1156                 psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
 1157             if (!_simd_movemask_ps(activeLanes))
 1158             {
 1159                 goto Endtile;
 1160             };
 1161 
 1162             isTileDirty = true;
 1163 
 1164             // late-Z
 1165             if (!T::bCanEarlyZ && !T::bForcedSampleCount)
 1166             {
 1167                 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
 1168                 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
 1169                 AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
 1170             }
 1171 
 1172             // if we have no covered samples that passed depth at this point, skip OM and go to next
 1173             // tile
 1174             if (!_simd_movemask_ps(activeLanes))
 1175             {
 1176                 goto Endtile;
 1177             };
 1178 
 1179             // output merger
 1180             // loop over all samples, broadcasting the results of the PS to all passing pixels
 1181             for (uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount);
 1182                  sample++)
 1183             {
 1184                 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId);
 1185                 // center pattern does a single coverage/depth/stencil test, standard pattern tests
 1186                 // all samples
 1187                 uint32_t   coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
 1188                 simdscalar coverageMask, depthMask;
 1189                 if (T::bForcedSampleCount)
 1190                 {
 1191                     coverageMask = depthMask = activeLanes;
 1192                 }
 1193                 else
 1194                 {
 1195                     coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
 1196                     depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
 1197                     if (!_simd_movemask_ps(depthMask))
 1198                     {
 1199                         // stencil should already have been written in early/lateZ tests
 1200                         RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
 1201                         continue;
 1202                     }
 1203                 }
 1204 
 1205                 // broadcast the results of the PS to all passing pixels
 1206 
 1207                 OutputMerger8x2(pDC,
 1208                                 psContext,
 1209                                 psContext.pColorBuffer,
 1210                                 sample,
 1211                                 &state.blendState,
 1212                                 state.pfnBlendFunc,
 1213                                 coverageMask,
 1214                                 depthMask,
 1215                                 state.psState.renderTargetMask,
 1216                                 useAlternateOffset,
 1217                                 workerId);
 1218 
 1219 
 1220                 if (!state.psState.forceEarlyZ && !T::bForcedSampleCount)
 1221                 {
 1222                     uint8_t* pDepthSample   = pDepthBuffer + RasterTileDepthOffset(sample);
 1223                     uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
 1224 
 1225                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
 1226                                       &state.depthStencilState,
 1227                                       work.triFlags.frontFacing,
 1228                                       PixelRateZTest.vZ[coverageSampleNum],
 1229                                       pDepthSample,
 1230                                       depthMask,
 1231                                       coverageMask,
 1232                                       pStencilSample,
 1233                                       PixelRateZTest.stencilPassMask[coverageSampleNum]);
 1234                 }
 1235                 RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
 1236             }
 1237         Endtile:
 1238             RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId);
 1239 
 1240             for (uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
 1241             {
 1242                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
 1243             }
 1244 
 1245             if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
 1246             {
 1247                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
 1248             }
 1249             work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
 1250 
 1251             if (useAlternateOffset)
 1252             {
 1253                 DWORD    rt;
 1254                 uint32_t rtMask = state.colorHottileEnable;
 1255                 while (_BitScanForward(&rt, rtMask))
 1256                 {
 1257                     rtMask &= ~(1 << rt);
 1258                     psContext.pColorBuffer[rt] +=
 1259                         (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
 1260                 }
 1261             }
 1262 
 1263             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
 1264             pStencilBuffer +=
 1265                 (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
 1266 
 1267             RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0);
 1268 
 1269             psContext.vX.UL     = _simd_add_ps(psContext.vX.UL, dx);
 1270             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
 1271         }
 1272 
 1273         psContext.vY.UL     = _simd_add_ps(psContext.vY.UL, dy);
 1274         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
 1275     }
 1276 
 1277     if (isTileDirty)
 1278     {
 1279         SetRenderHotTilesDirty(pDC, renderBuffers);
 1280     }
 1281 
 1282     RDTSC_END(pDC->pContext->pBucketMgr, BEPixelRateBackend, 0);
 1283 }
 1284 
 1285 template <uint32_t sampleCountT = SWR_MULTISAMPLE_1X,
 1286           uint32_t isCenter     = 0,
 1287           uint32_t coverage     = 0,
 1288           uint32_t centroid     = 0,
 1289           uint32_t forced       = 0,
 1290           uint32_t canEarlyZ    = 0
 1291           >
 1292 struct SwrBackendTraits
 1293 {
 1294     static const bool     bIsCenterPattern   = (isCenter == 1);
 1295     static const uint32_t InputCoverage      = coverage;
 1296     static const bool     bCentroidPos       = (centroid == 1);
 1297     static const bool     bForcedSampleCount = (forced == 1);
 1298     static const bool     bCanEarlyZ         = (canEarlyZ == 1);
 1299     typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT;
 1300 };