"Fossies" - the Fresh Open Source Software Archive

Member "mesa-20.1.8/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl" (16 Sep 2020, 12053 Bytes) of package /linux/misc/mesa-20.1.8.tar.xz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file. For more information about "simdlib_256_avx2.inl", see the Fossies "Dox" file reference documentation.

    1 /****************************************************************************
    2  * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
    3  *
    4  * Permission is hereby granted, free of charge, to any person obtaining a
    5  * copy of this software and associated documentation files (the "Software"),
    6  * to deal in the Software without restriction, including without limitation
    7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    8  * and/or sell copies of the Software, and to permit persons to whom the
    9  * Software is furnished to do so, subject to the following conditions:
   10  *
   11  * The above copyright notice and this permission notice (including the next
   12  * paragraph) shall be included in all copies or substantial portions of the
   13  * Software.
   14  *
   15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
   18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
   20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
   21  * IN THE SOFTWARE.
   22  ****************************************************************************/
   // Direct-inclusion check: compilation is aborted with the #error below unless
   // __SIMD_LIB_AVX2_HPP__ has already been defined by the time this file is included.
   23 #if !defined(__SIMD_LIB_AVX2_HPP__)
   24 #error Do not include this file directly, use "simdlib.hpp" instead.
   25 #endif
   26 
   27 //============================================================================
   28 // SIMD256 AVX (2) implementation
   29 //
   30 // Since this implementation inherits from the AVX (1) implementation,
   31 // the only operations below are ones that replace AVX (1) operations.
   32 // Mostly these are integer operations that are no longer emulated with SSE
   33 //============================================================================
   34 
   // SIMD_IWRAPPER_1(op): emits a static one-operand wrapper named op that passes its
   // Integer argument straight to the identically named intrinsic _mm256_<op>.
   35 #define SIMD_IWRAPPER_1(op) \
   36     static SIMDINLINE Integer SIMDCALL op(Integer const& a) { return _mm256_##op(a); }
   37 
   // SIMD_IWRAPPER_1L(op): like SIMD_IWRAPPER_1, except the operand is first cast to a
   // 128-bit value with _mm256_castsi256_si128, so _mm256_<op> only receives that
   // narrowed part of a (the low half, going by Intel's description of the cast).
   38 #define SIMD_IWRAPPER_1L(op)                                \
   39     static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
   40     {                                                       \
   41         return _mm256_##op(_mm256_castsi256_si128(a));      \
   42     }
   43 
   // SIMD_IWRAPPER_1I(op): emits a static one-operand wrapper template op<ImmT>(a) that
   // calls _mm256_<op>(a, ImmT); the immediate is supplied as a template argument.
   44 #define SIMD_IWRAPPER_1I(op)                                \
   45     template <int ImmT>                                     \
   46     static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
   47     {                                                       \
   48         return _mm256_##op(a, ImmT);                        \
   49     }
   50 
   // SIMD_IWRAPPER_1I_(op, intrin): same shape as SIMD_IWRAPPER_1I, for cases where the
   // wrapper name (op) differs from the intrinsic suffix (intrin): op<ImmT>(a) calls
   // _mm256_<intrin>(a, ImmT).
   51 #define SIMD_IWRAPPER_1I_(op, intrin)                       \
   52     template <int ImmT>                                     \
   53     static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
   54     {                                                       \
   55         return _mm256_##intrin(a, ImmT);                    \
   56     }
   57 
   // SIMD_IWRAPPER_2_(op, intrin): emits a static two-operand wrapper op(a, b) that
   // calls _mm256_<intrin>(a, b); used when the wrapper and intrinsic names differ.
   58 #define SIMD_IWRAPPER_2_(op, intrin)                                          \
   59     static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
   60     {                                                                         \
   61         return _mm256_##intrin(a, b);                                         \
   62     }
   63 
   // SIMD_IWRAPPER_2(op): emits a static two-operand wrapper op(a, b) that forwards
   // both Integer arguments to the identically named intrinsic _mm256_<op>.
   64 #define SIMD_IWRAPPER_2(op)                                                   \
   65     static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
   66     {                                                                         \
   67         return _mm256_##op(a, b);                                             \
   68     }
   69 
   // SIMD_IWRAPPER_2I(op): emits a static two-operand wrapper template op<ImmT>(a, b)
   // that calls _mm256_<op>(a, b, ImmT); the immediate is supplied as a template argument.
   70 #define SIMD_IWRAPPER_2I(op)                                                  \
   71     template <int ImmT>                                                       \
   72     static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
   73     {                                                                         \
   74         return _mm256_##op(a, b, ImmT);                                       \
   75     }
   83 
   84 
   85 //-----------------------------------------------------------------------
   86 // Floating point arithmetic operations
   87 //-----------------------------------------------------------------------
   // Float multiply-add wrapper: hands a, b and c, in that order, to _mm256_fmadd_ps.
   88 static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a,
   89                                           Float const& b,
   90                                           Float const& c) // return (a * b) + c
   91 {
   92     return _mm256_fmadd_ps(a, b, c);
   93 }
   94 
   95 //-----------------------------------------------------------------------
   96 // Integer (various width) arithmetic operations
   97 //-----------------------------------------------------------------------
   // Each line below expands (via the SIMD_IWRAPPER_* macros defined earlier in this file)
   // to a static Integer wrapper that forwards to the _mm256_ intrinsic of the same name.
   98 SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
   99 SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
  100 SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
  101 SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
  102 SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
  103 SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
  104 SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
  105 SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
   // NOTE(review): per Intel's documentation _mm256_mul_epi32 multiplies only the low
   // signed int32 of each 64-bit element and produces int64 products; it is not a
   // lane-wise int32 multiply (mullo_epi32 is the wrapper for that).
  106 SIMD_IWRAPPER_2(mul_epi32); // return (int64)a.lo32 * (int64)b.lo32 per 64-bit element
  107 
  108 // return (a * b) & 0xFFFFFFFF   (per 32-bit element)
  109 //
  110 // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
  111 // and store the low 32 bits of the intermediate integers in dst.
  112 SIMD_IWRAPPER_2(mullo_epi32);
  113 SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
  114 SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
  115 SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8), i.e. saturating at 0
  116 
  117 //-----------------------------------------------------------------------
  118 // Logical operations
  119 //-----------------------------------------------------------------------
  120 #if _MSC_VER >= 1920 // && _MSC_FULL_VER < [some_fixed_version]
  121 // Some versions of MSVC 2019 don't handle constant folding of and_si() correctly.
  122 // Using and_ps instead inhibits the compiler's constant folding and actually issues
  123 // the and intrinsic even though both inputs are constant values.
  //
  // No and_si is defined in this branch. Since this implementation inherits from the
  // AVX (1) implementation (see the header comment of this file), the inherited and_si
  // stays in effect here - presumably the and_ps-based one described above; confirm
  // against the AVX (1) file.
  124 #else
  125 // Use native integer and intrinsic
  126 SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b       (int)
  127 #endif
  // The remaining logical wrappers are defined for every compiler.
  128 SIMD_IWRAPPER_2_(andnot_si, andnot_si256); // return (~a) & b    (int)
  129 SIMD_IWRAPPER_2_(or_si, or_si256);         // return a | b       (int)
  130 SIMD_IWRAPPER_2_(xor_si, xor_si256);       // return a ^ b       (int)
  131 
  132 //-----------------------------------------------------------------------
  133 // Shift operations
  134 //-----------------------------------------------------------------------
  // Shift wrappers. The SIMD_IWRAPPER_1I forms take the shift count as the template
  // argument ImmT; the SIMD_IWRAPPER_2 forms take it from the second operand b.
  135 SIMD_IWRAPPER_1I(slli_epi32);           // return a << ImmT   (uint32)
  136 SIMD_IWRAPPER_2(sllv_epi32);            // return a << b      (uint32)
  137 SIMD_IWRAPPER_1I(srai_epi32);           // return a >> ImmT   (int32)
  138 SIMD_IWRAPPER_1I(srli_epi32);           // return a >> ImmT   (uint32)
  139 SIMD_IWRAPPER_2(srlv_epi32);            // return a >> b      (uint32)
  140 SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
  // NOTE(review): Intel documents _mm256_srli_si256 as shifting each 128-bit half of a
  // separately (by ImmT bytes), not the full 256-bit value - confirm callers expect that.
  141 
  // Applies srli_si<ImmT> to a Float value: the argument is converted with castps_si,
  // shifted, and the result converted back with castsi_ps (both cast helpers are
  // declared outside this file section).
  142 template <int ImmT> // same as srli_si, but with Float cast to int
  143 static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a)
  144 {
  145     return castsi_ps(srli_si<ImmT>(castps_si(a)));
  146 }
  147 
  148 //-----------------------------------------------------------------------
  149 // Conversion operations
  150 //-----------------------------------------------------------------------
  // Widening conversions from unsigned sources. All of these go through
  // SIMD_IWRAPPER_1L, so the intrinsic only sees the 128-bit cast of a
  // (_mm256_castsi256_si128), not the whole 256-bit operand.
  151 SIMD_IWRAPPER_1L(cvtepu8_epi16);  // return (int16)a    (uint8 --> int16)
  152 SIMD_IWRAPPER_1L(cvtepu8_epi32);  // return (int32)a    (uint8 --> int32)
  153 SIMD_IWRAPPER_1L(cvtepu16_epi32); // return (int32)a    (uint16 --> int32)
  154 SIMD_IWRAPPER_1L(cvtepu16_epi64); // return (int64)a    (uint16 --> int64)
  155 SIMD_IWRAPPER_1L(cvtepu32_epi64); // return (int64)a    (uint32 --> int64)
  156 
  157 //-----------------------------------------------------------------------
  158 // Comparison operations
  159 //-----------------------------------------------------------------------
  // Element-wise integer comparisons; each wrapper forwards to the _mm256_ intrinsic of
  // the same name. NOTE(review): per Intel's documentation the result is a per-element
  // mask (all bits set where the predicate holds, zero elsewhere) and the cmpgt_epi*
  // forms compare as signed integers.
  160 SIMD_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
  161 SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
  162 SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
  163 SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
  164 SIMD_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
  165 SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
  166 SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
  167 SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
  168 
  // Less-than for int32 elements, expressed through cmpgt_epi32 with the operands
  // swapped (a < b is evaluated as b > a), so no separate less-than intrinsic is called.
  169 static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer const& a,
  170                                                Integer const& b) // return a < b (int32)
  171 {
  172     return cmpgt_epi32(b, a);
  173 }
  174 
  175 //-----------------------------------------------------------------------
  176 // Blend / shuffle / permute operations
  177 //-----------------------------------------------------------------------
  // blend_epi32<ImmT>(a, b) forwards to _mm256_blend_epi32(a, b, ImmT); the pack wrappers
  // forward both operands unchanged to the _mm256_ intrinsic of the same name.
  178 SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a  (int32)
  179 SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
  180 SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
  181 SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
  182 SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
  183 
  // Immediate-controlled float permute: permute_ps<ImmT>(a) forwards to
  // _mm256_permute_ps(a, ImmT). This is a separate overload from permute_ps(a, swiz)
  // further down, which takes its control from an Integer operand instead.
  184 template <int ImmT>
  185 static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
  186 {
  187     return _mm256_permute_ps(a, ImmT);
  188 }
  189 
  // permute_epi32(a, b) forwards to _mm256_permutevar8x32_epi32(a, b).
  // NOTE(review): by Intel's operand order a is the data and b the per-lane index,
  // i.e. the integer counterpart of permute_ps(a, swiz) - confirm at call sites.
  190 SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
  191 
  // Variable float permute: the control comes from the Integer operand swiz rather than
  // from a template immediate; both arguments go straight to _mm256_permutevar8x32_ps.
  192 static SIMDINLINE Float SIMDCALL
  193                         permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
  194 {
  195     return _mm256_permutevar8x32_ps(a, swiz);
  196 }
  197 
  198 SIMD_IWRAPPER_1I(shuffle_epi32); // shuffle_epi32<ImmT>(a) -> _mm256_shuffle_epi32(a, ImmT)
  // 64-bit integer shuffle built on the double-precision shuffle: a and b are passed
  // through castsi_pd, combined by shuffle_pd<ImmT>, and the result is passed through
  // castpd_si. No integer shuffle intrinsic is called directly. The cast and shuffle_pd
  // helpers are declared outside this file section.
  199 template <int ImmT>
  200 static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const& a, Integer const& b)
  201 {
  202     return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
  203 }
  // Byte shuffle and interleave wrappers; each forwards (a, b) unchanged to the _mm256_
  // intrinsic of the same name. See Intel's documentation of the respective intrinsic
  // for the exact element ordering.
  204 SIMD_IWRAPPER_2(shuffle_epi8);
  205 SIMD_IWRAPPER_2(unpackhi_epi16);
  206 SIMD_IWRAPPER_2(unpackhi_epi32);
  207 SIMD_IWRAPPER_2(unpackhi_epi64);
  208 SIMD_IWRAPPER_2(unpackhi_epi8);
  209 SIMD_IWRAPPER_2(unpacklo_epi16);
  210 SIMD_IWRAPPER_2(unpacklo_epi32);
  211 SIMD_IWRAPPER_2(unpacklo_epi64);
  212 SIMD_IWRAPPER_2(unpacklo_epi8);
  213 
  214 //-----------------------------------------------------------------------
  215 // Load / store operations
  216 //-----------------------------------------------------------------------
  // Gather of floats addressed by 32-bit indices. ScaleT (default ScaleFactor::SF_1) is
  // converted with static_cast<int> and passed as the scale argument of
  // _mm256_i32gather_ps; p and idx are forwarded unchanged.
  217 template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  218 static SIMDINLINE Float SIMDCALL
  219                         i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
  220 {
  221     return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));
  222 }
  223 
  224 #if _MSC_VER == 1920 // && _MSC_FULL_VER < [some_fixed_version]
  225 // Don't use _mm256_mask_i32gather_ps(), the compiler doesn't preserve the mask register
  226 // correctly in early versions of MSVC 2019
  //
  // Nothing is defined in this branch, so no AVX2 mask_i32gather_ps override exists for
  // that compiler; presumably the version inherited from the AVX (1) implementation is
  // used instead - confirm against that file.
  // NOTE(review): this guard tests == 1920 while the and_si guard earlier in this file
  // tests >= 1920; confirm the difference is intended.
  227 #else
  228 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
  //
  // old  - source of the result for elements whose mask sign bit is clear
  // p    - base pointer handed to the gather
  // idx  - per-element 32-bit indices, scaled by ScaleT (default ScaleFactor::SF_1)
  // mask - per-element selector; only bit 31 of each element is tested (see above)
  229 template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  230 static SIMDINLINE Float SIMDCALL
  231                         mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
  232 {
  233     // g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256()
  234     // Only for this intrinsic - not sure why. :(
  235     return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
  236 }
  237 #endif
  238 
  // Returns the result of _mm256_movemask_epi8(a) converted to uint32_t with an explicit
  // static_cast, so callers receive the mask as an unsigned value.
  // NOTE(review): per Intel's documentation the mask holds one bit per byte of a.
  239 static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a)
  240 {
  241     return static_cast<uint32_t>(_mm256_movemask_epi8(a));
  242 }
  243 
  244 //=======================================================================
  245 // Legacy interface (available only in SIMD256 width)
  246 //=======================================================================
  247 
  // Retire the wrapper-generator macros defined at the top of this file so that they do
  // not remain defined for whatever includes it. One #undef per macro name is sufficient.
  248 #undef SIMD_IWRAPPER_1
  249 #undef SIMD_IWRAPPER_1L
  250 #undef SIMD_IWRAPPER_1I
  251 #undef SIMD_IWRAPPER_1I_
  252 #undef SIMD_IWRAPPER_2_
  253 #undef SIMD_IWRAPPER_2
  254 #undef SIMD_IWRAPPER_2I