"Fossies" - the Fresh Open Source Software Archive

Member "pytorch-1.8.2/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4c1x4-dq-packedA-sse2.c" (23 Jul 2021, 18076 Bytes) of package /linux/misc/pytorch-1.8.2.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "8x4c1x4-dq-packedA-sse2.c" see the Fossies "Dox" file reference documentation.

/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <immintrin.h>

#include <qnnpack/q8gemm_sparse.h>
#include <requantization/runtime-sse2.h>

#include "8x4c1x4-packed-sse2.h"

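// CONVERT_TO_FP_AND_TRANSPOSE converts a 4x4 tile of int32 accumulators to
// float and transposes it with the standard 4x4 _mm_shuffle_ps pattern (the
// same dance as _MM_TRANSPOSE4_PS). Given rows
//   a = {a0, a1, a2, a3}, b = {b0, ...}, c = {c0, ...}, d = {d0, ...}
// the outputs are the columns
//   t_a = {a0, b0, c0, d0}, t_b = {a1, b1, c1, d1},
//   t_c = {a2, b2, c2, d2}, t_d = {a3, b3, c3, d3}
// In this kernel a..d are per-n accumulators (4 m values each), so t_a..t_d
// come out as per-m rows of 4 n values, ready to be stored to c. The macro
// expects a_ps, b_ps, c_ps, d_ps and tmp0..tmp3 to be declared by the caller.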
#define CONVERT_TO_FP_AND_TRANSPOSE(a, b, c, d, t_a, t_b, t_c, t_d)  \
  a_ps = _mm_cvtepi32_ps(a);                                         \
  b_ps = _mm_cvtepi32_ps(b);                                         \
  c_ps = _mm_cvtepi32_ps(c);                                         \
  d_ps = _mm_cvtepi32_ps(d);                                         \
  tmp0 = _mm_shuffle_ps(a_ps, b_ps, _MM_SHUFFLE(1, 0, 1, 0));        \
  tmp1 = _mm_shuffle_ps(a_ps, b_ps, _MM_SHUFFLE(3, 2, 3, 2));        \
  tmp2 = _mm_shuffle_ps(c_ps, d_ps, _MM_SHUFFLE(1, 0, 1, 0));        \
  tmp3 = _mm_shuffle_ps(c_ps, d_ps, _MM_SHUFFLE(3, 2, 3, 2));        \
  t_a = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2, 0, 2, 0));         \
  t_b = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(3, 1, 3, 1));         \
  t_c = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(2, 0, 2, 0));         \
  t_d = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(3, 1, 3, 1));

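// Dynamically quantized sparse GEMM micro-kernel: 8 rows (m) by 4 output
// channels (n) per call, operating on pre-packed activations and
// 1x4-block-sparse weights in a BCSR-like form. Parameter summary (inferred
// from the code below, not original QNNPACK documentation):
//   a_packed             - packed uint8 activations (layout described below)
//   packed_w             - nonzero 1x4 uint8 weight blocks, stored row by row
//   w_row_ptr            - per-n block offsets into packed_w / w_block_ids_ptr
//   w_block_ids_ptr      - column-block index of each nonzero weight block
//   b                    - float bias for the 4 output channels of this tile
//   c, c_stride          - float output tile and its row stride
//   output_channel_index - first output channel handled by this call; indexes
//                          the per-channel kernel zero points and multipliers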
void pytorch_q8gemm_dq_sparse_1x4_ukernel_8x4_packedA__sse2(
    size_t mr,
    size_t nr,
    const uint8_t* a_packed,
    const uint8_t* packed_w,
    const uint32_t* w_row_ptr,
    const uint32_t* w_block_ids_ptr,
    const float* b,
    float* c,
    size_t c_stride,
    size_t output_channel_index,
    const struct pytorch_qnnp_conv_dynamic_quantization_params
        quantization_params[RESTRICT_STATIC 1]) {

  const __m128i va_zero_point = _mm_set1_epi16(quantization_params->input_zero_point);
  const __m128 vbias = _mm_load_ps(b);
  const __m128i vzero = _mm_setzero_si128();

  // Packed A format.
  // A is packed in 8m x 4k blocks: all blocks for a given group of 8 rows (8m)
  // are placed in contiguous memory.
  // Original A
  // --------- K -----------          -- (K + 4 - 1) / 4 --
  // |                     |          |                   |
  // |                     |        (M + 8 - 1)/8         |
  // |                     | Packed   |                   |
  // M                     |  =>      |-------------------|
  // |                     |        Thus Packed A has (K + 4 - 1)/4 * (M + 8 - 1)/8 blocks
  // |                     |
  // |---------------------|
  //
  // Each 8 x 4 block is transposed and stored.
  // Each of the (K + 4 - 1)/4 blocks for a given group of 8 m rows
  // is stored adjacent in memory.
  // Thus, each block:
  // |----8m-----|----8m-----|
  // 4k          |           | .....
  // |-----------|-----------|
  // This locality helps in loading 8k x 8m blocks of activations.
  // Note that when M is not a multiple of 8, the remainder of packed A can
  // contain arbitrary data, as we will not be writing those values out.
  // This is taken care of by copying only the appropriate valid data.

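  // For example (assuming the constants from 8x4c1x4-packed-sse2.h are
  // MR = 8, COL_BLOCK_SIZE = 4 and PACKED_A_BLOCK_SIZE = MR * COL_BLOCK_SIZE),
  // the 8 m values of k-row kk (0..3) inside column block cb live at
  //   a_packed + cb * PACKED_A_BLOCK_SIZE + kk * MR
  // which is exactly how the _mm_loadl_epi64 loads below compute their
  // addresses.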
  __m128i vacc_low[4];
  __m128i vacc_high[4];
  const __m128 vmultiplier =
      _mm_loadu_ps(&quantization_params->multipliers[output_channel_index]);
  for (int32_t n = 0; n < nr; n++) {
    vacc_low[n] = _mm_setzero_si128();
    vacc_high[n] = _mm_setzero_si128();
    const int16_t b_zero_point =
      (int16_t)(uint16_t)quantization_params->kernel_zero_points[
      output_channel_index + n];

    int32_t num_blocks = w_row_ptr[n+1] - w_row_ptr[n];
    // Offset into compressed values.
    // w_row_ptr[n] is the block offset into the compressed values at which
    // the corresponding row of the weight matrix starts.
    const uint8_t* temp_packed_w = packed_w + w_row_ptr[n] * COL_BLOCK_SIZE;
    // Similarly, w_row_ptr[n] is also the block offset at which the
    // corresponding row's block column ids start.
    // Per row, the # of block column ids equals the # of value blocks.
    const uint32_t* temp_w_block_ids_ptr = w_block_ids_ptr + w_row_ptr[n];
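    // Example of the compressed layout (values made up for illustration):
    // with w_row_ptr = {0, 2, 3, 3, 5}, row n = 0 owns blocks 0..1, row n = 1
    // owns block 2, row n = 2 is empty, and row n = 3 owns blocks 3..4.
    // Block i contributes COL_BLOCK_SIZE consecutive uint8 weights at
    // packed_w + i * COL_BLOCK_SIZE, and w_block_ids_ptr[i] names the 1x4
    // column block of A those weights multiply.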
    while (num_blocks > 1) {
      // Load two 1x4 uint8 blocks = 8 weight values (2 ints' worth).
      const uint8_t* b_ptr = temp_packed_w;
      // This is not perf optimal since this will result in
      // register spills. We should probably work with an output block
      // of 1x4 instead of 1x8.
      // But we do it this way because this is mostly how we will
      // do it for ARM, and this reference code helps establish
      // the baseline for functional correctness.
      const int16_t b_0 = (int16_t)((uint16_t)(b_ptr[0]));
      const int16_t b_1 = (int16_t)((uint16_t)(b_ptr[1]));
      const int16_t b_2 = (int16_t)((uint16_t)(b_ptr[2]));
      const int16_t b_3 = (int16_t)((uint16_t)(b_ptr[3]));
      const int16_t b_4 = (int16_t)((uint16_t)(b_ptr[4]));
      const int16_t b_5 = (int16_t)((uint16_t)(b_ptr[5]));
      const int16_t b_6 = (int16_t)((uint16_t)(b_ptr[6]));
      const int16_t b_7 = (int16_t)((uint16_t)(b_ptr[7]));
      // Now we will load 8k x 1 (broadcast 8) weight values.
      const __m128i vxb0 = _mm_set1_epi16((b_0 - b_zero_point));
      const __m128i vxb1 = _mm_set1_epi16((b_1 - b_zero_point));
      const __m128i vxb2 = _mm_set1_epi16((b_2 - b_zero_point));
      const __m128i vxb3 = _mm_set1_epi16((b_3 - b_zero_point));
      const __m128i vxb4 = _mm_set1_epi16((b_4 - b_zero_point));
      const __m128i vxb5 = _mm_set1_epi16((b_5 - b_zero_point));
      const __m128i vxb6 = _mm_set1_epi16((b_6 - b_zero_point));
      const __m128i vxb7 = _mm_set1_epi16((b_7 - b_zero_point));

      // Load activation blocks. In this kernel we assume
      // the A matrix is already transposed: K x M.
      // 1. Load 8 1x8 registers = 8k x 8m

      // Load column id of the first 1x4 block
      int32_t col_block_id_0 = temp_w_block_ids_ptr[0];
      // Load column id of the second 1x4 block
      int32_t col_block_id_1 = temp_w_block_ids_ptr[1];
      const __m128i va0 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_0 * PACKED_A_BLOCK_SIZE + MR * 0));
      const __m128i va1 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_0 * PACKED_A_BLOCK_SIZE + MR * 1));
      const __m128i va2 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_0 * PACKED_A_BLOCK_SIZE + MR * 2));
      const __m128i va3 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_0 * PACKED_A_BLOCK_SIZE + MR * 3));
      const __m128i va4 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_1 * PACKED_A_BLOCK_SIZE + MR * 0));
      const __m128i va5 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_1 * PACKED_A_BLOCK_SIZE + MR * 1));
      const __m128i va6 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_1 * PACKED_A_BLOCK_SIZE + MR * 2));
      const __m128i va7 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_1 * PACKED_A_BLOCK_SIZE + MR * 3));

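      // Each va holds the 8 uint8 m values of one k row. Interleaving with
      // vzero widens them to 16 bits; sub_zero_point (from
      // requantization/runtime-sse2.h) then subtracts the activation zero
      // point so the products below are over signed 16-bit operands.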
      const __m128i vxa0 =
          sub_zero_point(_mm_unpacklo_epi8(va0, vzero), va_zero_point);
      const __m128i vxa1 =
          sub_zero_point(_mm_unpacklo_epi8(va1, vzero), va_zero_point);
      const __m128i vxa2 =
          sub_zero_point(_mm_unpacklo_epi8(va2, vzero), va_zero_point);
      const __m128i vxa3 =
          sub_zero_point(_mm_unpacklo_epi8(va3, vzero), va_zero_point);
      const __m128i vxa4 =
          sub_zero_point(_mm_unpacklo_epi8(va4, vzero), va_zero_point);
      const __m128i vxa5 =
          sub_zero_point(_mm_unpacklo_epi8(va5, vzero), va_zero_point);
      const __m128i vxa6 =
          sub_zero_point(_mm_unpacklo_epi8(va6, vzero), va_zero_point);
      const __m128i vxa7 =
          sub_zero_point(_mm_unpacklo_epi8(va7, vzero), va_zero_point);

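      // Each 16-bit lane holds one m value; the full 32-bit product for a
      // lane is rebuilt from its two halves:
      //   (int32_t)x * y == (uint16_t)_mm_mullo_epi16(x, y) | (_mm_mulhi_epi16(x, y) << 16)
      // Interleaving the low and high halves with unpacklo/unpackhi yields
      // the eight 32-bit products; lanes 0..3 (m = 0..3) accumulate into
      // vacc_low[n] and lanes 4..7 (m = 4..7) into vacc_high[n].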
      // acc += a0 * b0;
      __m128i vacc_low_16bits = _mm_mullo_epi16(vxa0, vxb0);
      __m128i vacc_high_16bits = _mm_mulhi_epi16(vxa0, vxb0);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a1 * b1;
      vacc_low_16bits = _mm_mullo_epi16(vxa1, vxb1);
      vacc_high_16bits = _mm_mulhi_epi16(vxa1, vxb1);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a2 * b2;
      vacc_low_16bits = _mm_mullo_epi16(vxa2, vxb2);
      vacc_high_16bits = _mm_mulhi_epi16(vxa2, vxb2);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a3 * b3;
      vacc_low_16bits = _mm_mullo_epi16(vxa3, vxb3);
      vacc_high_16bits = _mm_mulhi_epi16(vxa3, vxb3);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a4 * b4;
      vacc_low_16bits = _mm_mullo_epi16(vxa4, vxb4);
      vacc_high_16bits = _mm_mulhi_epi16(vxa4, vxb4);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a5 * b5;
      vacc_low_16bits = _mm_mullo_epi16(vxa5, vxb5);
      vacc_high_16bits = _mm_mulhi_epi16(vxa5, vxb5);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a6 * b6;
      vacc_low_16bits = _mm_mullo_epi16(vxa6, vxb6);
      vacc_high_16bits = _mm_mulhi_epi16(vxa6, vxb6);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a7 * b7;
      vacc_low_16bits = _mm_mullo_epi16(vxa7, vxb7);
      vacc_high_16bits = _mm_mulhi_epi16(vxa7, vxb7);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));

      // Now we have 1x8 m accumulated 32-bit values in vacc_low[n] (4) and vacc_high[n] (4).

      temp_packed_w = temp_packed_w + COL_BLOCK_SIZE * 2;
      temp_w_block_ids_ptr += 2;
      num_blocks -= 2;
    }
    if (num_blocks > 0) {
      // Load the one remaining 1x4 uint8 block = 4 weight values.
      const uint8_t* b_ptr = temp_packed_w;
      const int16_t b_0 = (int16_t)((uint16_t)(b_ptr[0]));
      const int16_t b_1 = (int16_t)((uint16_t)(b_ptr[1]));
      const int16_t b_2 = (int16_t)((uint16_t)(b_ptr[2]));
      const int16_t b_3 = (int16_t)((uint16_t)(b_ptr[3]));
      // Now we will load 4k x 1 (broadcast 4) weight values.
      const __m128i vxb0 = _mm_set1_epi16((b_0 - b_zero_point));
      const __m128i vxb1 = _mm_set1_epi16((b_1 - b_zero_point));
      const __m128i vxb2 = _mm_set1_epi16((b_2 - b_zero_point));
      const __m128i vxb3 = _mm_set1_epi16((b_3 - b_zero_point));

      // Then load the activation blocks.
      // 1. Load 4 1x8 registers = 4k x 8m
      // Thus we have 4x8 (4k x 8m) activations a0, a1, a2, a3,
      // each containing 8 m values.

      // Load column id of the 1x4 block
      int32_t col_block_id_0 = temp_w_block_ids_ptr[0];
      const __m128i va0 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_0 * PACKED_A_BLOCK_SIZE + MR * 0));
      const __m128i va1 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_0 * PACKED_A_BLOCK_SIZE + MR * 1));
      const __m128i va2 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_0 * PACKED_A_BLOCK_SIZE + MR * 2));
      const __m128i va3 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_0 * PACKED_A_BLOCK_SIZE + MR * 3));
      const __m128i vxa0 =
          sub_zero_point(_mm_unpacklo_epi8(va0, vzero), va_zero_point);
      const __m128i vxa1 =
          sub_zero_point(_mm_unpacklo_epi8(va1, vzero), va_zero_point);
      const __m128i vxa2 =
          sub_zero_point(_mm_unpacklo_epi8(va2, vzero), va_zero_point);
      const __m128i vxa3 =
          sub_zero_point(_mm_unpacklo_epi8(va3, vzero), va_zero_point);

      // acc += a0 * b0;
      __m128i vacc_low_16bits = _mm_mullo_epi16(vxa0, vxb0);
      __m128i vacc_high_16bits = _mm_mulhi_epi16(vxa0, vxb0);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a1 * b1;
      vacc_low_16bits = _mm_mullo_epi16(vxa1, vxb1);
      vacc_high_16bits = _mm_mulhi_epi16(vxa1, vxb1);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a2 * b2;
      vacc_low_16bits = _mm_mullo_epi16(vxa2, vxb2);
      vacc_high_16bits = _mm_mulhi_epi16(vxa2, vxb2);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a3 * b3;
      vacc_low_16bits = _mm_mullo_epi16(vxa3, vxb3);
      vacc_high_16bits = _mm_mulhi_epi16(vxa3, vxb3);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));

      // Now we have 1x8 m accumulated 32-bit values in vacc_low[n] (4) and vacc_high[n] (4).
    }
  }

  __m128 vout[8];
  __m128 a_ps, b_ps, c_ps, d_ps, tmp0, tmp1, tmp2, tmp3;

  // Transform the low half of the 4x8 result,
  // that is, the 4x4 block (4n x 4m):
  // convert to FP and transpose to 4m x 4n.
  CONVERT_TO_FP_AND_TRANSPOSE(vacc_low[0],
                              vacc_low[1],
                              vacc_low[2],
                              vacc_low[3],
                              vout[0],
                              vout[1],
                              vout[2],
                              vout[3])
  CONVERT_TO_FP_AND_TRANSPOSE(vacc_high[0],
                              vacc_high[1],
                              vacc_high[2],
                              vacc_high[3],
                              vout[4],
                              vout[5],
                              vout[6],
                              vout[7])

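  // Dequantize: each vout[m] now holds the 4 int32 accumulators of output
  // row m as floats. Scale by the per-output-channel multipliers (loaded
  // above from quantization_params->multipliers[output_channel_index]) and
  // add the float bias to produce the final float outputs:
  //   c[m][n] = (float)acc[m][n] * multiplier[n] + b[n]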
  vout[0] = _mm_mul_ps(vmultiplier, vout[0]);
  vout[1] = _mm_mul_ps(vmultiplier, vout[1]);
  vout[2] = _mm_mul_ps(vmultiplier, vout[2]);
  vout[3] = _mm_mul_ps(vmultiplier, vout[3]);
  vout[4] = _mm_mul_ps(vmultiplier, vout[4]);
  vout[5] = _mm_mul_ps(vmultiplier, vout[5]);
  vout[6] = _mm_mul_ps(vmultiplier, vout[6]);
  vout[7] = _mm_mul_ps(vmultiplier, vout[7]);

  vout[0] = _mm_add_ps(vout[0], vbias);
  vout[1] = _mm_add_ps(vout[1], vbias);
  vout[2] = _mm_add_ps(vout[2], vbias);
  vout[3] = _mm_add_ps(vout[3], vbias);
  vout[4] = _mm_add_ps(vout[4], vbias);
  vout[5] = _mm_add_ps(vout[5], vbias);
  vout[6] = _mm_add_ps(vout[6], vbias);
  vout[7] = _mm_add_ps(vout[7], vbias);

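  // Handle mr < 8 without branching in the store code below: any row pointer
  // beyond mr is aliased to c0 and its vout is duplicated from vout[0], so
  // the extra stores just rewrite row 0 with values it already has.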
  float* c0 = c;
  float* c1 = c0 + c_stride;
  if (mr < 2) {
    c1 = c0;
    vout[1] = vout[0];
  }
  float* c2 = c1 + c_stride;
  if (mr < 3) {
    c2 = c0;
    vout[2] = vout[0];
  }
  float* c3 = c2 + c_stride;
  if (mr < 4) {
    c3 = c0;
    vout[3] = vout[0];
  }
  float* c4 = c3 + c_stride;
  if (mr < 5) {
    c4 = c0;
    vout[4] = vout[0];
  }
  float* c5 = c4 + c_stride;
  if (mr < 6) {
    c5 = c0;
    vout[5] = vout[0];
  }
  float* c6 = c5 + c_stride;
  if (mr < 7) {
    c6 = c0;
    vout[6] = vout[0];
  }
  float* c7 = c6 + c_stride;
  if (mr < 8) {
    c7 = c0;
    vout[7] = vout[0];
  }

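  // Store the 4 output channels per row: all 4 at once when nr == 4,
  // otherwise 2 at a time with _mm_storel_pi and then a single scalar
  // store for an odd remainder (after shifting lane 2 down to lane 0).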
  if (nr == 4) {
    _mm_storeu_ps(c0, vout[0]);
    _mm_storeu_ps(c1, vout[1]);
    _mm_storeu_ps(c2, vout[2]);
    _mm_storeu_ps(c3, vout[3]);
    _mm_storeu_ps(c4, vout[4]);
    _mm_storeu_ps(c5, vout[5]);
    _mm_storeu_ps(c6, vout[6]);
    _mm_storeu_ps(c7, vout[7]);
  } else {
    if (nr >= 2) {
      _mm_storel_pi((__m64*)c0, vout[0]);
      _mm_storel_pi((__m64*)c1, vout[1]);
      _mm_storel_pi((__m64*)c2, vout[2]);
      _mm_storel_pi((__m64*)c3, vout[3]);
      _mm_storel_pi((__m64*)c4, vout[4]);
      _mm_storel_pi((__m64*)c5, vout[5]);
      _mm_storel_pi((__m64*)c6, vout[6]);
      _mm_storel_pi((__m64*)c7, vout[7]);

      nr -= 2;

      c0 += 2;
      c1 += 2;
      c2 += 2;
      c3 += 2;
      c4 += 2;
      c5 += 2;
      c6 += 2;
      c7 += 2;
      vout[0] = _mm_shuffle_ps(vout[0], vout[0], _MM_SHUFFLE(2, 2, 2, 2));
      vout[1] = _mm_shuffle_ps(vout[1], vout[1], _MM_SHUFFLE(2, 2, 2, 2));
      vout[2] = _mm_shuffle_ps(vout[2], vout[2], _MM_SHUFFLE(2, 2, 2, 2));
      vout[3] = _mm_shuffle_ps(vout[3], vout[3], _MM_SHUFFLE(2, 2, 2, 2));
      vout[4] = _mm_shuffle_ps(vout[4], vout[4], _MM_SHUFFLE(2, 2, 2, 2));
      vout[5] = _mm_shuffle_ps(vout[5], vout[5], _MM_SHUFFLE(2, 2, 2, 2));
      vout[6] = _mm_shuffle_ps(vout[6], vout[6], _MM_SHUFFLE(2, 2, 2, 2));
      vout[7] = _mm_shuffle_ps(vout[7], vout[7], _MM_SHUFFLE(2, 2, 2, 2));
    }
    if (nr != 0) {
      *c0 = _mm_cvtss_f32(vout[0]);
      *c1 = _mm_cvtss_f32(vout[1]);
      *c2 = _mm_cvtss_f32(vout[2]);
      *c3 = _mm_cvtss_f32(vout[3]);
      *c4 = _mm_cvtss_f32(vout[4]);
      *c5 = _mm_cvtss_f32(vout[5]);
      *c6 = _mm_cvtss_f32(vout[6]);
      *c7 = _mm_cvtss_f32(vout[7]);
    }
  }
}
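
// For reference, a minimal scalar sketch of what the kernel above computes.
// This is not part of QNNPACK; the helper name and loop structure are
// illustrative only, and it assumes the packed A / BCSR weight layout
// described in the comments above (MR, COL_BLOCK_SIZE, PACKED_A_BLOCK_SIZE
// from 8x4c1x4-packed-sse2.h).
static inline void scalar_reference_8x4c1x4_dq_sketch(
    size_t mr,
    size_t nr,
    const uint8_t* a_packed,
    const uint8_t* packed_w,
    const uint32_t* w_row_ptr,
    const uint32_t* w_block_ids_ptr,
    const float* b,
    float* c,
    size_t c_stride,
    size_t output_channel_index,
    const struct pytorch_qnnp_conv_dynamic_quantization_params* qp) {
  for (size_t n = 0; n < nr; n++) {
    const int32_t b_zp = (int32_t)qp->kernel_zero_points[output_channel_index + n];
    const float multiplier = qp->multipliers[output_channel_index + n];
    for (size_t m = 0; m < mr; m++) {
      int32_t acc = 0;
      // Walk the nonzero 1x4 blocks of weight row n.
      for (uint32_t bi = w_row_ptr[n]; bi < w_row_ptr[n + 1]; bi++) {
        const uint32_t cb = w_block_ids_ptr[bi];  // column block of A
        for (uint32_t kk = 0; kk < COL_BLOCK_SIZE; kk++) {
          const int32_t w_val = (int32_t)packed_w[bi * COL_BLOCK_SIZE + kk];
          const int32_t a_val =
              (int32_t)a_packed[cb * PACKED_A_BLOCK_SIZE + kk * MR + m];
          acc += (a_val - (int32_t)qp->input_zero_point) * (w_val - b_zp);
        }
      }
      // Dequantize and add bias, matching the vectorized epilogue above.
      c[m * c_stride + n] = (float)acc * multiplier + b[n];
    }
  }
}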