"Fossies" - the Fresh Open Source Software Archive

Member "pytorch-1.8.2/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4c1x4-dq-sse2.c" (23 Jul 2021, 18534 Bytes) of package /linux/misc/pytorch-1.8.2.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file.

/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <immintrin.h>

#include <qnnpack/q8gemm_sparse.h>
#include <requantization/runtime-sse2.h>

#define MR 8
#define COL_BLOCK_SIZE 4

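/*
 * Converts four vectors of 32-bit accumulators (a, b, c, d) to float and
 * transposes the resulting 4x4 block into t_a..t_d. Relies on the caller
 * declaring the temporaries a_ps, b_ps, c_ps, d_ps and tmp0..tmp3.
 */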
#define CONVERT_TO_FP_AND_TRANSPOSE(a, b, c, d, t_a, t_b, t_c, t_d)  \
  a_ps = _mm_cvtepi32_ps(a);                                         \
  b_ps = _mm_cvtepi32_ps(b);                                         \
  c_ps = _mm_cvtepi32_ps(c);                                         \
  d_ps = _mm_cvtepi32_ps(d);                                         \
  tmp0 = _mm_shuffle_ps(a_ps, b_ps, _MM_SHUFFLE(1, 0, 1, 0));        \
  tmp1 = _mm_shuffle_ps(a_ps, b_ps, _MM_SHUFFLE(3, 2, 3, 2));        \
  tmp2 = _mm_shuffle_ps(c_ps, d_ps, _MM_SHUFFLE(1, 0, 1, 0));        \
  tmp3 = _mm_shuffle_ps(c_ps, d_ps, _MM_SHUFFLE(3, 2, 3, 2));        \
  t_a = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2, 0, 2, 0));         \
  t_b = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(3, 1, 3, 1));         \
  t_c = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(2, 0, 2, 0));         \
  t_d = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(3, 1, 3, 1));

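/*
 * Gathers one activation byte from each of the eight row pointers a0..a7 at
 * offset col_block_id * COL_BLOCK_SIZE + index and packs them into the low
 * 8 bytes of an __m128i (a0 in byte 0, a7 in byte 7); the upper 8 bytes are
 * zero.
 */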
PYTORCH_QNNP_INLINE __m128i load_transposed_a_block(
    const uint8_t* a0,
    const uint8_t* a1,
    const uint8_t* a2,
    const uint8_t* a3,
    const uint8_t* a4,
    const uint8_t* a5,
    const uint8_t* a6,
    const uint8_t* a7,
    const uint32_t index,
    const uint32_t col_block_id) {
  return _mm_set_epi8(
                   0,
                   0,
                   0,
                   0,
                   0,
                   0,
                   0,
                   0,
                   *(a7 + col_block_id * COL_BLOCK_SIZE + index),
                   *(a6 + col_block_id * COL_BLOCK_SIZE + index),
                   *(a5 + col_block_id * COL_BLOCK_SIZE + index),
                   *(a4 + col_block_id * COL_BLOCK_SIZE + index),
                   *(a3 + col_block_id * COL_BLOCK_SIZE + index),
                   *(a2 + col_block_id * COL_BLOCK_SIZE + index),
                   *(a1 + col_block_id * COL_BLOCK_SIZE + index),
                   *(a0 + col_block_id * COL_BLOCK_SIZE + index)
                   );
}

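/*
 * Dynamically-quantized sparse GEMM micro-kernel (SSE2 reference version):
 * computes an mr x nr (mr <= 8, nr <= 4) float output tile from uint8
 * activations `a` and a block-sparse uint8 weight matrix stored in a
 * block-CSR-style layout (row pointers `w_row_ptr`, block column ids
 * `w_block_ids_ptr`, packed 1x4 value blocks `packed_w`). Activation and
 * per-channel kernel zero points are subtracted before accumulation; the
 * result is scaled by per-channel multipliers and offset by the float bias
 * `b`.
 */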
void pytorch_q8gemm_dq_sparse_1x4_ukernel_8x4__sse2(
    size_t mr,
    size_t nr,
    const uint8_t* a,
    size_t a_stride,
    const uint8_t* packed_w,
    const uint32_t* w_row_ptr,
    const uint32_t* w_block_ids_ptr,
    const float* b,
    float* c,
    size_t c_stride,
    size_t output_channel_index,
    const struct pytorch_qnnp_conv_dynamic_quantization_params
        quantization_params[RESTRICT_STATIC 1]) {

  const __m128i va_zero_point = _mm_set1_epi16(quantization_params->input_zero_point);
  const __m128 vbias = _mm_load_ps(b);
  const __m128i vzero = _mm_setzero_si128();

  const uint8_t* a0 = a;
  const uint8_t* a1 = a + a_stride;
  if (mr < 2) {
    a1 = a0;
  }
  const uint8_t* a2 = a1 + a_stride;
  if (mr < 3) {
    a2 = a0;
  }
  const uint8_t* a3 = a2 + a_stride;
  if (mr < 4) {
    a3 = a0;
  }
  const uint8_t* a4 = a3 + a_stride;
  if (mr < 5) {
    a4 = a0;
  }
  const uint8_t* a5 = a4 + a_stride;
  if (mr < 6) {
    a5 = a0;
  }
  const uint8_t* a6 = a5 + a_stride;
  if (mr < 7) {
    a6 = a0;
  }
  const uint8_t* a7 = a6 + a_stride;
  if (mr < 8) {
    a7 = a0;
  }

  __m128i vacc_low[4];
  __m128i vacc_high[4];
  const __m128 vmultiplier =
      _mm_loadu_ps(&quantization_params->multipliers[output_channel_index]);
  for (int32_t n = 0; n < nr; n++) {
    vacc_low[n] = _mm_setzero_si128();
    vacc_high[n] = _mm_setzero_si128();
    const int16_t b_zero_point =
      (int16_t)(uint16_t)quantization_params->kernel_zero_points[
      output_channel_index + n];
    int32_t num_blocks = w_row_ptr[n+1] - w_row_ptr[n];
    // Offset into the compressed values:
    // w_row_ptr[n] is the block offset at which the corresponding row of the
    // weight matrix starts in the compressed values.
    const uint8_t* temp_packed_w = packed_w + w_row_ptr[n] * COL_BLOCK_SIZE;
    // Similarly, w_row_ptr[n] is also the block offset at which the
    // corresponding row's block column ids start.
    // Per row, # of block column ids == # of value blocks.
    const uint32_t* temp_w_block_ids_ptr = w_block_ids_ptr + w_row_ptr[n];
    while (num_blocks > 1) {
      // Load two 1x4 uint8 blocks (8 bytes, i.e. 2 ints).
      const uint8_t* b_ptr = temp_packed_w;
      // This is not optimal for performance since it will result in register
      // spills. We should probably work with an output block of 1x4 instead
      // of 1x8, but it is done this way because this is largely how it will
      // be done for ARM, and this reference code helps establish the
      // baseline for functional correctness.
      const int16_t b_0 = (int16_t)((uint16_t)(b_ptr[0]));
      const int16_t b_1 = (int16_t)((uint16_t)(b_ptr[1]));
      const int16_t b_2 = (int16_t)((uint16_t)(b_ptr[2]));
      const int16_t b_3 = (int16_t)((uint16_t)(b_ptr[3]));
      const int16_t b_4 = (int16_t)((uint16_t)(b_ptr[4]));
      const int16_t b_5 = (int16_t)((uint16_t)(b_ptr[5]));
      const int16_t b_6 = (int16_t)((uint16_t)(b_ptr[6]));
      const int16_t b_7 = (int16_t)((uint16_t)(b_ptr[7]));
      // Now we load 8k x 1 weight values, each broadcast across all 8 lanes.
      const __m128i vxb0 = _mm_set1_epi16((b_0 - b_zero_point));
      const __m128i vxb1 = _mm_set1_epi16((b_1 - b_zero_point));
      const __m128i vxb2 = _mm_set1_epi16((b_2 - b_zero_point));
      const __m128i vxb3 = _mm_set1_epi16((b_3 - b_zero_point));
      const __m128i vxb4 = _mm_set1_epi16((b_4 - b_zero_point));
      const __m128i vxb5 = _mm_set1_epi16((b_5 - b_zero_point));
      const __m128i vxb6 = _mm_set1_epi16((b_6 - b_zero_point));
      const __m128i vxb7 = _mm_set1_epi16((b_7 - b_zero_point));

      // Load transposed activation blocks:
      // 1. Load 4 1x8 registers = 4k x 8m.
      // 2. Load 4 more 1x8 registers.
      // We thus have 8x8 (8k x 8m) activations in a0, a1, ..., a7,
      // each containing 8 m values.
      // The helper load_transposed_a_block loads a 1x8 block of transposed
      // activations by gathering one column of values into a 1x8 register.

      // Load column id of the first 1x4 block
      int32_t col_block_id_0 = temp_w_block_ids_ptr[0];
      // Load column id of the second 1x4 block
      int32_t col_block_id_1 = temp_w_block_ids_ptr[1];
      // NOTE: On the last 1x4 block, the value of k might be such that only
      // one valid value exists. In that case the corresponding loads from a
      // may access up to 3 bytes out of bounds.
      const __m128i va0 = load_transposed_a_block(
          a0, a1, a2, a3, a4, a5, a6, a7, 0, col_block_id_0);
      const __m128i va1 = load_transposed_a_block(
          a0, a1, a2, a3, a4, a5, a6, a7, 1, col_block_id_0);
      const __m128i va2 = load_transposed_a_block(
          a0, a1, a2, a3, a4, a5, a6, a7, 2, col_block_id_0);
      const __m128i va3 = load_transposed_a_block(
          a0, a1, a2, a3, a4, a5, a6, a7, 3, col_block_id_0);
      const __m128i va4 = load_transposed_a_block(
          a0, a1, a2, a3, a4, a5, a6, a7, 0, col_block_id_1);
      const __m128i va5 = load_transposed_a_block(
          a0, a1, a2, a3, a4, a5, a6, a7, 1, col_block_id_1);
      const __m128i va6 = load_transposed_a_block(
          a0, a1, a2, a3, a4, a5, a6, a7, 2, col_block_id_1);
      const __m128i va7 = load_transposed_a_block(
          a0, a1, a2, a3, a4, a5, a6, a7, 3, col_block_id_1);
      // 1.
      const __m128i vxa0 =
          sub_zero_point(_mm_unpacklo_epi8(va0, vzero), va_zero_point);
      const __m128i vxa1 =
          sub_zero_point(_mm_unpacklo_epi8(va1, vzero), va_zero_point);
      const __m128i vxa2 =
          sub_zero_point(_mm_unpacklo_epi8(va2, vzero), va_zero_point);
      const __m128i vxa3 =
          sub_zero_point(_mm_unpacklo_epi8(va3, vzero), va_zero_point);
      // 2.
      const __m128i vxa4 =
          sub_zero_point(_mm_unpacklo_epi8(va4, vzero), va_zero_point);
      const __m128i vxa5 =
          sub_zero_point(_mm_unpacklo_epi8(va5, vzero), va_zero_point);
      const __m128i vxa6 =
          sub_zero_point(_mm_unpacklo_epi8(va6, vzero), va_zero_point);
      const __m128i vxa7 =
          sub_zero_point(_mm_unpacklo_epi8(va7, vzero), va_zero_point);

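      // SSE2 has no 16x16 -> 32-bit multiply-accumulate, so each product is
      // built from _mm_mullo_epi16/_mm_mulhi_epi16 (low/high 16 bits of the
      // lane-wise products) and reassembled into 32-bit values with
      // _mm_unpacklo_epi16/_mm_unpackhi_epi16 before being added to the
      // accumulators.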
      // acc += a0 * b0;
      __m128i vacc_low_16bits = _mm_mullo_epi16(vxa0, vxb0);
      __m128i vacc_high_16bits = _mm_mulhi_epi16(vxa0, vxb0);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a1 * b1;
      vacc_low_16bits = _mm_mullo_epi16(vxa1, vxb1);
      vacc_high_16bits = _mm_mulhi_epi16(vxa1, vxb1);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a2 * b2;
      vacc_low_16bits = _mm_mullo_epi16(vxa2, vxb2);
      vacc_high_16bits = _mm_mulhi_epi16(vxa2, vxb2);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a3 * b3;
      vacc_low_16bits = _mm_mullo_epi16(vxa3, vxb3);
      vacc_high_16bits = _mm_mulhi_epi16(vxa3, vxb3);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a4 * b4;
      vacc_low_16bits = _mm_mullo_epi16(vxa4, vxb4);
      vacc_high_16bits = _mm_mulhi_epi16(vxa4, vxb4);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a5 * b5;
      vacc_low_16bits = _mm_mullo_epi16(vxa5, vxb5);
      vacc_high_16bits = _mm_mulhi_epi16(vxa5, vxb5);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a6 * b6;
      vacc_low_16bits = _mm_mullo_epi16(vxa6, vxb6);
      vacc_high_16bits = _mm_mulhi_epi16(vxa6, vxb6);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a7 * b7;
      vacc_low_16bits = _mm_mullo_epi16(vxa7, vxb7);
      vacc_high_16bits = _mm_mulhi_epi16(vxa7, vxb7);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));

      // Now we have 8 accumulated 32-bit values (one per m), split across
      // vacc_low[n] (4) and vacc_high[n] (4).

      temp_packed_w = temp_packed_w + COL_BLOCK_SIZE * 2;
      temp_w_block_ids_ptr += 2;
      num_blocks -= 2;
    }
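    // Remainder: if the row has an odd number of 1x4 blocks, one block is
    // left over; process it the same way, but with only four weight values.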
    if (num_blocks > 0) {
      // Load one 1x4 uint8 block (4 bytes, i.e. 1 int).
      const uint8_t* b_ptr = temp_packed_w;
      const int16_t b_0 = (int16_t)((uint16_t)(b_ptr[0]));
      const int16_t b_1 = (int16_t)((uint16_t)(b_ptr[1]));
      const int16_t b_2 = (int16_t)((uint16_t)(b_ptr[2]));
      const int16_t b_3 = (int16_t)((uint16_t)(b_ptr[3]));
      // Now we load 4k x 1 weight values, each broadcast across all 8 lanes.
      const __m128i vxb0 = _mm_set1_epi16((b_0 - b_zero_point));
      const __m128i vxb1 = _mm_set1_epi16((b_1 - b_zero_point));
      const __m128i vxb2 = _mm_set1_epi16((b_2 - b_zero_point));
      const __m128i vxb3 = _mm_set1_epi16((b_3 - b_zero_point));

      // Then load transposed activation blocks:
      // 1. Load 4 1x8 registers = 4k x 8m.
      // We thus have 4x8 (4k x 8m) activations in a0, a1, a2, a3,
      // each containing 8 m values.

      // Load column id of the first 1x4 block
      int32_t col_block_id_0 = temp_w_block_ids_ptr[0];
      const __m128i va0 = load_transposed_a_block(a0, a1, a2, a3, a4, a5, a6, a7, 0, col_block_id_0);
      const __m128i va1 = load_transposed_a_block(a0, a1, a2, a3, a4, a5, a6, a7, 1, col_block_id_0);
      const __m128i va2 = load_transposed_a_block(a0, a1, a2, a3, a4, a5, a6, a7, 2, col_block_id_0);
      const __m128i va3 = load_transposed_a_block(a0, a1, a2, a3, a4, a5, a6, a7, 3, col_block_id_0);
      const __m128i vxa0 =
          sub_zero_point(_mm_unpacklo_epi8(va0, vzero), va_zero_point);
      const __m128i vxa1 =
          sub_zero_point(_mm_unpacklo_epi8(va1, vzero), va_zero_point);
      const __m128i vxa2 =
          sub_zero_point(_mm_unpacklo_epi8(va2, vzero), va_zero_point);
      const __m128i vxa3 =
          sub_zero_point(_mm_unpacklo_epi8(va3, vzero), va_zero_point);

      // acc += a0 * b0;
      __m128i vacc_low_16bits = _mm_mullo_epi16(vxa0, vxb0);
      __m128i vacc_high_16bits = _mm_mulhi_epi16(vxa0, vxb0);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a1 * b1;
      vacc_low_16bits = _mm_mullo_epi16(vxa1, vxb1);
      vacc_high_16bits = _mm_mulhi_epi16(vxa1, vxb1);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a2 * b2;
      vacc_low_16bits = _mm_mullo_epi16(vxa2, vxb2);
      vacc_high_16bits = _mm_mulhi_epi16(vxa2, vxb2);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a3 * b3;
      vacc_low_16bits = _mm_mullo_epi16(vxa3, vxb3);
      vacc_high_16bits = _mm_mulhi_epi16(vxa3, vxb3);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));

      // Now we have 8 accumulated 32-bit values (one per m), split across
      // vacc_low[n] (4) and vacc_high[n] (4).
    }
  }

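  // Dequantize: convert the int32 accumulators to float, transpose the
  // 4n x 8m result to 8m x 4n, scale by the per-channel multipliers, and add
  // the bias.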
  __m128 vout[8];
  __m128 a_ps, b_ps, c_ps, d_ps, tmp0, tmp1, tmp2, tmp3;

  // Transform the low half of the 4x8 result,
  // that is, the 4x4 block (4n x 4m):
  // convert to FP and transpose to 4m x 4n.
  CONVERT_TO_FP_AND_TRANSPOSE(vacc_low[0],
                              vacc_low[1],
                              vacc_low[2],
                              vacc_low[3],
                              vout[0],
                              vout[1],
                              vout[2],
                              vout[3])
  CONVERT_TO_FP_AND_TRANSPOSE(vacc_high[0],
                              vacc_high[1],
                              vacc_high[2],
                              vacc_high[3],
                              vout[4],
                              vout[5],
                              vout[6],
                              vout[7])

  vout[0] = _mm_mul_ps(vmultiplier, vout[0]);
  vout[1] = _mm_mul_ps(vmultiplier, vout[1]);
  vout[2] = _mm_mul_ps(vmultiplier, vout[2]);
  vout[3] = _mm_mul_ps(vmultiplier, vout[3]);
  vout[4] = _mm_mul_ps(vmultiplier, vout[4]);
  vout[5] = _mm_mul_ps(vmultiplier, vout[5]);
  vout[6] = _mm_mul_ps(vmultiplier, vout[6]);
  vout[7] = _mm_mul_ps(vmultiplier, vout[7]);

  vout[0] = _mm_add_ps(vout[0], vbias);
  vout[1] = _mm_add_ps(vout[1], vbias);
  vout[2] = _mm_add_ps(vout[2], vbias);
  vout[3] = _mm_add_ps(vout[3], vbias);
  vout[4] = _mm_add_ps(vout[4], vbias);
  vout[5] = _mm_add_ps(vout[5], vbias);
  vout[6] = _mm_add_ps(vout[6], vbias);
  vout[7] = _mm_add_ps(vout[7], vbias);

  float* c0 = c;
  float* c1 = c0 + c_stride;
  if (mr < 2) {
    c1 = c0;
  }
  float* c2 = c1 + c_stride;
  if (mr < 3) {
    c2 = c0;
  }
  float* c3 = c2 + c_stride;
  if (mr < 4) {
    c3 = c0;
  }
  float* c4 = c3 + c_stride;
  if (mr < 5) {
    c4 = c0;
  }
  float* c5 = c4 + c_stride;
  if (mr < 6) {
    c5 = c0;
  }
  float* c6 = c5 + c_stride;
  if (mr < 7) {
    c6 = c0;
  }
  float* c7 = c6 + c_stride;
  if (mr < 8) {
    c7 = c0;
  }

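  // Store the 8 x nr float output tile. For nr == 4, store full vectors;
  // otherwise write a 2-wide chunk (if nr >= 2) followed by a single value
  // (if one column remains).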
  if (nr == 4) {
    _mm_storeu_ps(c0, vout[0]);
    _mm_storeu_ps(c1, vout[1]);
    _mm_storeu_ps(c2, vout[2]);
    _mm_storeu_ps(c3, vout[3]);
    _mm_storeu_ps(c4, vout[4]);
    _mm_storeu_ps(c5, vout[5]);
    _mm_storeu_ps(c6, vout[6]);
    _mm_storeu_ps(c7, vout[7]);
  } else {
    if (nr >= 2) {
      _mm_storel_pi((__m64*)c0, vout[0]);
      _mm_storel_pi((__m64*)c1, vout[1]);
      _mm_storel_pi((__m64*)c2, vout[2]);
      _mm_storel_pi((__m64*)c3, vout[3]);
      _mm_storel_pi((__m64*)c4, vout[4]);
      _mm_storel_pi((__m64*)c5, vout[5]);
      _mm_storel_pi((__m64*)c6, vout[6]);
      _mm_storel_pi((__m64*)c7, vout[7]);

      nr -= 2;

      c0 += 2;
      c1 += 2;
      c2 += 2;
      c3 += 2;
      c4 += 2;
      c5 += 2;
      c6 += 2;
      c7 += 2;
      vout[0] = _mm_shuffle_ps(vout[0], vout[0], _MM_SHUFFLE(2, 2, 2, 2));
      vout[1] = _mm_shuffle_ps(vout[1], vout[1], _MM_SHUFFLE(2, 2, 2, 2));
      vout[2] = _mm_shuffle_ps(vout[2], vout[2], _MM_SHUFFLE(2, 2, 2, 2));
      vout[3] = _mm_shuffle_ps(vout[3], vout[3], _MM_SHUFFLE(2, 2, 2, 2));
      vout[4] = _mm_shuffle_ps(vout[4], vout[4], _MM_SHUFFLE(2, 2, 2, 2));
      vout[5] = _mm_shuffle_ps(vout[5], vout[5], _MM_SHUFFLE(2, 2, 2, 2));
      vout[6] = _mm_shuffle_ps(vout[6], vout[6], _MM_SHUFFLE(2, 2, 2, 2));
      vout[7] = _mm_shuffle_ps(vout[7], vout[7], _MM_SHUFFLE(2, 2, 2, 2));
    }
    if (nr != 0) {
      *c0 = _mm_cvtss_f32(vout[0]);
      *c1 = _mm_cvtss_f32(vout[1]);
      *c2 = _mm_cvtss_f32(vout[2]);
      *c3 = _mm_cvtss_f32(vout[3]);
      *c4 = _mm_cvtss_f32(vout[4]);
      *c5 = _mm_cvtss_f32(vout[5]);
      *c6 = _mm_cvtss_f32(vout[6]);
      *c7 = _mm_cvtss_f32(vout[7]);
    }
  }
}