"Fossies" - the Fresh Open Source Software Archive

Member "pytorch-1.8.2/aten/src/ATen/native/quantized/cpu/qnnpack/src/sconv/6x8-psimd.c" (23 Jul 2021, 6122 Bytes) of package /linux/misc/pytorch-1.8.2.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "6x8-psimd.c", see the Fossies "Dox" file reference documentation.

    1 /*
    2  * Copyright (c) Facebook, Inc. and its affiliates.
    3  * All rights reserved.
    4  *
    5  * This source code is licensed under the BSD-style license found in the
    6  * LICENSE file in the root directory of this source tree.
    7  */
    8 
    9 #include <psimd.h>
   10 
   11 #include <qnnpack/sconv.h>
   12 
   13 void pytorch_sconv_ukernel_6x8__psimd(
   14     size_t mr,
   15     size_t nr,
   16     size_t kc,
   17     size_t ks,
   18     const float** restrict a,
   19     const float* restrict w,
   20     float* restrict c,
   21     size_t c_stride,
   22     const struct pytorch_qnnp_fp32_clamping_params
   23         clamping_params[restrict static 1]) {
   24   psimd_f32 vacc0x0123 = psimd_load_f32(w);
   25   w += 4;
   26   psimd_f32 vacc0x4567 = psimd_load_f32(w);
   27   w += 4;
   28   psimd_f32 vacc1x0123 = vacc0x0123;
   29   psimd_f32 vacc1x4567 = vacc0x4567;
   30   psimd_f32 vacc2x0123 = vacc0x0123;
   31   psimd_f32 vacc2x4567 = vacc0x4567;
   32   psimd_f32 vacc3x0123 = vacc0x0123;
   33   psimd_f32 vacc3x4567 = vacc0x4567;
   34   psimd_f32 vacc4x0123 = vacc0x0123;
   35   psimd_f32 vacc4x4567 = vacc0x4567;
   36   psimd_f32 vacc5x0123 = vacc0x0123;
   37   psimd_f32 vacc5x4567 = vacc0x4567;
   38 
   39   do {
   40     const float* restrict a0 = *a++;
   41     const float* restrict a1 = *a++;
   42     const float* restrict a2 = *a++;
   43     const float* restrict a3 = *a++;
   44     const float* restrict a4 = *a++;
   45     const float* restrict a5 = *a++;
   46 
   47     size_t k = kc;
   48     do {
   49       const psimd_f32 va0 = psimd_splat_f32(*a0);
   50       a0 += 1;
   51       const psimd_f32 va1 = psimd_splat_f32(*a1);
   52       a1 += 1;
   53       const psimd_f32 va2 = psimd_splat_f32(*a2);
   54       a2 += 1;
   55       const psimd_f32 va3 = psimd_splat_f32(*a3);
   56       a3 += 1;
   57       const psimd_f32 va4 = psimd_splat_f32(*a4);
   58       a4 += 1;
   59       const psimd_f32 va5 = psimd_splat_f32(*a5);
   60       a5 += 1;
   61 
   62       const psimd_f32 vb0123 = psimd_load_f32(w);
   63       w += 4;
   64       const psimd_f32 vb4567 = psimd_load_f32(w);
   65       w += 4;
   66 
   67       vacc0x0123 += vb0123 * va0;
   68       vacc0x4567 += vb4567 * va0;
   69       vacc1x0123 += vb0123 * va1;
   70       vacc1x4567 += vb4567 * va1;
   71       vacc2x0123 += vb0123 * va2;
   72       vacc2x4567 += vb4567 * va2;
   73       vacc3x0123 += vb0123 * va3;
   74       vacc3x4567 += vb4567 * va3;
   75       vacc4x0123 += vb0123 * va4;
   76       vacc4x4567 += vb4567 * va4;
   77       vacc5x0123 += vb0123 * va5;
   78       vacc5x4567 += vb4567 * va5;
   79     } while (--k != 0);
   80   } while (--ks != 0);
   81 
   82   const psimd_f32 vmax = psimd_splat_f32(clamping_params->max);
   83   vacc0x0123 = psimd_min_f32(vacc0x0123, vmax);
   84   vacc0x4567 = psimd_min_f32(vacc0x4567, vmax);
   85   vacc1x0123 = psimd_min_f32(vacc1x0123, vmax);
   86   vacc1x4567 = psimd_min_f32(vacc1x4567, vmax);
   87   vacc2x0123 = psimd_min_f32(vacc2x0123, vmax);
   88   vacc2x4567 = psimd_min_f32(vacc2x4567, vmax);
   89   vacc3x0123 = psimd_min_f32(vacc3x0123, vmax);
   90   vacc3x4567 = psimd_min_f32(vacc3x4567, vmax);
   91   vacc4x0123 = psimd_min_f32(vacc4x0123, vmax);
   92   vacc4x4567 = psimd_min_f32(vacc4x4567, vmax);
   93   vacc5x0123 = psimd_min_f32(vacc5x0123, vmax);
   94   vacc5x4567 = psimd_min_f32(vacc5x4567, vmax);
   95 
   96   const psimd_f32 vmin = psimd_splat_f32(clamping_params->min);
   97   vacc0x0123 = psimd_max_f32(vacc0x0123, vmin);
   98   vacc0x4567 = psimd_max_f32(vacc0x4567, vmin);
   99   vacc1x0123 = psimd_max_f32(vacc1x0123, vmin);
  100   vacc1x4567 = psimd_max_f32(vacc1x4567, vmin);
  101   vacc2x0123 = psimd_max_f32(vacc2x0123, vmin);
  102   vacc2x4567 = psimd_max_f32(vacc2x4567, vmin);
  103   vacc3x0123 = psimd_max_f32(vacc3x0123, vmin);
  104   vacc3x4567 = psimd_max_f32(vacc3x4567, vmin);
  105   vacc4x0123 = psimd_max_f32(vacc4x0123, vmin);
  106   vacc4x4567 = psimd_max_f32(vacc4x4567, vmin);
  107   vacc5x0123 = psimd_max_f32(vacc5x0123, vmin);
  108   vacc5x4567 = psimd_max_f32(vacc5x4567, vmin);
  109 
  110   float* c0 = c;
  111   float* c1 = (float*)((uintptr_t)c0 + c_stride);
  112   if (mr < 2) {
  113     c1 = c0;
  114   }
  115   float* c2 = (float*)((uintptr_t)c1 + c_stride);
  116   if (mr <= 2) {
  117     c2 = c1;
  118   }
  119   float* c3 = (float*)((uintptr_t)c2 + c_stride);
  120   if (mr < 4) {
  121     c3 = c2;
  122   }
  123   float* c4 = (float*)((uintptr_t)c3 + c_stride);
  124   if (mr <= 4) {
  125     c4 = c3;
  126   }
  127   float* c5 = (float*)((uintptr_t)c4 + c_stride);
  128   if (mr != 6) {
  129     c5 = c4;
  130   }
  131   if (nr == 8) {
  132     psimd_store_f32(c0, vacc0x0123);
  133     c0 += 4;
  134     psimd_store_f32(c1, vacc1x0123);
  135     c1 += 4;
  136     psimd_store_f32(c2, vacc2x0123);
  137     c2 += 4;
  138     psimd_store_f32(c3, vacc3x0123);
  139     c3 += 4;
  140     psimd_store_f32(c4, vacc4x0123);
  141     c4 += 4;
  142     psimd_store_f32(c5, vacc5x0123);
  143     c5 += 4;
  144 
  145     psimd_store_f32(c0, vacc0x4567);
  146     psimd_store_f32(c1, vacc1x4567);
  147     psimd_store_f32(c2, vacc2x4567);
  148     psimd_store_f32(c3, vacc3x4567);
  149     psimd_store_f32(c4, vacc4x4567);
  150     psimd_store_f32(c5, vacc5x4567);
  151   } else {
  152     if (nr >= 4) {
  153       psimd_store_f32(c0, vacc0x0123);
  154       c0 += 4;
  155       psimd_store_f32(c1, vacc1x0123);
  156       c1 += 4;
  157       psimd_store_f32(c2, vacc2x0123);
  158       c2 += 4;
  159       psimd_store_f32(c3, vacc3x0123);
  160       c3 += 4;
  161       psimd_store_f32(c4, vacc4x0123);
  162       c4 += 4;
  163       psimd_store_f32(c5, vacc5x0123);
  164       c5 += 4;
  165       vacc0x0123 = vacc0x4567;
  166       vacc1x0123 = vacc1x4567;
  167       vacc2x0123 = vacc2x4567;
  168       vacc3x0123 = vacc3x4567;
  169       vacc4x0123 = vacc4x4567;
  170       vacc5x0123 = vacc5x4567;
  171       nr -= 4;
  172     }
  173     if (nr >= 2) {
  174       psimd_store2_f32(c0, vacc0x0123);
  175       c0 += 2;
  176       psimd_store2_f32(c1, vacc1x0123);
  177       c1 += 2;
  178       psimd_store2_f32(c2, vacc2x0123);
  179       c2 += 2;
  180       psimd_store2_f32(c3, vacc3x0123);
  181       c3 += 2;
  182       psimd_store2_f32(c4, vacc4x0123);
  183       c4 += 2;
  184       psimd_store2_f32(c5, vacc5x0123);
  185       c5 += 2;
  186       vacc0x0123 = psimd_concat_hi_f32(vacc0x0123, vacc0x0123);
  187       vacc1x0123 = psimd_concat_hi_f32(vacc1x0123, vacc1x0123);
  188       vacc2x0123 = psimd_concat_hi_f32(vacc2x0123, vacc2x0123);
  189       vacc3x0123 = psimd_concat_hi_f32(vacc3x0123, vacc3x0123);
  190       vacc4x0123 = psimd_concat_hi_f32(vacc4x0123, vacc4x0123);
  191       vacc5x0123 = psimd_concat_hi_f32(vacc5x0123, vacc5x0123);
  192       nr -= 2;
  193     }
  194     if (nr != 0) {
  195       psimd_store1_f32(c0, vacc0x0123);
  196       psimd_store1_f32(c1, vacc1x0123);
  197       psimd_store1_f32(c2, vacc2x0123);
  198       psimd_store1_f32(c3, vacc3x0123);
  199       psimd_store1_f32(c4, vacc4x0123);
  200       psimd_store1_f32(c5, vacc5x0123);
  201     }
  202   }
  203 }