"Fossies" - the Fresh Open Source Software Archive

Member "pytorch-1.8.2/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/4x4-packA-aarch32-neon.S" (23 Jul 2021, 5931 Bytes) of package /linux/misc/pytorch-1.8.2.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) PowerPC Assembler source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here.

    1 /*
    2  * Copyright (c) Facebook, Inc. and its affiliates.
    3  * All rights reserved.
    4  *
    5  * This source code is licensed under the BSD-style license found in the
    6  * LICENSE file in the root directory of this source tree.
    7  */
    8 
    9 #include <qnnpack/assembly.h>
   10 #include <requantization/runtime-assembly.h>
   11 
   12 # r0 mr
   13 # r1 k
   14 # r2 a
   15 # r3 a_stride
   16 
   17 .syntax unified
   18 
   19 #  Args passed via stack.
   20 #  TOS
   21 #  |----------------|
   22 #  |packed_a        | 0
   23 #  |----------------|
   24 #  
   25 
    26 #  After pushing r4-r11 on the stack
    27 #  (this kernel loads no w pointer and uses no d8-d15 registers)
   28 #  |----------------|
   29 #  |r4 - r11        | 0 
   30 #  |packed_a        | 32
   31 #  |----------------|
   32 #  
   33 
   34 # Packed A format.
    35 # 4kx4m blocks: all blocks for a given group of 4 rows (4m) are placed in contiguous memory.
   36 # Original A
   37 # --------- K -----------          -- (K + 4 - 1) / 4 --
   38 # |                     |          |                   |
   39 # |                     |        (M + 4 - 1)/4         |
   40 # |                     | Packed   |                   |
   41 # M                     |  =>      |-------------------|
   42 # |                     |        Thus Packed A has (K + 4 - 1)/4 * (M + 4 -1)/4 blocks
   43 # |                     |
   44 # |---------------------|
   45 # 
    46 # Each 4 x 4 block is transposed and stored.
   47 # Each of the (K + 4 - 1)/4 blocks for a given group of 4 m blocks
   48 # are stored adjacent in memory
   49 # Thus, each block:
   50 # |----4m-----|----4m-----|
   51 # 4k          |           | ..... (K + 4 - 1)/4 blocks
   52 # |-----------|-----------|
   53 # This locality helps in loading 8kx4m blocks of activations
   54 # Note when M is not multiple of 4, the rest can contain arbitrary
   55 # data in packed A as we will not be writing those out.
    56 # This will be taken care of by copying only the appropriate valid data
   57 
   58 # void pytorch_q8gemm_sparse_packA_ukernel_4x4__aarch32_neon(
   59 #     size_t mr,
   60 #     size_t K,
   61 #     const uint8_t* a,
   62 #     size_t a_stride,
    63 #     uint8_t* packed_a)
    64 BEGIN_FUNCTION pytorch_q8gemm_sparse_packA_ukernel_4x4__aarch32_neon
    65     .arm
    66 #ifndef __APPLE__
    67     .arch armv7-a
    68     .fpu neon
    69 #endif

          # Save the 8 callee-saved registers we may touch (8 x 4 = 32 bytes);
          # this places the packed_a stack argument at [sp, 32].
    71     PUSH {r4, r5, r6, r7, r8, r9, r10, r11}

    73     # r4 = a0 = a pointer
    74     MOV r4, r2
    75     # r2 = packed_a pointer
    76     LDR r2, [sp, 32]

          # Clamp the row pointers: rows at or beyond mr alias the previous
          # valid row, so the loads below stay in bounds and the (ignored)
          # packed copies contain duplicated valid data.
    78     CMP r0, 2
    79     # r5 = a1
    80     ADD r5, r4, r3
          # if mr < 2: a1 = a0
    81     MOVLO r5, r4

    83     # r6 = a2
    84     ADD r6, r5, r3
          # if mr <= 2: a2 = a1
    85     MOVLS r6, r5

    87     CMP r0, 4
    88     # r7 = a3
    89     ADD r7, r6, r3
          # if mr != 4: a3 = a2
    90     MOVNE r7, r6

    92     # num_k_blocks = (k + (4 - 1)) / 4
    93     ADD r1, r1, 3
    94     LSR r1, r1, 2

          # Main loop consumes 2 k-blocks (8 columns) per iteration.
          # If fewer than 2 blocks remain, fall through to the tail.
    96     SUBS r1, r1, 2
    97     BLO 1f

    99     .p2align 5
   100 k_loop:
   101     VLD1.8 {d0}, [r4]!
   102     VLD1.8 {d1}, [r5]!
   103     VLD1.8 {d2}, [r6]!
   104     VLD1.8 {d3}, [r7]!

   106     #  Now we have 4x8 block of values that we will transpose
   107     #  A matrix
   108     #  --------------------------------
   109     #  |                              |
   110     #  |a0-----a3 a4-----a7....|
   111     #  |b0 B00 b3 b4 B01 b7....|
   112     #  |c0     c3 c4     c7....|
   113     #  |d0-----d3 d4-----d7....|
   114     #  |                              |
   115     #  |                              |
   116     #  -------------------------------
   117     #  {va01, va23} = B00 + B01 = 2 uint8x16_t
   118     #  Sequence:
   119     #  VTRN.8 d0, d1 // low(va01), high(va01)
   120     #  VTRN.8 d2, d3 // low(va23), high(va23)
   121     #  VTRN.16 q0, q1 // va01, va23
   122     #  Now we have
   123     #  d0 = d4, c4, b4, a4 : d0, c0, b0, a0
   124     #  d1 = d5, c5, b5, a5 : d1, c1, b1, a1
   125     #  d2 = d6, c6, b6, a6 : d2, c2, b2, a2
   126     #  d3 = d7, c7, b7, a7 : d3, c3, b3, a3
   127     #  Thus 2 4x4 blocks are transposed.
   128     #  Now we have all 2 B00, B01 transposed.

   130     VTRN.8 d0, d1
   131     VTRN.8 d2, d3
   132     VTRN.16 q0, q1

   134     # Now VTRN.32 d0, d1
   135     # Now VTRN.32 d2, d3
   136     # Thus we have
   137     #  d0 = d1, c1, b1, a1 : d0, c0, b0, a0
   138     #  d1 = d5, c5, b5, a5 : d4, c4, b4, a4
   139     #  d2 = d3, c3, b3, a3 : d2, c2, b2, a2
   140     #  d3 = d7, c7, b7, a7 : d6, c6, b6, a6
   141     #  Then we can do
   142     # VSWP d1, d2
   143     #  d0 = d1, c1, b1, a1 : d0, c0, b0, a0
   144     #  d1 = d3, c3, b3, a3 : d2, c2, b2, a2
   145     #  d2 = d5, c5, b5, a5 : d4, c4, b4, a4
   146     #  d3 = d7, c7, b7, a7 : d6, c6, b6, a6
   147     # Now we can store q0 contiguously followed
   148     VTRN.32 d0, d1
   149     VTRN.32 d2, d3
   150     VSWP d1, d2

   152     # Now store the transposed values
   153     # d0, d1, d2, d3
   154     VST1.8 {q0}, [r2]!
   155     VST1.8 {q1}, [r2]!

          # Two more k-blocks done; loop while at least 2 remain.
   157     SUBS r1, r1, 2

   159     BHS k_loop
   160 1:
          # Here r1 is -1 (one 4-column block left) or -2 (nothing left).
   161     CMP r1, -2
   162     BEQ 2f

          # VLD1.32 {dN[]} loads 4 bytes and duplicates them into both
          # 32-bit lanes of the d register (no pointer writeback needed).
   164     VLD1.32 {d0[]}, [r4]
   165     VLD1.32 {d1[]}, [r5]
   166     VLD1.32 {d2[]}, [r6]
   167     VLD1.32 {d3[]}, [r7]

   169     #  Now we have the last 4x4 block of values that we will transpose
   170     #  _d{0-3} are arm neon vector registers
   171     #  va0 = _d0 = a0 a1 a2 a3
   172     #  va1 = _d1 = b0 b1 b2 b3
   173     #  va2 = _d2 = c0 c1 c2 c3
   174     #  va3 = _d3 = d0 d1 d2 d3
   175     #  A matrix
   176     #  ----------------------------
   177     #  |                          |
   178     #  |                 a0-----a3|
   179     #  |                 b0 B00 b3|
   180     #  |   last block    c0     c3|
   181     #  |                 d0-----d3|
   182     #  |                          |
   183     #  |                          |
   184     #  ---------------------------
   185     #  Sequence:
   186     #  VTRN.8 d0, d1 // va0, va1
   187     #  VTRN.8 d2, d3 // va2, va3
   188     #  Now we have
   189     #  d0 = b2, a2, b0, a0
   190     #  d1 = b3, a3, b1, a1
   191     #  d2 = d2, c2, d0, c0
   192     #  d3 = d3, c3, d1, c1
   193     #  Sequence:
   194     #  VTRN.16 d0, d2
   195     #  VTRN.16 d1, d3
   196     #  Now we have
   197     #  d0 = d0, c0, b0, a0
   198     #  d1 = d1, c1, b1, a1
   199     #  d2 = d2, c2, b2, a2
   200     #  d3 = d3, c3, b3, a3

   202     VTRN.8 d0, d1
   203     VTRN.8 d2, d3
   204     VTRN.16 d0, d2
   205     VTRN.16 d1, d3

   207     # Since upper half of d0 just contains duplicate values
   208     # We don't want to store those
   209     # So let's combine upper half of d0 to the lower part of d0
   210     # And lower half of d1 to upper half of d0
   211     # Same for d2, d3
   212     VEXT.8 d0, d0, d1, #4
   213     VEXT.8 d1, d2, d3, #4

   215     # Now store the transposed values
   216     # d0, d1, d2, d3
   217     VST1.8 {q0}, [r2]
   218     .p2align 4
   219 2:
          # Restore callee-saved registers and return.
   220     POP {r4, r5, r6, r7, r8, r9, r10, r11}
   221     BX lr

   223 END_FUNCTION pytorch_q8gemm_sparse_packA_ukernel_4x4__aarch32_neon
  224 
  225 #ifdef __ELF__
  226 .section ".note.GNU-stack","",%progbits
  227 #endif