"Fossies" - the Fresh Open Source Software Archive

Member "pytorch-1.8.2/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4c1x4-dq-aarch32-neon.S" (23 Jul 2021, 16672 Bytes) of package /linux/misc/pytorch-1.8.2.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) PowerPC Assembler source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file.

/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <qnnpack/assembly.h>
#include <requantization/runtime-assembly.h>

# r0 mr
# r1 nr
# r2 a
# r3 a_stride

# d16 a_zero_point
# d17 b_zero_point

## Stack
# 4     a_stride
# 4     packed_w
# 4     w_row_ptr
# 4     w_block_ids_ptr
# 4     b
# 4     c
# 4     c_stride
# 4     output channel index
# 4     quantization_params
# --
# 36 bytes of parameters on stack
# 36    r4-r11,lr
# 64    d8-d15

.syntax unified

#  Args passed via stack.
#  TOS
#  |----------------|
#  |packed_w        | 0
#  |w_row_ptr       | 4
#  |w_block_ids_ptr | 8
#  |b               | 12
#  |c               | 16
#  |c_stride        | 20
#  |out ch indx     | 24
#  |params          | 28
#  |----------------|
#

#  After loading the w pointer into the ip reg,
#  and after pushing r4-r11,lr and d8-d15 on the stack:
#  |----------------|
#  |d8 - d15        | 0
#  |r4 - r11,lr     | 64
#  |packed_w        | 100
#  |w_row_ptr       | 104
#  |w_block_ids_ptr | 108
#  |b               | 112
#  |c               | 116
#  |c_stride        | 120
#  |out ch indx     | 124
#  |params          | 128
#  |----------------|
#

# void pytorch_q8gemm_dq_sparse_1x4_ukernel_8x4__aarch32_neon(
#     size_t mr,
#     size_t nr,
#     const uint8_t* a,
#     size_t a_stride,
#     const uint8_t* packed_w,
#     const uint32_t* w_row_ptr,
#     const uint32_t* w_block_ids_ptr,
#     const float* b,
#     uint8_t* restrict c,
#     size_t c_stride,
#     size_t output_channel_index,
#     const union pytorch_qnnp_conv_dynamic_quantization_params quantization_params[restrict static 1])
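#
# Rough C sketch of what this kernel computes, for orientation only (not part
# of the build; `acc`, `col` and `scales` are names made up here, and the
# per-channel kernel zero points / requant scales are the fields read out of
# quantization_params below):
#
#   for (uint32_t n = 0; n < nr; n++) {
#     for (uint32_t m = 0; m < mr; m++) {
#       int32_t acc = 0;
#       for (uint32_t k = w_row_ptr[n]; k < w_row_ptr[n + 1]; k++) {
#         const uint32_t col = w_block_ids_ptr[k] * 4;
#         for (uint32_t j = 0; j < 4; j++) {
#           acc += (int32_t)(a[m * a_stride + col + j] - a_zero_point) *
#                  (int32_t)(packed_w[k * 4 + j] -
#                            b_zero_point[output_channel_index + n]);
#         }
#       }
#       ((float*)c)[m * c_stride + n] =
#           b[n] + (float)acc * scales[output_channel_index + n];
#     }
#   }
#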
BEGIN_FUNCTION pytorch_q8gemm_dq_sparse_1x4_ukernel_8x4__aarch32_neon
    .arm
#ifndef __APPLE__
    .arch armv7-a
    .fpu neon
#endif

    PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr}
    VPUSH {d8-d15}

    # Store mr and nr on the stack to reload later, since we will not
    # need them once all 8 a pointers are initialized.
    # Furthermore, we need all the registers we can get.
    STR r0, [sp, #-4]
    STR r1, [sp, #-8]
    # Load output channel index
    LDR r5, [sp, 124]
    # Load quantization params
    # - r7 = quantization_params
    LDR r7, [sp, 128]
    # Load input_zero_point
    VLD1.8 {d16[]}, [r7]
    ADD r7, r7, 4
    # Load pointer to per channel zero points array
    LDR r4, [r7]
    # Add output_channel_index to the b_zero_point pointer
    ADD r4, r4, r5
    # Store pointer to zero point on stack
    STR r4, [sp, #-12]

    # Copy mr to r9
    MOV r9, r0
    # Stride to r10
    MOV r10, r3
    # r0 = a0 = a pointer
    MOV r0, r2
    # Use r8 to store nr
    # and r1 to store a1
    MOV r8, r1

    CMP r9, 2
    # r1 = a1
    ADD r1, r0, r10
    MOVLO r1, r0

    # r2 = a2
    ADD r2, r1, r10
    MOVLS r2, r1

    CMP r9, 4
    # r3 = a3
    ADD r3, r2, r10
    MOVLO r3, r2

    # r4 = a4
    ADD r4, r3, r10
    MOVLS r4, r3

    CMP r9, 6
    # r5 = a5
    ADD r5, r4, r10
    MOVLO r5, r4

    # r6 = a6
    ADD r6, r5, r10
    MOVLS r6, r5

    CMP r9, 8
    # r7 = a7
    ADD r7, r6, r10
    MOVNE r7, r6
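
    # In C-like terms, the pointer clamping above is roughly (illustrative
    # sketch only):
    #   a1 = (mr < 2)  ? a0 : a0 + a_stride;
    #   a2 = (mr <= 2) ? a1 : a1 + a_stride;
    #   a3 = (mr < 4)  ? a2 : a2 + a_stride;
    #   a4 = (mr <= 4) ? a3 : a3 + a_stride;
    #   a5 = (mr < 6)  ? a4 : a4 + a_stride;
    #   a6 = (mr <= 6) ? a5 : a5 + a_stride;
    #   a7 = (mr != 8) ? a6 : a6 + a_stride;
    # so rows past mr simply re-read the last valid row instead of reading
    # out of bounds.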

    # We enter the loop if r8 is at least 1.
    # r8 = r8 - 1 will happen in the epilogue
    # of the loop.
    CMP r8, 1
    BLO 7f

    .p2align 5
0:
    # Load pointer to weight zero point
    LDR r9, [sp, #-12]
    # Load w_row_ptr + n
    LDR r11, [sp, 104]
    VEOR q10, q10, q10
    VLD1.8 {d17[]}, [r9]!
    # Store updated pointer to weight zero point on stack
    STR r9, [sp, #-12]
    # ip = w_row_ptr[n], lr = w_row_ptr[n+1]
    # r11 = r11 + 4 to point to next n
    LDR ip, [r11], #4
    LDR lr, [r11]
    VEOR q11, q11, q11
    STR r11, [sp, 104]
    # r9 = packed_w
    LDR r9, [sp, 100]
    # r10 = blocks_id_ptr
    LDR r10, [sp, 108]
    # r9 = temp_packed_w = packed_w + w_row_ptr[n] * 4
    # This points to the first block of nonzero values
    # for the nth row.
    ADD r9, r9, ip, LSL #2
    # r10 = temp_w_block_ids_ptr = w_block_ids_ptr + w_row_ptr[n]
    # LSL #2 because each element is 4 bytes
    # This points to the block id of the first block.
    # It should contain lr - ip (= w_row_ptr[n+1] - w_row_ptr[n]) block ids.
    ADD r10, r10, ip, LSL #2
    # r11 = num_blocks that need to be processed
    SUB r11, lr, ip
    SUBS r11, r11, 2
    BLO 1f
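
    # Hedged C-style view of the loop control here (illustrative only):
    #   int32_t remaining = (int32_t)(w_row_ptr[n + 1] - w_row_ptr[n]) - 2;
    #   for (; remaining >= 0; remaining -= 2) { /* k_loop: two 1x4 blocks */ }
    #   if (remaining == -1)                    { /* label 1: one last block */ }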

k_loop:
    # Load 2 non zero blocks of weights. Each block = 1x4.
    VLD1.8 {d19}, [r9]!
    # q9 = vxb
    VSUBL.U8 q9, d19, d17

    # ip = block_id_ptr[0]
    # lr = block_id_ptr[1]
    LDR ip, [r10], #4
    LDR lr, [r10], #4

    # Add offset to r0
    # Shift by 2 because each block id corresponds to 4 uint8_t elements
    ADD r0, r0, ip, LSL #2
    ADD r1, r1, ip, LSL #2
    ADD r2, r2, ip, LSL #2
    ADD r3, r3, ip, LSL #2
    ADD r4, r4, ip, LSL #2
    ADD r5, r5, ip, LSL #2
    ADD r6, r6, ip, LSL #2
    ADD r7, r7, ip, LSL #2

    VLD1.32 {d0[]}, [r0]
    VLD1.32 {d2[]}, [r1]
    VLD1.32 {d4[]}, [r2]
    VLD1.32 {d6[]}, [r3]
    VLD1.32 {d8[]}, [r4]
    VLD1.32 {d10[]}, [r5]
    VLD1.32 {d12[]}, [r6]
    VLD1.32 {d14[]}, [r7]

    SUB ip, lr, ip
    ADD r0, r0, ip, LSL #2
    ADD r1, r1, ip, LSL #2
    ADD r2, r2, ip, LSL #2
    ADD r3, r3, ip, LSL #2
    ADD r4, r4, ip, LSL #2
    ADD r5, r5, ip, LSL #2
    ADD r6, r6, ip, LSL #2
    ADD r7, r7, ip, LSL #2

    VLD1.32 {d1[]}, [r0]
    VLD1.32 {d3[]}, [r1]
    VLD1.32 {d5[]}, [r2]
    VLD1.32 {d7[]}, [r3]
    VLD1.32 {d9[]}, [r4]
    VLD1.32 {d11[]}, [r5]
    VLD1.32 {d13[]}, [r6]
    VLD1.32 {d15[]}, [r7]

    SUB r0, r0, lr, LSL #2
    SUB r1, r1, lr, LSL #2
    SUB r2, r2, lr, LSL #2
    SUB r3, r3, lr, LSL #2
    SUB r4, r4, lr, LSL #2
    SUB r5, r5, lr, LSL #2
    SUB r6, r6, lr, LSL #2
    SUB r7, r7, lr, LSL #2

    VEXT.8 d0, d0, d1, #4
    VEXT.8 d1, d2, d3, #4
    VEXT.8 d2, d4, d5, #4
    VTRN.8 d0, d1
    VEXT.8 d3, d6, d7, #4
    VEXT.8 d4, d8, d9, #4
    VTRN.8 d2, d3
    VEXT.8 d5, d10, d11, #4
    VTRN.16 q0, q1
    VTRN.8 d4, d5
    VEXT.8 d6, d12, d13, #4
    VEXT.8 d7, d14, d15, #4

    VTRN.8 d6, d7
    VTRN.16 q2, q3

    VTRN.32 q0, q2
    VTRN.32 q1, q3
    # The add/sub offset pairs around the vector loads above are needed
    # because there is no vector load instruction with a pre-indexed
    # register offset, so each pointer is advanced before its load and
    # rewound afterwards.
    #
    # After the VEXTs (before the interleaved VTRNs) the layout was:
    # d0, d1 = q0 = va0, va1
    # d2, d3 = q1 = va2, va3
    # d4, d5 = q2 = va4, va5
    # d6, d7 = q3 = va6, va7
    # The VTRN.8/VTRN.16/VTRN.32 above transpose this 8x8 byte tile.

    VSUBL.U8 q4, d0, d16  // vxa0_t
    VSUBL.U8 q5, d1, d16  // vxa1_t
    VSUBL.U8 q6, d2, d16  // vxa2_t
    VSUBL.U8 q7, d3, d16  // vxa3_t
    VSUBL.U8 q12, d4, d16  // vxa4_t
    VSUBL.U8 q13, d5, d16  // vxa5_t
    VSUBL.U8 q14, d6, d16  // vxa6_t
    VSUBL.U8 q15, d7, d16  // vxa7_t
    # This setup, without the VMOVs, is a perfect setup for
    # double buffering + transpose.
    # The transposed result is stored in q0, q1, q2, q3.
    # vxa* are in q4-q7 and q12-q15.
    # Thus q0-q3 are free to hold the next iteration's
    # activations + transpose.
    # We will do this as a later optimization.

    VMOV q0, q9

    VMLAL.S16 q10, d8, d0[0]
    VMLAL.S16 q11, d9, d0[0]
    VMLAL.S16 q10, d10, d0[1]
    VMLAL.S16 q11, d11, d0[1]
    VMLAL.S16 q10, d12, d0[2]
    VMLAL.S16 q11, d13, d0[2]
    VMLAL.S16 q10, d14, d0[3]
    VMLAL.S16 q11, d15, d0[3]
    VMLAL.S16 q10, d24, d1[0]
    VMLAL.S16 q11, d25, d1[0]
    VMLAL.S16 q10, d26, d1[1]
    VMLAL.S16 q11, d27, d1[1]
    VMLAL.S16 q10, d28, d1[2]
    VMLAL.S16 q11, d29, d1[2]
    VMLAL.S16 q10, d30, d1[3]
    VMLAL.S16 q11, d31, d1[3]

    SUBS r11, r11, 2

    BHS k_loop
1:
    CMP r11, -2
    BEQ 2f

    # Load the last nonzero block.
    # For this we load four 8-bit values as one 32-bit value.
    VLD1.32 {d19[]}, [r9]!
    # q9 = vxb
    VSUBL.U8 q9, d19, d17

    # ip = block_id_ptr[0]
    LDR ip, [r10]

    # Add offset to r0-r7
    # Shift by 2 because each block id corresponds to 4 uint8_t elements
    ADD r0, r0, ip, LSL #2
    ADD r4, r4, ip, LSL #2
    ADD r1, r1, ip, LSL #2
    ADD r5, r5, ip, LSL #2
    ADD r2, r2, ip, LSL #2
    ADD r6, r6, ip, LSL #2
    ADD r3, r3, ip, LSL #2
    ADD r7, r7, ip, LSL #2

    VLD1.32 {d0[]}, [r0]
    VLD1.32 {d1[]}, [r4]
    VLD1.32 {d2[]}, [r1]
    VLD1.32 {d3[]}, [r5]
    VLD1.32 {d4[]}, [r2]
    VLD1.32 {d5[]}, [r6]
    VLD1.32 {d6[]}, [r3]
    VLD1.32 {d7[]}, [r7]

    SUB r0, r0, ip, LSL #2
    SUB r1, r1, ip, LSL #2
    SUB r5, r5, ip, LSL #2
    SUB r4, r4, ip, LSL #2
    SUB r2, r2, ip, LSL #2
    SUB r6, r6, ip, LSL #2
    SUB r3, r3, ip, LSL #2
    SUB r7, r7, ip, LSL #2

    VEXT.8 d0, d0, d1, #4
    # d1 = va1, va5
    VEXT.8 d1, d2, d3, #4
    # d2 = va2, va6
    VEXT.8 d2, d4, d5, #4
    # d3 = va3, va7
    VEXT.8 d3, d6, d7, #4

    # d0 = va0, va4
    # d1 = va1, va5
    # d2 = va2, va6
    # d3 = va3, va7
    # Now transpose
    VTRN.8 d0, d1
    VTRN.8 d2, d3
    VTRN.16 d0, d2
    VTRN.16 d1, d3

    VSUBL.U8 q4, d0, d16  // vxa04_t
    VSUBL.U8 q5, d1, d16  // vxa15_t
    VSUBL.U8 q6, d2, d16  // vxa26_t
    VSUBL.U8 q7, d3, d16  // vxa37_t

    VMOV q0, q9

    VMLAL.S16 q10, d8, d0[0]
    VMLAL.S16 q11, d9, d0[0]
    VMLAL.S16 q10, d10, d0[1]
    VMLAL.S16 q11, d11, d0[1]
    VMLAL.S16 q10, d12, d0[2]
    VMLAL.S16 q11, d13, d0[2]
    VMLAL.S16 q10, d14, d0[3]
    VMLAL.S16 q11, d15, d0[3]

    .p2align 4
2:
    # Store result on stack

    # -12 because TOS - 4, TOS - 8 and TOS - 12 store mr, nr and the pointer
    # to the weight zero point, plus 128 bytes of buffer when nr = 1.
    # This is needed because after processing all nrs we will load 128 bytes
    # from the stack (q10, q11 for the max nr of 4), i.e. the accumulators
    # come back in q8-q15.
    # When nr < 4, the extra q values would be fetched from parts of the
    # stack storing local variables. To avoid that we keep a 128-byte buffer
    # in between, so the pointer increments never produce an address beyond
    # the stack frame of this function.
    SUB r9, sp, 140
    # Each iteration produces 8 values of 4 bytes each,
    # i.e. 8 x 4 = 32 bytes = 2^5.
    # In this implementation the values are stored at
    # 1st value: sp - 140 - r8 * 32
    # 2nd value: sp - 140 - (r8 - 1) * 32
    # and so on.
    SUB r9, r9, r8, LSL #5
    VST1.32 {q10}, [r9]!
    VST1.32 {q11}, [r9]
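
    # Worked example of the addressing above (derived from the code, for
    # illustration): the store address for the current column is
    # sp - 140 - r8 * 32, so with nr = 4 the accumulator pairs for
    # n = 0, 1, 2, 3 land at sp - 268, sp - 236, sp - 204 and sp - 172,
    # and the reload at label 3 reads those 128 bytes back starting at
    # sp - 268.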

    # nr -= 1; loop back if any columns remain
    SUBS r8, r8, 1
    BHI 0b
3:
    # First load all the accumulators from stack
    # Load nr
    LDR r8, [sp, #-8]
    SUB r9, sp, 140
    SUB r9, r9, r8, LSL #5
    # Now load q8-q15
    # This is a 4x8 block (nr x mr)
    # We will transpose this to 8x4 (mr x nr)
    # q8, q12  : x00, x10, x20, x30; x40, x50, x60, x70
    # q9, q13  : x01, x11, x21, x31; x41, x51, x61, x71
    # q10, q14 : x02, x12, x22, x32; x42, x52, x62, x72
    # q11, q15 : x03, x13, x23, x33; x43, x53, x63, x73
    VLD1.32 {q8}, [r9]!
    VLD1.32 {q12}, [r9]!
    VLD1.32 {q9}, [r9]!
    VLD1.32 {q13}, [r9]!
    VLD1.32 {q10}, [r9]!
    VLD1.32 {q14}, [r9]!
    VLD1.32 {q11}, [r9]!
    VLD1.32 {q15}, [r9]

    ## Now transpose q8-q11
    # VTRN.32 q8, q9
    # VTRN.32 q10, q11
    # q8 : X00, x01, x20, x21
    # q9 : X10, x11, x30, x31
    # q10: X02, x03, x22, x23
    # q11: X12, x13, x32, x33
    # VSWP d16, d17
    # q8 : x20, x21, x00, x01
    # VEXT.32 q6, q8, q10, 2
    # q6 : x00, x01, x02, x03
    # VEXT.32 q10, q10, q8, 2
    # q10: x22, x23, x20, x21
    # VSWP d20, d21
    # VMOV q8, q6
    # q8 : X00, x01, x02, x03
    # q10: x20, x21, x22, x23
    # VSWP d18, d19
    # q9 : x30, x31, x10, x11
    # VEXT.32 q6, q9, q11, 2
    # q6 : x10, x11, x12, x13
    # VEXT.32 q11, q11, q9, 2
    # q11: x32, x33, x30, x31
    # VSWP d22, d23
    # VMOV q9, q6
    # q9 : x10, x11, x12, x13
    # q11: x30, x31, x32, x33
    # Thus we have
    # q8 : X00, x01, x02, x03
    # q9 : X10, x11, x12, x13
    # q10: X20, x21, x22, x23
    # q11: X30, x31, x32, x33
    # Now we can do the same for q12-q15
    # q12: X40, X41, X42, X43
    # q13: X50, X51, X52, X53
    # q14: X60, X61, X62, X63
    # q15: X70, X71, X72, X73
    # NEED TO VALIDATE THIS
    VTRN.32 q8, q9
    VTRN.32 q10, q11
    VSWP d16, d17
    VEXT.32 q6, q8, q10, 2
    VEXT.32 q10, q10, q8, 2
    VSWP d20, d21
    VMOV q8, q6
    VSWP d18, d19
    VEXT.32 q6, q9, q11, 2
    VEXT.32 q11, q11, q9, 2
    VSWP d22, d23
    VMOV q9, q6

    VTRN.32 q12, q13
    VTRN.32 q14, q15
    VSWP d24, d25
    VEXT.32 q6, q12, q14, 2
    VEXT.32 q14, q14, q12, 2
    VSWP d28, d29
    VMOV q12, q6
    VSWP d26, d27
    VEXT.32 q6, q13, q15, 2
    VEXT.32 q15, q15, q13, 2
    VSWP d30, d31
    VMOV q13, q6

    # Load output channel index
    LDR r5, [sp, 124]
    # Load quantization params
    # - r7 = quantization_params
    LDR r7, [sp, 128]
    ADD r7, r7, 8
    # Load pointer to per channel requant scale
    LDR r7, [r7]
    # Now r7 has the base_addr + offset for multipliers
    ADD r7, r7, r5, LSL #2

    LDR r6, [sp, 112]
    # Load q6: vmultiplier_c0123
    VLD1.32 {d12, d13}, [r7]!
    VCVT.F32.S32 q8, q8
    VCVT.F32.S32 q9, q9
    VCVT.F32.S32 q10, q10
    VLD1.32 {q0}, [r6]

    VCVT.F32.S32 q11, q11
    VCVT.F32.S32 q12, q12
    VCVT.F32.S32 q13, q13
    VCVT.F32.S32 q14, q14
    VCVT.F32.S32 q15, q15

    VMUL.F32 q8, q8, q6
    VMUL.F32 q9, q9, q6
    VMUL.F32 q10, q10, q6
    VMUL.F32 q11, q11, q6
    VMUL.F32 q12, q12, q6
    VMUL.F32 q13, q13, q6
    VMUL.F32 q14, q14, q6
    VMUL.F32 q15, q15, q6

    VADD.F32 q8, q8, q0
    VADD.F32 q9, q9, q0
    VADD.F32 q10, q10, q0
    VADD.F32 q11, q11, q0
    VADD.F32 q12, q12, q0
    VADD.F32 q13, q13, q0
    VADD.F32 q14, q14, q0
    VADD.F32 q15, q15, q0
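
    # In C terms the requantization above amounts to (illustrative only;
    # `scales` stands for the per-channel requant scale array read from
    # quantization_params, and acc for the int32 accumulators in q8-q15):
    #   out[m][n] = b[n] + (float)acc[m][n] * scales[output_channel_index + n];
    # for the 8x4 (mr x nr) tile.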

    # Load c, c_stride:
    # - r0 = c
    # - r10 = c_stride
    LDR r0, [sp, 116]
    LDR r10, [sp, 120]
    LSL r10, r10, 2

    # load mr from stack
    LDR r9, [sp, #-4]
    # load nr from stack
    LDR r8, [sp, #-8]
    # r0 = c0 = c pointer

    CMP r9, 2
    # r1 = c1
    ADD r1, r0, r10
    MOVLO r1, r0

    # r2 = c2
    ADD r2, r1, r10
    MOVLS r2, r1

    CMP r9, 4
    # r3 = c3
    ADD r3, r2, r10
    MOVLO r3, r2

    # r4 = c4
    ADD r4, r3, r10
    MOVLS r4, r3

    CMP r9, 6
    # r5 = c5
    ADD r5, r4, r10
    MOVLO r5, r4

    # r6 = c6
    ADD r6, r5, r10
    MOVLS r6, r5

    CMP r9, 8
    # r7 = c7
    ADD r7, r6, r10
    MOVNE r7, r6

    CMP r8, 4
    BNE 4f

    VST1.32 {q8}, [r0]
    VST1.32 {q9}, [r1]
    VST1.32 {q10}, [r2]
    VST1.32 {q11}, [r3]
    VST1.32 {q12}, [r4]
    VST1.32 {q13}, [r5]
    VST1.32 {q14}, [r6]
    VST1.32 {q15}, [r7]

    VPOP {d8-d15}
    POP {r4, r5, r6, r7, r8, r9, r10, r11, lr}
    BX lr

    .p2align 3
4:
    CMP r8, 2
    BLO 5f

    VST1.32 {d16}, [r0]!
    VST1.32 {d18}, [r1]!
    VST1.32 {d20}, [r2]!
    VST1.32 {d22}, [r3]!
    VST1.32 {d24}, [r4]!
    VST1.32 {d26}, [r5]!
    VST1.32 {d28}, [r6]!
    VST1.32 {d30}, [r7]!

    SUB r8, 2

    VMOV.32 d16, d17
    VMOV.32 d18, d19
    VMOV.32 d20, d21
    VMOV.32 d22, d23
    VMOV.32 d24, d25
    VMOV.32 d26, d27
    VMOV.32 d28, d29
    VMOV.32 d30, d31

5:
    CMP r8, 0
    BEQ 7f

    VST1.32 {d16[0]}, [r0]
    VST1.32 {d18[0]}, [r1]
    VST1.32 {d20[0]}, [r2]
    VST1.32 {d22[0]}, [r3]
    VST1.32 {d24[0]}, [r4]
    VST1.32 {d26[0]}, [r5]
    VST1.32 {d28[0]}, [r6]
    VST1.32 {d30[0]}, [r7]

7:
    VPOP {d8-d15}
    POP {r4, r5, r6, r7, r8, r9, r10, r11, lr}
    BX lr

END_FUNCTION pytorch_q8gemm_dq_sparse_1x4_ukernel_8x4__aarch32_neon

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif