"Fossies" - the Fresh Open Source Software Archive

Member "gmp-6.2.1/mpn/sparc32/v9/addmul_1.asm" (14 Nov 2020, 7411 Bytes) of package /linux/misc/gmp-6.2.1.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Generic Assembler source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here.

    1 dnl  SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
    2 dnl  the result to a second limb vector.
    3 
    4 dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
    5 
    6 dnl  This file is part of the GNU MP Library.
    7 dnl
    8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    9 dnl  it under the terms of either:
   10 dnl
   11 dnl    * the GNU Lesser General Public License as published by the Free
   12 dnl      Software Foundation; either version 3 of the License, or (at your
   13 dnl      option) any later version.
   14 dnl
   15 dnl  or
   16 dnl
   17 dnl    * the GNU General Public License as published by the Free Software
   18 dnl      Foundation; either version 2 of the License, or (at your option) any
   19 dnl      later version.
   20 dnl
   21 dnl  or both in parallel, as here.
   22 dnl
   23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
   24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   26 dnl  for more details.
   27 dnl
   28 dnl  You should have received copies of the GNU General Public License and the
   29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
   30 dnl  see https://www.gnu.org/licenses/.
   31 
   32 include(`../config.m4')
   33 
   34 C Algorithm: We use two floating-point multiplies per limb product, with the
   35 C invariant v operand split into two 16-bit pieces, and the u operand split
   36 C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
   37 C the integer unit.
   38 
   39 C          cycles/limb
   40 C UltraSPARC 1&2:     6.5
   41 C UltraSPARC 3:       ?
   42 
   43 C Possible optimizations:
   44 C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
   45 C      memory bandwidth limited, this could save 1.5 cycles/limb.
   46 C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
   47 C      it is very straightforward to unroll, using an exit branch midway.
   48 C      Unrolling would allow deeper scheduling, which could improve speed for
   49 C      the L2 cache case.
   50 C   3. For mpn_mul_1: Use more alternating temp areas.  The std's and ldx's
   51 C      are not scheduled far enough apart with just two temp areas.
   52 C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
   53 C      could save many operations.
   54 
   55 C INPUT PARAMETERS
   56 C rp    o0
   57 C up    o1
   58 C n     o2
   59 C v     o3
   60 
   61 define(`FSIZE',224)
   62 
   63 ASM_START()
   64 PROLOGUE(mpn_addmul_1)      C {rp,n} += {up,n} * v; the carry-out limb is returned in %o0
   65     add %sp, -FSIZE, %sp        C grab FSIZE bytes of stack; no save is done, so the arguments stay in %o0-%o3
   66     sethi   %hi(0xffff), %g1
   67     srl %o3, 16, %g2        C g2 = v >> 16
   68     or  %g1, %lo(0xffff), %g1       C g1 = 0xffff
   69     and %o3, %g1, %g1       C g1 = v & 0xffff
   70     stx %g1, [%sp+104]      C pass both 16-bit halves of v through memory as 64-bit integers
   71     stx %g2, [%sp+112]
   72     ldd [%sp+104], %f6
   73     ldd [%sp+112], %f8
   74     fxtod   %f6, %f6        C f6 = (double) (v & 0xffff)
   75     fxtod   %f8, %f8        C f8 = (double) (v >> 16)
   76     ld  [%sp+104], %f10     C zero f10 (the leading word of the slot stored above); up[i] is loaded into f11
   77 
   78     mov 0, %g3          C cy = 0
   79 
   80 define(`fanop', `fitod %f18, %f0')  C  A quasi nop running in the FA pipe
   81 
   82     add %sp, 160, %o5       C point in scratch area
   83     and %o5, -32, %o5       C align at 0 (mod 32) in scratch area
   84 C Scratch layout: [%o5+0],[%o5+8] and [%o5+16],[%o5+24] each hold one (p16, p0) pair
   85     subcc   %o2, 1, %o2     C n -= 1, setting the condition codes for the bne below
   86     ld  [%o1], %f11     C read up[i]
   87     add %o1, 4, %o1     C up++
   88     bne,pt  %icc, .L_two_or_more    C taken unless n was 1
   89     fxtod   %f10, %f2       C (delay slot) f2 = (double) up[i]
   90 C n was 1: compute the only product and enter the wind-down code at .L1
   91     fmuld   %f2, %f8, %f16      C f16 = up[i] * (v >> 16)
   92     fmuld   %f2, %f6, %f4       C f4 = up[i] * (v & 0xffff)
   93     fdtox   %f16, %f14      C convert both products to 64-bit integers
   94     fdtox   %f4, %f12
   95     std %f14, [%o5+16]      C move them to the integer unit through the scratch area
   96     std %f12, [%o5+24]
   97     ldx [%o5+16], %g2       C p16
   98     ldx [%o5+24], %g1       C p0
   99     lduw    [%o0], %g5      C read rp[i]
  100     b   .L1
  101     add %o0, -16, %o0       C (delay slot) bias rp so that [%o0+16] at .L1 is rp[0]
  102 
  103     .align  16
  104 .L_two_or_more:
  105     subcc   %o2, 1, %o2
  106     ld  [%o1], %f11     C read up[i]
  107     fmuld   %f2, %f8, %f16      C start the products of the limb converted earlier
  108     fmuld   %f2, %f6, %f4
  109     add %o1, 4, %o1     C up++
  110     bne,pt  %icc, .L_three_or_more  C taken unless n was 2
  111     fxtod   %f10, %f2       C (delay slot) convert the limb just loaded
  112 C n was 2: finish both limb products and enter the wind-down code at .L2
  113     fdtox   %f16, %f14
  114     fdtox   %f4, %f12
  115     std %f14, [%o5+16]
  116     fmuld   %f2, %f8, %f16
  117     std %f12, [%o5+24]
  118     fmuld   %f2, %f6, %f4
  119     fdtox   %f16, %f14
  120     fdtox   %f4, %f12
  121     std %f14, [%o5+0]
  122     std %f12, [%o5+8]
  123     lduw    [%o0], %g5      C read rp[i]
  124     ldx [%o5+16], %g2       C p16
  125     ldx [%o5+24], %g1       C p0
  126     b   .L2
  127     add %o0, -12, %o0       C (delay slot) bias rp so that [%o0+12] at .L2 is rp[0]
  128 
  129     .align  16
  130 .L_three_or_more:
  131     subcc   %o2, 1, %o2
  132     ld  [%o1], %f11     C read up[i]
  133     fdtox   %f16, %f14
  134     fdtox   %f4, %f12
  135     std %f14, [%o5+16]
  136     fmuld   %f2, %f8, %f16
  137     std %f12, [%o5+24]
  138     fmuld   %f2, %f6, %f4
  139     add %o1, 4, %o1     C up++
  140     bne,pt  %icc, .L_four_or_more   C taken unless n was 3
  141     fxtod   %f10, %f2       C (delay slot) convert the limb just loaded
  142 C n was 3: finish the pending products and enter the wind-down code at .L3
  143     fdtox   %f16, %f14
  144     fdtox   %f4, %f12
  145     std %f14, [%o5+0]
  146     fmuld   %f2, %f8, %f16
  147     std %f12, [%o5+8]
  148     fmuld   %f2, %f6, %f4
  149     fdtox   %f16, %f14
  150     ldx [%o5+16], %g2       C p16
  151     fdtox   %f4, %f12
  152     ldx [%o5+24], %g1       C p0
  153     std %f14, [%o5+16]      C reuse the area whose contents were just loaded into g2 and g1
  154     std %f12, [%o5+24]
  155     lduw    [%o0], %g5      C read rp[i]
  156     b   .L3
  157     add %o0, -8, %o0        C (delay slot) bias rp so that [%o0+8] at .L3 is rp[0]
  158 
  159     .align  16
  160 .L_four_or_more:
  161     subcc   %o2, 1, %o2
  162     ld  [%o1], %f11     C read up[i]
  163     fdtox   %f16, %f14
  164     fdtox   %f4, %f12
  165     std %f14, [%o5+0]
  166     fmuld   %f2, %f8, %f16
  167     std %f12, [%o5+8]
  168     fmuld   %f2, %f6, %f4
  169     add %o1, 4, %o1     C up++
  170     bne,pt  %icc, .L_five_or_more   C taken unless n was 4
  171     fxtod   %f10, %f2       C (delay slot) convert the limb just loaded
  172 C n was 4: finish the pending products and enter the wind-down code at .L4
  173     fdtox   %f16, %f14
  174     ldx [%o5+16], %g2       C p16
  175     fdtox   %f4, %f12
  176     ldx [%o5+24], %g1       C p0
  177     std %f14, [%o5+16]
  178     fmuld   %f2, %f8, %f16
  179     std %f12, [%o5+24]
  180     fmuld   %f2, %f6, %f4
  181     add %o1, 4, %o1     C up++ (up is not read again on this path)
  182     lduw    [%o0], %g5      C read rp[i]
  183     b   .L4
  184     add %o0, -4, %o0        C (delay slot) bias rp so that [%o0+4] at .L4 is rp[0]
  185 
  186     .align  16
  187 .L_five_or_more:
  188     subcc   %o2, 1, %o2
  189     ld  [%o1], %f11     C read up[i]
  190     fdtox   %f16, %f14
  191     ldx [%o5+16], %g2       C p16
  192     fdtox   %f4, %f12
  193     ldx [%o5+24], %g1       C p0
  194     std %f14, [%o5+16]
  195     fmuld   %f2, %f8, %f16
  196     std %f12, [%o5+24]
  197     fmuld   %f2, %f6, %f4
  198     add %o1, 4, %o1     C up++
  199     lduw    [%o0], %g5      C read rp[i]
  200     bne,pt  %icc, .Loop     C taken unless n was 5
  201     fxtod   %f10, %f2       C (delay slot) convert the limb just loaded
  202     b,a .L5         C n was 5; the annul bit suppresses the delay slot
  203 
  204 C BEGIN MAIN LOOP
  205     .align 16
  206 C -- 0
  207 .Loop:  nop
  208     subcc   %o2, 1, %o2     C count down; tested by the bne at the bottom of the loop
  209     ld  [%o1], %f11     C read up[i]
  210     fdtox   %f16, %f14
  211 C -- 1
  212     sllx    %g2, 16, %g4        C (p16 << 16)
  213     add %o0, 4, %o0     C rp++
  214     ldx [%o5+0], %g2        C p16
  215     fdtox   %f4, %f12
  216 C -- 2
  217     nop
  218     add %g1, %g4, %g4       C p = p0 + (p16 << 16)
  219     ldx [%o5+8], %g1        C p0
  220     fanop
  221 C -- 3
  222     nop
  223     add %g3, %g4, %g4       C p += cy
  224     std %f14, [%o5+0]
  225     fmuld   %f2, %f8, %f16
  226 C -- 4
  227     nop
  228     add %g5, %g4, %g4       C p += rp[i]
  229     std %f12, [%o5+8]
  230     fmuld   %f2, %f6, %f4
  231 C -- 5
  232     xor %o5, 16, %o5        C alternate scratch variables
  233     add %o1, 4, %o1     C up++
  234     stw %g4, [%o0-4]        C store the low 32 bits of p (rp was already advanced above)
  235     fanop
  236 C -- 6
  237     srlx    %g4, 32, %g3        C new cy
  238     lduw    [%o0], %g5      C read rp[i]
  239     bne,pt  %icc, .Loop
  240     fxtod   %f10, %f2       C (delay slot) convert the limb just loaded
  241 C END MAIN LOOP
  242 
  243 .L5:    fdtox   %f16, %f14      C wind-down: five limbs of rp remain to be stored
  244     sllx    %g2, 16, %g4        C (p16 << 16)
  245     ldx [%o5+0], %g2        C p16
  246     fdtox   %f4, %f12
  247     add %g1, %g4, %g4       C p = p0 + (p16 << 16)
  248     ldx [%o5+8], %g1        C p0
  249     add %g4, %g3, %g4       C p += cy
  250     std %f14, [%o5+0]
  251     fmuld   %f2, %f8, %f16
  252     add %g5, %g4, %g4       C p += rp[i]
  253     std %f12, [%o5+8]
  254     fmuld   %f2, %f6, %f4
  255     xor %o5, 16, %o5        C alternate scratch variables
  256     stw %g4, [%o0+0]
  257     srlx    %g4, 32, %g3        C new cy
  258     lduw    [%o0+4], %g5        C read rp[i]
  259 
  260 .L4:    fdtox   %f16, %f14      C four limbs of rp remain to be stored
  261     sllx    %g2, 16, %g4        C (p16 << 16)
  262     ldx [%o5+0], %g2        C p16
  263     fdtox   %f4, %f12
  264     add %g1, %g4, %g4       C p = p0 + (p16 << 16)
  265     ldx [%o5+8], %g1        C p0
  266     add %g3, %g4, %g4       C p += cy
  267     std %f14, [%o5+0]
  268     add %g5, %g4, %g4       C p += rp[i]
  269     std %f12, [%o5+8]
  270     xor %o5, 16, %o5        C alternate scratch variables
  271     stw %g4, [%o0+4]
  272     srlx    %g4, 32, %g3        C new cy
  273     lduw    [%o0+8], %g5        C read rp[i]
  274 
  275 .L3:    sllx    %g2, 16, %g4        C (p16 << 16); three limbs of rp remain to be stored
  276     ldx [%o5+0], %g2        C p16
  277     add %g1, %g4, %g4       C p = p0 + (p16 << 16)
  278     ldx [%o5+8], %g1        C p0
  279     add %g3, %g4, %g4       C p += cy
  280     add %g5, %g4, %g4       C p += rp[i]
  281     xor %o5, 16, %o5        C alternate scratch variables
  282     stw %g4, [%o0+8]
  283     srlx    %g4, 32, %g3        C new cy
  284     lduw    [%o0+12], %g5       C read rp[i]
  285 
  286 .L2:    sllx    %g2, 16, %g4        C (p16 << 16); two limbs of rp remain to be stored
  287     ldx [%o5+0], %g2        C p16
  288     add %g1, %g4, %g4       C p = p0 + (p16 << 16)
  289     ldx [%o5+8], %g1        C p0
  290     add %g3, %g4, %g4       C p += cy
  291     add %g5, %g4, %g4       C p += rp[i]
  292     stw %g4, [%o0+12]
  293     srlx    %g4, 32, %g3        C new cy
  294     lduw    [%o0+16], %g5       C read rp[i]
  295 
  296 .L1:    sllx    %g2, 16, %g4        C (p16 << 16); last limb of rp
  297     add %g1, %g4, %g4       C p = p0 + (p16 << 16)
  298     add %g3, %g4, %g4       C p += cy
  299     add %g5, %g4, %g4       C p += rp[i]
  300     stw %g4, [%o0+16]
  301     srlx    %g4, 32, %g3        C new cy
  302 
  303     mov %g3, %o0        C return value = final carry
  304     retl
  305     sub %sp, -FSIZE, %sp        C (delay slot) give back the FSIZE bytes of stack
  306 EPILOGUE(mpn_addmul_1)