"Fossies" - the Fresh Open Source Software Archive

Member "gmp-6.2.1/mpn/pa64/addmul_1.asm" (14 Nov 2020, 18604 Bytes) of package /linux/misc/gmp-6.2.1.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Generic Assembler source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file.

    1 dnl  HP-PA 2.0 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
    2 dnl  add the result to a second limb vector.
    3 
    4 dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
    5 
    6 dnl  This file is part of the GNU MP Library.
    7 dnl
    8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    9 dnl  it under the terms of either:
   10 dnl
   11 dnl    * the GNU Lesser General Public License as published by the Free
   12 dnl      Software Foundation; either version 3 of the License, or (at your
   13 dnl      option) any later version.
   14 dnl
   15 dnl  or
   16 dnl
   17 dnl    * the GNU General Public License as published by the Free Software
   18 dnl      Foundation; either version 2 of the License, or (at your option) any
   19 dnl      later version.
   20 dnl
   21 dnl  or both in parallel, as here.
   22 dnl
   23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
   24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   26 dnl  for more details.
   27 dnl
   28 dnl  You should have received copies of the GNU General Public License and the
   29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
   30 dnl  see https://www.gnu.org/licenses/.
   31 
   32 include(`../config.m4')
   33 
   34 C           cycles/limb
   35 C 8000,8200:        7
   36 C 8500,8600,8700:   6.375
   37 
   38 C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
   39 C  could be saved there per call.
   40 
   41 C  DESCRIPTION:
   42 C  The main loop "BIG" is 4-way unrolled, mainly to allow
   43 C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
   44 C  registers to the IU registers, have demanded a deep software pipeline, and
   45 C  a lot of stack slots for partial products in flight.
   46 C
   47 C  CODE STRUCTURE:
   48 C  save-some-registers
   49 C  do 0, 1, 2, or 3 limbs
   50 C  if done, restore-some-regs and return
   51 C  save-many-regs
   52 C  do 4, 8, ... limbs
   53 C  restore-all-regs
   54 
   55 C  STACK LAYOUT:
   56 C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
   57 C  slots marked FREE, as well as some slots in the caller's "frame marker".
   58 C
   59 C -00 <- r30
   60 C -08  FREE
   61 C -10  tmp
   62 C -18  tmp
   63 C -20  tmp
   64 C -28  tmp
   65 C -30  tmp
   66 C -38  tmp
   67 C -40  tmp
   68 C -48  tmp
   69 C -50  tmp
   70 C -58  tmp
   71 C -60  tmp
   72 C -68  tmp
   73 C -70  tmp
   74 C -78  tmp
   75 C -80  tmp
   76 C -88  tmp
   77 C -90  FREE
   78 C -98  FREE
   79 C -a0  FREE
   80 C -a8  FREE
   81 C -b0  r13
   82 C -b8  r12
   83 C -c0  r11
   84 C -c8  r10
   85 C -d0  r9
   86 C -d8  r8
   87 C -e0  r7
   88 C -e8  r6
   89 C -f0  r5
   90 C -f8  r4
   91 C -100 r3
   92 C  Previous frame:
   93 C  [unused area]
   94 C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
   95 
   96 
   97 include(`../config.m4')
   98 
   99 C INPUT PARAMETERS:
  100 define(`rp',`%r26') C destination vector: limbs are loaded from it, added to, and stored back
  101 define(`up',`%r25') C source vector: limbs are loaded from it and multiplied by vlimb
  102 define(`n',`%r24')  C limb count
  103 define(`vlimb',`%r23')  C multiplier limb
  104 
  105 define(`climb',`%r23')  C carry limb passed between limbs; same register as vlimb
  106 
  107 ifdef(`HAVE_ABI_2_0w',
  108 `   .level  2.0w
  109 ',` .level  2.0
  110 ')
  111 PROLOGUE(mpn_addmul_1)
  112 
  113 ifdef(`HAVE_ABI_2_0w',
  114 `   std     vlimb, -0x38(%r30)  C store vlimb into "home" slot
  115 ')
  116     std,ma      %r3, 0x100(%r30)    C save r3 at the old r30, then advance r30 by 0x100
  117     std     %r4, -0xf8(%r30)    C save r4
  118     std     %r5, -0xf0(%r30)    C save r5
  119     ldo     0(%r0), climb       C clear climb
  120     fldd        -0x138(%r30), %fr8  C put vlimb in fp register
  121 
  122 define(`p032a1',`%r1')  C mid product loaded from -0x78
  123 define(`p032a2',`%r19') C mid product loaded from -0x70
  124 
  125 define(`m032',`%r20')   C sum of the two mid products
  126 define(`m096',`%r21')   C carry out of the mid product sum
  127 
  128 define(`p000a',`%r22')  C low product loaded from -0x80
  129 define(`p064a',`%r29')  C high product loaded from -0x68
  130 
  131 define(`s000',`%r31')   C result limb being accumulated
  132 
  133 define(`ma000',`%r4')   C part of the mid sum that is added into s000
  134 define(`ma064',`%r20')  C part of the mid sum that is added into climb; shares r20 with m032
  135 
  136 define(`r000',`%r3')    C limb loaded from rp
  137 
  138     extrd,u     n, 63, 2, %r5       C r5 = low 2 bits of n, i.e. n mod 4
  139     cmpb,=      %r5, %r0, L(BIG)    C n mod 4 == 0: go directly to the 4-way code
  140     nop
  141 
  142     fldd        0(up), %fr4         C feed-in for the n mod 4 leftover limbs: load first source limb
  143     ldo     8(up), up
  144     xmpyu       %fr8R, %fr4L, %fr22
  145     xmpyu       %fr8L, %fr4R, %fr23
  146     fstd        %fr22, -0x78(%r30)  C mid product to  -0x78..-0x71
  147     xmpyu       %fr8R, %fr4R, %fr24
  148     xmpyu       %fr8L, %fr4L, %fr25
  149     fstd        %fr23, -0x70(%r30)  C mid product to  -0x70..-0x69
  150     fstd        %fr24, -0x80(%r30)  C low product to  -0x80..-0x79
  151     addib,<>    -1, %r5, L(two_or_more)     C decrement leftover count, branch if limbs remain
  152     fstd        %fr25, -0x68(%r30)  C high product to -0x68..-0x61
  153 LDEF(one)                   C exactly one leftover limb: fetch its four partial products
  154     ldd     -0x78(%r30), p032a1
  155     ldd     -0x70(%r30), p032a2
  156     ldd     -0x80(%r30), p000a
  157     b       L(0_one_out)
  158     ldd     -0x68(%r30), p064a
  159 
  160 LDEF(two_or_more)           C multiply second limb while fetching products of the first
  161     fldd        0(up), %fr4
  162     ldo     8(up), up
  163     xmpyu       %fr8R, %fr4L, %fr22
  164     xmpyu       %fr8L, %fr4R, %fr23
  165     ldd     -0x78(%r30), p032a1
  166     fstd        %fr22, -0x78(%r30)  C mid product to  -0x78..-0x71
  167     xmpyu       %fr8R, %fr4R, %fr24
  168     xmpyu       %fr8L, %fr4L, %fr25
  169     ldd     -0x70(%r30), p032a2
  170     fstd        %fr23, -0x70(%r30)  C mid product to  -0x70..-0x69
  171     ldd     -0x80(%r30), p000a
  172     fstd        %fr24, -0x80(%r30)  C low product to  -0x80..-0x79
  173     ldd     -0x68(%r30), p064a
  174     addib,<>    -1, %r5, L(three_or_more)   C decrement leftover count, branch if a third limb remains
  175     fstd        %fr25, -0x68(%r30)  C high product to -0x68..-0x61
  176 LDEF(two)                   C exactly two leftover limbs: form mid sum of the first limb
  177     add     p032a1, p032a2, m032
  178     add,dc      %r0, %r0, m096      C m096 = carry out of the mid sum
  179     depd,z      m032, 31, 32, ma000
  180     extrd,u     m032, 31, 32, ma064
  181     ldd     0(rp), r000
  182     b       L(0_two_out)
  183     depd        m096, 31, 32, ma064
  184 
  185 LDEF(three_or_more)         C three leftover limbs: load the third, form mid sum of the first
  186     fldd        0(up), %fr4
  187     add     p032a1, p032a2, m032
  188     add,dc      %r0, %r0, m096
  189     depd,z      m032, 31, 32, ma000
  190     extrd,u     m032, 31, 32, ma064
  191     ldd     0(rp), r000
  192 C   addib,=     -1, %r5, L(0_out)
  193     depd        m096, 31, 32, ma064
  194 LDEF(loop0)                 C loop body is commented out; execution falls through to 0_out
  195 C   xmpyu       %fr8R, %fr4L, %fr22
  196 C   xmpyu       %fr8L, %fr4R, %fr23
  197 C   ldd     -0x78(%r30), p032a1
  198 C   fstd        %fr22, -0x78(%r30)  C mid product to  -0x78..-0x71
  199 C
  200 C   xmpyu       %fr8R, %fr4R, %fr24
  201 C   xmpyu       %fr8L, %fr4L, %fr25
  202 C   ldd     -0x70(%r30), p032a2
  203 C   fstd        %fr23, -0x70(%r30)  C mid product to  -0x70..-0x69
  204 C
  205 C   ldo     8(rp), rp
  206 C   add     climb, p000a, s000
  207 C   ldd     -0x80(%r30), p000a
  208 C   fstd        %fr24, -0x80(%r30)  C low product to  -0x80..-0x79
  209 C
  210 C   add,dc      p064a, %r0, climb
  211 C   ldo     8(up), up
  212 C   ldd     -0x68(%r30), p064a
  213 C   fstd        %fr25, -0x68(%r30)  C high product to -0x68..-0x61
  214 C
  215 C   add     ma000, s000, s000
  216 C   add,dc      ma064, climb, climb
  217 C   fldd        0(up), %fr4
  218 C
  219 C   add     r000, s000, s000
  220 C   add,dc      %r0, climb, climb
  221 C   std     s000, -8(rp)
  222 C
  223 C   add     p032a1, p032a2, m032
  224 C   add,dc      %r0, %r0, m096
  225 C
  226 C   depd,z      m032, 31, 32, ma000
  227 C   extrd,u     m032, 31, 32, ma064
  228 C   ldd     0(rp), r000
  229 C   addib,<>    -1, %r5, L(loop0)
  230 C   depd        m096, 31, 32, ma064
  231 LDEF(0_out)                 C multiply the third limb, store the first result limb
  232     ldo     8(up), up
  233     xmpyu       %fr8R, %fr4L, %fr22
  234     xmpyu       %fr8L, %fr4R, %fr23
  235     ldd     -0x78(%r30), p032a1
  236     fstd        %fr22, -0x78(%r30)  C mid product to  -0x78..-0x71
  237     xmpyu       %fr8R, %fr4R, %fr24
  238     xmpyu       %fr8L, %fr4L, %fr25
  239     ldd     -0x70(%r30), p032a2
  240     fstd        %fr23, -0x70(%r30)  C mid product to  -0x70..-0x69
  241     ldo     8(rp), rp
  242     add     climb, p000a, s000      C s000 = incoming carry limb + low product
  243     ldd     -0x80(%r30), p000a
  244     fstd        %fr24, -0x80(%r30)  C low product to  -0x80..-0x79
  245     add,dc      p064a, %r0, climb   C climb = high product + carry
  246     ldd     -0x68(%r30), p064a
  247     fstd        %fr25, -0x68(%r30)  C high product to -0x68..-0x61
  248     add     ma000, s000, s000
  249     add,dc      ma064, climb, climb
  250     add     r000, s000, s000        C add in the limb loaded from rp
  251     add,dc      %r0, climb, climb
  252     std     s000, -8(rp)            C rp was already advanced, hence offset -8
  253     add     p032a1, p032a2, m032
  254     add,dc      %r0, %r0, m096
  255     depd,z      m032, 31, 32, ma000
  256     extrd,u     m032, 31, 32, ma064
  257     ldd     0(rp), r000
  258     depd        m096, 31, 32, ma064
  259 LDEF(0_two_out)             C wind-down: finish the limb whose mid sum is ready, fetch products of the last limb
  260     ldd     -0x78(%r30), p032a1
  261     ldd     -0x70(%r30), p032a2
  262     ldo     8(rp), rp
  263     add     climb, p000a, s000
  264     ldd     -0x80(%r30), p000a
  265     add,dc      p064a, %r0, climb
  266     ldd     -0x68(%r30), p064a
  267     add     ma000, s000, s000
  268     add,dc      ma064, climb, climb
  269     add     r000, s000, s000
  270     add,dc      %r0, climb, climb
  271     std     s000, -8(rp)            C rp was already advanced, hence offset -8
  272 LDEF(0_one_out)             C wind-down: finish the last leftover limb
  273     add     p032a1, p032a2, m032
  274     add,dc      %r0, %r0, m096
  275     depd,z      m032, 31, 32, ma000
  276     extrd,u     m032, 31, 32, ma064
  277     ldd     0(rp), r000
  278     depd        m096, 31, 32, ma064
  279 
  280     add     climb, p000a, s000
  281     add,dc      p064a, %r0, climb
  282     add     ma000, s000, s000
  283     add,dc      ma064, climb, climb
  284     add     r000, s000, s000
  285     add,dc      %r0, climb, climb
  286     std     s000, 0(rp)
  287 
  288     cmpib,>=    4, n, L(done)       C branch to done when 4 >= n; otherwise fall into BIG
  289     ldo     8(rp), rp               C step rp past the limb just stored
  290 
  291 C 4-way unrolled code.
  292 
  293 LDEF(BIG)                   C entry to the 4-way code: new register map, save r6-r13, n = n / 4
  294 
  295 define(`p032a1',`%r1')  C mid product of limb 0, loaded from -0x78
  296 define(`p032a2',`%r19') C mid product of limb 0, loaded from -0x70
  297 define(`p096b1',`%r20') C mid product of limb 1, loaded from -0x38
  298 define(`p096b2',`%r21') C mid product of limb 1, loaded from -0x30
  299 define(`p160c1',`%r22') C mid product of limb 2, loaded from -0x58
  300 define(`p160c2',`%r29') C mid product of limb 2, loaded from -0x50
  301 define(`p224d1',`%r31') C mid product of limb 3, loaded from -0x18
  302 define(`p224d2',`%r3')  C mid product of limb 3, loaded from -0x10
  303             C
  304 define(`m032',`%r4')    C mid product sum of limb 0
  305 define(`m096',`%r5')    C mid product sum of limb 1
  306 define(`m160',`%r6')    C mid product sum of limb 2
  307 define(`m224',`%r7')    C mid product sum of limb 3
  308 define(`m288',`%r8')    C carry out of the mid product sum chain
  309             C
  310 define(`p000a',`%r1')   C low product of limb 0, loaded from -0x80
  311 define(`p064a',`%r19')  C high product of limb 0, loaded from -0x68
  312 define(`p064b',`%r20')  C low product of limb 1, loaded from -0x40
  313 define(`p128b',`%r21')  C high product of limb 1, loaded from -0x28
  314 define(`p128c',`%r22')  C low product of limb 2, loaded from -0x60
  315 define(`p192c',`%r29')  C high product of limb 2, loaded from -0x48
  316 define(`p192d',`%r31')  C low product of limb 3, loaded from -0x20
  317 define(`p256d',`%r3')   C high product of limb 3, loaded from -0x88
  318             C
  319 define(`s000',`%r10')   C result limb stored to 0(rp)
  320 define(`s064',`%r11')   C result limb stored to 8(rp)
  321 define(`s128',`%r12')   C result limb stored to 16(rp)
  322 define(`s192',`%r13')   C result limb stored to 24(rp)
  323             C
  324 define(`ma000',`%r9')   C mid sum contribution added into s000
  325 define(`ma064',`%r4')   C mid sum contribution added into s064
  326 define(`ma128',`%r5')   C mid sum contribution added into s128
  327 define(`ma192',`%r6')   C mid sum contribution added into s192
  328 define(`ma256',`%r7')   C mid sum contribution added into climb
  329             C
  330 define(`r000',`%r1')    C limb loaded from 0(rp)
  331 define(`r064',`%r19')   C limb loaded from 8(rp)
  332 define(`r128',`%r20')   C limb loaded from 16(rp)
  333 define(`r192',`%r21')   C limb loaded from 24(rp)
  334 
  335     std     %r6, -0xe8(%r30)    C save r6-r13, which the 4-way code uses
  336     std     %r7, -0xe0(%r30)
  337     std     %r8, -0xd8(%r30)
  338     std     %r9, -0xd0(%r30)
  339     std     %r10, -0xc8(%r30)
  340     std     %r11, -0xc0(%r30)
  341     std     %r12, -0xb8(%r30)
  342     std     %r13, -0xb0(%r30)
  343 
  344 ifdef(`HAVE_ABI_2_0w',
  345 `   extrd,u     n, 61, 62, n        C right shift 2
  346 ',` extrd,u     n, 61, 30, n        C right shift 2, zero extend
  347 ')
  348 
  349 LDEF(4_or_more)             C feed-in: multiply the first group of 4 source limbs by vlimb
  350     fldd        0(up), %fr4
  351     fldd        8(up), %fr5
  352     fldd        16(up), %fr6
  353     fldd        24(up), %fr7
  354     xmpyu       %fr8R, %fr4L, %fr22
  355     xmpyu       %fr8L, %fr4R, %fr23
  356     xmpyu       %fr8R, %fr5L, %fr24
  357     xmpyu       %fr8L, %fr5R, %fr25
  358     xmpyu       %fr8R, %fr6L, %fr26
  359     xmpyu       %fr8L, %fr6R, %fr27
  360     fstd        %fr22, -0x78(%r30)  C mid product to  -0x78..-0x71
  361     xmpyu       %fr8R, %fr7L, %fr28
  362     xmpyu       %fr8L, %fr7R, %fr29
  363     fstd        %fr23, -0x70(%r30)  C mid product to  -0x70..-0x69
  364     xmpyu       %fr8R, %fr4R, %fr30
  365     xmpyu       %fr8L, %fr4L, %fr31
  366     fstd        %fr24, -0x38(%r30)  C mid product to  -0x38..-0x31
  367     xmpyu       %fr8R, %fr5R, %fr22
  368     xmpyu       %fr8L, %fr5L, %fr23
  369     fstd        %fr25, -0x30(%r30)  C mid product to  -0x30..-0x29
  370     xmpyu       %fr8R, %fr6R, %fr24
  371     xmpyu       %fr8L, %fr6L, %fr25
  372     fstd        %fr26, -0x58(%r30)  C mid product to  -0x58..-0x51
  373     xmpyu       %fr8R, %fr7R, %fr26
  374     fstd        %fr27, -0x50(%r30)  C mid product to  -0x50..-0x49
  375     addib,<>    -1, n, L(8_or_more)     C decrement group count, branch if more groups remain
  376     xmpyu       %fr8L, %fr7L, %fr27
  377     fstd        %fr28, -0x18(%r30)  C mid product to  -0x18..-0x11
  378     fstd        %fr29, -0x10(%r30)  C mid product to  -0x10..-0x09
  379     fstd        %fr30, -0x80(%r30)  C low product to  -0x80..-0x79
  380     fstd        %fr31, -0x68(%r30)  C high product to -0x68..-0x61
  381     fstd        %fr22, -0x40(%r30)  C low product to  -0x40..-0x39
  382     fstd        %fr23, -0x28(%r30)  C high product to -0x28..-0x21
  383     fstd        %fr24, -0x60(%r30)  C low product to  -0x60..-0x59
  384     fstd        %fr25, -0x48(%r30)  C high product to -0x48..-0x41
  385     fstd        %fr26, -0x20(%r30)  C low product to  -0x20..-0x19
  386     fstd        %fr27, -0x88(%r30)  C high product to -0x88..-0x81
  387     ldd     -0x78(%r30), p032a1
  388     ldd     -0x70(%r30), p032a2
  389     ldd     -0x38(%r30), p096b1
  390     ldd     -0x30(%r30), p096b2
  391     ldd     -0x58(%r30), p160c1
  392     ldd     -0x50(%r30), p160c2
  393     ldd     -0x18(%r30), p224d1
  394     ldd     -0x10(%r30), p224d2
  395     b       L(end1)             C single group only: go straight to the final wind-down
  396     nop
  397 
  398 LDEF(8_or_more)             C spill remaining products of group 1, then multiply group 2 while fetching group 1 mid products
  399     fstd        %fr28, -0x18(%r30)  C mid product to  -0x18..-0x11
  400     fstd        %fr29, -0x10(%r30)  C mid product to  -0x10..-0x09
  401     ldo     32(up), up          C advance up to the next group of 4 limbs
  402     fstd        %fr30, -0x80(%r30)  C low product to  -0x80..-0x79
  403     fstd        %fr31, -0x68(%r30)  C high product to -0x68..-0x61
  404     fstd        %fr22, -0x40(%r30)  C low product to  -0x40..-0x39
  405     fstd        %fr23, -0x28(%r30)  C high product to -0x28..-0x21
  406     fstd        %fr24, -0x60(%r30)  C low product to  -0x60..-0x59
  407     fstd        %fr25, -0x48(%r30)  C high product to -0x48..-0x41
  408     fstd        %fr26, -0x20(%r30)  C low product to  -0x20..-0x19
  409     fstd        %fr27, -0x88(%r30)  C high product to -0x88..-0x81
  410     fldd        0(up), %fr4
  411     fldd        8(up), %fr5
  412     fldd        16(up), %fr6
  413     fldd        24(up), %fr7
  414     xmpyu       %fr8R, %fr4L, %fr22
  415     ldd     -0x78(%r30), p032a1
  416     xmpyu       %fr8L, %fr4R, %fr23
  417     xmpyu       %fr8R, %fr5L, %fr24
  418     ldd     -0x70(%r30), p032a2
  419     xmpyu       %fr8L, %fr5R, %fr25
  420     xmpyu       %fr8R, %fr6L, %fr26
  421     ldd     -0x38(%r30), p096b1
  422     xmpyu       %fr8L, %fr6R, %fr27
  423     fstd        %fr22, -0x78(%r30)  C mid product to  -0x78..-0x71
  424     xmpyu       %fr8R, %fr7L, %fr28
  425     ldd     -0x30(%r30), p096b2
  426     xmpyu       %fr8L, %fr7R, %fr29
  427     fstd        %fr23, -0x70(%r30)  C mid product to  -0x70..-0x69
  428     xmpyu       %fr8R, %fr4R, %fr30
  429     ldd     -0x58(%r30), p160c1
  430     xmpyu       %fr8L, %fr4L, %fr31
  431     fstd        %fr24, -0x38(%r30)  C mid product to  -0x38..-0x31
  432     xmpyu       %fr8R, %fr5R, %fr22
  433     ldd     -0x50(%r30), p160c2
  434     xmpyu       %fr8L, %fr5L, %fr23
  435     fstd        %fr25, -0x30(%r30)  C mid product to  -0x30..-0x29
  436     xmpyu       %fr8R, %fr6R, %fr24
  437     ldd     -0x18(%r30), p224d1
  438     xmpyu       %fr8L, %fr6L, %fr25
  439     fstd        %fr26, -0x58(%r30)  C mid product to  -0x58..-0x51
  440     xmpyu       %fr8R, %fr7R, %fr26
  441     ldd     -0x10(%r30), p224d2
  442     fstd        %fr27, -0x50(%r30)  C mid product to  -0x50..-0x49
  443     addib,=     -1, n, L(end2)      C decrement group count, skip the loop if it reaches zero
  444     xmpyu       %fr8L, %fr7L, %fr27
  445 LDEF(loop)                  C steady state: finish one group of 4 limbs while multiplying the next group
  446     add     p032a1, p032a2, m032
  447     ldd     -0x80(%r30), p000a
  448     add,dc      p096b1, p096b2, m096
  449     fstd        %fr28, -0x18(%r30)  C mid product to  -0x18..-0x11
  450 
  451     add,dc      p160c1, p160c2, m160
  452     ldd     -0x68(%r30), p064a
  453     add,dc      p224d1, p224d2, m224
  454     fstd        %fr29, -0x10(%r30)  C mid product to  -0x10..-0x09
  455 
  456     add,dc      %r0, %r0, m288      C m288 = carry out of the mid sum chain
  457     ldd     -0x40(%r30), p064b
  458     ldo     32(up), up          C advance up to the next group of 4 limbs
  459     fstd        %fr30, -0x80(%r30)  C low product to  -0x80..-0x79
  460 
  461     depd,z      m032, 31, 32, ma000
  462     ldd     -0x28(%r30), p128b
  463     extrd,u     m032, 31, 32, ma064
  464     fstd        %fr31, -0x68(%r30)  C high product to -0x68..-0x61
  465 
  466     depd        m096, 31, 32, ma064
  467     ldd     -0x60(%r30), p128c
  468     extrd,u     m096, 31, 32, ma128
  469     fstd        %fr22, -0x40(%r30)  C low product to  -0x40..-0x39
  470 
  471     depd        m160, 31, 32, ma128
  472     ldd     -0x48(%r30), p192c
  473     extrd,u     m160, 31, 32, ma192
  474     fstd        %fr23, -0x28(%r30)  C high product to -0x28..-0x21
  475 
  476     depd        m224, 31, 32, ma192
  477     ldd     -0x20(%r30), p192d
  478     extrd,u     m224, 31, 32, ma256
  479     fstd        %fr24, -0x60(%r30)  C low product to  -0x60..-0x59
  480 
  481     depd        m288, 31, 32, ma256
  482     ldd     -0x88(%r30), p256d
  483     add     climb, p000a, s000      C start of the low/high product sum chain, seeded with climb
  484     fstd        %fr25, -0x48(%r30)  C high product to -0x48..-0x41
  485 
  486     add,dc      p064a, p064b, s064
  487     ldd     0(rp), r000
  488     add,dc      p128b, p128c, s128
  489     fstd        %fr26, -0x20(%r30)  C low product to  -0x20..-0x19
  490 
  491     add,dc      p192c, p192d, s192
  492     ldd     8(rp), r064
  493     add,dc      p256d, %r0, climb   C new climb = top high product + carry
  494     fstd        %fr27, -0x88(%r30)  C high product to -0x88..-0x81
  495 
  496     ldd     16(rp), r128
  497     add     ma000, s000, s000   C accum mid 0
  498     ldd     24(rp), r192
  499     add,dc      ma064, s064, s064   C accum mid 1
  500 
  501     add,dc      ma128, s128, s128   C accum mid 2
  502     fldd        0(up), %fr4
  503     add,dc      ma192, s192, s192   C accum mid 3
  504     fldd        8(up), %fr5
  505 
  506     add,dc      ma256, climb, climb
  507     fldd        16(up), %fr6
  508     add     r000, s000, s000    C accum rlimb 0
  509     fldd        24(up), %fr7
  510 
  511     add,dc      r064, s064, s064    C accum rlimb 1
  512     add,dc      r128, s128, s128    C accum rlimb 2
  513     std     s000, 0(rp)
  514 
  515     add,dc      r192, s192, s192    C accum rlimb 3
  516     add,dc      %r0, climb, climb
  517     std     s064, 8(rp)
  518 
  519     xmpyu       %fr8R, %fr4L, %fr22
  520     ldd     -0x78(%r30), p032a1
  521     xmpyu       %fr8L, %fr4R, %fr23
  522     std     s128, 16(rp)
  523 
  524     xmpyu       %fr8R, %fr5L, %fr24
  525     ldd     -0x70(%r30), p032a2
  526     xmpyu       %fr8L, %fr5R, %fr25
  527     std     s192, 24(rp)
  528 
  529     xmpyu       %fr8R, %fr6L, %fr26
  530     ldd     -0x38(%r30), p096b1
  531     xmpyu       %fr8L, %fr6R, %fr27
  532     fstd        %fr22, -0x78(%r30)  C mid product to  -0x78..-0x71
  533 
  534     xmpyu       %fr8R, %fr7L, %fr28
  535     ldd     -0x30(%r30), p096b2
  536     xmpyu       %fr8L, %fr7R, %fr29
  537     fstd        %fr23, -0x70(%r30)  C mid product to  -0x70..-0x69
  538 
  539     xmpyu       %fr8R, %fr4R, %fr30
  540     ldd     -0x58(%r30), p160c1
  541     xmpyu       %fr8L, %fr4L, %fr31
  542     fstd        %fr24, -0x38(%r30)  C mid product to  -0x38..-0x31
  543 
  544     xmpyu       %fr8R, %fr5R, %fr22
  545     ldd     -0x50(%r30), p160c2
  546     xmpyu       %fr8L, %fr5L, %fr23
  547     fstd        %fr25, -0x30(%r30)  C mid product to  -0x30..-0x29
  548 
  549     xmpyu       %fr8R, %fr6R, %fr24
  550     ldd     -0x18(%r30), p224d1
  551     xmpyu       %fr8L, %fr6L, %fr25
  552     fstd        %fr26, -0x58(%r30)  C mid product to  -0x58..-0x51
  553 
  554     xmpyu       %fr8R, %fr7R, %fr26
  555     ldd     -0x10(%r30), p224d2
  556     fstd        %fr27, -0x50(%r30)  C mid product to  -0x50..-0x49
  557     xmpyu       %fr8L, %fr7L, %fr27
  558 
  559     addib,<>    -1, n, L(loop)      C decrement group count, loop while groups remain
  560     ldo     32(rp), rp          C advance rp past the 4 limbs just stored
  561 
  562 LDEF(end2)                  C wind-down 1: finish the group in flight and spill the products of the last group
  563     add     p032a1, p032a2, m032
  564     ldd     -0x80(%r30), p000a
  565     add,dc      p096b1, p096b2, m096
  566     fstd        %fr28, -0x18(%r30)  C mid product to  -0x18..-0x11
  567     add,dc      p160c1, p160c2, m160
  568     ldd     -0x68(%r30), p064a
  569     add,dc      p224d1, p224d2, m224
  570     fstd        %fr29, -0x10(%r30)  C mid product to  -0x10..-0x09
  571     add,dc      %r0, %r0, m288
  572     ldd     -0x40(%r30), p064b
  573     fstd        %fr30, -0x80(%r30)  C low product to  -0x80..-0x79
  574     depd,z      m032, 31, 32, ma000
  575     ldd     -0x28(%r30), p128b
  576     extrd,u     m032, 31, 32, ma064
  577     fstd        %fr31, -0x68(%r30)  C high product to -0x68..-0x61
  578     depd        m096, 31, 32, ma064
  579     ldd     -0x60(%r30), p128c
  580     extrd,u     m096, 31, 32, ma128
  581     fstd        %fr22, -0x40(%r30)  C low product to  -0x40..-0x39
  582     depd        m160, 31, 32, ma128
  583     ldd     -0x48(%r30), p192c
  584     extrd,u     m160, 31, 32, ma192
  585     fstd        %fr23, -0x28(%r30)  C high product to -0x28..-0x21
  586     depd        m224, 31, 32, ma192
  587     ldd     -0x20(%r30), p192d
  588     extrd,u     m224, 31, 32, ma256
  589     fstd        %fr24, -0x60(%r30)  C low product to  -0x60..-0x59
  590     depd        m288, 31, 32, ma256
  591     ldd     -0x88(%r30), p256d
  592     add     climb, p000a, s000
  593     fstd        %fr25, -0x48(%r30)  C high product to -0x48..-0x41
  594     add,dc      p064a, p064b, s064
  595     ldd     0(rp), r000
  596     add,dc      p128b, p128c, s128
  597     fstd        %fr26, -0x20(%r30)  C low product to  -0x20..-0x19
  598     add,dc      p192c, p192d, s192
  599     ldd     8(rp), r064
  600     add,dc      p256d, %r0, climb
  601     fstd        %fr27, -0x88(%r30)  C high product to -0x88..-0x81
  602     ldd     16(rp), r128
  603     add     ma000, s000, s000   C accum mid 0
  604     ldd     24(rp), r192
  605     add,dc      ma064, s064, s064   C accum mid 1
  606     add,dc      ma128, s128, s128   C accum mid 2
  607     add,dc      ma192, s192, s192   C accum mid 3
  608     add,dc      ma256, climb, climb
  609     add     r000, s000, s000    C accum rlimb 0
  610     add,dc      r064, s064, s064    C accum rlimb 1
  611     add,dc      r128, s128, s128    C accum rlimb 2
  612     std     s000, 0(rp)
  613     add,dc      r192, s192, s192    C accum rlimb 3
  614     add,dc      %r0, climb, climb
  615     std     s064, 8(rp)
  616     ldd     -0x78(%r30), p032a1
  617     std     s128, 16(rp)
  618     ldd     -0x70(%r30), p032a2
  619     std     s192, 24(rp)
  620     ldd     -0x38(%r30), p096b1
  621     ldd     -0x30(%r30), p096b2
  622     ldd     -0x58(%r30), p160c1
  623     ldd     -0x50(%r30), p160c2
  624     ldd     -0x18(%r30), p224d1
  625     ldd     -0x10(%r30), p224d2
  626     ldo     32(rp), rp          C advance rp past the 4 limbs just stored
  627 
  628 LDEF(end1)                  C wind-down 2: finish the last group from products already on the stack
  629     add     p032a1, p032a2, m032
  630     ldd     -0x80(%r30), p000a
  631     add,dc      p096b1, p096b2, m096
  632     add,dc      p160c1, p160c2, m160
  633     ldd     -0x68(%r30), p064a
  634     add,dc      p224d1, p224d2, m224
  635     add,dc      %r0, %r0, m288
  636     ldd     -0x40(%r30), p064b
  637     depd,z      m032, 31, 32, ma000
  638     ldd     -0x28(%r30), p128b
  639     extrd,u     m032, 31, 32, ma064
  640     depd        m096, 31, 32, ma064
  641     ldd     -0x60(%r30), p128c
  642     extrd,u     m096, 31, 32, ma128
  643     depd        m160, 31, 32, ma128
  644     ldd     -0x48(%r30), p192c
  645     extrd,u     m160, 31, 32, ma192
  646     depd        m224, 31, 32, ma192
  647     ldd     -0x20(%r30), p192d
  648     extrd,u     m224, 31, 32, ma256
  649     depd        m288, 31, 32, ma256
  650     ldd     -0x88(%r30), p256d
  651     add     climb, p000a, s000
  652     add,dc      p064a, p064b, s064
  653     ldd     0(rp), r000
  654     add,dc      p128b, p128c, s128
  655     add,dc      p192c, p192d, s192
  656     ldd     8(rp), r064
  657     add,dc      p256d, %r0, climb
  658     ldd     16(rp), r128
  659     add     ma000, s000, s000   C accum mid 0
  660     ldd     24(rp), r192
  661     add,dc      ma064, s064, s064   C accum mid 1
  662     add,dc      ma128, s128, s128   C accum mid 2
  663     add,dc      ma192, s192, s192   C accum mid 3
  664     add,dc      ma256, climb, climb
  665     add     r000, s000, s000    C accum rlimb 0
  666     add,dc      r064, s064, s064    C accum rlimb 1
  667     add,dc      r128, s128, s128    C accum rlimb 2
  668     std     s000, 0(rp)
  669     add,dc      r192, s192, s192    C accum rlimb 3
  670     add,dc      %r0, climb, climb   C climb now holds the final carry limb
  671     std     s064, 8(rp)
  672     std     s128, 16(rp)
  673     std     s192, 24(rp)
  674 
  675     ldd     -0xb0(%r30), %r13   C restore r6-r13 saved at BIG
  676     ldd     -0xb8(%r30), %r12
  677     ldd     -0xc0(%r30), %r11
  678     ldd     -0xc8(%r30), %r10
  679     ldd     -0xd0(%r30), %r9
  680     ldd     -0xd8(%r30), %r8
  681     ldd     -0xe0(%r30), %r7
  682     ldd     -0xe8(%r30), %r6
  683 LDEF(done)                  C common exit: return climb, restore r3-r5, release the frame
  684 ifdef(`HAVE_ABI_2_0w',
  685 `   copy        climb, %r28     C return the carry limb in r28
  686 ',` extrd,u     climb, 63, 32, %r29     C low 32 bits of the carry limb to r29
  687     extrd,u     climb, 31, 32, %r28     C high 32 bits of the carry limb to r28
  688 ')
  689     ldd     -0xf0(%r30), %r5    C restore r5
  690     ldd     -0xf8(%r30), %r4    C restore r4
  691     bve     (%r2)               C return through r2
  692     ldd,mb      -0x100(%r30), %r3   C step r30 back by 0x100, then reload r3 from there
  693 EPILOGUE(mpn_addmul_1)