"Fossies" - the Fresh Open Source Software Archive

Member "gmp-6.2.1/mpn/x86_64/coreibwl/addmul_1.asm" (14 Nov 2020, 4588 Bytes) of package /linux/misc/gmp-6.2.1.tar.xz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) Generic Assembler source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here. See also the latest Fossies "Diffs" side-by-side code changes report for "addmul_1.asm": 6.1.2_vs_6.2.0.

    1 dnl  AMD64 mpn_addmul_1 optimised for Intel Broadwell.
    2 
    3 dnl  Copyright 2015, 2017 Free Software Foundation, Inc.
    4 
    5 dnl  This file is part of the GNU MP Library.
    6 dnl
    7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    8 dnl  it under the terms of either:
    9 dnl
   10 dnl    * the GNU Lesser General Public License as published by the Free
   11 dnl      Software Foundation; either version 3 of the License, or (at your
   12 dnl      option) any later version.
   13 dnl
   14 dnl  or
   15 dnl
   16 dnl    * the GNU General Public License as published by the Free Software
   17 dnl      Foundation; either version 2 of the License, or (at your option) any
   18 dnl      later version.
   19 dnl
   20 dnl  or both in parallel, as here.
   21 dnl
   22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
   23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   25 dnl  for more details.
   26 dnl
   27 dnl  You should have received copies of the GNU General Public License and the
   28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
   29 dnl  see https://www.gnu.org/licenses/.
   30 
   31 include(`../config.m4')
   32 
   33 C        cycles/limb
   34 C AMD K8,K9 n/a
   35 C AMD K10   n/a
   36 C AMD bd1   n/a
   37 C AMD bd2   n/a
   38 C AMD bd3   n/a
   39 C AMD bd4    ?
   40 C AMD zen    ?
   41 C AMD bt1   n/a
   42 C AMD bt2   n/a
   43 C Intel P4  n/a
   44 C Intel PNR n/a
   45 C Intel NHM n/a
   46 C Intel SBR n/a
   47 C Intel IBR n/a
   48 C Intel HWL n/a
   49 C Intel BWL  1.67    1.74
   50 C Intel SKL  1.63    1.71
   51 C Intel atom    n/a
   52 C Intel SLM n/a
   53 C VIA nano  n/a
   54 
   55 C The loop of this code is the result of running a code generation and
   56 C optimisation tool suite written by David Harvey and Torbjorn Granlund.
   57 
   58 C TODO
   59 C  * Put an initial mulx before switching, targeting some free registers.
   60 C  * Tune feed-in code.
   61 C  * Trim nop execution after L(f2).
   62 C  * For DOS64, fix nop execution.
   63 
   64 define(`rp',      `%rdi')   C rcx
   65 define(`up',      `%rsi')   C rdx
   66 define(`n_param', `%rdx')   C r8
   67 define(`v0_param',`%rcx')   C r9
      C The four operands above are named by the registers the code reads them
      C from.  The note at the end of each line is presumably the DOS64 argument
      C register for the same operand -- TODO confirm against FUNC_ENTRY.
   68 
      C The working count n reuses the register that v0_param arrives in, so the
      C entry code copies v0_param elsewhere before it writes n.
   69 define(`n',       `%rcx')
   70 
   71 ABI_SUPPORT(DOS64)
   72 ABI_SUPPORT(STD64)
   73 
   74 dnl IFDOS(` define(`up', ``%rsi'')  ') dnl
   75 dnl IFDOS(` define(`rp', ``%rcx'')  ') dnl
   76 dnl IFDOS(` define(`vl', ``%r9'')   ') dnl
   77 dnl IFDOS(` define(`r9', ``rdi'')   ') dnl
   78 dnl IFDOS(` define(`n',  ``%r8'')   ') dnl
   79 dnl IFDOS(` define(`r8', ``r11'')   ') dnl
   80 
   81 ASM_START()
   82     TEXT
   83     ALIGN(32)
      C mpn_addmul_1: multiply each of the n limbs at up by the single limb v0
      C and add the product into the limb at the same index of rp, in place.
      C The value left in rax on return is the carry-out limb: the last high
      C product half plus the two pending carry flags.
      C
      C Register use once the entry sequence has run:
      C   rdx       v0, read by mulx as its implicit source operand
      C   rcx (n)   n / 8, the number of loop passes still to run
      C   r8        n mod 8 for the dispatch, afterwards a high product half
      C   r10       v0 scratch, then jump table base, afterwards a low product half
      C   r10,r8    low,high halves of one product
      C   r9,rax    low,high halves of the neighbouring product
      C   CF        carry chain joining each high half to the next low half (adcx)
      C   OF        carry chain of the additions of the rp limbs (adox)
      C Pointer and count updates use lea and the exit test uses jrcxz, so
      C neither carry chain is disturbed inside the loop.
      C
      C n = 0 is not handled: L(f0) would take the count below zero.  Callers
      C are presumably required to pass n >= 1 -- TODO confirm.
   84 PROLOGUE(mpn_addmul_1)
   85     FUNC_ENTRY(4)       C defined outside this file; presumably the DOS64 argument shuffle -- TODO confirm
   86 
   87     mov v0_param, %r10      C park v0, its register (rcx) is about to become n
   88     mov n_param, n
   89     mov R32(n_param), R32(%r8)      C copy of n, only bits 0-2 survive the and below
   90     shr $3, n               C n = n / 8
   91     and $7, R32(%r8)        C r8 = n mod 8; clear OF, CF as side-effect
   92     mov %r10, %rdx          C rdx = v0 for every mulx below
   93     lea L(tab)(%rip), %r10  C r10 = jump table base
      C Dispatch on n mod 8.  The PIC variant loads a 4-byte entry, sign-extends
      C it and adds the table base; the non-PIC variant jumps through an 8-byte
      C entry directly.
   94 ifdef(`PIC',
   95 `   movslq  (%r10,%r8,4), %r8
   96     lea (%r8, %r10), %r10
   97     jmp *%r10
   98 ',`
   99     jmp *(%r10,%r8,8)
  100 ')
  101     JUMPTABSECT
  102     ALIGN(8)
      C One entry per value of n mod 8, in index order.  JMPENT is defined
      C outside this file; presumably it emits the entry format that matches the
      C dispatch variant above -- TODO confirm.
  103 L(tab): JMPENT( L(f0), L(tab))
  104     JMPENT( L(f1), L(tab))
  105     JMPENT( L(f2), L(tab))
  106     JMPENT( L(f3), L(tab))
  107     JMPENT( L(f4), L(tab))
  108     JMPENT( L(f5), L(tab))
  109     JMPENT( L(f6), L(tab))
  110     JMPENT( L(f7), L(tab))
  111     TEXT
  112 
      C Feed-in code.  L(fK) is reached when n mod 8 = K.  Each one forms the
      C product of the first limb, biases up and rp where needed so that the
      C fixed displacements in the loop address the right limbs, and enters the
      C unrolled loop at the matching L(bK).  L(f2) instead forms two products
      C and falls through into L(top).
      C
      C n mod 8 = 0: the first pass consumes a whole group of 8 limbs and the
      C entry at L(b0) is past the decrement at L(b1), so the count is reduced
      C by one here.
  113 L(f0):  mulx(   (up), %r10, %r8)
  114     lea -8(up), up
  115     lea -8(rp), rp
  116     lea -1(n), n
  117     jmp L(b0)
  118 
  119 L(f3):  mulx(   (up), %r9, %rax)
  120     lea 16(up), up
  121     lea -48(rp), rp
  122     jmp L(b3)
  123 
  124 L(f4):  mulx(   (up), %r10, %r8)
  125     lea 24(up), up
  126     lea -40(rp), rp
  127     jmp L(b4)
  128 
  129 L(f5):  mulx(   (up), %r9, %rax)
  130     lea 32(up), up
  131     lea -32(rp), rp
  132     jmp L(b5)
  133 
  134 L(f6):  mulx(   (up), %r10, %r8)
  135     lea 40(up), up
  136     lea -24(rp), rp
  137     jmp L(b6)
  138 
  139 L(f1):  mulx(   (up), %r9, %rax)
  140     jrcxz   L(1)            C count is zero, so n = 1: no loop pass needed
  141     jmp L(b1)
  142 L(1):   add (rp), %r9       C single limb: plain add, CF = carry out
  143     mov %r9, (rp)
  144     adc %rcx, %rax      C rax = high half + CF; relies on rcx = 0
  145     FUNC_EXIT()
  146     ret
  147 
      C Loop exit, reached from L(top) when the count is zero.  r9 is the last
      C low half with the previous high half already added in, rax is the last
      C high half.  Both pending carry flags are folded into rax.
  148 L(end): adox(   (rp), %r9)
  149     mov %r9, (rp)
  150     adox(   %rcx, %rax)     C rax += OF; relies on rcx = 0
  151     adc %rcx, %rax      C rax += CF; relies on rcx = 0
  152     FUNC_EXIT()
  153     ret
  154 
      C Explicit padding ahead of L(f2).  NOTE(review): presumably sized so that
      C the ALIGN(32) before L(top) adds little padding on the L(f2) fall-through
      C path (see the TODO list near the top of the file) -- confirm before
      C changing.
  155 ifdef(`PIC',
  156 `   nop;nop;nop;nop',
  157 `   nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop')
  158 
  159 L(f2):  mulx(   (up), %r10, %r8)        C product of limb 0
  160     lea 8(up), up
  161     lea 8(rp), rp
  162     mulx(   (up), %r9, %rax)        C product of limb 1
  163 
      C Main loop, 8 limbs per pass.  On arrival at L(top), r10 is a low half
      C ready to be added to the limb at -8(rp), r8 is the high half of that
      C same product, and r9,rax is the following product, which has not yet
      C received its carry-in.
  164     ALIGN(32)
  165 L(top): adox(   -8,(rp), %r10)
  166     adcx(   %r8, %r9)
  167     mov %r10, -8(rp)
  168     jrcxz   L(end)          C leave when no pass remains; does not touch CF or OF
  169 L(b1):  mulx(   8,(up), %r10, %r8)
  170     adox(   (rp), %r9)
  171     lea -1(n), n            C count this pass; lea leaves CF and OF alone
  172     mov %r9, (rp)
  173     adcx(   %rax, %r10)
  174 L(b0):  mulx(   16,(up), %r9, %rax)
  175     adcx(   %r8, %r9)
  176     adox(   8,(rp), %r10)
  177     mov %r10, 8(rp)
  178 L(b7):  mulx(   24,(up), %r10, %r8)
  179     lea 64(up), up          C advance up by 8 limbs; the loads below use negative displacements
  180     adcx(   %rax, %r10)
  181     adox(   16,(rp), %r9)
  182     mov %r9, 16(rp)
  183 L(b6):  mulx(   -32,(up), %r9, %rax)
  184     adox(   24,(rp), %r10)
  185     adcx(   %r8, %r9)
  186     mov %r10, 24(rp)
  187 L(b5):  mulx(   -24,(up), %r10, %r8)
  188     adcx(   %rax, %r10)
  189     adox(   32,(rp), %r9)
  190     mov %r9, 32(rp)
  191 L(b4):  mulx(   -16,(up), %r9, %rax)
  192     adox(   40,(rp), %r10)
  193     adcx(   %r8, %r9)
  194     mov %r10, 40(rp)
  195 L(b3):  adox(   48,(rp), %r9)
  196     mulx(   -8,(up), %r10, %r8)
  197     mov %r9, 48(rp)
  198     lea 64(rp), rp          C advance rp by 8 limbs; L(top) and L(end) address relative to the new rp
  199     adcx(   %rax, %r10)
  200     mulx(   (up), %r9, %rax)        C product consumed at L(top) and then L(b1) or L(end)
  201     jmp L(top)
  202 
      C n mod 8 = 7: same feed-in pattern as the others, entering at L(b7).
  203 L(f7):  mulx(   (up), %r9, %rax)
  204     lea -16(up), up
  205     lea -16(rp), rp
  206     jmp L(b7)
  207 EPILOGUE()
  208 ASM_END()