"Fossies" - the Fresh Open Source Software Archive

Member "gmp-6.2.1/mpn/x86_64/mulx/adx/addmul_1.asm" (14 Nov 2020, 3519 Bytes) of package /linux/misc/gmp-6.2.1.tar.xz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) Generic Assembler source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here. See also the last Fossies "Diffs" side-by-side code changes report for "addmul_1.asm": 6.1.2_vs_6.2.0.

    1 dnl  AMD64 mpn_addmul_1 for CPUs with mulx and adx.
    2 
    3 dnl  Contributed to the GNU project by Torbjörn Granlund.
    4 
    5 dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
    6 
    7 dnl  This file is part of the GNU MP Library.
    8 dnl
    9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   10 dnl  it under the terms of either:
   11 dnl
   12 dnl    * the GNU Lesser General Public License as published by the Free
   13 dnl      Software Foundation; either version 3 of the License, or (at your
   14 dnl      option) any later version.
   15 dnl
   16 dnl  or
   17 dnl
   18 dnl    * the GNU General Public License as published by the Free Software
   19 dnl      Foundation; either version 2 of the License, or (at your option) any
   20 dnl      later version.
   21 dnl
   22 dnl  or both in parallel, as here.
   23 dnl
   24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
   25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   27 dnl  for more details.
   28 dnl
   29 dnl  You should have received copies of the GNU General Public License and the
   30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
   31 dnl  see https://www.gnu.org/licenses/.
   32 
   33 include(`../config.m4')
   34 
   35 C        cycles/limb
   36 C AMD K8,K9  -
   37 C AMD K10    -
   38 C AMD bd1    -
   39 C AMD bd2    -
   40 C AMD bd3    -
   41 C AMD bd4    -
   42 C AMD zen    ?
   43 C AMD bt1    -
   44 C AMD bt2    -
   45 C Intel P4   -
   46 C Intel PNR  -
   47 C Intel NHM  -
   48 C Intel SBR  -
   49 C Intel IBR  -
   50 C Intel HWL  -
   51 C Intel BWL  ?
   52 C Intel SKL  ?
   53 C Intel atom     -
   54 C Intel SLM  -
   55 C VIA nano   -
   56 
   57 define(`rp',      `%rdi')   dnl rcx
   58 define(`up',      `%rsi')   dnl rdx
   59 define(`n_param', `%rdx')   dnl r8
   60 define(`v0_param',`%rcx')   dnl r9
   61 C The four names above map the arguments to rdi, rsi, rdx and rcx, in that
      C order.  The register named in each trailing note is presumably where the
      C same argument arrives under the Windows x64 convention - TODO confirm.
      C
      C The two names below are the working registers for the count and the
      C multiplier.  They are the same two registers as n_param and v0_param
      C with the roles exchanged, so the code must swap rcx and rdx before it
      C uses n or v0.  mulx takes its implicit multiplicand from rdx and jrcxz
      C tests rcx, which is why the working copies live there.
   62 define(`n',       `%rcx')   dnl
   63 define(`v0',      `%rdx')   dnl
   64 
   65 C Testing mechanism for running this on older AMD64 processors
      C
      C The code below writes adox, adcx and mulx in macro-call form.  When
      C FAKE_MULXADX is 1, missing-call.m4 is included and the three macros are
      C not defined here (that file is not visible from this one; presumably it
      C supplies replacements that run on CPUs without these instructions -
      C TODO confirm).  Otherwise each macro simply expands to the instruction
      C of the same name with its operands passed through unchanged.
   66 ifelse(FAKE_MULXADX,1,`
   67   include(CONFIG_TOP_SRCDIR`/mpn/x86_64/missing-call.m4')
   68 ',`
   69   define(`adox',    ``adox' $1, $2')
   70   define(`adcx',    ``adcx' $1, $2')
   71   define(`mulx',    ``mulx' $1, $2, $3')
   72 ')
   73 
   74 ASM_START()
   75     TEXT
   76     ALIGN(16)
      C mpn_addmul_1 -- multiply a limb vector by a single limb and add the
      C products into a destination vector, in place.
      C
      C Inputs, using the register names set up by the macros earlier in this file:
      C   rp        destination vector, read and written
      C   up        source vector, read only
      C   n_param   number of 64-bit limbs
      C   v0_param  the single-limb multiplier
      C Result: every destination limb has the matching product limb and the
      C   incoming carries added to it; the final carry-out limb is returned in rax.
      C Registers: rbx, r12 and r13 are pushed on entry and popped before ret.
      C   rax, rcx, rdx, rsi, rdi, r8-r11 and the flags are overwritten.
      C
      C Method: mulx produces each 128-bit product as a low:high register pair,
      C rotating through r9:r8, r11:r10, r13:r12 and rbx:rax.  Two independent
      C carry chains run side by side: adcx (carry in CF) adds the high half of
      C one product into the low half of the next, and adox (carry in OF) adds
      C the destination limb from memory.  mulx, lea, mov, jrcxz and jmp leave
      C the flags alone, so both carries survive from one limb to the next.
      C
      C The loop handles four limbs per pass and has four entry points,
      C lo0 to lo3.  The prologue selects one from the count modulo 4.
      C
      C NOTE(review): there is no check for a zero count.  A count of 0 takes
      C the b0 path and starts processing limbs.  Callers presumably guarantee
      C a count of at least 1 - TODO confirm.
      C NOTE(review): R32 and R8 are defined outside this file; presumably they
      C give the 32-bit and 8-bit names of a register - TODO confirm.
   77 PROLOGUE(mpn_addmul_1)
   78     mov (up), %r8         C value is never read (mulx rewrites r8 first); presumably an early touch of the source - TODO confirm
   79 
   80     push    %rbx          C rbx, r12 and r13 hold products below and are restored before ret
   81     push    %r12
   82     push    %r13
   83 
   84     lea (up,n_param,8), up        C up = address just past the last source limb
   85     lea -16(rp,n_param,8), rp     C rp = end of the destination minus two limbs
   86     mov R32(n_param), R32(%rax)   C copy of the count, used only for the modulo 4 dispatch
   87     xchg    v0_param, v0        C swap rcx and rdx: multiplier to rdx for mulx, count to rcx for jrcxz.  FIXME: is this insn fast?
   88 
   89     neg n                 C n = -count: a negative index that climbs to zero
   90 
   91     and $3, R8(%rax)      C count modulo 4; and also clears CF and OF for the two carry chains
   92     jz  L(b0)             C remainder 0
   93     cmp $2, R8(%rax)
   94     jl  L(b1)             C remainder 1
   95     jz  L(b2)             C remainder 2; remainder 3 falls through
   96 
   97 L(b3):  mulx(   (up,n,8), %r11, %r10)      C remainder 3: first three products; cmp left CF and OF clear (3 - 2)
   98     mulx(   8(up,n,8), %r13, %r12)
   99     mulx(   16(up,n,8), %rbx, %rax)
  100     dec n                 C bias n so lo3 addresses the first limb; dec does not change CF and sets OF only on signed overflow
  101     jmp L(lo3)
  102 
  103 L(b0):  mulx(   (up,n,8), %r9, %r8)        C remainder 0: first three products, then enter at lo0
  104     mulx(   8(up,n,8), %r11, %r10)
  105     mulx(   16(up,n,8), %r13, %r12)
  106     jmp L(lo0)
  107 
  108 L(b2):  mulx(   (up,n,8), %r13, %r12)      C remainder 2: first two products; cmp left CF and OF clear (2 - 2)
  109     mulx(   8(up,n,8), %rbx, %rax)
  110     lea 2(n), n           C n += 2 without touching the flags
  111     jrcxz   L(wd2)        C count was exactly 2: go straight to the wind-down
  112 L(gt2): mulx(   (up,n,8), %r9, %r8)        C third product; label gt2 is not the target of any jump visible here
  113     jmp L(lo2)
  114 
  115 L(b1):  and R8(%rax), R8(%rax)     C remainder 1: cmp set CF here (1 - 2 borrows), so clear CF and OF again
  116     mulx(   (up,n,8), %rbx, %rax)
  117     lea 1(n), n           C n += 1 without touching the flags
  118     jrcxz   L(wd1)        C count was exactly 1: only the final limb remains
  119     mulx(   (up,n,8), %r9, %r8)
  120     mulx(   8(up,n,8), %r11, %r10)
  121     jmp L(lo1)
  122 
  123 L(end): adcx(   %r10, %r13)     C wind-down for the last three limbs; n is zero here, so plain offsets from rp are used
  124     mov %r11, -8(rp)
  125 L(wd2): adox(   (rp), %r13)     C also entered directly when the count is 2
  126     adcx(   %r12, %rbx)
  127     mov %r13, (rp)
  128 L(wd1): adox(   8(rp), %rbx)    C also entered directly when the count is 1
  129     adcx(   %rcx, %rax)         C rcx is zero on every path here, so this adds only CF to the top high half
  130     adox(   %rcx, %rax)         C likewise adds only OF; rax now holds the carry-out limb that is returned
  131     mov %rbx, 8(rp)             C store the last destination limb
  132     pop %r13                    C restore in reverse order of the pushes
  133     pop %r12
  134     pop %rbx
  135     ret
  136 
  137 L(top): jrcxz   L(end)          C n reached zero: leave the loop
  138     mulx(   (up,n,8), %r9, %r8)
  139     adcx(   %r10, %r13)         C CF chain: high half of the previous product into this low half
  140     mov %r11, -8(rp,n,8)        C store the limb whose sum was completed at lo3
  141 L(lo2): adox(   (rp,n,8), %r13) C OF chain: add the destination limb
  142     mulx(   8(up,n,8), %r11, %r10)
  143     adcx(   %r12, %rbx)
  144     mov %r13, (rp,n,8)
  145 L(lo1): adox(   8(rp,n,8), %rbx)
  146     mulx(   16(up,n,8), %r13, %r12)
  147     adcx(   %rax, %r9)
  148     mov %rbx, 8(rp,n,8)
  149 L(lo0): adox(   16(rp,n,8), %r9)
  150     mulx(   24(up,n,8), %rbx, %rax)
  151     adcx(   %r8, %r11)
  152     mov %r9, 16(rp,n,8)
  153 L(lo3): adox(   24(rp,n,8), %r11)
  154     lea 4(n), n                 C advance four limbs; lea keeps both carry flags intact
  155     jmp L(top)
  156 EPILOGUE()
  157 ASM_END()