"Fossies" - the Fresh Open Source Software Archive

Member "gmp-6.2.1/mpn/x86/k7/addlsh1_n.asm" (14 Nov 2020, 4748 Bytes) of package /linux/misc/gmp-6.2.1.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Generic Assembler source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file. See also the last Fossies "Diffs" side-by-side code changes report for "addlsh1_n.asm": 6.1.2_vs_6.2.0.

    1 dnl  AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
    2 
    3 dnl  Copyright 2011 Free Software Foundation, Inc.
    4 
    5 dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
    6 
    7 dnl  This file is part of the GNU MP Library.
    8 dnl
    9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   10 dnl  it under the terms of either:
   11 dnl
   12 dnl    * the GNU Lesser General Public License as published by the Free
   13 dnl      Software Foundation; either version 3 of the License, or (at your
   14 dnl      option) any later version.
   15 dnl
   16 dnl  or
   17 dnl
   18 dnl    * the GNU General Public License as published by the Free Software
   19 dnl      Foundation; either version 2 of the License, or (at your option) any
   20 dnl      later version.
   21 dnl
   22 dnl  or both in parallel, as here.
   23 dnl
   24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
   25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   27 dnl  for more details.
   28 dnl
   29 dnl  You should have received copies of the GNU General Public License and the
   30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
   31 dnl  see https://www.gnu.org/licenses/.
   32 
   33 include(`../config.m4')
   34 
   35 C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns.
   36 C The inner loop is 2*3-way unrolled, which is the best we can do with the
   37 C available registers.  It seems tricky to use the same structure for
   38 C rsblsh1_n, since we cannot feed carry between operations there.
   39 
   40 C               cycles/limb
   41 C P5
   42 C P6 model 0-8,10-12
   43 C P6 model 9  (Banias)
   44 C P6 model 13 (Dothan)       5.4    (worse than add_n + lshift)
   45 C P4 model 0  (Willamette)
   46 C P4 model 1  (?)
   47 C P4 model 2  (Northwood)
   48 C P4 model 3  (Prescott)
   49 C P4 model 4  (Nocona)
   50 C Intel Atom             6
   51 C AMD K6             ?
   52 C AMD K7             2.5
   53 C AMD K8
   54 
   55 C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
   56 C processors.  It uses 2*3-way unrolling, for good reasons.  Unfortunately,
   57 C that means we need an initial magic multiply.
   58 C
   59 C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern.  We
   60 C cannot do rsblsh1_n since we feed carry from the shift blocks to the
   61 C add/subtract blocks, which is right for addition but reversed for
   62 C subtraction.  We could perhaps do sublsh1_n, with some extra move insns,
   63 C without losing any time, since we are limited not by instruction issue but
   64 C by carry recurrency latency.
   65 C
   66 C Breaking carry recurrency might be a good idea.  We would then need separate
   67 C registers for the shift carry and add/subtract carry, which in turn would
   68 C force us to 2*2-way unrolling.
   69 
   70 defframe(PARAM_SIZE,    16)
   71 defframe(PARAM_DBLD,    12)
   72 defframe(PARAM_SRC,  8)
   73 defframe(PARAM_DST,  4)
   74 
   75 dnl  re-use parameter space: once PARAM_DST has been loaded into rp, its
      dnl  stack slot holds the (negative) count of 6-limb blocks still to do.
   76 define(VAR_COUNT,`PARAM_DST')
   78 
      C mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1), returning the carry-out.
      C
      C Stack arguments, by defframe offset: PARAM_DST (loaded into rp),
      C PARAM_SRC (up), PARAM_DBLD (vp, the operand that gets doubled) and
      C PARAM_SIZE (number of limbs).
      C
      C Returns in eax the bit shifted out of the top limb plus the carry out of
      C the addition, i.e. 0, 1 or 2.
      C
      C ebx, edi, esi and ebp are saved and restored; eax, ecx, edx and the flags
      C are used as scratch.  The PARAM_DST stack slot is overwritten, since it
      C is re-used as VAR_COUNT.
      C
      C Carry handling: two carry chains share the single CF flag -- the shift
      C chain (adc reg,reg doubles a limb) and the add chain (adc mem,reg).  While
      C one chain owns CF, the bit of the other chain is parked in edx and swapped
      C in with rcr %edx / adc %edx,%edx.  Across the loop branches CF holds the
      C shift carry and edx bit 0 the add carry.  With CPU_P6 defined, both bits
      C are parked in edx instead (bit 0 = shift carry, bit 1 = add carry) and CF
      C is reloaded with shr at the top of each iteration.
   79 ASM_START()
   80     TEXT
   81     ALIGN(8)
   82 PROLOGUE(mpn_addlsh1_n)
   83 deflit(`FRAME',0)
   84 
   85 define(`rp',  `%edi')
   86 define(`up',  `%esi')
   87 define(`vp',  `%ebp')
   88 
   89     mov $0x2aaaaaab, %eax   C ceil(2^32/6), reciprocal for the mul below
   90 
   91     push    %ebx            FRAME_pushl()
   92     mov PARAM_SIZE, %ebx    C size
   93 
   94     push    rp          FRAME_pushl()
   95     mov PARAM_DST, rp
   96 
   97     mul %ebx            C edx = high half of the product = size\6
   98 
   99     push    up          FRAME_pushl()
  100     mov PARAM_SRC, up
  101 
  102     not %edx            C count = -(size\6)-1
  103     mov %edx, VAR_COUNT     C PARAM_DST was already loaded into rp above
  104 
  105     push    vp          FRAME_pushl()
  106     mov PARAM_DBLD, vp
  107 
  108     lea 3(%edx,%edx,2), %ecx    C count*3+3 = -(size\6)*3
  109     xor %edx, %edx      C no carry saved yet
  110     lea (%ebx,%ecx,2), %ebx C size + (count*3+3)*2 = size % 6
  111     or  %ebx, %ebx      C test the remainder; or also clears CF
  112     jz  L(exact)
  113 
      C Remainder loop: process the size % 6 leading limbs one at a time, with
      C ebx as the down-counter.
  114 L(oop):
  115 ifdef(`CPU_P6',`
  116     shr %edx ')         C restore 2nd saved carry bit
  117     mov (vp), %eax
  118     adc %eax, %eax      C eax = 2*limb + shift carry in, CF = shift carry out
  119     rcr %edx            C restore 1st saved carry bit
  120     lea 4(vp), vp
  121     adc (up), %eax
  122     lea 4(up), up
  123     adc %edx, %edx      C save a carry bit in edx
  124 ifdef(`CPU_P6',`
  125     adc %edx, %edx ')       C save another carry bit in edx
  126     dec %ebx            C dec leaves CF untouched
  127     mov %eax, (rp)
  128     lea 4(rp), rp
  129     jnz L(oop)
  131 L(exact):
  132     incl    VAR_COUNT   C inc leaves CF untouched
  133     jz  L(end)          C no full 6-limb block to process
  134 
      C Main loop: 6 limbs per iteration, as two groups of three.  Each group is
      C doubled first (shift chain), then added to up[] (add chain); rcr and
      C adc %edx,%edx swap which chain currently owns CF.
  135     ALIGN(16)
  136 L(top):
  137 ifdef(`CPU_P6',`
  138     shr %edx ')         C restore 2nd saved carry bit
  139     mov (vp), %eax
  140     adc %eax, %eax
  141     mov 4(vp), %ebx
  142     adc %ebx, %ebx
  143     mov 8(vp), %ecx
  144     adc %ecx, %ecx
  145 
  146     rcr %edx            C restore 1st saved carry bit
  147 
  148     adc (up), %eax
  149     mov %eax, (rp)
  150     adc 4(up), %ebx
  151     mov %ebx, 4(rp)
  152     adc 8(up), %ecx
  153     mov %ecx, 8(rp)
  154 
  155     mov 12(vp), %eax
  156     adc %eax, %eax
  157     mov 16(vp), %ebx
  158     adc %ebx, %ebx
  159     mov 20(vp), %ecx
  160     adc %ecx, %ecx
  161 
  162     lea 24(vp), vp
  163     adc %edx, %edx      C save a carry bit in edx
  164 
  165     adc 12(up), %eax
  166     mov %eax, 12(rp)
  167     adc 16(up), %ebx
  168     mov %ebx, 16(rp)
  169     adc 20(up), %ecx
  170 
  171     lea 24(up), up
  172 
  173 ifdef(`CPU_P6',`
  174     adc %edx, %edx ')       C save another carry bit in edx
  175     mov %ecx, 20(rp)
  176     incl    VAR_COUNT   C block counter runs up towards zero; CF untouched
  177     lea 24(rp), rp
  178     jne L(top)
  179 
  180 L(end):
  181     pop vp          FRAME_popl()
  182     pop up          FRAME_popl()
  183 
      C Return value = add carry + shift carry.  The pops above leave the flags
      C alone, so in the non-P6 case CF still holds the second carry bit here.
  184 ifdef(`CPU_P6',`
  185     xor %eax, %eax
  186     shr $1, %edx
  187     adc %edx, %eax
  188 ',`
  189     adc $0, %edx
  190     mov %edx, %eax
  191 ')
  192     pop rp          FRAME_popl()
  193     pop %ebx            FRAME_popl()
  194     ret
  195 EPILOGUE()
  196 ASM_END()