"Fossies" - the Fresh Open Source Software Archive

Member "gmp-6.2.1/mpn/x86/pentium4/sse2/addmul_1.asm" (14 Nov 2020, 4032 Bytes) of package /linux/misc/gmp-6.2.1.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Generic Assembler source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here.

    1 dnl  mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
    2 
    3 dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
    4 
    5 dnl  This file is part of the GNU MP Library.
    6 dnl
    7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    8 dnl  it under the terms of either:
    9 dnl
   10 dnl    * the GNU Lesser General Public License as published by the Free
   11 dnl      Software Foundation; either version 3 of the License, or (at your
   12 dnl      option) any later version.
   13 dnl
   14 dnl  or
   15 dnl
   16 dnl    * the GNU General Public License as published by the Free Software
   17 dnl      Foundation; either version 2 of the License, or (at your option) any
   18 dnl      later version.
   19 dnl
   20 dnl  or both in parallel, as here.
   21 dnl
   22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
   23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   25 dnl  for more details.
   26 dnl
   27 dnl  You should have received copies of the GNU General Public License and the
   28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
   29 dnl  see https://www.gnu.org/licenses/.
   30 
   31 
   32 include(`../config.m4')
   33 
   34 C               cycles/limb
   35 C P6 model 0-8,10-12        -
   36 C P6 model 9   (Banias)     5.24
   37 C P6 model 13  (Dothan)     5.24
   38 C P4 model 0-1 (Willamette) 5
   39 C P4 model 2   (Northwood)  5
   40 C P4 model 3-4 (Prescott)   5
   41 
   42 C TODO:
   43 C  * Tweak eax/edx offsets in loop so as to save some lea's
   44 C  * Perhaps software pipeline small-case code
   45 
   46 C INPUT PARAMETERS
   47 C rp        sp + 4
   48 C up        sp + 8
   49 C n     sp + 12
   50 C v0        sp + 16
   51 
   52     TEXT
   53     ALIGN(16)
   54 PROLOGUE(mpn_addmul_1)          C {rp,n} += {up,n} * v0; the carry-out limb is returned in eax
   55     pxor    %mm6, %mm6          C mm6 = running carry, zero on this entry
   56 L(ent): mov 4(%esp), %edx       C edx = rp; mpn_addmul_1c joins here with mm6 = its carry-in
   57     mov 8(%esp), %eax           C eax = up
   58     mov 12(%esp), %ecx          C ecx = n
   59     movd    16(%esp), %mm7      C mm7 = v0, zero-extended to 64 bits
   60     cmp $4, %ecx
   61     jnc L(big)                  C n >= 4 (unsigned compare): take the 4-way unrolled path
   62 C n < 4: plain loop, one limb per pass.  NOTE(review): n = 0 is not tested for; presumably callers guarantee n >= 1.
   63 L(lp0): movd    (%eax), %mm0    C mm0 = next up limb
   64     lea 4(%eax), %eax           C up++
   65     movd    (%edx), %mm4        C mm4 = next rp limb
   66     lea 4(%edx), %edx           C rp++
   67     pmuludq %mm7, %mm0          C mm0 = up limb * v0, full 64-bit product
   68     paddq   %mm0, %mm4          C mm4 = rp limb + product
   69     paddq   %mm4, %mm6          C mm6 = carry + rp limb + product; carry < 2^32, so this fits in 64 bits
   70     movd    %mm6, -4(%edx)      C store low 32 bits back to the rp limb (rp was already advanced)
   71     psrlq   $32, %mm6           C high 32 bits become the carry for the next limb
   72     dec %ecx
   73     jnz L(lp0)
   74     movd    %mm6, %eax          C return value = final carry limb
   75     emms                        C leave MMX state empty before returning
   76     ret
   77 C n >= 4: branch on n mod 4 to a feed-in block that primes the pipeline and enters the unrolled loop at the matching stage.
   78 L(big): and $3, %ecx            C ecx = n mod 4
   79     je  L(0)                    C n mod 4 == 0
   80     cmp $2, %ecx
   81     jc  L(1)                    C n mod 4 == 1
   82     je  L(2)                    C n mod 4 == 2 (flags are still those of the cmp)
   83     jmp L(3)            C FIXME: one case should fall through
   84 C Feed-in for n mod 4 == 0; joins the loop at L(00), which stores to 12(%edx).
   85 L(0):   movd    (%eax), %mm3    C mm3 = up[0]
   86     sub 12(%esp), %ecx      C loop count: ecx = (n mod 4) - n, stepped by 4 up to zero
   87     lea -16(%eax), %eax         C bias up so that the offsets used from L(00) on address the right limbs
   88     lea -12(%edx), %edx         C bias rp likewise: 12(%edx) is now rp[0]
   89     pmuludq %mm7, %mm3          C mm3 = up[0] * v0
   90     movd    20(%eax), %mm0      C mm0 = up[1]
   91     movd    12(%edx), %mm5      C mm5 = rp[0]
   92     pmuludq %mm7, %mm0          C mm0 = up[1] * v0
   93     movd    24(%eax), %mm1      C mm1 = up[2], multiplied at L(00)
   94     paddq   %mm3, %mm5          C mm5 = rp[0] + up[0] * v0
   95     movd    16(%edx), %mm4      C mm4 = rp[1]
   96     jmp L(00)
   97 C Feed-in for n mod 4 == 1; joins the loop at L(01), which stores to 8(%edx).
   98 L(1):   movd    (%eax), %mm2    C mm2 = up[0]
   99     sub 12(%esp), %ecx          C ecx = (n mod 4) - n, a negative multiple of 4
  100     lea -12(%eax), %eax         C bias up for entry at L(01)
  101     lea -8(%edx), %edx          C bias rp: 8(%edx) is now rp[0]
  102     movd    8(%edx), %mm4       C mm4 = rp[0]
  103     pmuludq %mm7, %mm2          C mm2 = up[0] * v0
  104     movd    16(%eax), %mm3      C mm3 = up[1]
  105     pmuludq %mm7, %mm3          C mm3 = up[1] * v0
  106     movd    20(%eax), %mm0      C mm0 = up[2], multiplied at L(01)
  107     paddq   %mm2, %mm4          C mm4 = rp[0] + up[0] * v0
  108     movd    12(%edx), %mm5      C mm5 = rp[1]
  109     jmp L(01)
  110 C Feed-in for n mod 4 == 2; joins the loop at L(10), which stores to 4(%edx).
  111 L(2):   movd    (%eax), %mm1    C mm1 = up[0]
  112     sub 12(%esp), %ecx          C ecx = (n mod 4) - n, a negative multiple of 4
  113     lea -8(%eax), %eax          C bias up for entry at L(10)
  114     lea -4(%edx), %edx          C bias rp: 4(%edx) is now rp[0]
  115     pmuludq %mm7, %mm1          C mm1 = up[0] * v0
  116     movd    12(%eax), %mm2      C mm2 = up[1]
  117     movd    4(%edx), %mm5       C mm5 = rp[0]
  118     pmuludq %mm7, %mm2          C mm2 = up[1] * v0
  119     movd    16(%eax), %mm3      C mm3 = up[2], multiplied at L(10)
  120     paddq   %mm1, %mm5          C mm5 = rp[0] + up[0] * v0
  121     movd    8(%edx), %mm4       C mm4 = rp[1]
  122     jmp L(10)
  123 C Feed-in for n mod 4 == 3; falls through into L(top), which stores to 0(%edx).
  124 L(3):   movd    (%eax), %mm0    C mm0 = up[0]
  125     sub 12(%esp), %ecx          C ecx = (n mod 4) - n, a negative multiple of 4
  126     lea -4(%eax), %eax          C bias up for entry at L(top); rp needs no bias here
  127     pmuludq %mm7, %mm0          C mm0 = up[0] * v0
  128     movd    8(%eax), %mm1       C mm1 = up[1]
  129     movd    (%edx), %mm4        C mm4 = rp[0]
  130     pmuludq %mm7, %mm1          C mm1 = up[1] * v0
  131     movd    12(%eax), %mm2      C mm2 = up[2], multiplied at L(top)
  132     paddq   %mm0, %mm4          C mm4 = rp[0] + up[0] * v0
  133     movd    4(%edx), %mm5       C mm5 = rp[1]
  134 C Main loop, 4 limbs per pass: products rotate through mm0-mm3, rp limb + product sums alternate between mm4 and mm5, mm6 is the carry.
  135     ALIGN(16)
  136 L(top): pmuludq %mm7, %mm2      C multiply the up limb two ahead of the limb stored in this stage
  137     paddq   %mm4, %mm6          C carry += rp limb + product for the limb stored in this stage
  138     movd    16(%eax), %mm3      C load the up limb three ahead
  139     paddq   %mm1, %mm5          C form rp limb + product for the next stage
  140     movd    8(%edx), %mm4       C load the rp limb two ahead
  141     movd    %mm6, 0(%edx)       C store the low 32 bits as the result limb
  142     psrlq   $32, %mm6           C keep the high 32 bits as carry
  143 L(10):  pmuludq %mm7, %mm3      C same stage, one limb further on; entry point for n mod 4 == 2
  144     paddq   %mm5, %mm6
  145     movd    20(%eax), %mm0
  146     paddq   %mm2, %mm4
  147     movd    12(%edx), %mm5
  148     movd    %mm6, 4(%edx)
  149     psrlq   $32, %mm6
  150 L(01):  pmuludq %mm7, %mm0      C same stage again; entry point for n mod 4 == 1
  151     paddq   %mm4, %mm6
  152     movd    24(%eax), %mm1
  153     paddq   %mm3, %mm5
  154     movd    16(%edx), %mm4
  155     movd    %mm6, 8(%edx)
  156     psrlq   $32, %mm6
  157 L(00):  pmuludq %mm7, %mm1      C same stage again; entry point for n mod 4 == 0
  158     paddq   %mm5, %mm6
  159     movd    28(%eax), %mm2
  160     paddq   %mm0, %mm4
  161     movd    20(%edx), %mm5
  162     movd    %mm6, 12(%edx)
  163     psrlq   $32, %mm6
  164     lea 16(%eax), %eax          C advance up by 4 limbs
  165     lea 16(%edx), %edx          C advance rp by 4 limbs
  166     add $4, %ecx                C ecx counts up from a negative multiple of 4
  167     jnz L(top)                  C at zero only the 3 limbs finished at L(end) remain
  168 C Wind-down: every up limb has been loaded already, so only the final rp limb is still read here.
  169 L(end): pmuludq %mm7, %mm2      C product for the last limb
  170     paddq   %mm4, %mm6          C carry += sum for the third-last limb
  171     paddq   %mm1, %mm5          C sum for the second-last limb
  172     movd    8(%edx), %mm4       C load the last rp limb
  173     movd    %mm6, 0(%edx)       C store the third-last result limb
  174     psrlq   $32, %mm6
  175     paddq   %mm5, %mm6          C carry += sum for the second-last limb
  176     paddq   %mm2, %mm4          C sum for the last limb
  177     movd    %mm6, 4(%edx)       C store the second-last result limb
  178     psrlq   $32, %mm6
  179     paddq   %mm4, %mm6          C carry += sum for the last limb
  180     movd    %mm6, 8(%edx)       C store the last result limb
  181     psrlq   $32, %mm6
  182     movd    %mm6, %eax          C return value = final carry limb
  183     emms                        C leave MMX state empty before returning
  184     ret
  185 EPILOGUE()
  186 PROLOGUE(mpn_addmul_1c)         C same operation as mpn_addmul_1, but starting from a caller-supplied carry-in limb
  187     movd    20(%esp), %mm6      C mm6 = carry-in, the fifth stack argument (sp + 20), zero-extended to 64 bits
  188     jmp L(ent)                  C continue inside mpn_addmul_1 just past the pxor that would zero mm6
  189 EPILOGUE()