dnl  (extracted from a gitweb view, beta-0.89.2)
dnl  [luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k7 / addlsh1_n.asm
dnl  blob a957b6f78e621557aec4a316bdbb283643793cbe
dnl  AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')
C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns.
C The innerloop is 2*3-way unrolled, which is best we can do with the available
C registers.  It seems tricky to use the same structure for rsblsh1_n, since we
C cannot feed carry between operations there.

C			    cycles/limb
C P5
C P6 model 0-8,10-12
C P6 model 9  (Banias)
C P6 model 13 (Dothan)		 5.4	(worse than add_n + lshift)
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C Intel Atom			 6
C AMD K6			 ?
C AMD K7			 2.5
C AMD K8

C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
C processors.  It uses 2*3-way unrolling, for good reasons.  Unfortunately,
C that means we need an initial magic multiply.
C
C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern.  We
C cannot do rsblsh1_n since we feed carry from the shift blocks to the
C add/subtract blocks, which is right for addition but reversed for
C subtraction.  We could perhaps do sublsh1_n, with some extra move insns,
C without losing any time, since we're not issue limited but carry recurrency
C latency.
C
C Breaking carry recurrency might be a good idea.  We would then need separate
C registers for the shift carry and add/subtract carry, which in turn would
C force us to 2*2-way unrolling.
dnl  Stack parameters (cdecl); offsets are relative to the entry stack
dnl  pointer, with FRAME tracking subsequent pushes.
defframe(PARAM_SIZE, 16)
defframe(PARAM_DBLD, 12)
defframe(PARAM_SRC,   8)
defframe(PARAM_DST,   4)

dnl  re-use parameter space once the values live in registers
define(VAR_COUNT,`PARAM_DST')	dnl negated count for the unrolled loop
define(VAR_TMP,`PARAM_DBLD')	dnl scratch slot (vp is saved here)
ASM_START()
	TEXT
	ALIGN(8)

C mp_limb_t mpn_addlsh1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
C
C rp[i] = up[i] + 2*vp[i]; the carry-out limb (0..2) is returned in %eax.
C The inner loop is 2*3-way unrolled; a magic multiply computes n/6 for the
C loop count, and a 1-limb lead-in loop handles the n mod 6 leftover limbs.
C Between iterations the shift carry and the add carry are kept saved in
C %edx (as two separate bits on CPU_P6 builds, one rcr-folded bit otherwise).

PROLOGUE(mpn_addlsh1_n)
deflit(`FRAME',0)

define(`rp', `%edi')
define(`up', `%esi')
define(`vp', `%ebp')

	mov	$0x2aaaaaab, %eax	C ceil(2^32/6): reciprocal for size/6

	push	%ebx			FRAME_pushl()
	mov	PARAM_SIZE, %ebx	C size

	push	rp			FRAME_pushl()
	mov	PARAM_DST, rp

	mul	%ebx			C %edx = size\6

	push	up			FRAME_pushl()
	mov	PARAM_SRC, up

	not	%edx			C count = -(size\6)-1
	mov	%edx, VAR_COUNT

	push	vp			FRAME_pushl()
	mov	PARAM_DBLD, vp

	lea	3(%edx,%edx,2), %ecx	C count*3+3 = -(size\6)*3
	xor	%edx, %edx		C clear saved carries and CF
	lea	(%ebx,%ecx,2), %ebx	C size + (count*3+3)*2 = size % 6
	or	%ebx, %ebx
	jz	L(exact)

C Lead-in: one limb per iteration for the size % 6 leftover limbs.
L(oop):
ifdef(`CPU_P6',`
	shr	%edx ')			C restore 2nd saved carry bit
	mov	(vp), %eax
	adc	%eax, %eax		C 2*vp[i] + shift carry-in
	rcr	%edx			C restore 1st saved carry bit
	lea	4(vp), vp
	adc	(up), %eax		C + up[i] + add carry-in
	lea	4(up), up
	adc	%edx, %edx		C save a carry bit in edx
ifdef(`CPU_P6',`
	adc	%edx, %edx ')		C save another carry bit in edx
	dec	%ebx
	mov	%eax, (rp)
	lea	4(rp), rp
	jnz	L(oop)
	mov	vp, VAR_TMP		C NOTE(review): not reloaded below
L(exact):
	incl	VAR_COUNT
	jz	L(end)

C Main loop: 6 limbs per iteration, as two 3-limb shift-then-add groups.
	ALIGN(16)
L(top):
ifdef(`CPU_P6',`
	shr	%edx ')			C restore 2nd saved carry bit
	mov	(vp), %eax		C shift group 1: 2*vp[0..2]
	adc	%eax, %eax
	mov	4(vp), %ebx
	adc	%ebx, %ebx
	mov	8(vp), %ecx
	adc	%ecx, %ecx

	rcr	%edx			C restore 1st saved carry bit

	adc	(up), %eax		C add group 1: + up[0..2]
	mov	%eax, (rp)
	adc	4(up), %ebx
	mov	%ebx, 4(rp)
	adc	8(up), %ecx
	mov	%ecx, 8(rp)

	mov	12(vp), %eax		C shift group 2: 2*vp[3..5]
	adc	%eax, %eax
	mov	16(vp), %ebx
	adc	%ebx, %ebx
	mov	20(vp), %ecx
	adc	%ecx, %ecx

	lea	24(vp), vp
	adc	%edx, %edx		C save a carry bit in edx

	adc	12(up), %eax		C add group 2: + up[3..5]
	mov	%eax, 12(rp)
	adc	16(up), %ebx
	mov	%ebx, 16(rp)
	adc	20(up), %ecx

	lea	24(up), up

ifdef(`CPU_P6',`
	adc	%edx, %edx ')		C save another carry bit in edx
	mov	%ecx, 20(rp)
	incl	VAR_COUNT
	lea	24(rp), rp
	jne	L(top)

L(end):
	pop	vp			FRAME_popl()
	pop	up			FRAME_popl()

C Fold the saved carry bit(s) and the final add carry into the return value.
ifdef(`CPU_P6',`
	xor	%eax, %eax
	shr	$1, %edx		C 2nd saved bit to CF, 1st stays in edx
	adc	%edx, %eax
',`
	adc	$0, %edx		C fold final CF into the saved carry
	mov	%edx, %eax
')
	pop	rp			FRAME_popl()
	pop	%ebx			FRAME_popl()
	ret
EPILOGUE()
ASM_END()