dnl  Provenance: luatex.git / source/libs/gmp/gmp-src/mpn/x86_64/pentium4/aorslshC_n.asm
dnl  (gitweb blob d03c6a3f305e72a4d741f164321f2ba4339c2398)
dnl  AMD64 mpn_addlshC_n, mpn_sublshC_n -- rp[] = up[] +- (vp[] << C), where
dnl  C is 1, 2, 3.  Optimized for Pentium 4.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
C	     cycles/limb
C AMD K8,K9	 3.8
C AMD K10	 3.8
C Intel P4	 5.8
C Intel core2	 4.75
C Intel corei	 4.75
C Intel atom	 ?
C VIA nano	 4.75

C INPUT PARAMETERS (SysV AMD64; DOS64 entry remapped by FUNC_ENTRY)
define(`rp',	`%rdi')		C result limb vector
define(`up',	`%rsi')		C addend/minuend limb vector
define(`vp',	`%rdx')		C limb vector to be shifted left by C
define(`n',	`%rcx')		C limb count, n >= 1

C M = 2^LSH; used as the index scale in lea to fold the left shift of the
C high part of each vp limb into one instruction.  LSH is defined (1, 2,
C or 3) by the file that includes this template; RSH = 64 - LSH... wait,
C actually RSH = 32 - LSH since the shifted-out bits are kept in a 32-bit
C register -- TODO(review): confirm against the including addlsh*_n.asm.
define(M, eval(m4_lshift(1,LSH)))

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
C mpn_addlshC_n / mpn_sublshC_n (name chosen via `func' by the including
C file): rp[i] = up[i] ADDSUB (vp[i] << LSH) for i in [0, n); returns the
C final carry/borrow in %rax.
C
C Register roles inside the loop:
C   %r9              current (vp limb << LSH) combined via lea
C   %rbp (32-bit)    high 32 bits of the current vp limb; shifted right by
C                    RSH it yields the bits carried into the next limb
C   %r8/%r10/%r11/%r12  four-way software pipeline of up/result limbs
C   %rax/%rbx        alternating saved carry-out flags (0 or 1)
C
C The loop is 4-way unrolled; the entry code dispatches on n mod 4 and
C biases rp/up/vp so all paths join the common loop body.  Carries are
C rippled explicitly (setc + ADDSUB + jc fixup stubs) rather than with
C adc, which is what makes this fast on Pentium 4.
C
C NOTE(review): the gitweb scrape this was recovered from had the page's
C line numbers fused onto every instruction (breaking assembly) and had
C dropped the `ret' after FUNC_EXIT(); both are repaired below.  All
C instructions are otherwise unchanged.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
	push	%rbx
	push	%r12
	push	%rbp

	mov	(vp), %r9
	shl	$LSH, %r9		C low part of first shifted limb
	mov	4(vp), R32(%rbp)	C high 32 bits of first vp limb

	xor	R32(%rbx), R32(%rbx)	C clear incoming carry

	mov	R32(n), R32(%rax)
	and	$3, R32(%rax)		C dispatch on n mod 4
	jne	L(n00)			C n = 0, 4, 8, ...

	mov	(up), %r8
	mov	8(up), %r10
	shr	$RSH, R32(%rbp)		C bits shifted out of previous limb
	ADDSUB	%r9, %r8
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9	C next shifted limb = carry-in bits + (limb << LSH)
	setc	R8(%rax)
	mov	12(vp), R32(%rbp)
	lea	-16(rp), rp		C bias rp for the L(L00) entry offsets
	jmp	L(L00)

L(n00):	cmp	$2, R32(%rax)
	jnc	L(n01)			C n = 1, 5, 9, ...
	mov	(up), %r11
	lea	-8(rp), rp
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r11
	setc	R8(%rbx)
	dec	n
	jz	L(1)			C jump for n = 1
	mov	8(up), %r8
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	mov	12(vp), R32(%rbp)
	lea	8(up), up
	lea	8(vp), vp
	jmp	L(L01)

L(n01):	jne	L(n10)			C n = 2, 6, 10, ...
	mov	(up), %r12
	mov	8(up), %r11
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r12
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rax)
	mov	12(vp), R32(%rbp)
	lea	16(up), up
	lea	16(vp), vp
	jmp	L(L10)

L(n10):	mov	(up), %r10		C n = 3, 7, 11, ...
	mov	8(up), %r12
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r10
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rbx)
	mov	12(vp), R32(%rbp)
	lea	-24(rp), rp
	lea	-8(up), up
	lea	-8(vp), vp
	jmp	L(L11)

C Out-of-line carry-ripple fixups: applying the saved carry overflowed,
C so force the next saved carry to 1 and resume.
L(c0):	mov	$1, R8(%rbx)
	jmp	L(rc0)
L(c1):	mov	$1, R8(%rax)
	jmp	L(rc1)
L(c2):	mov	$1, R8(%rbx)
	jmp	L(rc2)

	ALIGN(16)
L(top):	mov	(up), %r8	C not on critical path
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r11	C not on critical path
	mov	(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rbx)	C save carry out
	mov	4(vp), R32(%rbp)
	mov	%r12, (rp)
	ADDSUB	%rax, %r11	C apply previous carry out
	jc	L(c0)		C jump if ripple
L(rc0):
L(L01):	mov	8(up), %r10
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r8
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rax)
	mov	12(vp), R32(%rbp)
	mov	%r11, 8(rp)
	ADDSUB	%rbx, %r8
	jc	L(c1)
L(rc1):
L(L00):	mov	16(up), %r12
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r10
	mov	16(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rbx)
	mov	20(vp), R32(%rbp)
	mov	%r8, 16(rp)
	ADDSUB	%rax, %r10
	jc	L(c2)
L(rc2):
L(L11):	mov	24(up), %r11
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r12
	mov	24(vp), %r9
	lea	(%rbp,%r9,M), %r9
	lea	32(up), up
	lea	32(vp), vp
	setc	R8(%rax)
	mov	-4(vp), R32(%rbp)	C 28(vp) before the vp bump above
	mov	%r10, 24(rp)
	ADDSUB	%rbx, %r12
	jc	L(c3)
L(rc3):	lea	32(rp), rp
L(L10):	sub	$4, n
	ja	L(top)

L(end):					C wind down: one limb still in flight
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r11
	setc	R8(%rbx)
	mov	%r12, (rp)
	ADDSUB	%rax, %r11
	jnc	L(1)
	mov	$1, R8(%rbx)
L(1):	mov	%r11, 8(rp)
	lea	(%rbx,%rbp), R32(%rax)	C return carry + top shifted-out bits
	pop	%rbp
	pop	%r12
	pop	%rbx
	FUNC_EXIT()
	ret				C restored: scrape dropped this line;
					C without it we fall through into L(c3)
L(c3):	mov	$1, R8(%rax)
	jmp	L(rc3)
EPILOGUE()
ASM_END()