source/libs/gmp/gmp-src/mpn/x86_64/coreinhm/aorrlsh_n.asm

   1 dnl  AMD64 mpn_addlsh_n -- rp[] = up[] + (vp[] << k)
   2 dnl  AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[]
   3 dnl  Optimised for Nehalem.
   4
   5 dnl  Contributed to the GNU project by Torbjorn Granlund.
   6
   7 dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
   8
   9 dnl  This file is part of the GNU MP Library.
  10 dnl
  11 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  12 dnl  it under the terms of either:
  13 dnl
  14 dnl    * the GNU Lesser General Public License as published by the Free
  15 dnl      Software Foundation; either version 3 of the License, or (at your
  16 dnl      option) any later version.
  17 dnl
  18 dnl  or
  19 dnl
  20 dnl    * the GNU General Public License as published by the Free Software
  21 dnl      Foundation; either version 2 of the License, or (at your option) any
  22 dnl      later version.
  23 dnl
  24 dnl  or both in parallel, as here.
  25 dnl
  26 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  27 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  28 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  29 dnl  for more details.
  30 dnl
  31 dnl  You should have received copies of the GNU General Public License and the
  32 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  33 dnl  see https://www.gnu.org/licenses/.
  34
  35 include(`../config.m4')
  36
  37 C            cycles/limb
  38 C AMD K8,K9      ?
  39 C AMD K10        4.75
  40 C Intel P4       ?
  41 C Intel core2    2.8-3
  42 C Intel NHM      2.8
  43 C Intel SBR      3.55
  44 C Intel atom     ?
  45 C VIA nano       ?
  46
  47 C The inner-loop probably runs close to optimally on Nehalem (using 4-way
  48 C unrolling).  The rest of the code is quite crude, and could perhaps be made
  49 C both smaller and faster.
  50
  51 C INPUT PARAMETERS
  52 define(`rp',    `%rdi')                 C result limbs are stored through this pointer
  53 define(`up',    `%rsi')                 C operand combined unshifted via ADCSBB
  54 define(`vp',    `%rdx')                 C operand whose limbs are shifted left by cnt
  55 define(`n',     `%rcx')                 C limb count; rcx is reused as the shift count register
  56 define(`cnt',   `%r8')                  C shift count in bits
  57 define(`cy',    `%r9')                  C for _nc variant
  58
  59 ifdef(`OPERATION_addlsh_n', `
  60         define(ADDSUB,  add)            C folds the shifted-out bits into the return value
  61         define(ADCSBB,  adc)            C carry-propagating limb operation
  62         define(IFRSB,   )               C expands to nothing in the addlsh build
  63         define(func_n,  mpn_addlsh_n)
  64         define(func_nc, mpn_addlsh_nc)')
  65 ifdef(`OPERATION_rsblsh_n', `
  66         define(ADDSUB,  sub)            C return value is first formed as borrow minus shifted-out bits
  67         define(ADCSBB,  sbb)            C computes shifted vp limb minus up limb minus borrow
  68         define(IFRSB,   `$1')           C emits its argument only in the rsblsh build
  69         define(func_n,  mpn_rsblsh_n)
  70         define(func_nc, mpn_rsblsh_nc)')
  71
  72 C mpn_rsblsh_nc is deliberately left out of the list below; its idea of
  73 C carry-in is inconsistent with refmpn_rsblsh_nc.
  74 MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
  75
  76 ABI_SUPPORT(DOS64)
  77 ABI_SUPPORT(STD64)
  78
  79 ASM_START()
  80         TEXT
  81         ALIGN(32)
  82 PROLOGUE(func_n)
     C func_n: over n limbs, rp[] = up[] + (vp[] << cnt) in the addlsh build or
     C rp[] = (vp[] << cnt) - up[] in the rsblsh build; ADCSBB, ADDSUB and IFRSB
     C select the flavour.  The value returned in rax combines the bits shifted
     C out of the top limb of vp with the final carry (addlsh) or borrow (rsblsh).
     C
     C Register use from L(ent) onwards:
     C   rax     negative limb index, counts up towards zero
     C   rcx     negated shift count, so that shrd by cl acts as a left shift by
     C           cnt across two adjacent limbs
     C   rbx     carry parked between flag-clobbering instructions: zero or
     C           32-bit all-ones, reloaded into CF by add rbx,rbx and captured
     C           again by sbb rbx,rbx
     C   rbp     n mod 4 for the dispatch, afterwards a data limb
     C   r8-r11  window of vp limbs being shifted and combined with up limbs
     C
     C NOTE(review): with cnt = 0 the cl shifts would be no-ops and the result
     C wrong, so callers presumably guarantee 0 < cnt < 64 -- confirm against
     C the mpn interface.
  83         FUNC_ENTRY(4)
  84 IFDOS(` mov     56(%rsp), %r8d  ')      C cnt
  85         push    %rbx
  86         xor     R32(%rbx), R32(%rbx)    C clear CF save register
     C L(ent) is also the jump target of func_nc, which arrives here with rbx
     C already pushed and holding the carry-in mask.
  87 L(ent): push    %rbp
  88         mov     R32(n), R32(%rbp)
  89         mov     n, %rax
  90
     C rcx held n until here; from now on cl is the negated shift count.
  91         mov     R32(cnt), R32(%rcx)
  92         neg     R32(%rcx)
  93
     C Bias the pointers by n limbs so they are indexed with the negative rax.
     C With rax = 0, (up) and (vp) address the top source limbs and 32(rp) the
     C top result limb.
  94         lea     -8(up,%rax,8), up
  95         lea     -8(vp,%rax,8), vp
  96         lea     -40(rp,%rax,8), rp
  97         neg     %rax
  98
     C Dispatch on n mod 4.  After the zero case is taken, the cmp against 2
     C leaves CF set for 1 and ZF set for 2; the remaining value 3 falls through.
  99         and     $3, R32(%rbp)
 100         jz      L(b0)
 101         cmp     $2, R32(%rbp)
 102         jc      L(b1)
 103         jz      L(b2)
 104
     C n mod 4 = 3: combine the two lowest limbs, leave them pending in r9 and
     C r10, and join the loop at its final store.
 105 L(b3):  xor     R32(%r9), R32(%r9)
 106         mov     8(vp,%rax,8), %r10
 107         mov     16(vp,%rax,8), %r11
 108         shrd    %cl, %r10, %r9
 109         shrd    %cl, %r11, %r10
 110         add     R32(%rbx), R32(%rbx)
 111         ADCSBB  8(up,%rax,8), %r9
 112         mov     24(vp,%rax,8), %r8
 113         ADCSBB  16(up,%rax,8), %r10
 114         sbb     R32(%rbx), R32(%rbx)
 115         add     $3, %rax
 116         jmp     L(lo3)
 117
     C n mod 4 = 0: combine the three lowest limbs, store the first, leave the
     C other two pending in r9 and r10, and join the loop at its final store.
 118 L(b0):  mov     8(vp,%rax,8), %r9
 119         xor     R32(%r8), R32(%r8)
 120         shrd    %cl, %r9, %r8
 121         mov     16(vp,%rax,8), %r10
 122         mov     24(vp,%rax,8), %r11
 123         shrd    %cl, %r10, %r9
 124         shrd    %cl, %r11, %r10
 125         add     R32(%rbx), R32(%rbx)
 126         ADCSBB  8(up,%rax,8), %r8
 127         mov     %r8, 40(rp,%rax,8)      C offset 40
 128         ADCSBB  16(up,%rax,8), %r9
 129         mov     32(vp,%rax,8), %r8
 130         ADCSBB  24(up,%rax,8), %r10
 131         sbb     R32(%rbx), R32(%rbx)
 132         add     $4, %rax
 133         jmp     L(lo0)
 134
     C n mod 4 = 1: no arithmetic yet.  For n > 1 enter the loop body at L(lo1)
     C with rbp = 0 supplying the zero bits below the lowest limb; for n = 1 go
     C straight to the tail with r11 = 0 playing the same role.  rbx is left
     C untouched, so the incoming carry is consumed by the code jumped to.
 135 L(b1):  mov     8(vp,%rax,8), %r8
 136         add     $1, %rax
 137         jz      L(1)
 138         mov     8(vp,%rax,8), %r9
 139         xor     R32(%rbp), R32(%rbp)
 140         jmp     L(lo1)
 141 L(1):   xor     R32(%r11), R32(%r11)
 142         jmp     L(wd1)
 143
     C n mod 4 = 2: combine the lowest limb and leave it pending in r10.  For
     C n = 2 the add leaves rax zero and the loop is skipped; otherwise control
     C falls into L(top).
 144 L(b2):  xor     %r10, %r10
 145         mov     8(vp,%rax,8), %r11
 146         shrd    %cl, %r11, %r10
 147         add     R32(%rbx), R32(%rbx)
 148         mov     16(vp,%rax,8), %r8
 149         ADCSBB  8(up,%rax,8), %r10
 150         sbb     R32(%rbx), R32(%rbx)
 151         add     $2, %rax
 152         jz      L(end)
 153
     C Main loop, four limbs per iteration.  Each pass first stores the limb
     C left pending in r10, then shifts and combines four more limbs, restoring
     C CF from rbx before the ADCSBB chain and parking it again afterwards.
     C L(lo2) has no jump referencing it in this file.
 154         ALIGN(16)
 155 L(top): mov     8(vp,%rax,8), %r9
 156         mov     %r11, %rbp
 157 L(lo2): mov     %r10, 24(rp,%rax,8)     C offset 24
 158 L(lo1): shrd    %cl, %r8, %rbp
 159         shrd    %cl, %r9, %r8
 160         mov     16(vp,%rax,8), %r10
 161         mov     24(vp,%rax,8), %r11
 162         shrd    %cl, %r10, %r9
 163         shrd    %cl, %r11, %r10
 164         add     R32(%rbx), R32(%rbx)
 165         ADCSBB  (up,%rax,8), %rbp
 166         ADCSBB  8(up,%rax,8), %r8
 167         mov     %r8, 40(rp,%rax,8)      C offset 40
 168         ADCSBB  16(up,%rax,8), %r9
 169         mov     32(vp,%rax,8), %r8
 170         ADCSBB  24(up,%rax,8), %r10
 171         sbb     R32(%rbx), R32(%rbx)
 172         add     $4, %rax
 173         mov     %rbp, (rp,%rax,8)       C offset 32
     C The mov instructions leave the flags alone, so the jnz below still tests
     C the most recent add to rax: the add of 4 above, or the add in the b0 or
     C b3 feed-in when entered by a jump.
 174 L(lo0):
 175 L(lo3): mov     %r9, 16(rp,%rax,8)      C offset 48
 176         jnz     L(top)
 177
     C Tail, rax = 0: store the pending limb, then form and store the top limb
     C from the top vp limb in r8 and the limb below it in r11.
 178 L(end): mov     %r10, 24(rp,%rax,8)
 179 L(wd1): shrd    %cl, %r8, %r11
 180         add     R32(%rbx), R32(%rbx)
 181         ADCSBB  (up,%rax,8), %r11
 182         mov     %r11, 32(rp,%rax,8)     C offset 32
 183         adc     R32(%rax), R32(%rax)    C rax is zero after loop
     C rax now holds the final carry or borrow as 0 or 1.  The shr leaves in r8
     C the bits of the top vp limb that were shifted out (assuming R8 names the
     C byte register cl).  addlsh returns carry plus those bits; rsblsh forms
     C borrow minus those bits and negates it.
 184         shr     R8(%rcx), %r8
 185         ADDSUB  %r8, %rax
 186 IFRSB(  neg     %rax)
 187         pop     %rbp
 188         pop     %rbx
 189         FUNC_EXIT()
 190         ret
 191 EPILOGUE()
 192 PROLOGUE(func_nc)
     C func_nc: carry-in entry point that shares the whole body of func_n.
     C Only whether cy is zero matters here: neg sets CF exactly when cy is
     C nonzero, and sbb turns that into the zero or all-ones carry mask kept in
     C rbx.  With ADCSBB defined as sbb the same mask enters as a borrow.
     C Control then joins func_n at L(ent), which expects rbx already pushed and
     C initialised, and returns through the epilogue there.
     C NOTE(review): the rsblsh build still assembles this entry as
     C mpn_rsblsh_nc although MULFUNC_PROLOGUE does not list it -- confirm that
     C nothing references it.
 193         FUNC_ENTRY(4)
 194 IFDOS(` mov     56(%rsp), %r8d  ')      C cnt
 195 IFDOS(` mov     64(%rsp), %r9   ')      C cy
 196         push    %rbx
 197         neg     cy
 198         sbb     R32(%rbx), R32(%rbx)    C initialise CF save register
 199         jmp     L(ent)
 200 EPILOGUE()