source/libs/gmp/gmp-src/mpn/x86_64/coreisbr/aorrlsh_n.asm

   1 dnl  AMD64 mpn_addlsh_n -- rp[] = up[] + (vp[] << k)
   2 dnl  AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[]
   3 dnl  Optimised for Sandy Bridge.
   4
   5 dnl  Contributed to the GNU project by Torbjorn Granlund.
   6
   7 dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
   8
   9 dnl  This file is part of the GNU MP Library.
  10 dnl
  11 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  12 dnl  it under the terms of either:
  13 dnl
  14 dnl    * the GNU Lesser General Public License as published by the Free
  15 dnl      Software Foundation; either version 3 of the License, or (at your
  16 dnl      option) any later version.
  17 dnl
  18 dnl  or
  19 dnl
  20 dnl    * the GNU General Public License as published by the Free Software
  21 dnl      Foundation; either version 2 of the License, or (at your option) any
  22 dnl      later version.
  23 dnl
  24 dnl  or both in parallel, as here.
  25 dnl
  26 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  27 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  28 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  29 dnl  for more details.
  30 dnl
  31 dnl  You should have received copies of the GNU General Public License and the
  32 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  33 dnl  see https://www.gnu.org/licenses/.
  34
  35 include(`../config.m4')
  36
  37 C            cycles/limb
  38 C AMD K8,K9      ?
  39 C AMD K10        5.25
  40 C Intel P4       ?
  41 C Intel core2    3.1
  42 C Intel NHM      3.95
  43 C Intel SBR      2.75
  44 C Intel atom     ?
  45 C VIA nano       ?
  46
  47 C The inner-loop probably runs close to optimally on Sandy Bridge (using 4-way
  48 C unrolling).  The rest of the code is quite crude, and could perhaps be made
  49 C both smaller and faster.
  50
   51 C INPUT PARAMETERS
      C Symbolic names for the argument registers used by both entry points.
   52 define(`rp',    `%rdi')                 C result limbs are stored through this
   53 define(`up',    `%rsi')                 C limbs added to or subtracted from the shifted vp
   54 define(`vp',    `%rdx')                 C limbs that get shifted left by cnt
   55 define(`n',     `%rcx')                 C limb count; %rcx is later reused for the shift count
   56 define(`cnt',   `%r8')                  C shift amount in bits
   57 define(`cy',    `%r9')                  C carry-in, for _nc variant
   58
      C Select the flavour to build.  Each branch defines:
      C   ADDSUB   plain add or sub, used once to fold the shifted-out bits into
      C            the return value
      C   ADCSBB   adc or sbb, used for the limb-by-limb carry chain
      C   IFRSB    expands to its argument for rsblsh and to nothing for addlsh
      C   func_n, func_nc   names of the two entry points
      C Presumably the build defines exactly one of the two OPERATION symbols.
   59 ifdef(`OPERATION_addlsh_n', `
   60         define(ADDSUB,  add)
   61         define(ADCSBB,  adc)
   62         define(IFRSB,   )
   63         define(func_n,  mpn_addlsh_n)
   64         define(func_nc, mpn_addlsh_nc)')
   65 ifdef(`OPERATION_rsblsh_n', `
   66         define(ADDSUB,  sub)
   67         define(ADCSBB,  sbb)
   68         define(IFRSB,   `$1')
   69         define(func_n,  mpn_rsblsh_n)
   70         define(func_nc, mpn_rsblsh_nc)')
  71
   72 ABI_SUPPORT(DOS64)
   73 ABI_SUPPORT(STD64)
   74
   75 C mpn_rsblsh_nc is left out of the list below because its idea of carry-in is
   76 C inconsistent with refmpn_rsblsh_nc.
      C NOTE(review): presumably this list tells the build which entry points the
      C file provides; the macro is defined outside this file.
   77 MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
  78
   79 ASM_START()
   80         TEXT
   81         ALIGN(32)
      C func_n computes n limbs at rp from up and vp shifted left by cnt bits:
      C   built as mpn_addlsh_n:  rp[] = up[] + (vp[] << cnt)
      C   built as mpn_rsblsh_n:  rp[] = (vp[] << cnt) - up[]
      C The value left in rax is the bits shifted out of the top vp limb plus the
      C final carry for addlsh, or those bits minus the final borrow for rsblsh.
      C
      C Register roles after L(ent):
      C   rax   limbs still to process; zero when the tail is reached
      C   rbx   carry or borrow saved between groups as 0 or -1
      C   cl    -cnt, so shrd and shr shift right by 64-cnt
      C   rbp, r8, r9, r10   shifted limbs for offsets 0, 8, 16, 24 of a group
      C   r11   last unshifted vp limb; its top bits feed the next group
      C
      C Limbs are handled in groups of four.  When n mod 4 is nonzero a short
      C first group is built in L(b1), L(b2) or L(b3), which then enters the loop
      C part-way through at L(lo1), L(lo2) or L(lo3).
      C NOTE(review): the code presumably requires n >= 1 and 0 < cnt < 64;
      C neither is checked here, so confirm against the callers.
   82 PROLOGUE(func_n)
   83         FUNC_ENTRY(4)                   C ABI entry glue, defined outside this file
   84 IFDOS(` mov     56(%rsp), %r8d  ')      C cnt
   85         push    %rbx
   86         xor     R32(%rbx), R32(%rbx)    C clear CF save register
   87 L(ent): push    %rbp                    C shared entry: func_nc jumps here with rbx set
   88         mov     R32(n), R32(%rbp)       C rbp = n, reduced to n mod 4 below
   89         mov     n, %rax                 C rax = count of limbs still to do
   90         mov     R32(cnt), R32(%rcx)     C overwrites n, which lives in the same register
   91         neg     R32(%rcx)               C cl = -cnt, i.e. 64-cnt modulo 64
   92         and     $3, R32(%rbp)           C rbp = n mod 4
   93         jz      L(b0)                   C n mod 4 == 0: no short first group
   94         lea     -32(vp,%rbp,8), vp      C bias the pointers by 8*(n mod 4) - 32 so
   95         lea     -32(up,%rbp,8), up      C that the short first group ends at offset
   96         lea     -32(rp,%rbp,8), rp      C 24, like the last slot of a full group
   97         cmp     $2, R32(%rbp)           C dispatch on n mod 4, which is 1, 2 or 3
   98         jc      L(b1)                   C below 2, so n mod 4 == 1
   99         jz      L(b2)                   C equal, so n mod 4 == 2
  100
      C n mod 4 == 3: the first group is three limbs at offsets 8, 16 and 24.
  101 L(b3):  xor     %r8, %r8                C zero bits enter below the lowest limb
  102         mov     8(vp), %r9              C first vp limb
  103         mov     16(vp), %r10
  104         shrd    R8(%rcx), %r9, %r8      C r8 = (r9 << cnt) | (r8 >> (64-cnt))
  105         shrd    R8(%rcx), %r10, %r9
  106         mov     24(vp), %r11            C r11 keeps the unshifted top limb
  107         shrd    R8(%rcx), %r11, %r10
  108         sub     $3, %rax                C three limbs accounted for
  109         jz      L(3)                    C n == 3: nothing left for the loop
  110         add     R32(%rbx), R32(%rbx)    C CF = saved carry, rbx being 0 or -1
  111         lea     32(vp), vp              C lea leaves CF intact
  112         ADCSBB  8(up), %r8              C combine with the up limbs, CF chained
  113         ADCSBB  16(up), %r9
  114         ADCSBB  24(up), %r10
  115         lea     32(up), up
  116         jmp     L(lo3)                  C store the three limbs and continue in the loop
  117 L(3):   add     R32(%rbx), R32(%rbx)    C CF = saved carry
  118         lea     32(vp), vp              C NOTE(review): vp is not read again on this path; looks redundant
  119         ADCSBB  8(up), %r8
  120         ADCSBB  16(up), %r9
  121         ADCSBB  24(up), %r10
  122         jmp     L(wd3)                  C rax is zero here, as the tail expects
  123
      C n mod 4 == 0: pointers stay unbiased; enter the loop at the shifting step.
  124 L(b0):  mov     (vp), %r8
  125         mov     8(vp), %r9
  126         xor     R32(%rbp), R32(%rbp)    C rbp = 0: zero bits enter below the lowest limb
  127         jmp     L(lo0)
  128
      C n mod 4 == 1: the first group is the single limb at offset 24.
  129 L(b1):  xor     %r10, %r10              C zero bits enter below the lowest limb
  130         mov     24(vp), %r11            C the only vp limb of this group
  131         shrd    R8(%rcx), %r11, %r10
  132         sub     $1, %rax                C one limb accounted for
  133         jz      L(1)                    C n == 1: nothing left for the loop
  134         add     R32(%rbx), R32(%rbx)    C CF = saved carry
  135         lea     32(vp), vp
  136         ADCSBB  24(up), %r10
  137         lea     32(up), up
  138         mov     (vp), %r8               C first limb of the next group; L(lo1) is past that load
  139         jmp     L(lo1)
  140 L(1):   add     R32(%rbx), R32(%rbx)    C CF = saved carry
  141         ADCSBB  24(up), %r10
  142         jmp     L(wd1)                  C rax is zero here, as the tail expects
  143
      C n mod 4 == 2: the first group is two limbs at offsets 16 and 24.
  144 L(b2):  xor     %r9, %r9                C zero bits enter below the lowest limb
  145         mov     16(vp), %r10            C first vp limb
  146         shrd    R8(%rcx), %r10, %r9
  147         mov     24(vp), %r11            C r11 keeps the unshifted top limb
  148         shrd    R8(%rcx), %r11, %r10
  149         sub     $2, %rax                C two limbs accounted for
  150         jz      L(2)                    C n == 2: nothing left for the loop
  151         add     R32(%rbx), R32(%rbx)    C CF = saved carry
  152         lea     32(vp), vp
  153         ADCSBB  16(up), %r9
  154         ADCSBB  24(up), %r10
  155         lea     32(up), up
  156         jmp     L(lo2)
  157 L(2):   add     R32(%rbx), R32(%rbx)    C CF = saved carry
  158         ADCSBB  16(up), %r9
  159         ADCSBB  24(up), %r10
  160         jmp     L(wd2)                  C rax is zero here, as the tail expects
  161
      C Main loop, four limbs per pass.  The first half finishes and stores the
      C group whose shifts were started on the previous pass; the second half,
      C from L(lo0) on, loads and shifts the next group.
  162         ALIGN(32)                       C 16-byte alignment is not enough!
  163 L(top): shrd    R8(%rcx), %r11, %r10    C finish the fourth shifted limb
  164         add     R32(%rbx), R32(%rbx)    C CF = carry saved by the sbb below
  165         lea     32(vp), vp
  166         ADCSBB  (up), %rbp
  167         ADCSBB  8(up), %r8
  168         ADCSBB  16(up), %r9
  169         ADCSBB  24(up), %r10
  170         mov     %rbp, (rp)
  171         lea     32(up), up
  172 L(lo3): mov     %r8, 8(rp)              C entry after a three-limb first group
  173 L(lo2): mov     %r9, 16(rp)             C entry after a two-limb first group
  174         mov     (vp), %r8               C start loading the next group
  175 L(lo1): mov     %r10, 24(rp)            C entry after a one-limb first group
  176         mov     8(vp), %r9
  177         mov     %r11, %rbp              C previous top limb supplies the low bits of the next limb
  178         lea     32(rp), rp
  179         sbb     R32(%rbx), R32(%rbx)    C rbx = -CF; the mov and lea above left CF intact
  180 L(lo0): shrd    R8(%rcx), %r8, %rbp     C entry when n mod 4 == 0
  181         mov     16(vp), %r10
  182         shrd    R8(%rcx), %r9, %r8
  183         shrd    R8(%rcx), %r10, %r9
  184         mov     24(vp), %r11            C r11 keeps the unshifted top limb
  185         sub     $4, %rax                C one full group accounted for
  186         jg      L(top)                  C loop while limbs remain
  187
      C Final group: the same work as the first half of the loop, without
      C advancing the pointers.  L(wd3), L(wd2) and L(wd1) are the entries used
      C when n is 3, 2 or 1.
  188         shrd    R8(%rcx), %r11, %r10
  189         add     R32(%rbx), R32(%rbx)    C CF = saved carry
  190         ADCSBB  (up), %rbp
  191         ADCSBB  8(up), %r8
  192         ADCSBB  16(up), %r9
  193         ADCSBB  24(up), %r10
  194         mov     %rbp, (rp)
  195 L(wd3): mov     %r8, 8(rp)
  196 L(wd2): mov     %r9, 16(rp)
  197 L(wd1): mov     %r10, 24(rp)
  198         adc     R32(%rax), R32(%rax)    C rax is zero after loop
  199         shr     R8(%rcx), %r11          C r11 = bits shifted out of the top vp limb
  200         ADDSUB  %r11, %rax              C fold them into the final carry or borrow
  201 IFRSB(  neg     %rax)                   C rsblsh only: rax = shifted-out bits - borrow
  202         pop     %rbp
  203         pop     %rbx
  204         FUNC_EXIT()
  205         ret
  206 EPILOGUE()
  207 PROLOGUE(func_nc)
      C Carry-in variant of func_n.  It turns cy into the 0 or -1 mask kept in rbx
      C and then jumps into the func_n body at L(ent), so everything from the
      C push of rbp through the final ret is shared with func_n.
      C Any nonzero cy is treated as a carry of one.
  208         FUNC_ENTRY(4)                   C ABI entry glue, defined outside this file
  209 IFDOS(` mov     56(%rsp), %r8d  ')      C cnt
  210 IFDOS(` mov     64(%rsp), %r9   ')      C cy
  211         push    %rbx                    C popped again by the shared exit path in func_n
  212         neg     cy                      C CF = 1 iff cy is nonzero
  213         sbb     R32(%rbx), R32(%rbx)    C initialise CF save register
  214         jmp     L(ent)                  C continue in the func_n body
  215 EPILOGUE()