source/libs/gmp/gmp-src/mpn/x86_64/cnd_aors_n.asm

   1 dnl  AMD64 mpn_cnd_add_n, mpn_cnd_sub_n
   2
   3 dnl  Copyright 2011-2013 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33 C            cycles/limb
  34 C AMD K8,K9      2
  35 C AMD K10        2
  36 C AMD bd1        2.32
  37 C AMD bobcat     3
  38 C Intel P4      13
  39 C Intel core2    2.9
  40 C Intel NHM      2.8
  41 C Intel SBR      2.4
  42 C Intel atom     5.33
  43 C VIA nano       3
  44
  45 C NOTES
  46 C  * It might seem natural to use the cmov insn here, but since this function
  47 C    is supposed to have the exact same execution pattern for cnd true and
  48 C    false, and since cmov's documentation is not clear about whether it
  49 C    actually reads both source operands and writes the register for a false
  50 C    condition, we cannot use it.
  51 C  * Two cases could be optimised: (1) cnd_add_n could use ADCSBB-from-memory
  52 C    to save one insn/limb, and (2) when up=rp cnd_add_n and cnd_sub_n could use
  53 C    ADCSBB-to-memory, again saving 1 insn/limb.
  54 C  * This runs optimally at decoder bandwidth on K10.  It has not been tuned
  55 C    for any other processor.
  56
  57 C INPUT PARAMETERS
     C Symbolic names for the registers that hold the five arguments in the
     C code below.  The trailing dnl notes appear to give the location of the
     C same argument under the DOS64 calling convention, where the fifth
     C argument n arrives on the stack.
     C NOTE(review): the function body loads n from 56(%rsp) rather than from
     C rsp+40, presumably because FUNC_ENTRY pushes two registers first --
     C confirm against the FUNC_ENTRY definition.
  58 define(`cnd',   `%rdi') dnl rcx
  59 define(`rp',    `%rsi') dnl rdx
  60 define(`up',    `%rdx') dnl r8
  61 define(`vp',    `%rcx') dnl r9
  62 define(`n',     `%r8')  dnl rsp+40
  63
     C Select the instruction pair and the exported function name according to
     C which OPERATION_ symbol is defined (presumably set by the build system).
     C ADDSUB is used for the first limb of a carry chain, where there is no
     C carry in; ADCSBB is used for every later limb and consumes the carry.
  64 ifdef(`OPERATION_cnd_add_n', `
  65         define(ADDSUB,        add)
  66         define(ADCSBB,        adc)
  67         define(func,          mpn_cnd_add_n)')
  68 ifdef(`OPERATION_cnd_sub_n', `
  69         define(ADDSUB,        sub)
  70         define(ADCSBB,        sbb)
  71         define(func,          mpn_cnd_sub_n)')
  72
     C The multi-function prologue and ABI_SUPPORT macros are defined outside
     C this file; presumably they declare the entry points this source can
     C provide and the calling conventions it handles.
  73 MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
  74
  75 ABI_SUPPORT(DOS64)
  76 ABI_SUPPORT(STD64)
  77
  78 ASM_START()
  79         TEXT
  80         ALIGN(16)                       C presumably 16-byte alignment for the entry point that follows
  81 PROLOGUE(func)
     C func(cnd, rp, up, vp, n): conditional multi-limb add or subtract.
     C
     C For each of the n limbs, stores up[i] ADDSUB (vp[i] AND mask) into
     C rp[i], propagating the carry or borrow from limb to limb.  mask is all
     C ones when cnd is nonzero and zero otherwise, so with cnd = 0 the limbs
     C of up are stored to rp unchanged and the carry out is 0.  The same
     C instruction sequence is executed for either value of cnd; see NOTES
     C near the top of the file for why masking is used instead of cmov.
     C
     C Returns the carry or borrow out of the last limb, 0 or 1, in rax.
     C
     C Register use: rax carries the saved carry between blocks; r9-r14, rbx
     C and rbp hold limbs.  rbx, rbp, r12, r13 and r14 are pushed on entry and
     C popped before returning.
     C
     C NOTE(review): n is presumably required to be at least 1.  With n = 0
     C the residue test below branches to the main loop, which would then
     C process four limbs.
     C
     C FUNC_ENTRY and IFDOS are macros defined outside this file; presumably
     C FUNC_ENTRY(4) moves the four DOS64 register arguments into the
     C registers named above.  The IFDOS line loads n from the stack with a
     C 32-bit destination, assuming R32() names the 32-bit form of a register.
  82         FUNC_ENTRY(4)
  83 IFDOS(` mov     56(%rsp), R32(%r8)')
     C Save the callee-saved registers that serve as limb temporaries below.
  84         push    %rbx
  85         push    %rbp
  86         push    %r12
  87         push    %r13
  88         push    %r14
  89
     C neg sets CF exactly when cnd is nonzero; sbb of a register with itself
     C then yields 0 - CF, i.e. all zeros or all ones.
  90         neg     cnd
  91         sbb     cnd, cnd                C make cnd mask
  92
     C Point vp, up and rp just past their last limb.  With n negated below,
     C (ptr,n,8) addresses the limbs in ascending order as n counts up to 0.
  93         lea     (vp,n,8), vp
  94         lea     (up,n,8), up
  95         lea     (rp,n,8), rp
  96
     C Dispatch on n mod 4 (taken from n before it is negated): the residue
     C limbs are handled by one of b1/b2/b3, the rest by the 4-limb loop.
  97         mov     R32(n), R32(%rax)
  98         neg     n
  99         and     $3, R32(%rax)
 100         jz      L(top)                  C carry-save reg rax = 0 in this arc
 101         cmp     $2, R32(%rax)
 102         jc      L(b1)                   C residue 1
 103         jz      L(b2)                   C residue 2; residue 3 falls through
 104
     C Residue 3: first three limbs.  ADDSUB starts the chain with no carry in.
 105 L(b3):  mov     (vp,n,8), %r12
 106         mov     8(vp,n,8), %r13
 107         mov     16(vp,n,8), %r14
 108         and     cnd, %r12
 109         mov     (up,n,8), %r10
 110         and     cnd, %r13
 111         mov     8(up,n,8), %rbx
 112         and     cnd, %r14
 113         mov     16(up,n,8), %rbp
 114         ADDSUB  %r12, %r10
 115         mov     %r10, (rp,n,8)
 116         ADCSBB  %r13, %rbx
 117         mov     %rbx, 8(rp,n,8)
 118         ADCSBB  %r14, %rbp
 119         mov     %rbp, 16(rp,n,8)
 120         sbb     R32(%rax), R32(%rax)    C save carry
 121         add     $3, n
 122         js      L(top)                  C n still negative: limbs remain
 123         jmp     L(end)
 124
     C Residue 2: first two limbs.
 125 L(b2):  mov     (vp,n,8), %r12
 126         mov     8(vp,n,8), %r13
 127         mov     (up,n,8), %r10
 128         and     cnd, %r12
 129         mov     8(up,n,8), %rbx
 130         and     cnd, %r13
 131         ADDSUB  %r12, %r10
 132         mov     %r10, (rp,n,8)
 133         ADCSBB  %r13, %rbx
 134         mov     %rbx, 8(rp,n,8)
 135         sbb     R32(%rax), R32(%rax)    C save carry
 136         add     $2, n
 137         js      L(top)                  C n still negative: limbs remain
 138         jmp     L(end)
 139
     C Residue 1: first limb only.
 140 L(b1):  mov     (vp,n,8), %r12
 141         mov     (up,n,8), %r10
 142         and     cnd, %r12
 143         ADDSUB  %r12, %r10
 144         mov     %r10, (rp,n,8)
 145         sbb     R32(%rax), R32(%rax)    C save carry
 146         add     $1, n
 147         jns     L(end)                  C n reached 0: done; else fall into the loop
 148
     C Main loop, four limbs per iteration.  The counter update add $4, n
     C overwrites CF, so the carry is kept in rax as 0 or -1 (written by sbb)
     C and regenerated at the start of each iteration by adding rax to itself.
 149         ALIGN(16)
 150 L(top): mov     (vp,n,8), %r12
 151         mov     8(vp,n,8), %r13
 152         mov     16(vp,n,8), %r14
 153         mov     24(vp,n,8), %r11
 154         and     cnd, %r12
 155         mov     (up,n,8), %r10
 156         and     cnd, %r13
 157         mov     8(up,n,8), %rbx
 158         and     cnd, %r14
 159         mov     16(up,n,8), %rbp
 160         and     cnd, %r11
 161         mov     24(up,n,8), %r9
 162         add     R32(%rax), R32(%rax)    C restore carry
 163         ADCSBB  %r12, %r10
 164         mov     %r10, (rp,n,8)
 165         ADCSBB  %r13, %rbx
 166         mov     %rbx, 8(rp,n,8)
 167         ADCSBB  %r14, %rbp
 168         mov     %rbp, 16(rp,n,8)
 169         ADCSBB  %r11, %r9
 170         mov     %r9, 24(rp,n,8)
 171         sbb     R32(%rax), R32(%rax)    C save carry
 172         add     $4, n
 173         js      L(top)                  C loop while n is still negative
 174
     C Convert the saved carry from 0 or -1 into the return value 0 or 1, then
     C restore the saved registers in reverse order of the pushes.
 175 L(end): neg     R32(%rax)
 176         pop     %r14
 177         pop     %r13
 178         pop     %r12
 179         pop     %rbp
 180         pop     %rbx
 181         FUNC_EXIT()                     C macro defined elsewhere; presumably undoes FUNC_ENTRY
 182         ret
 183 EPILOGUE()