source/libs/gmp/gmp-src/mpn/x86_64/coreisbr/aorsmul_1.asm

   1 dnl  X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Sandy Bridge.
   2
   3 dnl  Contributed to the GNU project by Torbjörn Granlund.
   4
   5 dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8 dnl
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of either:
  11 dnl
  12 dnl    * the GNU Lesser General Public License as published by the Free
  13 dnl      Software Foundation; either version 3 of the License, or (at your
  14 dnl      option) any later version.
  15 dnl
  16 dnl  or
  17 dnl
  18 dnl    * the GNU General Public License as published by the Free Software
  19 dnl      Foundation; either version 2 of the License, or (at your option) any
  20 dnl      later version.
  21 dnl
  22 dnl  or both in parallel, as here.
  23 dnl
  24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  27 dnl  for more details.
  28 dnl
  29 dnl  You should have received copies of the GNU General Public License and the
  30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  31 dnl  see https://www.gnu.org/licenses/.
  32
  33 include(`../config.m4')
  34
  35 C            cycles/limb
  36 C AMD K8,K9
  37 C AMD K10
  38 C AMD bull
  39 C AMD pile
  40 C AMD steam
  41 C AMD bobcat
  42 C AMD jaguar
  43 C Intel P4
  44 C Intel core
  45 C Intel NHM
  46 C Intel SBR      3.24 (average, fluctuating in 3.20-3.57)
  47 C Intel IBR      3.04
  48 C Intel HWL
  49 C Intel BWL
  50 C Intel atom
  51 C VIA nano
  52
  53 C The loop of this code is the result of running a code generation and
  54 C optimization tool suite written by David Harvey and Torbjörn Granlund.
  55
  56 define(`rp',      `%rdi')   C limb array that is read and written; DOS64 passes it in rcx
  57 define(`up',      `%rsi')   C limb array that is only read; DOS64 passes it in rdx
  58 define(`n_param', `%rdx')   C limb count as passed by the caller; DOS64 passes it in r8
  59 define(`v0',      `%rcx')   C multiplier operand of every mul; DOS64 passes it in r9
  60
  61 define(`n',       `%rbx')   C negative limb index counted up by 4 per loop pass; rbx is pushed and popped by the function
  62
  63 define(`I',`$1')   C I(a,b) expands to a only; the loop tail uses it to pick between two address forms
  64 C ADDSUB and func are set by whichever OPERATION_ symbol is defined when this file is processed.
  65 ifdef(`OPERATION_addmul_1',`
  66       define(`ADDSUB',        `add')
  67       define(`func',  `mpn_addmul_1')
  68 ')
  69 ifdef(`OPERATION_submul_1',`
  70       define(`ADDSUB',        `sub')
  71       define(`func',  `mpn_submul_1')
  72 ')
  73 C NOTE(review): ABI_SUPPORT and MULFUNC_PROLOGUE are defined outside this file; presumably they declare the supported ABIs and the two entry point names - confirm.
  74 ABI_SUPPORT(DOS64)
  75 ABI_SUPPORT(STD64)
  76
  77 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
  78 C DOS64 overrides of the register names above; the doubled quotes keep the replacement text from being rescanned.
  79 IFDOS(` define(`up',     ``%rsi'')') dnl  up stays rsi; the function copies rdx into rsi first
  80 IFDOS(` define(`rp',     ``%rcx'')') dnl  rp is used where it arrives
  81 IFDOS(` define(`v0',     ``%r9'')') dnl  v0 is used where it arrives
  82 IFDOS(` define(`r9',     ``rdi'')') dnl  v0 occupies r9, so the scratch name r9 in the body becomes rdi
  83 IFDOS(` define(`n_param',``%r8'')') dnl  n_param is used where it arrives
  84
  85 ASM_START()
  86         TEXT
  87         ALIGN(32)
  88 PROLOGUE(func)
  89 C For each of n_param limbs: rp limb ADDSUB= low half of (up limb * v0), with high halves and carries or borrows passed to the next limb; rax returns the final high limb.
  90 IFDOS(``push    %rsi            '')
  91 IFDOS(``push    %rdi            '')
  92 IFDOS(``mov     %rdx, %rsi      '')
  93 C DOS64 only (three lines above): rsi and rdi are saved and up is copied from rdx into rsi.
  94         mov     (up), %rax   C rax = first up limb, loaded before up is advanced
  95         push    %rbx   C rbx is used as n below
  96         lea     (up,n_param,8), up   C up now points just past its last limb
  97         lea     (rp,n_param,8), rp   C rp now points just past its last limb, so limbs are addressed with negative n
  98
  99         test    $1, R8(n_param)   C dispatch on the low two bits of n_param to one of four loop entry points
 100         jnz     L(b13)   C odd limb count
 101
 102 L(b02): xor     R32(%r11), R32(%r11)   C even count: no high limb pending from a previous product
 103         test    $2, R8(n_param)
 104         jnz     L(b2)
 105
 106 L(b0):  mov     $1, R32(n)   C n_param mod 4 == 0
 107         sub     n_param, n   C n = 1 - n_param
 108         mul     v0   C rdx:rax = first up limb * v0
 109         mov     %rdx, %r9   C r9 = high half, pending for the next limb
 110         mov     -8(rp,n,8), %r8   C r8 = first rp limb
 111         jmp     L(e0)
 112
 113         ALIGN(16)
 114 L(b2):  mov     $-1, n   C n_param mod 4 == 2
 115         sub     n_param, n   C n = -1 - n_param
 116         mul     v0   C rdx:rax = first up limb * v0
 117         mov     8(rp,n,8), %r8   C r8 = first rp limb
 118         mov     %rdx, %r9   C r9 = high half, pending for the next limb
 119         jmp     L(e2)
 120
 121         ALIGN(16)
 122 L(b13): xor     R32(%r9), R32(%r9)   C odd count: no high limb pending from a previous product
 123         test    $2, R8(n_param)
 124         jnz     L(b3)
 125
 126 L(b1):  mov     $2, R32(n)   C n_param mod 4 == 1
 127         sub     n_param, n   C n = 2 - n_param
 128         jns     L(1)   C n not negative means a single limb (n_param == 1, assuming n_param >= 1)
 129         mul     v0   C rdx:rax = first up limb * v0
 130         mov     -16(rp,n,8), %r10   C r10 = first rp limb
 131         mov     %rdx, %r11   C r11 = high half, pending for the next limb
 132         jmp     L(e1)
 133
 134         ALIGN(16)
 135 L(b3):  xor     R32(n), R32(n)   C n_param mod 4 == 3
 136         sub     n_param, n   C n = -n_param
 137         mul     v0   C rdx:rax = first up limb * v0
 138         mov     (rp,n,8), %r10   C r10 = first rp limb; the high half is picked up from rdx at e3
 139         jmp     L(e3)
 140 C Loop, four limbs per pass: r8 and r10 alternate as the rp limb in flight, r9 and r11 alternate as the high half carried into the following limb.
 141         ALIGN(32)
 142 L(top): mul     v0   C rdx:rax = up limb loaded at the end of the previous pass * v0
 143         mov     -16(rp,n,8), %r10   C r10 = rp limb that this product updates
 144         ADDSUB  %r11, %r8   C apply the pending high half to the previous limb
 145         mov     %rdx, %r11   C mov leaves the flags alone, so the adc below still sees the ADDSUB carry
 146         adc     $0, %r9   C fold that carry or borrow into the next pending high half
 147         mov     %r8, -24(rp,n,8)   C store the finished limb
 148 L(e1):  ADDSUB  %rax, %r10   C entry for n_param mod 4 == 1: apply the low half
 149         mov     -8(up,n,8), %rax   C fetch the next up limb
 150         adc     $0, %r11
 151         mul     v0
 152         ADDSUB  %r9, %r10   C apply the pending high half
 153         mov     %rdx, %r9
 154         mov     -8(rp,n,8), %r8
 155         adc     $0, %r11
 156         mov     %r10, -16(rp,n,8)   C store the finished limb
 157 L(e0):  ADDSUB  %rax, %r8   C entry for n_param mod 4 == 0: apply the low half
 158         adc     $0, %r9
 159         mov     (up,n,8), %rax   C fetch the next up limb
 160         mul     v0
 161         mov     (rp,n,8), %r10
 162         ADDSUB  %r11, %r8   C apply the pending high half
 163         mov     %r8, -8(rp,n,8)   C store the finished limb; mov keeps the carry for the adc below
 164         adc     $0, %r9
 165 L(e3):  mov     %rdx, %r11   C entry for n_param mod 4 == 3: take the high half of the current product
 166         ADDSUB  %rax, %r10   C apply the low half
 167         mov     8(up,n,8), %rax   C fetch the next up limb
 168         adc     $0, %r11
 169         mul     v0
 170         mov     8(rp,n,8), %r8
 171         ADDSUB  %r9, %r10   C apply the pending high half
 172         mov     %rdx, %r9
 173         mov     %r10, (rp,n,8)   C store the finished limb; mov keeps the carry for the adc below
 174         adc     $0, %r11
 175 L(e2):  ADDSUB  %rax, %r8   C entry for n_param mod 4 == 2: apply the low half
 176         adc     $0, %r9
 177         mov     16(up,n,8), %rax   C fetch the up limb for the next mul, at L(top) or L(end)
 178         add     $4, n   C advance four limbs
 179         jnc     L(top)   C a carry out means n went from negative to non-negative: leave the loop
 180 C Tail: n is 1 here for every entry path, so both alternatives inside each I() below address the same limb.
 181 L(end): mul     v0   C product for the last limb
 182         mov     I(-8(rp),-16(rp,n,8)), %r10   C r10 = last rp limb
 183         ADDSUB  %r11, %r8   C finish the second to last limb
 184         mov     %rdx, %r11
 185         adc     $0, %r9
 186         mov     %r8, I(-16(rp),-24(rp,n,8))
 187         ADDSUB  %rax, %r10   C apply the low half to the last limb
 188         adc     $0, %r11
 189         ADDSUB  %r9, %r10   C apply the pending high half to the last limb
 190         adc     $0, %r11
 191         mov     %r10, I(-8(rp),-16(rp,n,8))
 192         mov     %r11, %rax   C return value: final high half plus accumulated carries or borrows
 193
 194         pop     %rbx
 195 IFDOS(``pop     %rdi            '')
 196 IFDOS(``pop     %rsi            '')
 197         ret
 198 C Single-limb case, reached from L(b1): the only rp limb is updated directly in memory.
 199         ALIGN(16)
 200 L(1):   mul     v0
 201         ADDSUB  %rax, -8(rp)
 202         mov     %rdx, %rax   C mov keeps the carry for the adc below
 203         adc     $0, %rax   C return the high half plus the carry or borrow
 204         pop     %rbx
 205 IFDOS(``pop     %rdi            '')
 206 IFDOS(``pop     %rsi            '')
 207         ret
 208 EPILOGUE()
 209 ASM_END()