source/libs/gmp/gmp-src/mpn/x86_64/coreisbr/addmul_2.asm

   1 dnl  AMD64 mpn_addmul_2 optimised for Intel Sandy Bridge.
   2
   3 dnl  Contributed to the GNU project by Torbjörn Granlund.
   4
   5 dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8 dnl
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of either:
  11 dnl
  12 dnl    * the GNU Lesser General Public License as published by the Free
  13 dnl      Software Foundation; either version 3 of the License, or (at your
  14 dnl      option) any later version.
  15 dnl
  16 dnl  or
  17 dnl
  18 dnl    * the GNU General Public License as published by the Free Software
  19 dnl      Foundation; either version 2 of the License, or (at your option) any
  20 dnl      later version.
  21 dnl
  22 dnl  or both in parallel, as here.
  23 dnl
  24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  27 dnl  for more details.
  28 dnl
  29 dnl  You should have received copies of the GNU General Public License and the
  30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  31 dnl  see https://www.gnu.org/licenses/.
  32
  33 include(`../config.m4')
  34
  35 C            cycles/limb        best
  36 C AMD K8,K9
  37 C AMD K10
  38 C AMD bull
  39 C AMD pile
  40 C AMD bobcat
  41 C AMD jaguar
  42 C Intel P4
  43 C Intel core
  44 C Intel NHM
  45 C Intel SBR      2.93           this
  46 C Intel IBR      2.66           this
  47 C Intel HWL      2.5             2.15
  48 C Intel BWL
  49 C Intel atom
  50 C VIA nano
  51
  52 C This code is the result of running a code generation and optimisation tool
  53 C suite written by David Harvey and Torbjorn Granlund.
  54
  55 C When playing with pointers, set this to $2 to fall back to conservative
  56 C indexing in wind-down code.
  57 define(`I',`$1')            C as defined here, I expands to its first argument
  58
  59
     C Incoming parameters.  The register named in each trailing comment is
     C presumably where the DOS64 ABI passes the same parameter -- confirm
     C against the FUNC_ENTRY definition, which lives outside this file.
  60 define(`rp',      `%rdi')   C rcx
  61 define(`up',      `%rsi')   C rdx
  62 define(`n_param', `%rdx')   C r8
  63 define(`vp',      `%rcx')   C r9
  64
     C Working registers.  n shares %rcx with vp, so vp has to be consumed
     C before n is first written.
  65 define(`n',       `%rcx')   C negated limb count, used as the loop index
  66 define(`v0',      `%rbx')   C multiplier limb loaded from (vp)
  67 define(`v1',      `%rbp')   C multiplier limb loaded from 8(vp)
  68 define(`w0',      `%r8')    C w0..w3 collect high product words and carries
  69 define(`w1',      `%r9')
  70 define(`w2',      `%r10')
  71 define(`w3',      `%r11')
  72 define(`X0',      `%r12')   C X0 and X1 accumulate the result limbs being built
  73 define(`X1',      `%r13')
  74
  75 ABI_SUPPORT(DOS64)
  76 ABI_SUPPORT(STD64)
  77
  78 ASM_START()
  79         TEXT
  80         ALIGN(32)
     C mpn_addmul_2 -- arguments rp, up, n_param, vp (see the define lines
     C earlier in this file for the register assignment).
     C
     C As implemented below, with v0 = vp[0] and v1 = vp[1]:
     C   * each source limb up[i] is multiplied by v0 and by v1;
     C   * up[i]*v0 is accumulated at result limb i and up[i]*v1 at limb i+1,
     C     on top of the limbs loaded from rp;
     C   * the final two stores write the original rp[n_param-1] and
     C     rp[n_param]; the latter is stored from the product sum without its
     C     previous contents being added;
     C   * the remaining high word is returned in %rax.
     C
     C Registers: v0 = %rbx, v1 = %rbp, X0 = %r12 and X1 = %r13 are saved and
     C restored here; w0..w3 = %r8..%r11, %rcx, %rax and %rdx are scratch.
     C
     C NOTE(review): the entry dispatch appears to require n_param >= 2.  With
     C n_param = 1 the b01 path would reach lo1 with n = 0 and run a full loop
     C pass.  Confirm against the documented caller contract.
  81 PROLOGUE(mpn_addmul_2)
  82         FUNC_ENTRY(4)                   C macro defined outside this file; presumably adapts DOS64 arguments -- confirm
  83         push    %rbx                    C register used for v0
  84         push    %rbp                    C register used for v1
  85         push    %r12                    C register used for X0
  86         push    %r13                    C register used for X1
  87
  88         mov     (vp), v0                C v0 = vp[0]; read before n overwrites %rcx
  89         mov     8(vp), v1               C v1 = vp[1]
  90
  91         mov     (up), %rax              C rax = up[0]
  92
  93         mov     n_param, n
  94         neg     n                       C n = -n_param, a negative index that counts up
  95
  96         lea     (up,n_param,8), up      C (up,n,8) now addresses the original up[0]
  97         lea     8(rp,n_param,8), rp     C -8(rp,n,8) now addresses the original rp[0]
  98         mul     v0                      C rdx:rax = up[0] * v0
  99
 100         test    $1, R8(n)               C bit 0 of n; R8 presumably names the low byte register -- confirm
 101         jnz     L(bx1)
 102
 103 L(bx0): mov     -8(rp,n,8), X0          C X0 = rp[0]
 104         mov     %rdx, w1                C w1 = high word of up[0]*v0
 105         add     %rax, X0                C X0 += low word of up[0]*v0
 106         adc     $0, w1                  C fold the carry into w1
 107         mov     (up,n,8), %rax          C reload up[0] for the v1 multiply
 108         xor     w0, w0                  C w0 = 0 (carry-in consumed at lo0 and lo2)
 109         xor     w3, w3                  C w3 = 0
 110         test    $2, R8(n)               C bit 1 of n picks the loop entry point
 111         jnz     L(b10)
 112
 113 L(b00): nop                             C this nop makes the loop go faster on SBR!
 114         mul     v1                      C rdx:rax = up[0] * v1
 115         mov     (rp,n,8), X1            C X1 = rp[1]
 116         jmp     L(lo0)
 117
 118 L(b10): lea     -2(n), n                C n -= 2 to match the offsets used from lo2 on
 119         jmp     L(lo2)
 120
 121 L(bx1): mov     -8(rp,n,8), X1          C X1 = rp[0] (X0/X1 roles swapped relative to bx0)
 122         mov     %rdx, w3                C w3 = high word of up[0]*v0
 123         add     %rax, X1                C X1 += low word of up[0]*v0
 124         adc     $0, w3                  C fold the carry into w3
 125         mov     (up,n,8), %rax          C reload up[0] for the v1 multiply
 126         xor     w1, w1                  C w1 = 0 (carry-in consumed at lo1 and lo3)
 127         xor     w2, w2                  C w2 = 0 (carry-in consumed at lo1 and lo3)
 128         test    $2, R8(n)               C bit 1 of n picks the loop entry point
 129         jz      L(b11)
 130
 131 L(b01): mov     (rp,n,8), X0            C X0 = rp[1]
 132         inc     n                       C n += 1 to match the offsets used from lo1 on
 133         jmp     L(lo1)
 134
 135 L(b11): dec     n                       C n -= 1 to match the offsets used from lo3 on
 136         jmp     L(lo3)
 137
     C Main loop, unrolled four limbs per pass.  n is negative and advances by
     C 4 per pass; the loop ends when the add at the bottom carries out.  The
     C labels lo0..lo3 are the entry points chosen by the dispatch above.  The
     C trailing numeric markers are from the original author and presumably
     C give the relative result limb each instruction contributes to.
 138         ALIGN(32)
 139 L(top):
 140 L(lo1): mul     v1
 141         mov     %rdx, w0                C 1
 142         add     %rax, X0                C 0
 143         adc     $0, w0                  C 1
 144         add     w1, X1                  C 3
 145         adc     $0, w3                  C 0
 146         add     w2, X0                  C 0
 147         adc     $0, w0                  C 1
 148         mov     (up,n,8), %rax          C load the next source limb
 149         mul     v0
 150         add     %rax, X0                C 0
 151         mov     %rdx, w1                C 1
 152         adc     $0, w1                  C 1
 153         mov     (up,n,8), %rax
 154         mul     v1
 155         mov     X1, -16(rp,n,8)         C 3
 156         mov     (rp,n,8), X1            C 1
 157         add     w3, X0                  C 0
 158         adc     $0, w1                  C 1
 159 L(lo0): mov     %rdx, w2                C 2
 160         mov     X0, -8(rp,n,8)          C 0
 161         add     %rax, X1                C 1
 162         adc     $0, w2                  C 2
 163         mov     8(up,n,8), %rax
 164         add     w0, X1                  C 1
 165         adc     $0, w2                  C 2
 166         mul     v0
 167         add     %rax, X1                C 1
 168         mov     %rdx, w3                C 2
 169         adc     $0, w3                  C 2
 170         mov     8(up,n,8), %rax
 171 L(lo3): mul     v1
 172         add     w1, X1                  C 1
 173         mov     8(rp,n,8), X0           C 2
 174         adc     $0, w3                  C 2
 175         mov     %rdx, w0                C 3
 176         add     %rax, X0                C 2
 177         adc     $0, w0                  C 3
 178         mov     16(up,n,8), %rax
 179         mul     v0
 180         add     w2, X0                  C 2
 181         mov     X1, (rp,n,8)            C 1
 182         mov     %rdx, w1                C 3
 183         adc     $0, w0                  C 3
 184         add     %rax, X0                C 2
 185         adc     $0, w1                  C 3
 186         mov     16(up,n,8), %rax
 187         add     w3, X0                  C 2
 188         adc     $0, w1                  C 3
 189 L(lo2): mul     v1
 190         mov     16(rp,n,8), X1          C 3
 191         add     %rax, X1                C 3
 192         mov     %rdx, w2                C 4
 193         adc     $0, w2                  C 4
 194         mov     24(up,n,8), %rax
 195         mov     X0, 8(rp,n,8)           C 2
 196         mul     v0
 197         add     w0, X1                  C 3
 198         mov     %rdx, w3                C 4
 199         adc     $0, w2                  C 4
 200         add     %rax, X1                C 3
 201         mov     24(up,n,8), %rax
 202         mov     24(rp,n,8), X0          C 0     useless but harmless final read
 203         adc     $0, w3                  C 4
 204         add     $4, n                   C carries out only once n reaches 0
 205         jnc     L(top)
 206
     C Wind-down.  %rax still holds the last source limb loaded in the loop,
     C X1 the second-to-last result limb, and w1..w3 the pending carry words.
 207 L(end): mul     v1                      C rdx:rax = last source limb * v1
 208         add     w1, X1
 209         adc     $0, w3
 210         add     w2, %rax
 211         adc     $0, %rdx
 212         mov     X1, I(-16(rp),-16(rp,n,8))      C original rp[n_param-1]; I picks the first form, n is 0 here
 213         add     w3, %rax
 214         adc     $0, %rdx
 215         mov     %rax, I(-8(rp),-8(rp,n,8))      C original rp[n_param]
 216         mov     %rdx, %rax              C return the top word
 217
 218         pop     %r13                    C restore in reverse push order
 219         pop     %r12
 220         pop     %rbp
 221         pop     %rbx
 222         FUNC_EXIT()
 223         ret
 224 EPILOGUE()