source/libs/gmp/gmp-6.1.0/mpn/x86_64/coreibwl/mul_basecase.asm

   1 dnl  AMD64 mpn_mul_basecase optimised for Intel Broadwell.
   2
   3 dnl  Copyright 2015 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33 C cycles/limb   mul_1           addmul_1
  34 C AMD K8,K9     n/a             n/a
  35 C AMD K10       n/a             n/a
  36 C AMD bull      n/a             n/a
  37 C AMD pile      n/a             n/a
  38 C AMD steam     n/a             n/a
  39 C AMD excavator  ?               ?
  40 C AMD bobcat    n/a             n/a
  41 C AMD jaguar    n/a             n/a
  42 C Intel P4      n/a             n/a
  43 C Intel core2   n/a             n/a
  44 C Intel NHM     n/a             n/a
  45 C Intel SBR     n/a             n/a
  46 C Intel IBR     n/a             n/a
  47 C Intel HWL      1.68           n/a
  48 C Intel BWL      1.69         1.8-1.9
  49 C Intel atom    n/a             n/a
  50 C Intel SLM     n/a             n/a
  51 C VIA nano      n/a             n/a
  52
  53 C The inner loops of this code are the result of running a code generation and
  54 C optimisation tool suite written by David Harvey and Torbjorn Granlund.
  55
  56 C TODO
  57 C  * Do overlapped software pipelining.
  58 C  * When changing this, make sure the code which falls into the inner loops
  59 C    does not execute too many no-ops (for both PIC and non-PIC).
  60
  61 define(`rp',      `%rdi')   C result pointer
  62 define(`up',      `%rsi')   C first operand; source of every mulx load
  63 define(`un_param',`%rdx')   C limb count of up, as passed in
  64 define(`vp_param',`%rcx')   C second operand, as passed in
  65 define(`vn',      `%r8')    C limb count of vp; decremented once per row
  66
  67 define(`n',       `%rcx')   C inner loop counter; same register as vp_param
  68 define(`n_save',  `%rbp')   C un >> 3, copied into n at the start of every row
  69 define(`vp',      `%r14')   C running pointer into the second operand
  70 define(`unneg',   `%rbx')   C -((un + 1) & -8); scaled index that steps up and rp back
  71 define(`v0',      `%rdx')   C current multiplier limb; same register as un_param
  72 define(`jaddr',   `%rax')   C rax holds un mod 8 first, later the addmul entry address
  73
  74 define(`w0',    `%r12')     C w0..w3 receive the mulx results inside the loops
  75 define(`w1',    `%r9')
  76 define(`w2',    `%r10')
  77 define(`w3',    `%r11')
  78
  79 ABI_SUPPORT(DOS64)
  80 ABI_SUPPORT(STD64)
  81
  82 ASM_START()
  83         TEXT
  84         ALIGN(16)
  85 C mpn_mul_basecase (rp, up, un, vp, vn)
  86 C
  87 C Schoolbook multiplication.  The first pass stores {up,un} * vp[0] to rp
  88 C (the mul_1 loop); each further limb of vp adds one more row into rp (the
  89 C addmul_1 loop, built on the adcx/adox dual carry chains).
  90 C
  91 C Registers after FUNC_ENTRY:
  92 C   rdi  rp         result pointer
  93 C   rsi  up         first operand
  94 C   rdx  un_param   limb count of up; later v0, the implicit mulx operand
  95 C   rcx  vp_param   second operand; later n, the inner loop counter
  96 C   r8   vn         limb count of vp, decremented once per row
  97 C
  98 C un == 1 is treated as 1x1 and un == 2 as 2x1 or 2x2 without further checks,
  99 C and the general path always performs the first row, so the caller has to
 100 C guarantee vn >= 1, and vn <= un in the small cases.
 101 C
 102 C FUNC_EXIT() precedes every ret.  The IFDOS load below fetches the fifth
 103 C argument from 56(%rsp) rather than 40(%rsp), i.e. FUNC_ENTRY leaves two
 104 C extra words on the DOS64 stack, and they must be dropped before returning.
 105 PROLOGUE(mpn_mul_basecase)
 106         FUNC_ENTRY(4)
 107 IFDOS(` mov     56(%rsp), %r8d  ')
 108
 109         cmp     $2, un_param
 110         ja      L(gen)                  C un > 2: general code
 111         mov     (vp_param), %rdx        C rdx = vp[0]; mov and mulx keep flags
 112         mulx(   (up), %rax, %r9)        C 0 1
 113         je      L(s2x)                  C un == 2 (flags still from the cmp)
 114
 115 C un == 1: the 1x1 product is already in r9:rax.
 116 L(s11): mov     %rax, (rp)
 117         mov     %r9, 8(rp)
 118         FUNC_EXIT()
 119         ret
 120
 121 C un == 2: r10:r8 = up[1] * vp[0]; mulx leaves the vn compare flags intact.
 122 L(s2x): cmp     $2, vn
 123         mulx(   8,(up), %r8, %r10)      C 1 2
 124         je      L(s22)
 125
 126 C 2x1: three result limbs.
 127 L(s21): add     %r8, %r9
 128         adc     $0, %r10
 129         mov     %rax, (rp)
 130         mov     %r9, 8(rp)
 131         mov     %r10, 16(rp)
 132         FUNC_EXIT()
 133         ret
 134
 135 C 2x2: complete the vp[0] row, then add the vp[1] row; four result limbs.
 136 L(s22): add     %r8, %r9                C 1
 137         adc     $0, %r10                C 2
 138         mov     8(vp_param), %rdx
 139         mov     %rax, (rp)
 140         mulx(   (up), %r8, %r11)        C 1 2
 141         mulx(   8,(up), %rax, %rdx)     C 2 3
 142         add     %r11, %rax              C 2
 143         adc     $0, %rdx                C 3
 144         add     %r8, %r9                C 1
 145         adc     %rax, %r10              C 2
 146         adc     $0, %rdx                C 3
 147         mov     %r9, 8(rp)
 148         mov     %r10, 16(rp)
 149         mov     %rdx, 24(rp)
 150         FUNC_EXIT()
 151         ret
 152
 153 C General case, un > 2.  rbx, rbp, r12 and r14 are saved here and restored
 154 C at L(done).
 155         ALIGN(16)
 156 L(gen):
 157         push    %rbx
 158         push    %rbp
 159         push    %r12
 160         push    %r14
 161
 162         mov     vp_param, vp            C free rcx for use as n
 163         lea     1(un_param), unneg
 164         mov     un_param, n_save
 165         mov     R32(un_param), R32(%rax)
 166         and     $-8, unneg
 167         shr     $3, n_save              C loop count
 168         neg     unneg                   C unneg = -((un + 1) & -8)
 169         and     $7, R32(%rax)           C clear CF for adc as side-effect
 170                                         C note that rax lives very long
 171         mov     n_save, n
 172         mov     (vp), v0                C first multiplier limb, vp[0]
 173         lea     8(vp), vp
 174
 175         lea     L(mtab)(%rip), %r10     C dispatch on rax = un mod 8
 176 ifdef(`PIC',
 177 `       movslq  (%r10,%rax,4), %r11
 178         lea     (%r11, %r10), %r10
 179         jmp     *%r10
 180 ',`
 181         jmp     *(%r10,%rax,8)
 182 ')
 183
 184 C First-row entry points, one per value of un mod 8.  Each issues the first
 185 C mulx and biases up and rp so that the unrolled loop below can be entered
 186 C part way through.  inc and lea leave CF, cleared above, untouched.
 187 L(mf0): mulx(   (up), w2, w3)
 188         lea     56(up), up
 189         lea     -8(rp), rp
 190         jmp     L(mb0)
 191
 192 L(mf3): mulx(   (up), w0, w1)
 193         lea     16(up), up
 194         lea     16(rp), rp
 195         inc     n
 196         jmp     L(mb3)
 197
 198 L(mf4): mulx(   (up), w2, w3)
 199         lea     24(up), up
 200         lea     24(rp), rp
 201         inc     n
 202         jmp     L(mb4)
 203
 204 L(mf5): mulx(   (up), w0, w1)
 205         lea     32(up), up
 206         lea     32(rp), rp
 207         inc     n
 208         jmp     L(mb5)
 209
 210 L(mf6): mulx(   (up), w2, w3)
 211         lea     40(up), up
 212         lea     40(rp), rp
 213         inc     n
 214         jmp     L(mb6)
 215
 216 L(mf7): mulx(   (up), w0, w1)
 217         lea     48(up), up
 218         lea     48(rp), rp
 219         inc     n
 220         jmp     L(mb7)
 221
 222 L(mf1): mulx(   (up), w0, w1)
 223         jmp     L(mb1)
 224
 225 L(mf2): mulx(   (up), w2, w3)
 226         lea     8(up), up
 227         lea     8(rp), rp
 228         mulx(   (up), w0, w1)
 229
 230 C mul_1 loop, 8 limbs per iteration.  The carry between limbs lives in CF;
 231 C lea, mov, mulx and dec do not write CF.
 232         ALIGN(16)
 233 L(m1top):
 234         mov     w2, -8(rp)
 235         adc     w3, w0
 236 L(mb1): mulx(   8,(up), w2, w3)
 237         adc     w1, w2
 238         lea     64(up), up
 239         mov     w0, (rp)
 240 L(mb0): mov     w2, 8(rp)
 241         mulx(   -48,(up), w0, w1)
 242         lea     64(rp), rp
 243         adc     w3, w0
 244 L(mb7): mulx(   -40,(up), w2, w3)
 245         mov     w0, -48(rp)
 246         adc     w1, w2
 247 L(mb6): mov     w2, -40(rp)
 248         mulx(   -32,(up), w0, w1)
 249         adc     w3, w0
 250 L(mb5): mulx(   -24,(up), w2, w3)
 251         mov     w0, -32(rp)
 252         adc     w1, w2
 253 L(mb4): mulx(   -16,(up), w0, w1)
 254         mov     w2, -24(rp)
 255         adc     w3, w0
 256 L(mb3): mulx(   -8,(up), w2, w3)
 257         adc     w1, w2
 258         mov     w0, -16(rp)
 259         dec     n
 260         mulx(   (up), w0, w1)
 261         jnz     L(m1top)                C ZF from dec n; mulx keeps flags
 262
 263 L(m1end):
 264         mov     w2, -8(rp)
 265         adc     w3, w0
 266         mov     w0, (rp)
 267         adc     %rcx, w1                C relies on rcx = 0
 268         mov     w1, 8(rp)
 269
 270         dec     vn
 271         jz      L(done)                 C vn was 1: no rows to add
 272
 273         lea     L(atab)(%rip), %r10     C resolve the addmul entry once
 274 ifdef(`PIC',
 275 `       movslq  (%r10,%rax,4), %rax
 276         lea     (%rax, %r10), jaddr
 277 ',`
 278         mov     (%r10,%rax,8), jaddr
 279 ')
 280
 281 C Start of one addmul row: step up back by -unneg limbs, reload the loop
 282 C count and fetch the next multiplier limb.
 283 L(outer):
 284         lea     (up,unneg,8), up
 285         mov     n_save, n
 286         mov     (vp), v0
 287         lea     8(vp), vp
 288         jmp     *jaddr
 289
 290 C Addmul entry points, one per value of un mod 8.  Each issues the first mulx
 291 C and repositions rp for the new row.
 292 L(f0):  mulx(   8,(up), w2, w3)
 293         lea     8(rp,unneg,8), rp
 294         lea     -1(n), n
 295         jmp     L(b0)
 296
 297 L(f3):  mulx(   -16,(up), w0, w1)
 298         lea     -56(rp,unneg,8), rp
 299         jmp     L(b3)
 300
 301 L(f4):  mulx(   -24,(up), w2, w3)
 302         lea     -56(rp,unneg,8), rp
 303         jmp     L(b4)
 304
 305 L(f5):  mulx(   -32,(up), w0, w1)
 306         lea     -56(rp,unneg,8), rp
 307         jmp     L(b5)
 308
 309 L(f6):  mulx(   -40,(up), w2, w3)
 310         lea     -56(rp,unneg,8), rp
 311         jmp     L(b6)
 312
 313 L(f7):  mulx(   16,(up), w0, w1)
 314         lea     8(rp,unneg,8), rp
 315         jmp     L(b7)
 316
 317 L(f1):  mulx(   (up), w0, w1)
 318         lea     8(rp,unneg,8), rp
 319         jmp     L(b1)
 320
 321 C End of an addmul row: fold the OF chain, then CF, into the top limb.
 322 L(am1end):
 323         adox(   (rp), w0)
 324         adox(   %rcx, w1)               C relies on rcx = 0
 325         mov     w0, (rp)
 326         adc     %rcx, w1                C relies on rcx = 0
 327         mov     w1, 8(rp)
 328
 329 C dec rewrites OF (the adc above may have set it) but never writes CF; CF is
 330 C what that adc left, i.e. 0 unless the top limb overflowed.
 331         dec     vn                      C resets OF for the next row
 332         jnz     L(outer)
 333 L(done):
 334         pop     %r14
 335         pop     %r12
 336         pop     %rbp
 337         pop     %rbx
 338         FUNC_EXIT()
 339         ret
 340
 341 L(f2):
 342         mulx(   -8,(up), w2, w3)
 343         lea     8(rp,unneg,8), rp
 344         mulx(   (up), w0, w1)
 345
 346 C addmul_1 loop, 8 limbs per iteration.  adcx chains the product limbs
 347 C through CF while adox adds the existing rp limbs through OF; jrcxz and lea
 348 C do the loop control without writing either flag.
 349         ALIGN(16)
 350 L(am1top):
 351         adox(   -8,(rp), w2)
 352         adcx(   w3, w0)
 353         mov     w2, -8(rp)
 354         jrcxz   L(am1end)
 355 L(b1):  mulx(   8,(up), w2, w3)
 356         adox(   (rp), w0)
 357         lea     -1(n), n
 358         mov     w0, (rp)
 359         adcx(   w1, w2)
 360 L(b0):  mulx(   16,(up), w0, w1)
 361         adcx(   w3, w0)
 362         adox(   8,(rp), w2)
 363         mov     w2, 8(rp)
 364 L(b7):  mulx(   24,(up), w2, w3)
 365         lea     64(up), up
 366         adcx(   w1, w2)
 367         adox(   16,(rp), w0)
 368         mov     w0, 16(rp)
 369 L(b6):  mulx(   -32,(up), w0, w1)
 370         adox(   24,(rp), w2)
 371         adcx(   w3, w0)
 372         mov     w2, 24(rp)
 373 L(b5):  mulx(   -24,(up), w2, w3)
 374         adcx(   w1, w2)
 375         adox(   32,(rp), w0)
 376         mov     w0, 32(rp)
 377 L(b4):  mulx(   -16,(up), w0, w1)
 378         adox(   40,(rp), w2)
 379         adcx(   w3, w0)
 380         mov     w2, 40(rp)
 381 L(b3):  adox(   48,(rp), w0)
 382         mulx(   -8,(up), w2, w3)
 383         mov     w0, 48(rp)
 384         lea     64(rp), rp
 385         adcx(   w1, w2)
 386         mulx(   (up), w0, w1)
 387         jmp     L(am1top)
 388
 389 C Jump tables indexed by un mod 8: L(mtab) for the first row, L(atab) for
 390 C the addmul rows.
 391         JUMPTABSECT
 392         ALIGN(8)
 393 L(mtab):JMPENT( L(mf0), L(mtab))
 394         JMPENT( L(mf1), L(mtab))
 395         JMPENT( L(mf2), L(mtab))
 396         JMPENT( L(mf3), L(mtab))
 397         JMPENT( L(mf4), L(mtab))
 398         JMPENT( L(mf5), L(mtab))
 399         JMPENT( L(mf6), L(mtab))
 400         JMPENT( L(mf7), L(mtab))
 401 L(atab):JMPENT( L(f0), L(atab))
 402         JMPENT( L(f1), L(atab))
 403         JMPENT( L(f2), L(atab))
 404         JMPENT( L(f3), L(atab))
 405         JMPENT( L(f4), L(atab))
 406         JMPENT( L(f5), L(atab))
 407         JMPENT( L(f6), L(atab))
 408         JMPENT( L(f7), L(atab))
 409         TEXT
 410 EPILOGUE()