dnl  source/libs/gmp/gmp-src/mpn/x86_64/coreisbr/mul_basecase.asm
dnl  AMD64 mpn_mul_basecase optimised for Intel Sandy Bridge and Ivy Bridge.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C cycles/limb     mul_1    mul_2    mul_3    addmul_2
C AMD K8,K9
C AMD K10
C AMD bull
C AMD pile
C AMD steam
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel core
C Intel NHM
C Intel SBR        2.5      2.5      -        2.95
C Intel IBR        2.4      2.3      -        2.68
C Intel HWL        2.35     2.0      -        2.5
C Intel BWL
C Intel atom
C VIA nano

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Fix the addmul_2 fluctuation affecting SBR.
C  * Improve feed-in code, avoiding zeroing of many registers and dummy adds in
C    the loops at the expense of code size.
C  * Adjoin a mul_3, avoiding slow mul_1 for odd vn.
C  * Consider replacing the 2-way mul_2 code with 4-way code, for a very slight
C    speedup.
C  * Further micro-optimise.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')
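
C With I at its default $1, a use such as `mov w1, I(-8(rp),-24(rp,n,8))'
C below expands to the direct form `mov w1, -8(rp)'; redefining I as $2
C selects the second, conservatively indexed operand instead.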

define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`un_param', `%rdx')
define(`vp',       `%rcx')
define(`vn',       `%r8')

define(`un', `%rbx')

define(`w0', `%r10')
define(`w1', `%r11')
define(`w2', `%r12')
define(`w3', `%r13')
define(`n',  `%rbp')
define(`v0', `%r9')
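
C Reference semantics, as an illustrative C sketch (not part of the build):
C mpn_mul_basecase forms the full (un+vn)-limb product rp[] = up[] * vp[]
C for un >= vn >= 1, equivalent to
C
C	rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
C	for (i = 1; i < vn; i++)
C	  rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
C
C This file instead feeds in with mul_1 (vn odd) or mul_2 (vn even), then
C consumes the remaining v limbs two at a time with addmul_2.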

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_basecase)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')
	push	%rbx
	push	%rbp
	mov	un_param, un		C free up rdx
	neg	un

	mov	(up), %rax		C shared for mul_1 and mul_2
	lea	(up,un_param,8), up	C point at operand end
	lea	(rp,un_param,8), rp	C point at rp[un-1]

	mov	(vp), v0		C shared for mul_1 and mul_2
	mul	v0			C shared for mul_1 and mul_2

	test	$1, R8(vn)
	jz	L(do_mul_2)
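
C Dispatch on vn: an odd vn first runs one mul_1 pass over up[] with vp[0],
C leaving an even number of v limbs for the addmul_2 outer loop; an even vn
C starts directly with a mul_2 pass using vp[0] and vp[1].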

L(do_mul_1):
	test	$1, R8(un)
	jnz	L(m1x1)

L(m1x0):mov	%rax, w0		C un = 2, 4, 6, 8, ...
	mov	%rdx, w1
	mov	8(up,un,8), %rax
	test	$2, R8(un)
	jnz	L(m110)

L(m100):lea	2(un), n		C un = 4, 8, 12, ...
	jmp	L(m1l0)

L(m110):lea	(un), n			C un = 2, 6, 10, ...
	jmp	L(m1l2)

L(m1x1):mov	%rax, w1		C un = 1, 3, 5, 7, ...
	mov	%rdx, w0
	test	$2, R8(un)
	jz	L(m111)

L(m101):lea	3(un), n		C un = 1, 5, 9, ...
	test	n, n
	js	L(m1l1)
	mov	%rax, -8(rp)
	mov	%rdx, (rp)
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

L(m111):lea	1(un), n		C un = 3, 7, 11, ...
	mov	8(up,un,8), %rax
	jmp	L(m1l3)
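
C The mul_1 loop below is 4-way unrolled; the feed-in code above enters it
C at L(m1l0)..L(m1l3) according to un mod 4, and n advances by 4 limbs per
C iteration until the `add $4, n' carries.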

	ALIGN(16)			C FIXME
L(m1tp):mov	%rdx, w0
	add	%rax, w1
L(m1l1):mov	-16(up,n,8), %rax
	adc	$0, w0
	mul	v0
	add	%rax, w0
	mov	w1, -24(rp,n,8)
	mov	-8(up,n,8), %rax
	mov	%rdx, w1
	adc	$0, w1
L(m1l0):mul	v0
	mov	w0, -16(rp,n,8)
	add	%rax, w1
	mov	%rdx, w0
	mov	(up,n,8), %rax
	adc	$0, w0
L(m1l3):mul	v0
	mov	w1, -8(rp,n,8)
	mov	%rdx, w1
	add	%rax, w0
	mov	8(up,n,8), %rax
	adc	$0, w1
L(m1l2):mul	v0
	mov	w0, (rp,n,8)
	add	$4, n
	jnc	L(m1tp)

L(m1ed):add	%rax, w1
	adc	$0, %rdx
	mov	w1, I(-8(rp),-24(rp,n,8))
	mov	%rdx, I((rp),-16(rp,n,8))

	dec	R32(vn)
	jz	L(ret2)

	lea	8(vp), vp
	lea	8(rp), rp
	push	%r12
	push	%r13
	push	%r14
	jmp	L(do_addmul)

L(do_mul_2):
define(`v1', `%r14')
	push	%r12
	push	%r13
	push	%r14

	mov	8(vp), v1

	test	$1, R8(un)
	jnz	L(m2b1)

L(m2b0):lea	(un), n
	xor	w0, w0
	mov	%rax, w2
	mov	%rdx, w1
	jmp	L(m2l0)

L(m2b1):lea	1(un), n
	xor	w1, w1
	xor	w2, w2
	mov	%rax, w0
	mov	%rdx, w3
	jmp	L(m2l1)
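
C The mul_2 loop below multiplies up[] by the two-limb value (v1:v0) in a
C single pass, 2-way unrolled; the feed-in above enters at L(m2l0) or
C L(m2l1) according to un mod 2.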

	ALIGN(32)
L(m2tp):mul	v0
	add	%rax, w0
	mov	%rdx, w3
	adc	$0, w3
L(m2l1):mov	-8(up,n,8), %rax
	mul	v1
	add	w1, w0
	adc	$0, w3
	add	%rax, w2
	mov	w0, -8(rp,n,8)
	mov	%rdx, w0
	adc	$0, w0
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w2
	mov	%rdx, w1
	adc	$0, w1
	add	w3, w2
L(m2l0):mov	(up,n,8), %rax
	adc	$0, w1
	mul	v1
	mov	w2, (rp,n,8)
	add	%rax, w0
	mov	%rdx, w2
	mov	8(up,n,8), %rax
	adc	$0, w2
	add	$2, n
	jnc	L(m2tp)

L(m2ed):mul	v0
	add	%rax, w0
	mov	%rdx, w3
	adc	$0, w3
	mov	I(-8(up),-8(up,n,8)), %rax
	mul	v1
	add	w1, w0
	adc	$0, w3
	add	%rax, w2
	mov	w0, I(-8(rp),-8(rp,n,8))
	adc	$0, %rdx
	add	w3, w2
	mov	w2, I((rp),(rp,n,8))
	adc	$0, %rdx
	mov	%rdx, I(8(rp),8(rp,n,8))

	add	$-2, R32(vn)
	jz	L(ret5)
	lea	16(vp), vp
	lea	16(rp), rp

L(do_addmul):
	push	%r15
	push	vn			C save vn in new stack slot
define(`vn', `(%rsp)')
define(`X0', `%r14')
define(`X1', `%r15')
define(`v1', `%r8')
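
C From here on %r8 is reused as v1, so the remaining count vn lives in the
C stack slot pushed above.  Each L(outer) pass performs an addmul_2, adding
C up[] times the next two v limbs (v1:v0) into rp[], then stepping vp and
C rp by 16 bytes.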

L(outer):
	mov	(vp), v0
	mov	8(vp), v1
	mov	(up,un,8), %rax
	mul	v0
	test	$1, R8(un)
	jnz	L(a1x1)

L(a1x0):mov	(rp,un,8), X0
	xor	w0, w0
	mov	%rdx, w1
	test	$2, R8(un)
	jnz	L(a110)

L(a100):lea	2(un), n		C un = 4, 8, 12, ...
	add	%rax, X0
	adc	$0, w1
	mov	(up,un,8), %rax
	mul	v1
	mov	8(rp,un,8), X1
	jmp	L(lo0)

L(a110):lea	(un), n			C un = 2, 6, 10, ...
	xor	w3, w3
	jmp	L(lo2)

L(a1x1):mov	(rp,un,8), X1
	xor	w2, w2
	xor	w1, w1
	test	$2, R8(un)
	jz	L(a111)

L(a101):lea	3(un), n		C un = 1, 5, 9, ...
	mov	%rdx, w3
	add	%rax, X1
	mov	(up,un,8), %rax
	mov	8(rp,un,8), X0
	adc	$0, w3
	jmp	L(top)

L(a111):lea	1(un), n		C un = 3, 7, 11, ...
	jmp	L(lo3)
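
C The addmul_2 inner loop below is 4-way unrolled, entered at L(top),
C L(lo0), L(lo3) or L(lo2) according to un mod 4; X0 and X1 hold the rp
C limbs being updated, w0..w3 the partial-product carries.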

	ALIGN(32)
L(top):	mul	v1
	mov	%rdx, w0
	add	%rax, X0
	adc	$0, w0
	add	w1, X1
	adc	$0, w3
	add	w2, X0
	adc	$0, w0
	mov	-16(up,n,8), %rax
	mul	v0
	add	%rax, X0
	mov	%rdx, w1
	adc	$0, w1
	mov	-16(up,n,8), %rax
	mul	v1
	mov	X1, -24(rp,n,8)
	mov	-8(rp,n,8), X1
	add	w3, X0
	adc	$0, w1
L(lo0):	mov	%rdx, w2
	mov	X0, -16(rp,n,8)
	add	%rax, X1
	adc	$0, w2
	mov	-8(up,n,8), %rax
	add	w0, X1
	adc	$0, w2
	mul	v0
L(lo3):	add	%rax, X1
	mov	%rdx, w3
	adc	$0, w3
	mov	-8(up,n,8), %rax
	mul	v1
	add	w1, X1
	mov	(rp,n,8), X0
	adc	$0, w3
	mov	%rdx, w0
	add	%rax, X0
	adc	$0, w0
	mov	(up,n,8), %rax
	mul	v0
	add	w2, X0
	mov	X1, -8(rp,n,8)
	mov	%rdx, w1
	adc	$0, w0
L(lo2):	add	%rax, X0
	adc	$0, w1
	mov	(up,n,8), %rax
	add	w3, X0
	adc	$0, w1
	mul	v1
	mov	8(rp,n,8), X1
	add	%rax, X1
	mov	%rdx, w2
	adc	$0, w2
	mov	8(up,n,8), %rax
	mov	X0, (rp,n,8)
	mul	v0
	add	w0, X1
	mov	%rdx, w3
	adc	$0, w2
	add	%rax, X1
	mov	8(up,n,8), %rax
	mov	16(rp,n,8), X0		C useless but harmless in final iter
	adc	$0, w3
	add	$4, n
	jnc	L(top)

L(end):	mul	v1
	add	w1, X1
	adc	$0, w3
	add	w2, %rax
	adc	$0, %rdx
	mov	X1, I(-8(rp),-24(rp,n,8))
	add	w3, %rax
	adc	$0, %rdx
	mov	%rax, I((rp),-16(rp,n,8))
	mov	%rdx, I(8(rp),-8(rp,n,8))

	addl	$-2, vn
	lea	16(vp), vp
	lea	16(rp), rp
	jnz	L(outer)

	pop	%rax			C deallocate vn slot
	pop	%r15
L(ret5):pop	%r14
	pop	%r13
	pop	%r12
L(ret2):pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()
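
C Illustrative caller-side sketch (mpn_mul_basecase is a GMP-internal
C function, declared in gmp-impl.h; sizes are in limbs, un >= vn >= 1,
C and rp must not overlap the inputs):
C
C	mp_limb_t up[5], vp[3], rp[5 + 3];
C	/* ... fill up[] and vp[] ... */
C	mpn_mul_basecase (rp, up, 5, vp, 3);	/* 8-limb product in rp */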