dnl AMD64 mpn_redc_1 optimised for Intel Haswell.

dnl Contributed to the GNU project by Torbjörn Granlund.

dnl Copyright 2013 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl   * the GNU Lesser General Public License as published by the Free
dnl     Software Foundation; either version 3 of the License, or (at your
dnl     option) any later version.
dnl
dnl or
dnl
dnl   * the GNU General Public License as published by the Free Software
dnl     Foundation; either version 2 of the License, or (at your option) any
dnl     later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl see https://www.gnu.org/licenses/.
include(`../config.m4')

C            cycles/limb
C AMD K8,K9      n/a
C AMD K10        n/a
C AMD bull       n/a
C AMD pile       n/a
C AMD steam       ?
C AMD bobcat     n/a
C AMD jaguar      ?
C Intel P4       n/a
C Intel core     n/a
C Intel NHM      n/a
C Intel SBR      n/a
C Intel IBR      n/a
C Intel HWL      2.32
C Intel BWL       ?
C Intel atom     n/a
C VIA nano       n/a
C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Micro-optimise.
C  * Consider inlining mpn_add_n.  Tests indicate that this saves just 1-2
C    cycles, though.
define(`rp',          `%rdi')   C rcx
define(`up',          `%rsi')   C rdx
define(`mp_param',    `%rdx')   C r8
define(`n',           `%rcx')   C r9
define(`u0inv_param', `%r8')    C stack

define(`i',           `%r14')
define(`j',           `%r15')
define(`mp',          `%rdi')
define(`u0inv',       `(%rsp)') C stack
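
C As a rough C-level reference for what this routine computes (illustrative
C only, not part of the build; parameter names follow the defines above, and
C u0inv is assumed to be -1/mp[0] mod 2^64 as REDC requires):
C
C   mp_limb_t
C   mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t u0inv)
C   {
C     for (mp_size_t i = 0; i < n; i++)
C       {
C         mp_limb_t q0 = up[0] * u0inv;                 C low limb of quotient
C         mp_limb_t cy = mpn_addmul_1 (up, mp, n, q0);  C up[0] becomes zero
C         up[0] = cy;                                   C save the carry limb
C         up++;
C       }
C     return mpn_add_n (rp, up, up - n, n);             C add saved carry limbs
C   }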

ABI_SUPPORT(DOS64)      C FIXME: needs verification
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_redc_1)
        FUNC_ENTRY(4)
IFDOS(` mov     56(%rsp), %r8   ')
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        push    rp
        mov     mp_param, mp            C note that rp and mp share a register
        mov     (up), %rdx

        neg     n
        push    %r8                     C put u0inv on stack
        imul    u0inv_param, %rdx       C first iteration q0
        mov     n, j                    C outer loop induction var
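
C Dispatch on n mod 4 to one of four 4-way unrolled inner-loop variants
C (L(o0) .. L(o3)); n = 1, 2 and 3 get dedicated code below since the
C unrolled loops cannot handle such small operands.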
        test    $1, R8(n)
        jnz     L(bx1)

L(bx0): test    $2, R8(n)
        jz      L(o0b)

        cmp     $-2, R32(n)
        jnz     L(o2)

C Special code for n = 2 since general code cannot handle it
        mov     8(%rsp), %rbx           C rp
        lea     16(%rsp), %rsp          C deallocate two slots
        mulx(   (mp), %r9, %r12)
        mulx(   8,(mp), %r11, %r10)
        add     %r12, %r11
        adc     $0, %r10
        add     (up), %r9               C = 0
        adc     8(up), %r11             C r11 = up[1]
        adc     $0, %r10                C -> up[0]
        mov     %r11, %rdx
        imul    u0inv_param, %rdx
        mulx(   (mp), %r13, %r12)
        mulx(   8,(mp), %r14, %r15)
        xor     R32(%rax), R32(%rax)
        add     %r12, %r14
        adc     $0, %r15
        add     %r11, %r13              C = 0
        adc     16(up), %r14            C rp[2]
        adc     $0, %r15                C -> up[1]
        add     %r14, %r10
        adc     24(up), %r15
        mov     %r10, (%rbx)
        mov     %r15, 8(%rbx)
        setc    R8(%rax)
        jmp     L(ret)

L(o2):  lea     2(n), i                 C inner loop induction var
        mulx(   (mp), %r9, %r8)
        mulx(   8,(mp), %r11, %r10)
        sar     $2, i
        add     %r8, %r11
        jmp     L(lo2)
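
C 4-way unrolled inner loop: mulx (with q0 in %rdx) multiplies four mp limbs
C per pass, and the products are accumulated into four up limbs with a carry
C chain.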
        ALIGN(16)
L(tp2): adc     %rax, %r9
        lea     32(up), up
        adc     %r8, %r11
L(lo2): mulx(   16,(mp), %r13, %r12)
        mov     (up), %r8
        mulx(   24,(mp), %rbx, %rax)
        lea     32(mp), mp
        adc     %r10, %r13
        adc     %r12, %rbx
        adc     $0, %rax
        mov     8(up), %r10
        mov     16(up), %r12
        add     %r9, %r8
        mov     24(up), %rbp
        mov     %r8, (up)
        adc     %r11, %r10
        mulx(   (mp), %r9, %r8)
        mov     %r10, 8(up)
        adc     %r13, %r12
        mov     %r12, 16(up)
        adc     %rbx, %rbp
        mulx(   8,(mp), %r11, %r10)
        mov     %rbp, 24(up)
        inc     i
        jnz     L(tp2)

L(ed2): mov     56(up,n,8), %rdx        C next iteration up[0]
        lea     16(mp,n,8), mp          C mp = (last starting mp)
        adc     %rax, %r9
        adc     %r8, %r11
        mov     32(up), %r8
        adc     $0, %r10
        imul    u0inv, %rdx             C next iteration q0
        mov     40(up), %rax
        add     %r9, %r8
        mov     %r8, 32(up)
        adc     %r11, %rax
        mov     %rax, 40(up)
        lea     56(up,n,8), up          C up = (last starting up) + 1
        adc     $0, %r10
        mov     %r10, -8(up)
        inc     j
        jnz     L(o2)

        jmp     L(cj)

L(bx1): test    $2, R8(n)
        jz      L(o3a)

L(o1a): cmp     $-1, R32(n)
        jnz     L(o1b)

C Special code for n = 1 since general code cannot handle it
        mov     8(%rsp), %rbx           C rp
        lea     16(%rsp), %rsp          C deallocate two slots
        mulx(   (mp), %r11, %r10)
        add     (up), %r11
        adc     8(up), %r10
        mov     %r10, (%rbx)
        mov     $0, R32(%rax)
        setc    R8(%rax)
        jmp     L(ret)

L(o1b): lea     24(mp), mp
L(o1):  lea     1(n), i                 C inner loop induction var
        mulx(   -24,(mp), %r11, %r10)
        mulx(   -16,(mp), %r13, %r12)
        mulx(   -8,(mp), %rbx, %rax)
        sar     $2, i
        add     %r10, %r13
        adc     %r12, %rbx
        adc     $0, %rax
        mov     (up), %r10
        mov     8(up), %r12
        mov     16(up), %rbp
        add     %r11, %r10
        jmp     L(lo1)

        ALIGN(16)
L(tp1): adc     %rax, %r9
        lea     32(up), up
        adc     %r8, %r11
        mulx(   16,(mp), %r13, %r12)
        mov     -8(up), %r8
        mulx(   24,(mp), %rbx, %rax)
        lea     32(mp), mp
        adc     %r10, %r13
        adc     %r12, %rbx
        adc     $0, %rax
        mov     (up), %r10
        mov     8(up), %r12
        add     %r9, %r8
        mov     16(up), %rbp
        mov     %r8, -8(up)
        adc     %r11, %r10
L(lo1): mulx(   (mp), %r9, %r8)
        mov     %r10, (up)
        adc     %r13, %r12
        mov     %r12, 8(up)
        adc     %rbx, %rbp
        mulx(   8,(mp), %r11, %r10)
        mov     %rbp, 16(up)
        inc     i
        jnz     L(tp1)

L(ed1): mov     48(up,n,8), %rdx        C next iteration up[0]
        lea     40(mp,n,8), mp          C mp = (last starting mp)
        adc     %rax, %r9
        adc     %r8, %r11
        mov     24(up), %r8
        adc     $0, %r10
        imul    u0inv, %rdx             C next iteration q0
        mov     32(up), %rax
        add     %r9, %r8
        mov     %r8, 24(up)
        adc     %r11, %rax
        mov     %rax, 32(up)
        lea     48(up,n,8), up          C up = (last starting up) + 1
        adc     $0, %r10
        mov     %r10, -8(up)
        inc     j
        jnz     L(o1)

        jmp     L(cj)

L(o3a): cmp     $-3, R32(n)
        jnz     L(o3b)

C Special code for n = 3 since general code cannot handle it
L(n3):  mulx(   (mp), %rbx, %rax)
        mulx(   8,(mp), %r9, %r14)
        add     (up), %rbx
        mulx(   16,(mp), %r11, %r10)
        adc     %rax, %r9               C W 1
        adc     %r14, %r11              C W 2
        mov     8(up), %r14
        mov     u0inv_param, %rdx
        adc     $0, %r10                C W 3
        mov     16(up), %rax
        add     %r9, %r14               C W 1
        mov     %r14, 8(up)
        mulx(   %r14, %rdx, %r13)       C next iteration q0
        adc     %r11, %rax              C W 2
        mov     %rax, 16(up)
        adc     $0, %r10                C W 3
        mov     %r10, (up)
        lea     8(up), up               C up = (last starting up) + 1
        inc     j
        jnz     L(n3)

        jmp     L(cj)

L(o3b): lea     8(mp), mp
L(o3):  lea     4(n), i                 C inner loop induction var
        mulx(   -8,(mp), %rbx, %rax)
        mulx(   (mp), %r9, %r8)
        mov     (up), %rbp
        mulx(   8,(mp), %r11, %r10)
        sar     $2, i
        add     %rbx, %rbp

        adc     %rax, %r9
        jmp     L(lo3)

        ALIGN(16)
L(tp3): adc     %rax, %r9
        lea     32(up), up
L(lo3): adc     %r8, %r11
        mulx(   16,(mp), %r13, %r12)
        mov     8(up), %r8
        mulx(   24,(mp), %rbx, %rax)
        lea     32(mp), mp
        adc     %r10, %r13
        adc     %r12, %rbx
        adc     $0, %rax
        mov     16(up), %r10
        mov     24(up), %r12
        add     %r9, %r8
        mov     32(up), %rbp
        mov     %r8, 8(up)
        adc     %r11, %r10
        mulx(   (mp), %r9, %r8)
        mov     %r10, 16(up)
        adc     %r13, %r12
        mov     %r12, 24(up)
        adc     %rbx, %rbp
        mulx(   8,(mp), %r11, %r10)
        mov     %rbp, 32(up)
        inc     i
        jnz     L(tp3)

L(ed3): mov     64(up,n,8), %rdx        C next iteration up[0]
        lea     24(mp,n,8), mp          C mp = (last starting mp)
        adc     %rax, %r9
        adc     %r8, %r11
        mov     40(up), %r8
        adc     $0, %r10
        imul    u0inv, %rdx             C next iteration q0
        mov     48(up), %rax
        add     %r9, %r8
        mov     %r8, 40(up)
        adc     %r11, %rax
        mov     %rax, 48(up)
        lea     64(up,n,8), up          C up = (last starting up) + 1
        adc     $0, %r10
        mov     %r10, -8(up)
        inc     j
        jnz     L(o3)

        jmp     L(cj)

L(o0b): lea     16(mp), mp
L(o0):  mov     n, i                    C inner loop induction var
        mulx(   -16,(mp), %r13, %r12)
        mulx(   -8,(mp), %rbx, %rax)
        sar     $2, i
        add     %r12, %rbx
        adc     $0, %rax
        mov     (up), %r12
        mov     8(up), %rbp
        mulx(   (mp), %r9, %r8)
        add     %r13, %r12
        jmp     L(lo0)

        ALIGN(16)
L(tp0): adc     %rax, %r9
        lea     32(up), up
        adc     %r8, %r11
        mulx(   16,(mp), %r13, %r12)
        mov     -16(up), %r8
        mulx(   24,(mp), %rbx, %rax)
        lea     32(mp), mp
        adc     %r10, %r13
        adc     %r12, %rbx
        adc     $0, %rax
        mov     -8(up), %r10
        mov     (up), %r12
        add     %r9, %r8
        mov     8(up), %rbp
        mov     %r8, -16(up)
        adc     %r11, %r10
        mulx(   (mp), %r9, %r8)
        mov     %r10, -8(up)
        adc     %r13, %r12
        mov     %r12, (up)
L(lo0): adc     %rbx, %rbp
        mulx(   8,(mp), %r11, %r10)
        mov     %rbp, 8(up)
        inc     i
        jnz     L(tp0)

L(ed0): mov     40(up,n,8), %rdx        C next iteration up[0]
        lea     32(mp,n,8), mp          C mp = (last starting mp)
        adc     %rax, %r9
        adc     %r8, %r11
        mov     16(up), %r8
        adc     $0, %r10
        imul    u0inv, %rdx             C next iteration q0
        mov     24(up), %rax
        add     %r9, %r8
        mov     %r8, 16(up)
        adc     %r11, %rax
        mov     %rax, 24(up)
        lea     40(up,n,8), up          C up = (last starting up) + 1
        adc     $0, %r10
        mov     %r10, -8(up)
        inc     j
        jnz     L(o0)
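
C All n outer iterations are done: the low half of up[] holds the saved
C per-iteration carry limbs and the high half holds the remaining sum, so
C mpn_add_n adds the two halves into rp[]; its carry out is the return value.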
L(cj):
IFSTD(` mov     8(%rsp), %rdi           C param 1: rp
        lea     16-8(%rsp), %rsp        C deallocate 2, add back for alignment
        lea     (up,n,8), %rdx          C param 3: up - n
        neg     R32(n)          ')      C param 4: n

IFDOS(` mov     up, %rdx                C param 2: up
        lea     (up,n,8), %r8           C param 3: up - n
        neg     R32(n)
        mov     n, %r9                  C param 4: n
        mov     8(%rsp), %rcx           C param 1: rp
        lea     16-32-8(%rsp), %rsp')   C deallocate 2, allocate shadow, align

        ASSERT(nz, `test $15, %rsp')
        CALL(   mpn_add_n)

IFSTD(` lea     8(%rsp), %rsp ')
IFDOS(` lea     32+8(%rsp), %rsp')

L(ret): pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        FUNC_EXIT()
        ret
EPILOGUE()