source/libs/gmp/gmp-src/mpn/x86_64/coreisbr/sqr_basecase.asm

   1 dnl  AMD64 mpn_sqr_basecase optimised for Intel Sandy bridge and Ivy bridge.
   2
   3 dnl  Contributed to the GNU project by Torbjörn Granlund.
   4
   5 dnl  Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8 dnl
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of either:
  11 dnl
  12 dnl    * the GNU Lesser General Public License as published by the Free
  13 dnl      Software Foundation; either version 3 of the License, or (at your
  14 dnl      option) any later version.
  15 dnl
  16 dnl  or
  17 dnl
  18 dnl    * the GNU General Public License as published by the Free Software
  19 dnl      Foundation; either version 2 of the License, or (at your option) any
  20 dnl      later version.
  21 dnl
  22 dnl  or both in parallel, as here.
  23 dnl
  24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  27 dnl  for more details.
  28 dnl
  29 dnl  You should have received copies of the GNU General Public License and the
  30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  31 dnl  see https://www.gnu.org/licenses/.
  32
  33 include(`../config.m4')
  34
  35 C cycles/limb   mul_2           addmul_2        sqr_diag_addlsh1
  36 C AMD K8,K9      ?               ?                       ?
  37 C AMD K10        ?               ?                       ?
  38 C AMD bull       ?               ?                       ?
  39 C AMD pile       ?               ?                       ?
  40 C AMD steam      ?               ?                       ?
  41 C AMD bobcat     ?               ?                       ?
  42 C AMD jaguar     ?               ?                       ?
  43 C Intel P4       ?               ?                       ?
  44 C Intel core     ?               ?                       ?
  45 C Intel NHM      ?               ?                       ?
  46 C Intel SBR      2.57            2.93                    3.0
  47 C Intel IBR      2.35            2.66                    3.0
  48 C Intel HWL      2.02            2.5                     2.5
  49 C Intel BWL      ?               ?                       ?
  50 C Intel atom     ?               ?                       ?
  51 C VIA nano       ?               ?                       ?
  52
  53 C The inner loops of this code are the result of running a code generation and
  54 C optimisation tool suite written by David Harvey and Torbjörn Granlund, except
  55 C that the sqr_diag_addlsh1 loop was manually written.
  56
  57 C TODO
  58 C  * Replace current unoptimised sqr_diag_addlsh1 loop, 2.5 c/l should be easy.
  59 C  * Streamline pointer updates.
  60 C  * Perhaps suppress a few more xor insns in feed-in code.
  61 C  * Make sure we write no dead registers in feed-in code.
  62 C  * We might use 32-bit size ops, since n >= 2^32 is non-terminating.  Watch
  63 C    out for negative sizes being zero-extended, though.
  64 C  * The straight-line code for n <= 3 comes from the K8 code, and might be
  65 C    quite sub-optimal here.  Write specific code, and add code for n = 4.
  66 C  * The mul_2 loop has a 10 insn common sequence in the loop start and the
  67 C    wind-down code.  Try re-rolling it.
  68 C  * This file has been subject to only basic micro-optimisation.
  69
  70 C When playing with pointers, set this to $2 to fall back to conservative
  71 C indexing in wind-down code.
  72 define(`I',`$1')                C expands to its first argument; the second argument is the fully indexed form
  73
  74 define(`rp',      `%rdi')       C destination pointer: all results are stored through it
  75 define(`up',      `%rsi')       C source pointer: all operand limbs are loaded through it
  76 define(`un_param',`%rdx')       C limb count as received; rdx is later needed by mul, so the count is moved away
  77
  78
  79 ABI_SUPPORT(DOS64)              C macro defined outside this file; presumably declares a supported ABI (TODO confirm)
  80 ABI_SUPPORT(STD64)
  81
  82 ASM_START()
  83         TEXT
  84         ALIGN(32)               C alignment request placed right before the function entry
  85 PROLOGUE(mpn_sqr_basecase)              C square the un_param-limb operand at up; result limbs are stored at rp
  86         FUNC_ENTRY(3)                   C macro defined outside this file; presumably ABI entry glue for 3 arguments (TODO confirm)
  87 C Dispatch on un_param: below 2 and exactly 2 are handled right here, 3 after L(gt2), larger sizes at L(gt3).
  88         cmp     $2, un_param
  89         jae     L(gt1)                  C unsigned compare: leave unless un_param is below 2
  90 C Single limb: the two-limb square of (up) is stored at (rp) and 8(rp).
  91         mov     (up), %rax
  92         mul     %rax                    C rdx:rax = limb * limb
  93         mov     %rax, (rp)
  94         mov     %rdx, 8(rp)
  95         FUNC_EXIT()                     C macro defined outside this file; presumably the counterpart of FUNC_ENTRY (TODO confirm)
  96         ret
  97
  98 L(gt1): jne     L(gt2)                  C flags are still those of cmp $2 above: fall through only when un_param == 2
  99 C Two limbs u0, u1: start from u0^2 and u1^2, then add the cross product u0*u1 twice at limb 1; four limbs are stored.
 100         mov     (up), %rax
 101         mov     %rax, %r8               C r8 = u0, kept for the cross product
 102         mul     %rax                    C u0^2
 103         mov     8(up), %r11             C r11 = u1
 104         mov     %rax, (rp)              C low limb of u0^2 is already final
 105         mov     %r11, %rax
 106         mov     %rdx, %r9               C r9 = high limb of u0^2
 107         mul     %rax                    C u1^2
 108         mov     %rax, %r10              C r10 = low limb of u1^2
 109         mov     %r11, %rax
 110         mov     %rdx, %r11              C r11 = high limb of u1^2
 111         mul     %r8                     C rdx:rax = u1 * u0
 112         xor     %r8, %r8                C r8 = 0, used below only to propagate carries
 113         add     %rax, %r9               C first addition of the cross product into r9:r10:r11
 114         adc     %rdx, %r10
 115         adc     %r8, %r11
 116         add     %rax, %r9               C second addition of the same cross product
 117         mov     %r9, 8(rp)
 118         adc     %rdx, %r10
 119         mov     %r10, 16(rp)
 120         adc     %r8, %r11
 121         mov     %r11, 24(rp)
 122         FUNC_EXIT()
 123         ret
 124
 125 L(gt2): cmp     $4, un_param
 126         jae     L(gt3)                  C un_param >= 4 goes to the general code; fall through for un_param == 3
 127 define(`v0', `%r8')             C these four names are not referenced by the three-limb code below, which uses raw register names
 128 define(`v1', `%r9')
 129 define(`w0', `%r10')
 130 define(`w2', `%r11')
 131 C Three limbs u0, u1, u2: store the three squares at rp[0..5] first, then add twice the cross products at rp[1..5].
 132         mov     (up), %rax
 133         mov     %rax, %r10              C r10 = u0
 134         mul     %rax
 135         mov     8(up), %r11             C r11 = u1
 136         mov     %rax, (rp)
 137         mov     %r11, %rax
 138         mov     %rdx, 8(rp)             C rp[0..1] = u0^2
 139         mul     %rax
 140         mov     16(up), %rcx            C rcx = u2
 141         mov     %rax, 16(rp)
 142         mov     %rcx, %rax
 143         mov     %rdx, 24(rp)            C rp[2..3] = u1^2
 144         mul     %rax
 145         mov     %rax, 32(rp)
 146         mov     %rdx, 40(rp)            C rp[4..5] = u2^2
 147 C The cross products u1*u0, u2*u0 and u1*u2 are summed into r8, r9, r10, rdx (least significant limb first).
 148         mov     %r11, %rax
 149         mul     %r10                    C u1 * u0
 150         mov     %rax, %r8
 151         mov     %rcx, %rax
 152         mov     %rdx, %r9
 153         mul     %r10                    C u2 * u0
 154         xor     %r10, %r10              C r10 = 0; u0 is not needed any more
 155         add     %rax, %r9
 156         mov     %r11, %rax              C rax = u1
 157         mov     %r10, %r11              C r11 = 0 (r10 is still zero here); mov leaves the carry flag alone
 158         adc     %rdx, %r10              C r10 = high limb of u2*u0 plus carry
 159
 160         mul     %rcx                    C u1 * u2
 161         add     %rax, %r10
 162         adc     %r11, %rdx              C r11 is zero, so only the carry is added
 163         add     %r8, %r8                C double the four-limb sum; the bit shifted out of the top lands in r11
 164         adc     %r9, %r9
 165         adc     %r10, %r10
 166         adc     %rdx, %rdx
 167         adc     %r11, %r11
 168         add     %r8, 8(rp)              C add the doubled sum to the squares stored above, in memory, at rp[1..5]
 169         adc     %r9, 16(rp)
 170         adc     %r10, 24(rp)
 171         adc     %rdx, 32(rp)
 172         adc     %r11, 40(rp)
 173         FUNC_EXIT()
 174         ret
 175
 176 L(gt3):
 177 C General case, reached when un_param >= 4.  Register names are (re)defined here for the code below.
 178 define(`v0', `%r8')             C v0, v1: the pair of multiplier limbs of the current pass
 179 define(`v1', `%r9')
 180 define(`w0', `%r10')            C w0..w3: product and carry limbs in flight inside the loops
 181 define(`w1', `%r11')
 182 define(`w2', `%rbx')
 183 define(`w3', `%rbp')
 184 define(`un', `%r12')            C outer counter: negative, stepped by +2 after each pass
 185 define(`n',  `%rcx')            C inner loop index: negative, counts up until it wraps
 186
 187 define(`X0', `%r13')            C X0, X1: limbs loaded from rp, accumulated into, and stored back by the addmul_2 code
 188 define(`X1', `%r14')
 189
 190 L(do_mul_2):                    C first pass: products by the limb pair v0, v1 are stored to rp (nothing is read from rp here)
 191         mov     (up), v0
 192         push    %rbx
 193         lea     (rp,un_param,8), rp     C point rp at R[un]
 194         mov     8(up), %rax
 195         push    %rbp
 196         lea     (up,un_param,8), up     C point up right after U's end
 197         mov     %rax, v1
 198         push    %r12
 199         mov     $1, R32(un)             C free up rdx
 200         push    %r13
 201         sub     un_param, un            C un becomes 1 - un_param, a negative index relative to the advanced pointers
 202         push    %r14
 203         push    un                      C saved copy of un; it is popped into n at L(corner)
 204         mul     v0                      C rdx:rax = v1 * v0 (rax was loaded from 8(up) before up was advanced)
 205         mov     %rax, (rp,un,8)         C low limb of that product is stored right away
 206         mov     8(up,un,8), %rax        C next source limb
 207         test    $1, R8(un)              C the parity of un selects the loop entry point
 208         jnz     L(m2b1)
 209
 210 L(m2b0):lea     2(un), n                C un even: n = un + 2, enter the loop at L(m2l0)
 211         xor     R32(w1), R32(w1)        C FIXME
 212         xor     R32(w2), R32(w2)        C FIXME
 213         mov     %rdx, w0                C high limb of v1 * v0 is the first pending limb
 214         jmp     L(m2l0)
 215
 216 L(m2b1):lea     1(un), n                C un odd: n = un + 1, enter the loop at L(m2l1)
 217         xor     R32(w3), R32(w3)        C FIXME
 218         xor     R32(w0), R32(w0)        C FIXME
 219         mov     %rdx, w2                C high limb of v1 * v0 is the first pending limb
 220         jmp     L(m2l1)
 221
 222         ALIGN(32)
 223 L(m2tp):                        C mul_2 loop: four multiplications (two by v0, two by v1) and two limb stores per trip
 224 L(m2l0):mul     v0
 225         add     %rax, w0
 226         mov     %rdx, w3
 227         adc     $0, w3
 228         mov     -8(up,n,8), %rax
 229         mul     v1
 230         add     w1, w0
 231         adc     $0, w3
 232         add     %rax, w2
 233         mov     w0, -8(rp,n,8)
 234         mov     %rdx, w0
 235         adc     $0, w0
 236         mov     (up,n,8), %rax
 237 L(m2l1):mul     v0
 238         add     %rax, w2
 239         mov     %rdx, w1
 240         adc     $0, w1
 241         add     w3, w2
 242         mov     (up,n,8), %rax
 243         adc     $0, w1
 244         mul     v1
 245         mov     w2, (rp,n,8)
 246         add     %rax, w0
 247         mov     %rdx, w2
 248         mov     8(up,n,8), %rax
 249         adc     $0, w2
 250         add     $2, n
 251         jnc     L(m2tp)                 C keep looping until the add carries, i.e. n wraps from negative to non-negative
 252
 253 L(m2ed):mul     v0                      C wind-down: the last two products, then three final limb stores
 254         add     %rax, w0
 255         mov     %rdx, w3
 256         adc     $0, w3
 257         mov     I(-8(up),-8(up,n,8)), %rax      C I() picks its first argument, the form without the n index
 258         mul     v1
 259         add     w1, w0
 260         adc     $0, w3
 261         add     %rax, w2
 262         mov     w0, I(-8(rp),-8(rp,n,8))
 263         adc     $0, %rdx
 264         add     w3, w2
 265         mov     w2, I((rp),(rp,n,8))
 266         adc     $0, %rdx
 267         mov     %rdx, I(8(rp),8(rp,n,8))
 268
 269         add     $2, un                  C decrease |un|
 270
 271 L(do_addmul_2):                 C remaining passes: products by the next limb pair are added into what rp already holds
 272 L(outer):
 273         lea     16(rp), rp              C rp moves up two limbs per pass; lea leaves the flags untouched
 274         cmp     $-2, R32(un)            C jump if un C {-1,0}  FIXME jump if un C {-2,1}
 275         jge     L(corner)               C FIXME: move to before the lea above
 276
 277         mov     -8(up,un,8), v0         C load the limb pair for this pass
 278         mov     (up,un,8), %rax
 279         mov     %rax, v1
 280         mul     v0                      C rdx:rax = v1 * v0, the product of the pair itself
 281         test    $1, R8(un)              C bit 0 of un picks the feed-in variant, bit 1 (tested below) the loop entry
 282         jnz     L(a1x1)
 283
 284 L(a1x0):mov     (rp,un,8), X0           C un even: low limb of v1 * v0 is added to the rp limb and stored back at once
 285         xor     w0, w0
 286         mov     8(rp,un,8), X1
 287         add     %rax, X0
 288         mov     %rdx, w1
 289         adc     $0, w1
 290         xor     w2, w2
 291         mov     X0, (rp,un,8)
 292         mov     8(up,un,8), %rax
 293         test    $2, R8(un)
 294         jnz     L(a110)
 295
 296 L(a100):lea     2(un), n                C un = 4, 8, 12, ...
 297         jmp     L(lo0)
 298
 299 L(a110):lea     (un), n                 C un = 2, 6, 10, ...
 300         jmp     L(lo2)
 301
 302 L(a1x1):mov     (rp,un,8), X1           C un odd: low limb of v1 * v0 is added into X1; the store happens inside the loop body
 303         xor     w2, w2
 304         mov     8(rp,un,8), X0
 305         add     %rax, X1
 306         mov     %rdx, w3
 307         adc     $0, w3
 308         xor     w0, w0
 309         mov     8(up,un,8), %rax
 310         test    $2, R8(un)
 311         jz      L(a111)
 312
 313 L(a101):lea     3(un), n                C un = 1, 5, 9, ...
 314         jmp     L(lo1)
 315
 316 L(a111):lea     1(un), n                C un = 3, 7, 11, ...
 317         jmp     L(lo3)
 318
 319         ALIGN(32)
 320 L(top): mul     v1                      C addmul_2 loop, 4-way unrolled: eight multiplications and four rp limbs updated per trip
 321         mov     %rdx, w0
 322         add     %rax, X0
 323         adc     $0, w0
 324         add     w1, X1
 325         adc     $0, w3
 326         add     w2, X0
 327         adc     $0, w0
 328         mov     -16(up,n,8), %rax
 329 L(lo1): mul     v0
 330         add     %rax, X0
 331         mov     %rdx, w1
 332         adc     $0, w1
 333         mov     -16(up,n,8), %rax
 334         mul     v1
 335         mov     X1, -24(rp,n,8)
 336         mov     -8(rp,n,8), X1
 337         add     w3, X0
 338         adc     $0, w1
 339         mov     %rdx, w2
 340         mov     X0, -16(rp,n,8)
 341         add     %rax, X1
 342         adc     $0, w2
 343         mov     -8(up,n,8), %rax
 344         add     w0, X1
 345         adc     $0, w2
 346 L(lo0): mul     v0
 347         add     %rax, X1
 348         mov     %rdx, w3
 349         adc     $0, w3
 350         mov     -8(up,n,8), %rax
 351         mul     v1
 352         add     w1, X1
 353         mov     (rp,n,8), X0
 354         adc     $0, w3
 355         mov     %rdx, w0
 356         add     %rax, X0
 357         adc     $0, w0
 358         mov     (up,n,8), %rax
 359 L(lo3): mul     v0
 360         add     w2, X0
 361         mov     X1, -8(rp,n,8)
 362         mov     %rdx, w1
 363         adc     $0, w0
 364         add     %rax, X0
 365         adc     $0, w1
 366         mov     (up,n,8), %rax
 367         add     w3, X0
 368         adc     $0, w1
 369         mul     v1
 370         mov     8(rp,n,8), X1
 371         add     %rax, X1
 372         mov     %rdx, w2
 373         adc     $0, w2
 374         mov     8(up,n,8), %rax
 375         mov     X0, (rp,n,8)
 376 L(lo2): mul     v0
 377         add     w0, X1
 378         mov     %rdx, w3
 379         adc     $0, w2
 380         add     %rax, X1
 381         mov     8(up,n,8), %rax
 382         mov     16(rp,n,8), X0
 383         adc     $0, w3
 384         add     $4, n
 385         jnc     L(top)                  C keep looping until the add carries, i.e. n wraps from negative to non-negative
 386
 387 L(end): mul     v1                      C wind-down of one pass: last product by v1, then three limb stores
 388         add     w1, X1
 389         adc     $0, w3
 390         add     w2, %rax
 391         adc     $0, %rdx
 392         mov     X1, I(-8(rp),-24(rp,n,8))
 393         add     w3, %rax
 394         adc     $0, %rdx
 395         mov     %rax, I((rp),-16(rp,n,8))
 396         mov     %rdx, I(8(rp),-8(rp,n,8))
 397
 398         add     $2, un                  C decrease |un|
 399         jmp     L(outer)                C loop until a small corner remains
 400
 401 L(corner):
 402         pop     n                       C n = the un value pushed in L(do_mul_2); pop does not alter the flags
 403         jg      L(small_corner)         C flags are still those of the cmp at L(outer): taken when un > -2
 404 C Here un == -2: the three source limbs at -24(up), -16(up), -8(up) contribute their three cross products.
 405         lea     8(rp), rp
 406         mov     -24(up), v0
 407         mov     -16(up), %rax
 408         mov     %rax, v1
 409         mul     v0
 410         mov     -24(rp), X0
 411         mov     -16(rp), X1
 412         add     %rax, X0
 413         mov     %rdx, w1
 414         adc     $0, w1
 415         xor     w2, w2
 416         mov     X0, -24(rp)
 417         mov     -8(up), %rax
 418         mul     v0
 419         add     $0, X1                  C NOTE(review): adds zero, so w2 stays 0 below; looks like a leftover of the loop schedule
 420         mov     %rdx, w3
 421         adc     $0, w2
 422         add     %rax, X1
 423         mov     -8(up), %rax
 424         adc     $0, w3
 425         mul     v1
 426         add     w1, X1
 427         adc     $0, w3
 428         add     w2, %rax
 429         adc     $0, %rdx
 430         mov     X1, -16(rp)
 431         jmp     L(com)                  C w3 holds the pending carry limb that L(com) adds in
 432
 433 L(small_corner):                C un > -2: one cross product, of the last two source limbs, remains
 434         mov     -8(rp), w3              C limb already accumulated at -8(rp)
 435         mov     -16(up), v0
 436         mov     -8(up), %rax
 437         mul     v0
 438 L(com): add     w3, %rax                C shared tail of both corner variants: add w3 and store the top two limbs
 439         adc     $0, %rdx
 440         mov     %rax, -8(rp)
 441         mov     %rdx, (rp)
 442
 443 L(sqr_diag_addlsh1):            C final step: the limbs held at rp are doubled and the squares of the source limbs are added in
 444         mov     -8(up,n,8), %rax        C n = 1 - un_param and up points past the operand, so this is the first source limb
 445         shl     n                       C n is doubled: with the step of 2 below, (up,n,4) advances one limb and (rp,n,8) two
 446         mul     %rax                    C square of the first source limb
 447         mov     %rax, (rp,n,8)          C its low limb is stored as is
 448
 449         xor     R32(%rbx), R32(%rbx)    C rbx = 0 and carry flag = 0 for the first pass through L(dm)
 450         mov     8(rp,n,8), %r8
 451         mov     16(rp,n,8), %r9
 452         jmp     L(dm)
 453
 454         ALIGN(32)
 455 L(dtop):add     %r8, %r10               C doubled pair r8, r9 is added to r10 (previous high limb plus saved carry) and rax (new low limb)
 456         adc     %r9, %rax
 457         mov     8(rp,n,8), %r8          C fetch the next pair to double; the mov instructions keep the carry flag intact
 458         mov     16(rp,n,8), %r9
 459         mov     %r10, -8(rp,n,8)
 460         mov     %rax, (rp,n,8)
 461 L(dm):  adc     %r8, %r8                C double the pair; carry-in is 0 on first entry, else the carry out of the add/adc at L(dtop)
 462         adc     %r9, %r9
 463         mov     (up,n,4), %rax          C next source limb to square
 464         lea     (%rdx,%rbx), %r10       C high limb of the previous square plus the carry saved in rbx; lea keeps the flags
 465         setc    R8(%rbx)                C save the carry out of the doubling for the next round
 466         mul     %rax
 467         add     $2, n
 468         js      L(dtop)                 C loop while n is still negative
 469
 470 L(dend):add     %r8, %r10               C wind-down: same additions as at L(dtop), then the three top limbs are stored
 471         adc     %r9, %rax
 472         mov     %r10, I(-8(rp),-8(rp,n,8))
 473         mov     %rax, I((rp),(rp,n,8))
 474         adc     %rbx, %rdx              C high limb of the last square plus saved doubling carry plus the addition carry
 475         mov     %rdx, I(8(rp),8(rp,n,8))
 476 C Restore the registers pushed in L(do_mul_2), in reverse order (the pushed un was already popped at L(corner)).
 477         pop     %r14
 478         pop     %r13
 479         pop     %r12
 480         pop     %rbp
 481         pop     %rbx
 482         FUNC_EXIT()
 483         ret
 484 EPILOGUE()