dnl  AMD64 mpn_sqr_basecase optimised for Intel Broadwell.

dnl  Copyright 2015 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb	mul_1		addmul_1
C Intel BWL	 1.69		 1.8-1.9
53 C The inner loops of this code are the result of running a code generation and
54 C optimisation tool suite written by David Harvey and Torbjorn Granlund.
57 C * We have 8 addmul_1 loops which fall into each other. The idea is to save
58 C on switching code, since a circularly updated computed goto target will
59 C hardly allow correct branch prediction. On 2nd thought, we now might make
60 C each of the 8 loop branches be poorly predicted since they will be
61 C executed fewer times for each time. With just one addmul_1 loop, the loop
62 C count will change only once each 8th time!
63 C * Replace sqr_diag_addlsh1 code (from haswell) with adx-aware code. We have
64 C 3 variants below, but the haswell code turns out to be fastest.
65 C * Do overlapped software pipelining.
66 C * When changing this, make sure the code which falls into the inner loops
67 C does not execute too many no-ops (for both PIC and non-PIC).
dnl  un_param: the limb-count argument n; arrives in %rdx (3rd integer arg
dnl  in the SysV AMD64 ABI).  Must be kept on one line: a newline inside the
dnl  quoted macro body would be expanded into the instruction stream.
define(`un_param', `%rdx')
dnl  un_save: callee-saved %rbx, used below to hold a masked/negated copy of
dnl  the limb count driving the addmul_1 loops (see the lea/and/neg sequence).
define(`un_save', `%rbx')
88 PROLOGUE(mpn_sqr_basecase)
95 mulx( %rdx, %rax, %rdx)
105 mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2
106 mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1
108 mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3
122 L(gt2): cmp $4, un_param
127 mulx( 8,(up), w2, w3)
128 mulx( 16,(up), w0, w1)
131 mulx( 16,(up), %rax, w3)
134 test R32(%rbx), R32(%rbx)
136 mulx( %rdx, %rbx, %rcx)
139 mulx( %rdx, %rax, %rbx)
141 mulx( %rdx, %rsi, %rdx)
168 lea -3(un_param), R32(un_save)
170 mov R32(un_param), R32(%rax)
171 and $-8, R32(un_save)
172 shr $3, R32(n) C count for mul_1 loop
173 neg un_save C 8*count and offert for addmul_1 loops
174 and $7, R32(%rax) C clear CF for adc as side-effect
178 lea L(mtab)(%rip), %r10
180 ` movslq
(%r10
,%rax
,4), %r8
181 lea (%r8
, %r10
), %r10
187 L
(mf0
): mulx
( 8,(up
), w2
, w3
)
192 L
(mf3
): mulx
( 8,(up
), w0
, w1
)
197 L
(mf4
): mulx
( 8,(up
), w2
, w3
)
202 L
(mf5
): mulx
( 8,(up
), w0
, w1
)
207 L
(mf6
): mulx
( 8,(up
), w2
, w3
)
212 L
(mf7
): mulx
( 8,(up
), w0
, w1
)
217 L
(mf1
): mulx
( 8,(up
), w0
, w1
)
222 L
(mf2
): mulx
( 8,(up
), w2
, w3
)
229 L
(top
): mov w2
, -8(rp
)
231 L
(mb1
): mulx
( 8,(up
), w2
, w3
)
235 L
(mb0
): mov w2
, 8(rp
)
236 mulx
( -48,(up
), w0
, w1
)
239 L
(mb7
): mulx
( -40,(up
), w2
, w3
)
242 L
(mb6
): mov w2
, -40(rp
)
243 mulx
( -32,(up
), w0
, w1
)
245 L
(mb5
): mulx
( -24,(up
), w2
, w3
)
248 L
(mb4
): mulx
( -16,(up
), w0
, w1
)
251 L
(mb3
): mulx
( -8,(up
), w2
, w3
)
258 L
(end): mov w2
, -8(rp
)
264 lea L
(atab
)(%rip
), %r10
266 ` movslq (%r10,%rax,4), %r11
267 lea (%r11, %r10), %r11
273 L(ed0): adox( (rp), w0)
274 adox( %rcx, w1) C relies on rcx = 0
276 adc %rcx, w1 C relies on rcx = 0
278 L(f7): lea -64(up,un_save,8), up
279 or R32(un_save), R32(n)
281 mulx( 16,(up), w0, w1)
282 lea -56(rp,un_save,8), rp
286 L(tp0): adox( -8,(rp), w2)
290 mulx( 8,(up), w2, w3)
295 L(b0): mulx( 16,(up), w0, w1)
299 mulx( 24,(up), w2, w3)
304 mulx( -32,(up), w0, w1)
308 mulx( -24,(up), w2, w3)
312 mulx( -16,(up), w0, w1)
317 mulx( -8,(up), w2, w3)
324 L(ed1): adox( (rp), w0)
325 adox( %rcx, w1) C relies on rcx = 0
327 adc %rcx, w1 C relies on rcx = 0
329 L(f0): lea -64(up,un_save,8), up
330 or R32(un_save), R32(n)
332 mulx( 8,(up), w2, w3)
333 lea -56(rp,un_save,8), rp
337 L(tp1): adox( -8,(rp), w2)
341 L(b1): mulx( 8,(up), w2, w3)
346 mulx( 16,(up), w0, w1)
350 mulx( 24,(up), w2, w3)
355 mulx( -32,(up), w0, w1)
359 mulx( -24,(up), w2, w3)
363 mulx( -16,(up), w0, w1)
368 mulx( -8,(up), w2, w3)
375 L(ed2): adox( (rp), w0)
376 adox( %rcx, w1) C relies on rcx = 0
378 adc %rcx, w1 C relies on rcx = 0
380 L(f1): lea (up,un_save,8), up
381 or R32(un_save), R32(n)
382 lea 8(un_save), un_save
385 lea -56(rp,un_save,8), rp
389 L(tp2): adox( -8,(rp), w2)
393 mulx( 8,(up), w2, w3)
398 mulx( 16,(up), w0, w1)
402 mulx( 24,(up), w2, w3)
407 mulx( -32,(up), w0, w1)
411 mulx( -24,(up), w2, w3)
415 mulx( -16,(up), w0, w1)
420 mulx( -8,(up), w2, w3)
427 L(ed3): adox( (rp), w0)
428 adox( %rcx, w1) C relies on rcx = 0
430 adc %rcx, w1 C relies on rcx = 0
432 L(f2): lea (up,un_save,8), up
433 or R32(un_save), R32(n)
436 mulx( -8,(up), w2, w3)
437 lea 8(rp,un_save,8), rp
442 L(tp3): adox( -8,(rp), w2)
446 mulx( 8,(up), w2, w3)
451 mulx( 16,(up), w0, w1)
455 mulx( 24,(up), w2, w3)
460 mulx( -32,(up), w0, w1)
464 mulx( -24,(up), w2, w3)
468 mulx( -16,(up), w0, w1)
472 L(b3): adox( 48,(rp), w0)
473 mulx( -8,(up), w2, w3)
480 L(ed4): adox( (rp), w0)
481 adox( %rcx, w1) C relies on rcx = 0
483 adc %rcx, w1 C relies on rcx = 0
485 L(f3): lea (up,un_save,8), up
486 or R32(un_save), R32(n)
489 mulx( -16,(up), w0, w1)
490 lea -56(rp,un_save,8), rp
494 L(tp4): adox( -8,(rp), w2)
498 mulx( 8,(up), w2, w3)
503 mulx( 16,(up), w0, w1)
507 mulx( 24,(up), w2, w3)
512 mulx( -32,(up), w0, w1)
516 mulx( -24,(up), w2, w3)
520 L(b4): mulx( -16,(up), w0, w1)
525 mulx( -8,(up), w2, w3)
532 L(ed5): adox( (rp), w0)
533 adox( %rcx, w1) C relies on rcx = 0
535 adc %rcx, w1 C relies on rcx = 0
537 L(f4): lea (up,un_save,8), up
538 or R32(un_save), R32(n)
540 mulx( -24,(up), w2, w3)
541 lea -56(rp,un_save,8), rp
545 L(tp5): adox( -8,(rp), w2)
549 mulx( 8,(up), w2, w3)
554 mulx( 16,(up), w0, w1)
558 mulx( 24,(up), w2, w3)
563 mulx( -32,(up), w0, w1)
567 L(b5): mulx( -24,(up), w2, w3)
571 mulx( -16,(up), w0, w1)
576 mulx( -8,(up), w2, w3)
583 L(ed6): adox( (rp), w0)
584 adox( %rcx, w1) C relies on rcx = 0
586 adc %rcx, w1 C relies on rcx = 0
588 L(f5): lea (up,un_save,8), up
589 or R32(un_save), R32(n)
591 mulx( -32,(up), w0, w1)
592 lea -56(rp,un_save,8), rp
596 L(tp6): adox( -8,(rp), w2)
600 mulx( 8,(up), w2, w3)
605 mulx( 16,(up), w0, w1)
609 mulx( 24,(up), w2, w3)
614 L(b6): mulx( -32,(up), w0, w1)
618 mulx( -24,(up), w2, w3)
622 mulx( -16,(up), w0, w1)
627 mulx( -8,(up), w2, w3)
634 L(ed7): adox( (rp), w0)
635 adox( %rcx, w1) C relies on rcx = 0
637 adc %rcx, w1 C relies on rcx = 0
639 L(f6): lea (up,un_save,8), up
640 or R32(un_save), R32(n)
642 mulx( -40,(up), w2, w3)
643 lea -56(rp,un_save,8), rp
647 L(tp7): adox( -8,(rp), w2)
651 mulx( 8,(up), w2, w3)
656 mulx( 16,(up), w0, w1)
660 L(b7): mulx( 24,(up), w2, w3)
665 mulx( -32,(up), w0, w1)
669 mulx( -24,(up), w2, w3)
673 mulx( -16,(up), w0, w1)
678 mulx( -8,(up), w2, w3)
687 mulx( -16,(up), w0, w1)
689 mulx( -8,(up), w2, w3)
698 adox( %rcx, w1) C relies on rcx = 0
699 adcx( %rcx, w1) C relies on rcx = 0
702 mulx( -8,(up), w2, w3)
703 mulx( (up), %rax, %rbx)
708 adox( %rcx, %rbx) C relies on rcx = 0
710 adc %rcx, %rbx C relies on rcx = 0
712 mulx( (up), %rax, %rdx)
715 adc %rcx, %rdx C relies on rcx = 0
723 ifdef(`SDA_VARIANT',,`define
(`SDA_VARIANT
', 2)')
725 ifelse
(SDA_VARIANT
,1,`
727 movq
$0, -8(rp
,%rax
,8) C FIXME
728 test R32
(%rax
), R32
(%rax
)
731 mulx
( %rdx
, %r8
, %rdx
)
735 L
(dtop
):mov 8(rp
), %r9
742 mulx
( %rdx
, %rax
, %rdx
)
753 ifelse(SDA_VARIANT,2,`
756 xor R32(%rbx), R32(%rbx) C clear CF as side effect
757 mulx( %rdx, %rax, %r10)
764 L(dtop):mov 24(rp), %r8
767 lea (%rdx,%rbx), %r10
773 mulx( %rdx, %rax, %rdx)
781 L(dend):adc %rbx, %rdx
785 ifelse
(SDA_VARIANT
,3,`
788 test R32
(%rbx
), R32
(%rbx
) C clear CF
and OF
789 mulx
( %rdx
, %rax
, %r10
)
796 L
(dtop
):jrcxz L
(dend
)
800 L
(dm
): adcx
( %r8
, %r8
)
805 mulx
( %rdx
, %rax
, %r10
)
812 L
(dend
):adcx
( %rcx
, %r10
)
823 L(mtab):JMPENT( L(mf7), L(mtab))
824 JMPENT( L(mf0), L(mtab))
825 JMPENT( L(mf1), L(mtab))
826 JMPENT( L(mf2), L(mtab))
827 JMPENT( L(mf3), L(mtab))
828 JMPENT( L(mf4), L(mtab))
829 JMPENT( L(mf5), L(mtab))
830 JMPENT( L(mf6), L(mtab))
831 L(atab):JMPENT( L(f6), L(atab))
832 JMPENT( L(f7), L(atab))
833 JMPENT( L(f0), L(atab))
834 JMPENT( L(f1), L(atab))
835 JMPENT( L(f2), L(atab))
836 JMPENT( L(f3), L(atab))
837 JMPENT( L(f4), L(atab))
838 JMPENT( L(f5), L(atab))