dnl  Provenance (extraction artifact, rewritten as comments): luatex.git,
dnl  tag beta-0.89.2, path source/libs/gmp/gmp-src/mpn/x86_64/k8/mul_basecase.asm,
dnl  upstream blob ca2efb9b2f0133950fe8ba6a4b6df677b1090662.
1 dnl AMD64 mpn_mul_basecase.
3 dnl Contributed to the GNU project by Torbjorn Granlund and David Harvey.
5 dnl Copyright 2008, 2012 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C AMD K8,K9 2.375
37 C AMD K10 2.375
38 C Intel P4 15-16
39 C Intel core2 4.45
40 C Intel corei 4.35
41 C Intel atom ?
42 C VIA nano 4.5
44 C The inner loops of this code are the result of running a code generation and
45 C optimization tool suite written by David Harvey and Torbjorn Granlund.
47 C TODO
48 C * Use fewer registers. (how??? I can't see it -- david)
49 C * Avoid some "mov $0,r" and instead use "xor r,r".
50 C * Can the top of each L(addmul_outer_n) prologue be folded into the
51 C mul_1/mul_2 prologues, saving a LEA (%rip)? It would slow down the
52 C case where vn = 1 or 2; is it worth it?
54 C INPUT PARAMETERS
55 define(`rp', `%rdi')
56 define(`up', `%rsi')
57 define(`un_param',`%rdx')
58 define(`vp', `%rcx')
59 define(`vn', `%r8')
61 define(`v0', `%r12')
62 define(`v1', `%r9')
64 define(`w0', `%rbx')
65 define(`w1', `%r15')
66 define(`w2', `%rbp')
67 define(`w3', `%r10')
69 define(`n', `%r11')
70 define(`outer_addr', `%r14')
71 define(`un', `%r13')
73 ABI_SUPPORT(DOS64)
74 ABI_SUPPORT(STD64)
76 ASM_START()
77 TEXT
78 ALIGN(16)
C -----------------------------------------------------------------------
C void mpn_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un,
C                        mp_srcptr vp, mp_size_t vn)
C Schoolbook multiplication: {rp, un+vn} = {up, un} * {vp, vn}.
C NOTE(review): per the usual GMP mpn convention this presumably requires
C un >= vn >= 1 and rp not overlapping up/vp -- confirm in gmp-impl.h.
C
C Structure: an initial mul_1 pass (vn odd) or mul_2 pass (vn even) writes
C the first product row(s); the remaining v limbs are folded in two at a
C time by addmul_2 passes.  Every pass has four entry points selected by
C un mod 4; outer_addr records which addmul_2 entry the outer loop should
C re-enter via `jmp *outer_addr`.
C -----------------------------------------------------------------------
79 PROLOGUE(mpn_mul_basecase)
80 FUNC_ENTRY(4)
81 IFDOS(` mov 56(%rsp), %r8d ')
C Save all callee-saved registers used below (SysV: rbx, rbp, r12-r15).
82 push %rbx
83 push %rbp
84 push %r12
85 push %r13
86 push %r14
87 push %r15
89 xor R32(un), R32(un)
90 mov (up), %rax
91 mov (vp), v0
C un = -un_param; up and rp are advanced past their last limb so limb i
C is addressed as (up,n,8) with the negative index n counting up to 0.
93 sub un_param, un C rdx used by mul
94 mov un, n
95 mov R32(un_param), R32(w0)
97 lea (rp,un_param,8), rp
98 lea (up,un_param,8), up
C Start up[0]*v0 early; the selected prologue consumes rax:rdx.
100 mul v0
C vn odd -> begin with a mul_1 pass; vn even -> mul_2 pass.
102 test $1, R8(vn)
103 jz L(mul_2)
105 C ===========================================================
106 C mul_1 for vp[0] if vn is odd
108 L(mul_1):
C Dispatch on un mod 4 (w0 holds un_param); each prologue also points
C outer_addr at the matching addmul_2 entry for later passes.
109 and $3, R32(w0)
110 jz L(mul_1_prologue_0)
111 cmp $2, R32(w0)
112 jc L(mul_1_prologue_1)
113 jz L(mul_1_prologue_2)
115 L(mul_1_prologue_3):
116 add $-1, n
117 lea L(addmul_outer_3)(%rip), outer_addr
118 mov %rax, w3
119 mov %rdx, w0
120 jmp L(mul_1_entry_3)
122 L(mul_1_prologue_0):
123 mov %rax, w2
124 mov %rdx, w3 C note: already w0 == 0
125 lea L(addmul_outer_0)(%rip), outer_addr
126 jmp L(mul_1_entry_0)
128 L(mul_1_prologue_1):
C Special case un == 1 (un register holds -1): the whole product is the
C single limb pair already in rax:rdx; store it and return.
129 cmp $-1, un
130 jne 2f
131 mov %rax, -8(rp)
132 mov %rdx, (rp)
133 jmp L(ret)
134 2: add $1, n
135 lea L(addmul_outer_1)(%rip), outer_addr
136 mov %rax, w1
137 mov %rdx, w2
138 xor R32(w3), R32(w3)
139 mov (up,n,8), %rax
140 jmp L(mul_1_entry_1)
142 L(mul_1_prologue_2):
143 add $-2, n
144 lea L(addmul_outer_2)(%rip), outer_addr
145 mov %rax, w0
146 mov %rdx, w1
147 mov 24(up,n,8), %rax
148 xor R32(w2), R32(w2)
149 xor R32(w3), R32(w3)
150 jmp L(mul_1_entry_2)
C 4-way unrolled mul_1 loop; w0..w3 rotate as product/carry limbs.
153 C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments
155 ALIGN(16)
156 L(mul_1_top):
157 mov w0, -16(rp,n,8)
158 add %rax, w1
159 mov (up,n,8), %rax
160 adc %rdx, w2
161 L(mul_1_entry_1):
162 xor R32(w0), R32(w0)
163 mul v0
164 mov w1, -8(rp,n,8)
165 add %rax, w2
166 adc %rdx, w3
167 L(mul_1_entry_0):
168 mov 8(up,n,8), %rax
169 mul v0
170 mov w2, (rp,n,8)
171 add %rax, w3
172 adc %rdx, w0
173 L(mul_1_entry_3):
174 mov 16(up,n,8), %rax
175 mul v0
176 mov w3, 8(rp,n,8)
177 xor R32(w2), R32(w2) C zero
178 mov w2, w3 C zero
179 add %rax, w0
180 mov 24(up,n,8), %rax
181 mov w2, w1 C zero
182 adc %rdx, w1
183 L(mul_1_entry_2):
184 mul v0
185 add $4, n
186 js L(mul_1_top)
C Wind down: store the final two product limbs and the top carry limb.
188 mov w0, -16(rp)
189 add %rax, w1
190 mov w1, -8(rp)
191 adc %rdx, w2
192 mov w2, (rp)
194 add $-1, vn C vn -= 1
195 jz L(ret)
C Load the next two v limbs (vp[1], vp[2]) before advancing vp, then
C enter the addmul_2 code at the entry recorded in outer_addr.
197 mov 8(vp), v0
198 mov 16(vp), v1
200 lea 8(vp), vp C vp += 1
201 lea 8(rp), rp C rp += 1
203 jmp *outer_addr
205 C ===========================================================
206 C mul_2 for vp[0], vp[1] if vn is even
C Writes {rp, un+2} = {up, un} * {v0, v1}; same un mod 4 dispatch as
C mul_1, again leaving outer_addr pointing at the matching addmul entry.
208 ALIGN(16)
209 L(mul_2):
210 mov 8(vp), v1
212 and $3, R32(w0)
213 jz L(mul_2_prologue_0)
214 cmp $2, R32(w0)
215 jz L(mul_2_prologue_2)
216 jc L(mul_2_prologue_1)
218 L(mul_2_prologue_3):
219 lea L(addmul_outer_3)(%rip), outer_addr
220 add $2, n
221 mov %rax, -16(rp,n,8)
222 mov %rdx, w2
223 xor R32(w3), R32(w3)
224 xor R32(w0), R32(w0)
225 mov -16(up,n,8), %rax
226 jmp L(mul_2_entry_3)
228 ALIGN(16)
229 L(mul_2_prologue_0):
230 add $3, n
231 mov %rax, w0
232 mov %rdx, w1
233 xor R32(w2), R32(w2)
234 mov -24(up,n,8), %rax
235 lea L(addmul_outer_0)(%rip), outer_addr
236 jmp L(mul_2_entry_0)
238 ALIGN(16)
239 L(mul_2_prologue_1):
240 mov %rax, w3
241 mov %rdx, w0
242 xor R32(w1), R32(w1)
243 lea L(addmul_outer_1)(%rip), outer_addr
244 jmp L(mul_2_entry_1)
246 ALIGN(16)
247 L(mul_2_prologue_2):
248 add $1, n
249 lea L(addmul_outer_2)(%rip), outer_addr
250 mov $0, R32(w0)
251 mov $0, R32(w1)
252 mov %rax, w2
253 mov -8(up,n,8), %rax
254 mov %rdx, w3
255 jmp L(mul_2_entry_2)
C 4-way unrolled mul_2 loop: each up limb is multiplied by both v0 and
C v1, accumulating into the rotating w0..w3 window.
C NOTE(review): up limbs are loaded twice (e.g. -24(up,n,8) below) rather
C than copied between registers; presumably a K8 scheduling choice from
C the generator tool suite -- confirm before "simplifying".
257 C this loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments
259 ALIGN(16)
260 L(mul_2_top):
261 mov -32(up,n,8), %rax
262 mul v1
263 add %rax, w0
264 adc %rdx, w1
265 mov -24(up,n,8), %rax
266 xor R32(w2), R32(w2)
267 mul v0
268 add %rax, w0
269 mov -24(up,n,8), %rax
270 adc %rdx, w1
271 adc $0, R32(w2)
272 L(mul_2_entry_0):
273 mul v1
274 add %rax, w1
275 mov w0, -24(rp,n,8)
276 adc %rdx, w2
277 mov -16(up,n,8), %rax
278 mul v0
279 mov $0, R32(w3)
280 add %rax, w1
281 adc %rdx, w2
282 mov -16(up,n,8), %rax
283 adc $0, R32(w3)
284 mov $0, R32(w0)
285 mov w1, -16(rp,n,8)
286 L(mul_2_entry_3):
287 mul v1
288 add %rax, w2
289 mov -8(up,n,8), %rax
290 adc %rdx, w3
291 mov $0, R32(w1)
292 mul v0
293 add %rax, w2
294 mov -8(up,n,8), %rax
295 adc %rdx, w3
296 adc R32(w1), R32(w0) C adc $0, w0
297 L(mul_2_entry_2):
298 mul v1
299 add %rax, w3
300 mov w2, -8(rp,n,8)
301 adc %rdx, w0
302 mov (up,n,8), %rax
303 mul v0
304 add %rax, w3
305 adc %rdx, w0
306 adc $0, R32(w1)
307 L(mul_2_entry_1):
308 add $4, n
309 mov w3, -32(rp,n,8)
310 js L(mul_2_top)
C Wind down: the last up limb times v1, then store the top two limbs.
312 mov -32(up,n,8), %rax C FIXME: n is constant
313 mul v1
314 add %rax, w0
315 mov w0, (rp)
316 adc %rdx, w1
317 mov w1, 8(rp)
319 add $-2, vn C vn -= 2
320 jz L(ret)
321 
322 mov 16(vp), v0
323 mov 24(vp), v1
325 lea 16(vp), vp C vp += 2
326 lea 16(rp), rp C rp += 2
328 jmp *outer_addr
331 C ===========================================================
332 C addmul_2 for remaining vp's
334 C in the following prologues, we reuse un to store the
335 C adjusted value of n that is reloaded on each iteration
C Three of the four entries below adjust un once, then use
C `lea 0(%rip)` -- which yields the address of the immediately following
C instruction -- so that every later `jmp *outer_addr` re-enters *past*
C the one-time add.  L(addmul_outer_1) needs no adjustment, so it leaves
C outer_addr (set by a prologue to the label itself) unchanged.
337 L(addmul_outer_0):
338 add $3, un
339 lea 0(%rip), outer_addr
341 mov un, n
342 mov -24(up,un,8), %rax
343 mul v0
344 mov %rax, w0
345 mov -24(up,un,8), %rax
346 mov %rdx, w1
347 xor R32(w2), R32(w2)
348 jmp L(addmul_entry_0)
350 L(addmul_outer_1):
351 mov un, n
352 mov (up,un,8), %rax
353 mul v0
354 mov %rax, w3
355 mov (up,un,8), %rax
356 mov %rdx, w0
357 xor R32(w1), R32(w1)
358 jmp L(addmul_entry_1)
360 L(addmul_outer_2):
361 add $1, un
362 lea 0(%rip), outer_addr
364 mov un, n
365 mov -8(up,un,8), %rax
366 mul v0
367 xor R32(w0), R32(w0)
368 mov %rax, w2
369 xor R32(w1), R32(w1)
370 mov %rdx, w3
371 mov -8(up,un,8), %rax
372 jmp L(addmul_entry_2)
374 L(addmul_outer_3):
375 add $2, un
376 lea 0(%rip), outer_addr
378 mov un, n
379 mov -16(up,un,8), %rax
380 xor R32(w3), R32(w3)
381 mul v0
382 mov %rax, w1
383 mov -16(up,un,8), %rax
384 mov %rdx, w2
385 jmp L(addmul_entry_3)
C 4-way unrolled addmul_2 loop: like mul_2 but adds into the existing
C rp limbs (read-modify-write `add wX, off(rp,n,8)`).
387 C this loop is 19 c/loop = 2.375 c/l on K8, for all up/rp alignments
389 ALIGN(16)
390 L(addmul_top):
391 add w3, -32(rp,n,8)
392 adc %rax, w0
393 mov -24(up,n,8), %rax
394 adc %rdx, w1
395 xor R32(w2), R32(w2)
396 mul v0
397 add %rax, w0
398 mov -24(up,n,8), %rax
399 adc %rdx, w1
400 adc R32(w2), R32(w2) C adc $0, w2
401 L(addmul_entry_0):
402 mul v1
403 xor R32(w3), R32(w3)
404 add w0, -24(rp,n,8)
405 adc %rax, w1
406 mov -16(up,n,8), %rax
407 adc %rdx, w2
408 mul v0
409 add %rax, w1
410 mov -16(up,n,8), %rax
411 adc %rdx, w2
412 adc $0, R32(w3)
413 L(addmul_entry_3):
414 mul v1
415 add w1, -16(rp,n,8)
416 adc %rax, w2
417 mov -8(up,n,8), %rax
418 adc %rdx, w3
419 mul v0
420 xor R32(w0), R32(w0)
421 add %rax, w2
422 adc %rdx, w3
423 mov $0, R32(w1)
424 mov -8(up,n,8), %rax
425 adc R32(w1), R32(w0) C adc $0, w0
426 L(addmul_entry_2):
427 mul v1
428 add w2, -8(rp,n,8)
429 adc %rax, w3
430 adc %rdx, w0
431 mov (up,n,8), %rax
432 mul v0
433 add %rax, w3
434 mov (up,n,8), %rax
435 adc %rdx, w0
436 adc $0, R32(w1)
437 L(addmul_entry_1):
438 mul v1
439 add $4, n
440 js L(addmul_top)
C Wind down: fold the last product into rp and store the top two limbs.
442 add w3, -8(rp)
443 adc %rax, w0
444 mov w0, (rp)
445 adc %rdx, w1
446 mov w1, 8(rp)
448 add $-2, vn C vn -= 2
449 jz L(ret)
C Advance to the next two v limbs and loop back to the recorded entry.
451 lea 16(rp), rp C rp += 2
452 lea 16(vp), vp C vp += 2
454 mov (vp), v0
455 mov 8(vp), v1
457 jmp *outer_addr
C Restore callee-saved registers (reverse push order) and return.
459 ALIGN(16)
460 L(ret): pop %r15
461 pop %r14
462 pop %r13
463 pop %r12
464 pop %rbp
465 pop %rbx
466 FUNC_EXIT()
469 EPILOGUE()