/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/asm_linkage.h>

/ ------------------------------------------------------------------------
/
/ Implementation of big_mul_set_vec which exploits
/ the 64X64->128 bit unsigned multiply instruction.
/
/ As defined in Sun's bignum library for pkcs11, bignums are
/ composed of an array of 64-bit "digits" or "chunks" along with
/ descriptive information.
/
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
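/
/ For reference, a C-level sketch of the operation performed below.  It is
/ illustrative only: the name big_mul_set_vec_ref and the __uint128_t
/ compiler extension are assumptions of the sketch, not part of the
/ bignum library.
/
/	#include <stdint.h>
/
/	uint64_t
/	big_mul_set_vec_ref(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/	{
/		uint64_t cy = 0;			/* carry digit */
/		for (int i = 0; i < len; i++) {
/			__uint128_t p = (__uint128_t)a[i] * digit + cy;
/			r[i] = (uint64_t)p;		/* lo64(p) */
/			cy = (uint64_t)(p >> 64);	/* hi64(p) */
/		}
/		return (cy);
/	}
/
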
	ENTRY(big_mul_set_vec)
	xorq	%rax, %rax		/ if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L17

	movq	%rdx, %r8		/ Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		/ cy = 0
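
	/
	/ Main loop: 8-way unrolled.  Each pass through .L15 multiplies and
	/ stores eight digits of the product, carrying %r9 between digits.
	/ Any remaining 1..7 digits are handled by the tail code at .L16.
	/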
.L15:
	cmpq	$8, %r8			/ len - 8
	jb	.L16			/ if (len < 8) goto tail
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi
	addq	$64, %rdi
	subq	$8, %r8

	jz	.L17
	jmp	.L15

.L16:
	movq	0(%rsi), %rax
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	8(%rsi), %rax
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	16(%rsi), %rax
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	24(%rsi), %rax
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	32(%rsi), %rax
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	40(%rsi), %rax
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	48(%rsi), %rax
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

.L17:
	movq	%r9, %rax
	ret
	SET_SIZE(big_mul_set_vec)

/ ------------------------------------------------------------------------
/
/ Implementation of big_mul_add_vec which exploits
/ the 64X64->128 bit unsigned multiply instruction.
/
/ As defined in Sun's bignum library for pkcs11, bignums are
/ composed of an array of 64-bit "digits" or "chunks" along with
/ descriptive information.
/
/ ------------------------------------------------------------------------

/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
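/
/ A C-level sketch of the computation, analogous to the one given for
/ big_mul_set_vec.  Illustrative only: the name big_mul_add_vec_ref and
/ the __uint128_t extension are assumptions of the sketch, not part of
/ the library.
/
/	#include <stdint.h>
/
/	uint64_t
/	big_mul_add_vec_ref(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/	{
/		uint64_t cy = 0;			/* carry digit */
/		for (int i = 0; i < len; i++) {
/			__uint128_t p = (__uint128_t)a[i] * digit + r[i] + cy;
/			r[i] = (uint64_t)p;		/* lo64(p) */
/			cy = (uint64_t)(p >> 64);	/* hi64(p) */
/		}
/		return (cy);
/	}
/
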
	ENTRY(big_mul_add_vec)
	xorq	%rax, %rax		/ if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L27

	movq	%rdx, %r8		/ Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		/ cy = 0
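
	/
	/ Same 8-way unrolled structure as big_mul_set_vec above; the only
	/ difference is that each existing digit r[i] (kept in %r10) is
	/ folded into the product before the carry is added.
	/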
.L25:
	cmpq	$8, %r8			/ len - 8
	jb	.L26			/ if (len < 8) goto tail
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi
	addq	$64, %rdi
	subq	$8, %r8

	jz	.L27
	jmp	.L25

.L26:
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

.L27:
	movq	%r9, %rax
	ret
	SET_SIZE(big_mul_add_vec)

/ void
/ big_sqr_vec(uint64_t *r, uint64_t *a, int len)
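/
/ The routine below squares in two passes: pass 1 accumulates the
/ off-diagonal products a[i]*a[j] (i < j) into r[1 .. 2*len-2] using
/ big_mul_set_vec and big_mul_add_vec (defined above); pass 2 doubles
/ that partial result and adds the diagonal squares a[i]**2.  A C-level
/ sketch of the same idea follows.  It is illustrative only: the name
/ big_sqr_vec_ref and the __uint128_t extension are assumptions of the
/ sketch, not part of the library.
/
/	#include <stdint.h>
/
/	uint64_t big_mul_set_vec(uint64_t *, uint64_t *, int, uint64_t);
/	uint64_t big_mul_add_vec(uint64_t *, uint64_t *, int, uint64_t);
/
/	void
/	big_sqr_vec_ref(uint64_t *r, uint64_t *a, int len)
/	{
/		uint64_t cy = 0;
/		int i;
/
/		/* Pass 1: each off-diagonal product computed once. */
/		r[len] = big_mul_set_vec(r + 1, a + 1, len - 1, a[0]);
/		for (i = 1; i < len - 1; i++)
/			r[len + i] = big_mul_add_vec(r + 2 * i + 1,
/			    a + i + 1, len - 1 - i, a[i]);
/
/		/* Pass 2: r = 2 * r + squares of the diagonal. */
/		r[0] = 0;
/		r[2 * len - 1] = 0;
/		for (i = 0; i < len; i++) {
/			__uint128_t sq = (__uint128_t)a[i] * a[i];
/			__uint128_t lo = ((__uint128_t)r[2 * i] << 1) +
/			    (uint64_t)sq + cy;
/			__uint128_t hi = ((__uint128_t)r[2 * i + 1] << 1) +
/			    (uint64_t)(sq >> 64) + (uint64_t)(lo >> 64);
/			r[2 * i] = (uint64_t)lo;
/			r[2 * i + 1] = (uint64_t)hi;
/			cy = (uint64_t)(hi >> 64);
/		}
/	}
/
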
	ENTRY(big_sqr_vec)
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rdx			/ save arg3, len
	pushq	%rsi			/ save arg2, a
	pushq	%rdi			/ save arg1, r

	leaq	8(%rdi), %r13		/ tr = r + 1
	movq	%rsi, %r14		/ ta = a
	movq	%rdx, %r15		/ tlen = len
	decq	%r15			/ tlen = len - 1
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_set_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
.L31:
	decq	%r15			/ --tlen
	jz	.L32			/ while (--tlen != 0)

	addq	$16, %r13		/ tr += 2
	addq	$8, %r14		/ ++ta
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_add_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
	jmp	.L31

.L32:
	/ No more function calls after this.
	/ Restore arguments to registers.
	/ However, don't use %rdx for arg3, len, because it is heavily
	/ used by the hardware MUL instruction.  Use %r8, instead.
	movq	0(%rsp), %rdi		/ %rdi == arg1 == r
	movq	8(%rsp), %rsi		/ %rsi == arg2 == a
	movq	16(%rsp), %r8		/ %r8 == arg3 == len

	movq	0(%rsi), %rax		/ %rax = a[0]
	mulq	%rax			/ s = %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64(s)
	movq	%rdx, %r9		/ cy = hi64(s)
	xorq	%rdx, %rdx
	movq	8(%rdi), %rax		/ p = %rdx:%rax = r[1]
	addq	%rax, %rax
	adcq	$0, %rdx		/ p = p << 1
	addq	%r9, %rax
	adcq	$0, %rdx		/ p = (r[1] << 1) + cy
	movq	%rax, 8(%rdi)		/ r[1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)
	movq	$1, %r11		/ row = 1
	movq	$2, %r12		/ col = 2
	movq	%r8, %r15
	decq	%r15			/ tlen = len - 1
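
	/
	/ Main squaring loop: for each row, double the two partial-product
	/ digits r[col] and r[col+1] and add in the 128-bit square
	/ a[row]**2, propagating the carry in %r9 (with %rcx holding the
	/ extra high bit).  Row 0 (r[0] and r[1]) was handled above.
	/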
.L33:
	cmpq	%r8, %r11		/ row - len
	jae	.L34			/ while (row < len)

	movq	0(%rsi, %r11, 8), %rax	/ s = (uint128_t)a[row]
	mulq	%rax			/ s = s * s
	xorq	%rbx, %rbx
	movq	0(%rdi, %r12, 8), %rcx	/ p = (uint128_t)r[col]
	addq	%rcx, %rcx
	adcq	$0, %rbx		/ p = p << 1
	addq	%rcx, %rax
	adcq	%rbx, %rdx		/ t = p + s
	xorq	%r10, %r10
	movq	%rax, %rbp		/ t2 = 0:lo64(t)
	addq	%r9, %rbp
	adcq	$0, %r10		/ t2 = %r10:%rbp = lo64(t) + cy
	movq	%rbp, 0(%rdi, %r12, 8)	/ r[col] = lo64(t2)
	xorq	%rcx, %rcx
	movq	%rdx, %r9
	addq	%r10, %r9
	adcq	$0, %rcx		/ cy = hi64(t) + hi64(t2)
	cmpq	%r11, %r15
	je	.L34			/ if (row == len - 1) break
	xorq	%rdx, %rdx
	movq	8(%rdi, %r12, 8), %rax
	addq	%rax, %rax
	adcq	$0, %rdx
	addq	%r9, %rax
	adcq	%rcx, %rdx		/ p = (lo64(r[col+1]) << 1) + cy
	movq	%rax, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)

	incq	%r11			/ ++row
	addq	$2, %r12		/ col += 2
	jmp	.L33

.L34:
	movq	%r9, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(cy)

	addq	$24, %rsp		/ skip %rdi, %rsi, %rdx
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

	ret
	SET_SIZE(big_sqr_vec)