 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
#include <sys/asm_linkage.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>

#include <sys/types.h>
/* Not to be called by C code */
big_mul_set_vec_sse2_r()

/* Not to be called by C code */
big_mul_add_vec_sse2_r()

big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)

big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)

big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)

big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)

#if defined(MMX_MANAGE)

big_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)

big_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)

/* Not to be called by C code */
big_sqr_vec_sse2_fc(uint32_t *r, uint32_t *a, int len)

#endif	/* MMX_MANAGE */

big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)

big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)

#if defined(MMX_MANAGE)
#define	KPREEMPT_DISABLE call kpr_disable
#define	KPREEMPT_ENABLE call kpr_enable
#define	TEST_TS(reg)	\

#define	KPREEMPT_DISABLE
#define	KPREEMPT_ENABLE

#define	TEST_TS(reg)	\

#define	SAVE_MMX_PROLOG(sreg, nreg)	\
	subl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;	\
	addl	$MMX_ALIGN, sreg;	\
	andl	$-1![MMX_ALIGN-1], sreg;

#define	RSTOR_MMX_EPILOG(nreg)	\
	addl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;

#define	SAVE_MMX_0TO4(sreg)	\
	SAVE_MMX_PROLOG(sreg, 5);	\
	movq	%mm0, 0(sreg);	\
	movq	%mm1, 8(sreg);	\
	movq	%mm2, 16(sreg);	\
	movq	%mm3, 24(sreg);	\

#define	RSTOR_MMX_0TO4(sreg)	\
	movq	0(sreg), %mm0;	\
	movq	8(sreg), %mm1;	\
	movq	16(sreg), %mm2;	\
	movq	24(sreg), %mm3;	\
	movq	32(sreg), %mm4;	\

#endif	/* MMX_MANAGE */
/ Note: this file contains implementations for
/ One set of implementations is for SSE2-capable models.
/ The other uses no MMX, SSE, or SSE2 instructions, only
/ the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
/
/ The code for the implementations is grouped by SSE2 vs UMUL,
/ rather than grouping pairs of implementations for each function.
/ This is because the bignum implementation gets "imprinted"
/ on the correct implementation, at the time of first use,
/ so none of the code for the other implementations is ever
/ executed.  So, it is a no-brainer to lay out the code to minimize
/ the "footprint" of executed code.
/ Can we use SSE2 instructions?  Return value is non-zero
/ if we can.
/
/ Using the cpuid instruction directly would work equally
/ well in userland and in the kernel, but we do not use the
/ cpuid instruction in the kernel, we use x86_featureset,
/ instead.  This means we honor any decisions the kernel
/ startup code may have made in setting this variable,
/ including disabling SSE2.  It might even be a good idea
/ to honor this kind of setting in userland, as well, but
/ the variable, x86_featureset, is not readily available to
/ userland processes.
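/
/ In C terms, the routine answers roughly the following (an illustrative
/ sketch only; the helper names here are assumptions, not interfaces used
/ by this file):
/
/	#if defined(_KERNEL)
/		/* test bit X86FSET_SSE2 of the kernel's feature bitmap */
/		return (BT_TEST(x86_featureset, X86FSET_SSE2));
/	#else
/		uint32_t edx = cpuid_leaf1_edx();  /* hypothetical: %edx of cpuid leaf 1 */
/		return (edx & CPUID_INTC_EDX_SSE2);
/	#endif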
	ENTRY(bignum_use_sse2)
	bt	$X86FSET_SSE2, x86_featureset
	movl	$1, %eax		/ Get feature information
	movl	%edx, %eax		/ set return value
	andl	$CPUID_INTC_EDX_SSE2, %eax
	SET_SIZE(bignum_use_sse2)
/ ------------------------------------------------------------------------
/ SSE2 Implementations
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ Does not touch the following registers: %esi, %edi, %mm4
/
/ This is strictly for internal use.
/ The interface is very light-weight.
/ All parameters are passed in registers.
/ It does not conform to the SYSV x86 ABI.
/ So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number.  It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ this implementation.
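/
/ For reference, the unrolled digit loop below computes the same thing as
/ this portable C sketch (illustrative only, not part of this file):
/
/	uint32_t
/	big_mul_set_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/	{
/		uint64_t p, cy = 0;
/		int i;
/
/		for (i = 0; i < len; i++) {
/			p = (uint64_t)a[i] * digit + cy;
/			r[i] = (uint32_t)p;	/* product[31..0] */
/			cy = p >> 32;		/* product[63..32] */
/		}
/		return ((uint32_t)cy);
/	}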
	ENTRY(big_mul_set_vec_sse2_r)
	xorl	%eax, %eax	/ if (len == 0) return (0);

	pxor	%mm0, %mm0	/ cy = 0

	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 8: mm0 = digit * a[i] + cy;
	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
	leal	UNROLL32(%edx), %edx	/ r += UNROLL
	subl	$UNROLL, %ecx		/ len -= UNROLL

	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	movd	%mm0, %eax	/ return (cy)
	/ no emms. caller is responsible for emms
	SET_SIZE(big_mul_set_vec_sse2_r)
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ digit		20(%ebp)	%mm3
/
/ In userland, there is just the one function, big_mul_set_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_set_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_set_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.
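/
/ Roughly, variation 1 wraps the inner routine like this (illustrative
/ pseudocode only; the actual TS test, save area, and labels are in the
/ assembly below):
/
/	kpreempt_disable();
/	if (the thread owns live MMX state)
/		save %mm0-%mm4 to an 8-byte-aligned area on the stack;
/	cy = big_mul_set_vec_sse2_r(r, a, len, digit);
/	restore %mm0-%mm4, or emms and restore CR0.TS, as appropriate;
/	kpreempt_enable();
/	return (cy);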
#if defined(MMX_MANAGE)
	ENTRY(big_mul_set_vec_sse2)
	call	big_mul_set_vec_sse2_r
	call	big_mul_set_vec_sse2_r
	SET_SIZE(big_mul_set_vec_sse2)

	ENTRY(big_mul_set_vec_sse2_nsv)
	call	big_mul_set_vec_sse2_r
	SET_SIZE(big_mul_set_vec_sse2_nsv)

#else	/* !defined(MMX_MANAGE) */
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ digit		20(%ebp)	%mm3

	ENTRY(big_mul_set_vec_sse2)
	call	big_mul_set_vec_sse2_r
	SET_SIZE(big_mul_set_vec_sse2)

#endif	/* MMX_MANAGE */
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ This is strictly for internal use.
/ The interface is very light-weight.
/ All parameters are passed in registers.
/ It does not conform to the SYSV x86 ABI.
/ So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number.  It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ this implementation.
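/
/ For reference, a portable C sketch of the same operation (illustrative
/ only, not part of this file):
/
/	uint32_t
/	big_mul_add_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/	{
/		uint64_t p, cy = 0;
/		int i;
/
/		for (i = 0; i < len; i++) {
/			p = (uint64_t)a[i] * digit + r[i] + cy;
/			r[i] = (uint32_t)p;	/* product[31..0] */
/			cy = p >> 32;		/* product[63..32] */
/		}
/		return ((uint32_t)cy);
/	}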
	ENTRY(big_mul_add_vec_sse2_r)

	pxor	%mm0, %mm0	/ cy = 0

	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	movd	28(%edx), %mm2	/ 8: mm2 = r[i]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 8: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 8: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
	leal	UNROLL32(%edx), %edx	/ r += UNROLL
	subl	$UNROLL, %ecx		/ len -= UNROLL

	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	/ no emms. caller is responsible for emms
	SET_SIZE(big_mul_add_vec_sse2_r)
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ digit		20(%ebp)	%mm3
/
/ In userland, there is just the one function, big_mul_add_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_add_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_add_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.
#if defined(MMX_MANAGE)

	ENTRY(big_mul_add_vec_sse2)
	call	big_mul_add_vec_sse2_r
	call	big_mul_add_vec_sse2_r
	SET_SIZE(big_mul_add_vec_sse2)

	ENTRY(big_mul_add_vec_sse2_nsv)
	call	big_mul_add_vec_sse2_r
	SET_SIZE(big_mul_add_vec_sse2_nsv)

#else	/* !defined(MMX_MANAGE) */

	ENTRY(big_mul_add_vec_sse2)
	call	big_mul_add_vec_sse2_r
	SET_SIZE(big_mul_add_vec_sse2)

#endif	/* MMX_MANAGE */
/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/
/	r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
/	for (i = 1; i < blen; ++i)
/		r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
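/
/ Expanded into a self-contained C sketch (illustrative only), the driver
/ is a row-by-row schoolbook multiply: the b[0] pass initializes
/ r[0..alen], and each later pass adds one shifted row plus its carry:
/
/	void
/	big_mul_vec(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/	{
/		int i;
/
/		r[alen] = big_mul_set_vec(r, a, alen, b[0]);
/		for (i = 1; i < blen; ++i)
/			r[alen + i] = big_mul_add_vec(r + i, a, alen, b[i]);
/	}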
#if defined(MMX_MANAGE)
	ENTRY(big_mul_vec_sse2_fc)
	ENTRY(big_mul_vec_sse2)

#if defined(MMX_MANAGE)
	call	big_mul_set_vec_sse2_nsv
	call	big_mul_set_vec_sse2
	movl	%eax, (%ebx,%edi,4)

	leal	(%ebx,%ebp,4), %eax

#if defined(MMX_MANAGE)
	call	big_mul_add_vec_sse2_nsv
	call	big_mul_add_vec_sse2
	leal	(%ebp,%edi), %ecx
	movl	%eax, (%ebx,%ecx,4)

#if defined(MMX_MANAGE)

#if defined(MMX_MANAGE)
	SET_SIZE(big_mul_vec_sse2_fc)
	SET_SIZE(big_mul_vec_sse2)

#if defined(MMX_MANAGE)

	ENTRY(big_mul_vec_sse2)
	movl	24(%ebp), %eax	/ blen
	movl	20(%ebp), %eax	/ b
	movl	16(%ebp), %eax	/ alen
	movl	12(%ebp), %eax	/ a
	movl	8(%ebp), %eax	/ r
	call	big_mul_vec_sse2_fc
	SET_SIZE(big_mul_vec_sse2)

#endif	/* MMX_MANAGE */
/ r = a * a, r and a are vectors of length len
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ This function is not suitable for a truly general-purpose multiprecision
/ arithmetic library, because it does not work for "small" numbers, that is,
/ numbers of 1 or 2 digits.  big_mul() just uses the ordinary big_mul_vec()
/ for any small numbers.
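/
/ The overall shape of the computation: build the off-diagonal "triangle"
/ of partial products with the set/add routines, then make one pass that
/ doubles it and adds in the squared diagonal terms.  A straightforward C
/ rendering (illustrative only; the assembly below special-cases the
/ corner terms and the last triangle term instead of zeroing and looping):
/
/	void
/	big_sqr_vec(uint32_t *r, uint32_t *a, int len)
/	{
/		uint64_t d, t, cy;
/		int i;
/
/		/* triangle: r[1 .. 2*len-2] = sum of a[i] * a[j], i < j */
/		r[len] = big_mul_set_vec(&r[1], &a[1], len - 1, a[0]);
/		for (i = 1; i < len - 1; ++i)
/			r[len + i] = big_mul_add_vec(&r[2 * i + 1],
/			    &a[i + 1], len - 1 - i, a[i]);
/		r[0] = 0;
/		r[2 * len - 1] = 0;
/
/		/* r = 2 * triangle + diagonal, with carry propagation */
/		cy = 0;
/		for (i = 0; i < len; ++i) {
/			d = (uint64_t)a[i] * a[i];
/			t = 2 * (uint64_t)r[2 * i] + (uint32_t)d + cy;
/			r[2 * i] = (uint32_t)t;
/			cy = t >> 32;
/			t = 2 * (uint64_t)r[2 * i + 1] + (d >> 32) + cy;
/			r[2 * i + 1] = (uint32_t)t;
/			cy = t >> 32;
/		}
/	}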
#if defined(MMX_MANAGE)
	ENTRY(big_sqr_vec_sse2_fc)
	ENTRY(big_sqr_vec_sse2)

/ r[1..alen] = a[0] * a[1..alen-1]

	movl	8(%ebp), %edi		/ r = arg(r)
	movl	12(%ebp), %esi		/ a = arg(a)
	movl	16(%ebp), %ecx		/ cnt = arg(alen)
	movd	%ecx, %mm4		/ save_cnt = arg(alen)
	leal	4(%edi), %edx		/ dst = &r[1]
	movl	%esi, %ebx		/ src = a
	movd	0(%ebx), %mm3		/ mm3 = a[0]
	leal	4(%ebx), %ebx		/ src = &a[1]
	subl	$1, %ecx		/ --cnt
	call	big_mul_set_vec_sse2_r	/ r[1..alen-1] = a[0] * a[1..alen-1]
	movl	%edi, %edx		/ dst = r
	movl	%esi, %ebx		/ src = a
	movd	%mm4, %ecx		/ cnt = save_cnt
	movl	%eax, (%edx, %ecx, 4)	/ r[cnt] = cy

/ /* High-level vector C pseudocode */
/ for (i = 1; i < alen-1; ++i)
/	r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
/
/ /* Same thing, but slightly lower level C-like pseudocode */
/	r = &arg_r[2*i + 1];
/	r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
/
/ /* Same thing, but even lower level
/  * For example, pointers are raw pointers,
/  * with no scaling by object size.
/  */
/	r = arg_r + 12;	/* i == 1; 2i + 1 == 3; 4*3 == 12; */
/	digit = *(arg_a + 4);
/	cy = big_mul_add_vec_sse2_r();
/	*(r + 4 * cnt) = cy;
	leal	4(%edi), %edi		/ r += 4; r = &r[1]
	leal	4(%esi), %esi		/ a += 4; a = &a[1]
	movd	%mm4, %ecx		/ cnt = save
	subl	$2, %ecx		/ cnt = alen - 2; i in 1..alen-2
	movd	%ecx, %mm4		/ save_cnt
	jecxz	.L32			/ while (cnt != 0) {

	movd	0(%esi), %mm3		/ digit = a[i]
	leal	4(%esi), %esi		/ a += 4; a = &a[1]; a = &a[i + 1]
	leal	8(%edi), %edi		/ r += 8; r = &r[2]; r = &r[2 * i + 1]
	movl	%edi, %edx		/ edx = r
	movl	%esi, %ebx		/ ebx = a
	cmp	$1, %ecx		/ The last triangle term is special

	call	big_mul_add_vec_sse2_r
	movd	%mm4, %ecx		/ cnt = save_cnt
	movl	%eax, (%edi, %ecx, 4)	/ r[cnt] = cy
	subl	$1, %ecx		/ --cnt
	movd	%ecx, %mm4		/ save_cnt = cnt

	movd	0(%ebx), %mm1		/ mm1 = a[i + 1]
	movd	0(%edx), %mm2		/ mm2 = r[2 * i + 1]
	pmuludq	%mm3, %mm1		/ mm1 = p = digit * a[i + 1]
	paddq	%mm1, %mm2		/ mm2 = r[2 * i + 1] + p
	movd	%mm2, 0(%edx)		/ r[2 * i + 1] += lo32(p)
	psrlq	$32, %mm2		/ mm2 = cy
	movd	%mm2, 4(%edx)		/ r[2 * i + 2] = cy
	movd	%mm2, 8(%edx)		/ r[2 * i + 3] = 0

	movl	8(%ebp), %edx		/ r = arg(r)
	movl	12(%ebp), %ebx		/ a = arg(a)
	movl	16(%ebp), %ecx		/ cnt = arg(alen)

/ compute low-order corner
	movd	0(%ebx), %mm2		/ mm2 = a[0]
	pmuludq	%mm2, %mm2		/ mm2 = p = a[0]**2
	movd	%mm2, 0(%edx)		/ r[0] = lo32(p)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(p)

	movd	4(%edx), %mm1		/ mm1 = r[1]
	psllq	$1, %mm1		/ mm1 = p = 2 * r[1]
	paddq	%mm1, %mm2		/ mm2 = t = p + cy
	movd	%mm2, 4(%edx)		/ r[1] = low32(t)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(t)

/ r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
	subl	$2, %ecx		/ cnt = alen - 2

	movd	4(%ebx), %mm0		/ mm0 = diag = a[i+1]
	pmuludq	%mm0, %mm0		/ mm0 = p = diag**2
	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	movd	%eax, %mm1		/ mm1 = lo32(t)
	psrlq	$32, %mm2		/ mm2 = hi32(t)

	movd	8(%edx), %mm3		/ mm3 = r[2*i]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
	paddq	%mm3, %mm1		/ mm1 = 2*r[2*i] + lo32(t)
	movd	%mm1, 8(%edx)		/ r[2*i] = 2*r[2*i] + lo32(t)

	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + hi32(t)
	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
	psrlq	$32, %mm2		/ mm2 = cy
	leal	8(%edx), %edx		/ r += 2
	leal	4(%ebx), %ebx		/ ++a
	subl	$1, %ecx		/ --cnt
/ Carry from last triangle term must participate in doubling,
/ but this step isn't paired up with squaring an element
/ of the inner diagonal.
/ r[$-3..$-2] = 2 * r[$-3..$-2] + cy
	movd	8(%edx), %mm3		/ mm3 = r[2*i]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i] + cy
	movd	%mm2, 8(%edx)		/ r[2*i] = lo32(2*r[2*i] + cy)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(2*r[2*i] + cy)

	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + cy
	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
	psrlq	$32, %mm2		/ mm2 = cy

/ compute high-order corner and add it in
/ p = a[alen - 1]**2
/ r[alen + alen - 2] += lo32(t)
/ r[alen + alen - 1] = cy
	movd	4(%ebx), %mm0		/ mm0 = a[$-1]
	movd	8(%edx), %mm3		/ mm3 = r[$-2]
	pmuludq	%mm0, %mm0		/ mm0 = p = a[$-1]**2
	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	paddq	%mm3, %mm2		/ mm2 = r[$-2] + t
	movd	%mm2, 8(%edx)		/ r[$-2] = lo32(r[$-2] + t)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(r[$-2] + t)

	movd	%mm2, 12(%edx)		/ r[$-1] += cy
#if defined(MMX_MANAGE)
	SET_SIZE(big_sqr_vec_sse2_fc)
	SET_SIZE(big_sqr_vec_sse2)

#if defined(MMX_MANAGE)
	ENTRY(big_sqr_vec_sse2)
	call	big_sqr_vec_sse2_fc
	RSTOR_MMX_0TO4(%edi)
	call	big_sqr_vec_sse2_fc
	SET_SIZE(big_sqr_vec_sse2)

#endif	/* MMX_MANAGE */
/ ------------------------------------------------------------------------
/ UMUL Implementations
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r		 8(%ebp)	%edx	%edi
/ a		12(%ebp)	%ebx	%esi
/ digit		20(%ebp)	%esi
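/
/ The loop body below corresponds to this C step (illustrative only);
/ mull leaves the 64-bit product in %edx:%eax, and the carry is folded
/ in with an add/adc pair:
/
/	p = (uint64_t)a[i] * digit;	/* mull 20(%ebp) */
/	p += cy;			/* add/adc into %edx:%eax */
/	r[i] = (uint32_t)p;		/* movl %eax, (%edi) */
/	cy = p >> 32;			/* movl %edx, %ebx */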
	ENTRY(big_mul_set_vec_umul)

	xorl	%ebx, %ebx	/ cy = 0

	movl	(%esi), %eax	/ eax = a[i]
	leal	4(%esi), %esi	/ ++a
	mull	20(%ebp)	/ edx:eax = a[i] * digit
	adcl	$0, %edx	/ edx:eax = a[i] * digit + cy
	movl	%eax, (%edi)	/ r[i] = product[31..0]
	movl	%edx, %ebx	/ cy = product[63..32]
	leal	4(%edi), %edi	/ ++r
	jnz	.L55		/ while (len != 0)

	SET_SIZE(big_mul_set_vec_umul)
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r		 8(%ebp)	%edx	%edi
/ a		12(%ebp)	%ebx	%esi
/ digit		20(%ebp)	%esi
	ENTRY(big_mul_add_vec_umul)

	xorl	%ebx, %ebx	/ cy = 0

	movl	(%esi), %eax	/ eax = a[i]
	leal	4(%esi), %esi	/ ++a
	mull	20(%ebp)	/ edx:eax = a[i] * digit
	adcl	$0, %edx	/ edx:eax = a[i] * digit + r[i]
	adcl	$0, %edx	/ edx:eax = a[i] * digit + r[i] + cy
	movl	%eax, (%edi)	/ r[i] = product[31..0]
	movl	%edx, %ebx	/ cy = product[63..32]
	leal	4(%edi), %edi	/ ++r
	jnz	.L65		/ while (len != 0)

	SET_SIZE(big_mul_add_vec_umul)