 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
#include <sys/asm_linkage.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>

#include <sys/types.h>
/* Not to be called by C code */
big_mul_set_vec_sse2_r()

/* Not to be called by C code */
big_mul_add_vec_sse2_r()

big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)

big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)

big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)

big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)

#if defined(MMX_MANAGE)

big_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)

big_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)

/* Not to be called by C code */
big_sqr_vec_sse2_fc(uint32_t *r, uint32_t *a, int len)

#endif	/* MMX_MANAGE */

big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)

big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)

#if defined(MMX_MANAGE)
#define	KPREEMPT_DISABLE call kpr_disable
#define	KPREEMPT_ENABLE call kpr_enable
#define	TEST_TS(reg)	\

#define	KPREEMPT_DISABLE
#define	KPREEMPT_ENABLE

#define	TEST_TS(reg)	\

#define	SAVE_MMX_PROLOG(sreg, nreg)	\
	subl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;	\
	addl	$MMX_ALIGN, sreg;	\
	andl	$-1![MMX_ALIGN-1], sreg;

#define	RSTOR_MMX_EPILOG(nreg)	\
	addl	$_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;

#define	SAVE_MMX_0TO4(sreg)	\
	SAVE_MMX_PROLOG(sreg, 5);	\
	movq	%mm0, 0(sreg);	\
	movq	%mm1, 8(sreg);	\
	movq	%mm2, 16(sreg);	\
	movq	%mm3, 24(sreg);	\

#define	RSTOR_MMX_0TO4(sreg)	\
	movq	0(sreg), %mm0;	\
	movq	8(sreg), %mm1;	\
	movq	16(sreg), %mm2;	\
	movq	24(sreg), %mm3;	\
	movq	32(sreg), %mm4;	\

#endif	/* MMX_MANAGE */
/ Note: this file contains implementations for
/ One set of implementations is for SSE2-capable models.
/ The other uses no MMX, SSE, or SSE2 instructions, only
/ the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
/
/ The code for the implementations is grouped by SSE2 vs UMUL,
/ rather than grouping pairs of implementations for each function.
/ This is because the bignum implementation gets "imprinted"
/ on the correct implementation, at the time of first use,
/ so none of the code for the other implementations is ever
/ executed.  So, it is a no-brainer to lay out the code to minimize
/ the "footprint" of executed code.
/ Can we use SSE2 instructions?  Return value is non-zero
/ if we can.
/
/ Using the cpuid instruction directly would work equally
/ well in userland and in the kernel, but we do not use the
/ cpuid instruction in the kernel, we use x86_featureset,
/ instead.  This means we honor any decisions the kernel
/ startup code may have made in setting this variable,
/ including disabling SSE2.  It might even be a good idea
/ to honor this kind of setting in userland, as well, but
/ the variable, x86_featureset, is not readily available to
/ userland processes.
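/
/ In C terms, the routine answers roughly the following (an illustrative
/ sketch only; the helper names here are assumptions, not interfaces used
/ by this file):
/
/	#if defined(_KERNEL)
/		/* test bit X86FSET_SSE2 of the kernel's feature bitmap */
/		return (BT_TEST(x86_featureset, X86FSET_SSE2));
/	#else
/		uint32_t edx = cpuid_leaf1_edx();  /* hypothetical: %edx of cpuid leaf 1 */
/		return (edx & CPUID_INTC_EDX_SSE2);
/	#endif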
	ENTRY(bignum_use_sse2)
	bt	$X86FSET_SSE2, x86_featureset
	movl	$1, %eax		/ Get feature information
	movl	%edx, %eax		/ set return value
	andl	$CPUID_INTC_EDX_SSE2, %eax
	SET_SIZE(bignum_use_sse2)
/ ------------------------------------------------------------------------
/ SSE2 Implementations
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ Does not touch the following registers: %esi, %edi, %mm4
/
/ This is strictly for internal use.
/ The interface is very light-weight.
/ All parameters are passed in registers.
/ It does not conform to the SYSV x86 ABI.
/ So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number.  It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ this implementation.
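/
/ For reference, the unrolled digit loop below computes the same thing as
/ this portable C sketch (illustrative only, not part of this file):
/
/	uint32_t
/	big_mul_set_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/	{
/		uint64_t p, cy = 0;
/		int i;
/
/		for (i = 0; i < len; i++) {
/			p = (uint64_t)a[i] * digit + cy;
/			r[i] = (uint32_t)p;	/* product[31..0] */
/			cy = p >> 32;		/* product[63..32] */
/		}
/		return ((uint32_t)cy);
/	}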
	ENTRY(big_mul_set_vec_sse2_r)
	xorl	%eax, %eax	/ if (len == 0) return (0);

	pxor	%mm0, %mm0	/ cy = 0

	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 8: mm0 = digit * a[i] + cy;
	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
	leal	UNROLL32(%edx), %edx	/ r += UNROLL
	subl	$UNROLL, %ecx		/ len -= UNROLL

	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 1: mm0 = digit * a[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 2: mm0 = digit * a[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 3: mm0 = digit * a[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 4: mm0 = digit * a[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 5: mm0 = digit * a[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 6: mm0 = digit * a[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm0	/ 7: mm0 = digit * a[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	movd	%mm0, %eax	/ return (cy)
	/ no emms. caller is responsible for emms
	SET_SIZE(big_mul_set_vec_sse2_r)
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ digit		20(%ebp)	%mm3
/
/ In userland, there is just the one function, big_mul_set_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_set_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_set_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.
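/
/ Roughly, variation 1 wraps the inner routine like this (illustrative
/ pseudocode only; the actual TS test, save area, and labels are in the
/ assembly below):
/
/	kpreempt_disable();
/	if (the thread owns live MMX state)
/		save %mm0-%mm4 to an 8-byte-aligned area on the stack;
/	cy = big_mul_set_vec_sse2_r(r, a, len, digit);
/	restore %mm0-%mm4, or emms and restore CR0.TS, as appropriate;
/	kpreempt_enable();
/	return (cy);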
#if defined(MMX_MANAGE)
	ENTRY(big_mul_set_vec_sse2)
	call	big_mul_set_vec_sse2_r
	call	big_mul_set_vec_sse2_r
	SET_SIZE(big_mul_set_vec_sse2)

	ENTRY(big_mul_set_vec_sse2_nsv)
	call	big_mul_set_vec_sse2_r
	SET_SIZE(big_mul_set_vec_sse2_nsv)

#else	/* !defined(MMX_MANAGE) */
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ digit		20(%ebp)	%mm3

	ENTRY(big_mul_set_vec_sse2)
	call	big_mul_set_vec_sse2_r
	SET_SIZE(big_mul_set_vec_sse2)

#endif	/* MMX_MANAGE */
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ This is strictly for internal use.
/ The interface is very light-weight.
/ All parameters are passed in registers.
/ It does not conform to the SYSV x86 ABI.
/ So, don't even think about calling this function directly from C code.
/
/ The basic multiply digit loop is unrolled 8 times.
/ Each comment is preceded by an instance number.
/ Instructions that have been moved retain their original, "natural"
/ instance number.  It should be easier this way to follow
/ the step-wise refinement process that went into constructing
/ this implementation.
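/
/ For reference, a portable C sketch of the same operation (illustrative
/ only, not part of this file):
/
/	uint32_t
/	big_mul_add_vec(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/	{
/		uint64_t p, cy = 0;
/		int i;
/
/		for (i = 0; i < len; i++) {
/			p = (uint64_t)a[i] * digit + r[i] + cy;
/			r[i] = (uint32_t)p;	/* product[31..0] */
/			cy = p >> 32;		/* product[63..32] */
/		}
/		return ((uint32_t)cy);
/	}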
	ENTRY(big_mul_add_vec_sse2_r)

	pxor	%mm0, %mm0	/ cy = 0

	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
	movd	28(%ebx), %mm1	/ 8: mm1 = a[i]
	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	movd	28(%edx), %mm2	/ 8: mm2 = r[i]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	pmuludq	%mm3, %mm1	/ 8: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 8: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 8: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 28(%edx)	/ 8: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 8: cy = product[63..32]

	leal	UNROLL32(%ebx), %ebx	/ a += UNROLL
	leal	UNROLL32(%edx), %edx	/ r += UNROLL
	subl	$UNROLL, %ecx		/ len -= UNROLL

	movd	0(%ebx), %mm1	/ 1: mm1 = a[i]
	movd	0(%edx), %mm2	/ 1: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 1: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 1: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 1: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 0(%edx)	/ 1: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 1: cy = product[63..32]

	movd	4(%ebx), %mm1	/ 2: mm1 = a[i]
	movd	4(%edx), %mm2	/ 2: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 2: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 2: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 2: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 4(%edx)	/ 2: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 2: cy = product[63..32]

	movd	8(%ebx), %mm1	/ 3: mm1 = a[i]
	movd	8(%edx), %mm2	/ 3: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 3: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 3: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 3: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 8(%edx)	/ 3: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 3: cy = product[63..32]

	movd	12(%ebx), %mm1	/ 4: mm1 = a[i]
	movd	12(%edx), %mm2	/ 4: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 4: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 4: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 4: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 12(%edx)	/ 4: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 4: cy = product[63..32]

	movd	16(%ebx), %mm1	/ 5: mm1 = a[i]
	movd	16(%edx), %mm2	/ 5: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 5: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 5: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 5: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 16(%edx)	/ 5: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 5: cy = product[63..32]

	movd	20(%ebx), %mm1	/ 6: mm1 = a[i]
	movd	20(%edx), %mm2	/ 6: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 6: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 6: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 6: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 20(%edx)	/ 6: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 6: cy = product[63..32]

	movd	24(%ebx), %mm1	/ 7: mm1 = a[i]
	movd	24(%edx), %mm2	/ 7: mm2 = r[i]
	pmuludq	%mm3, %mm1	/ 7: mm1 = digit * a[i]
	paddq	%mm1, %mm2	/ 7: mm2 = digit * a[i] + r[i]
	paddq	%mm2, %mm0	/ 7: mm0 = digit * a[i] + r[i] + cy;
	movd	%mm0, 24(%edx)	/ 7: r[i] = product[31..0]
	psrlq	$32, %mm0	/ 7: cy = product[63..32]

	/ no emms. caller is responsible for emms
	SET_SIZE(big_mul_add_vec_sse2_r)
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ digit		20(%ebp)	%mm3
/
/ In userland, there is just the one function, big_mul_add_vec_sse2().
/ But in the kernel, there are two variations:
/    1. big_mul_add_vec_sse2() which does what is necessary to save and
/       restore state, if necessary, and to ensure that preemption is
/       disabled.
/    2. big_mul_add_vec_sse2_nsv() which just does the work;
/       it is the caller's responsibility to ensure that MMX state
/       does not need to be saved and restored and that preemption
/       is already disabled.
#if defined(MMX_MANAGE)

	ENTRY(big_mul_add_vec_sse2)
	call	big_mul_add_vec_sse2_r
	call	big_mul_add_vec_sse2_r
	SET_SIZE(big_mul_add_vec_sse2)

	ENTRY(big_mul_add_vec_sse2_nsv)
	call	big_mul_add_vec_sse2_r
	SET_SIZE(big_mul_add_vec_sse2_nsv)

#else	/* !defined(MMX_MANAGE) */

	ENTRY(big_mul_add_vec_sse2)
	call	big_mul_add_vec_sse2_r
	SET_SIZE(big_mul_add_vec_sse2)

#endif	/* MMX_MANAGE */
/ big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/
/	r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
/	for (i = 1; i < blen; ++i)
/		r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
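/
/ Expanded into a self-contained C sketch (illustrative only), the driver
/ is a row-by-row schoolbook multiply: the b[0] pass initializes
/ r[0..alen], and each later pass adds one shifted row plus its carry:
/
/	void
/	big_mul_vec(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
/	{
/		int i;
/
/		r[alen] = big_mul_set_vec(r, a, alen, b[0]);
/		for (i = 1; i < blen; ++i)
/			r[alen + i] = big_mul_add_vec(r + i, a, alen, b[i]);
/	}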
#if defined(MMX_MANAGE)
	ENTRY(big_mul_vec_sse2_fc)
	ENTRY(big_mul_vec_sse2)

#if defined(MMX_MANAGE)
	call	big_mul_set_vec_sse2_nsv
	call	big_mul_set_vec_sse2
	movl	%eax, (%ebx,%edi,4)

	leal	(%ebx,%ebp,4), %eax

#if defined(MMX_MANAGE)
	call	big_mul_add_vec_sse2_nsv
	call	big_mul_add_vec_sse2
	leal	(%ebp,%edi), %ecx
	movl	%eax, (%ebx,%ecx,4)

#if defined(MMX_MANAGE)

#if defined(MMX_MANAGE)
	SET_SIZE(big_mul_vec_sse2_fc)
	SET_SIZE(big_mul_vec_sse2)

#if defined(MMX_MANAGE)

	ENTRY(big_mul_vec_sse2)
	movl	24(%ebp), %eax	/ blen
	movl	20(%ebp), %eax	/ b
	movl	16(%ebp), %eax	/ alen
	movl	12(%ebp), %eax	/ a
	movl	8(%ebp), %eax	/ r
	call	big_mul_vec_sse2_fc
	SET_SIZE(big_mul_vec_sse2)

#endif	/* MMX_MANAGE */
/ r = a * a, r and a are vectors of length len
/ Suitable only for x86 models that support SSE2 instruction set extensions
/
/ This function is not suitable for a truly general-purpose multiprecision
/ arithmetic library, because it does not work for "small" numbers, that is,
/ numbers of 1 or 2 digits.  big_mul() just uses the ordinary big_mul_vec()
/ for any small numbers.
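/
/ The overall shape of the computation: build the off-diagonal "triangle"
/ of partial products with the set/add routines, then make one pass that
/ doubles it and adds in the squared diagonal terms.  A straightforward C
/ rendering (illustrative only; the assembly below special-cases the
/ corner terms and the last triangle term instead of zeroing and looping):
/
/	void
/	big_sqr_vec(uint32_t *r, uint32_t *a, int len)
/	{
/		uint64_t d, t, cy;
/		int i;
/
/		/* triangle: r[1 .. 2*len-2] = sum of a[i] * a[j], i < j */
/		r[len] = big_mul_set_vec(&r[1], &a[1], len - 1, a[0]);
/		for (i = 1; i < len - 1; ++i)
/			r[len + i] = big_mul_add_vec(&r[2 * i + 1],
/			    &a[i + 1], len - 1 - i, a[i]);
/		r[0] = 0;
/		r[2 * len - 1] = 0;
/
/		/* r = 2 * triangle + diagonal, with carry propagation */
/		cy = 0;
/		for (i = 0; i < len; ++i) {
/			d = (uint64_t)a[i] * a[i];
/			t = 2 * (uint64_t)r[2 * i] + (uint32_t)d + cy;
/			r[2 * i] = (uint32_t)t;
/			cy = t >> 32;
/			t = 2 * (uint64_t)r[2 * i + 1] + (d >> 32) + cy;
/			r[2 * i + 1] = (uint32_t)t;
/			cy = t >> 32;
/		}
/	}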
#if defined(MMX_MANAGE)
	ENTRY(big_sqr_vec_sse2_fc)
	ENTRY(big_sqr_vec_sse2)

/ r[1..alen] = a[0] * a[1..alen-1]

	movl	8(%ebp), %edi		/ r = arg(r)
	movl	12(%ebp), %esi		/ a = arg(a)
	movl	16(%ebp), %ecx		/ cnt = arg(alen)
	movd	%ecx, %mm4		/ save_cnt = arg(alen)
	leal	4(%edi), %edx		/ dst = &r[1]
	movl	%esi, %ebx		/ src = a
	movd	0(%ebx), %mm3		/ mm3 = a[0]
	leal	4(%ebx), %ebx		/ src = &a[1]
	subl	$1, %ecx		/ --cnt
	call	big_mul_set_vec_sse2_r	/ r[1..alen-1] = a[0] * a[1..alen-1]
	movl	%edi, %edx		/ dst = r
	movl	%esi, %ebx		/ src = a
	movd	%mm4, %ecx		/ cnt = save_cnt
	movl	%eax, (%edx, %ecx, 4)	/ r[cnt] = cy

/ /* High-level vector C pseudocode */
/ for (i = 1; i < alen-1; ++i)
/	r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
/
/ /* Same thing, but slightly lower level C-like pseudocode */
/	r = &arg_r[2*i + 1];
/	r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
/
/ /* Same thing, but even lower level
/  * For example, pointers are raw pointers,
/  * with no scaling by object size.
/  */
/	r = arg_r + 12;	/* i == 1; 2i + 1 == 3; 4*3 == 12; */
/	digit = *(arg_a + 4);
/	cy = big_mul_add_vec_sse2_r();
/	*(r + 4 * cnt) = cy;
	leal	4(%edi), %edi		/ r += 4; r = &r[1]
	leal	4(%esi), %esi		/ a += 4; a = &a[1]
	movd	%mm4, %ecx		/ cnt = save
	subl	$2, %ecx		/ cnt = alen - 2; i in 1..alen-2
	movd	%ecx, %mm4		/ save_cnt
	jecxz	.L32			/ while (cnt != 0) {

	movd	0(%esi), %mm3		/ digit = a[i]
	leal	4(%esi), %esi		/ a += 4; a = &a[1]; a = &a[i + 1]
	leal	8(%edi), %edi		/ r += 8; r = &r[2]; r = &r[2 * i + 1]
	movl	%edi, %edx		/ edx = r
	movl	%esi, %ebx		/ ebx = a
	cmp	$1, %ecx		/ The last triangle term is special

	call	big_mul_add_vec_sse2_r
	movd	%mm4, %ecx		/ cnt = save_cnt
	movl	%eax, (%edi, %ecx, 4)	/ r[cnt] = cy
	subl	$1, %ecx		/ --cnt
	movd	%ecx, %mm4		/ save_cnt = cnt

	movd	0(%ebx), %mm1		/ mm1 = a[i + 1]
	movd	0(%edx), %mm2		/ mm2 = r[2 * i + 1]
	pmuludq	%mm3, %mm1		/ mm1 = p = digit * a[i + 1]
	paddq	%mm1, %mm2		/ mm2 = r[2 * i + 1] + p
	movd	%mm2, 0(%edx)		/ r[2 * i + 1] += lo32(p)
	psrlq	$32, %mm2		/ mm2 = cy
	movd	%mm2, 4(%edx)		/ r[2 * i + 2] = cy
	movd	%mm2, 8(%edx)		/ r[2 * i + 3] = 0

	movl	8(%ebp), %edx		/ r = arg(r)
	movl	12(%ebp), %ebx		/ a = arg(a)
	movl	16(%ebp), %ecx		/ cnt = arg(alen)

/ compute low-order corner
	movd	0(%ebx), %mm2		/ mm2 = a[0]
	pmuludq	%mm2, %mm2		/ mm2 = p = a[0]**2
	movd	%mm2, 0(%edx)		/ r[0] = lo32(p)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(p)

	movd	4(%edx), %mm1		/ mm1 = r[1]
	psllq	$1, %mm1		/ mm1 = p = 2 * r[1]
	paddq	%mm1, %mm2		/ mm2 = t = p + cy
	movd	%mm2, 4(%edx)		/ r[1] = low32(t)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(t)

/ r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
	subl	$2, %ecx		/ cnt = alen - 2

	movd	4(%ebx), %mm0		/ mm0 = diag = a[i+1]
	pmuludq	%mm0, %mm0		/ mm0 = p = diag**2
	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	movd	%eax, %mm1		/ mm1 = lo32(t)
	psrlq	$32, %mm2		/ mm2 = hi32(t)

	movd	8(%edx), %mm3		/ mm3 = r[2*i]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
	paddq	%mm3, %mm1		/ mm1 = 2*r[2*i] + lo32(t)
	movd	%mm1, 8(%edx)		/ r[2*i] = 2*r[2*i] + lo32(t)

	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + hi32(t)
	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
	psrlq	$32, %mm2		/ mm2 = cy
	leal	8(%edx), %edx		/ r += 2
	leal	4(%ebx), %ebx		/ ++a
	subl	$1, %ecx		/ --cnt
/ Carry from last triangle term must participate in doubling,
/ but this step isn't paired up with squaring an element
/ of the inner diagonal.
/ r[$-3..$-2] = 2 * r[$-3..$-2] + cy
	movd	8(%edx), %mm3		/ mm3 = r[2*i]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i] + cy
	movd	%mm2, 8(%edx)		/ r[2*i] = lo32(2*r[2*i] + cy)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(2*r[2*i] + cy)

	movd	12(%edx), %mm3		/ mm3 = r[2*i+1]
	psllq	$1, %mm3		/ mm3 = 2*r[2*i+1]
	paddq	%mm3, %mm2		/ mm2 = 2*r[2*i+1] + cy
	movd	%mm2, 12(%edx)		/ r[2*i+1] = mm2
	psrlq	$32, %mm2		/ mm2 = cy

/ compute high-order corner and add it in
/ p = a[alen - 1]**2
/ r[alen + alen - 2] += lo32(t)
/ r[alen + alen - 1] = cy
	movd	4(%ebx), %mm0		/ mm0 = a[$-1]
	movd	8(%edx), %mm3		/ mm3 = r[$-2]
	pmuludq	%mm0, %mm0		/ mm0 = p = a[$-1]**2
	paddq	%mm0, %mm2		/ mm2 = t = p + cy
	paddq	%mm3, %mm2		/ mm2 = r[$-2] + t
	movd	%mm2, 8(%edx)		/ r[$-2] = lo32(r[$-2] + t)
	psrlq	$32, %mm2		/ mm2 = cy = hi32(r[$-2] + t)

	movd	%mm2, 12(%edx)		/ r[$-1] += cy
#if defined(MMX_MANAGE)
	SET_SIZE(big_sqr_vec_sse2_fc)
	SET_SIZE(big_sqr_vec_sse2)

#if defined(MMX_MANAGE)
	ENTRY(big_sqr_vec_sse2)
	call	big_sqr_vec_sse2_fc
	RSTOR_MMX_0TO4(%edi)
	call	big_sqr_vec_sse2_fc
	SET_SIZE(big_sqr_vec_sse2)

#endif	/* MMX_MANAGE */
/ ------------------------------------------------------------------------
/ UMUL Implementations
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r		 8(%ebp)	%edx	%edi
/ a		12(%ebp)	%ebx	%esi
/ digit		20(%ebp)	%esi
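/
/ The loop body below corresponds to this C step (illustrative only);
/ mull leaves the 64-bit product in %edx:%eax, and the carry is folded
/ in with an add/adc pair:
/
/	p = (uint64_t)a[i] * digit;	/* mull 20(%ebp) */
/	p += cy;			/* add/adc into %edx:%eax */
/	r[i] = (uint32_t)p;		/* movl %eax, (%edi) */
/	cy = p >> 32;			/* movl %edx, %ebx */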
	ENTRY(big_mul_set_vec_umul)

	xorl	%ebx, %ebx	/ cy = 0

	movl	(%esi), %eax	/ eax = a[i]
	leal	4(%esi), %esi	/ ++a
	mull	20(%ebp)	/ edx:eax = a[i] * digit
	adcl	$0, %edx	/ edx:eax = a[i] * digit + cy
	movl	%eax, (%edi)	/ r[i] = product[31..0]
	movl	%edx, %ebx	/ cy = product[63..32]
	leal	4(%edi), %edi	/ ++r
	jnz	.L55		/ while (len != 0)

	SET_SIZE(big_mul_set_vec_umul)
/ r = r + a * digit, r and a are vectors of length len
/ returns the carry digit
/ Does not use any MMX, SSE, or SSE2 instructions.
/ Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
/ This is a fall-back implementation for x86 models that do not support
/ the PMULUDQ instruction.
/
/ big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/
/ r		 8(%ebp)	%edx	%edi
/ a		12(%ebp)	%ebx	%esi
/ digit		20(%ebp)	%esi
	ENTRY(big_mul_add_vec_umul)

	xorl	%ebx, %ebx	/ cy = 0

	movl	(%esi), %eax	/ eax = a[i]
	leal	4(%esi), %esi	/ ++a
	mull	20(%ebp)	/ edx:eax = a[i] * digit
	adcl	$0, %edx	/ edx:eax = a[i] * digit + r[i]
	adcl	$0, %edx	/ edx:eax = a[i] * digit + r[i] + cy
	movl	%eax, (%edi)	/ r[i] = product[31..0]
	movl	%edx, %ebx	/ cy = product[63..32]
	leal	4(%edi), %edi	/ ++r
	jnz	.L65		/ while (len != 0)

	SET_SIZE(big_mul_add_vec_umul)