usr/src/common/bignum/i386/bignum_i386_asm.s
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
25 #include <sys/asm_linkage.h>
26 #include <sys/x86_archext.h>
27 #include <sys/controlregs.h>
29 #if defined(__lint)
31 #include <sys/types.h>
33 uint32_t
34 bignum_use_sse2()
35 { return (0); }
37 /* Not to be called by C code */
38 /* ARGSUSED */
39 uint32_t
40 big_mul_set_vec_sse2_r()
41 { return (0); }
43 /* Not to be called by C code */
44 /* ARGSUSED */
45 uint32_t
46 big_mul_add_vec_sse2_r()
47 { return (0); }
49 /* ARGSUSED */
50 uint32_t
51 big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
52 { return (0); }
54 /* ARGSUSED */
55 uint32_t
56 big_mul_add_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t digit)
57 { return (0); }
59 /* ARGSUSED */
60 void
61 big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
62 {}
64 /* ARGSUSED */
65 void
66 big_sqr_vec_sse2(uint32_t *r, uint32_t *a, int len)
67 {}
69 #if defined(MMX_MANAGE)
71 /* ARGSUSED */
72 uint32_t
73 big_mul_set_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
74 { return (0); }
76 /* ARGSUSED */
77 uint32_t
78 big_mul_add_vec_sse2_nsv(uint32_t *r, uint32_t *a, int len, uint32_t digit)
79 { return (0); }
81 /* Not to be called by C code */
82 /* ARGSUSED */
83 void
84 big_sqr_vec_sse2_fc(uint32_t *r, uint32_t *a, int len)
85 {}
87 #endif /* MMX_MANAGE */
89 /*
90 * UMUL
91 */
94 /* ARGSUSED */
95 uint32_t
96 big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
97 { return (0); }
99 /* ARGSUSED */
100 uint32_t
101 big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
102 { return (0); }
104 #else /* __lint */
106 #if defined(MMX_MANAGE)
108 #if defined(_KERNEL)
110 #define KPREEMPT_DISABLE call kpr_disable
111 #define KPREEMPT_ENABLE call kpr_enable
112 #define TEST_TS(reg) \
113 movl %cr0, reg; \
114 clts; \
115 testl $CR0_TS, reg
117 #else /* _KERNEL */
119 #define KPREEMPT_DISABLE
120 #define KPREEMPT_ENABLE
122 #define TEST_TS(reg) \
123 movl $0, reg; \
124 testl $CR0_TS, reg
126 #endif /* _KERNEL */
128 #define MMX_SIZE 8
129 #define MMX_ALIGN 8
131 #define SAVE_MMX_PROLOG(sreg, nreg) \
132 subl $_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp; \
133 movl %esp, sreg; \
134 addl $MMX_ALIGN, sreg; \
135 andl $-1![MMX_ALIGN-1], sreg;
137 #define RSTOR_MMX_EPILOG(nreg) \
138 addl $_MUL(MMX_SIZE, nreg + MMX_ALIGN), %esp;
140 #define SAVE_MMX_0TO4(sreg) \
141 SAVE_MMX_PROLOG(sreg, 5); \
142 movq %mm0, 0(sreg); \
143 movq %mm1, 8(sreg); \
144 movq %mm2, 16(sreg); \
145 movq %mm3, 24(sreg); \
146 movq %mm4, 32(sreg)
148 #define RSTOR_MMX_0TO4(sreg) \
149 movq 0(sreg), %mm0; \
150 movq 8(sreg), %mm1; \
151 movq 16(sreg), %mm2; \
152 movq 24(sreg), %mm3; \
153 movq 32(sreg), %mm4; \
154 RSTOR_MMX_EPILOG(5)
156 #endif /* MMX_MANAGE */
158 / Note: this file contains implementations for
159 / big_mul_set_vec()
160 / big_mul_add_vec()
161 / big_mul_vec()
162 / big_sqr_vec()
163 / One set of implementations is for SSE2-capable models.
164 / The other uses no MMX, SSE, or SSE2 instructions, only
165 / the x86 32 X 32 -> 64 unsigned multiply instruction, MUL.
167 / The code for the implementations is grouped by SSE2 vs UMUL,
168 / rather than grouping pairs of implementations for each function.
169 / This is because the bignum implementation gets "imprinted"
170 / on the correct implementation, at the time of first use,
171 / so none of the code for the other implementations is ever
172 / executed. So, it is a no-brainer to lay out the code to minimize
173 / the "footprint" of executed code.
175 / Can we use SSE2 instructions? Return value is non-zero
176 / if we can.
178 / Note:
179 / Using the cpuid instruction directly would work equally
180 / well in userland and in the kernel, but we do not use the
181 / cpuid instruction in the kernel; we use x86_featureset,
182 / instead. This means we honor any decisions the kernel
183 / startup code may have made in setting this variable,
184 / including disabling SSE2. It might even be a good idea
185 / to honor this kind of setting in userland, as well, but
186 / the variable x86_featureset is not readily available to
187 / userland processes.
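/
/ For illustration only: a hedged C equivalent of the userland path
/ below, written with GCC/Clang's <cpuid.h> (this file issues CPUID
/ directly instead; bignum_use_sse2_c is a hypothetical name):
/
/	#include <cpuid.h>
/	#include <stdint.h>
/
/	uint32_t
/	bignum_use_sse2_c(void)
/	{
/		unsigned int eax, ebx, ecx, edx;
/
/		if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) == 0)
/			return (0);
/		return (edx & (1u << 26));	/* CPUID.1:EDX bit 26 == SSE2 */
/	}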
189 / uint32_t
190 / bignum_use_sse2()
192 ENTRY(bignum_use_sse2)
193 #if defined(_KERNEL)
194 xor %eax, %eax
195 bt $X86FSET_SSE2, x86_featureset
196 adc %eax, %eax
197 #else /* _KERNEL */
198 pushl %ebx
199 movl $1, %eax / Get feature information
200 cpuid
201 movl %edx, %eax / set return value
202 popl %ebx
203 andl $CPUID_INTC_EDX_SSE2, %eax
204 #endif /* _KERNEL */
205 ret
206 SET_SIZE(bignum_use_sse2)
209 / ------------------------------------------------------------------------
210 / SSE2 Implementations
211 / ------------------------------------------------------------------------
213 / r = a * digit, r and a are vectors of length len
214 / returns the carry digit
215 / Suitable only for x86 models that support SSE2 instruction set extensions
217 / uint32_t
218 / big_mul_set_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
220 / r %edx
221 / a %ebx
222 / len %ecx
223 / digit %mm3
225 / Does not touch the following registers: %esi, %edi, %mm4
227 / N.B.:
228 / This is strictly for internal use.
229 / The interface is very light-weight.
230 / All parameters are passed in registers.
231 / It does not conform to the SYSV x86 ABI.
232 / So, don't even think about calling this function directly from C code.
234 / The basic multiply digit loop is unrolled 8 times.
235 / Each comment is preceded by an instance number.
236 / Instructions that have been moved retain their original, "natural"
237 / instance number. It should be easier this way to follow
238 / the step-wise refinement process that went into constructing
239 / the final code.
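/
/ For reference, a hedged portable C version of what this routine
/ computes (mul_set_vec_ref is an illustrative name, not part of this
/ file): one 32x32->64 multiply per digit, with the carry rippled along.
/
/	uint32_t
/	mul_set_vec_ref(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/	{
/		uint64_t p, cy = 0;
/		int i;
/
/		for (i = 0; i < len; i++) {
/			p = (uint64_t)a[i] * digit + cy;
/			r[i] = (uint32_t)p;	/* product[31..0] */
/			cy = p >> 32;		/* product[63..32] */
/		}
/		return ((uint32_t)cy);
/	}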
241 #define UNROLL 8
242 #define UNROLL32 32
244 ENTRY(big_mul_set_vec_sse2_r)
245 xorl %eax, %eax / if (len == 0) return (0);
246 testl %ecx, %ecx
247 jz .L17
249 pxor %mm0, %mm0 / cy = 0
251 .L15:
252 cmpl $UNROLL, %ecx
253 jl .L16
254 movd 0(%ebx), %mm1 / 1: mm1 = a[i]
255 pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
256 paddq %mm1, %mm0 / 1: mm0 = digit * a[i] + cy;
257 movd 4(%ebx), %mm1 / 2: mm1 = a[i]
258 movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
259 psrlq $32, %mm0 / 1: cy = product[63..32]
261 pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
262 paddq %mm1, %mm0 / 2: mm0 = digit * a[i] + cy;
263 movd 8(%ebx), %mm1 / 3: mm1 = a[i]
264 movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
265 psrlq $32, %mm0 / 2: cy = product[63..32]
267 pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
268 paddq %mm1, %mm0 / 3: mm0 = digit * a[i] + cy;
269 movd 12(%ebx), %mm1 / 4: mm1 = a[i]
270 movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
271 psrlq $32, %mm0 / 3: cy = product[63..32]
273 pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
274 paddq %mm1, %mm0 / 4: mm0 = digit * a[i] + cy;
275 movd 16(%ebx), %mm1 / 5: mm1 = a[i]
276 movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
277 psrlq $32, %mm0 / 4: cy = product[63..32]
279 pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
280 paddq %mm1, %mm0 / 5: mm0 = digit * a[i] + cy;
281 movd 20(%ebx), %mm1 / 6: mm1 = a[i]
282 movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
283 psrlq $32, %mm0 / 5: cy = product[63..32]
285 pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
286 paddq %mm1, %mm0 / 6: mm0 = digit * a[i] + cy;
287 movd 24(%ebx), %mm1 / 7: mm1 = a[i]
288 movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
289 psrlq $32, %mm0 / 6: cy = product[63..32]
291 pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
292 paddq %mm1, %mm0 / 7: mm0 = digit * a[i] + cy;
293 movd 28(%ebx), %mm1 / 8: mm1 = a[i]
294 movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
295 psrlq $32, %mm0 / 7: cy = product[63..32]
297 pmuludq %mm3, %mm1 / 8: mm1 = digit * a[i]
298 paddq %mm1, %mm0 / 8: mm0 = digit * a[i] + cy;
299 movd %mm0, 28(%edx) / 8: r[i] = product[31..0]
300 psrlq $32, %mm0 / 8: cy = product[63..32]
302 leal UNROLL32(%ebx), %ebx / a += UNROLL
303 leal UNROLL32(%edx), %edx / r += UNROLL
304 subl $UNROLL, %ecx / len -= UNROLL
305 jz .L17
306 jmp .L15
308 .L16:
309 movd 0(%ebx), %mm1 / 1: mm1 = a[i]
310 pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
311 paddq %mm1, %mm0 / 1: mm0 = digit * a[i] + cy;
312 movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
313 psrlq $32, %mm0 / 1: cy = product[63..32]
314 subl $1, %ecx
315 jz .L17
317 movd 4(%ebx), %mm1 / 2: mm1 = a[i]
318 pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
319 paddq %mm1, %mm0 / 2: mm0 = digit * a[i] + cy;
320 movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
321 psrlq $32, %mm0 / 2: cy = product[63..32]
322 subl $1, %ecx
323 jz .L17
325 movd 8(%ebx), %mm1 / 3: mm1 = a[i]
326 pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
327 paddq %mm1, %mm0 / 3: mm0 = digit * a[i] + cy;
328 movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
329 psrlq $32, %mm0 / 3: cy = product[63..32]
330 subl $1, %ecx
331 jz .L17
333 movd 12(%ebx), %mm1 / 4: mm1 = a[i]
334 pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
335 paddq %mm1, %mm0 / 4: mm0 = digit * a[i] + cy;
336 movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
337 psrlq $32, %mm0 / 4: cy = product[63..32]
338 subl $1, %ecx
339 jz .L17
341 movd 16(%ebx), %mm1 / 5: mm1 = a[i]
342 pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
343 paddq %mm1, %mm0 / 5: mm0 = digit * a[i] + cy;
344 movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
345 psrlq $32, %mm0 / 5: cy = product[63..32]
346 subl $1, %ecx
347 jz .L17
349 movd 20(%ebx), %mm1 / 6: mm1 = a[i]
350 pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
351 paddq %mm1, %mm0 / 6: mm0 = digit * a[i] + cy;
352 movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
353 psrlq $32, %mm0 / 6: cy = product[63..32]
354 subl $1, %ecx
355 jz .L17
357 movd 24(%ebx), %mm1 / 7: mm1 = a[i]
358 pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
359 paddq %mm1, %mm0 / 7: mm0 = digit * a[i] + cy;
360 movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
361 psrlq $32, %mm0 / 7: cy = product[63..32]
363 .L17:
364 movd %mm0, %eax / return (cy)
365 / no emms. caller is responsible for emms
366 ret
367 SET_SIZE(big_mul_set_vec_sse2_r)
370 / r = a * digit, r and a are vectors of length len
371 / returns the carry digit
372 / Suitable only for x86 models that support SSE2 instruction set extensions
374 / r 8(%ebp) %edx
375 / a 12(%ebp) %ebx
376 / len 16(%ebp) %ecx
377 / digit 20(%ebp) %mm3
379 / In userland, there is just the one function, big_mul_set_vec_sse2().
380 / But in the kernel, there are two variations:
381 / 1. big_mul_set_vec_sse2() which does what is necessary to save and
382 / restore state, if necessary, and to ensure that preemption is
383 / disabled.
384 / 2. big_mul_set_vec_sse2_nsv() which just does the work;
385 / it is the caller's responsibility to ensure that MMX state
386 / does not need to be saved and restored and that preemption
387 / is already disabled.
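/
/ In outline, the kernel wrapper below behaves like this hedged C-style
/ pseudocode.  kpr_disable()/kpr_enable() and the CR0.TS test are what
/ the code really uses; read_cr0/write_cr0/clts and the save_area
/ helpers are illustrative stand-ins for the inline assembly.
/
/	uint32_t
/	big_mul_set_vec_sse2(uint32_t *r, uint32_t *a, int len, uint32_t d)
/	{
/		uint32_t cy, cr0;
/
/		kpr_disable();		/* no preemption while MMX is live */
/		cr0 = read_cr0();
/		clts();			/* clear CR0.TS so MMX won't trap */
/		if ((cr0 & CR0_TS) == 0)
/			save_mmx_regs(save_area);  /* live state: save it */
/		cy = big_mul_set_vec_sse2_r(r, a, len, d);
/		if ((cr0 & CR0_TS) == 0)
/			restore_mmx_regs(save_area);
/		emms();
/		write_cr0(cr0);		/* put TS back as we found it */
/		kpr_enable();
/		return (cy);
/	}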
389 #if defined(MMX_MANAGE)
390 ENTRY(big_mul_set_vec_sse2)
391 pushl %ebp
392 movl %esp, %ebp
393 pushl %ebx
394 pushl %esi
395 KPREEMPT_DISABLE
396 TEST_TS(%ebx)
397 pushl %ebx
398 jnz .setvec_no_save
399 pushl %edi
400 SAVE_MMX_0TO4(%edi)
401 movl 8(%ebp), %edx
402 movl 12(%ebp), %ebx
403 movl 16(%ebp), %ecx
404 movd 20(%ebp), %mm3
405 call big_mul_set_vec_sse2_r
406 movl %eax, %esi
407 RSTOR_MMX_0TO4(%edi)
408 popl %edi
409 jmp .setvec_rtn
411 .setvec_no_save:
412 movl 8(%ebp), %edx
413 movl 12(%ebp), %ebx
414 movl 16(%ebp), %ecx
415 movd 20(%ebp), %mm3
416 call big_mul_set_vec_sse2_r
417 movl %eax, %esi
419 .setvec_rtn:
420 emms
421 popl %ebx
422 movl %ebx, %cr0
423 KPREEMPT_ENABLE
424 movl %esi, %eax
425 popl %esi
426 popl %ebx
427 leave
428 ret
429 SET_SIZE(big_mul_set_vec_sse2)
431 ENTRY(big_mul_set_vec_sse2_nsv)
432 pushl %ebp
433 movl %esp, %ebp
434 pushl %ebx
435 movl 8(%ebp), %edx
436 movl 12(%ebp), %ebx
437 movl 16(%ebp), %ecx
438 movd 20(%ebp), %mm3
439 call big_mul_set_vec_sse2_r
440 popl %ebx
441 leave
442 ret
443 SET_SIZE(big_mul_set_vec_sse2_nsv)
445 #else /* !defined(MMX_MANAGE) */
447 / r = a * digit, r and a are vectors of length len
448 / returns the carry digit
449 / Suitable only for x86 models that support SSE2 instruction set extensions
451 / r 8(%ebp) %edx
452 / a 12(%ebp) %ebx
453 / len 16(%ebp) %ecx
454 / digit 20(%ebp) %mm3
456 ENTRY(big_mul_set_vec_sse2)
457 pushl %ebp
458 movl %esp, %ebp
459 pushl %ebx
460 movl 8(%ebp), %edx
461 movl 12(%ebp), %ebx
462 movl 16(%ebp), %ecx
463 movd 20(%ebp), %mm3
464 call big_mul_set_vec_sse2_r
465 popl %ebx
466 emms
467 leave
468 ret
469 SET_SIZE(big_mul_set_vec_sse2)
471 #endif /* MMX_MANAGE */
474 / r = r + a * digit, r and a are vectors of length len
475 / returns the carry digit
476 / Suitable only for x86 models that support SSE2 instruction set extensions
478 / uint32_t
479 / big_mul_add_vec_sse2_r(uint32_t *r, uint32_t *a, int len, uint32_t digit)
481 / r %edx
482 / a %ebx
483 / len %ecx
484 / digit %mm3
486 / N.B.:
487 / This is strictly for internal use.
488 / The interface is very light-weight.
489 / All parameters are passed in registers.
490 / It does not conform to the SYSV x86 ABI.
491 / So, don't even think about calling this function directly from C code.
493 / The basic multiply digit loop is unrolled 8 times.
494 / Each comment is preceded by an instance number.
495 / Instructions that have been moved retain their original, "natural"
496 / instance number. It should be easier this way to follow
497 / the step-wise refinement process that went into constructing
498 / the final code.
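/
/ For reference, a hedged portable C version of what this routine
/ computes (mul_add_vec_ref is an illustrative name).  Note that
/ digit * a[i] + r[i] + cy cannot overflow 64 bits:
/ (2^32 - 1)^2 + 2 * (2^32 - 1) == 2^64 - 1.
/
/	uint32_t
/	mul_add_vec_ref(uint32_t *r, uint32_t *a, int len, uint32_t digit)
/	{
/		uint64_t p, cy = 0;
/		int i;
/
/		for (i = 0; i < len; i++) {
/			p = (uint64_t)a[i] * digit + r[i] + cy;
/			r[i] = (uint32_t)p;	/* product[31..0] */
/			cy = p >> 32;		/* product[63..32] */
/		}
/		return ((uint32_t)cy);
/	}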
500 ENTRY(big_mul_add_vec_sse2_r)
501 xorl %eax, %eax
502 testl %ecx, %ecx
503 jz .L27
505 pxor %mm0, %mm0 / cy = 0
507 .L25:
508 cmpl $UNROLL, %ecx
509 jl .L26
510 movd 0(%ebx), %mm1 / 1: mm1 = a[i]
511 movd 0(%edx), %mm2 / 1: mm2 = r[i]
512 pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
513 paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i]
514 movd 4(%ebx), %mm1 / 2: mm1 = a[i]
515 paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy;
516 movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
517 movd 4(%edx), %mm2 / 2: mm2 = r[i]
518 psrlq $32, %mm0 / 1: cy = product[63..32]
520 pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
521 paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i]
522 movd 8(%ebx), %mm1 / 3: mm1 = a[i]
523 paddq %mm2, %mm0 / 2: mm0 = digit * a[i] + r[i] + cy;
524 movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
525 movd 8(%edx), %mm2 / 3: mm2 = r[i]
526 psrlq $32, %mm0 / 2: cy = product[63..32]
528 pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
529 paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i]
530 movd 12(%ebx), %mm1 / 4: mm1 = a[i]
531 paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy;
532 movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
533 movd 12(%edx), %mm2 / 4: mm2 = r[i]
534 psrlq $32, %mm0 / 3: cy = product[63..32]
536 pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
537 paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i]
538 movd 16(%ebx), %mm1 / 5: mm1 = a[i]
539 paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy;
540 movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
541 movd 16(%edx), %mm2 / 5: mm2 = r[i]
542 psrlq $32, %mm0 / 4: cy = product[63..32]
544 pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
545 paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i]
546 movd 20(%ebx), %mm1 / 6: mm1 = a[i]
547 paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy;
548 movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
549 movd 20(%edx), %mm2 / 6: mm2 = r[i]
550 psrlq $32, %mm0 / 5: cy = product[63..32]
552 pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
553 paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i]
554 movd 24(%ebx), %mm1 / 7: mm1 = a[i]
555 paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy;
556 movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
557 movd 24(%edx), %mm2 / 7: mm2 = r[i]
558 psrlq $32, %mm0 / 6: cy = product[63..32]
560 pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
561 paddq %mm1, %mm2 / 7: mm2 = digit * a[i] + r[i]
562 movd 28(%ebx), %mm1 / 8: mm1 = a[i]
563 paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy;
564 movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
565 movd 28(%edx), %mm2 / 8: mm2 = r[i]
566 psrlq $32, %mm0 / 7: cy = product[63..32]
568 pmuludq %mm3, %mm1 / 8: mm1 = digit * a[i]
569 paddq %mm1, %mm2 / 8: mm2 = digit * a[i] + r[i]
570 paddq %mm2, %mm0 / 8: mm0 = digit * a[i] + r[i] + cy;
571 movd %mm0, 28(%edx) / 8: r[i] = product[31..0]
572 psrlq $32, %mm0 / 8: cy = product[63..32]
574 leal UNROLL32(%ebx), %ebx / a += UNROLL
575 leal UNROLL32(%edx), %edx / r += UNROLL
576 subl $UNROLL, %ecx / len -= UNROLL
577 jz .L27
578 jmp .L25
580 .L26:
581 movd 0(%ebx), %mm1 / 1: mm1 = a[i]
582 movd 0(%edx), %mm2 / 1: mm2 = r[i]
583 pmuludq %mm3, %mm1 / 1: mm1 = digit * a[i]
584 paddq %mm1, %mm2 / 1: mm2 = digit * a[i] + r[i]
585 paddq %mm2, %mm0 / 1: mm0 = digit * a[i] + r[i] + cy;
586 movd %mm0, 0(%edx) / 1: r[i] = product[31..0]
587 psrlq $32, %mm0 / 1: cy = product[63..32]
588 subl $1, %ecx
589 jz .L27
591 movd 4(%ebx), %mm1 / 2: mm1 = a[i]
592 movd 4(%edx), %mm2 / 2: mm2 = r[i]
593 pmuludq %mm3, %mm1 / 2: mm1 = digit * a[i]
594 paddq %mm1, %mm2 / 2: mm2 = digit * a[i] + r[i]
595 paddq %mm2, %mm0 / 2: mm0 = digit * a[i] + r[i] + cy;
596 movd %mm0, 4(%edx) / 2: r[i] = product[31..0]
597 psrlq $32, %mm0 / 2: cy = product[63..32]
598 subl $1, %ecx
599 jz .L27
601 movd 8(%ebx), %mm1 / 3: mm1 = a[i]
602 movd 8(%edx), %mm2 / 3: mm2 = r[i]
603 pmuludq %mm3, %mm1 / 3: mm1 = digit * a[i]
604 paddq %mm1, %mm2 / 3: mm2 = digit * a[i] + r[i]
605 paddq %mm2, %mm0 / 3: mm0 = digit * a[i] + r[i] + cy;
606 movd %mm0, 8(%edx) / 3: r[i] = product[31..0]
607 psrlq $32, %mm0 / 3: cy = product[63..32]
608 subl $1, %ecx
609 jz .L27
611 movd 12(%ebx), %mm1 / 4: mm1 = a[i]
612 movd 12(%edx), %mm2 / 4: mm2 = r[i]
613 pmuludq %mm3, %mm1 / 4: mm1 = digit * a[i]
614 paddq %mm1, %mm2 / 4: mm2 = digit * a[i] + r[i]
615 paddq %mm2, %mm0 / 4: mm0 = digit * a[i] + r[i] + cy;
616 movd %mm0, 12(%edx) / 4: r[i] = product[31..0]
617 psrlq $32, %mm0 / 4: cy = product[63..32]
618 subl $1, %ecx
619 jz .L27
621 movd 16(%ebx), %mm1 / 5: mm1 = a[i]
622 movd 16(%edx), %mm2 / 5: mm2 = r[i]
623 pmuludq %mm3, %mm1 / 5: mm1 = digit * a[i]
624 paddq %mm1, %mm2 / 5: mm2 = digit * a[i] + r[i]
625 paddq %mm2, %mm0 / 5: mm0 = digit * a[i] + r[i] + cy;
626 movd %mm0, 16(%edx) / 5: r[i] = product[31..0]
627 psrlq $32, %mm0 / 5: cy = product[63..32]
628 subl $1, %ecx
629 jz .L27
631 movd 20(%ebx), %mm1 / 6: mm1 = a[i]
632 movd 20(%edx), %mm2 / 6: mm2 = r[i]
633 pmuludq %mm3, %mm1 / 6: mm1 = digit * a[i]
634 paddq %mm1, %mm2 / 6: mm2 = digit * a[i] + r[i]
635 paddq %mm2, %mm0 / 6: mm0 = digit * a[i] + r[i] + cy;
636 movd %mm0, 20(%edx) / 6: r[i] = product[31..0]
637 psrlq $32, %mm0 / 6: cy = product[63..32]
638 subl $1, %ecx
639 jz .L27
641 movd 24(%ebx), %mm1 / 7: mm1 = a[i]
642 movd 24(%edx), %mm2 / 7: mm2 = r[i]
643 pmuludq %mm3, %mm1 / 7: mm1 = digit * a[i]
644 paddq %mm1, %mm2 / 7: mm2 = digit * a[i] + r[i]
645 paddq %mm2, %mm0 / 7: mm0 = digit * a[i] + r[i] + cy;
646 movd %mm0, 24(%edx) / 7: r[i] = product[31..0]
647 psrlq $32, %mm0 / 7: cy = product[63..32]
649 .L27:
650 movd %mm0, %eax
651 / no emms. caller is responsible for emms
652 ret
653 SET_SIZE(big_mul_add_vec_sse2_r)
656 / r = r + a * digit, r and a are vectors of length len
657 / returns the carry digit
658 / Suitable only for x86 models that support SSE2 instruction set extensions
660 / r 8(%ebp) %edx
661 / a 12(%ebp) %ebx
662 / len 16(%ebp) %ecx
663 / digit 20(%ebp) %mm3
665 / In userland, there is just the one function, big_mul_add_vec_sse2().
666 / But in the kernel, there are two variations:
667 / 1. big_mul_add_vec_sse2() which does what is necessary to save and
668 / restore state, if necessary, and to ensure that preemption is
669 / disabled.
670 / 2. big_mul_add_vec_sse2_nsv() which just does the work;
671 / it is the caller's responsibility to ensure that MMX state
672 / does not need to be saved and restored and that preemption
673 / is already disabled.
676 #if defined(MMX_MANAGE)
678 ENTRY(big_mul_add_vec_sse2)
679 pushl %ebp
680 movl %esp, %ebp
681 pushl %ebx
682 pushl %esi
683 KPREEMPT_DISABLE
684 TEST_TS(%ebx)
685 pushl %ebx
686 jnz .addvec_no_save
687 pushl %edi
688 SAVE_MMX_0TO4(%edi)
689 movl 8(%ebp), %edx
690 movl 12(%ebp), %ebx
691 movl 16(%ebp), %ecx
692 movd 20(%ebp), %mm3
693 call big_mul_add_vec_sse2_r
694 movl %eax, %esi
695 RSTOR_MMX_0TO4(%edi)
696 popl %edi
697 jmp .addvec_rtn
699 .addvec_no_save:
700 movl 8(%ebp), %edx
701 movl 12(%ebp), %ebx
702 movl 16(%ebp), %ecx
703 movd 20(%ebp), %mm3
704 call big_mul_add_vec_sse2_r
705 movl %eax, %esi
707 .addvec_rtn:
708 emms
709 popl %ebx
710 movl %ebx, %cr0
711 KPREEMPT_ENABLE
712 movl %esi, %eax
713 popl %esi
714 popl %ebx
715 leave
716 ret
717 SET_SIZE(big_mul_add_vec_sse2)
719 ENTRY(big_mul_add_vec_sse2_nsv)
720 pushl %ebp
721 movl %esp, %ebp
722 pushl %ebx
723 movl 8(%ebp), %edx
724 movl 12(%ebp), %ebx
725 movl 16(%ebp), %ecx
726 movd 20(%ebp), %mm3
727 call big_mul_add_vec_sse2_r
728 popl %ebx
729 leave
730 ret
731 SET_SIZE(big_mul_add_vec_sse2_nsv)
734 #else /* !defined(MMX_MANAGE) */
736 ENTRY(big_mul_add_vec_sse2)
737 pushl %ebp
738 movl %esp, %ebp
739 pushl %ebx
740 movl 8(%ebp), %edx
741 movl 12(%ebp), %ebx
742 movl 16(%ebp), %ecx
743 movd 20(%ebp), %mm3
744 call big_mul_add_vec_sse2_r
745 popl %ebx
746 emms
747 leave
748 ret
749 SET_SIZE(big_mul_add_vec_sse2)
751 #endif /* MMX_MANAGE */
754 / void
755 / big_mul_vec_sse2(uint32_t *r, uint32_t *a, int alen, uint32_t *b, int blen)
757 / int i;
759 / r[alen] = big_mul_set_vec_sse2(r, a, alen, b[0]);
760 / for (i = 1; i < blen; ++i)
761 / r[alen + i] = big_mul_add_vec_sse2(r+i, a, alen, b[i]);
765 #if defined(MMX_MANAGE)
766 ENTRY(big_mul_vec_sse2_fc)
767 #else
768 ENTRY(big_mul_vec_sse2)
769 #endif
770 subl $0x8, %esp
771 pushl %ebx
772 pushl %ebp
773 pushl %esi
774 pushl %edi
775 movl 40(%esp), %eax
776 movl %eax, 20(%esp)
777 pushl (%eax)
778 movl 40(%esp), %edi
779 pushl %edi
780 movl 40(%esp), %esi
781 pushl %esi
782 movl 40(%esp), %ebx
783 pushl %ebx
784 #if defined(MMX_MANAGE)
785 call big_mul_set_vec_sse2_nsv
786 #else
787 call big_mul_set_vec_sse2
788 #endif
789 addl $0x10, %esp
790 movl %eax, (%ebx,%edi,4)
791 movl 44(%esp), %eax
792 movl %eax, 16(%esp)
793 cmpl $0x1, %eax
794 jle .mulvec_rtn
795 movl $0x1, %ebp
797 .align 16
798 .mulvec_add:
799 movl 20(%esp), %eax
800 pushl (%eax,%ebp,4)
801 pushl %edi
802 pushl %esi
803 leal (%ebx,%ebp,4), %eax
804 pushl %eax
805 #if defined(MMX_MANAGE)
806 call big_mul_add_vec_sse2_nsv
807 #else
808 call big_mul_add_vec_sse2
809 #endif
810 addl $0x10, %esp
811 leal (%ebp,%edi), %ecx
812 movl %eax, (%ebx,%ecx,4)
813 incl %ebp
814 cmpl 16(%esp), %ebp
815 jl .mulvec_add
816 .mulvec_rtn:
817 #if defined(MMX_MANAGE)
818 emms
819 #endif
820 popl %edi
821 popl %esi
822 popl %ebp
823 popl %ebx
824 addl $0x8, %esp
825 ret
826 #if defined(MMX_MANAGE)
827 SET_SIZE(big_mul_vec_sse2_fc)
828 #else
829 SET_SIZE(big_mul_vec_sse2)
830 #endif
832 #if defined(MMX_MANAGE)
834 ENTRY(big_mul_vec_sse2)
835 pushl %ebp
836 movl %esp, %ebp
837 subl $8, %esp
838 pushl %edi
839 KPREEMPT_DISABLE
840 TEST_TS(%eax)
841 movl %eax, -8(%ebp)
842 jnz .mulvec_no_save
843 SAVE_MMX_0TO4(%edi)
844 movl %edi, -4(%ebp)
845 .mulvec_no_save:
846 movl 24(%ebp), %eax / blen
847 pushl %eax
848 movl 20(%ebp), %eax / b
849 pushl %eax
850 movl 16(%ebp), %eax / alen
851 pushl %eax
852 movl 12(%ebp), %eax / a
853 pushl %eax
854 movl 8(%ebp), %eax / r
855 pushl %eax
856 call big_mul_vec_sse2_fc
857 addl $20, %esp
858 movl -8(%ebp), %eax
859 testl $CR0_TS, %eax
860 jnz .mulvec_no_rstr
861 movl -4(%ebp), %edi
862 RSTOR_MMX_0TO4(%edi)
863 .mulvec_no_rstr:
864 movl %eax, %cr0
865 KPREEMPT_ENABLE
866 popl %edi
867 leave
868 ret
869 SET_SIZE(big_mul_vec_sse2)
871 #endif /* MMX_MANAGE */
875 #undef UNROLL
876 #undef UNROLL32
879 / r = a * a, r and a are vectors of length len
880 / Suitable only for x86 models that support SSE2 instruction set extensions
882 / This function is not suitable for a truly general-purpose multiprecision
883 / arithmetic library, because it does not work for "small" numbers, that is,
884 / numbers of 1 or 2 digits. big_mul() just uses the ordinary big_mul_vec()
885 / for any small numbers.
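/
/ At a high level, the routine below computes the same thing as this
/ hedged C sketch (sqr_vec_ref is an illustrative name; mul_set_vec_ref
/ and mul_add_vec_ref are the reference routines sketched earlier).  The
/ real code interleaves the doubling with adding the diagonal squares
/ rather than making a separate pass over r.
/
/	void
/	sqr_vec_ref(uint32_t *r, uint32_t *a, int len)
/	{
/		uint64_t p, cy;
/		int i;
/
/		/* off-diagonal (triangle) products a[i] * a[j], i < j */
/		r[0] = 0;
/		r[len] = mul_set_vec_ref(&r[1], &a[1], len - 1, a[0]);
/		for (i = 1; i < len - 1; i++)
/			r[len + i] = mul_add_vec_ref(&r[2 * i + 1],
/			    &a[i + 1], len - 1 - i, a[i]);
/		r[2 * len - 1] = 0;
/
/		/* r = 2 * r + diagonal squares a[i] * a[i], with carry */
/		cy = 0;
/		for (i = 0; i < len; i++) {
/			p = (uint64_t)a[i] * a[i];
/			cy += 2 * (uint64_t)r[2 * i] + (uint32_t)p;
/			r[2 * i] = (uint32_t)cy;
/			cy = (cy >> 32) + 2 * (uint64_t)r[2 * i + 1] +
/			    (p >> 32);
/			r[2 * i + 1] = (uint32_t)cy;
/			cy >>= 32;
/		}
/	}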
887 #if defined(MMX_MANAGE)
888 ENTRY(big_sqr_vec_sse2_fc)
889 #else
890 ENTRY(big_sqr_vec_sse2)
891 pushl %ebp
892 movl %esp, %ebp
893 #endif
895 pushl %ebx
896 pushl %edi
897 pushl %esi
899 / r[1..alen] = a[0] * a[1..alen-1]
901 movl 8(%ebp), %edi / r = arg(r)
902 movl 12(%ebp), %esi / a = arg(a)
903 movl 16(%ebp), %ecx / cnt = arg(alen)
904 movd %ecx, %mm4 / save_cnt = arg(alen)
905 leal 4(%edi), %edx / dst = &r[1]
906 movl %esi, %ebx / src = a
907 movd 0(%ebx), %mm3 / mm3 = a[0]
908 leal 4(%ebx), %ebx / src = &a[1]
909 subl $1, %ecx / --cnt
910 call big_mul_set_vec_sse2_r / r[1..alen-1] = a[0] * a[1..alen-1]
911 movl %edi, %edx / dst = r
912 movl %esi, %ebx / src = a
913 movd %mm4, %ecx / cnt = save_cnt
914 movl %eax, (%edx, %ecx, 4) / r[cnt] = cy
916 / /* High-level vector C pseudocode */
917 / for (i = 1; i < alen-1; ++i)
918 / r[2*i + 1 ... ] += a[i] * a[i+1 .. alen-1]
920 / /* Same thing, but slightly lower level C-like pseudocode */
921 / i = 1;
922 / r = &arg_r[2*i + 1];
923 / a = &arg_a[i + 1];
924 / digit = arg_a[i];
925 / cnt = alen - 3;
926 / while (cnt != 0) {
927 / r[cnt] = big_mul_add_vec_sse2_r(r, a, cnt, digit);
928 / r += 2;
929 / ++a;
930 / --cnt
931 / }
933 / /* Same thing, but even lower level
934 / * For example, pointers are raw pointers,
935 / * with no scaling by object size.
936 / */
937 / r = arg_r + 12; /* i == 1; 2i + 1 == 3; 4*3 == 12; */
938 / a = arg_a + 8;
939 / digit = *(arg_a + 4);
940 / cnt = alen - 3;
941 / while (cnt != 0) {
942 / cy = big_mul_add_vec_sse2_r();
943 / *(r + 4 * cnt) = cy;
944 / r += 8;
945 / a += 4;
946 / --cnt
947 / }
949 leal 4(%edi), %edi / r += 4; r = &r[1]
950 leal 4(%esi), %esi / a += 4; a = &a[1]
951 movd %mm4, %ecx / cnt = save
952 subl $2, %ecx / cnt = alen - 2; i in 1..alen-2
953 movd %ecx, %mm4 / save_cnt
954 jecxz .L32 / while (cnt != 0) {
955 .L31:
956 movd 0(%esi), %mm3 / digit = a[i]
957 leal 4(%esi), %esi / a += 4; a = &a[1]; a = &a[i + 1]
958 leal 8(%edi), %edi / r += 8; r = &r[2]; r = &r[2 * i + 1]
959 movl %edi, %edx / edx = r
960 movl %esi, %ebx / ebx = a
961 cmp $1, %ecx / The last triangle term is special
962 jz .L32
963 call big_mul_add_vec_sse2_r
964 movd %mm4, %ecx / cnt = save_cnt
965 movl %eax, (%edi, %ecx, 4) / r[cnt] = cy
966 subl $1, %ecx / --cnt
967 movd %ecx, %mm4 / save_cnt = cnt
968 jmp .L31 / }
970 .L32:
971 movd 0(%ebx), %mm1 / mm1 = a[i + 1]
972 movd 0(%edx), %mm2 / mm2 = r[2 * i + 1]
973 pmuludq %mm3, %mm1 / mm1 = p = digit * a[i + 1]
974 paddq %mm1, %mm2 / mm2 = r[2 * i + 1] + p
975 movd %mm2, 0(%edx) / r[2 * i + 1] += lo32(p)
976 psrlq $32, %mm2 / mm2 = cy
977 movd %mm2, 4(%edx) / r[2 * i + 2] = cy
978 pxor %mm2, %mm2
979 movd %mm2, 8(%edx) / r[2 * i + 3] = 0
981 movl 8(%ebp), %edx / r = arg(r)
982 movl 12(%ebp), %ebx / a = arg(a)
983 movl 16(%ebp), %ecx / cnt = arg(alen)
985 / compute low-order corner
986 / p = a[0]**2
987 / r[0] = lo32(p)
988 / cy = hi32(p)
989 movd 0(%ebx), %mm2 / mm2 = a[0]
990 pmuludq %mm2, %mm2 / mm2 = p = a[0]**2
991 movd %mm2, 0(%edx) / r[0] = lo32(p)
992 psrlq $32, %mm2 / mm2 = cy = hi32(p)
994 / p = 2 * r[1]
995 / t = p + cy
996 / r[1] = lo32(t)
997 / cy = hi32(t)
998 movd 4(%edx), %mm1 / mm1 = r[1]
999 psllq $1, %mm1 / mm1 = p = 2 * r[1]
1000 paddq %mm1, %mm2 / mm2 = t = p + cy
1001 movd %mm2, 4(%edx) / r[1] = low32(t)
1002 psrlq $32, %mm2 / mm2 = cy = hi32(t)
1004 / r[2..$-3] = inner_diagonal[*]**2 + 2 * r[2..$-3]
1005 subl $2, %ecx / cnt = alen - 2
1006 .L34:
1007 movd 4(%ebx), %mm0 / mm0 = diag = a[i+1]
1008 pmuludq %mm0, %mm0 / mm0 = p = diag**2
1009 paddq %mm0, %mm2 / mm2 = t = p + cy
1010 movd %mm2, %eax
1011 movd %eax, %mm1 / mm1 = lo32(t)
1012 psrlq $32, %mm2 / mm2 = hi32(t)
1014 movd 8(%edx), %mm3 / mm3 = r[2*i]
1015 psllq $1, %mm3 / mm3 = 2*r[2*i]
1016 paddq %mm3, %mm1 / mm1 = 2*r[2*i] + lo32(t)
1017 movd %mm1, 8(%edx) / r[2*i] = 2*r[2*i] + lo32(t)
1018 psrlq $32, %mm1
1019 paddq %mm1, %mm2
1021 movd 12(%edx), %mm3 / mm3 = r[2*i+1]
1022 psllq $1, %mm3 / mm3 = 2*r[2*i+1]
1023 paddq %mm3, %mm2 / mm2 = 2*r[2*i+1] + hi32(t)
1024 movd %mm2, 12(%edx) / r[2*i+1] = mm2
1025 psrlq $32, %mm2 / mm2 = cy
1026 leal 8(%edx), %edx / r += 2
1027 leal 4(%ebx), %ebx / ++a
1028 subl $1, %ecx / --cnt
1029 jnz .L34
1031 / Carry from last triangle term must participate in doubling,
1032 / but this step isn't paired up with squaring an element
1033 / of the inner diagonal.
1034 / r[$-3..$-2] = 2 * r[$-3..$-2] + cy
1035 movd 8(%edx), %mm3 / mm3 = r[2*i]
1036 psllq $1, %mm3 / mm3 = 2*r[2*i]
1037 paddq %mm3, %mm2 / mm2 = 2*r[2*i] + cy
1038 movd %mm2, 8(%edx) / r[2*i] = lo32(2*r[2*i] + cy)
1039 psrlq $32, %mm2 / mm2 = cy = hi32(2*r[2*i] + cy)
1041 movd 12(%edx), %mm3 / mm3 = r[2*i+1]
1042 psllq $1, %mm3 / mm3 = 2*r[2*i+1]
1043 paddq %mm3, %mm2 / mm2 = 2*r[2*i+1] + cy
1044 movd %mm2, 12(%edx) / r[2*i+1] = mm2
1045 psrlq $32, %mm2 / mm2 = cy
1047 / compute high-order corner and add it in
1048 / p = a[alen - 1]**2
1049 / t = p + cy
1050 / r[alen + alen - 2] += lo32(t)
1051 / cy = hi32(t)
1052 / r[alen + alen - 1] = cy
1053 movd 4(%ebx), %mm0 / mm0 = a[$-1]
1054 movd 8(%edx), %mm3 / mm3 = r[$-2]
1055 pmuludq %mm0, %mm0 / mm0 = p = a[$-1]**2
1056 paddq %mm0, %mm2 / mm2 = t = p + cy
1057 paddq %mm3, %mm2 / mm2 = r[$-2] + t
1058 movd %mm2, 8(%edx) / r[$-2] = lo32(r[$-2] + t)
1059 psrlq $32, %mm2 / mm2 = cy = hi32(r[$-2] + t)
1060 movd 12(%edx), %mm3
1061 paddq %mm3, %mm2
1062 movd %mm2, 12(%edx) / r[$-1] += cy
1064 .L35:
1065 emms
1066 popl %esi
1067 popl %edi
1068 popl %ebx
1070 #if defined(MMX_MANAGE)
1071 ret
1072 SET_SIZE(big_sqr_vec_sse2_fc)
1073 #else
1074 leave
1075 ret
1076 SET_SIZE(big_sqr_vec_sse2)
1077 #endif
1080 #if defined(MMX_MANAGE)
1081 ENTRY(big_sqr_vec_sse2)
1082 pushl %ebp
1083 movl %esp, %ebp
1084 KPREEMPT_DISABLE
1085 TEST_TS(%ebx)
1086 pushl %ebx
1087 jnz .sqr_no_save
1088 pushl %edi
1089 SAVE_MMX_0TO4(%edi)
1090 call big_sqr_vec_sse2_fc
1091 RSTOR_MMX_0TO4(%edi)
1092 popl %edi
1093 jmp .sqr_rtn
1095 .sqr_no_save:
1096 call big_sqr_vec_sse2_fc
1098 .sqr_rtn:
1099 popl %ebx
1100 movl %ebx, %cr0
1101 KPREEMPT_ENABLE
1102 leave
1103 ret
1104 SET_SIZE(big_sqr_vec_sse2)
1106 #endif /* MMX_MANAGE */
1108 / ------------------------------------------------------------------------
1109 / UMUL Implementations
1110 / ------------------------------------------------------------------------
1113 / r = a * digit, r and a are vectors of length len
1114 / returns the carry digit
1115 / Does not use any MMX, SSE, or SSE2 instructions.
1116 / Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
1117 / This is a fall-back implementation for x86 models that do not support
1118 / the PMULUDQ instruction.
1120 / uint32_t
1121 / big_mul_set_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1123 / r 8(%ebp) %edx %edi
1124 / a 12(%ebp) %ebx %esi
1125 / len 16(%ebp) %ecx
1126 / digit 20(%ebp) (left in memory; used directly by MUL)
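/
/ Each iteration of the loop below corresponds to this hedged C
/ fragment; MUL leaves the 64-bit product split across %edx:%eax
/ (the add variant further down also adds in r[i] before the carry):
/
/	uint64_t p = (uint64_t)a[i] * digit + cy;   /* mull, addl, adcl */
/	r[i] = (uint32_t)p;                         /* movl %eax, (%edi) */
/	cy = (uint32_t)(p >> 32);                   /* movl %edx, %ebx */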
1128 ENTRY(big_mul_set_vec_umul)
1129 pushl %ebp
1130 movl %esp, %ebp
1131 pushl %esi
1132 pushl %edi
1133 pushl %ebx
1134 movl 16(%ebp), %ecx
1135 xorl %ebx, %ebx / cy = 0
1136 testl %ecx, %ecx
1137 movl 8(%ebp), %edi
1138 movl 12(%ebp), %esi
1139 je .L57
1141 .L55:
1142 movl (%esi), %eax / eax = a[i]
1143 leal 4(%esi), %esi / ++a
1144 mull 20(%ebp) / edx:eax = a[i] * digit
1145 addl %ebx, %eax
1146 adcl $0, %edx / edx:eax = a[i] * digit + cy
1147 movl %eax, (%edi) / r[i] = product[31..0]
1148 movl %edx, %ebx / cy = product[63..32]
1149 leal 4(%edi), %edi / ++r
1150 decl %ecx / --len
1151 jnz .L55 / while (len != 0)
1152 .L57:
1153 movl %ebx, %eax
1154 popl %ebx
1155 popl %edi
1156 popl %esi
1157 leave
1158 ret
1159 SET_SIZE(big_mul_set_vec_umul)
1162 / r = r + a * digit, r and a are vectors of length len
1163 / returns the carry digit
1164 / Does not use any MMX, SSE, or SSE2 instructions.
1165 / Uses x86 unsigned 32 X 32 -> 64 multiply instruction, MUL.
1166 / This is a fall-back implementation for x86 models that do not support
1167 / the PMULUDQ instruction.
1169 / uint32_t
1170 / big_mul_add_vec_umul(uint32_t *r, uint32_t *a, int len, uint32_t digit)
1172 / r 8(%ebp) %edx %edi
1173 / a 12(%ebp) %ebx %esi
1174 / len 16(%ebp) %ecx
1175 / digit 20(%ebp) (left in memory; used directly by MUL)
1177 ENTRY(big_mul_add_vec_umul)
1178 pushl %ebp
1179 movl %esp, %ebp
1180 pushl %esi
1181 pushl %edi
1182 pushl %ebx
1183 movl 16(%ebp), %ecx
1184 xorl %ebx, %ebx / cy = 0
1185 testl %ecx, %ecx
1186 movl 8(%ebp), %edi
1187 movl 12(%ebp), %esi
1188 je .L67
1189 .align 4
1190 .L65:
1191 movl (%esi), %eax / eax = a[i]
1192 leal 4(%esi), %esi / ++a
1193 mull 20(%ebp) / edx:eax = a[i] * digit
1194 addl (%edi), %eax
1195 adcl $0, %edx / edx:eax = a[i] * digit + r[i]
1196 addl %ebx, %eax
1197 adcl $0, %edx / edx:eax = a[i] * digit + r[i] + cy
1198 movl %eax, (%edi) / r[i] = product[31..0]
1199 movl %edx, %ebx / cy = product[63..32]
1200 leal 4(%edi), %edi / ++r
1201 decl %ecx / --len
1202 jnz .L65 / while (len != 0)
1203 .L67:
1204 movl %ebx, %eax
1205 popl %ebx
1206 popl %edi
1207 popl %esi
1208 leave
1209 ret
1210 SET_SIZE(big_mul_add_vec_umul)
1212 #endif /* __lint */