beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium4 / sse2 / mod_34lsub1.asm
blob31e25b79bc3bc9c05fee72941b06a80c15855d08
1 dnl Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
3 dnl Copyright 2000-2003 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C Pentium4: 1.0 cycles/limb
37 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
39 C Enhancements:
41 C There might be a couple of cycles to save by using plain integer code for
42 C more small sizes. 2 limbs measures about 20 cycles, but 3 limbs jumps to
43 C about 46 (inclusive of some function call overheads).
dnl  Incoming arguments, listed in prototype order (src first, then size).
dnl  The second argument of defframe is the frame offset of the slot --
dnl  presumably bytes above the stack pointer at entry; defframe itself is
dnl  supplied by config.m4, so confirm there.
defframe(PARAM_SRC,  4)
defframe(PARAM_SIZE, 8)

dnl  Scratch-slot aliases laid over the argument slots.  Nothing inside
dnl  mpn_mod_34lsub1 references either name.
define(SAVE_EBX, `PARAM_SRC')
define(SAVE_ESI, `PARAM_SIZE')
C -----------------------------------------------------------------------------
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C Returns in eax a value congruent to the size-limb operand at src modulo
C 2^24-1.  The value is NOT reduced to the range 0..2^24-2: the one-limb
C path hands back src[0] unchanged and the other paths return a folded sum
C that may be wider than 24 bits.
C
C Method: 2^24 == 1 mod 2^24-1, so a 32-bit limb at index i carries weight
C 2^(32*i) == 2^(8*(i mod 3)).  Limbs are summed into three accumulators by
C index mod 3, the accumulators are combined at bit offsets 0, 8 and 16, and
C the total is folded once at bit 24.
C
C In:     PARAM_SRC  = src, PARAM_SIZE = size
C         src[0] is loaded unconditionally, so size >= 1 is assumed
C         (TODO confirm against callers).
C Out:    eax
C Writes: eax, ecx, edx, mm0-mm7, flags.  No stack adjustment (FRAME is 0).
C         emms is executed before returning from the MMX path.
C -----------------------------------------------------------------------------

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %edx
	movl	(%edx), %eax		C eax = src[0]

	subl	$2, %ecx		C ecx = size-2, flags pick the path
	ja	L(three_or_more)	C size > 2
	jne	L(one)			C size != 2: return src[0] as is

	C Exactly two limbs, plain integer code.

	movl	4(%edx), %edx		C edx = src[1]
	movl	%eax, %ecx
	shrl	$24, %eax		C src[0] bits 24..31, weight 2^24 == 1

	andl	$0x00FFFFFF, %ecx	C src[0] bits 0..23, weight 1
	addl	%ecx, %eax

	movl	%edx, %ecx
	shll	$8, %edx		C src[1] weight 2^32 == 2^8

	shrl	$16, %ecx		C src[1] bits 16..31, weight 2^48 == 1
	addl	%ecx, %eax

	andl	$0x00FFFF00, %edx	C src[1] bits 0..15, placed at bits 8..23
	addl	%edx, %eax

L(one):
	ret


L(three_or_more):
	pxor	%mm0, %mm0		C clear the three accumulators
	pxor	%mm1, %mm1
	pxor	%mm2, %mm2

	pcmpeqd	%mm7, %mm7
	psrlq	$32, %mm7	C 0x00000000FFFFFFFF, low 32 bits

	pcmpeqd	%mm6, %mm6
	psrlq	$40, %mm6	C 0x0000000000FFFFFF, low 24 bits

L(top):
	C eax
	C ebx
	C ecx	counter, size-2 to 0, -1 or -2
	C edx	src, incrementing
	C
	C mm0	sum 0mod3
	C mm1	sum 1mod3
	C mm2	sum 2mod3
	C mm3	scratch, limb just loaded
	C mm4
	C mm5
	C mm6	0x0000000000FFFFFF
	C mm7	0x00000000FFFFFFFF

	movd	(%edx), %mm3
	paddq	%mm3, %mm0

	movd	4(%edx), %mm3
	paddq	%mm3, %mm1

	movd	8(%edx), %mm3
	paddq	%mm3, %mm2

	addl	$12, %edx		C three limbs consumed
	subl	$3, %ecx
	ja	L(top)


	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively

	addl	$1, %ecx		C flags from here steer both branches;
					C movd and paddq leave them untouched
	js	L(combine)		C 0 more

	movd	(%edx), %mm3
	paddq	%mm3, %mm0

	jz	L(combine)		C 1 more

	movd	4(%edx), %mm3
	paddq	%mm3, %mm1

L(combine):
	movq	%mm7, %mm3		C low halves
	pand	%mm0, %mm3

	movq	%mm7, %mm4
	pand	%mm1, %mm4

	movq	%mm7, %mm5
	pand	%mm2, %mm5

	psrlq	$32, %mm0		C high halves
	psrlq	$32, %mm1
	psrlq	$32, %mm2

	C A high half weighs 2^32 == 2^8 times its own accumulator, which is
	C the weight of the next accumulator along (wrapping 2 -> 0).

	paddq	%mm0, %mm4		C fold high halves to give 33 bits each
	paddq	%mm1, %mm5
	paddq	%mm2, %mm3

	psllq	$8, %mm4		C combine at respective offsets
	psllq	$16, %mm5
	paddq	%mm4, %mm3
	paddq	%mm5, %mm3		C 0x000cxxxxxxxxxxxx, 50 bits

	pand	%mm3, %mm6		C fold at 24 bits
	psrlq	$24, %mm3

	paddq	%mm6, %mm3
	movd	%mm3, %eax		C result

	ASSERT(z,	C nothing left in high dword
	`psrlq	$32, %mm3
	movd	%mm3, %ecx
	orl	%ecx, %ecx')

	emms				C leave MMX state before returning
	ret				C return from the MMX path; without it
					C execution runs past the function end

EPILOGUE()