beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k7 / mod_34lsub1.asm
blobee3ad04099d5d8808f951ab98e55fa7b15e7e042
1 dnl AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
3 dnl Copyright 2000-2002, 2004, 2005, 2008 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C cycles/limb
35 C Athlon: 1
36 C Hammer: 1
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C Return a value congruent to {src,size} modulo 2^24-1.  The pieces are only
C summed, nothing below reduces the result to less than 2^24-1.
C
C 2^24 is 1 mod 2^24-1, so a 32-bit limb at index i carries weight 2^0, 2^8
C or 2^16 according to i mod 3 being 0, 1 or 2.  Limbs are summed into three
C accumulators, one per residue of i mod 3, with the carry out of each adcl
C feeding the next accumulator in the chain.  At the end each accumulator is
C split at the bit position where its weight reaches 2^24 and the two halves
C are added in at their reduced weights.
C
C In:   src and size on the stack at 4 and 8 from the return address.  size
C       is expected to be 1 or more, a size of 0 would still read src[0].
C Out:  eax.
C Regs: ecx and edx are clobbered.  ebx and esi are pushed and popped, edi
C       is kept in the size parameter slot.  Sizes 1 and 2 touch only eax,
C       ecx and edx.
C
C The loop form below and the 64 byte code alignment seem necessary for the
C claimed speed.  This is a bit strange, since normally k7 isn't very
C sensitive to such things.  Perhaps there has to be 6 instructions in the
C first 16 bytes for the BTB entry or something.

defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC,  4)

dnl  re-use parameter space, size is dead once it has been loaded into ecx
define(SAVE_EDI, `PARAM_SIZE')

	TEXT
	ALIGN(64)
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %edx

	subl	$2, %ecx		C carry if size below 2, zero if size is 2
	ja	L(three_or_more)

	movl	(%edx), %eax		C size 1 returns src[0] unchanged
	jb	L(one)			C movl left the subl flags intact

	movl	4(%edx), %ecx		C src[1]
	movl	%eax, %edx
	shrl	$24, %eax		C src[0] high

	andl	$0xFFFFFF, %edx		C src[0] low
	addl	%edx, %eax
	movl	%ecx, %edx

	andl	$0xFFFF, %ecx
	shrl	$16, %edx		C src[1] high
	addl	%edx, %eax

	shll	$8, %ecx		C src[1] low
	addl	%ecx, %eax

L(one):
	ret


L(three_or_more):
	C eax
	C ebx
	C ecx	size-2
	C edx	src
	C esi
	C edi

	pushl	%ebx	FRAME_pushl()
	xorl	%eax, %eax
	xorl	%ebx, %ebx

	movl	%edi, SAVE_EDI
	pushl	%esi	FRAME_pushl()
	xorl	%esi, %esi		C and clear carry flag


	C code offset 0x40 at this point
L(top):
	C eax	acc 0mod3
	C ebx	acc 1mod3
	C ecx	counter, limbs
	C edx	src
	C esi	acc 2mod3
	C edi
	C
	C Three limbs per half iteration.  leal and decl leave the carry flag
	C alone, so the adcl chain runs unbroken across the pointer and
	C counter updates, and from one half iteration into the next.

	leal	24(%edx), %edx
	leal	-2(%ecx), %ecx
	adcl	-24(%edx), %eax
	adcl	-20(%edx), %ebx
	adcl	-16(%edx), %esi

	decl	%ecx
	jng	L(done_loop)

	leal	-2(%ecx), %ecx
	adcl	-12(%edx), %eax
	adcl	-8(%edx), %ebx
	adcl	-4(%edx), %esi

	decl	%ecx
	jg	L(top)


	C all 24 bytes were consumed, step on so that -12(%edx) is the next
	C unread limb, the same as when leaving from the middle of the loop
	leal	12(%edx), %edx


L(done_loop):
	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
	C
	C edi is set to minus the weight of the carry left by the most recent
	C adcl, being 1 after a 2mod3 limb, 2^8 after a 0mod3 limb and 2^16
	C after a 1mod3 limb.  incl, decl and movl all preserve that carry.

	incl	%ecx
	movl	$0xFFFFFFFF, %edi
	js	L(combine)

	adcl	-12(%edx), %eax
	decl	%ecx
	movl	$0xFFFFFF00, %edi
	js	L(combine)

	adcl	-8(%edx), %ebx
	movl	$0xFFFF0000, %edi


L(combine):
	C eax	acc 0mod3
	C ebx	acc 1mod3
	C ecx
	C edx
	C esi	acc 2mod3
	C edi	mask

	sbbl	%ecx, %ecx		C carry
	movl	%eax, %edx		C 0mod3
	shrl	$24, %eax		C 0mod3 high

	andl	%edi, %ecx		C carry masked
	andl	$0x00FFFFFF, %edx	C 0mod3 low
	movl	%ebx, %edi		C 1mod3

	subl	%ecx, %eax		C apply carry
	shrl	$16, %ebx		C 1mod3 high
	andl	$0xFFFF, %edi

	addl	%edx, %eax		C apply 0mod3 low
	movl	%esi, %edx		C 2mod3
	shll	$8, %edi		C 1mod3 low

	addl	%ebx, %eax		C apply 1mod3 high
	shrl	$8, %esi		C 2mod3 high
	movzbl	%dl, %edx		C 2mod3 low

	addl	%edi, %eax		C apply 1mod3 low
	shll	$16, %edx		C 2mod3 low

	addl	%esi, %eax		C apply 2mod3 high
	popl	%esi	FRAME_popl()

	movl	SAVE_EDI, %edi
	addl	%edx, %eax		C apply 2mod3 low
	popl	%ebx	FRAME_popl()

	ret

EPILOGUE()