beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k6 / mod_34lsub1.asm
blob7e30503e544b9392b9da75374f167d4fb3da1921
1 dnl AMD K6 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
3 dnl Copyright 2000-2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C K6: 2.66 cycles/limb
37 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
39 C An attempt was made to use a loop like
41 C L(top):
42 C adcl (%edx), %eax
43 C adcl 4(%edx), %ebx
44 C adcl 8(%edx), %esi
45 C leal 12(%edx), %edx
46 C loop L(top)
48 C with %ecx starting from floor(size/3), but it still measured 2.66 c/l.
49 C The form used instead can save about 6 cycles by not dividing by 3.
51 C In the code used, putting the "leal"s at the top of the loop is necessary
52 C for the claimed speed, anywhere else costs an extra cycle per loop.
53 C Perhaps a tight loop like this needs short decode instructions at the
54 C branch target, which would explain the leal/loop form above taking 8
55 C cycles instead of 7 too.
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC,  4)

dnl  Both arguments are loaded into registers at entry, after which their
dnl  stack slots are reused as save areas for %ebx and %esi.
define(SAVE_EBX, `PARAM_SIZE')
define(SAVE_ESI, `PARAM_SRC')

C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C Returns in %eax a value congruent to {src,size} modulo 2^24-1.  The
C result is a plain sum of folded pieces and is not reduced any further.
C
C Since 2^24 == 1 mod 2^24-1, a 32-bit limb at index i has weight 2^(32*i),
C which is congruent to 1, 2^8 or 2^16 according to i mod 3.  Limbs are
C summed into three accumulators selected by index mod 3, and each
C accumulator is then split at the bit where its weighted value crosses
C 2^24, the upper piece being added back in at weight 1.
C
C size==1 returns src[0] unchanged and size==2 is folded directly.  size==0
C is not treated specially (it reads src[0] exactly like size==1), so
C callers are presumably expected to pass size>=1.
C
C Clobbers %ecx, %edx and the flags.  %ebx, %esi and %edi are used on the
C size>=3 path only, and are saved and restored there.

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %eax
	movl	PARAM_SRC, %edx

	subl	$2, %eax
	ja	L(three_or_more)	C size > 2

	C The flags from the subl are still live here, movl does not change
	C them: ZF set means size==2, ZF clear means size==1.

Zdisp(	movl,	0,(%edx), %eax)		C avoid code cache line boundary
	jne	L(one)			C size==1, return src[0] as is

	C Two limbs: src[0] has weight 1, src[1] has weight 2^32 == 2^8.

	movl	%eax, %ecx
	movl	4(%edx), %edx

	shrl	$24, %eax		C src[0] high 8 bits
	andl	$0x00FFFFFF, %ecx	C src[0] low 24 bits

	addl	%ecx, %eax
	movl	%edx, %ecx

	shll	$8, %edx
	andl	$0x00FFFF00, %edx	C src[1] low 16 bits, at weight 2^8

	shrl	$16, %ecx		C src[1] high 16 bits, wrapped to weight 1
	addl	%ecx, %eax

	addl	%edx, %eax

L(one):
	ret


L(three_or_more):
	C eax	size-2
	C ebx
	C ecx
	C edx	src

	movl	%ebx, SAVE_EBX
	xorl	%ebx, %ebx

	movl	%esi, SAVE_ESI
	pushl	%edi	FRAME_pushl()

	xorl	%esi, %esi
	xorl	%edi, %edi		C and clear carry flag

	C The carry flag is live from here through to the sbbl at L(combine).
	C It is chained from one adcl to the next, including from the end of
	C one iteration into the start of the following one.  The instructions
	C in between are leal and movb, which leave all flags alone, and
	C incl/decl, which leave the carry flag alone.

L(top):
	C eax	counter, limbs
	C ebx	acc 0mod3
	C ecx
	C edx	src, incrementing
	C esi	acc 1mod3
	C edi	acc 2mod3
	C ebp

	leal	-2(%eax), %eax		C with the decl below, counter -= 3
	leal	12(%edx), %edx

	adcl	-12(%edx), %ebx
	adcl	-8(%edx), %esi
	adcl	-4(%edx), %edi

	decl	%eax
	jg	L(top)


	C eax is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
	C
	C cl is set to the bit position at which the final carry must be
	C applied, that being the weight of the accumulator which the next
	C adcl in the chain would have targeted: 0 for ebx, 8 for esi, 16 for
	C edi.

	movb	$0, %cl
	incl	%eax

	js	L(combine)		C 0 more

Zdisp(	adcl,	0,(%edx), %ebx)		C avoid code cache line crossings

	movb	$8, %cl
	decl	%eax

	js	L(combine)		C 1 more

	adcl	4(%edx), %esi

	movb	$16, %cl


L(combine):
	C ebx	acc 0mod3
	C cl	carry bit position, 0, 8 or 16
	C esi	acc 1mod3
	C edi	acc 2mod3
	C carry flag is the outstanding carry from the last adcl

	sbbl	%edx, %edx		C 0 or -1 according to the carry

	shll	%cl, %edx		C carry
	movl	%ebx, %eax		C 0mod3

	shrl	$24, %eax		C 0mod3 high
	andl	$0x00FFFFFF, %ebx	C 0mod3 low

	subl	%edx, %eax		C apply carry
	movl	%esi, %ecx		C 1mod3

	shrl	$16, %esi		C 1mod3 high
	addl	%ebx, %eax		C apply 0mod3 low

	andl	$0x0000FFFF, %ecx
	addl	%esi, %eax		C apply 1mod3 high

	shll	$8, %ecx		C 1mod3 low
	movl	%edi, %edx		C 2mod3

	shrl	$8, %edx		C 2mod3 high
	addl	%ecx, %eax		C apply 1mod3 low

	addl	%edx, %eax		C apply 2mod3 high
	andl	$0x000000FF, %edi

	shll	$16, %edi		C 2mod3 low
	movl	SAVE_EBX, %ebx

	addl	%edi, %eax		C apply 2mod3 low
	movl	SAVE_ESI, %esi

	popl	%edi

	ret				C return to the caller, result in eax

EPILOGUE()