dnl  (extracted from a gitweb view, beta-0.89.2)
dnl  [luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k7 / addlsh1_n.asm
dnl  blob a957b6f78e621557aec4a316bdbb283643793cbe
dnl  AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')
C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns.
C The innerloop is 2*3-way unrolled, which is best we can do with the available
C registers.  It seems tricky to use the same structure for rsblsh1_n, since we
C cannot feed carry between operations there.

C			    cycles/limb
C P5
C P6 model 0-8,10-12
C P6 model 9  (Banias)
C P6 model 13 (Dothan)		 5.4	(worse than add_n + lshift)
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C Intel Atom			 6
C AMD K6			 ?
C AMD K7			 2.5
C AMD K8

C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
C processors.  It uses 2*3-way unrolling, for good reasons.  Unfortunately,
C that means we need an initial magic multiply.
C
C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern.  We
C cannot do rsblsh1_n since we feed carry from the shift blocks to the
C add/subtract blocks, which is right for addition but reversed for
C subtraction.  We could perhaps do sublsh1_n, with some extra move insns,
C without losing any time, since we're not issue limited but carry recurrency
C latency.
C
C Breaking carry recurrency might be a good idea.  We would then need separate
C registers for the shift carry and add/subtract carry, which in turn would
C force us to 2*2-way unrolling.
dnl  Stack parameters (cdecl); offsets are relative to the entry stack
dnl  pointer, with FRAME tracking subsequent pushes.
defframe(PARAM_SIZE, 16)
defframe(PARAM_DBLD, 12)
defframe(PARAM_SRC,   8)
defframe(PARAM_DST,   4)

dnl  re-use parameter space once the values live in registers
define(VAR_COUNT,`PARAM_DST')	dnl negated count for the unrolled loop
define(VAR_TMP,`PARAM_DBLD')	dnl scratch slot (vp is saved here)
ASM_START()
	TEXT
	ALIGN(8)

C mp_limb_t mpn_addlsh1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
C
C rp[i] = up[i] + 2*vp[i]; the carry-out limb (0..2) is returned in %eax.
C The inner loop is 2*3-way unrolled; a magic multiply computes n/6 for the
C loop count, and a 1-limb lead-in loop handles the n mod 6 leftover limbs.
C Between iterations the shift carry and the add carry are kept saved in
C %edx (as two separate bits on CPU_P6 builds, one rcr-folded bit otherwise).

PROLOGUE(mpn_addlsh1_n)
deflit(`FRAME',0)

define(`rp', `%edi')
define(`up', `%esi')
define(`vp', `%ebp')

	mov	$0x2aaaaaab, %eax	C ceil(2^32/6): reciprocal for size/6

	push	%ebx			FRAME_pushl()
	mov	PARAM_SIZE, %ebx	C size

	push	rp			FRAME_pushl()
	mov	PARAM_DST, rp

	mul	%ebx			C %edx = size\6

	push	up			FRAME_pushl()
	mov	PARAM_SRC, up

	not	%edx			C count = -(size\6)-1
	mov	%edx, VAR_COUNT

	push	vp			FRAME_pushl()
	mov	PARAM_DBLD, vp

	lea	3(%edx,%edx,2), %ecx	C count*3+3 = -(size\6)*3
	xor	%edx, %edx		C clear saved carries and CF
	lea	(%ebx,%ecx,2), %ebx	C size + (count*3+3)*2 = size % 6
	or	%ebx, %ebx
	jz	L(exact)

C Lead-in: one limb per iteration for the size % 6 leftover limbs.
L(oop):
ifdef(`CPU_P6',`
	shr	%edx ')			C restore 2nd saved carry bit
	mov	(vp), %eax
	adc	%eax, %eax		C 2*vp[i] + shift carry-in
	rcr	%edx			C restore 1st saved carry bit
	lea	4(vp), vp
	adc	(up), %eax		C + up[i] + add carry-in
	lea	4(up), up
	adc	%edx, %edx		C save a carry bit in edx
ifdef(`CPU_P6',`
	adc	%edx, %edx ')		C save another carry bit in edx
	dec	%ebx
	mov	%eax, (rp)
	lea	4(rp), rp
	jnz	L(oop)
	mov	vp, VAR_TMP		C NOTE(review): not reloaded below
L(exact):
	incl	VAR_COUNT
	jz	L(end)

C Main loop: 6 limbs per iteration, as two 3-limb shift-then-add groups.
	ALIGN(16)
L(top):
ifdef(`CPU_P6',`
	shr	%edx ')			C restore 2nd saved carry bit
	mov	(vp), %eax		C shift group 1: 2*vp[0..2]
	adc	%eax, %eax
	mov	4(vp), %ebx
	adc	%ebx, %ebx
	mov	8(vp), %ecx
	adc	%ecx, %ecx

	rcr	%edx			C restore 1st saved carry bit

	adc	(up), %eax		C add group 1: + up[0..2]
	mov	%eax, (rp)
	adc	4(up), %ebx
	mov	%ebx, 4(rp)
	adc	8(up), %ecx
	mov	%ecx, 8(rp)

	mov	12(vp), %eax		C shift group 2: 2*vp[3..5]
	adc	%eax, %eax
	mov	16(vp), %ebx
	adc	%ebx, %ebx
	mov	20(vp), %ecx
	adc	%ecx, %ecx

	lea	24(vp), vp
	adc	%edx, %edx		C save a carry bit in edx

	adc	12(up), %eax		C add group 2: + up[3..5]
	mov	%eax, 12(rp)
	adc	16(up), %ebx
	mov	%ebx, 16(rp)
	adc	20(up), %ecx

	lea	24(up), up

ifdef(`CPU_P6',`
	adc	%edx, %edx ')		C save another carry bit in edx
	mov	%ecx, 20(rp)
	incl	VAR_COUNT
	lea	24(rp), rp
	jne	L(top)

L(end):
	pop	vp			FRAME_popl()
	pop	up			FRAME_popl()

C Fold the saved carry bit(s) and the final add carry into the return value.
ifdef(`CPU_P6',`
	xor	%eax, %eax
	shr	$1, %edx		C 2nd saved bit to CF, 1st stays in edx
	adc	%edx, %eax
',`
	adc	$0, %edx		C fold final CF into the saved carry
	mov	%edx, %eax
')
	pop	rp			FRAME_popl()
	pop	%ebx			FRAME_popl()
	ret
EPILOGUE()
ASM_END()