dnl  AMD64 mpn_sqr_basecase optimised for Intel Sandy bridge and Ivy bridge.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                          cycles/limb
C                  mul_2    addmul_2    sqr_diag_addlsh1
C AMD K8,K9          ?         ?              ?
C AMD K10            ?         ?              ?
C AMD bull           ?         ?              ?
C AMD pile           ?         ?              ?
C AMD steam          ?         ?              ?
C AMD bobcat         ?         ?              ?
C AMD jaguar         ?         ?              ?
C Intel P4           ?         ?              ?
C Intel core         ?         ?              ?
C Intel NHM          ?         ?              ?
C Intel SBR         2.57      2.93           3.0
C Intel IBR         2.35      2.66           3.0
C Intel HWL         2.02      2.5            2.5
C Intel BWL          ?         ?              ?
C Intel atom         ?         ?              ?
C VIA nano           ?         ?              ?

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund, except
C that the sqr_diag_addlsh1 loop was manually written.
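
C For orientation, the following is an illustrative C reference of the
C decomposition this file implements: accumulate the off-diagonal cross
C products, then double them and add the diagonal squares.  It is only a
C sketch of the mathematics, not of this file's register allocation or
C scheduling; the limb/dlimb typedefs and the function name are placeholders,
C not GMP types.
C
C   typedef unsigned long long limb;      /* assume 64-bit limbs           */
C   typedef unsigned __int128 dlimb;      /* GCC/Clang double-limb type    */
C
C   /* rp[0..2n-1] = up[0..n-1]^2, n >= 1 */
C   static void ref_sqr_basecase (limb *rp, const limb *up, int n)
C   {
C     int i, j;
C     limb cy;
C     for (i = 0; i < 2*n; i++) rp[i] = 0;
C     /* cross products u[i]*u[j], i < j (the mul_2/addmul_2 passes below) */
C     for (i = 0; i < n; i++) {
C       cy = 0;
C       for (j = i + 1; j < n; j++) {
C         dlimb t = (dlimb) up[i] * up[j] + rp[i+j] + cy;
C         rp[i+j] = (limb) t;  cy = (limb) (t >> 64);
C       }
C       rp[i+n] = cy;
C     }
C     /* double the cross products, then add the squares u[i]^2
C        (the sqr_diag_addlsh1 pass below) */
C     cy = 0;
C     for (i = 0; i < 2*n; i++) {
C       limb hi = rp[i] >> 63;
C       rp[i] = (rp[i] << 1) | cy;  cy = hi;
C     }
C     cy = 0;
C     for (i = 0; i < n; i++) {
C       dlimb t = (dlimb) up[i] * up[i] + rp[2*i] + cy;
C       rp[2*i] = (limb) t;
C       t = (dlimb) rp[2*i+1] + (limb) (t >> 64);
C       rp[2*i+1] = (limb) t;  cy = (limb) (t >> 64);
C     }
C   }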

C TODO
C  * Replace the current unoptimised sqr_diag_addlsh1 loop; 2.5 c/l should be
C    easy.
C  * Streamline pointer updates.
C  * Perhaps suppress a few more xor insns in feed-in code.
C  * Make sure we write no dead registers in feed-in code.
C  * We might use 32-bit size ops, since n >= 2^32 is non-terminating.  Watch
C    out for negative sizes being zero-extended, though.
C  * The straight-line code for n <= 3 comes from the K8 code, and might be
C    quite sub-optimal here.  Write specific code, and add code for n = 4.
C  * The mul_2 loop has a 10 insn common sequence in the loop start and the
C    wind-down code.  Try re-rolling it.
C  * This file has been subjected to only basic micro-optimisation.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')
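
C For example, with the default definition above, a wind-down reference such
C as
C         mov     I(-8(rp),-8(rp,n,8)), %rax
C expands to its first argument, `mov -8(rp), %rax', which hard-codes the
C known final index; redefining I as `$2' instead selects the conservative
C indexed form, `mov -8(rp,n,8), %rax'.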

define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`un_param', `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
        FUNC_ENTRY(3)

        cmp     $2, un_param
        jae     L(gt1)

        mov     (up), %rax
        mul     %rax
        mov     %rax, (rp)
        mov     %rdx, 8(rp)
        FUNC_EXIT()
        ret

L(gt1): jne     L(gt2)

        mov     (up), %rax
        mov     %rax, %r8
        mul     %rax
        mov     8(up), %r11
        mov     %rax, (rp)
        mov     %r11, %rax
        mov     %rdx, %r9
        mul     %rax
        mov     %rax, %r10
        mov     %r11, %rax
        mov     %rdx, %r11
        mul     %r8
        xor     %r8, %r8
        add     %rax, %r9
        adc     %rdx, %r10
        adc     %r8, %r11
        add     %rax, %r9
        mov     %r9, 8(rp)
        adc     %rdx, %r10
        mov     %r10, 16(rp)
        adc     %r8, %r11
        mov     %r11, 24(rp)
        FUNC_EXIT()
        ret

L(gt2): cmp     $4, un_param
        jae     L(gt3)
define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%r10')
define(`w2', `%r11')

        mov     (up), %rax
        mov     %rax, %r10
        mul     %rax
        mov     8(up), %r11
        mov     %rax, (rp)
        mov     %r11, %rax
        mov     %rdx, 8(rp)
        mul     %rax
        mov     16(up), %rcx
        mov     %rax, 16(rp)
        mov     %rcx, %rax
        mov     %rdx, 24(rp)
        mul     %rax
        mov     %rax, 32(rp)
        mov     %rdx, 40(rp)

        mov     %r11, %rax
        mul     %r10
        mov     %rax, %r8
        mov     %rcx, %rax
        mov     %rdx, %r9
        mul     %r10
        xor     %r10, %r10
        add     %rax, %r9
        mov     %r11, %rax
        mov     %r10, %r11
        adc     %rdx, %r10

        mul     %rcx
        add     %rax, %r10
        adc     %r11, %rdx
        add     %r8, %r8
        adc     %r9, %r9
        adc     %r10, %r10
        adc     %rdx, %rdx
        adc     %r11, %r11
        add     %r8, 8(rp)
        adc     %r9, 16(rp)
        adc     %r10, 24(rp)
        adc     %rdx, 32(rp)
        adc     %r11, 40(rp)
        FUNC_EXIT()
        ret

L(gt3):

define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%r10')
define(`w1', `%r11')
define(`w2', `%rbx')
define(`w3', `%rbp')
define(`un', `%r12')
define(`n',  `%rcx')

define(`X0', `%r13')
define(`X1', `%r14')
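
C First pass (mul_2): with v0 = u[0] and v1 = u[1], accumulate the first two
C rows of the cross-product triangle (u[0] and u[1] times the higher limbs of
C U) into R starting at R[1].  un holds a negative count used to index from
C the ends of U and R.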

L(do_mul_2):
        mov     (up), v0
        push    %rbx
        lea     (rp,un_param,8), rp     C point rp at R[un]
        mov     8(up), %rax
        push    %rbp
        lea     (up,un_param,8), up     C point up right after U's end
        mov     %rax, v1
        push    %r12
        mov     $1, R32(un)             C free up rdx
        push    %r13
        sub     un_param, un
        push    %r14
        push    un
        mul     v0
        mov     %rax, (rp,un,8)
        mov     8(up,un,8), %rax
        test    $1, R8(un)
        jnz     L(m2b1)

L(m2b0):lea     2(un), n
        xor     R32(w1), R32(w1)        C FIXME
        xor     R32(w2), R32(w2)        C FIXME
        mov     %rdx, w0
        jmp     L(m2l0)

L(m2b1):lea     1(un), n
        xor     R32(w3), R32(w3)        C FIXME
        xor     R32(w0), R32(w0)        C FIXME
        mov     %rdx, w2
        jmp     L(m2l1)

        ALIGN(32)
L(m2tp):
L(m2l0):mul     v0
        add     %rax, w0
        mov     %rdx, w3
        adc     $0, w3
        mov     -8(up,n,8), %rax
        mul     v1
        add     w1, w0
        adc     $0, w3
        add     %rax, w2
        mov     w0, -8(rp,n,8)
        mov     %rdx, w0
        adc     $0, w0
        mov     (up,n,8), %rax
L(m2l1):mul     v0
        add     %rax, w2
        mov     %rdx, w1
        adc     $0, w1
        add     w3, w2
        mov     (up,n,8), %rax
        adc     $0, w1
        mul     v1
        mov     w2, (rp,n,8)
        add     %rax, w0
        mov     %rdx, w2
        mov     8(up,n,8), %rax
        adc     $0, w2
        add     $2, n
        jnc     L(m2tp)

L(m2ed):mul     v0
        add     %rax, w0
        mov     %rdx, w3
        adc     $0, w3
        mov     I(-8(up),-8(up,n,8)), %rax
        mul     v1
        add     w1, w0
        adc     $0, w3
        add     %rax, w2
        mov     w0, I(-8(rp),-8(rp,n,8))
        adc     $0, %rdx
        add     w3, w2
        mov     w2, I((rp),(rp,n,8))
        adc     $0, %rdx
        mov     %rdx, I(8(rp),8(rp,n,8))

        add     $2, un                  C decrease |un|
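
C Outer addmul_2 loop: each pass takes the next pair of U limbs as v0,v1 and
C accumulates the corresponding two rows of the cross-product triangle into R
C (two limbs higher each time, hence the 16-byte rp bump), until only a small
C corner of one or two rows remains.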

L(do_addmul_2):
L(outer):
        lea     16(rp), rp
        cmp     $-2, R32(un)            C jump if un in {-1,0}  FIXME jump if un in {-2,1}
        jge     L(corner)               C FIXME: move to before the lea above

        mov     -8(up,un,8), v0
        mov     (up,un,8), %rax
        mov     %rax, v1
        mul     v0
        test    $1, R8(un)
        jnz     L(a1x1)

L(a1x0):mov     (rp,un,8), X0
        xor     w0, w0
        mov     8(rp,un,8), X1
        add     %rax, X0
        mov     %rdx, w1
        adc     $0, w1
        xor     w2, w2
        mov     X0, (rp,un,8)
        mov     8(up,un,8), %rax
        test    $2, R8(un)
        jnz     L(a110)

L(a100):lea     2(un), n                C un = 4, 8, 12, ...
        jmp     L(lo0)

L(a110):lea     (un), n                 C un = 2, 6, 10, ...
        jmp     L(lo2)

L(a1x1):mov     (rp,un,8), X1
        xor     w2, w2
        mov     8(rp,un,8), X0
        add     %rax, X1
        mov     %rdx, w3
        adc     $0, w3
        xor     w0, w0
        mov     8(up,un,8), %rax
        test    $2, R8(un)
        jz      L(a111)

L(a101):lea     3(un), n                C un = 1, 5, 9, ...
        jmp     L(lo1)

L(a111):lea     1(un), n                C un = 3, 7, 11, ...
        jmp     L(lo3)

        ALIGN(32)
L(top): mul     v1
        mov     %rdx, w0
        add     %rax, X0
        adc     $0, w0
        add     w1, X1
        adc     $0, w3
        add     w2, X0
        adc     $0, w0
        mov     -16(up,n,8), %rax
L(lo1): mul     v0
        add     %rax, X0
        mov     %rdx, w1
        adc     $0, w1
        mov     -16(up,n,8), %rax
        mul     v1
        mov     X1, -24(rp,n,8)
        mov     -8(rp,n,8), X1
        add     w3, X0
        adc     $0, w1
        mov     %rdx, w2
        mov     X0, -16(rp,n,8)
        add     %rax, X1
        adc     $0, w2
        mov     -8(up,n,8), %rax
        add     w0, X1
        adc     $0, w2
L(lo0): mul     v0
        add     %rax, X1
        mov     %rdx, w3
        adc     $0, w3
        mov     -8(up,n,8), %rax
        mul     v1
        add     w1, X1
        mov     (rp,n,8), X0
        adc     $0, w3
        mov     %rdx, w0
        add     %rax, X0
        adc     $0, w0
        mov     (up,n,8), %rax
L(lo3): mul     v0
        add     w2, X0
        mov     X1, -8(rp,n,8)
        mov     %rdx, w1
        adc     $0, w0
        add     %rax, X0
        adc     $0, w1
        mov     (up,n,8), %rax
        add     w3, X0
        adc     $0, w1
        mul     v1
        mov     8(rp,n,8), X1
        add     %rax, X1
        mov     %rdx, w2
        adc     $0, w2
        mov     8(up,n,8), %rax
        mov     X0, (rp,n,8)
L(lo2): mul     v0
        add     w0, X1
        mov     %rdx, w3
        adc     $0, w2
        add     %rax, X1
        mov     8(up,n,8), %rax
        mov     16(rp,n,8), X0
        adc     $0, w3
        add     $4, n
        jnc     L(top)

L(end): mul     v1
        add     w1, X1
        adc     $0, w3
        add     w2, %rax
        adc     $0, %rdx
        mov     X1, I(-8(rp),-24(rp,n,8))
        add     w3, %rax
        adc     $0, %rdx
        mov     %rax, I((rp),-16(rp,n,8))
        mov     %rdx, I(8(rp),-8(rp,n,8))

        add     $2, un                  C decrease |un|
        jmp     L(outer)                C loop until a small corner remains
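
C Corner: finish the last rows of the cross-product triangle with
C straight-line code.  The flags still reflect the un comparison above:
C un = -1 leaves a single product (small_corner), un = -2 leaves a 2x2
C block.  pop n restores the saved negative size for the final pass.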

L(corner):
        pop     n
        jg      L(small_corner)

        lea     8(rp), rp
        mov     -24(up), v0
        mov     -16(up), %rax
        mov     %rax, v1
        mul     v0
        mov     -24(rp), X0
        mov     -16(rp), X1
        add     %rax, X0
        mov     %rdx, w1
        adc     $0, w1
        xor     w2, w2
        mov     X0, -24(rp)
        mov     -8(up), %rax
        mul     v0
        add     $0, X1
        mov     %rdx, w3
        adc     $0, w2
        add     %rax, X1
        mov     -8(up), %rax
        adc     $0, w3
        mul     v1
        add     w1, X1
        adc     $0, w3
        add     w2, %rax
        adc     $0, %rdx
        mov     X1, -16(rp)
        jmp     L(com)

L(small_corner):
        mov     -8(rp), w3
        mov     -16(up), v0
        mov     -8(up), %rax
        mul     v0
L(com): add     w3, %rax
        adc     $0, %rdx
        mov     %rax, -8(rp)
        mov     %rdx, (rp)
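
C Final pass (sqr_diag_addlsh1): in one fused loop, double the accumulated
C cross products (the "addlsh1" part) and add in the diagonal squares u[i]^2
C at even limb positions (the "sqr_diag" part), producing the complete
C square.  n is the negative limb count restored above; shl n doubles it, so
C a scale factor of 4 still steps through 8-byte limbs of U.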

L(sqr_diag_addlsh1):
        mov     -8(up,n,8), %rax
        shl     n
        mul     %rax
        mov     %rax, (rp,n,8)

        xor     R32(%rbx), R32(%rbx)
        mov     8(rp,n,8), %r8
        mov     16(rp,n,8), %r9
        jmp     L(dm)

        ALIGN(32)
L(dtop):add     %r8, %r10
        adc     %r9, %rax
        mov     8(rp,n,8), %r8
        mov     16(rp,n,8), %r9
        mov     %r10, -8(rp,n,8)
        mov     %rax, (rp,n,8)
L(dm):  adc     %r8, %r8
        adc     %r9, %r9
        mov     (up,n,4), %rax
        lea     (%rdx,%rbx), %r10
        setc    R8(%rbx)
        mul     %rax
        add     $2, n
        js      L(dtop)

L(dend):add     %r8, %r10
        adc     %r9, %rax
        mov     %r10, I(-8(rp),-8(rp,n,8))
        mov     %rax, I((rp),(rp,n,8))
        adc     %rbx, %rdx
        mov     %rdx, I(8(rp),8(rp,n,8))

        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        FUNC_EXIT()
        ret
EPILOGUE()