dnl  AMD64 mpn_mul_basecase optimised for AMD bobcat.

dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb
C AMD K8,K9             4.5
C AMD K10               4.5
C AMD bd1               4.75
C AMD bobcat            5
C Intel P4             17.7
C Intel core2           5.5
C Intel NHM             5.43
C Intel SBR             3.92
C Intel atom           23
C VIA nano              5.63

C This mul_basecase is based on mul_1 and addmul_1, since these both run at the
C multiply insn bandwidth, without any apparent loop branch exit pipeline
C replays experienced on K8.  The structure is unusual: it falls into mul_1 in
C the same way for all n, then it splits into 4 different wind-down blocks and
C 4 separate addmul_1 loops.

C We have not tried using the same addmul_1 loops with a switch into feed-in
C code, as we do in other basecase implementations.  Doing that could save
C substantial code volume, but would also probably add some overhead.
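C The comment block below is a minimal C-level sketch of that row-by-row
C structure, added for orientation only: one mul_1 pass for the first row of
C partial products, then one addmul_1 pass per remaining v limb.  The function
C name sketch_mul_basecase is made up; mpn_mul_1 and mpn_addmul_1 are the
C public GMP entry points.  The asm below inlines and unrolls this pattern
C rather than calling those routines.
C
C   #include <gmp.h>
C
C   /* rp must have room for un+vn limbs; requires un >= vn >= 1 */
C   static void
C   sketch_mul_basecase (mp_limb_t *rp, const mp_limb_t *up, mp_size_t un,
C                        const mp_limb_t *vp, mp_size_t vn)
C   {
C     mp_size_t i;
C     rp[un] = mpn_mul_1 (rp, up, un, vp[0]);              /* first row  */
C     for (i = 1; i < vn; i++)                             /* later rows */
C       rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
C   }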
C TODO
C  * Tune un < 3 code.
C  * Fix slowdown for un=vn=3 (67->71) compared to default code.
C  * This is 1263 bytes, compared to 1099 bytes for default code.  Consider
C    combining addmul loops like that code.  Tolerable slowdown?
C  * Lots of space could be saved by replacing the "switch" code by gradual
C    jumps out from mul_1 winddown code, perhaps with no added overhead.
C  * Are the ALIGN(16) really necessary?  They add about 25 bytes of padding.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C Standard parameters
define(`rp',              `%rdi')
define(`up',              `%rsi')
define(`un_param',        `%rdx')
define(`vp',              `%rcx')
define(`vn',              `%r8')
C Standard allocations
define(`un',              `%rbx')
define(`w0',              `%r10')
define(`w1',              `%r11')
define(`w2',              `%r12')
define(`w3',              `%r13')
define(`n',               `%rbp')
define(`v0',              `%r9')

C Temp macro for allowing control over indexing.
C Define to return $1 for more conservative ptr handling.
define(`X',`$2')
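C With this definition X(a,b) expands to its second argument, e.g.
C X(-8(rp,n,8),16(rp)) becomes the constant-offset form 16(rp); redefining
C X as `$1' instead selects the indexed form -8(rp,n,8).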
ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_mul_basecase)
        FUNC_ENTRY(4)
IFDOS(` mov     56(%rsp), %r8d  ')

        mov     (up), %rax
        mov     (vp), v0

        cmp     $2, un_param
        ja      L(ge3)
        jz      L(u2)

        mul     v0                      C u0 x v0
        mov     %rax, (rp)
        mov     %rdx, 8(rp)
        FUNC_EXIT()
        ret

L(u2):  mul     v0                      C u0 x v0
        mov     %rax, (rp)
        mov     8(up), %rax
        mov     %rdx, w0
        mul     v0
        add     %rax, w0
        mov     %rdx, w1
        adc     $0, w1
        cmp     $1, R32(vn)
        jnz     L(u2v2)
        mov     w0, 8(rp)
        mov     w1, 16(rp)
        FUNC_EXIT()
        ret

L(u2v2):mov     8(vp), v0
        mov     (up), %rax
        mul     v0
        add     %rax, w0
        mov     w0, 8(rp)
        mov     %rdx, %r8               C CAUTION: r8 realloc
        adc     $0, %r8
        mov     8(up), %rax
        mul     v0
        add     w1, %r8
        adc     $0, %rdx
        add     %r8, %rax
        adc     $0, %rdx
        mov     %rax, 16(rp)
        mov     %rdx, 24(rp)
        FUNC_EXIT()
        ret

L(ge3): push    %rbx
        push    %rbp
        push    %r12
        push    %r13

        lea     8(vp), vp

        lea     -24(rp,un_param,8), rp
        lea     -24(up,un_param,8), up
        xor     R32(un), R32(un)
        mov     $2, R32(n)
        sub     un_param, un
        sub     un_param, n

        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        jmp     L(L3)

        ALIGN(16)
L(top): mov     w0, -16(rp,n,8)
        add     w1, w2
        adc     $0, w3
        mov     (up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        mov     w2, -8(rp,n,8)
        add     w3, w0
        adc     $0, w1
        mov     8(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        mov     w0, (rp,n,8)
        add     w1, w2
        adc     $0, w3
L(L3):  mov     16(up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        mov     w2, 8(rp,n,8)
        add     w3, w0
        adc     $0, w1
        mov     24(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     $4, n
        js      L(top)

        mov     w0, -16(rp,n,8)
        add     w1, w2
        adc     $0, w3

C Switch on n into the right addmul_1 loop
        test    n, n
        jz      L(r2)
        cmp     $2, R32(n)
        ja      L(r3)
        jz      L(r0)
        jmp     L(r1)

L(r3):  mov     w2, X(-8(rp,n,8),16(rp))
        mov     w3, X((rp,n,8),24(rp))
        add     $2, un

C outer loop(3)
L(to3): dec     vn
        jz      L(ret)
        mov     (vp), v0
        mov     8(up,un,8), %rax
        lea     8(vp), vp
        lea     8(rp), rp
        mov     un, n
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        jmp     L(al3)

        ALIGN(16)
L(ta3): add     w0, -16(rp,n,8)
        adc     w1, w2
        adc     $0, w3
        mov     (up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        add     w2, -8(rp,n,8)
        adc     w3, w0
        adc     $0, w1
        mov     8(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     w0, (rp,n,8)
        adc     w1, w2
        adc     $0, w3
L(al3): mov     16(up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        add     w2, 8(rp,n,8)
        adc     w3, w0
        adc     $0, w1
        mov     24(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     $4, n
        js      L(ta3)

        add     w0, X(-16(rp,n,8),8(rp))
        adc     w1, w2
        adc     $0, w3
        add     w2, X(-8(rp,n,8),16(rp))
        adc     $0, w3
        mov     w3, X((rp,n,8),24(rp))
        jmp     L(to3)

L(r2):  mov     X(0(up,n,8),(up)), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        mov     w2, X(-8(rp,n,8),-8(rp))
        add     w3, w0
        adc     $0, w1
        mov     X(8(up,n,8),8(up)), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        mov     w0, X((rp,n,8),(rp))
        add     w1, w2
        adc     $0, w3
        mov     X(16(up,n,8),16(up)), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        mov     w2, X(8(rp,n,8),8(rp))
        add     w3, w0
        adc     $0, w1
        mov     w0, X(16(rp,n,8),16(rp))
        adc     $0, w3
        mov     w1, X(24(rp,n,8),24(rp))
        inc     un

C outer loop(2)
L(to2): dec     vn
        jz      L(ret)
        mov     (vp), v0
        mov     16(up,un,8), %rax
        lea     8(vp), vp
        lea     8(rp), rp
        mov     un, n
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        jmp     L(al2)

        ALIGN(16)
L(ta2): add     w0, -16(rp,n,8)
        adc     w1, w2
        adc     $0, w3
        mov     (up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        add     w2, -8(rp,n,8)
        adc     w3, w0
        adc     $0, w1
        mov     8(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     w0, (rp,n,8)
        adc     w1, w2
        adc     $0, w3
        mov     16(up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        add     w2, 8(rp,n,8)
        adc     w3, w0
        adc     $0, w1
L(al2): mov     24(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     $4, n
        js      L(ta2)

        add     w0, X(-16(rp,n,8),8(rp))
        adc     w1, w2
        adc     $0, w3
        add     w2, X(-8(rp,n,8),16(rp))
        adc     $0, w3
        mov     w3, X((rp,n,8),24(rp))
        jmp     L(to2)

L(r1):  mov     X(0(up,n,8),8(up)), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        mov     w2, X(-8(rp,n,8),(rp))
        add     w3, w0
        adc     $0, w1
        mov     X(8(up,n,8),16(up)), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        mov     w0, X((rp,n,8),8(rp))
        add     w1, w2
        adc     $0, w3
        mov     w2, X(8(rp,n,8),16(rp))
        mov     w3, X(16(rp,n,8),24(rp))
        add     $4, un

C outer loop(1)
L(to1): dec     vn
        jz      L(ret)
        mov     (vp), v0
        mov     -8(up,un,8), %rax
        lea     8(vp), vp
        lea     8(rp), rp
        mov     un, n
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        jmp     L(al1)

        ALIGN(16)
L(ta1): add     w0, -16(rp,n,8)
        adc     w1, w2
        adc     $0, w3
L(al1): mov     (up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        add     w2, -8(rp,n,8)
        adc     w3, w0
        adc     $0, w1
        mov     8(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     w0, (rp,n,8)
        adc     w1, w2
        adc     $0, w3
        mov     16(up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        add     w2, 8(rp,n,8)
        adc     w3, w0
        adc     $0, w1
        mov     24(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     $4, n
        js      L(ta1)

        add     w0, X(-16(rp,n,8),8(rp))
        adc     w1, w2
        adc     $0, w3
        add     w2, X(-8(rp,n,8),16(rp))
        adc     $0, w3
        mov     w3, X((rp,n,8),24(rp))
        jmp     L(to1)

L(r0):  mov     X((up,n,8),16(up)), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        mov     w2, X(-8(rp,n,8),8(rp))
        add     w3, w0
        adc     $0, w1
        mov     w0, X((rp,n,8),16(rp))
        mov     w1, X(8(rp,n,8),24(rp))
        add     $3, un

C outer loop(0)
L(to0): dec     vn
        jz      L(ret)
        mov     (vp), v0
        mov     (up,un,8), %rax
        lea     8(vp), vp
        lea     8(rp), rp
        mov     un, n
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        jmp     L(al0)

        ALIGN(16)
L(ta0): add     w0, -16(rp,n,8)
        adc     w1, w2
        adc     $0, w3
        mov     (up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        add     w2, -8(rp,n,8)
        adc     w3, w0
        adc     $0, w1
L(al0): mov     8(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     w0, (rp,n,8)
        adc     w1, w2
        adc     $0, w3
        mov     16(up,n,8), %rax
        mul     v0
        mov     %rax, w0
        mov     %rdx, w1
        add     w2, 8(rp,n,8)
        adc     w3, w0
        adc     $0, w1
        mov     24(up,n,8), %rax
        mul     v0
        mov     %rax, w2
        mov     %rdx, w3
        add     $4, n
        js      L(ta0)

        add     w0, X(-16(rp,n,8),8(rp))
        adc     w1, w2
        adc     $0, w3
        add     w2, X(-8(rp,n,8),16(rp))
        adc     $0, w3
        mov     w3, X((rp,n,8),24(rp))
        jmp     L(to0)

L(ret): pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        FUNC_EXIT()
        ret
EPILOGUE()