beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium4 / sse2 / addlsh1_n.asm
blob93b63b2018dfeabeb60231ef4ac0953dd7b185d3
1 dnl Intel Pentium-4 mpn_addlsh1_n -- mpn x+2*y.
3 dnl Copyright 2001-2004, 2006 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C cycles/limb
35 C dst!=src1,2 dst==src1 dst==src2
36 C P6 model 0-8,10-12 -
37 C P6 model 9 (Banias) ?
38 C P6 model 13 (Dothan) ?
39 C P4 model 0-1 (Willamette) ?
40 C P4 model 2 (Northwood) 4.25 6 6
41 C P4 model 3-4 (Prescott) 5 8.5 8.5
43 C The slightly strange combination of indexing and pointer incrementing
44 C that's used seems to work best. Not sure why, but %ecx,4 with src1 and/or
45 C src2 is a slowdown.
47 C The dependent chain is simply the paddq of x+2*y to the previous carry,
48 C then psrlq to get the new carry. That makes 4 c/l the target speed, which
49 C is almost achieved for separate src/dst but when src==dst the write
50 C combining anomalies slow it down.
dnl  Frame offsets of the four stack parameters, lowest offset first.
defframe(PARAM_DST,  4)
defframe(PARAM_SRC1, 8)
defframe(PARAM_SRC2, 12)
defframe(PARAM_SIZE, 16)

dnl  No extra stack is allocated for the ebx save: it re-uses the src1
dnl  parameter slot, which is free once src1 has been loaded into a register.
define(SAVE_EBX,`PARAM_SRC1')
60 TEXT
61 ALIGN(8)
63 PROLOGUE(mpn_addlsh1_n)
64 deflit(`FRAME',0)
66 mov PARAM_SRC1, %eax
67 mov %ebx, SAVE_EBX
69 mov PARAM_SRC2, %ebx
70 pxor %mm0, %mm0 C initial carry
72 mov PARAM_DST, %edx
74 mov PARAM_SIZE, %ecx
76 lea (%edx,%ecx,4), %edx C dst end
77 neg %ecx C -size
79 L(top):
80 C eax src1 end
81 C ebx src2 end
82 C ecx counter, limbs, negative
83 C edx dst end
84 C mm0 carry
86 movd (%ebx), %mm2
87 movd (%eax), %mm1
88 psrlq $32, %mm0
89 lea 4(%eax), %eax
90 lea 4(%ebx), %ebx
92 psllq $1, %mm2
93 paddq %mm2, %mm1
95 paddq %mm1, %mm0
97 movd %mm0, (%edx,%ecx,4)
98 add $1, %ecx
99 jnz L(top)
102 psrlq $32, %mm0
103 mov SAVE_EBX, %ebx
104 movd %mm0, %eax
105 emms
108 EPILOGUE()