beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium4 / sse2 / mul_1.asm
blob6347b8bf624b8cfa7500f5d89f707377a4fe2217
1 dnl mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
3 dnl Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C cycles/limb
35 C P6 model 0-8,10-12 -
36 C P6 model 9 (Banias) 4.17
37 C P6 model 13 (Dothan) 4.17
38 C P4 model 0-1 (Willamette) 4
39 C P4 model 2 (Northwood) 4
40 C P4 model 3-4 (Prescott) 4.55
42 C TODO:
43 C * Tweak eax/edx offsets in loop as to save some lea's
44 C * Perhaps software pipeline small-case code
46 C INPUT PARAMETERS
47 C rp sp + 4
48 C up sp + 8
49 C n sp + 12
50 C v0 sp + 16
C -----------------------------------------------------------------------------
C mpn_mul_1 (rp, up, n, v0)
C
C Multiply the n 32-bit limbs at up by the single limb v0, store the low
C 32 bits of each running sum at rp, and return the final carry limb in eax.
C
C Stack arguments on entry (shared with mpn_mul_1c, which enters at the
C ent label with a caller supplied carry-in already in mm6):
C	rp	sp + 4
C	up	sp + 8
C	n	sp + 12
C	v0	sp + 16
C
C Register roles:
C	eax	source pointer (up), biased per entry case
C	edx	destination pointer (rp), biased per entry case
C	ecx	n on entry; in the big path (n mod 4) - n, a negative multiple
C		of 4 that is stepped by 4 up to zero
C	mm7	v0
C	mm6	64-bit accumulator; low half is the limb to store, high half
C		is the carry into the next limb
C	mm0-mm3	limbs / products in flight
C
C n < 4 is handled by a simple one-limb-per-iteration loop.  That loop tests
C its counter only after processing a limb, so n is assumed to be >= 1.
C n >= 4 uses a 4-way unrolled loop in which loads run three limbs ahead of
C the stores; it is entered at one of four points selected by n mod 4, and
C the code at the end label drains the three products still in flight.
C -----------------------------------------------------------------------------

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_1)
	pxor	%mm6, %mm6		C carry-in = 0
L(ent):	mov	4(%esp), %edx		C edx = rp
	mov	8(%esp), %eax		C eax = up
	mov	12(%esp), %ecx		C ecx = n
	movd	16(%esp), %mm7		C mm7 = v0
	cmp	$4, %ecx
	jnc	L(big)			C unsigned n >= 4

C Small case, n < 4: one limb per iteration.
L(lp0):	movd	(%eax), %mm0
	lea	4(%eax), %eax
	lea	4(%edx), %edx
	pmuludq	%mm7, %mm0		C 32x32 -> 64 bit product
	paddq	%mm0, %mm6		C add product to incoming carry
	movd	%mm6, -4(%edx)		C store low limb (edx already advanced)
	psrlq	$32, %mm6		C keep high half as next carry
	dec	%ecx
	jnz	L(lp0)
	movd	%mm6, %eax		C return carry-out
	emms
	ret

C Big case, n >= 4: dispatch on n mod 4 to the matching loop entry.
L(big):	and	$3, %ecx
	je	L(0)
	cmp	$2, %ecx
	jc	L(1)
	je	L(2)
	jmp	L(3)			C FIXME: one case should fall through

C Each entry stub below preloads three limbs, multiplies the first two, sets
C ecx = (n mod 4) - n, and biases eax/edx so that the fixed offsets used
C inside the loop address the right limbs for its entry point.

L(0):	movd	(%eax), %mm3
	sub	12(%esp), %ecx		C loop count
	lea	-16(%eax), %eax
	lea	-12(%edx), %edx
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	pmuludq	%mm7, %mm0
	movd	24(%eax), %mm1
	jmp	L(00)

L(1):	movd	(%eax), %mm2
	sub	12(%esp), %ecx		C loop count
	lea	-12(%eax), %eax
	lea	-8(%edx), %edx
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	jmp	L(01)

L(2):	movd	(%eax), %mm1
	sub	12(%esp), %ecx		C loop count
	lea	-8(%eax), %eax
	lea	-4(%edx), %edx
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	jmp	L(10)

L(3):	movd	(%eax), %mm0
	sub	12(%esp), %ecx		C loop count
	lea	-4(%eax), %eax		C edx needs no bias for this entry
	pmuludq	%mm7, %mm0
	movd	8(%eax), %mm1
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
					C falls through into the loop

C Main loop: four limbs per pass.  Each stage multiplies the limb loaded in
C the previous stage, adds the product from two stages back into mm6, loads
C a new limb, stores the low half of mm6, and shifts the carry down.
	ALIGN(16)
L(top):	pmuludq	%mm7, %mm2
	paddq	%mm0, %mm6
	movd	16(%eax), %mm3
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
L(10):	pmuludq	%mm7, %mm3
	paddq	%mm1, %mm6
	movd	20(%eax), %mm0
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
L(01):	pmuludq	%mm7, %mm0
	paddq	%mm2, %mm6
	movd	24(%eax), %mm1
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
L(00):	pmuludq	%mm7, %mm1
	paddq	%mm3, %mm6
	movd	28(%eax), %mm2
	movd	%mm6, 12(%edx)
	psrlq	$32, %mm6
	lea	16(%eax), %eax
	lea	16(%edx), %edx
	add	$4, %ecx		C step the negative count towards zero
	ja	L(top)			C loop until the count reaches zero

C Wind-down: mm0 and mm1 already hold products, mm2 holds the last limb
C loaded.  Emit the final three result limbs without loading any more.
L(end):	pmuludq	%mm7, %mm2
	paddq	%mm0, %mm6
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
	paddq	%mm1, %mm6
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
	paddq	%mm2, %mm6
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
	movd	%mm6, %eax		C return carry-out
	emms
	ret				C must return here; otherwise execution
					C runs on into mpn_mul_1c below
EPILOGUE()
C -----------------------------------------------------------------------------
C mpn_mul_1c: carry-in variant of mpn_mul_1.
C
C Takes the same four stack arguments as mpn_mul_1 (rp, up, n, v0 at sp + 4
C through sp + 16) plus a fifth 32-bit value at sp + 20.  That value is
C loaded into mm6, the accumulator that mpn_mul_1 otherwise zeroes, and
C control then joins the shared body at the ent label.  The stack is left
C untouched, so the argument offsets used by the shared body still apply and
C the shared body returns directly to this function's caller.
C -----------------------------------------------------------------------------
PROLOGUE(mpn_mul_1c)
	movd	20(%esp), %mm6		C mm6 = initial carry limb
	jmp	L(ent)			C continue in the mpn_mul_1 body
EPILOGUE()