dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n,
dnl  mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise
dnl  logical operations.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                   and,ior,andn,nior,xor    iorn,xnor       nand
C                        cycles/limb        cycles/limb   cycles/limb
C 7400,7410 (G4):           1.39                 ?             ?
C 744x,745x (G4+):          1.14                1.39          1.39
C 970:                      1.7                 2.0           2.0

C STATUS
C  * Works for all sizes and alignments for 32-bit limbs.
C  * Works for n >= 4 for 64-bit limbs; untested for smaller operands.
C  * Current performance makes this pointless for 970.

C TODO
C  * Might want to make variants when just one of the source operands needs
C    vperm, and when neither needs it.  The latter runs 50% faster on 7400.
C  * Idea: If the source operands are equally aligned, we could do the logops
C    first, then vperm before storing!  That means we never need more than one
C    vperm, ever!  (See the sketch after this list.)
C  * Perhaps align `rp' after initial alignment loop?
C  * Instead of having scalar code in the beginning and end, consider using
C    read-modify-write vector code.
C  * Software pipeline?  Hopefully not too important, this is hairy enough
C    already.
C  * At least be more clever about operand loading, i.e., load v operands before
C    u operands, since v operands are sometimes negated.

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
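
C (For instance: with 32-bit limbs GMP_LIMB_BYTES is 4, giving LIMBS_PER_VR = 4
C and LIMBS_PER_2VR = 8; with 64-bit limbs the values are 2 and 4, i.e. a
C 16-byte vector register always holds 16/GMP_LIMB_BYTES limbs.)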

define(`vnegb', `')		C default neg-before to null
define(`vnega', `')		C default neg-after to null

ifdef(`OPERATION_and_n',
`	define(`func',	`mpn_and_n')
	define(`logopS',`and	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')')
ifdef(`OPERATION_andn_n',
`	define(`func',	`mpn_andn_n')
	define(`logopS',`andc	$1,$2,$3')
	define(`logop',	`vandc	$1,$2,$3')')
ifdef(`OPERATION_nand_n',
`	define(`func',	`mpn_nand_n')
	define(`logopS',`nand	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')
	define(`vnega',	`vnor	$1,$2,$2')')
ifdef(`OPERATION_ior_n',
`	define(`func',	`mpn_ior_n')
	define(`logopS',`or	$1,$2,$3')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_iorn_n',
`	define(`func',	`mpn_iorn_n')
	define(`logopS',`orc	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_nior_n',
`	define(`func',	`mpn_nior_n')
	define(`logopS',`nor	$1,$2,$3')
	define(`logop',	`vnor	$1,$2,$3')')
ifdef(`OPERATION_xor_n',
`	define(`func',	`mpn_xor_n')
	define(`logopS',`xor	$1,$2,$3')
	define(`logop',	`vxor	$1,$2,$3')')
ifdef(`OPERATION_xnor_n',
`	define(`func',	`mpn_xnor_n')
	define(`logopS',`eqv	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vxor	$1,$2,$3')')
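
C (Illustrative summary, not in the original: the scalar path logopS always
C has a native instruction available, while the VMX path synthesizes the
C three operations lacking a direct vector opcode from vand/vor/vxor plus a
C vnor-based complement:
C	nand:	vnega after  vand	->  ~(u & v)
C	iorn:	vnegb before vor	->   u | ~v
C	xnor:	vnegb before vxor	->   u ^ ~v  ==  ~(u ^ v) )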

ifelse(GMP_LIMB_BITS,`32',`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp',	`r3')
define(`up',	`r4')
define(`vp',	`r5')
define(`n',	`r6')

define(`us',	`v8')
define(`vs',	`v9')
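
C (us and vs hold the permute control vectors produced by lvsl for up and vp
C respectively; they steer the vperm-based unaligned loads below.)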

MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)

ASM_START()
PROLOGUE(func)

LIMB32(`cmpwi	cr0, n, 8	')
LIMB64(`cmpdi	cr0, n, 4	')
	bge	L(big)
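
C (Fewer than two vector registers' worth of limbs, i.e. n < 8 with 32-bit
C limbs or n < 4 with 64-bit limbs, is not worth the VMX setup; such operands
C are handled entirely by the scalar loop below.)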

	mtctr	n

LIMB32(`lwz	r8, 0(up)	')
LIMB32(`lwz	r9, 0(vp)	')
LIMB32(`logopS(	r0, r8, r9)	')
LIMB32(`stw	r0, 0(rp)	')
LIMB32(`bdz	L(endS)	')

L(topS):
LIMB32(`lwzu	r8, 4(up)	')
LIMB64(`ld	r8, 0(up)	')
LIMB64(`addi	up, up, GMP_LIMB_BYTES	')
LIMB32(`lwzu	r9, 4(vp)	')
LIMB64(`ld	r9, 0(vp)	')
LIMB64(`addi	vp, vp, GMP_LIMB_BYTES	')
	logopS(	r0, r8, r9)
LIMB32(`stwu	r0, 4(rp)	')
LIMB64(`std	r0, 0(rp)	')
LIMB64(`addi	rp, rp, GMP_LIMB_BYTES	')
	bdnz	L(topS)
L(endS):
	blr

L(big):	mfspr	r12, 256
	oris	r0, r12, 0xfffc		C set VRSAVE bits 0-13  FIXME
	mtspr	256, r0

C First loop until the destination is 16-byte aligned.  This will execute
C 0 or 1 times for 64-bit machines, and 0 to 3 times for 32-bit machines.
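C (Worked example: on a 32-bit machine with rp % 16 == 8, the rlwinm below
C extracts r0 = 2, so r7 = LIMBS_PER_VR - r0 = 2 limbs are processed here
C before the vector code takes over.)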

LIMB32(`rlwinm.	r0, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r0, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(aligned)

	subfic	r7, r0, LIMBS_PER_VR
LIMB32(`li	r10, 0	')
	subf	n, r7, n
L(top0):
LIMB32(`lwz	r8, 0(up)	')
LIMB64(`ld	r8, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`lwz	r9, 0(vp)	')
LIMB64(`ld	r9, 0(vp)	')
	addi	vp, vp, GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top0)	')

	addi	rp, rp, 16		C update rp, but preserve its alignment
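C (This works because lvx/stvx ignore the low four bits of the effective
C address: the vector stores below snap to the 16-byte boundary no matter
C what scalar offset is still encoded in rp.)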

L(aligned):
LIMB64(`srdi	r7, n, 1	')	C loop count corresponding to n
LIMB32(`srwi	r7, n, 2	')	C loop count corresponding to n
	mtctr	r7			C copy n to count register

	li	r10, 16
	lvsl	us, 0, up
	lvsl	vs, 0, vp
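
C (lvsl derives a permute mask from the low four address bits; a vperm of two
C adjacent aligned lvx loads through that mask reconstructs the unaligned
C 16-byte word.  This is the standard AltiVec unaligned-load idiom.)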

	lvx	v2, 0, up
	lvx	v3, 0, vp
	bdnz	L(gt1)
	lvx	v0, r10, up
	lvx	v1, r10, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 4
	b	L(tail)

L(gt1):	addi	up, up, 16
	addi	vp, vp, 16

L(top):	lvx	v0, 0, up
	lvx	v1, 0, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	bdz	L(end)
	lvx	v2, r10, up
	lvx	v3, r10, vp
	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp
	addi	up, up, 32
	addi	vp, vp, 32
	addi	rp, rp, 32
	bdnz	L(top)
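
C (Fell out of the loop: one last result vector must still be assembled from
C v2/v3 plus the first bytes of the next 16-byte block.  When a source
C pointer is 16-byte aligned, that block holds no operand data and loading it
C could fault, so zeros are substituted; the vperm mask then selects bytes
C from v2/v3 only.)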
	andi.	r0, up, 15
	vxor	v0, v0, v0
	beq	1f
	lvx	v0, 0, up
1:	andi.	r0, vp, 15
	vxor	v1, v1, v1
	beq	1f
	lvx	v1, 0, vp
1:	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	rp, rp, 4
	b	L(tail)

L(end):	andi.	r0, up, 15
	vxor	v2, v2, v2
	beq	1f
	lvx	v2, r10, up
1:	andi.	r0, vp, 15
	vxor	v3, v3, v3
	beq	1f
	lvx	v3, r10, vp
1:	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp

	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 20

L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
	addi	rp, rp, 15
LIMB32(`rlwinm	rp, rp, 0,0,27	')
LIMB64(`rldicr	rp, rp, 0,59	')
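C (Rounds rp up to the next 16-byte boundary.  Together with the rp
C adjustments of 4 and 20 above, this lands exactly past the last vector
C store, where the remaining scalar limbs belong.)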
	li	r10, 0
L(top2):
LIMB32(`lwzx	r8, r10, up	')
LIMB64(`ldx	r8, r10, up	')
LIMB32(`lwzx	r9, r10, vp	')
LIMB64(`ldx	r9, r10, vp	')
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)	')

L(ret):	mtspr	256, r12
	blr
EPILOGUE()

C This works for 64-bit PowerPC, since a limb ptr can only be aligned in two
C relevant ways, which means that among rp, up, and vp we can always find a
C pair of equally aligned pointers.
C	process words until rp is 16-byte aligned
C	if (((up | vp) & 15) == 0)
C	  process with VMX without any vperm
C	else if ((up & 15) != 0 && (vp & 15) != 0)
C	  process with VMX using vperm on store data
C	else if ((up & 15) != 0)
C	  process with VMX using vperm on up data
C	else
C	  process with VMX using vperm on vp data

C	rlwinm.	r0, up, 0,28,31
C	rlwinm	r0, vp, 0,28,31
C	cmpwi	cr7, r0, 0
C	cror	cr6, cr0, cr7
C	crand	cr0, cr0, cr7
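
C (One possible reading of the fragment above: with cr0's EQ bit set when up
C is 16-byte aligned and cr7's EQ bit set when vp is, crand yields "both
C aligned" and cror "at least one aligned", which is enough to select among
C the four cases listed.)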