dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n,
dnl  mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise
dnl  logical operations.
dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')
C                  and,ior,andn,nior,xor    iorn,xnor      nand
C                       cycles/limb        cycles/limb  cycles/limb
C 7400,7410 (G4):          1.39                 ?            ?
C 744x,745x (G4+):         1.14                1.39         1.39
C STATUS
C  * Works for all sizes and alignment for 32-bit limbs.
C  * Works for n >= 4 for 64-bit limbs; untested for smaller operands.
C  * Current performance makes this pointless for 970.
C TODO
C  * Might want to make variants when just one of the source operands needs
C    vperm, and when neither needs it.  The latter runs 50% faster on 7400.
C  * Idea: If the source operands are equally aligned, we could do the logops
C    first, then vperm before storing!  That means we never need more than one
C    vperm; see the sketch after this list.
C  * Perhaps align `rp' after initial alignment loop?
C  * Instead of having scalar code in the beginning and end, consider using
C    read-modify-write vector code.
C  * Software pipeline?  Hopefully not too important, this is hairy enough
C    already.
C  * At least be more clever about operand loading, i.e., load v operands
C    before u operands, since v operands are sometimes negated.
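C
C A minimal sketch of the equal-alignment idea above, in the style of the
C pseudocode at the end of this file (load_vr/store_vr/store_mask are
C hypothetical helpers, not GMP names):
C
C	if ((up & 15) == (vp & 15))	/* sources equally misaligned */
C	  v0 = load_vr (up);		/* chunks line up, so the logop */
C	  v1 = load_vr (vp);		/* can run before any vperm     */
C	  v2 = logop (v0, v1);
C	  store_vr (rp, vperm (v2prev, v2, store_mask));  /* one vperm total */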
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
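C For example (assuming only that a VMX register is 16 bytes wide): with
C 32-bit limbs GMP_LIMB_BYTES is 4, so LIMBS_PER_VR is 4 and LIMBS_PER_2VR
C is 8; with 64-bit limbs the values are 8, 2, and 4 respectively.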
define(`vnegb', `')		C default neg-before to null
define(`vnega', `')		C default neg-after to null
ifdef(`OPERATION_and_n',
`	define(`func',	`mpn_and_n')
	define(`logopS',`and	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')')
ifdef(`OPERATION_andn_n',
`	define(`func',	`mpn_andn_n')
	define(`logopS',`andc	$1,$2,$3')
	define(`logop',	`vandc	$1,$2,$3')')
ifdef(`OPERATION_nand_n',
`	define(`func',	`mpn_nand_n')
	define(`logopS',`nand	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')
	define(`vnega',	`vnor	$1,$2,$2')')
ifdef(`OPERATION_ior_n',
`	define(`func',	`mpn_ior_n')
	define(`logopS',`or	$1,$2,$3')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_iorn_n',
`	define(`func',	`mpn_iorn_n')
	define(`logopS',`orc	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_nior_n',
`	define(`func',	`mpn_nior_n')
	define(`logopS',`nor	$1,$2,$3')
	define(`logop',	`vnor	$1,$2,$3')')
ifdef(`OPERATION_xor_n',
`	define(`func',	`mpn_xor_n')
	define(`logopS',`xor	$1,$2,$3')
	define(`logop',	`vxor	$1,$2,$3')')
ifdef(`OPERATION_xnor_n',
`	define(`func',	`mpn_xnor_n')
	define(`logopS',`eqv	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vxor	$1,$2,$3')')
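C For reference, the word-level semantics selected above, as a hedged C
C sketch (mp_limb_t stands for the GMP limb type; the trailing comments
C note how the VMX variant composes the result from logop/vnegb/vnega):
C
C	mp_limb_t and_n  (mp_limb_t a, mp_limb_t b) { return a & b;    }
C	mp_limb_t andn_n (mp_limb_t a, mp_limb_t b) { return a & ~b;   }
C	mp_limb_t nand_n (mp_limb_t a, mp_limb_t b) { return ~(a & b); }  /* vand, then vnega */
C	mp_limb_t ior_n  (mp_limb_t a, mp_limb_t b) { return a | b;    }
C	mp_limb_t iorn_n (mp_limb_t a, mp_limb_t b) { return a | ~b;   }  /* vnegb on b, then vor */
C	mp_limb_t nior_n (mp_limb_t a, mp_limb_t b) { return ~(a | b); }
C	mp_limb_t xor_n  (mp_limb_t a, mp_limb_t b) { return a ^ b;    }
C	mp_limb_t xnor_n (mp_limb_t a, mp_limb_t b) { return ~(a ^ b); }  /* vnegb on b, then vxor */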
ifelse(GMP_LIMB_BITS,`32',`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')
MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
LIMB32(`cmpwi	cr0, n, 8	')
LIMB64(`cmpdi	cr0, n, 4	')
LIMB32(`lwz	r8, 0(up)	')
LIMB32(`lwz	r9, 0(vp)	')
LIMB32(`logopS(	r0, r8, r9)	')
LIMB32(`stw	r0, 0(rp)	')
LIMB32(`bdz	L(endS)	')
LIMB32(`lwzu	r8, 4(up)	')
LIMB64(`ld	r8, 0(up)	')
LIMB64(`addi	up, up, GMP_LIMB_BYTES	')
LIMB32(`lwzu	r9, 4(vp)	')
LIMB64(`ld	r9, 0(vp)	')
LIMB64(`addi	vp, vp, GMP_LIMB_BYTES	')
LIMB32(`stwu	r0, 4(rp)	')
LIMB64(`std	r0, 0(rp)	')
LIMB64(`addi	rp, rp, GMP_LIMB_BYTES	')
L(big):	mfspr	r12, 256
	oris	r0, r12, 0xfffc		C set VRSAVE bits 0-13 FIXME
C First loop until the destination is 16-byte aligned.  This will execute
C 0 or 1 times for 64-bit machines, and 0 to 3 times for 32-bit machines.
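C For example (a hedged worked case): with 32-bit limbs and rp 8 bytes past
C a 16-byte boundary, r0 = (rp >> 2) mod 4 = 2 below, so r7 = 4 - 2 = 2
C limbs are handled by this scalar loop before the vector loop starts.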
LIMB32(`rlwinm.	r0, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r0, rp, 29,31,31')	C (rp >> 3) mod 2
	subfic	r7, r0, LIMBS_PER_VR
LIMB32(`lwz	r8, 0(up)	')
LIMB64(`ld	r8, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`lwz	r9, 0(vp)	')
LIMB64(`ld	r9, 0(vp)	')
	addi	vp, vp, GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top0)	')
	addi	rp, rp, 16		C update rp, but preserve its alignment
LIMB64(`srdi	r7, n, 1	')	C loop count, n / LIMBS_PER_VR
LIMB32(`srwi	r7, n, 2	')	C loop count, n / LIMBS_PER_VR
	mtctr	r7			C copy loop count into count register
L(gt1):	addi	up, up, 16
L(top):	lvx	v0, 0, up
1:	vperm	v4, v2, v0, us
L(end):	andi.	r0, up, 15
1:	vperm	v4, v0, v2, us
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
LIMB32(`rlwinm	rp, rp, 0,0,27	')	C clear rp's low 4 bits (align down to 16)
LIMB64(`rldicr	rp, rp, 0,59	')	C clear rp's low 4 bits (align down to 16)
LIMB32(`lwzx	r8, r10, up	')
LIMB64(`ldx	r8, r10, up	')
LIMB32(`lwzx	r9, r10, vp	')
LIMB64(`ldx	r9, r10, vp	')
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)	')
L(ret):	mtspr	256, r12		C restore VRSAVE
C This works for 64-bit PowerPC, since a limb ptr can only be aligned
C in 2 relevant ways, which means we can always find a pair of aligned
C pointers among rp, up, and vp.
C   process words until rp is 16-byte aligned
C   if (((up | vp) & 15) == 0)
C     process with VMX without any vperm
C   else if ((up & 15) != 0 && (vp & 15) != 0)
C     process with VMX using vperm on store data
C   else if ((up & 15) != 0)
C     process with VMX using vperm on up data
C   else
C     process with VMX using vperm on vp data
C
C   rlwinm	r0, up, 0,28,31
C   rlwinm	r0, vp, 0,28,31
C   crand	cr0, cr0, cr7
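C
C (Added note: on 64-bit, a limb pointer's offset within a 16-byte block is
C either 0 or 8, so by pigeonhole two of rp/up/vp always agree; once rp is
C aligned, at most one case above needs vperm.  A hedged C rendition of the
C test, with illustrative names only:)
C
C	unsigned um = (unsigned) up & 15;
C	unsigned vm = (unsigned) vp & 15;
C	if ((um | vm) == 0)		/* both sources aligned: no vperm   */
C	else if (um != 0 && vm != 0)	/* both misaligned: vperm the store */
C	else if (um != 0)		/* only up misaligned: vperm up data */
C	else				/* only vp misaligned: vperm vp data */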