dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount.

dnl  Copyright 2006, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')
C                          cycles/limb
C 744x,745x (G4+):              1.125

C TODO
C  * Rewrite the awkward huge n outer loop code.
C  * Two lvx, two vperm, and two vxor could make us a similar hamdist.
C  * Compress cnsts table in 64-bit mode, only half the values are needed.
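C The method: the popcount of each 4-bit nibble is kept in a 16-byte table
C (the rtab vector; its values are the first two .byte rows of the cnsts
C table at the end of this file).  vperm translates 16 nibbles at a time
C through that table, and vsum4ubs folds the resulting byte counts into
C 32-bit word accumulators.  An illustrative scalar C rendering of the same
C nibble-table idea (not part of the build; these names are ours):
C
C	static const unsigned char nibble_popc[16] =
C	  {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
C	static unsigned int byte_popc (unsigned char b)
C	{
C	  return nibble_popc[b & 0xf] + nibble_popc[b >> 4];
C	}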
define(`GMP_LIMB_BYTES',  eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',    eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR',   eval(32/GMP_LIMB_BYTES))

define(`OPERATION_popcount')
ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')
C The inner loop handles up to 2^34 bits, i.e., 2^28 64-bit limbs, due to
C overflow in vsum4ubs.  For large operands, we work in chunks, of size
C LIMBS_PER_CHUNK.
define(`LIMBS_PER_CHUNK',   0x1000)
define(`LIMBS_CHUNK_THRES', 0x1001)
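C Why 2^34 bits is safe: each vsum4ubs adds at most four byte counts of 8
C into a 32-bit accumulator word per iteration, i.e. at most 32 per word for
C every 256 bits processed, so 2^34 bits accumulate to at most 2^31 per
C word, well below 2^32.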
PROLOGUE(mpn_popcount,toc)
	oris	r0, r10, 0xfffc		C Set VRSAVE bits 0-13

ifdef(`HAVE_ABI_mode32',
`	rldicl	n, n, 0, 32')		C zero extend n

C Load various constants into vector registers
	vspltisb cnt4, 4		C 0x0404...04 used as shift count
LIMB64(`lis	r0, LIMBS_CHUNK_THRES	')
LIMB64(`cmpd	cr7, n, r0		')

	rlwinm	r6, ap, 2,26,29

LIMB32(`rlwinm	r8, ap, 30,30,31	')
LIMB64(`rlwinm	r8, ap, 29,31,31	')
	add	n, n, r8		C compensate n for rounded down `ap'
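C r8 is the number of limbs between `ap' and the 16-byte boundary it is
C rounded down to; growing n by r8 keeps the limb count consistent, and the
C end masks in cnsts exist to cancel the extra bytes this pulls in.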
	li	r8, 0			C grand total count

	vxor	v12, v12, v12		C zero total count
	vxor	v13, v13, v13		C zero total count

	addic.	n, n, -LIMBS_PER_VR

	addic.	n, n, -LIMBS_PER_VR
C For 64-bit machines, handle huge n that would overflow vsum4ubs
LIMB64(`ble	cr7, L(small)		')
LIMB64(`addis	r9, n, -LIMBS_PER_CHUNK	')	C remaining n
LIMB64(`lis	n, LIMBS_PER_CHUNK	')
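C addis/lis take the immediate shifted left 16 bits, so the chunk processed
C now is LIMBS_PER_CHUNK * 2^16 = 2^28 limbs, with the remainder kept in r9.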
LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
	mtctr	r7			C copy n to count register
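C Main loop: two 16-byte vectors per iteration; each byte's low and high
C nibble (cnt4 provides the 4-bit shift) is looked up in rtab via vperm, and
C the per-byte counts are summed into the word accumulators v12/v13 with
C vsum4ubs.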
L(ent):	lvx	v1, r12, ap

	vperm	v2, rtab, rtab, v0
	vperm	v3, rtab, rtab, v8
	vperm	v4, rtab, rtab, v1
	vperm	v5, rtab, rtab, v9

	vsum4ubs v12, v6, v12
	vsum4ubs v13, v7, v13

	andi.	n, n, eval(LIMBS_PER_2VR-1)

	cmpwi	n, LIMBS_PER_VR
LIMB32(`rlwinm	r6, n, 4,26,27	')
LIMB64(`rlwinm	r6, n, 5,26,26	')

	vperm	v2, rtab, rtab, v0
	vperm	v3, rtab, rtab, v8
	vperm	v4, rtab, rtab, v1
	vperm	v5, rtab, rtab, v9

	vsum4ubs v12, v6, v12
	vsum4ubs v13, v7, v13
L(rt):	vadduwm	v3, v12, v13
	li	r7, -16			C FIXME: do all ppc32 and ppc64 ABIs
	stvx	v3, r7, r1		C FIXME: ...support storing below sp?
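C The four word partial sums in v3 are spilled just below the stack pointer
C so they can be picked up in GPRs and added into the grand total kept in r8.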
C Handle outer loop for huge n.  We inherit cr7 and r0 from above.
LIMB64(`ble	cr7, L(ret)
	vxor	v12, v12, v12		C zero total count
	vxor	v13, v13, v13		C zero total count

	addis	r9, n, -LIMBS_PER_CHUNK	C remaining n
	lis	n, LIMBS_PER_CHUNK
L(2):	srdi	r7, n, 2		C loop count corresponding to n
	mtctr	r7			C copy n to count register
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
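C The first two rows of this table are the popcount of each 4-bit value,
C i.e. the vperm lookup table rtab; the 0xff/0x00 rows are byte masks used
C to blank out the bytes of the first and last vectors that lie outside the
C operand.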