beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / powerpc64 / vmx / popcount.asm
blobb95fb88b1ae0cd38ca6f0d1e6d3f6f41d23da9d4
1 dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount.
3 dnl Copyright 2006, 2010 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C 7400,7410 (G4): ?
35 C 744x,745x (G4+): 1.125
36 C 970 (G5): 2.25
38 C TODO
39 C * Rewrite the awkward huge n outer loop code.
40 C * Two lvx, two vperm, and two vxor could make us a similar hamdist.
41 C * Compress cnsts table in 64-bit mode, only half the values are needed.
C Sizes derived from the limb width: bytes per limb, and limbs held by one
C and by two 16-byte vector registers.
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))

define(`OPERATION_popcount')

C Argument registers.
define(`ap',	`r3')
define(`n',	`r4')

C Vector registers holding loop-invariant constants: the 16-entry nibble
C popcount table and the per-byte shift count 4.
define(`rtab',	`v10')
define(`cnt4',	`v11')

C LIMB32(insn) emits insn only when limbs are 32 bits wide, LIMB64(insn) only
C when they are 64 bits wide; the other one expands to nothing.
ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C The inner loop handles up to 2^34 bits, i.e., 2^28 64-bit limbs (2^31
C bytes), due to overflow in vsum4ubs.  For large operands, we work in chunks,
C of size LIMBS_PER_CHUNK.
C
C Both values below are used as the 16-bit immediate of lis/addis, so the
C effective quantities are the values shifted left by 16 bits:
C 0x1000 << 16 = 2^28 limbs per chunk, and 0x1001 << 16 as the limb count
C above which chunking is used.
define(`LIMBS_PER_CHUNK', 0x1000)
define(`LIMBS_CHUNK_THRES', 0x1001)
C mpn_popcount -- count the set bits in the n limbs starting at ap.
C
C In:   r3 (ap) = address of the least significant limb, limb aligned
C       r4 (n)  = number of limbs.  n is expected to be at least 1: the first
C                 vector is loaded and counted unconditionally.
C Out:  r3      = number of 1 bits
C
C Uses: r0, r6-r12, ctr, cr0, cr7, v0-v13.  VRSAVE is updated on entry and
C       restored on exit.  The 16 bytes immediately below r1 are used as
C       scratch (see the FIXME at L(rt)).
C
C Method: ap is rounded down to a 16-byte boundary by lvx, and the bytes that
C precede the operand in the first vector are masked off.  Each byte is then
C split into two nibbles, each nibble is looked up in the 16-entry popcount
C table with vperm, and the per-byte counts are accumulated into 32-bit words
C with vsum4ubs.  The bytes following the operand in the last vector are
C masked off in the same way as the leading ones.

ASM_START()
PROLOGUE(mpn_popcount,toc)
	mfspr	r10, 256		C r10 = VRSAVE at entry, restored at exit
	oris	r0, r10, 0xfffc		C Set VRSAVE bit 0-13
	mtspr	256, r0

ifdef(`HAVE_ABI_mode32',
`	rldicl	n, n, 0, 32')		C zero extend n

C Load various constants into vector registers
	LEAL(	r11, cnsts)
	li	r12, 16			C offset of the second vector of a pair
	vspltisb cnt4, 4		C 0x0404...04 used as shift count

	lvx	rtab, 0, r11		C nibble popcount table

C cr7 records whether n exceeds the chunking threshold.  It is tested both
C before the first pass and again in the outer loop at the end.
LIMB64(`lis	r0, LIMBS_CHUNK_THRES	')
LIMB64(`cmpd	cr7, n, r0		')

C Fetch the aligned vector containing ap and clear the bytes below ap, using
C the low end mask selected by the offset of ap within that vector.
	lvx	v0, 0, ap
	addi	r7, r11, 80		C r7 = low end masks
	rlwinm	r6, ap, 2,26,29		C r6 = 16 * (word offset of ap in vector)
	lvx	v8, r7, r6
	vand	v0, v0, v8

LIMB32(`rlwinm	r8, ap, 30,30,31	')
LIMB64(`rlwinm	r8, ap, 29,31,31	')
	add	n, n, r8		C compensate n for rounded down ap

	vxor	v1, v1, v1
	li	r8, 0			C grand total count

	vxor	v12, v12, v12		C zero total count
	vxor	v13, v13, v13		C zero total count

	addic.	n, n, -LIMBS_PER_VR
	ble	L(sum)			C operand ends within the first vector

	addic.	n, n, -LIMBS_PER_VR
	ble	L(lsum)			C operand ends within the second vector

C For 64-bit machines, handle huge n that would overflow vsum4ubs
LIMB64(`ble	cr7, L(small)		')
LIMB64(`addis	r9, n, -LIMBS_PER_CHUNK	') C remaining n
LIMB64(`lis	n, LIMBS_PER_CHUNK	')

	ALIGN(16)
L(small):
LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
	addi	r7, r7, 1		C one more for the pair holding v0
	mtctr	r7			C copy n to count register
	b	L(ent)			C v0 is already loaded and masked

C Main loop: two vectors (32 bytes) per iteration, with independent
C accumulators v12 and v13.
	ALIGN(16)
L(top):
	lvx	v0, 0, ap
L(ent):	lvx	v1, r12, ap
	addi	ap, ap, 32
	vsrb	v8, v0, cnt4		C high nibble of each byte
	vsrb	v9, v1, cnt4
	vperm	v2, rtab, rtab, v0	C popcount of low nibbles
	vperm	v3, rtab, rtab, v8	C popcount of high nibbles
	vperm	v4, rtab, rtab, v1
	vperm	v5, rtab, rtab, v9
	vaddubm	v6, v2, v3		C popcount of each byte
	vaddubm	v7, v4, v5
	vsum4ubs v12, v6, v12		C add groups of 4 bytes into words
	vsum4ubs v13, v7, v13
	bdnz	L(top)

C Tail: between 0 and LIMBS_PER_2VR-1 limbs remain.
	andi.	n, n, eval(LIMBS_PER_2VR-1)
	beq	L(rt)

	lvx	v0, 0, ap
	vxor	v1, v1, v1
	cmpwi	n, LIMBS_PER_VR
	ble	L(sum)
L(lsum):
C Two vectors remain: the first is complete and goes to v1, the last one is
C loaded into v0 so that it gets masked at L(sum).
	vor	v1, v0, v0
	lvx	v0, r12, ap
L(sum):
C v0 is the last vector of the operand.  Clear the bytes above the operand
C using the high end mask selected by n mod LIMBS_PER_VR, then count v0 and v1
C just as in the main loop.
LIMB32(`rlwinm	r6, n, 4,26,27	')
LIMB64(`rlwinm	r6, n, 5,26,26	')
	addi	r7, r11, 16		C r7 = high end masks
	lvx	v8, r7, r6
	vand	v0, v0, v8
	vsrb	v8, v0, cnt4
	vsrb	v9, v1, cnt4
	vperm	v2, rtab, rtab, v0
	vperm	v3, rtab, rtab, v8
	vperm	v4, rtab, rtab, v1
	vperm	v5, rtab, rtab, v9
	vaddubm	v6, v2, v3
	vaddubm	v7, v4, v5
	vsum4ubs v12, v6, v12
	vsum4ubs v13, v7, v13

C Fold the eight 32-bit partial sums into r8.  The vector is passed through
C memory; stvx ignores the low four address bits, so the lwz offsets below
C assume that r1 is 16-byte aligned.
C FIXME: the 16 bytes below r1 are written without allocating a stack frame.
C This relies on the ABI protecting memory below the stack pointer, which may
C not hold for every ppc32 and ppc64 ABI (to be verified per ABI).
	ALIGN(16)
L(rt):	vadduwm	v3, v12, v13
	li	r7, -16
	stvx	v3, r7, r1

	lwz	r7, -16(r1)
	add	r8, r8, r7
	lwz	r7, -12(r1)
	add	r8, r8, r7
	lwz	r7, -8(r1)
	add	r8, r8, r7
	lwz	r7, -4(r1)
	add	r8, r8, r7

C Handle outer loop for huge n.  We inherit cr7 and r0 from above.
LIMB64(`ble	cr7, L(ret)
	vxor	v12, v12, v12		C zero total count
	vxor	v13, v13, v13		C zero total count
	mr	n, r9			C n = limbs still to be counted
	cmpd	cr7, n, r0
	ble	cr7, L(2)
	addis	r9, n, -LIMBS_PER_CHUNK	C remaining n
	lis	n, LIMBS_PER_CHUNK
L(2):	srdi	r7, n, 2		C loop count corresponding to n
	mtctr	r7			C copy n to count register
	b	L(top)
')

	ALIGN(16)
L(ret):	mr	r3, r8			C return the grand total
	mtspr	256, r10		C restore VRSAVE
	blr
EPILOGUE()
C Constant table, 16-byte aligned so that every entry can be fetched with lvx.
C Layout, as byte offsets from cnsts:
C     0        popcount of each 4-bit value, used as the vperm table
C    16..79    four masks for the high end of the number
C    80..143   four masks for the low end of the number
DEF_OBJECT(cnsts,16)
C Counts for vperm
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
C Masks for high end of number
C Keep the bytes of the last vector that belong to the operand.  The entry is
C selected by the number of limbs in that vector: 16 * (n mod 4) with 32-bit
C limbs, 32 * (n mod 2) with 64-bit limbs.  Index 0 means a full vector.
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
C Clear the bytes of the first vector that lie below ap.  The entry is
C selected by the 4-byte word offset of ap within its 16-byte vector.
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(cnsts)
ASM_END()