dnl  PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.

dnl  Copyright 2002, 2003, 2005-2007, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')
C                    cycles/limb
C 603e:                -
C 604e:                -
C 75x (G3):            -
C 7400,7410 (G4):      1        simple load-use scheduling results in 0.75
C 744x,745x (G4+):     0.75
C ppc970:              0.75
C power4:               -
C power5:               -
C TODO
C  * Either start using the low-end masking constants, or remove them.
C  * Merge multiple feed-in cases into a parameterized code block.
C  * Reduce register usage.  It should be possible to almost halve it.
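
C The function returns a word congruent to {up,n} mod 2^24-1, using
C 2^24 == 1 (mod 2^24-1) to fold each limb into 24-bit fields.  As a
C rough C reference of the intended result (an illustrative sketch,
C not part of the build; the name is made up):
C
C   #include <stdint.h>
C   uint32_t ref_mod_34lsub1 (const uint32_t *up, long n)
C   {
C     uint64_t acc = 0;
C     for (long i = 0; i < n; i++)
C       {
C         int s = (int) (i % 3) * 8;    /* 2^(32*i) == 2^s (mod 2^24-1) */
C         acc += ((uint64_t) up[i] << s) & 0xffffff;  /* low field  */
C         acc += (uint64_t) up[i] >> (24 - s);        /* high spill */
C       }
C     while (acc >> 32)                 /* fold into 32 bits */
C       acc = (acc & 0xffffff) + (acc >> 24);
C     return (uint32_t) acc;  /* congruent mod 2^24-1, not fully reduced */
C   }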
define(`up', `r3')
define(`n',  `r4')

define(`a0', `v3')
define(`a1', `v4')
define(`a2', `v5')
define(`c0', `v6')
define(`c1', `v7')
define(`c2', `v8')
define(`z',  `v9')
define(`x0', `v10')
define(`x1', `v11')
define(`x2', `v12')
define(`x3', `v13')
define(`pv', `v14')
define(`y0', `v0')
define(`y1', `v1')
define(`y2', `v2')
define(`y3', `v15')
ASM_START()
PROLOGUE(mpn_mod_34lsub1)
	cmpwi	cr0, n, 20		C tuned cutoff point
	bge	L(large)

	li	r9, 0			C result accumulator
	mulli	r10, n, 0xb		C 0xb = ceil(32/3)
	srwi.	r10, r10, 5		C r10 = floor(n/3), n < 32
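
C The mulli/srwi. pair above divides by 3 without a divide instruction:
C floor(n*11/32) == floor(n/3) for all 0 <= n < 32 (e.g. n = 31:
C 11*31 = 341 and 341 >> 5 = 10 = floor(31/3)).  Illustrative C check:
C
C   for (unsigned k = 0; k < 32; k++)
C     assert (k * 11 >> 5 == k / 3);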
	beq	L(small_tail)
	mtctr	r10
	lwz	r6, 0(up)
	lwz	r7, 4(up)
	lwzu	r8, 8(up)
	subf	n, r10, n
	subf	n, r10, n
	subf	n, r10, n
	bdz	L(small_end)

	ALIGN(16)
L(los):	rlwinm	r0, r6, 0,8,31
	add	r9, r9, r0		C add 24b from u0
	srwi	r0, r6, 24
	lwz	r6, 4(up)
	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
	add	r9, r9, r0		C add 8b from u0 and 16b from u1
	srwi	r0, r7, 16
	lwz	r7, 8(up)
	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
	add	r9, r9, r0		C add 16b from u1 and 8b from u2
	srwi	r0, r8, 8		C --222222
	lwzu	r8, 12(up)
	add	r9, r9, r0		C add 24b from u2
	bdnz	L(los)
L(small_end):
	rlwinm	r0, r6, 0,8,31
	add	r9, r9, r0		C add 24b from u0
	srwi	r0, r6, 24
	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
	add	r9, r9, r0		C add 8b from u0 and 16b from u1
	srwi	r0, r7, 16
	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
	add	r9, r9, r0		C add 16b from u1 and 8b from u2
	srwi	r0, r8, 8		C --222222
	add	r9, r9, r0		C add 24b from u2

	addi	up, up, 4
	rlwinm	r0, r9, 0,8,31
	srwi	r9, r9, 24
	add	r9, r9, r0
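
C The three lines above fold the scalar accumulator once, using
C 2^24 == 1 (mod 2^24-1); in C terms (illustrative):
C
C   r9 = (r9 & 0xffffff) + (r9 >> 24);   /* now fits in 25 bits */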
L(small_tail):
	cmpwi	cr0, n, 1
	blt	L(ret)

	lwz	r6, 0(up)
	rlwinm	r0, r6, 0,8,31
	srwi	r6, r6, 24
	add	r9, r9, r0
	add	r9, r9, r6

	beq	L(ret)

	lwz	r6, 4(up)
	rlwinm	r0, r6, 8,8,23
	srwi	r6, r6, 16
	add	r9, r9, r0
	add	r9, r9, r6

L(ret):	mr	r3, r9
	blr
L(large):
	stwu	r1, -32(r1)
	mfspr	r10, 256
	oris	r0, r10, 0xffff		C set VRSAVE bits 0-15
	mtspr	256, r0

	andi.	r7, up, 15
	vxor	a0, v0, v0
	lis	r9, 0xaaaa
	vxor	a1, v0, v0
	ori	r9, r9, 0xaaab
	vxor	a2, v0, v0
	li	r5, 16
	vxor	c0, v0, v0
	li	r6, 32
	vxor	c1, v0, v0
	LEAL(	r11, cnsts)		C CAUTION clobbers r0 for elf, darwin
	vxor	c2, v0, v0
	vxor	z, v0, v0
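
C r9 = 0xaaaaaaab = ceil(2^33/3) is a scaled reciprocal; the
C mulhwu/srwi 3 sequences below use it to compute floor(n/12), exactly
C for any 32-bit n.  An illustrative C model (not part of the build):
C
C   #include <stdint.h>
C   uint32_t div12 (uint32_t n)   /* == n / 12 for all uint32_t n */
C   {
C     uint32_t hi = (uint32_t) (((uint64_t) n * 0xaaaaaaabu) >> 32);
C     return hi >> 3;             /* mulhwu result, then srwi 3 */
C   }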
	beq	L(aligned16)

	cmpwi	cr7, r7, 8
	bge	cr7, L(na4)

	lvx	a2, 0, up
	addi	up, up, 16
	vsldoi	a2, a2, z, 4
	vsldoi	a2, z, a2, 12
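
C The lvx above fetched the aligned quadword holding up[0]; the two
C vsldoi shift the 4 bytes preceding the operand out and zeros back in.
C Roughly, in C on this big-endian target (illustrative):
C
C   memcpy (&v, (char *) up - 4, 16);   /* aligned load around up   */
C   memset (&v, 0, 4);                  /* zero the bytes before up */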
	addi	n, n, 9
	mulhwu	r0, n, r9
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n
	b	L(2)
L(na4):	bne	cr7, L(na8)

	lvx	a1, 0, up
	addi	up, up, -16
	vsldoi	a1, a1, z, 8
	vsldoi	a1, z, a1, 8

	addi	n, n, 6
	mulhwu	r0, n, r9
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n
	b	L(1)

L(na8):
	lvx	a0, 0, up
	vsldoi	a0, a0, z, 12
	vsldoi	a0, z, a0, 4

	addi	n, n, 3
	mulhwu	r0, n, r9
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n
	b	L(0)
L(aligned16):
	mulhwu	r0, n, r9
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n

	lvx	a0, 0, up
L(0):	lvx	a1, r5, up
L(1):	lvx	a2, r6, up
	addi	up, up, 48
L(2):	bdz	L(end)
	li	r12, 256
	li	r9, 288
	ALIGN(32)
L(top):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up
	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10

	lvx	v2, r6, up
	dcbt	up, r12
	dcbt	up, r9
	addi	up, up, 48
	vaddcuw	v10, a2, v2
	vadduwm	a2, a2, v2
	vadduwm	c2, c2, v10
	bdnz	L(top)
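
C Each vaddcuw/vadduwm pair above is a carry-save add: the sum wraps
C modulo 2^32 while the carry-out (0 or 1 per 32-bit lane) accumulates
C separately in c0/c1/c2, so nothing needs reducing inside the loop.
C Per lane, in C (illustrative):
C
C   uint32_t t = a + x;   /* vadduwm: modular word add         */
C   c += t < x;           /* vaddcuw: carry-out of the same add */
C   a = t;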
L(end):
C n = 0...11
	cmpwi	cr0, n, 0
	beq	L(sum)
	cmpwi	cr0, n, 4
	ble	L(tail.1..4)
	cmpwi	cr0, n, 8
	ble	L(tail.5..8)

L(tail.9..11):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up
	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10

	lvx	v2, r6, up

	addi	r8, r11, 96
	rlwinm	r3, n, 4,26,27
	lvx	v11, r3, r8
	vand	v2, v2, v11

	vaddcuw	v10, a2, v2
	vadduwm	a2, a2, v2
	vadduwm	c2, c2, v10
	b	L(sum)
L(tail.5..8):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up

	addi	r8, r11, 96
	rlwinm	r3, n, 4,26,27
	lvx	v11, r3, r8
	vand	v1, v1, v11

	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10
	b	L(sum)

L(tail.1..4):
	lvx	v0, 0, up

	addi	r8, r11, 96
	rlwinm	r3, n, 4,26,27
	lvx	v11, r3, r8
	vand	v0, v0, v11

	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10
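
C In each tail above, rlwinm computes (n mod 4) * 16, which indexes one
C of the four high-end masks stored at cnsts+96; the vand then zeroes
C the limbs beyond the operand's end.  In C terms (illustrative):
C
C   const uint8_t *mask = (const uint8_t *) cnsts + 96 + (n % 4) * 16;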
L(sum):	lvx	pv, 0, r11
	vperm	x0, a0, z, pv		C extract four 24-bit fields from a0
	vperm	y0, c2, z, pv		C extract four 24-bit fields from c2
	lvx	pv, r5, r11
	vperm	x1, a1, z, pv		C extract four 24-bit fields from a1
	vperm	y1, c0, z, pv		C extract four 24-bit fields from c0
	lvx	pv, r6, r11
	vperm	x2, a2, z, pv		C extract four 24-bit fields from a2
	vperm	y2, c1, z, pv		C extract four 24-bit fields from c1
	li	r10, 48
	lvx	pv, r10, r11
	vperm	x3, a0, z, pv		C extract remaining/partial a0 fields
	vperm	y3, c2, z, pv		C extract remaining/partial c2 fields
	li	r10, 64
	lvx	pv, r10, r11
	vperm	x3, a1, x3, pv		C insert remaining/partial a1 fields
	vperm	y3, c0, y3, pv		C insert remaining/partial c0 fields
	li	r10, 80
	lvx	pv, r10, r11
	vperm	x3, a2, x3, pv		C insert remaining/partial a2 fields
	vperm	y3, c1, y3, pv		C insert remaining/partial c1 fields

C We now have 4 128-bit accumulators to sum
	vadduwm	x0, x0, x1
	vadduwm	x2, x2, x3
	vadduwm	x0, x0, x2

	vadduwm	y0, y0, y1
	vadduwm	y2, y2, y3
	vadduwm	y0, y0, y2

	vadduwm	x0, x0, y0

C Reduce 32-bit fields
	vsumsws	x0, x0, z
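
C vsumsws adds the four 32-bit words of x0 (plus word 3 of z, i.e. 0)
C into word 3 of x0.  It saturates signed, but the 24-bit fields summed
C here stay far below 2^31, so it acts as a plain sum; in C terms
C (illustrative):
C
C   r3 = x0[0] + x0[1] + x0[2] + x0[3];   /* lands in word 3 */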
	li	r7, 16
	stvx	x0, r7, r1
	lwz	r3, 28(r1)

	mtspr	256, r10		C restore VRSAVE
	addi	r1, r1, 32
	blr
EPILOGUE()
C load  | v0 | v1 | v2 |
C acc   | a0 | a1 | a2 |
C carry | c0 | c1 | c2 |
C       | 0 1 2 3 | 4 5 6 7 | 8 9 10 11 |               128
C       |---|---|---|---|---|---|---|---|---|---|---|---| 32
C       | | | | | | | | | | | | | | | | |                 24
C       |  |  |  |  |  |  |  |                            48

C $---------------$---------------$---------------$---------------$
C |  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  |
C |_______________________________________________________________|
C |        |        |        |        |        |        |
C <-hi16-> <--- 24 --> <--- 24 --> <--- 24 --> <--- 24 --> <-lo16->
DEF_OBJECT(cnsts,16)
C Permutation vectors in the order they are used above
C #     00 01 02 03   04 05 06 07   08 09 0a 0b   0c 0d 0e 0f
	.byte	0x10,0x01,0x02,0x03, 0x10,0x06,0x07,0x00, 0x10,0x0b,0x04,0x05, 0x10,0x08,0x09,0x0a	C a0
	.byte	0x10,0x07,0x00,0x01, 0x10,0x04,0x05,0x06, 0x10,0x09,0x0a,0x0b, 0x10,0x0e,0x0f,0x08	C a1
	.byte	0x10,0x00,0x01,0x02, 0x10,0x05,0x06,0x07, 0x10,0x0a,0x0b,0x04, 0x10,0x0f,0x08,0x09	C a2
	.byte	0x10,0x0d,0x0e,0x0f, 0x10,0x10,0x10,0x0c, 0x10,0x10,0x10,0x10, 0x10,0x10,0x10,0x10	C part a0
	.byte	0x10,0x11,0x12,0x13, 0x10,0x02,0x03,0x17, 0x10,0x10,0x0c,0x0d, 0x10,0x10,0x10,0x10	C part a1
	.byte	0x10,0x11,0x12,0x13, 0x10,0x15,0x16,0x17, 0x10,0x03,0x1a,0x1b, 0x10,0x0c,0x0d,0x0e	C part a2
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number (currently unused; see TODO above)
C	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(cnsts)