1 dnl PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.
3 dnl Copyright
2002, 2003, 2005-2007, 2012 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
32 include(`..
/config.m4
')
39 C 7400,7410 (G4): 1 simple load-use scheduling results in 0.75
40 C 744x,745x (G4+): 0.75
46 C * Either start using the low-end masking constants, or remove them.
47 C * Merge multiple feed-in cases into a parameterized code block.
48 C * Reduce register usage. It should be possible to almost halve it.
C Entry of mpn_mod_34lsub1.
C NOTE(review): this excerpt omits instructions (see the gaps in the leading
C line numbers); in particular the branch that consumes the cmpwi result is
C not visible.  Comments describe only the instructions shown.
71 PROLOGUE(mpn_mod_34lsub1)
72 cmpwi cr0, n, 20 C tuned cutoff point: cr0 = signed compare of n with 20
75 li r9, 0 C result accumulator
76 mulli r10, n, 0xb C r10 = 11*n; 0xb = ceil(32/3)
77 srwi. r10, r10, 5 C r10 = 11*n/32 = floor(n/3), n < 32; record form rewrites cr0
C Scalar path: limbs u0,u1,u2 (in r6,r7,r8 per the comments below) are cut
C into 24-bit pieces that are summed into r9.  The "--xxyyzz" notes show
C which limb each hex digit of r0 comes from; "--" is the zero top byte.
C NOTE(review): the loads of r6/r7/r8 and the instructions that prepare r0
C before each rlwimi are on lines missing from this excerpt.
89 L(los): rlwinm r0, r6, 0,8,31 C --000000 (r0 = r6 & 0x00ffffff)
90 add r9, r9, r0 C add 24b from u0
93 rlwimi r0, r7, 8, 0x00ffff00 C --111100
94 add r9, r9, r0 C add 8b from u0 and 16b from u1
97 rlwimi r0, r8, 16, 0x00ff0000 C --221111
98 add r9, r9, r0 C add 16b from u1 and 8b from u2
99 srwi r0, r8, 8 C --222222
101 add r9, r9, r0 C add 24b from u2
C Same visible sequence again on a further u0,u1,u2 triple.
104 rlwinm r0, r6, 0,8,31 C --000000 (r0 = r6 & 0x00ffffff)
105 add r9, r9, r0 C add 24b from u0
107 rlwimi r0, r7, 8, 0x00ffff00 C --111100
108 add r9, r9, r0 C add 8b from u0 and 16b from u1
110 rlwimi r0, r8, 16, 0x00ff0000 C --221111
111 add r9, r9, r0 C add 16b from u1 and 8b from u2
112 srwi r0, r8, 8 C --222222
113 add r9, r9, r0 C add 24b from u2
116 rlwinm r0, r9, 0,8,31 C r0 = r9 & 0x00ffffff (low 24 bits of the accumulator)
125 rlwinm r0, r6, 0,8,31 C --000000 (r0 = r6 & 0x00ffffff)
133 rlwinm r0, r6, 8,8,23 C r0 = (r6 << 8) & 0x00ffff00, low 16 bits of r6 moved up by 8
C Vector-path setup fragments.
C NOTE(review): most of this region is missing from the excerpt (see the
C gaps in the leading line numbers); the instructions that produce the
C inputs of each srwi, and the test that sets cr7, are not visible.
145 oris r0, r10, 0xffff C Set VRSAVE bit 0-15
158 LEAL( r11, cnsts) C CAUTION clobbers r0 for elf, darwin
174 srwi r0, r0, 3 C r0 = floor(n/12)
181 L(na4): bne cr7, L(na8)
190 srwi r0, r0, 3 C r0 = floor(n/12)
204 srwi r0, r0, 3 C r0 = floor(n/12)
213 srwi r0, r0, 3 C r0 = floor(n/12)
C The three rlwinm below compute r3 = (n << 4) & 0x30 = 16*(n mod 4).
C NOTE(review): presumably a byte offset selecting one of the four 16-byte
C high-end masks in the constant table -- confirm against the full source.
270 rlwinm r3, n ,4,26,27 C r3 = 16*(n mod 4)
288 rlwinm r3, n ,4,26,27 C r3 = 16*(n mod 4)
301 rlwinm r3, n ,4,26,27 C r3 = 16*(n mod 4)
C L(sum): re-slice the accumulators a0,a1,a2 (into x0..x3) and the carry
C vectors c0,c1,c2 (into y0..y3) into 24-bit fields using the permutation
C vectors loaded through r11.  Each carry vector is permuted with the same pv
C as the accumulator on the line just before it: c0 with a1, c1 with a2,
C c2 with a0.
C NOTE(review): presumably a carry out of a 32-bit word has the weight of
C the following word, hence the one-vector offset -- confirm.
C NOTE(review): the reloads of pv between the groups and the y0 extraction
C are on lines missing from this excerpt.
309 L(sum): lvx pv, 0, r11
310 vperm x0, a0, z, pv C extract four 24-bit fields from a0
313 vperm x1, a1, z, pv C extract four 24-bit fields from a1
314 vperm y1, c0, z, pv C extract four 24-bit fields from c0 (same pv as a1)
316 vperm x2, a2, z, pv C extract four 24-bit fields from a2
317 vperm y2, c1, z, pv C extract four 24-bit fields from c1 (same pv as a2)
320 vperm x3, a0, z, pv C extract remaining/partial a0 fields
321 vperm y3, c2, z, pv C extract remaining/partial c2 fields (same pv as a0)
324 vperm x3, a1, x3, pv C insert remaining/partial a1 fields
325 vperm y3, c0, y3, pv C insert remaining/partial c0 fields (same pv as a1)
328 vperm x3, a2, x3, pv C insert remaining/partial a2 fields
329 vperm y3, c1, y3, pv C insert remaining/partial c1 fields (same pv as a2)
331 C We now have 4 128-bit accumulators to sum
342 C Reduce 32-bit fields
354 C load | v0 | v1 | v2 |
355 C acc | a0 | a1 | a2 |
356 C carry | c0 | c1 | c2 |
357 C | 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 128
358 C |---|---|---|---|---|---|---|---|---|---|---|---| 32
359 C | | | | | | | | | | | | | | | | | 24
360 C | | | | | | | | | 48
362 C $---------------$---------------$---------------$---------------$
363 C | . . . . . . . . . . . . . . . |
364 C |_______________________________________________________________|
366 C <-hi16-> <--- 24 --> <--- 24 --> <--- 24 --> <--- 24 --> <-lo16->
370 C Permutation vectors in the order they are used above
C Each byte is a vperm selector: 0x00-0x0f pick that byte of the first
C source operand, 0x10-0x1f pick byte (selector - 0x10) of the second.
C For the rows marked a0, a1, a2 and "part a0" the second operand is z, so
C 0x10 fills a byte from z[0] (NOTE(review): z is presumably all zeros; its
C initialisation is not visible in this excerpt).  For "part a1" and
C "part a2" the second operand is the partially built x3/y3, so selectors
C 0x11 and up keep bytes assembled by the previous step.
371 C # 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
372 .byte 0x10,0x01,0x02,0x03, 0x10,0x06,0x07,0x00, 0x10,0x0b,0x04,0x05, 0x10,0x08,0x09,0x0a C a0
373 .byte 0x10,0x07,0x00,0x01, 0x10,0x04,0x05,0x06, 0x10,0x09,0x0a,0x0b, 0x10,0x0e,0x0f,0x08 C a1
374 .byte 0x10,0x00,0x01,0x02, 0x10,0x05,0x06,0x07, 0x10,0x0a,0x0b,0x04, 0x10,0x0f,0x08,0x09 C a2
375 .byte 0x10,0x0d,0x0e,0x0f, 0x10,0x10,0x10,0x0c, 0x10,0x10,0x10,0x10, 0x10,0x10,0x10,0x10 C part a0
376 .byte 0x10,0x11,0x12,0x13, 0x10,0x02,0x03,0x17, 0x10,0x10,0x0c,0x0d, 0x10,0x10,0x10,0x10 C part a1
377 .byte 0x10,0x11,0x12,0x13, 0x10,0x15,0x16,0x17, 0x10,0x03,0x1a,0x1b, 0x10,0x0c,0x0d,0x0e C part a2
378 C Masks for high end of number
C Four 16-byte masks: the first keeps all 16 bytes, the following three
C keep only the first 4, 8 and 12 bytes respectively.
379 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
380 .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
381 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
382 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
383 C Masks for low end of number
384 C .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
385 C .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
386 C .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
387 C .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff