dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyi.
dnl  Copyright 2006 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl  Include config.m4 from the parent directory.  The quoted file name must
dnl  be on a single line: a newline inside the quotes becomes part of the name.
include(`../config.m4')
C                16-byte coaligned      unaligned
C                   cycles/limb        cycles/limb
C 7400,7410 (G4):       0.5                0.64
C 744x,745x (G4+):      0.75               0.82
C 970 (G5):             0.78               1.02		(64-bit limbs)
40 C * Works for all sizes and alignments.
C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
C    c/l for 970.
46 C * Consider using VMX instructions also for head and tail, by using some
47 C read-modify-write tricks.
48 C * The VMX code is used from the smallest sizes it handles, but measurements
49 C show a large speed bump at the cutoff points. Small copying (perhaps
50 C using some read-modify-write technique) should be optimized.
51 C * Make an mpn_com based on this code.
C Limb-size constants, all derived from GMP_LIMB_BITS:
C   GMP_LIMB_BYTES   bytes in one limb      (GMP_LIMB_BITS / 8)
C   LIMBS_PER_VR     limbs in 16 bytes      (16 / GMP_LIMB_BYTES)
C   LIMBS_PER_2VR    limbs in 32 bytes      (32 / GMP_LIMB_BYTES)
C Each macro name is immediately followed by its open parenthesis, as m4
C requires for the call to be recognised, so eval is evaluated at definition
C time and each constant expands to a plain number.
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
58 ifelse
(GMP_LIMB_BITS
,32,`
59 define
(`LIMB32
',` $1')
63 define(`LIMB64',`
$1')
C Compare the limb count n with the size cutoff and leave the result in cr7:
C 11 limbs in the LIMB32 variant, 5 limbs in the LIMB64 variant.
LIMB32(`cmpi	cr7, n, 11	')
LIMB64(`cmpdi	cr7, n, 5	')
C Handle small cases with plain operations
C
C Move one limb from 0(up) to 0(rp) through r0, then step both pointers
C forward by GMP_LIMB_BYTES.  LIMB32 uses word load/store, LIMB64 uses
C doubleword load/store.
LIMB32(`lwz	r0, 0(up)	')
LIMB64(`ld	r0, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`stw	r0, 0(rp)	')
LIMB64(`std	r0, 0(rp)	')
	addi	rp, rp, GMP_LIMB_BYTES
C Handle large cases with VMX operations
C
C r0 = r12 with the five most significant bits set.
	oris	r0, r12, 0xf800		C Set VRSAVE bit 0-4

C r7 = position of rp inside its 16-byte block, counted in limbs.  The
C record form (trailing dot) also sets cr0 from the result.
LIMB32(`rlwinm.	r7, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r7, rp, 29,31,31')	C (rp >> 3) mod 2

	subfic	r7, r7, LIMBS_PER_VR	C r7 = LIMBS_PER_VR - r7
C Move one limb from 0(up) to 0(rp) through r0 and step both pointers forward
C by GMP_LIMB_BYTES.
C LIMB32: r7 is decremented with addic., and bne branches to L(top0) while
C the result is nonzero.  The stw and addi between them do not modify cr0,
C so the bne still sees the addic. result.
C LIMB64: no counter or branch is emitted, so one limb is moved.
LIMB32(`lwz	r0, 0(up)	')
LIMB64(`ld	r0, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stw	r0, 0(rp)	')
LIMB64(`std	r0, 0(rp)	')
	addi	rp, rp, GMP_LIMB_BYTES
LIMB32(`bne	L(top0)		')
C r0 = position of up inside its 16-byte block, counted in limbs.  The
C record form sets cr0, so a zero result (up 16-byte aligned) can be tested.
LIMB32(`rlwinm.	r0, up, 30,30,31')	C (up >> 2) mod 4
LIMB64(`rlwinm.	r0, up, 29,31,31')	C (up >> 3) mod 2

C r7 = n / LIMBS_PER_2VR, i.e. one count per 32 bytes: n >> 3 for LIMB32,
C n >> 2 for LIMB64.  srdi, srwi and mtctr are not record forms, so cr0 still
C holds the result of the alignment test.
LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
	mtctr	r7			C copy loop count to count register
C Test the bit of n worth 16 bytes (0x4 limbs of 4 bytes, 0x2 limbs of 8
C bytes); cr0 is set from the result and r0 receives the masked value.
LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')

C Load a vector into v0 from the address in up (index operand is zero).
L(lpu):	lvx	v0, 0, up
C Test the bit of n worth 16 bytes (0x4 limbs of 4 bytes, 0x2 limbs of 8
C bytes); cr0 is set from the result and r0 receives the masked value.
LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')

C Load a vector into v0 from the address in up (index operand is zero).
L(lpa):	lvx	v0, 0, up
C r7 = n mod LIMBS_PER_VR; the record form sets cr0 from the result.
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2

C LIMB32: move one limb per pass using r10 as the byte offset from up and rp,
C add GMP_LIMB_BYTES to r10, decrement r7, and branch to L(top2) while r7 is
C nonzero.  stwx and addi do not modify cr0, so bne tests the addic. result.
C LIMB64: move a single limb from 0(up) to 0(rp).
LIMB32(`lwzx	r0, r10, up	')
LIMB64(`ld	r0, 0(up)	')
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES	')
LIMB32(`bne	L(top2)		')
C Write r12 to special purpose register 256 (VRSAVE).
L(ret):	mtspr	256, r12