dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C                   16-byte coaligned      unaligned
C                      cycles/limb        cycles/limb
C 7400,7410 (G4):          0.5                0.64
C 744x,745x (G4+):         0.75               0.82
C 970 (G5):                0.78               1.02	(64-bit limbs)

C STATUS
C  * Works for all sizes and alignments.

C TODO
C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
C    c/l for 970.
C  * Consider using VMX instructions also for head and tail, by using some
C    read-modify-write tricks.
C  * The VMX code is used from the smallest sizes it handles, but measurements
C    show a large speed bump at the cutoff points.  Small copying (perhaps
C    using some read-modify-write technique) should be optimized.
C  * Make an mpn_com based on this code.
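
C For reference, the job of mpn_copyd is roughly the following C loop (a
C sketch, using GMP's mp_limb_t/mp_size_t types); copying from the highest
C limb downwards makes the routine usable for overlapping operands with
C rp >= up:
C
C	void
C	mpn_copyd (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = n - 1; i >= 0; i--)	/* highest limb first */
C	    rp[i] = up[i];
C	}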

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))

ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`n',  `r5')

define(`us', `v4')
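C us holds the permute control produced by lvsl on the unaligned-source path below.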

ASM_START()
PROLOGUE(mpn_copyd)

LIMB32(`slwi. r0, n, 2 ')
LIMB64(`sldi. r0, n, 3 ')
	add rp, rp, r0
	add up, up, r0

LIMB32(`cmpi cr7, n, 11 ')
LIMB64(`cmpdi cr7, n, 5 ')
	bge cr7, L(big)

	beqlr cr0

C Handle small cases with plain operations
	mtctr n
L(topS):
LIMB32(`lwz r0, -4(up) ')
LIMB64(`ld r0, -8(up) ')
	addi up, up, -GMP_LIMB_BYTES
LIMB32(`stw r0, -4(rp) ')
LIMB64(`std r0, -8(rp) ')
	addi rp, rp, -GMP_LIMB_BYTES
	bdnz L(topS)
	blr

C Handle large cases with VMX operations
L(big):
	addi rp, rp, -16
	addi up, up, -16
	mfspr r12, 256
	oris r0, r12, 0xf800	C Set VRSAVE bit 0-4
	mtspr 256, r0
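
C Copy limbs one at a time until rp is 16-byte aligned, so that the vector
C stores below are all aligned.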
LIMB32(`rlwinm. r7, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm. r7, rp, 29,31,31')	C (rp >> 3) mod 2
	beq L(rp_aligned)

	subf n, r7, n
L(top0):
LIMB32(`lwz r0, 12(up) ')
LIMB64(`ld r0, 8(up) ')
	addi up, up, -GMP_LIMB_BYTES
LIMB32(`addic. r7, r7, -1 ')
LIMB32(`stw r0, 12(rp) ')
LIMB64(`std r0, 8(rp) ')
	addi rp, rp, -GMP_LIMB_BYTES
LIMB32(`bne L(top0) ')

L(rp_aligned):

LIMB32(`rlwinm. r0, up, 30,30,31')	C (up >> 2) mod 4
LIMB64(`rlwinm. r0, up, 29,31,31')	C (up >> 3) mod 2

LIMB64(`srdi r7, n, 2 ')	C loop count corresponding to n
LIMB32(`srwi r7, n, 3 ')	C loop count corresponding to n
	mtctr r7		C copy n to count register

	li r10, -16

	beq L(up_aligned)
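
C up is not 16-byte aligned: lvsl derives a permute control from up's
C misalignment, and each iteration loads two aligned quadwords and merges
C them with vperm, so every store to rp stays aligned.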
	lvsl us, 0, up

	addi up, up, 16
LIMB32(`andi. r0, n, 0x4 ')
LIMB64(`andi. r0, n, 0x2 ')
	beq L(1)
	lvx v0, 0, up
	lvx v2, r10, up
	vperm v3, v2, v0, us
	stvx v3, 0, rp
	addi up, up, -32
	addi rp, rp, -16
	b L(lpu)
L(1):	lvx v2, 0, up
	addi up, up, -16
	b L(lpu)

	ALIGN(32)
L(lpu):	lvx v0, 0, up
	vperm v3, v0, v2, us
	stvx v3, 0, rp
	lvx v2, r10, up
	addi up, up, -32
	vperm v3, v2, v0, us
	stvx v3, r10, rp
	addi rp, rp, -32
	bdnz L(lpu)

	b L(tail)
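
C rp and up are both 16-byte aligned: plain lvx/stvx, moving 32 bytes per
C loop iteration.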
L(up_aligned):

LIMB32(`andi. r0, n, 0x4 ')
LIMB64(`andi. r0, n, 0x2 ')
	beq L(lpa)
	lvx v0, 0, up
	stvx v0, 0, rp
	addi up, up, -16
	addi rp, rp, -16
	b L(lpa)

	ALIGN(32)
L(lpa):	lvx v0, 0, up
	lvx v1, r10, up
	addi up, up, -32

	stvx v0, 0, rp
	stvx v1, r10, rp
	addi rp, rp, -32
	bdnz L(lpa)
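
C Copy the final n mod LIMBS_PER_VR limbs with scalar loads and stores.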
L(tail):
LIMB32(`rlwinm. r7, n, 0,30,31 ')	C r7 = n mod 4
LIMB64(`rlwinm. r7, n, 0,31,31 ')	C r7 = n mod 2
	beq L(ret)
LIMB32(`li r10, 12 ')
L(top2):
LIMB32(`lwzx r0, r10, up ')
LIMB64(`ld r0, 8(up) ')
LIMB32(`addic. r7, r7, -1 ')
LIMB32(`stwx r0, r10, rp ')
LIMB64(`std r0, 8(rp) ')
LIMB32(`addi r10, r10, -GMP_LIMB_BYTES')
LIMB32(`bne L(top2) ')

L(ret):	mtspr 256, r12		C restore caller's VRSAVE
	blr
EPILOGUE()