beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / powerpc64 / mode64 / aorsorrlshC_n.asm
blob6158f541fc0c2f50d8814e85b7f982a8bea15efa
1 dnl PowerPC-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
3 dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 C cycles/limb
32 C POWER3/PPC630 1.83 (1.5 c/l should be possible)
33 C POWER4/PPC970 3 (2.0 c/l should be possible)
34 C POWER5 3
35 C POWER6 3.5-47
36 C POWER7 3
38 C STATUS
39 C * Try combining upx+up, and vpx+vp.
40 C * The worst case 47 c/l for POWER6 happens if the 3rd operand for ldx is
41 C greater than the 2nd operand. Yes, this addition is non-commutative wrt
42 C performance.
44 C INPUT PARAMETERS (names for the registers holding the four arguments on entry)
45 define(`rp', `r3') C result pointer: result limbs are stored through it; r3 is finally overwritten by RETVAL
46 define(`up', `r4') C u source pointer: the limbs that are used unshifted are loaded through it
47 define(`vp', `r5') C v source pointer: the limbs that get shifted by LSH are loaded through it
48 define(`n', `r6') C limb count: only read at entry for the size test and to set up ctr, then r6 is reused as rpx
49 C One of DO_add, DO_sub, DO_rsb selects the operation (presumably set by whatever includes this file; nothing here defines them).
50 ifdef(`DO_add', `
51 define(`ADDSUBC', `addc $1, $2, $3') C dst = second + third operand, sets cy
52 define(`ADDSUBE', `adde $1, $2, $3') C same, also adding incoming cy
53 define(`INITCY', `addic $1, r1, 0') C adding 0 cannot carry, so cy starts clear
54 define(`RETVAL', `addze r3, $1') C r3 = operand + final cy
55 define(`func', mpn_addlsh`'LSH`'_n)')
56 ifdef(`DO_sub', `
57 define(`ADDSUBC', `subfc $1, $2, $3') C dst = third - second operand: u limb minus shifted v limb as invoked below
58 define(`ADDSUBE', `subfe $1, $2, $3') C same, with incoming cy acting as not-borrow
59 define(`INITCY', `addic $1, r1, -1') C adding -1 carries out unless r1 is 0, so cy starts set, meaning no borrow; r1 is presumably the stack pointer
60 define(`RETVAL', `subfze r3, $1
61 neg r3, r3') C r3 = -(~operand + cy) = operand + 1 - cy, that is operand + final borrow
62 define(`func', mpn_sublsh`'LSH`'_n)')
63 ifdef(`DO_rsb', `
64 define(`ADDSUBC', `subfc $1, $3, $2') C operands swapped: dst = second - third operand, shifted v limb minus u limb
65 define(`ADDSUBE', `subfe $1, $3, $2') C same, with incoming cy acting as not-borrow
66 define(`INITCY', `addic $1, r1, -1') C cy starts set, meaning no borrow, as long as r1 is nonzero
67 define(`RETVAL', `addme r3, $1') C r3 = operand + cy - 1, that is operand - final borrow
68 define(`func', mpn_rsblsh`'LSH`'_n)')
69 C Address registers used only by the unrolled code after L(big).
70 define(`rpx', `r6') C second running result pointer; same register as n, first written after ctr has been loaded
71 define(`upx', `r7') C offset added to rpx to reach u limbs; r7 is also used literally as scratch at L(ex1) and L(ex0)
72 define(`vpx', `r12') C offset added to rpx to reach v limbs
73 C Data registers.
74 define(`s0', `r0') define(`s1', `r9') C s0, s1: alternately hold the shifted v limb being assembled and then the result limb
75 define(`u0', `r8') C current u limb
76 define(`v0', `r10') define(`v1', `r11') C v0, v1: alternately hold the two most recent raw v limbs
77 C func(rp, up, vp, n): combine each u limb with the v operand shifted left by LSH bits (add, sub or rsb as selected by
78 C the ADDSUBC/ADDSUBE macros), store n result limbs at rp, and return in r3 the bits shifted out of the top v limb adjusted by the final cy.
79 ASM_START()
80 PROLOGUE(func)
81 cmpldi cr0, n, 13
82 bgt L(big) C more than 13 limbs (unsigned compare): take the indexed-load path below
83 C Small path. n must be at least 1: the first limbs are loaded before ctr is ever tested.
84 mtctr n C copy n in ctr
85 INITCY( r0) C initial cy for the first ADDSUBE: clear for add, set (no borrow) for sub and rsb
86 C r0 is only a dummy destination for INITCY; as s0 it is rewritten before it is read.
87 ld v0, 0(vp) C load v limb
88 ld u0, 0(up) C load u limb
89 addi up, up, -8 C bias up by one limb so that the ldu at 16(up) in the loop fetches the next u limb
90 addi rp, rp, -8 C bias rp by one limb so that 8(rp) is the current result slot
91 sldi s1, v0, LSH C lowest shifted limb: zeros are shifted in at the bottom
92 bdz L(ex1) C If done, skip loop
93 C Two limbs per pass; s0/s1 and v0/v1 swap roles between the halves. Leaving from the middle goes to ex0, from the end to ex1.
94 ALIGN(16)
95 L(lo0): ld v1, 8(vp) C load v limb
96 ADDSUBE(s1, s1, u0) C add limbs with cy, set cy
97 ldu u0, 16(up) C load u limb and update up
98 srdi s0, v0, RSH C shift down previous v limb
99 std s1, 8(rp) C store result limb
100 rldimi s0, v1, LSH, 0 C left shift v limb and merge with prev v limb
101 bdz L(ex0) C decrement ctr and exit if done
102 ldu v0, 16(vp) C load v limb and update vp
103 ADDSUBE(s0, s0, u0) C add limbs with cy, set cy
104 ld u0, 8(up) C load u limb
105 srdi s1, v1, RSH C shift down previous v limb
106 stdu s0, 16(rp) C store result limb and update rp
107 rldimi s1, v0, LSH, 0 C left shift v limb and merge with prev v limb
108 bdnz L(lo0) C decrement ctr and loop back
109 C Exit for odd n: s1 and u0 hold the last pending pair and v0 is the top v limb.
110 L(ex1): ADDSUBE(r7, s1, u0)
111 std r7, 8(rp) C store last result limb
112 srdi r0, v0, RSH C bits shifted out of the top v limb
113 RETVAL( r0) C r3 = those bits combined with the final cy
114 blr C return; without it control falls into ex0, which stores one limb past the result and recomputes r3
115 L(ex0): ADDSUBE(r7, s0, u0) C exit for even n: s0 and u0 hold the last pending pair, v1 is the top v limb
116 std r7, 16(rp) C store last result limb
117 srdi r0, v1, RSH C bits shifted out of the top v limb
118 RETVAL( r0) C r3 = those bits combined with the final cy
119 blr C return; without it control falls into the large-operand path
120 C Large path, n > 13. ctr = (n-1)/2 loop passes of two limbs each; odd n enters the loop at mid via b1, even n at top via b0.
121 C The source pointers are turned into offsets from the two running result pointers, so the stdu stores are the only pointer updates in the loop.
122 L(big): rldicl. r0, n, 0,63 C r0 = n & 1, set cr0
123 addi r6, n, -1 C ...for ctr
124 srdi r6, r6, 1 C ...for ctr
125 mtctr r6 C copy count into ctr
126 beq cr0, L(b0) C n even
127 C Odd n: limb 0 is computed here into s1 and stored by the stdu through rpx at mid.
128 L(b1): ld v1, 0(vp)
129 ld u0, 0(up)
130 sldi s1, v1, LSH C shifted limb 0
131 srdi s0, v1, RSH C bits of limb 0 carried into shifted limb 1
132 ld v0, 8(vp)
133 ADDSUBC(s1, s1, u0) C add limbs without cy, set cy
134 addi rpx, rp, -16 C rpx two limbs back: its first stdu 16(rpx) at mid writes result limb 0
135 addi rp, rp, -8 C rp one limb back: its first stdu 16(rp) at top writes result limb 1
136 sub upx, up, rp C upx, vpx start as distances from the biased rp
137 sub vpx, vp, rp
138 sub up, up, rpx C up, vp become distances from rpx
139 sub vp, vp, rpx
140 addi up, up, 8 C after the four adjustments: rpx+upx is u limb 1 and rpx+vpx is v limb 2 at mid,
141 addi upx, upx, 16 C and rp+up is u limb 2 and rp+vp is v limb 3 at the following top
142 addi vp, vp, 16
143 addi vpx, vpx, 24
144 b L(mid)
145 C Even n: limb 0 is computed here into s0 and stored by the stdu through rp at top; falls through into the loop.
146 L(b0): ld v0, 0(vp)
147 ld u0, 0(up)
148 sldi s0, v0, LSH C shifted limb 0
149 srdi s1, v0, RSH C bits of limb 0 carried into shifted limb 1
150 ld v1, 8(vp)
151 ADDSUBC(s0, s0, u0) C add limbs without cy, set cy
152 addi rpx, rp, -8 C rpx one limb back: its first stdu 16(rpx) at mid writes result limb 1
153 addi rp, rp, -16 C rp two limbs back: its first stdu 16(rp) at top writes result limb 0
154 sub upx, up, rpx C upx, vpx become distances from rpx
155 sub vpx, vp, rpx
156 sub up, up, rp C up, vp become distances from the biased rp
157 sub vp, vp, rp
158 addi up, up, 8 C after the four adjustments: rp+up is u limb 1 and rp+vp is v limb 2 at top,
159 addi upx, upx, 16 C and rpx+upx is u limb 2 and rpx+vpx is v limb 3 at mid
160 addi vp, vp, 16
161 addi vpx, vpx, 24
162 C Main loop: the top half works through rp, the mid half through rpx; each stdu advances its pointer by two limbs.
163 ALIGN(32)
164 L(top): ldx u0, rp, up
165 ldx v0, rp, vp
166 rldimi s1, v1, LSH, 0 C merge shifted v1 into s1 above the carried-down bits
167 stdu s0, 16(rp) C store result limb and update rp
168 srdi s0, v1, RSH C shift down v1 for the next limb
169 ADDSUBE(s1, s1, u0) C add limbs with cy, set cy
170 L(mid): ldx u0, rpx, upx
171 ldx v1, rpx, vpx
172 rldimi s0, v0, LSH, 0 C merge shifted v0 into s0 above the carried-down bits
173 stdu s1, 16(rpx) C store result limb and update rpx
174 srdi s1, v0, RSH C shift down v0 for the next limb
175 ADDSUBE(s0, s0, u0) C add limbs with cy, set cy
176 bdnz L(top) C decrement CTR and loop back
177 C Loop finished with two limbs pending: s0 is complete, s1 still needs v1 merged in and its u limb.
178 ldx u0, rp, up
179 rldimi s1, v1, LSH, 0
180 std s0, 16(rp) C store next-to-last result limb
181 srdi s0, v1, RSH C bits shifted out of the top v limb
182 ADDSUBE(s1, s1, u0) C add limbs with cy, set cy
183 std s1, 24(rp) C store last result limb
184 C Return value: the shifted-out bits combined with the final cy.
185 RETVAL( s0)
186 blr C return to caller; nothing visible in this file shows EPILOGUE emitting a return
187 EPILOGUE()