beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / powerpc64 / mode64 / aors_n.asm
blob0e8474fdcc227b99917c75a2c4c4bf58d028ef86
1 dnl PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
3 dnl Copyright 1999-2001, 2003-2005, 2007, 2011 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C POWER3/PPC630 1.5
35 C POWER4/PPC970 2
36 C POWER5 2
37 C POWER6 2.63
38 C POWER7 2.25-2.87
40 C This code is a little bit slower for POWER3/PPC630 than the simple code used
41 C previously, but it is much faster for POWER4/PPC970. The reason for the
42 C POWER3/PPC630 slowdown can be attributed to the saving and restoring of 4
43 C registers.
45 C INPUT PARAMETERS
46 C rp r3
47 C up r4
48 C vp r5
49 C n r6
C Operation selection for mpn_add_n / mpn_add_nc.
C
C When OPERATION_add_n is defined, the macros used by the shared body are
C bound as follows:
C   ADDSUBC  add-with-carry instruction applied to every limb pair
C   ADDSUB   carry-setting add without carry-in (defined here but not used
C            anywhere in this file)
C   func     name of the entry point without a carry-in argument
C   func_nc  name of the entry point that takes a carry-in argument
C   GENRVAL  converts the CA-1 value left in r3 by "subfe r3,r0,r0" into
C            the 0/1 carry return value by adding 1
C   SETCBR   sets CA from a register argument: adding -1 carries out
C            exactly when that register is nonzero
C   CLRCB    clears CA: adding 0 never carries out
C
C The quoted second argument of ifdef must be closed by the final line of
C this block, otherwise m4 treats the remainder of the file as quoted text.
ifdef(`OPERATION_add_n',`
  define(ADDSUBC,   adde)
  define(ADDSUB,    addc)
  define(func,      mpn_add_n)
  define(func_nc,   mpn_add_nc)
  define(GENRVAL,   `addi    r3, r3, 1')
  define(SETCBR,    `addic   r0, $1, -1')
  define(CLRCB,     `addic   r0, r0, 0')
')
C Operation selection for mpn_sub_n / mpn_sub_nc.
C
C When OPERATION_sub_n is defined, the macros used by the shared body are
C bound as follows.  For the subtract-from instructions CA = 1 means
C "no borrow" and CA = 0 means "borrow".
C   ADDSUBC  subtract-from-extended instruction applied to every limb pair
C   ADDSUB   carry-setting subtract without carry-in (defined here but not
C            used anywhere in this file)
C   func     name of the entry point without a borrow-in argument
C   func_nc  name of the entry point that takes a borrow-in argument
C   GENRVAL  converts the CA-1 value left in r3 by "subfe r3,r0,r0" into
C            the 0/1 borrow return value by negating it
C   SETCBR   sets CA from a register argument: 0 - reg carries out only
C            when reg is zero, so a nonzero borrow-in gives CA = 0
C   CLRCB    sets CA = 1 (no borrow) by adding -1 to r1; this depends on
C            r1 being nonzero, which holds when r1 is the stack pointer
C
C The quoted second argument of ifdef must be closed by the final line of
C this block, otherwise m4 treats the remainder of the file as quoted text.
ifdef(`OPERATION_sub_n',`
  define(ADDSUBC,   subfe)
  define(ADDSUB,    subfc)
  define(func,      mpn_sub_n)
  define(func_nc,   mpn_sub_nc)
  define(GENRVAL,   `neg     r3, r3')
  define(SETCBR,    `subfic  r0, $1, 0')
  define(CLRCB,     `addic   r0, r1, -1')
')
C MULFUNC_PROLOGUE names the four entry points this one source file can
C provide.  NOTE(review): the macro is defined outside this file; presumably
C the build system reads this list to decide which OPERATION_* variants to
C assemble -- confirm against asm-defs.m4 / configure.
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)

C NOTE(review): ASM_START is also defined outside this file; presumably it
C emits any assembler preamble the target needs.
ASM_START()

C mpn_add_nc / mpn_sub_nc -- entry point with a carry-in (borrow-in) argument.
C
C Takes rp r3, up r4, vp r5, n r6 as in the INPUT PARAMETERS table, plus the
C incoming carry/borrow in r7.  SETCBR moves that value into the CA bit
C (clobbering r0), then control joins the shared body at L(ent), just past
C the CLRCB that the plain entry point executes.
PROLOGUE(func_nc)
        SETCBR(r7)                      C CA = carry/borrow in from r7
        b       L(ent)                  C continue in the shared body
EPILOGUE()
C mpn_add_n / mpn_sub_n -- main entry point and the body shared with the
C carry-in entry point, which jumps in at L(ent).
C
C Inputs, per the INPUT PARAMETERS table: rp r3, up r4, vp r5, n r6.
C Result: {rp,n} = {up,n} +/- {vp,n}; the final carry (add) or borrow (sub)
C is returned in r3 as 0 or 1.
C
C The carry chain is kept in the CA bit for the whole routine, so every
C instruction placed between two ADDSUBC instructions must leave CA alone
C (loads, stores, addi, cmpdi, rldicl/srdi, mtctr and branches all do).
C
C Structure: n mod 4 leading limbs are handled by L(b01), L(b10) or L(b11),
C then the remaining limbs are processed four at a time.  ctr holds
C ceil(n/4), i.e. the number of passes including the partial one.  The main
C loop is software pipelined: loads for the next group are issued between
C the ADDSUBC instructions of the current group.
C
C r28-r31 hold results and are saved below the stack pointer without
C allocating a frame.  NOTE(review): this relies on the ABI preserving the
C 32 bytes below r1 for a leaf routine -- confirm for any new target.
C
C No path handles n = 0 specially; with n = 0 the code would still process
C a group of four limbs, so callers must pass n >= 1.
PROLOGUE(func)
        CLRCB                           C start with no carry/borrow
L(ent): std     r31, -8(r1)             C save the four result registers
        std     r30, -16(r1)
        std     r29, -24(r1)
        std     r28, -32(r1)

        rldicl. r0, r6, 0,62            C r0 = n & 3, set cr0
        cmpdi   cr6, r0, 2
        addi    r6, r6, 3               C compute count...
        srdi    r6, r6, 2               C ...for ctr
        mtctr   r6                      C copy count into ctr
        beq     cr0, L(b00)             C n & 3 == 0
        blt     cr6, L(b01)             C n & 3 == 1
        beq     cr6, L(b10)             C n & 3 == 2, else fall into n & 3 == 3

L(b11): ld      r8, 0(r4)               C load s1 limb
        ld      r9, 0(r5)               C load s2 limb
        ld      r10, 8(r4)              C load s1 limb
        ld      r11, 8(r5)              C load s2 limb
        ld      r12, 16(r4)             C load s1 limb
        addi    r4, r4, 24
        ld      r0, 16(r5)              C load s2 limb
        addi    r5, r5, 24
        ADDSUBC r29, r9, r8
        ADDSUBC r30, r11, r10
        ADDSUBC r31, r0, r12
        std     r29, 0(r3)
        std     r30, 8(r3)
        std     r31, 16(r3)
        addi    r3, r3, 24
        bdnz    L(go)                   C more 4-limb groups remain
        b       L(ret)

L(b01): ld      r12, 0(r4)              C load s1 limb
        addi    r4, r4, 8
        ld      r0, 0(r5)               C load s2 limb
        addi    r5, r5, 8
        ADDSUBC r31, r0, r12            C add
        std     r31, 0(r3)
        addi    r3, r3, 8
        bdnz    L(go)                   C more 4-limb groups remain
        b       L(ret)

L(b10): ld      r10, 0(r4)              C load s1 limb
        ld      r11, 0(r5)              C load s2 limb
        ld      r12, 8(r4)              C load s1 limb
        addi    r4, r4, 16
        ld      r0, 8(r5)               C load s2 limb
        addi    r5, r5, 16
        ADDSUBC r30, r11, r10           C add
        ADDSUBC r31, r0, r12            C add
        std     r30, 0(r3)
        std     r31, 8(r3)
        addi    r3, r3, 16
        bdnz    L(go)                   C more 4-limb groups remain
        b       L(ret)

L(b00):                                 C n & 3 == 0: CA was already set up at entry
L(go):  ld      r6, 0(r4)               C load s1 limb
        ld      r7, 0(r5)               C load s2 limb
        ld      r8, 8(r4)               C load s1 limb
        ld      r9, 8(r5)               C load s2 limb
        ld      r10, 16(r4)             C load s1 limb
        ld      r11, 16(r5)             C load s2 limb
        ld      r12, 24(r4)             C load s1 limb
        ld      r0, 24(r5)              C load s2 limb
        bdz     L(end)                  C last group: skip the pipelined loop

        addi    r4, r4, 32
        addi    r5, r5, 32

        ALIGN(16)
L(top): ADDSUBC r28, r7, r6
        ld      r6, 0(r4)               C load s1 limb
        ld      r7, 0(r5)               C load s2 limb
        ADDSUBC r29, r9, r8
        ld      r8, 8(r4)               C load s1 limb
        ld      r9, 8(r5)               C load s2 limb
        ADDSUBC r30, r11, r10
        ld      r10, 16(r4)             C load s1 limb
        ld      r11, 16(r5)             C load s2 limb
        ADDSUBC r31, r0, r12
        ld      r12, 24(r4)             C load s1 limb
        ld      r0, 24(r5)              C load s2 limb
        std     r28, 0(r3)
        addi    r4, r4, 32
        std     r29, 8(r3)
        addi    r5, r5, 32
        std     r30, 16(r3)
        std     r31, 24(r3)
        addi    r3, r3, 32
        bdnz    L(top)                  C decrement ctr and loop back

L(end): ADDSUBC r28, r7, r6             C finish the group already loaded
        ADDSUBC r29, r9, r8
        ADDSUBC r30, r11, r10
        ADDSUBC r31, r0, r12
        std     r28, 0(r3)
        std     r29, 8(r3)
        std     r30, 16(r3)
        std     r31, 24(r3)

L(ret): ld      r31, -8(r1)             C restore the saved registers
        ld      r30, -16(r1)
        ld      r29, -24(r1)
        ld      r28, -32(r1)

        subfe   r3, r0, r0              C r3 = CA - 1; the value of r0 cancels out
        GENRVAL                         C map to the 0/1 carry or borrow result
        blr                             C return to caller
EPILOGUE()