beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / alpha / add_n.asm
blobbc572a57a90feb3c0b5c8d0ca68cf5e582ba4b8c
1 dnl Alpha mpn_add_n -- Add two limb vectors of the same length > 0 and
2 dnl store sum in a third limb vector.
4 dnl Copyright 1995, 1999, 2000, 2005, 2011 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C cycles/limb
35 C EV4: ?
36 C EV5: 4.75
37 C EV6: 3
39 dnl INPUT PARAMETERS
40 dnl res_ptr r16
41 dnl s1_ptr r17
42 dnl s2_ptr r18
43 dnl size r19
dnl carry-in r20 (read by mpn_add_nc only)
45 ASM_START()
C mpn_add_nc -- entry point for an addition that starts with a carry-in.
C It copies the carry-in from r20 into r25, the register the shared code
C uses as its running carry, and then branches to the com label inside
C mpn_add_n.  All remaining work, including the return, happens there.
46 PROLOGUE(mpn_add_nc)
47 bis r20,r31,r25 C r25 = r20 OR zero (r31): seed the running carry
48 br L(com) C join mpn_add_n just after its own carry clear
49 EPILOGUE()
C mpn_add_n -- add the limb vectors at s1_ptr (r17) and s2_ptr (r18), each
C size (r19) limbs long, store the sum at res_ptr (r16) and return the final
C carry in r0.  mpn_add_nc joins at the com label with its carry-in already
C placed in r25.
C
C Register use in the code below:
C   r25      running carry from one limb to the next
C   r19      limb counter, pre-decremented so that its sign drives the branches
C   r0-r3    limbs loaded from s2_ptr
C   r4-r7    limbs loaded from s1_ptr
C   r28      sum of the two source limbs (main add)
C   r8       carry out of the main add
C   r20-r23  result limbs (main add plus carry) waiting to be stored
C
C Every limb takes two adds (source limbs, then the incoming carry).  A wrap
C in either add is detected with cmpult (result below an addend) and the two
C flags are OR-ed into r25 as the carry for the next limb.
50 PROLOGUE(mpn_add_n)
51 bis r31,r31,r25 C clear cy: this entry point has no carry-in
52 L(com): subq r19,4,r19 C decr loop cnt: r19 = size - 4
53 blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
54 C Start software pipeline for 1st loop
C (load the first group of 4 limb pairs and begin the adds for limbs 1 and 2)
55 ldq r0,0(r18) C 1st s2 limb
56 ldq r4,0(r17) C 1st s1 limb
57 ldq r1,8(r18) C 2nd s2 limb
58 ldq r5,8(r17) C 2nd s1 limb
59 addq r17,32,r17 C update s1_ptr (3rd/4th s1 limbs are read at -16/-8)
60 addq r0,r4,r28 C 1st main add
61 ldq r2,16(r18) C 3rd s2 limb
62 addq r25,r28,r20 C 1st carry add
63 ldq r3,24(r18) C 4th s2 limb
64 cmpult r28,r4,r8 C cy from 1st main add (sum below addend)
65 ldq r6,-16(r17) C 3rd s1 limb (s1_ptr already advanced)
66 cmpult r20,r28,r25 C cy from 1st carry add
67 ldq r7,-8(r17) C 4th s1 limb (s1_ptr already advanced)
68 bis r8,r25,r25 C combine cy from the two adds
69 subq r19,4,r19 C decr loop cnt: negative if no further full group follows
70 addq r1,r5,r28 C 2nd main add
71 addq r18,32,r18 C update s2_ptr
72 addq r28,r25,r21 C 2nd carry add
73 cmpult r28,r5,r8 C cy from 2nd main add
74 blt r19,$Lend1 C if less than 4 limbs remain, jump
75 C 1st loop handles groups of 4 limbs in a software pipeline
C At the top of each iteration r20 holds the finished 1st result limb, while
C r21, r28 and r8 hold the 2nd limb with its carry still to be derived.  The
C iteration finishes and stores the group in flight and starts the next one.
76 ALIGN(16)
77 $Loop: cmpult r21,r28,r25 C cy from 2nd carry add
78 ldq r0,0(r18) C next group: 1st s2 limb
79 bis r8,r25,r25 C combine cy from the two adds
80 ldq r1,8(r18) C next group: 2nd s2 limb
81 addq r2,r6,r28 C 3rd main add
82 ldq r4,0(r17) C next group: 1st s1 limb
83 addq r28,r25,r22 C 3rd carry add
84 ldq r5,8(r17) C next group: 2nd s1 limb
85 cmpult r28,r6,r8 C cy from 3rd main add
86 cmpult r22,r28,r25 C cy from 3rd carry add
87 stq r20,0(r16) C store 1st result limb
88 bis r8,r25,r25 C combine cy from the two adds
89 stq r21,8(r16) C store 2nd result limb
90 addq r3,r7,r28 C 4th main add
91 addq r28,r25,r23 C 4th carry add
92 cmpult r28,r7,r8 C cy from 4th main add
93 cmpult r23,r28,r25 C cy from 4th carry add
94 addq r17,32,r17 C update s1_ptr
95 bis r8,r25,r25 C combine cy from the two adds
96 addq r16,32,r16 C update res_ptr (3rd/4th results are stored at -16/-8)
97 addq r0,r4,r28 C 1st main add
98 ldq r2,16(r18) C next group: 3rd s2 limb
99 addq r25,r28,r20 C 1st carry add
100 ldq r3,24(r18) C next group: 4th s2 limb
101 cmpult r28,r4,r8 C cy from 1st main add
102 ldq r6,-16(r17) C next group: 3rd s1 limb (s1_ptr already advanced)
103 cmpult r20,r28,r25 C cy from 1st carry add
104 ldq r7,-8(r17) C next group: 4th s1 limb (s1_ptr already advanced)
105 bis r8,r25,r25 C combine cy from the two adds
106 subq r19,4,r19 C decr loop cnt
107 stq r22,-16(r16) C store 3rd result limb of the finished group
108 addq r1,r5,r28 C 2nd main add
109 stq r23,-8(r16) C store 4th result limb of the finished group
110 addq r25,r28,r21 C 2nd carry add
111 addq r18,32,r18 C update s2_ptr
112 cmpult r28,r5,r8 C cy from 2nd main add
113 bge r19,$Loop C loop while another full group of 4 follows
114 C Finish software pipeline for 1st loop
C (same steps as the first half of the loop body, without loading a new group)
115 $Lend1: cmpult r21,r28,r25 C cy from 2nd carry add
116 bis r8,r25,r25 C combine cy from the two adds
117 addq r2,r6,r28 C 3rd main add
118 addq r28,r25,r22 C 3rd carry add
119 cmpult r28,r6,r8 C cy from 3rd main add
120 cmpult r22,r28,r25 C cy from 3rd carry add
121 stq r20,0(r16) C store 1st result limb
122 bis r8,r25,r25 C combine cy from the two adds
123 stq r21,8(r16) C store 2nd result limb
124 addq r3,r7,r28 C 4th main add
125 addq r28,r25,r23 C 4th carry add
126 cmpult r28,r7,r8 C cy from 4th main add
127 cmpult r23,r28,r25 C cy from 4th carry add
128 bis r8,r25,r25 C combine cy from the two adds
129 addq r16,32,r16 C update res_ptr
130 stq r22,-16(r16) C store 3rd result limb (res_ptr already advanced)
131 stq r23,-8(r16) C store 4th result limb (res_ptr already advanced)
132 $Lend2: addq r19,4,r19 C restore loop cnt: r19 = limbs still to add
133 beq r19,$Lret C nothing left, return the carry
134 C Start software pipeline for 2nd loop
135 ldq r0,0(r18) C first remaining s2 limb
136 ldq r4,0(r17) C first remaining s1 limb
137 subq r19,1,r19 C the pair just loaded is finished at $Lend0
138 beq r19,$Lend0 C only one limb left, skip the loop
139 C 2nd loop handles remaining 1-3 limbs
C Each iteration adds the pair already in r0/r4 and preloads the next pair.
140 ALIGN(16)
141 $Loop0: addq r0,r4,r28 C main add
142 ldq r0,8(r18) C preload next s2 limb
143 cmpult r28,r4,r8 C cy from main add (must precede the reload of r4)
144 ldq r4,8(r17) C preload next s1 limb
145 addq r28,r25,r20 C carry add
146 addq r18,8,r18 C advance s2_ptr by one limb
147 addq r17,8,r17 C advance s1_ptr by one limb
148 stq r20,0(r16) C store result limb
149 cmpult r20,r28,r25 C cy from carry add
150 subq r19,1,r19 C decr loop cnt
151 bis r8,r25,r25 C combine cy from the two adds
152 addq r16,8,r16 C advance res_ptr by one limb
153 bne r19,$Loop0 C repeat until only the preloaded pair is left
C Final limb: the pair is already in r0/r4, so no further loads are issued.
154 $Lend0: addq r0,r4,r28 C main add
155 addq r28,r25,r20 C carry add
156 cmpult r28,r4,r8 C cy from main add
157 cmpult r20,r28,r25 C cy from carry add
158 stq r20,0(r16) C store last result limb
159 bis r8,r25,r25 C combine cy from the two adds
161 $Lret: bis r25,r31,r0 C return cy
162 ret r31,(r26),1 C return to the address held in r26
163 EPILOGUE()
164 ASM_END()