beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / pa32 / hppa1_1 / pa7100 / submul_1.asm
blob5ea08cbee5503dda2ebfe5afc158dba767d064df
1 dnl HP-PA 7100/7200 mpn_submul_1 -- Multiply a limb vector with a limb and
2 dnl subtract the result from a second limb vector.
4 dnl Copyright 1995, 2000-2003 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C INPUT PARAMETERS
35 define(`res_ptr',`%r26') C limb vector updated in place: loaded, reduced, stored back with post-increment
36 define(`s1_ptr',`%r25') C limb vector that is multiplied; advanced by the fldws,ma / fldds,ma loads
37 define(`size_param',`%r24') C limb count; counted down by the addib branches
38 define(`s2_limb',`%r23') C multiplier limb; only read once on entry, when it is spilled to the stack
C WORKING REGISTERS
C s0-s3 hold limbs loaded from res_ptr; loN/hiN hold the low/high words of
C the 64-bit products read back from the stack scratch slots.
C %r3-%r7 (s2, s3, lo1, lo2, lo3) are saved and restored by the routine.
40 define(`cylimb',`%r28') C running carry limb; its final value is left here at ret
41 define(`s0',`%r19')
42 define(`s1',`%r20')
43 define(`s2',`%r3')
44 define(`s3',`%r4')
45 define(`lo0',`%r21')
46 define(`lo1',`%r5')
47 define(`lo2',`%r6')
48 define(`lo3',`%r7')
49 define(`hi0',`%r22')
50 define(`hi1',`%r23') C safe to reuse: s2_limb is spilled and reloaded into %fr31R before hi1 is first written
51 define(`hi2',`%r29')
52 define(`hi3',`%r1')
C mpn_submul_1 -- multiply the limb vector at s1_ptr by s2_limb and subtract
C the products, limb by limb, from the vector at res_ptr, in place.
C
C Inputs (register aliases are defined above):
C   res_ptr     vector that is read, reduced and written back
C   s1_ptr      vector that is multiplied
C   size_param  limb count, used as a signed down-counter
C   s2_limb     multiplier, moved to %fr31R through the stack
C Result: cylimb holds the last high product word plus the final borrow when
C the routine branches through %r2.
C NOTE(review): %r28 is presumably the ABI return register -- confirm.
C
C Carry convention: after each group of subtracts the carry bit is inverted
C so that a borrow reads as a carry; the next addc then folds it into the
C following product.  The addib branches sit inside these carry chains.
C NOTE(review): this relies on addib leaving the carry bit alone -- confirm
C against the architecture manual.
C
C Layout: an optional single-limb step, then a 4-limbs-per-pass software
C pipeline (0 / loop / end), then a one-limb-per-pass loop (loop2) for
C whatever is left.
54 ASM_START()
55 PROLOGUE(mpn_submul_1)
56 C .callinfo frame=128,no_calls
58 ldo 128(%r30),%r30 C allocate a 128-byte frame, released at ret
59 stws s2_limb,-16(%r30) C spill the multiplier so the FPU can load it
60 add %r0,%r0,cylimb C clear cy and cylimb
61 addib,< -4,size_param,L(few_limbs) C bias size by -4; under 4 limbs uses loop2 only
62 fldws -16(%r30),%fr31R C delay slot: multiplier into %fr31R
64 ldo -112(%r30),%r31 C %r31 = base of the product scratch slots at -16..8(%r31)
65 stw %r3,-96(%r30) C save %r3-%r7, used below as s2, s3, lo1, lo2, lo3
66 stw %r4,-92(%r30)
67 stw %r5,-88(%r30)
68 stw %r6,-84(%r30)
69 stw %r7,-80(%r30)
C NOTE(review): bit 29 is the 4s bit in PA-RISC bit numbering, so this looks
C like an 8-byte alignment test for the fldds loads below -- confirm.
71 bb,>=,n s1_ptr,29,L(0) C bit clear: skip the single-limb step
C single-limb step: handle one limb with word-sized loads
73 fldws,ma 4(s1_ptr),%fr4 C one source limb, s1_ptr += 4
74 ldws 0(res_ptr),s0
75 xmpyu %fr4,%fr31R,%fr5
76 fstds %fr5,-16(%r31) C 64-bit product to scratch
77 ldws -16(%r31),cylimb C first word of the product seeds the carry limb
78 ldws -12(%r31),lo0 C second word is subtracted now
79 sub s0,lo0,s0
80 add s0,lo0,%r0 C invert cy
81 addib,< -1,size_param,L(few_limbs) C one limb done; under 4 left goes to loop2
82 stws,ma s0,4(res_ptr) C delay slot: store the limb, res_ptr += 4
84 C start software pipeline ----------------------------------------------------
85 LDEF(0)
86 fldds,ma 8(s1_ptr),%fr4 C fetch four source limbs as two doublewords
87 fldds,ma 8(s1_ptr),%fr8
89 xmpyu %fr4L,%fr31R,%fr5
90 xmpyu %fr4R,%fr31R,%fr6
91 xmpyu %fr8L,%fr31R,%fr9
92 xmpyu %fr8R,%fr31R,%fr10
94 fstds %fr5,-16(%r31) C move the four products to integer registers via the stack
95 fstds %fr6,-8(%r31)
96 fstds %fr9,0(%r31)
97 fstds %fr10,8(%r31)
99 ldws -16(%r31),hi0
100 ldws -12(%r31),lo0
101 ldws -8(%r31),hi1
102 ldws -4(%r31),lo1
103 ldws 0(%r31),hi2
104 ldws 4(%r31),lo2
105 ldws 8(%r31),hi3
106 ldws 12(%r31),lo3
108 addc lo0,cylimb,lo0 C add the pending carry limb and cy to the lowest word
109 addc lo1,hi0,lo1 C then chain each high word into the next low word
110 addc lo2,hi1,lo2
111 addc lo3,hi2,lo3
113 addib,< -4,size_param,L(end) C under 4 limbs beyond this group: drain the pipeline
114 addc %r0,hi3,cylimb C propagate carry into cylimb
115 C main loop ------------------------------------------------------------------
C On entry lo0-lo3 and cylimb hold the previous group, already carry-chained.
C Each pass subtracts that group from res_ptr while the products for the
C next group are computed and read back.
116 LDEF(loop)
117 fldds,ma 8(s1_ptr),%fr4
118 fldds,ma 8(s1_ptr),%fr8
120 ldws 0(res_ptr),s0
121 xmpyu %fr4L,%fr31R,%fr5
122 ldws 4(res_ptr),s1
123 xmpyu %fr4R,%fr31R,%fr6
124 ldws 8(res_ptr),s2
125 xmpyu %fr8L,%fr31R,%fr9
126 ldws 12(res_ptr),s3
127 xmpyu %fr8R,%fr31R,%fr10
129 fstds %fr5,-16(%r31)
130 sub s0,lo0,s0 C subtract the previous group, borrow chained by subb
131 fstds %fr6,-8(%r31)
132 subb s1,lo1,s1
133 fstds %fr9,0(%r31)
134 subb s2,lo2,s2
135 fstds %fr10,8(%r31)
136 subb s3,lo3,s3
137 subb %r0,%r0,lo0 C these two insns ...
138 add lo0,lo0,%r0 C ... just invert cy
140 ldws -16(%r31),hi0 C read back the next group
141 ldws -12(%r31),lo0
142 ldws -8(%r31),hi1
143 ldws -4(%r31),lo1
144 ldws 0(%r31),hi2
145 ldws 4(%r31),lo2
146 ldws 8(%r31),hi3
147 ldws 12(%r31),lo3
149 addc lo0,cylimb,lo0 C carry-chain the next group, interleaved with the stores
150 stws,ma s0,4(res_ptr)
151 addc lo1,hi0,lo1
152 stws,ma s1,4(res_ptr)
153 addc lo2,hi1,lo2
154 stws,ma s2,4(res_ptr)
155 addc lo3,hi2,lo3
156 stws,ma s3,4(res_ptr)
158 addib,>= -4,size_param,L(loop) C another full group of 4 available
159 addc %r0,hi3,cylimb C propagate carry into cylimb
160 C finish software pipeline ---------------------------------------------------
161 LDEF(end)
162 ldws 0(res_ptr),s0 C subtract the last group that is still held in lo0-lo3
163 ldws 4(res_ptr),s1
164 ldws 8(res_ptr),s2
165 ldws 12(res_ptr),s3
167 sub s0,lo0,s0
168 stws,ma s0,4(res_ptr)
169 subb s1,lo1,s1
170 stws,ma s1,4(res_ptr)
171 subb s2,lo2,s2
172 stws,ma s2,4(res_ptr)
173 subb s3,lo3,s3
174 stws,ma s3,4(res_ptr)
175 subb %r0,%r0,lo0 C these two insns ...
176 add lo0,lo0,%r0 C ... invert cy
178 C restore callee-saves registers ---------------------------------------------
179 ldw -96(%r30),%r3
180 ldw -92(%r30),%r4
181 ldw -88(%r30),%r5
182 ldw -84(%r30),%r6
183 ldw -80(%r30),%r7
C execution falls through into few_limbs to handle the 0..3 leftover limbs
185 LDEF(few_limbs)
186 addib,=,n 4,size_param,L(ret) C undo the -4 bias; nothing left goes to ret, delay slot nullified
C one limb per pass; scratch is at -16(%r30) here, not relative to %r31
188 LDEF(loop2)
189 fldws,ma 4(s1_ptr),%fr4
190 ldws 0(res_ptr),s0
191 xmpyu %fr4,%fr31R,%fr5
192 fstds %fr5,-16(%r30)
193 ldws -16(%r30),hi0
194 ldws -12(%r30),lo0
195 addc lo0,cylimb,lo0 C low word + carry limb + pending cy
196 addc %r0,hi0,cylimb C high word + carry becomes the new carry limb
197 sub s0,lo0,s0
198 add s0,lo0,%r0 C invert cy
199 stws,ma s0,4(res_ptr)
200 addib,<> -1,size_param,L(loop2)
201 nop C delay slot filler: keeps the addc at ret out of the loop body
203 LDEF(ret)
204 addc %r0,cylimb,cylimb C fold the final cy into the carry limb
205 bv 0(%r2) C branch to the address in %r2
206 ldo -128(%r30),%r30 C delay slot: release the frame
207 EPILOGUE(mpn_submul_1)