1 dnl IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
2 dnl result from a second limb vector.
4 dnl Contributed to the GNU project by Torbjorn Granlund.
6 dnl Copyright 2000-2004 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
10 dnl The GNU MP Library is free software; you can redistribute it and/or modify
11 dnl it under the terms of either:
13 dnl * the GNU Lesser General Public License as published by the Free
14 dnl Software Foundation; either version 3 of the License, or (at your
15 dnl option) any later version.
19 dnl * the GNU General Public License as published by the Free Software
20 dnl Foundation; either version 2 of the License, or (at your option) any
23 dnl or both in parallel, as here.
25 dnl The GNU MP Library is distributed in the hope that it will be useful, but
26 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
30 dnl You should have received copies of the GNU General Public License and the
31 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
32 dnl see https://www.gnu.org/licenses/.
34 include(`..
/config.m4
')
38 C Itanium 2: 2.25 (alignment dependent, sometimes it seems to need 3 c/l)
41 C * Optimize feed-in and wind-down code, both for speed and code size.
42 C * Handle low limb input and results specially, using a common stf8 in the
44 C * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in
45 C 2nd bundle. This will allow the bbb bundle to be one cycle earlier and
55 PROLOGUE(mpn_submul_1)
61 ` addp4 rp
= 0, rp C M I
62 addp4 up
= 0, up C M I
69 sub vl = r0, vl C M I negate vl
74 add r19 = -1, n C M I n - 1
78 cmp.eq p6, p0 = 0, vl C M I
79 mov r8 = 0 C M I zero cylimb
83 setf.sig f6 = vl C M2 M3
85 shr.u r19 = r19, 2 C I0
90 cmp.eq p10, p0 = 0, r14 C M I
91 (p6) br.spnt .Ldone C B vl == 0
94 cmp.eq p11, p0 = 2, r14 C M I
95 cmp.eq p12, p0 = 3, r14 C M I
99 (p10) br.dptk .Lb00 C B
100 (p11) br.dptk .Lb10 C B
101 (p12) br.dptk .Lb11 C B
105 .Lb01: br.cloop.dptk .grt1
107 xma.l f39 = f7, f6, f8
108 xma.hu f43 = f7, f6, f8
110 getf.sig r27 = f39 C lo
111 getf.sig r31 = f43 C hi
115 .grt1: ldf8 f44 = [rp], 8
122 xma.l f39 = f7, f6, f8
124 xma.hu f43 = f7, f6, f8
127 xma.l f36 = f32, f6, f44
129 xma.hu f40 = f32, f6, f44
133 getf.sig r27 = f39 C lo
134 xma.l f37 = f33, f6, f45
136 xma.hu f41 = f33, f6, f45
138 getf.sig r31 = f43 C hi
139 getf.sig r24 = f36 C lo
140 xma.l f38 = f34, f6, f46
142 xma.hu f42 = f34, f6, f46
144 getf.sig r28 = f40 C hi
145 getf.sig r25 = f37 C lo
146 xma.l f39 = f35, f6, f47
148 xma.hu f43 = f35, f6, f47
150 getf.sig r29 = f41 C hi
151 getf.sig r26 = f38 C lo
155 .grt5: ldf8 f44 = [rp], 8
158 getf.sig r27 = f39 C lo
159 xma.l f37 = f33, f6, f45
161 xma.hu f41 = f33, f6, f45
164 getf.sig r31 = f43 C hi
167 getf.sig r24 = f36 C lo
168 xma.l f38 = f34, f6, f46
170 xma.hu f42 = f34, f6, f46
173 getf.sig r28 = f40 C hi
176 getf.sig r25 = f37 C lo
177 xma.l f39 = f35, f6, f47
179 xma.hu f43 = f35, f6, f47
182 getf.sig r29 = f41 C hi
185 getf.sig r26 = f38 C lo
186 xma.l f36 = f32, f6, f44
188 xma.hu f40 = f32, f6, f44
193 .Lb10: ldf8 f47 = [rp], 8
197 xma.l f38 = f7, f6, f8
198 xma.hu f42 = f7, f6, f8
200 xma.l f39 = f35, f6, f47
201 xma.hu f43 = f35, f6, f47
203 getf.sig r26 = f38 C lo
204 getf.sig r30 = f42 C hi
207 getf.sig r27 = f39 C lo
208 getf.sig r31 = f43 C hi
212 .grt2: ldf8 f44 = [rp], 8
217 xma.l f38 = f7, f6, f8
218 xma.hu f42 = f7, f6, f8
222 xma.l f39 = f35, f6, f47
223 xma.hu f43 = f35, f6, f47
228 getf.sig r26 = f38 C lo
229 xma.l f36 = f32, f6, f44
231 xma.hu f40 = f32, f6, f44
234 getf.sig r30 = f42 C hi
236 getf.sig r27 = f39 C lo
237 xma.l f37 = f33, f6, f45
239 xma.hu f41 = f33, f6, f45
241 getf.sig r31 = f43 C hi
242 getf.sig r24 = f36 C lo
243 xma.l f38 = f34, f6, f46
245 xma.hu f42 = f34, f6, f46
247 getf.sig r28 = f40 C hi
248 getf.sig r25 = f37 C lo
249 xma.l f39 = f35, f6, f47
251 xma.hu f43 = f35, f6, f47
254 .grt6: ldf8 f44 = [rp], 8
255 getf.sig r30 = f42 C hi
258 getf.sig r27 = f39 C lo
259 xma.l f37 = f33, f6, f45
261 xma.hu f41 = f33, f6, f45
264 getf.sig r31 = f43 C hi
267 getf.sig r24 = f36 C lo
268 xma.l f38 = f34, f6, f46
270 xma.hu f42 = f34, f6, f46
273 getf.sig r28 = f40 C hi
276 getf.sig r25 = f37 C lo
277 xma.l f39 = f35, f6, f47
279 xma.hu f43 = f35, f6, f47
283 .Lb11: ldf8 f46 = [rp], 8
290 xma.l f37 = f7, f6, f8
291 xma.hu f41 = f7, f6, f8
293 xma.l f38 = f34, f6, f46
294 xma.hu f42 = f34, f6, f46
296 getf.sig r25 = f37 C lo
297 xma.l f39 = f35, f6, f47
298 xma.hu f43 = f35, f6, f47
300 getf.sig r29 = f41 C hi
303 getf.sig r26 = f38 C lo
304 getf.sig r30 = f42 C hi
307 getf.sig r27 = f39 C lo
308 getf.sig r31 = f43 C hi
312 .grt3: ldf8 f44 = [rp], 8
313 xma.l f37 = f7, f6, f8
315 xma.hu f41 = f7, f6, f8
318 xma.l f38 = f34, f6, f46
320 xma.hu f42 = f34, f6, f46
325 getf.sig r25 = f37 C lo
326 xma.l f39 = f35, f6, f47
328 xma.hu f43 = f35, f6, f47
331 getf.sig r29 = f41 C hi
334 getf.sig r26 = f38 C lo
335 xma.l f36 = f32, f6, f44
337 xma.hu f40 = f32, f6, f44
341 getf.sig r30 = f42 C hi
342 getf.sig r27 = f39 C lo
343 xma.l f37 = f33, f6, f45
345 xma.hu f41 = f33, f6, f45
347 getf.sig r31 = f43 C hi
348 getf.sig r24 = f36 C lo
349 xma.l f38 = f34, f6, f46
351 xma.hu f42 = f34, f6, f46
354 .grt7: ldf8 f44 = [rp], 8
355 getf.sig r30 = f42 C hi
358 getf.sig r27 = f39 C lo
359 xma.l f37 = f33, f6, f45
361 xma.hu f41 = f33, f6, f45
364 getf.sig r31 = f43 C hi
367 getf.sig r24 = f36 C lo
368 xma.l f38 = f34, f6, f46
370 xma.hu f42 = f34, f6, f46
374 .Lb00: ldf8 f45 = [rp], 8
381 xma.l f36 = f7, f6, f8
383 xma.hu f40 = f7, f6, f8
386 xma.l f37 = f33, f6, f45
387 xma.hu f41 = f33, f6, f45
389 getf.sig r24 = f36 C lo
390 xma.l f38 = f34, f6, f46
392 xma.hu f42 = f34, f6, f46
394 getf.sig r28 = f40 C hi
395 xma.l f39 = f35, f6, f47
396 getf.sig r25 = f37 C lo
398 xma.hu f43 = f35, f6, f47
400 getf.sig r29 = f41 C hi
401 getf.sig r26 = f38 C lo
404 getf.sig r30 = f42 C hi
405 getf.sig r27 = f39 C lo
409 .grt4: ldf8 f44 = [rp], 8
410 xma.l f37 = f33, f6, f45
412 xma.hu f41 = f33, f6, f45
416 xma.l f38 = f34, f6, f46
417 getf.sig r24 = f36 C lo
419 xma.hu f42 = f34, f6, f46
422 getf.sig r28 = f40 C hi
424 xma.l f39 = f35, f6, f47
425 getf.sig r25 = f37 C lo
427 xma.hu f43 = f35, f6, f47
430 getf.sig r29 = f41 C hi
433 getf.sig r26 = f38 C lo
434 xma.l f36 = f32, f6, f44
436 xma.hu f40 = f32, f6, f44
440 getf.sig r30 = f42 C hi
441 getf.sig r27 = f39 C lo
442 xma.l f37 = f33, f6, f45
444 xma.hu f41 = f33, f6, f45
447 .grt8: ldf8 f44 = [rp], 8
448 getf.sig r30 = f42 C hi
451 getf.sig r27 = f39 C lo
452 xma.l f37 = f33, f6, f45
454 xma.hu f41 = f33, f6, f45
461 cmp.ltu p6, p0 = r27, r8 C lo cmp
462 sub r14 = r27, r8 C lo sub
465 getf.sig r30 = f42 C hi
467 sub r8 = r20, r31 C hi sub
471 getf.sig r27 = f39 C lo
473 xma.l f37 = f33, f6, f45
477 xma.hu f41 = f33, f6, f45
482 .LL00: ldf8 f45 = [rp], 8
483 cmp.ltu p6, p0 = r24, r8
487 getf.sig r31 = f43 C hi
493 getf.sig r24 = f36 C lo
495 xma.l f38 = f34, f6, f46
499 xma.hu f42 = f34, f6, f46
504 .LL11: ldf8 f46 = [rp], 8
505 cmp.ltu p6, p0 = r25, r8
509 getf.sig r28 = f40 C hi
515 getf.sig r25 = f37 C lo
517 xma.l f39 = f35, f6, f47
521 xma.hu f43 = f35, f6, f47
526 .LL10: ldf8 f47 = [rp], 8
527 cmp.ltu p6, p0 = r26, r8
531 getf.sig r29 = f41 C hi
537 getf.sig r26 = f38 C lo
539 xma.l f36 = f32, f6, f44
543 xma.hu f40 = f32, f6, f44
550 cmp.ltu p6, p0 = r27, r8
557 xma.l f37 = f33, f6, f45
559 xma.hu f41 = f33, f6, f45
563 cmp.ltu p6, p0 = r24, r8
570 xma.l f38 = f34, f6, f46
572 xma.hu f42 = f34, f6, f46
576 cmp.ltu p6, p0 = r25, r8
583 xma.l f39 = f35, f6, f47
585 xma.hu f43 = f35, f6, f47
589 cmp.ltu p6, p0 = r26, r8
600 cmp.ltu p6, p0 = r27, r8
611 cmp.ltu p6, p0 = r24, r8
620 cmp.ltu p6, p0 = r25, r8
628 cmp.ltu p6, p0 = r26, r8
636 cmp.ltu p6, p0 = r27, r8
644 .Ldone: mov ar.lc = r2