1 dnl IA-64 mpn_bdiv_dbm1c.
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2008, 2009 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any later version.
22 dnl or both in parallel, as here.
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`..
/config.m4
')
40 C * Optimize feed-in and wind-down code, both for speed and code size.
49 PROLOGUE(mpn_bdiv_dbm1c)
55 ` addp4 rp
= 0, rp C M I
56 addp4 up
= 0, up C M I
67 adds r16 = -1, n C M I
73 setf.sig f6 = bd C M2 M3
74 shr.u r31 = r16, 2 C I0
75 cmp.eq p10, p0 = 0, r14 C M I
79 cmp.eq p11, p0 = 2, r14 C M I
80 cmp.eq p12, p0 = 3, r14 C M I
84 cmp.ne p6, p7 = r0, r0 C M I
85 mov.i ar.lc = r31 C I0
86 cmp.ne p8, p9 = r0, r0 C M I
89 (p10) br.dptk .Lb00 C B
90 (p11) br.dptk .Lb10 C B
91 (p12) br.dptk .Lb11 C B
95 .Lb01: br.cloop.dptk .grt1
97 xma.l f38 = f9, f6, f0
98 xma.hu f39 = f9, f6, f0
104 .grt1: ldf8 f10 = [r33], 8
110 xma.l f38 = f9, f6, f0
111 xma.hu f39 = f9, f6, f0
115 xma.l f32 = f10, f6, f0
116 xma.hu f33 = f10, f6, f0
121 xma.l f34 = f11, f6, f0
122 xma.hu f35 = f11, f6, f0
127 xma.l f36 = f12, f6, f0
128 xma.hu f37 = f12, f6, f0
133 xma.l f38 = f13, f6, f0
134 xma.hu f39 = f13, f6, f0
137 .grt5: ldf8 f10 = [r33], 8
140 xma.l f34 = f11, f6, f0
141 xma.hu f35 = f11, f6, f0
147 xma.l f36 = f12, f6, f0
148 xma.hu f37 = f12, f6, f0
154 xma.l f38 = f13, f6, f0
155 xma.hu f39 = f13, f6, f0
158 .Lb10: ldf8 f13 = [r33], 8
162 xma.l f36 = f9, f6, f0
163 xma.hu f37 = f9, f6, f0
165 xma.l f38 = f13, f6, f0
166 xma.hu f39 = f13, f6, f0
177 .grt2: ldf8 f10 = [r33], 8
181 xma.l f36 = f9, f6, f0
182 xma.hu f37 = f9, f6, f0
186 xma.l f38 = f13, f6, f0
187 xma.hu f39 = f13, f6, f0
192 xma.l f32 = f10, f6, f0
193 xma.hu f33 = f10, f6, f0
199 xma.l f34 = f11, f6, f0
200 xma.hu f35 = f11, f6, f0
205 xma.l f36 = f12, f6, f0
206 xma.hu f37 = f12, f6, f0
209 .grt6: getf.sig r25 = f37
213 xma.l f34 = f11, f6, f0
214 xma.hu f35 = f11, f6, f0
220 xma.l f36 = f12, f6, f0
221 xma.hu f37 = f12, f6, f0
225 .Lb11: ldf8 f12 = [r33], 8
231 xma.l f34 = f9, f6, f0
232 xma.hu f35 = f9, f6, f0
234 xma.l f36 = f12, f6, f0
235 xma.hu f37 = f12, f6, f0
238 xma.l f38 = f13, f6, f0
239 xma.hu f39 = f13, f6, f0
250 .grt3: ldf8 f10 = [r33], 8
252 xma.l f34 = f9, f6, f0
253 xma.hu f35 = f9, f6, f0
257 xma.l f36 = f12, f6, f0
258 xma.hu f37 = f12, f6, f0
263 xma.l f38 = f13, f6, f0
264 xma.hu f39 = f13, f6, f0
270 xma.l f32 = f10, f6, f0
271 xma.hu f33 = f10, f6, f0
277 xma.l f34 = f11, f6, f0
278 xma.hu f35 = f11, f6, f0
281 .grt7: getf.sig r25 = f37
285 xma.l f34 = f11, f6, f0
286 xma.hu f35 = f11, f6, f0
290 .Lb00: ldf8 f11 = [r33], 8
298 xma.l f32 = f9, f6, f0
299 xma.hu f33 = f9, f6, f0
301 xma.l f34 = f11, f6, f0
302 xma.hu f35 = f11, f6, f0
305 xma.l f36 = f12, f6, f0
306 xma.hu f37 = f12, f6, f0
311 xma.l f38 = f13, f6, f0
312 xma.hu f39 = f13, f6, f0
319 .grt4: xma.l f32 = f9, f6, f0
320 xma.hu f33 = f9, f6, f0
324 xma.l f34 = f11, f6, f0
325 xma.hu f35 = f11, f6, f0
330 xma.l f36 = f12, f6, f0
331 xma.hu f37 = f12, f6, f0
337 xma.l f38 = f13, f6, f0
338 xma.hu f39 = f13, f6, f0
344 xma.l f32 = f10, f6, f0
345 xma.hu f33 = f10, f6, f0
349 C *** MAIN LOOP START ***
352 .pred.rel "mutex",p6,p7
355 xma.l f32 = f10, f6, f0
356 (p6) sub r15 = r19, r27, 1
359 xma.hu f33 = f10, f6, f0
360 (p7) sub r15 = r19, r27
366 cmp.ltu p6, p7 = r15, r20
375 xma.l f34 = f11, f6, f0
376 (p6) sub r15 = r16, r21, 1
379 xma.hu f35 = f11, f6, f0
380 (p7) sub r15 = r16, r21
386 cmp.ltu p6, p7 = r15, r22
395 xma.l f36 = f12, f6, f0
396 (p6) sub r15 = r17, r23, 1
399 xma.hu f37 = f12, f6, f0
400 (p7) sub r15 = r17, r23
406 cmp.ltu p6, p7 = r15, r24
415 xma.l f38 = f13, f6, f0
416 (p6) sub r15 = r18, r25, 1
419 xma.hu f39 = f13, f6, f0
420 (p7) sub r15 = r18, r25
426 cmp.ltu p6, p7 = r15, r26
430 br.cloop.sptk.few .Ltop
431 C *** MAIN LOOP END ***
435 xma.l f32 = f10, f6, f0
436 (p6) sub r15 = r19, r27, 1
438 xma.hu f33 = f10, f6, f0
439 (p7) sub r15 = r19, r27
441 .Lcj8: getf.sig r25 = f37
442 cmp.ltu p6, p7 = r15, r20
446 xma.l f34 = f11, f6, f0
447 (p6) sub r15 = r16, r21, 1
449 xma.hu f35 = f11, f6, f0
450 (p7) sub r15 = r16, r21
452 .Lcj7: getf.sig r27 = f39
453 cmp.ltu p6, p7 = r15, r22
457 xma.l f36 = f12, f6, f0
458 (p6) sub r15 = r17, r23, 1
460 xma.hu f37 = f12, f6, f0
461 (p7) sub r15 = r17, r23
463 .Lcj6: getf.sig r21 = f33
464 cmp.ltu p6, p7 = r15, r24
468 xma.l f38 = f13, f6, f0
469 (p6) sub r15 = r18, r25, 1
471 xma.hu f39 = f13, f6, f0
472 (p7) sub r15 = r18, r25
474 .Lcj5: getf.sig r23 = f35
475 cmp.ltu p6, p7 = r15, r26
479 (p6) sub r15 = r19, r27, 1
481 (p7) sub r15 = r19, r27
483 .Lcj4: getf.sig r25 = f37
484 cmp.ltu p6, p7 = r15, r20
488 (p6) sub r15 = r16, r21, 1
490 (p7) sub r15 = r16, r21
492 .Lcj3: getf.sig r27 = f39
493 cmp.ltu p6, p7 = r15, r22
496 (p6) sub r15 = r17, r23, 1
498 (p7) sub r15 = r17, r23
500 .Lcj2: cmp.ltu p6, p7 = r15, r24
503 (p6) sub r15 = r18, r25, 1
505 (p7) sub r15 = r18, r25
507 .Lcj1: cmp.ltu p6, p7 = r15, r26
510 (p6) sub r8 = r19, r27, 1
512 (p7) sub r8 = r19, r27