dnl  IA-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
dnl  Contributed to the GNU project by Torbjörn Granlund.
dnl  Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
C  * Use shladd in feed-in code (for mpn_addlshC_n).
C  * Rewrite loop to schedule loads closer to use, since we do prefetch.
48 define(`ADDSUB', `
add $1 = $2, $3')
49 define(`CMP', `
cmp.ltu
$1,p0
= $2, $3')
52 define(`func', mpn_addlsh`
'LSH`'_n
)')
54 define
(`ADDSUB
', `sub $1 = $2, $3')
55 define
(`
CMP', `cmp.gtu $1,p0 = $2, $3')
58 define
(`func
', mpn_sublsh`'LSH`
'_n)')
60 define(`ADDSUB', `
sub $1 = $3, $2')
61 define(`CMP', `
cmp.gtu
$1,p0
= $2, $4')
64 define(`func', mpn_rsblsh`
'LSH`'_n
)')
68 define(`u0',`r14
') define(`u1',`r15
') define(`u2',`r16
') define(`u3',`r17
')
69 define(`v0',`r18
') define(`v1',`r19
') define(`v2',`r20
') define(`v3',`r21
')
70 define(`w0',`r22
') define(`w1',`r23
') define(`w2',`r24
') define(`w3',`r25
')
71 define(`s0',`r26
') define(`s1',`r27
') define(`s2',`r28
') define(`s3',`r29
')
72 define(`x0',`r30
') define(`x1',`r31
') define(`x2',`r3
') define(`x3',`r9
')
82 addp4 rp
= 0, rp C M I
83 addp4 up
= 0, up C M I
85 addp4 vp
= 0, vp C M I
90 {.mmi; ld8 r11 = [vp], 8 C M01
91 ld8 r10 = [up], 8 C M01
93 }{.mmi; and r14 = 3, n C M I
94 cmp.lt p15, p0 = 4, n C M I
97 }{.mmi; cmp.eq p6, p0 = 1, r14 C M I
98 cmp.eq p7, p0 = 2, r14 C M I
99 cmp.eq p8, p0 = 3, r14 C M I
101 (p6) br.dptk .Lb01 C B
102 (p7) br.dptk .Lb10 C B
103 (p8) br.dptk .Lb11 C B
107 {.mmi; ld8 v0 = [vp], 8 C M01
108 ld8 u0 = [up], 8 C M01
111 }{.mmi; ld8 v1 = [vp], 8 C M01
112 ld8 u1 = [up], 8 C M01
113 shl x3 = r11, LSH C I0
115 }{.mmi; ld8 v2 = [vp], 8 C M01
116 ld8 u2 = [up], 8 C M01
117 shrp x0 = v0, r11, 64-LSH C I0
118 }{.mmb; ADDSUB( w3, r10, x3) C M I
120 (p15) br.dpnt .grt4 C B
122 }{.mii; CMP( p7, w3, r10, x3) C M II0
123 shrp x1 = v1, v0, 64-LSH C I0
124 ADDSUB( w0, u0, x0) C M I
126 }{.mii; CMP( p8, w0, u0, x0) C M I
127 shrp x2 = v2, v1, 64-LSH C I0
128 ADDSUB( w1, u1, x1) C M I
135 {.mii; ld8 v3 = [vp], 8 C M01
136 shrp x0 = v0, r11, 64-LSH C I0
137 CMP( p8, w3, r10, x3) C M I
139 }{.mmi; ld8 u3 = [up], 8 C M01
141 shrp x1 = v1, v0, 64-LSH C I0
142 }{.mmi; ld8 v0 = [vp], 8 C M01
143 ADDSUB( w0, u0, x0) C M I
146 }{.mmi; CMP( p6, w0, u0, x0) C M I
149 }{.mmb; ADDSUB( w1, u1, x1) C M I
150 ld8 u0 = [up], 8 C M01
157 ` shladd w2
= r11
, LSH
, r10 C M I
158 shr.u r8
= r11
, 64-LSH C retval I0
159 (p15
) br.dpnt .grt1 C B
162 shl x2 = r11, LSH C I0
163 (p15) br.dpnt .grt1 C B
165 ADDSUB( w2, r10, x2) C M I
166 shr.u r8 = r11, 64-LSH C retval I0
169 CMP( p6
, w2
, r10
, x2
) C M I
172 .
grt1: ld8 v3
= [vp
], 8 C M01
173 ld8 u3
= [up
], 8 C M01
176 ld8 v0
= [vp
], 8 C M01
177 ld8 u0
= [up
], 8 C M01
178 mov.i ar.lc
= n C FIXME swap with next I0
184 {.mmi; ld8 v1 = [vp], 8 C M01
185 ld8 u1 = [up], 8 C M01
186 shrp x3 = v3, r11, 64-LSH C I0
188 }{.mmi; ld8 v2 = [vp], 8 C M01
189 ld8 u2 = [up], 8 C M01
190 shrp x0 = v0, v3, 64-LSH C I0
191 }{.mmb; CMP( p6, w2, r10, x2) C M I
192 ADDSUB( w3, u3, x3) C M I
193 br.cloop.dptk .grt5 C B
195 }{.mmi; CMP( p7, w3, u3, x3) C M I
196 ADDSUB( w0, u0, x0) C M I
197 shrp x1 = v1, v0, 64-LSH C I0
203 {.mmi; add r10 = PFDIST, up
205 shrp x0 = v0, v3, 64-LSH C I0
206 }{.mmb; ld8 v3 = [vp], 8 C M01
207 CMP( p8, w3, u3, x3) C M I
212 {.mmi; ld8 v2 = [vp], 8 C M01
213 ld8 u2 = [up], 8 C M01
214 shl x1 = r11, LSH C I0
217 (p15) br.dpnt .grt2 C B
219 }{.mmi; ADDSUB( w1, r10, x1) C M I
221 shrp x2 = v2, r11, 64-LSH C I0
223 }{.mmi; CMP( p9, w1, r10, x1) C M I
224 ADDSUB( w2, u2, x2) C M I
225 shr.u r8 = v2, 64-LSH C retval I0
227 }{.mmb; CMP( p6, w2, u2, x2) C M I
232 {.mmi; ld8 v3 = [vp], 8 C M01
233 ld8 u3 = [up], 8 C M01
236 }{.mmi; ld8 v0 = [vp], 8 C M01
237 ld8 u0 = [up], 8 C M01
239 }{.mmi; ADDSUB( w1, r10, x1) C M I
243 }{.mii; ld8 v1 = [vp], 8 C M01
244 shrp x2 = v2, r11, 64-LSH C I0
245 CMP( p8, w1, r10, x1) C M I
247 }{.mmi; add r10 = PFDIST, up
248 ld8 u1 = [up], 8 C M01
249 shrp x3 = v3, v2, 64-LSH C I0
250 }{.mmi; add r11 = PFDIST, vp
251 ld8 v2 = [vp], 8 C M01
252 ADDSUB( w2, u2, x2) C M I
254 }{.mmi; CMP( p6, w2, u2, x2) C M I
255 ld8 u2 = [up], 8 C M01
256 shrp x0 = v0, v3, 64-LSH C I0
257 }{.mib; ADDSUB( w3, u3, x3) C M I
259 br.cloop.dpnt L(top) C B
263 {.mmi; ld8 v1 = [vp], 8 C M01
264 ld8 u1 = [up], 8 C M01
265 shl x0 = r11, LSH C I0
267 }{.mmi; ld8 v2 = [vp], 8 C M01
268 ld8 u2 = [up], 8 C M01
272 (p15) br.dpnt .grt3 C B
275 shrp x1 = v1, r11, 64-LSH C I0
276 ADDSUB( w0, r10, x0) C M I
278 }{.mii; CMP( p8, w0, r10, x0) C M I
279 shrp x2 = v2, v1, 64-LSH C I0
280 ADDSUB( w1, u1, x1) C M I
282 }{.mmb; CMP( p9, w1, u1, x1) C M I
283 ADDSUB( w2, u2, x2) C M I
287 {.mmi; ld8 v3 = [vp], 8 C M01
288 ld8 u3 = [up], 8 C M01
289 shrp x1 = v1, r11, 64-LSH C I0
290 }{.mmi; ADDSUB( w0, r10, x0) C M I
294 }{.mmi; ld8 v0 = [vp], 8 C M01
295 CMP( p6, w0, r10, x0) C M I
297 }{.mmi; ld8 u0 = [up], 8 C M01
298 ADDSUB( w1, u1, x1) C M I
301 }{.mmi; add r10 = PFDIST, up
303 shrp x2 = v2, v1, 64-LSH C I0
304 }{.mmb; ld8 v1 = [vp], 8 C M01
305 CMP( p8, w1, u1, x1) C M I
309 C *** MAIN LOOP START ***
311 L(top): st8 [rp] = w1, 8 C M23
313 (p8) cmpeqor p6, p0 = LIM, w2 C M I
314 (p8) add w2 = INCR, w2 C M I
315 ld8 v3 = [vp], 8 C M01
316 CMP( p8, w3, u3, x3) C M I
318 .LL01: ld8 u3 = [up], 8 C M01
319 shrp x1 = v1, v0, 64-LSH C I0
320 (p6) cmpeqor p8, p0 = LIM, w3 C M I
321 (p6) add w3 = INCR, w3 C M I
322 ld8 v0 = [vp], 8 C M01
323 ADDSUB( w0, u0, x0) C M I
325 st8 [rp] = w2, 8 C M23
326 CMP( p6, w0, u0, x0) C M I
328 ld8 u0 = [up], 8 C M01
330 ADDSUB( w1, u1, x1) C M I
332 .LL00: st8 [rp] = w3, 8 C M23
333 shrp x2 = v2, v1, 64-LSH C I0
334 (p8) cmpeqor p6, p0 = LIM, w0 C M I
335 (p8) add w0 = INCR, w0 C M I
336 ld8 v1 = [vp], 8 C M01
337 CMP( p8, w1, u1, x1) C M I
339 .LL11: ld8 u1 = [up], 8 C M01
340 shrp x3 = v3, v2, 64-LSH C I0
341 (p6) cmpeqor p8, p0 = LIM, w1 C M I
342 (p6) add w1 = INCR, w1 C M I
343 ld8 v2 = [vp], 8 C M01
344 ADDSUB( w2, u2, x2) C M I
346 {.mmi; st8 [rp] = w0, 8 C M23
347 CMP( p6, w2, u2, x2) C M I
348 shrp x0 = v0, v3, 64-LSH C I0
350 ld8 u2 = [up], 8 C M01
351 ADDSUB( w3, u3, x3) C M I
352 br.cloop.dptk L(top) C B
355 C *** MAIN LOOP END ***
358 {.mmi; st8 [rp] = w1, 8 C M23
359 (p8) cmpeqor p6, p0 = LIM, w2 C M I
360 shrp x1 = v1, v0, 64-LSH C I0
362 (p8) add w2 = INCR, w2 C M I
363 CMP( p7, w3, u3, x3) C M I
364 ADDSUB( w0, u0, x0) C M I
368 {.mmi; st8 [rp] = w2, 8 C M23
369 (p6) cmpeqor p7, p0 = LIM, w3 C M I
370 shrp x2 = v2, v1, 64-LSH C I0
372 (p6) add w3 = INCR, w3 C M I
373 CMP( p8, w0, u0, x0) C M I
374 ADDSUB( w1, u1, x1) C M I
378 {.mmi; st8 [rp] = w3, 8 C M23
379 (p7) cmpeqor p8, p0 = LIM, w0 C M I
380 mov.i ar.lc = r2 C I0
382 (p7) add w0 = INCR, w0 C M I
383 CMP( p9, w1, u1, x1) C M I
384 ADDSUB( w2, u2, x2) C M I
388 {.mmi; st8 [rp] = w0, 8 C M23
389 (p8) cmpeqor p9, p0 = LIM, w1 C M I
390 shr.u r8 = v2, 64-LSH C I0
392 (p8) add w1 = INCR, w1 C M I
393 CMP( p6, w2, u2, x2) C M I
398 {.mmi; st8 [rp] = w1, 8 C M23
399 (p9) cmpeqor p6, p0 = LIM, w2 C M I
400 (p9) add w2 = INCR, w2 C M I
404 {.mmb; st8 [rp] = w2 C M23
406 (p6
) add r8
= -1, r8 C M I
408 (p6) add r8 = 1, r8 C M I
409 ') br.
ret.sptk.many b0 C B