beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / alpha / com.asm
blobf084ab5e961cdc4ec32ed8201aa6852fdf42628d
1 dnl Alpha mpn_com -- mpn one's complement.
3 dnl Copyright 2003 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C cycles/limb
35 C EV4: 4.75
36 C EV5: 2.0
37 C EV6: 1.5
40 C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
42 C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
43 C 2.0 c/l. In general, a pattern like this unrolled to N limbs per loop
44 C will be 1.5+2/N c/l.
46 C 2 cycles of loop control are unavoidable, for pointer updates and the
47 C taken branch bubble, but also since ldq cannot issue two cycles after stq
48 C (and with a run of stqs that means neither of two cycles at the end of the
49 C loop).
51 C The fbeq is forced into the second cycle of the loop using unops, since
52 C the first time through it must wait for the cvtqt result. Once that
53 C result is ready (a 1 cycle stall) then both the branch and following loads
54 C can issue together.
56 C The main loop handles an odd count of limbs, being two limbs loaded before
57 C each size test, plus one pipelined around from the previous iteration (or
58 C setup in the entry sequence).
60 C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
61 C entry sequence, and an increment of the pointers. For an odd size there's
62 C no increment and the first store in the loop (r24) is a repeat of dst[0].
64 C Note that the load for r24 after the possible pointer increment is done
65 C before the explicit store to dst[0], in case src==dst.
68 ASM_START()
70 FLOAT64(L(dat), 2.0) C constant 2.0; loaded into f1 below and subtracted from the loop count
72 ALIGN(16)
C mpn_com: store the one's complement of each of the size limbs at src into
C dst. Limbs are 8 bytes wide (ldq/stq, pointers advanced 8 bytes per limb).
C
C Structure: an entry sequence that stores dst[0] and prepares one pending
C result in r24, a main loop that retires four limbs per pass, and two exits:
C L(done_1) stores the one pending limb, L(done_2) stores the pending limb
C plus two more that were already loaded.
C
C The entry sequence reads src[0] unconditionally, so size is presumably
C required to be at least 1 -- TODO confirm against the callers' contract.
74 PROLOGUE(mpn_com,gp)
76 C r16 dst
77 C r17 src
78 C r18 size
C Other registers used: r5-r8 and r19-r24 as integer temporaries, f0 as the
C loop count, f1 as the constant 2.0. No result register is written.
80 lda r30, -16(r30) C temporary stack space
81 lda r7, -3(r18) C size - 3
83 ldq r20, 0(r17) C src[0]
84 srl r7, 1, r6 C (size-3)/2
86 stq r6, 8(r30) C (size-3)/2
87 and r7, 1, r5 C 1 if size even
89 LEA( r8, L(dat)) C r8 = address of the 2.0 constant
90 s8addq r5, r17, r17 C skip src[0] if even
92 ornot r31, r20, r20 C ~src[0]
93 unop C no-op filler, presumably for instruction scheduling
95 ldt f0, 8(r30) C (size-3)/2
96 ldq r24, 0(r17) C src[0 or 1]
98 stq r20, 0(r16) C dst[0]
99 s8addq r5, r16, r19 C skip dst[0] if even
101 ldt f1, 0(r8) C data 2.0
102 lda r30, 16(r30) C restore stack
103 unop C no-op filler, presumably for instruction scheduling
104 cvtqt f0, f0 C (size-3)/2 as float
106 ornot r31, r24, r24 C ~src[0 or 1], the first pending result for the store at r19
107 blt r7, L(done_1) C if size<=2
108 unop C padding so that L(top) lands on the 16-byte boundary noted below
109 unop C padding, as above
112 C 16-byte alignment here
113 L(top):
114 C r17 src, incrementing
115 C r19 dst, incrementing
116 C r24 dst[i] result, ready to store
117 C f0 (size-3)/2, decrementing
118 C f1 2.0
C Each pass that does not exit at the fbeq stores four limbs (dst[i] through
C dst[i+3]) and leaves ~src[i+4] pending in r24 for the next pass or for
C L(done_1). The count in f0 is in units of two limbs, hence the step of 2.0.
120 ldq r20, 8(r17) C src[i+1]
121 ldq r21, 16(r17) C src[i+2]
122 unop C filler; per the header notes, unops keep fbeq in the second cycle
123 unop C filler, as above
125 fbeq f0, L(done_2) C count is zero: r24 plus the two limbs just loaded are all that remain
126 unop C no-op filler, presumably for instruction scheduling
127 ldq r22, 24(r17) C src[i+3]
128 ldq r23, 32(r17) C src[i+4]
130 stq r24, 0(r19) C dst[i]
131 ornot r31, r20, r20 C ~src[i+1]
132 subt f0, f1, f0 C count -= 2
133 unop C no-op filler, presumably for instruction scheduling
135 stq r20, 8(r19) C dst[i+1]
136 ornot r31, r21, r21 C ~src[i+2]
137 unop C no-op filler, presumably for instruction scheduling
138 unop C no-op filler, presumably for instruction scheduling
140 stq r21, 16(r19) C dst[i+2]
141 ornot r31, r22, r22 C ~src[i+3]
143 stq r22, 24(r19) C dst[i+3]
144 ornot r31, r23, r24 C ~src[i+4] becomes the pending result carried in r24
146 lda r17, 32(r17) C src += 4
147 lda r19, 32(r19) C dst += 4
148 unop C no-op filler, presumably for instruction scheduling
149 fbge f0, L(top) C repeat while count >= 0; a negative count falls through to L(done_1)
152 L(done_1):
153 C r19 &dst[size-1]
154 C r24 result for dst[size-1]
C Reached directly from the entry sequence when size<=2, or by falling out
C of the loop when the count has gone negative. Only the pending limb is left.
156 stq r24, 0(r19) C dst[size-1]
157 ret r31, (r26), 1 C return to caller
160 L(done_2):
161 C r19 &dst[size-3]
162 C r20 src[size-2]
163 C r21 src[size-1]
164 C r24 result for dst[size-3]
C Reached from the fbeq at the top of the loop. r20 and r21 hold source
C limbs that have been loaded but not yet complemented.
166 stq r24, 0(r19) C dst[size-3]
167 ornot r31, r20, r20 C ~src[size-2]
169 stq r20, 8(r19) C dst[size-2]
170 ornot r31, r21, r21 C ~src[size-1]
172 stq r21, 16(r19) C dst[size-1]
173 ret r31, (r26), 1 C return to caller
175 EPILOGUE()
176 ASM_END()