1 dnl Alpha mpn_com -- mpn one's complement.
3 dnl Copyright 2003 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
20 dnl or both in parallel, as here.
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
40 C mp_limb_t mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
42 C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
43 C 2.0 c/l. In general, a pattern like this unrolled to N limbs per loop
44 C will be 1.5+2/N c/l.
46 C 2 cycles of loop control are unavoidable, for pointer updates and the
47 C taken branch bubble, but also since ldq cannot issue two cycles after stq
48 C (and with a run of stqs that means neither of two cycles at the end of the
49 C loop).
51 C The fbeq is forced into the second cycle of the loop using unops, since
52 C the first time through it must wait for the cvtqt result. Once that
53 C result is ready (a 1 cycle stall) then both the branch and following loads
54 C can issue together.
56 C The main loop handles an odd count of limbs, being two limbs loaded before
57 C each size test, plus one pipelined around from the previous iteration (or
58 C setup in the entry sequence).
60 C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
61 C entry sequence, and an increment of the pointers. For an odd size there's
62 C no increment and the first store in the loop (r24) is a repeat of dst[0].
64 C Note that the load for r24 after the possible pointer increment is done
65 C before the explicit store to dst[0], in case src==dst.
C mpn_com body: entry sequence, unrolled main loop, and two exit tails.
C
C Register use, as read from the instructions below:
C   r16  dst (only used for the explicit dst[0] store)
C   r17  src, advanced by one limb in the entry sequence when size is even
C   r18  size
C   r19  working dst pointer, r16 plus the same optional one-limb skip
C   r7   size-3; negative means size<=2 and the main loop is bypassed
C   r5   1 if size is even, else 0
C   f0   (size-3)/2 as a float, the loop counter
C   f1   the constant 2.0, subtracted from f0 once per loop pass
C
C NOTE(review): the numbers embedded at the start of each line skip values,
C so this view appears to be missing lines. In particular nothing visible
C sets r8 before it is used as the address of the 2.0 constant, no label
C definition for L(done_1) is visible, and no loop-closing branch or return
C is visible. Confirm against the complete file before changing any
C instruction or its position.
C
C Entry sequence.
C The integer count (size-3)/2 reaches the floating point register f0 by
C way of memory: it is stored at 8(r30) in the 16 bytes of scratch stack
C and reloaded from there with ldt, then converted by cvtqt.
80 lda r30, -16(r30) C temporary stack space
81 lda r7, -3(r18) C size - 3
83 ldq r20, 0(r17) C src[0]
84 srl r7, 1, r6 C (size-3)/2
86 stq r6, 8(r30) C (size-3)/2
C Low bit of size-3 is set exactly when size is even.
87 and r7, 1, r5 C 1 if size even
C s8addq scales r5 by 8, i.e. by one limb, so src advances by 0 or 1 limb.
90 s8addq r5, r17, r17 C skip src[0] if even
92 ornot r31, r20, r20 C ~src[0]
95 ldt f0, 8(r30) C (size-3)/2
C This load of r24 is placed ahead of the dst[0] store below so that it
C still reads the original source limb when src==dst.
96 ldq r24, 0(r17) C src[0 or 1]
98 stq r20, 0(r16) C dst[0]
99 s8addq r5, r16, r19 C skip dst[0] if even
C NOTE(review): r8 is presumably set to the address of a 2.0 constant by a
C line not shown here -- confirm.
101 ldt f1, 0(r8) C data 2.0
102 lda r30, 16(r30) C restore stack
104 cvtqt f0, f0 C (size-3)/2 as float
C NOTE(review): r24 holds a raw source limb at this point and is stored
C unchanged by the visible code; a complement of r24 is presumably among
C the lines missing from this view -- confirm.
107 blt r7, L(done_1) C if size<=2
C Main loop: each pass loads four limbs at offsets 8..32 from src, stores
C four limbs at offsets 0..24 from dst, then advances both pointers by 32
C bytes and lowers the float counter by 2.0.
112 C 16-byte alignment here
114 C r17 src, incrementing
115 C r19 dst, incrementing
116 C r24 dst[i] result, ready to store
117 C f0 (size-3)/2, decrementing
120 ldq r20, 8(r17) C src[i+1]
121 ldq r21, 16(r17) C src[i+2]
C NOTE(review): the header comment describes an fbeq loop exit in the
C second cycle; no such branch is visible in this view.
127 ldq r22, 24(r17) C src[i+3]
128 ldq r23, 32(r17) C src[i+4]
130 stq r24, 0(r19) C dst[i]
132 subt f0, f1, f0 C count -= 2
C NOTE(review): r20, r21 and r22 are stored as loaded in the visible code,
C and r23 is loaded but never read. The complement steps, and the one that
C would carry src[i+4] into r24 for the next pass, are presumably among the
C missing lines -- confirm.
135 stq r20, 8(r19) C dst[i+1]
140 stq r21, 16(r19) C dst[i+2]
143 stq r22, 24(r19) C dst[i+3]
146 lda r17, 32(r17) C src += 4
147 lda r19, 32(r19) C dst += 4
C Tail storing a single pending limb (presumably the L(done_1) target that
C the blt above branches to; the label itself is not visible).
154 C r24 result for dst[size-1]
156 stq r24, 0(r19) C dst[size-1]
C Tail storing the final three limbs: the pending r24 plus the two limbs
C most recently loaded into r20 and r21. NOTE(review): the label and the
C branch that reaches this tail are not visible in this view.
164 C r24 result for dst[size-3]
166 stq r24, 0(r19) C dst[size-3]
169 stq r20, 8(r19) C dst[size-2]
172 stq r21, 16(r19) C dst[size-1]