dnl  Alpha mpn_modexact_1c_odd -- mpn exact remainder

dnl  Copyright 2003, 2004 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')
C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
C                                mp_limb_t c)
C
C This code follows the "alternate" code in mpn/generic/mode1o.c,
C eliminating cbit+climb from the dependent chain.  This leaves,
C
C        ev4    ev5   ev6
C         1      3     1    subq   y = x - h
C        23     13     7    mulq   q = y * inverse
C        23     14     7    umulh  h = high (q * d)
C
C In each case, the load latency, loop control, and extra carry bit handling
C hide under the multiply latencies.  Those latencies are long enough that
C we don't need to worry about alignment or pairing to squeeze out
C performance.
C
C For the first limb, some of the loop code is broken out and scheduled back
C since it can be done earlier.
C
C   - The first ldq src[0] is near the start of the routine, for maximum
C     time from memory.
C
C   - The subq y=x-climb can be done without waiting for the inverse.
C
C   - The mulq y*inverse is replicated after the final subq for the inverse,
C     instead of branching to the mulq in the main loop.  On ev4 a branch
C     there would cost cycles, but we can hide them under the mulq latency.
C
C For the last limb, high<divisor is tested and if that's true a subtract
C and addback is done, as per the main mpn/generic/mode1o.c code.  This is a
C data-dependent branch, but we're waiting for umulh so any penalty should
C hide there.  The multiplies saved would be worth the cost anyway.
C
C Enhancements:
C
C For size==1, a plain division (done bitwise say) might be faster than
C calculating an inverse, the latter taking about 130 cycles on ev4 or 70 on
C ev5.  A call to gcc __remqu might be a possibility.
PROLOGUE(mpn_modexact_1c_odd,gp)

	C On entry (standard Alpha argument registers):
	C r16	src pointer
	C r17	size (number of limbs, >= 1)
	C r18	d, the odd divisor
	C r19	c, the initial carry limb ("climb")

	LEA(r0, binvert_limb_table)
	srl	r18, 1, r20		C d >> 1

	and	r20, 127, r20		C idx = d>>1 & 0x7F

	addq	r0, r20, r21		C table + idx

C Fetch the 8-bit starting inverse; use a byte load when the BWX
C extension is available, otherwise an unaligned quad load plus extract.
ifelse(bwx_available_p,1,
`	ldbu	r20, 0(r21)		C table[idx], inverse 8 bits
',`	ldq_u	r20, 0(r21)		C table[idx] qword
	extbl	r20, r21, r20		C table[idx], inverse 8 bits
')

	C Newton iteration i -> 2*i - i*i*d doubles the number of correct
	C low bits each pass: 8 -> 16 -> 32 -> 64.  The first-limb load,
	C size decrement and cbit clear are scheduled into the gaps since
	C they don't depend on the inverse.

	mull	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	ldq	r2, 0(r16)		C x = s = src[0]
	lda	r17, -1(r17)		C size--
	clr	r0			C initial cbit=0

	mull	r7, r18, r7		C i*i*d

	subq	r20, r7, r20		C 2*i-i*i*d, inverse 16 bits

	mull	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	mull	r7, r18, r7		C i*i*d

	subq	r20, r7, r20		C 2*i-i*i*d, inverse 32 bits

	mulq	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	mulq	r7, r18, r7		C i*i*d
	subq	r2, r19, r3		C y = x - climb

	subq	r20, r7, r20		C inv = 2*i-i*i*d, inverse 64 bits

	ASSERT(r7,			C should have d*inv==1 mod 2^64
`	mulq	r18, r20, r7
	cmpeq	r7, 1, r7')

	mulq	r3, r20, r4		C first q = y * inv

	beq	r17, L(one)		C if size==1


L(top):
	C Loop invariants:
	C r16	src, incrementing
	C r17	size, decrementing
	C r18	d
	C r19	climb
	C r20	inv

	ldq	r1, 0(r16)		C s = src[i]
	subq	r1, r0, r2		C x = s - cbit
	cmpult	r1, r0, r0		C new cbit = s < cbit

	subq	r2, r19, r3		C y = x - climb

	mulq	r3, r20, r4		C q = y * inv

	cmpult	r2, r19, r5		C cbit2 = x < climb
	addq	r5, r0, r0		C cbit += cbit2
	lda	r16, 8(r16)		C src++
	lda	r17, -1(r17)		C size--

	umulh	r4, r18, r19		C climb = q * d
	bne	r17, L(top)		C while 2 or more limbs left


L(one):
	ldq	r1, 0(r16)		C s = src[size-1] high limb

	cmpult	r1, r18, r2		C test high<divisor
	bne	r2, L(skip)		C skip a divide step if so

	C can't skip a division, repeat loop code

	subq	r1, r0, r2		C x = s - cbit
	cmpult	r1, r0, r0		C new cbit = s < cbit

	subq	r2, r19, r3		C y = x - climb

	mulq	r3, r20, r4		C q = y * inv

	cmpult	r2, r19, r5		C cbit2 = x < climb
	addq	r5, r0, r0		C cbit += cbit2

	umulh	r4, r18, r19		C climb = q * d

	addq	r19, r0, r0		C return climb + cbit
	ret	r31, (r26), 1


L(skip):
	C with high<divisor, the final step can be just (cbit+climb)-s and
	C an addback of d if that underflows

	addq	r19, r0, r19		C c = climb + cbit

	subq	r19, r1, r2		C c - s
	cmpult	r19, r1, r3		C c < s

	addq	r2, r18, r0		C return c-s + divisor

	cmoveq	r3, r2, r0		C return c-s if no underflow