1 dnl Alpha mpn_mod_1s_4p
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2009, 2010 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
22 dnl or both in parallel, as here.
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
dnl Pull in the m4 definitions from ../config.m4.  The quoted file name has to
dnl stay on a single line; a line break inside the quotes becomes part of the
dnl name that m4 tries to open.
33 include(`../config.m4')
36 C * Optimise. 2.75 c/l should be possible.
37 C * Write a proper mpn_mod_1s_4p_cps. The code below was compiler generated.
38 C * Optimise feed-in code, starting the sw pipeline in switch code.
39 C * Shorten software pipeline. The mul instructions are scheduled too far
40 C from their users. Fixing this will allow us to use fewer registers.
41 C * If we cannot reduce register usage, write perhaps small-n basecase.
42 C * Does this work for PIC?
C Symbolic names for registers r1..r5, used as multiplier operands below.
C NOTE(review): going by the macro names these presumably hold B^k mod b for
C k = 1..5 -- confirm against the code that loads them.
C Each closing quote must sit on the same line as the register name, otherwise
C the macro expands to the register name followed by a newline and splits the
C instruction that uses it.
55 define(`B1modb', `r1')
56 define(`B2modb', `r2')
57 define(`B3modb', `r3')
58 define(`B4modb', `r4')
59 define(`B5modb', `r5')
C mpn_mod_1s_4p
C
C Multiplies source limbs by the register constants B1modb..B5modb, using mulq
C for the low 64 bits and umulh for the high 64 bits of each unsigned product.
C NOTE(review): n, ap, rh and rl are symbolic register names that are not
C defined next to the BNmodb macros -- confirm where they are set up.
C NOTE(review): r18 is treated as the divisor b and offset 8 from r19 as cnt,
C going only by the existing end-of-line notes in the tail -- confirm.
62 PROLOGUE(mpn_mod_1s_4p)
74 s8addq n, ap, ap C point ap at vector end
C Entry b3: fetch the limb 16 bytes below the end pointer and start the
C high-half products with B1modb and B2modb.
C NOTE(review): b0..b3 presumably select on n mod 4 -- confirm at the dispatch.
83 L(b3): ldq r21, -16(ap)
87 umulh r21, B1modb, r12
89 umulh r22, B2modb, r13
C Entry b0: same pattern, starting 24 bytes below the end pointer, plus both
C halves of the product with B3modb.
100 L(b0): ldq r21, -24(ap)
105 umulh r21, B1modb, r12
107 umulh r22, B2modb, r13
108 mulq r23, B3modb, r10
109 umulh r23, B3modb, r27
C Entry b1: r31 always reads as zero, so this bis clears rh.
124 L(b1): bis r31, r31, rh
C Entry b2: rh starts out as the limb 8 bytes below the end pointer.
129 L(b2): ldq rh, -8(ap)
C Common path: branch to L(ed3) when n is zero or negative, otherwise issue
C the first round of products.
C NOTE(review): presumably this primes the software pipeline mentioned in the
C TODO list at the top of the file -- confirm.
133 L(com): ble n, L(ed3)
141 umulh r21, B1modb, r12
143 umulh r22, B2modb, r13
144 mulq r23, B3modb, r10
145 umulh r23, B3modb, r27
147 umulh rl, B4modb, r28
C L(top): fetch the limb at ap+8 and reissue the products with B1modb..B4modb.
C NOTE(review): presumably the head of the main loop -- confirm at the branch
C that targets it.
151 L(top): ldq r21, 8(ap)
163 umulh r21, B1modb, r12
171 umulh r22, B2modb, r13
175 mulq r23, B3modb, r10
178 umulh r23, B3modb, r27
183 umulh rl, B4modb, r28
C L(ed2): low half of rh * B5modb goes to rl.
188 L(ed2): mulq rh, B5modb, rl
C L(ed3): low half of rh * B1modb goes to r8, then the quadword at offset 8
C from r19 is loaded into r24.
210 L(ed3): mulq rh, B1modb, r8
216 ldq r24, 8(r19) C cnt
231 mulq r9, r18, r21 C qh * b
C cmpult and cmpule are unsigned compares that leave 1 or 0 in r0.
233 cmpult r8, rl, r0 C rl > ql
237 cmpule r18, rl, r0 C rl >= b
253 PROLOGUE(mpn_mod_1s_4p_cps,gp)
277 jsr r26, mpn_invert_limb