1 dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
3 dnl Copyright 2000-2002, 2004, 2005, 2007, 2010-2012 Free Software Foundation,
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
21 dnl or both in parallel, as here.
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`..
/config.m4
')
49 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
52 C * Review feed-in and wind-down code. In particular, try to avoid adc and
53 C sbb to placate Pentium4.
54 C * It seems possible to reach 2.67 c/l by using a cleaner 6-way unrolling,
55 C without the dual loop exits.
63 PROLOGUE(mpn_mod_34lsub1)
66 mov $0x0000FFFFFFFFFFFF, %r11
77 shr $48, %rax C src[0] high
79 and %r11, %rdx C src[0] low
81 mov R32(%rsi), R32(%rdx)
83 shr $32, %rsi C src[1] high
86 shl $16, %rdx C src[1] low
94 L(gt2): xor R32(%rax), R32(%rax)
95 xor R32(%rcx), R32(%rcx)
96 xor R32(%rdx), R32(%rdx)
101 L(top): add (ap), %rax
124 L(end): add %r9, %rax
132 mov $0x10000, R32(%r10)
138 mov $0x100000000, %r10
141 sbb %rsi, %rsi C carry
142 mov %rax, %rdi C 0mod3
143 shr $48, %rax C 0mod3 high
145 and %r10, %rsi C carry masked
146 and %r11, %rdi C 0mod3 low
147 mov R32(%rcx), R32(%r10) C 1mod3
149 add %rsi, %rax C apply carry
150 shr $32, %rcx C 1mod3 high
152 add %rdi, %rax C apply 0mod3 low
153 movzwl %dx, R32(%rdi) C 2mod3
154 shl $16, %r10 C 1mod3 low
156 add %rcx, %rax C apply 1mod3 high
157 shr $16, %rdx C 2mod3 high
159 add %r10, %rax C apply 1mod3 low
160 shl $32, %rdi C 2mod3 low
162 add %rdx, %rax C apply 2mod3 high
163 add %rdi, %rax C apply 2mod3 low