beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / pentium4 / mod_34lsub1.asm
blobf34b3f079ab474c793d24b06138ebea92ead9f5d
1 dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
3 dnl Copyright 2000-2002, 2004, 2005, 2007, 2010-2012 Free Software Foundation,
4 dnl Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
35 C cycles/limb
36 C AMD K8,K9 1.0
37 C AMD K10 1.12
38 C Intel P4 3.25
39 C Intel core2 1.5
40 C Intel corei 1.5
41 C Intel atom 2.5
42 C VIA nano 1.75
45 C INPUT PARAMETERS
46 define(`ap', %rdi)
47 define(`n', %rsi)
49 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
51 C TODO
52 C * Review feed-in and wind-down code. In particular, try to avoid adc and
53 C sbb to placate Pentium4.
54 C * It seems possible to reach 2.67 c/l by using a cleaner 6-way unrolling,
55 C without the dual loop exits.
57 ABI_SUPPORT(DOS64)
58 ABI_SUPPORT(STD64)
60 ASM_START()
61 TEXT
62 ALIGN(32)
C mpn_mod_34lsub1 -- fold the n-limb operand at ap into one limb.
C
C Result (%rax): a value congruent to {ap,n} modulo 2^48-1.  The pieces are
C only added together, never reduced, so the result may exceed 2^48-1.
C
C Method: since 2^192 = (2^48)^4, limb i has weight 2^(64*(i mod 3)) modulo
C 2^48-1, i.e. 1, 2^16 or 2^32 (modulo 2^48-1) for i = 0, 1, 2 mod 3.
C Limbs are therefore summed into three accumulators by index mod 3, and the
C three sums are then split at 48-bit boundaries and added with the right
C shifts.
C
C Register roles:
C   %rdi (ap)  source pointer; reused as scratch in L(combine)
C   %rsi       limb count, then loop counter, then the final-carry term
C   %r11       constant 2^48-1
C   %rax       sum of limbs with index 0 mod 3   (weight 1)
C   %rcx       sum of limbs with index 1 mod 3   (weight 2^64)
C   %rdx       sum of limbs with index 2 mod 3   (weight 2^128)
C   %r10       carries out of %rax; later the weight of the final carry
C   %r8        carries out of %rcx
C   %r9        carries out of %rdx
C
C n = 0 is not handled: the first limb is loaded unconditionally.
C FUNC_ENTRY and FUNC_EXIT are defined outside this file (presumably the
C DOS64 argument/register glue -- confirm in the x86_64 m4 definitions).

PROLOGUE(mpn_mod_34lsub1)
	FUNC_ENTRY(2)

	mov	$0x0000FFFFFFFFFFFF, %r11	C 2^48-1

	sub	$2, %rsi		C flags: n > 2, n = 2 or n < 2
	ja	L(gt2)			C n >= 3: general code

	mov	(ap), %rax		C mov and nop leave the flags of sub intact
	nop				C padding; purpose not evident from this file
	jb	L(1)			C n = 1: return the single limb as is

	C n = 2: fold src[0] and src[1] directly.
	mov	8(ap), %rsi
	mov	%rax, %rdx
	shr	$48, %rax		C src[0] high 16 bits (weight 2^48 = 1)

	and	%r11, %rdx		C src[0] low 48 bits
	add	%rdx, %rax
	mov	R32(%rsi), R32(%rdx)	C src[1] low 32 bits, zero extended

	shr	$32, %rsi		C src[1] high 32 bits (weight 2^96 = 1)
	add	%rsi, %rax

	shl	$16, %rdx		C src[1] low 32 bits at weight 2^64 = 2^16
	add	%rdx, %rax

L(1):	FUNC_EXIT()
	ret


	ALIGN(16)
L(gt2):	xor	R32(%rax), R32(%rax)	C clear the three sums ...
	xor	R32(%rcx), R32(%rcx)
	xor	R32(%rdx), R32(%rdx)
	xor	%r8, %r8		C ... and the three carry counters
	xor	%r9, %r9
	xor	%r10, %r10

	C Main loop, three limbs per half, two exits.  The first half reads at
	C offsets 0..16 and leaves ap alone; the second half reads at 24..40
	C and advances ap by 48.
L(top):	add	(ap), %rax
	adc	$0, %r10
	add	8(ap), %rcx
	adc	$0, %r8
	add	16(ap), %rdx
	adc	$0, %r9

	sub	$3, %rsi
	jng	L(end)			C at most 2 limbs left, next one at 24(ap)

	add	24(ap), %rax
	adc	$0, %r10
	add	32(ap), %rcx
	adc	$0, %r8
	add	40(ap), %rdx
	lea	48(ap), ap		C lea keeps the carry for the adc below
	adc	$0, %r9

	sub	$3, %rsi
	jg	L(top)


	add	$-24, ap		C make 24(ap) the next unread limb, as at L(end)

	C Here %rsi = (limbs still unread) - 2, i.e. -2, -1 or 0.
	C Feed the saved carries into the next accumulator up; carries out of
	C %rdx have weight 2^192 = 1 and so wrap around into %rax.
L(end):	add	%r9, %rax
	adc	%r10, %rcx
	adc	%r8, %rdx

	C inc, dec and mov do not write CF, so the carry chain runs on through
	C the optional last one or two limbs.  %r10 tracks the weight of
	C whichever carry ends up being the final one.
	inc	%rsi
	mov	$0x1, R32(%r10)		C carry out of %rdx: weight 2^192 = 1
	js	L(combine)		C no limbs left

	mov	$0x10000, R32(%r10)	C carry out of %rax: weight 2^64 = 2^16
	adc	24(ap), %rax
	dec	%rsi
	js	L(combine)		C that was the last limb

	adc	32(ap), %rcx
	mov	$0x100000000, %r10	C carry out of %rcx: weight 2^128 = 2^32

L(combine):
	sbb	%rsi, %rsi		C all ones if the final carry is set, else 0
	mov	%rax, %rdi		C 0mod3 sum
	shr	$48, %rax		C 0mod3 high 16 bits

	and	%r10, %rsi		C final carry at its weight, or 0
	and	%r11, %rdi		C 0mod3 low 48 bits
	mov	R32(%rcx), R32(%r10)	C 1mod3 low 32 bits

	add	%rsi, %rax		C apply carry
	shr	$32, %rcx		C 1mod3 high 32 bits (weight 2^96 = 1)

	add	%rdi, %rax		C apply 0mod3 low
	movzwl	%dx, R32(%rdi)		C 2mod3 low 16 bits
	shl	$16, %r10		C 1mod3 low at weight 2^16

	add	%rcx, %rax		C apply 1mod3 high
	shr	$16, %rdx		C 2mod3 high 48 bits (weight 2^144 = 1)

	add	%r10, %rax		C apply 1mod3 low
	shl	$32, %rdi		C 2mod3 low at weight 2^32

	add	%rdx, %rax		C apply 2mod3 high
	add	%rdi, %rax		C apply 2mod3 low

	FUNC_EXIT()
	ret				C was missing: this path fell off the function end
EPILOGUE()