dnl  AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
dnl  Copyright 2000-2002, 2004, 2005, 2008 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl  Load the shared m4 definitions from config.m4 in the parent directory.
include(`../config.m4')
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C The loop form below and the 64 byte code alignment seem necessary for the
C claimed speed.  This is a bit strange, since normally k7 isn't very
C sensitive to such things.  Perhaps there has to be 6 instructions in the
C first 16 bytes for the BTB entry or something.
dnl  Frame slots for the two arguments of the function.
dnl  NOTE(review): the numeric offsets are presumably relative to the stack
dnl  pointer at function entry -- confirm against the defframe definition.
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC,  4)

dnl  re-use parameter space: SAVE_EDI names the same slot as PARAM_SIZE
define(SAVE_EDI, `PARAM_SIZE')
54 PROLOGUE(mpn_mod_34lsub1)
68 shrl
$24, %eax C src
[0] low
70 andl
$0xFFFFFF
, %edx C src
[0] high
75 shrl
$16, %edx C src
[1] high
78 shll
$8, %ecx C src
[1] low
93 pushl
%ebx FRAME_pushl
()
98 pushl
%esi FRAME_pushl
()
99 xorl
%esi, %esi C
and clear carry flag
	C code offset 0x40 at this point
	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
136 movl
$0xFFFFFFFF
, %edi
141 movl
$0xFFFFFF00
, %edi
145 movl
$0xFFFF0000
, %edi
156 sbbl
%ecx, %ecx C carry
157 movl
%eax, %edx C
0mod3
158 shrl
$24, %eax C
0mod3
high
160 andl
%edi, %ecx C carry masked
161 andl
$0x00FFFFFF
, %edx C
0mod3
low
162 movl
%ebx, %edi C
1mod3
164 subl
%ecx, %eax C apply carry
165 shrl
$16, %ebx C
1mod3
high
168 addl
%edx, %eax C apply
0mod3
low
169 movl
%esi, %edx C
2mod3
170 shll
$8, %edi C
1mod3
low
172 addl
%ebx, %eax C apply
1mod3
high
173 shrl
$8, %esi C
2mod3
high
174 movzbl
%dl, %edx C
2mod3
low
176 addl
%edi, %eax C apply
1mod3
low
177 shll
$16, %edx C
2mod3
low
179 addl
%esi, %eax C apply
2mod3
high
180 popl
%esi FRAME_popl
()
183 addl
%edx, %eax C apply
2mod3
low
184 popl
%ebx FRAME_popl
()