dnl  AMD K6 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
dnl  Copyright 2000-2002 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl  Load the m4 definitions from ../config.m4.  The macro call and its quoted
dnl  filename must sit on a single line: a newline inside the quotes would
dnl  become part of the filename.
include(`../config.m4')
C K6: 2.66 cycles/limb
C
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C An attempt was made to use a loop in leal/loop form (the example listing is
C not reproduced here), with %ecx starting from floor(size/3), but it still
C measured 2.66 c/l.  The form used instead can save about 6 cycles by not
C dividing by 3.
C In the code used, putting the "leal"s at the top of the loop is necessary
C for the claimed speed, anywhere else costs an extra cycle per loop.
C Perhaps a tight loop like this needs short decode instructions at the
C branch target, which would explain the leal/loop form above taking 8
C cycles instead of 7 too.
dnl  Frame offsets of the two incoming arguments of
dnl  mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size): src is declared at
dnl  offset 4 and size at offset 8.
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC,  4)

dnl  Re-use parameter space: SAVE_EBX expands to PARAM_SIZE and SAVE_ESI
dnl  expands to PARAM_SRC, so the argument slots double as save areas
dnl  (presumably for %ebx and %esi, going by the names -- the stores
dnl  themselves are not visible in this part of the file; confirm).
dnl  Each macro name must be followed immediately by its open parenthesis,
dnl  otherwise m4 does not treat it as a macro call.
define(SAVE_EBX, `PARAM_SIZE')
define(SAVE_ESI, `PARAM_SRC')
66 PROLOGUE(mpn_mod_34lsub1)
75 Zdisp
( movl
, 0,(%edx), %eax) C avoid code cache line boundary
81 shrl
$24, %eax C src
[0] high
82 andl
$0x00FFFFFF
, %ecx C src
[0] low
88 andl
$0x00FFFF00
, %edx C src
[1] high
90 shrl
$16, %ecx C src
[1] low
109 pushl
%edi FRAME_pushl
()
112 xorl
%edi, %edi C
and clear carry flag
118 C
edx src
, incrementing
134 C
ecx is
-3, -2 or -1 representing
0, 1 or 2 more limbs
, respectively
139 js L
(combine
) C
0 more
141 Zdisp
( adcl
, 0,(%edx), %ebx) C avoid code cache line crossings
146 js L
(combine
) C
1 more
156 shll
%cl, %edx C carry
157 movl
%ebx, %eax C
0mod3
159 shrl
$24, %eax C
0mod3
high
160 andl
$0x00FFFFFF
, %ebx C
0mod3
low
162 subl
%edx, %eax C apply carry
163 movl
%esi, %ecx C
1mod3
165 shrl
$16, %esi C
1mod3
high
166 addl
%ebx, %eax C apply
0mod3
low
168 andl
$0x0000FFFF
, %ecx
169 addl
%esi, %eax C apply
1mod3
high
171 shll
$8, %ecx C
1mod3
low
172 movl
%edi, %edx C
2mod3
174 shrl
$8, %edx C
2mod3
high
175 addl
%ecx, %eax C apply
1mod3
low
177 addl
%edx, %eax C apply
2mod3
high
178 andl
$0x000000FF
, %edi
180 shll
$16, %edi C
2mod3
low
183 addl
%edi, %eax C apply
2mod3
low