dnl Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
3 dnl Copyright
2000-2003 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
dnl * the GNU General Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
31 include(`..
/config.m4
')
34 C Pentium4: 1.0 cycles/limb
37 C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C There might be a couple of cycles to save by using plain integer code for
C more small sizes. 2 limbs measures about 20 cycles, but 3 limbs jumps to
C about 46 (inclusive of some function call overheads).
dnl  Frame slots for the two incoming arguments of
dnl  mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size).
dnl  NOTE(review): the second defframe argument is presumably the byte offset
dnl  of the slot from the stack pointer at entry -- confirm against the
dnl  defframe definition in config.m4.
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC, 4)

dnl  re-use parameter space: the register save slots are aliases for the
dnl  argument slots rather than separate stack locations.
dnl  NOTE(review): this presumes each argument has been read before its slot
dnl  is overwritten by a register save -- confirm in the function body.
dnl
dnl  Each opening parenthesis must follow the macro name directly, and each
dnl  quoted body must close on the same line.  Otherwise m4 does not see a
dnl  call with arguments and the alias is never defined.
define(SAVE_EBX, `PARAM_SRC')
define(SAVE_ESI, `PARAM_SIZE')
54 PROLOGUE(mpn_mod_34lsub1)
67 shrl
$24, %eax C src
[0] high
69 andl
$0x00FFFFFF
, %ecx C src
[0] low
75 shrl
$16, %ecx C src
[1] low
78 andl
$0x00FFFF00
, %edx C src
[1] high
91 psrlq
$32, %mm7 C
0x00000000FFFFFFFF, low 32 bits
94 psrlq
$40, %mm6 C
0x0000000000FFFFFF, low 24 bits
99 C
ecx counter
, size-2 to
0, -1 or -2
100 C
edx src
, incrementing
108 C mm6
0x0000000000FFFFFF
109 C mm7
0x00000000FFFFFFFF
125 C
ecx is
-2, -1 or 0 representing
0, 1 or 2 more limbs
, respectively
128 js L
(combine
) C
0 more
133 jz L
(combine
) C
1 more
139 movq
%mm7
, %mm3 C
low halves
148 psrlq
$32, %mm0 C
high halves
152 paddq
%mm0
, %mm4 C fold
high halves to give
33 bits each
156 psllq
$8, %mm4 C combine at respective offsets
159 paddq
%mm5
, %mm3 C
0x000cxxxxxxxxxxxx
, 50 bits
161 pand
%mm3
, %mm6 C fold at
24 bits
167 ASSERT
(z
, C nothing left
in high dword