dnl  Intel Pentium mpn_copyi -- copy limb vector, incrementing.
dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl  Pull in the shared m4 configuration.  The path must sit on one line
dnl  inside the quotes; presumably the macros used further down (defframe,
dnl  FRAME_pushl, the C comment marker) are provided through this include --
dnl  confirm against config.m4.
include(`../config.m4')
C P5: 1.25 cycles/limb

C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Destination prefetching is done to avoid repeated write-throughs on lines
C not already in L1.
C
C At least one of the src or dst pointer needs to be incremented rather than
C using indexing, so that there's somewhere to put the loop control without
C an AGI.  Incrementing one and not two lets us keep loop overhead to 2
C cycles.  Making it the src pointer incremented avoids an AGI on the %ecx
C subtracts in the finishup code.
C
C The block of finishup code is almost as big as the main loop itself, which
C is unfortunate, but it's faster that way than with say rep movsl, by about
C 10 cycles for instance on P55.
C
C There's nothing to be gained from MMX on P55, since it can do only one
C movq load (or store) per cycle, so the throughput would be the same as the
C code here (and even then only if src and dst have the same alignment mod
C 8).
C Names for the three stack arguments.  The offsets 4, 8 and 12 follow the
C argument order of the prototype (dst, src, size), one 4-byte slot each.
C defframe itself is defined outside this file; presumably it binds each name
C to an operand at that byte offset in the frame -- confirm against its
C definition.  The opening parenthesis must directly follow the macro name,
C otherwise m4 does not treat what follows as the argument list.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)
C Excerpt from the body of the limb-copy routine.
C
C NOTE(review): every line below starts with a stray decimal number, and
C several things the visible instructions depend on are not shown here: the
C loads that set up %ecx, %edx and %esi, every load into %ebx, any update of
C %esi or of %ecx inside the loop, all labels and branches, and the
C register restores / return.  Treat this as a reading aid, not as an
C assemblable routine; restore the missing lines before building.
C
C Entry: push %ebx and %esi so they can be used below (%ebx as the second
C data register, %esi as the source pointer per the register notes further
C down).  FRAME_pushl() is a macro defined elsewhere; presumably it accounts
C for the 4 bytes each push adds to the frame -- confirm.
69 pushl %ebx FRAME_pushl()
70 pushl %esi FRAME_pushl()
C Turn %edx into a fixed base and %ecx into a negative limb index:
C   %edx += 4 * %ecx
C   %ecx  = ~%ecx      (xorl with -1 is a bitwise NOT, i.e. -size-1)
C   %ecx += 8          (giving -size+7)
C NOTE(review): the "dst[0]" remark on the -28 load below only holds if
C %edx == dst + 4*size at that point, because
C   -28 + 4*(-size+7) == -4*size.
C That is &dst[size], one limb past the "&dst[size-1]" written on the leal.
C One of the two comments is off by one limb -- confirm against the lines
C that load %edx and %ecx.
72 leal (%edx,%ecx,4), %edx C &dst[size-1]
73 xorl $-1, %ecx C -size-1
76 addl $8, %ecx C -size+7
C Read from the destination purely to touch its cache line; in the visible
C code %eax is overwritten by the next load before it is ever stored.
80 movl -28(%edx,%ecx,4), %eax C fetch destination cache line, dst[0]
C Main loop register roles (only the entries visible in this excerpt).
86 C ecx counter, limbs, negative
88 C esi src, incrementing
C Touch the destination line at (%edx,%ecx,4) ahead of the stores below;
C the value loaded is again discarded by the following load into %eax.
92 movl (%edx,%ecx,4), %eax C fetch destination cache line
C Copy pattern: two source words are held in %eax/%ebx and then stored to
C two adjacent destination slots.  Only the first %eax load is visible; the
C %ebx load and the later reloads are among the missing lines.
95 movl (%esi), %eax C read words pairwise
C Eight consecutive 4-byte stores, displacements -60 through -32, alternating
C %eax and %ebx: one group of eight limbs per pass.
97 movl %eax, -60(%edx,%ecx,4) C store words pairwise
98 movl %ebx, -56(%edx,%ecx,4)
102 movl %eax, -52(%edx,%ecx,4)
103 movl %ebx, -48(%edx,%ecx,4)
107 movl %eax, -44(%edx,%ecx,4)
108 movl %ebx, -40(%edx,%ecx,4)
112 movl %eax, -36(%edx,%ecx,4)
113 movl %ebx, -32(%edx,%ecx,4)
C Finish-up code for the limbs left over after the main loop.  Register
C roles on entry to this part, as recorded by the original comments:
120 C ecx 0 to 7, representing respectively 7 to 0 limbs remaining
122 C edx dst, next location to store
C Pairwise stores for the leftover limbs.  The tests, branches and %ecx
C adjustments that select between these groups are not visible, so the
C repeated -4/0 displacements below refer to different %ecx values.
129 movl %eax, -12(%edx,%ecx,4)
130 movl %ebx, -8(%edx,%ecx,4)
134 movl %eax, -4(%edx,%ecx,4)
135 movl %ebx, (%edx,%ecx,4)
146 movl %eax, -4(%edx,%ecx,4)
147 movl %ebx, (%edx,%ecx,4)
C Single trailing store; the original author flags a possible cache bank
C conflict at this point (presumably with an adjacent load that is not shown).
156 movl %eax, -4(%edx,%ecx,4) C risk of cache bank clash here