beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium / copyi.asm
blobd983d6b46ec0f00abf0e3d7dcbfe035c0cf382f7
1 dnl Intel Pentium mpn_copyi -- copy limb vector, incrementing.
3 dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C P5: 1.25 cycles/limb
37 C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
39 C Destination prefetching is done to avoid repeated write-throughs on lines
40 C not already in L1.
42 C At least one of the src or dst pointer needs to be incremented rather than
43 C using indexing, so that there's somewhere to put the loop control without
44 C an AGI. Incrementing one and not two lets us keep loop overhead to 2
45 C cycles. Making it the src pointer incremented avoids an AGI on the %ecx
46 C subtracts in the finishup code.
48 C The block of finishup code is almost as big as the main loop itself, which
49 C is unfortunate, but it's faster that way than with say rep movsl, by about
50 C 10 cycles for instance on P55.
52 C There's nothing to be gained from MMX on P55, since it can do only one
53 C movq load (or store) per cycle, so the throughput would be the same as the
54 C code here (and even then only if src and dst have the same alignment mod
55 C 8).
C Stack arguments of mpn_copyi (dst, src, size), as offsets from the stack
C pointer at function entry (return address at offset 0).
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(8)
PROLOGUE(mpn_copyi)
deflit(`FRAME',0)

C Copy size limbs from src to dst, working from low to high addresses.
C
C Strategy: %edx is set once to the end of dst and never changes, %ecx is a
C negative limb index running up towards zero that both addresses dst and
C serves as the loop counter, and %esi walks through src.  The main loop
C moves 8 limbs per pass, then the finishup code moves the remaining 0 to 7
C limbs as optional groups of 4, 2 and 1.
C
C %ebx and %esi are saved and restored here; %eax, %ecx and %edx are
C overwritten.

	movl	PARAM_SIZE, %ecx
	movl	PARAM_DST, %edx

	pushl	%ebx	FRAME_pushl()
	pushl	%esi	FRAME_pushl()

	leal	(%edx,%ecx,4), %edx	C &dst[size], one past the last limb
	xorl	$-1, %ecx		C -size-1

	movl	PARAM_SRC, %esi
	addl	$8, %ecx		C -size+7

	jns	L(end)			C size <= 7, finishup code only

	movl	-28(%edx,%ecx,4), %eax	C fetch destination cache line, dst[0]
	nop

L(top):
	C eax	scratch
	C ebx	scratch
	C ecx	counter, limbs, negative
	C edx	&dst[size]
	C esi	src, incrementing
	C edi
	C ebp
	C
	C On entry %ecx is 7 more than the negated count of limbs still to be
	C copied.  The flags set by the addl on %ecx are what the js at the
	C bottom tests: the movl and leal instructions in between leave the
	C flags untouched.

	movl	(%edx,%ecx,4), %eax	C fetch destination cache line
	addl	$8, %ecx

	movl	(%esi), %eax		C read words pairwise
	movl	4(%esi), %ebx
	movl	%eax, -60(%edx,%ecx,4)	C store words pairwise
	movl	%ebx, -56(%edx,%ecx,4)

	movl	8(%esi), %eax
	movl	12(%esi), %ebx
	movl	%eax, -52(%edx,%ecx,4)
	movl	%ebx, -48(%edx,%ecx,4)

	movl	16(%esi), %eax
	movl	20(%esi), %ebx
	movl	%eax, -44(%edx,%ecx,4)
	movl	%ebx, -40(%edx,%ecx,4)

	movl	24(%esi), %eax
	movl	28(%esi), %ebx
	movl	%eax, -36(%edx,%ecx,4)
	movl	%ebx, -32(%edx,%ecx,4)

	leal	32(%esi), %esi		C advance src, flags preserved
	js	L(top)			C loop while 8 or more limbs remain


L(end):
	C ecx	0 to 7, representing respectively 7 to 0 limbs remaining
	C esi	src, next limb to read
	C edx	&dst[size], stores are addressed backwards from here via ecx

	subl	$4, %ecx
	jns	L(no4)			C fewer than 4 limbs remaining

	movl	(%esi), %eax
	movl	4(%esi), %ebx
	movl	%eax, -12(%edx,%ecx,4)
	movl	%ebx, -8(%edx,%ecx,4)

	movl	8(%esi), %eax
	movl	12(%esi), %ebx
	movl	%eax, -4(%edx,%ecx,4)
	movl	%ebx, (%edx,%ecx,4)

	addl	$16, %esi
	addl	$4, %ecx
L(no4):
	C ecx	0 to 3, representing respectively 3 to 0 limbs remaining

	subl	$2, %ecx
	jns	L(no2)			C fewer than 2 limbs remaining

	movl	(%esi), %eax
	movl	4(%esi), %ebx
	movl	%eax, -4(%edx,%ecx,4)
	movl	%ebx, (%edx,%ecx,4)

	addl	$8, %esi
	addl	$2, %ecx
L(no2):
	C ecx	0 or 1, representing respectively 1 or 0 limbs remaining
	C The zero flag comes from whichever of the subl $2 (branch taken) or
	C addl $2 (fall through) on %ecx was executed last.

	jnz	L(done)

	movl	(%esi), %eax
	movl	%eax, -4(%edx,%ecx,4)	C risk of cache bank clash here

L(done):
	popl	%esi
	popl	%ebx

	ret				C explicit return to the caller

EPILOGUE()