dnl  Origin: luatex.git (beta-0.89.2),
dnl  source/libs/gmp/gmp-src/mpn/x86/pentium/copyd.asm
dnl  blob 72a543b2a3726d7ab781d8d140603475f52dcb04
dnl  Intel Pentium mpn_copyd -- copy limb vector, decrementing.

dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C P5: 1.25 cycles/limb


C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Copy size limbs (32-bit words) from src to dst, starting at the highest
C address and working down to the lowest.
C
C Syntax is AT&T (source operand first), run through m4 with the macros
C from config.m4.
C
C Inputs:    read from the stack through the PARAM_* frame offsets declared
C            just below.
C Output:    none in a register (void in the prototype above).
C Clobbers:  eax, ecx, edx and the flags.  esi and edi are used as scratch
C            and are saved and restored with pushl/popl.  ebx and ebp are
C            not touched.
C
C Structure: a main loop moving 8 limbs per pass, followed by tails that
C move 4, 2 and 1 limbs.  A size of 0 takes every skip branch and stores
C nothing.
C
C See comments in copyi.asm.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(8)
PROLOGUE(mpn_copyd)
deflit(`FRAME',0)

	movl	PARAM_SRC, %eax
	movl	PARAM_SIZE, %ecx

	C Each push is followed by FRAME_pushl() so that the m4 frame
	C bookkeeping stays in step with the stack pointer and the PARAM_DST
	C load below still resolves correctly.
	pushl	%esi	FRAME_pushl()
	pushl	%edi	FRAME_pushl()

	leal	-4(%eax,%ecx,4), %eax		C &src[size-1]
	movl	PARAM_DST, %edx

	subl	$7, %ecx			C size-7
	jle	L(end)				C fewer than 8 limbs, tails only

	C The loaded value is discarded, the read is only there to touch
	C the cache line.
	movl	28-4(%edx,%ecx,4), %esi		C prefetch cache, dst[size-1]
	nop					C NOTE(review): purpose not stated
						C here, presumably scheduling or
						C alignment, see copyi.asm

L(top):
	C eax	src, decrementing
	C ebx
	C ecx	counter, limbs
	C edx	dst
	C esi	scratch
	C edi	scratch
	C ebp
	C
	C One pass moves src[0], src[-1] ... src[-7] relative to eax into
	C the 8 dst limbs ending at byte offset 56 from edx+4*ecx, where
	C ecx is the value after the subl below.

	movl	28-32(%edx,%ecx,4), %esi	C prefetch dst cache line
	subl	$8, %ecx			C sets the flags tested by jg below

	movl	(%eax), %esi			C read words pairwise
	movl	-4(%eax), %edi
	movl	%esi, 56(%edx,%ecx,4)		C store words pairwise
	movl	%edi, 52(%edx,%ecx,4)

	movl	-8(%eax), %esi
	movl	-12(%eax), %edi
	movl	%esi, 48(%edx,%ecx,4)
	movl	%edi, 44(%edx,%ecx,4)

	movl	-16(%eax), %esi
	movl	-20(%eax), %edi
	movl	%esi, 40(%edx,%ecx,4)
	movl	%edi, 36(%edx,%ecx,4)

	movl	-24(%eax), %esi
	movl	-28(%eax), %edi
	movl	%esi, 32(%edx,%ecx,4)
	movl	%edi, 28(%edx,%ecx,4)

	C leal rather than subl: neither it nor the movl instructions above
	C modify the flags, so jg still tests the result of subl $8, %ecx.
	leal	-32(%eax), %eax
	jg	L(top)


L(end):
	C ecx	-7 to 0, representing respectively 0 to 7 limbs remaining
	C eax	src end
	C edx	dst, next location to store

	C Tail of 4.  After the addl, ecx > 0 means 4 or more limbs remain.
	addl	$4, %ecx
	jle	L(no4)

	movl	(%eax), %esi
	movl	-4(%eax), %edi
	movl	%esi, 8(%edx,%ecx,4)
	movl	%edi, 4(%edx,%ecx,4)

	movl	-8(%eax), %esi
	movl	-12(%eax), %edi
	movl	%esi, (%edx,%ecx,4)
	movl	%edi, -4(%edx,%ecx,4)

	subl	$16, %eax
	subl	$4, %ecx			C same ecx meaning as on the skip path
L(no4):

	C Tail of 2.  After the addl, ecx > 0 means 2 or 3 limbs remain.
	addl	$2, %ecx
	jle	L(no2)

	movl	(%eax), %esi
	movl	-4(%eax), %edi
	movl	%esi, (%edx,%ecx,4)
	movl	%edi, -4(%edx,%ecx,4)

	subl	$8, %eax
	subl	$2, %ecx
L(no2):

	C Tail of 1.  The zero flag comes from addl $2 on the skip path or
	C from subl $2 on the fall-through path.  In both cases ecx == 0
	C means exactly one limb is left, and it belongs at dst[0].
	jnz	L(done)

	movl	(%eax), %ecx
	movl	%ecx, (%edx)	C risk of cache bank clash here

L(done):
	popl	%edi
	popl	%esi

	C Return to the caller.  Without this, execution would run past the
	C end of the function.
	ret

EPILOGUE()