beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium / com.asm
blobb0805452a62e50c6fd2c91dceb35f96a611738af
1 dnl Intel Pentium mpn_com -- mpn ones complement.
3 dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C P5: 1.75 cycles/limb
37 NAILS_SUPPORT(0-31)
40 C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
42 C This code is similar to mpn_copyi, basically there's just some "xorl
43 C $GMP_NUMB_MASK"s inserted.
45 C Alternatives:
47 C On P55 some MMX code could be 1.25 c/l (8 limb unrolled) if src and dst
48 C are the same alignment mod 8, but it doesn't seem worth the trouble for
49 C just that case (there'd need to be some plain integer available too for
50 C the unaligned case).
C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Store the complement of each of the size limbs at src to dst, low limb
C first.  Each limb is XORed with GMP_NUMB_MASK.
C
C In:       three stack arguments, at the offsets given by defframe below
C Out:      dst[0..size-1] written, no return value is set up
C Saved:    esi and edi are pushed on entry and popped on every exit path
C Scratch:  eax, ecx, edx and the flags
C
C Layout: an 8-limb unrolled main loop for size >= 8, then a tail which
C handles the remaining 0 to 7 limbs as an optional 4, an optional 2 and
C an optional 1.
C
C Argument offsets from esp at function entry (FRAME is 0 there).
C NOTE(review): FRAME_pushl presumably keeps the PARAM_ offsets valid after
C each pushl - confirm against x86-defs.m4.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(8)
PROLOGUE(mpn_com)
deflit(`FRAME',0)

	movl	PARAM_SRC, %eax
	movl	PARAM_SIZE, %ecx

	pushl	%esi	FRAME_pushl()
	pushl	%edi	FRAME_pushl()

	leal	(%eax,%ecx,4), %eax	C &src[size]
	xorl	$-1, %ecx		C -size-1

	movl	PARAM_DST, %edx
	addl	$8, %ecx		C -size+7

	jns	L(end)			C size <= 7, tail code only

	movl	(%edx), %esi		C fetch destination cache line
	nop

L(top):
	C eax	&src[size]
	C ebx
	C ecx	counter, limbs, negative
	C edx	dst, incrementing
	C esi	scratch
	C edi	scratch
	C ebp
	C
	C At least 8 limbs remain on each entry here, so the read of
	C 28(%edx) below is still inside dst.

	movl	28(%edx), %esi		C destination prefetch
	addl	$32, %edx

	movl	-28(%eax,%ecx,4), %esi
	movl	-24(%eax,%ecx,4), %edi
	xorl	$GMP_NUMB_MASK, %esi
	xorl	$GMP_NUMB_MASK, %edi
	movl	%esi, -32(%edx)
	movl	%edi, -28(%edx)

	movl	-20(%eax,%ecx,4), %esi
	movl	-16(%eax,%ecx,4), %edi
	xorl	$GMP_NUMB_MASK, %esi
	xorl	$GMP_NUMB_MASK, %edi
	movl	%esi, -24(%edx)
	movl	%edi, -20(%edx)

	movl	-12(%eax,%ecx,4), %esi
	movl	-8(%eax,%ecx,4), %edi
	xorl	$GMP_NUMB_MASK, %esi
	xorl	$GMP_NUMB_MASK, %edi
	movl	%esi, -16(%edx)
	movl	%edi, -12(%edx)

	movl	-4(%eax,%ecx,4), %esi
	movl	(%eax,%ecx,4), %edi
	xorl	$GMP_NUMB_MASK, %esi
	xorl	$GMP_NUMB_MASK, %edi
	movl	%esi, -8(%edx)
	movl	%edi, -4(%edx)

	addl	$8, %ecx
	js	L(top)


L(end):
	C eax	&src[size]
	C ecx	0 to 7, representing respectively 7 to 0 limbs remaining
	C edx	dst, next location to store

	subl	$4, %ecx

	jns	L(no4)			C fewer than 4 limbs remain

	movl	-12(%eax,%ecx,4), %esi
	movl	-8(%eax,%ecx,4), %edi
	xorl	$GMP_NUMB_MASK, %esi
	xorl	$GMP_NUMB_MASK, %edi
	movl	%esi, (%edx)
	movl	%edi, 4(%edx)

	movl	-4(%eax,%ecx,4), %esi
	movl	(%eax,%ecx,4), %edi
	xorl	$GMP_NUMB_MASK, %esi
	xorl	$GMP_NUMB_MASK, %edi
	movl	%esi, 8(%edx)
	movl	%edi, 12(%edx)

	addl	$16, %edx
	addl	$4, %ecx
L(no4):
	C ecx	0 to 3, representing respectively 3 to 0 limbs remaining

	subl	$2, %ecx

	jns	L(no2)			C fewer than 2 limbs remain

	movl	-4(%eax,%ecx,4), %esi
	movl	(%eax,%ecx,4), %edi
	xorl	$GMP_NUMB_MASK, %esi
	xorl	$GMP_NUMB_MASK, %edi
	movl	%esi, (%edx)
	movl	%edi, 4(%edx)

	addl	$8, %edx
	addl	$2, %ecx
L(no2):
	C On both paths into here the last subl or addl left ecx at 0 or 1,
	C and ZF is set exactly when one limb is still to be done.  The popl
	C below does not alter the flags, so jnz still tests that result.

	popl	%edi
	jnz	L(done)			C nothing left

	movl	-4(%eax), %ecx		C src[size-1]

	xorl	$GMP_NUMB_MASK, %ecx
	popl	%esi

	movl	%ecx, (%edx)
	ret				C must not fall into L(done), esi is
					C already popped on this path

L(done):
	popl	%esi

	ret

EPILOGUE()