dnl  Provenance: luatex.git, beta-0.89.2,
dnl  source/libs/gmp/gmp-src/mpn/x86/k6/mmx/logops_n.asm,
dnl  blob e17930bb2db4e1a01c58e4da8fb584285f442587.

dnl  AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

dnl  Pull in the build configuration macros, then declare the range of nail
dnl  bit counts this file is written for.

include(`../config.m4')

NAILS_SUPPORT(0-31)


C         alignment dst/src1/src2, A=0mod8, N=4mod8
C       A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
C
C K6-2   1.2    1.5    1.5    1.2    1.2    1.5    1.5    1.2   and,andn,ior,xor
C K6-2   1.5    1.75   2.0    1.75   1.75   2.0    1.75   1.5   iorn,xnor
C K6-2   1.75   2.0    2.0    2.0    2.0    2.0    2.0    1.75  nand,nior
C
C K6     1.5    1.68   1.75   1.2    1.75   1.75   1.68   1.5   and,andn,ior,xor
C K6     2.0    2.0    2.25   2.25   2.25   2.25   2.0    2.0   iorn,xnor
C K6     2.0    2.25   2.25   2.25   2.25   2.25   2.25   2.0   nand,nior


dnl  M4_p and M4_i are the MMX and integer instructions
dnl  M4_*_neg_dst means whether to negate the final result before writing
dnl  M4_*_neg_src2 means whether to negate the src2 values before using them
dnl
dnl  Usage: M4_choose_op(name, p,p_neg_dst,p_neg_src2, i,i_neg_dst,i_neg_src2)
dnl
dnl  When the symbol OPERATION_<name> is defined, define M4_function as
dnl  mpn_<name>, M4_operation as <name>, and the M4_p* and M4_i* settings
dnl  from the remaining six arguments.  When it is not defined the macro
dnl  expands to nothing, so at most the matching table entry takes effect.

define(M4_choose_op,
m4_assert_numargs(7)
`ifdef(`OPERATION_$1',`
define(`M4_function',  `mpn_$1')
define(`M4_operation', `$1')
define(`M4_p',          `$2')
define(`M4_p_neg_dst',  `$3')
define(`M4_p_neg_src2',`$4')
define(`M4_i',          `$5')
define(`M4_i_neg_dst',  `$6')
define(`M4_i_neg_src2',`$7')
')')

dnl  Operation table.  Each entry is
dnl
dnl      name, MMX insn,neg_dst,neg_src2, integer insn,neg_dst,neg_src2
dnl
dnl  xnor is done in "iorn" style because it's a touch faster than "nior"
dnl  style (the two are equivalent for xor).
dnl
dnl  pandn can't be used with nails, so with a non-zero GMP_NAIL_BITS andn_n
dnl  uses pand with src2 negated instead.

M4_choose_op( and_n,  pand,0,0,  andl,0,0)
ifelse(GMP_NAIL_BITS,0,
`M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
`M4_choose_op(andn_n, pand,0,1,  andl,0,1)')
M4_choose_op( nand_n, pand,1,0,  andl,1,0)
M4_choose_op( ior_n,  por,0,0,   orl,0,0)
M4_choose_op( iorn_n, por,0,1,   orl,0,1)
M4_choose_op( nior_n, por,1,0,   orl,1,0)
M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)
M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)

dnl  No table entry matched an OPERATION_* symbol if M4_function is still
dnl  undefined at this point; report that through m4_error.

ifdef(`M4_function',,
`m4_error(`Unrecognised or undefined OPERATION symbol
')')

MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)


C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                   mp_size_t size);
C
C Do src1,size M4_operation src2,size, storing the result in dst,size.
C
C Unaligned movq loads and stores are a bit slower than aligned ones.  The
C test at the start of the routine checks the alignment of src1 and if
C necessary processes one limb separately at the low end to make it aligned.
C
C The raw speeds without this alignment switch are as follows.
C
C alignment dst/src1/src2, A=0mod8, N=4mod8
C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
C
C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor
C K6 1.75 2.2 2.0 2.28 iorn,xnor
C K6 2.0 2.25 2.35 2.28 nand,nior
C
C NOTE(review): each row of this second table has four figures for eight
C column headings, so which columns they belong to is not known here.
C Confirm the layout against the upstream file before relying on it.
C
C
C Future:
C
C K6 can do one 64-bit load per cycle so each of these routines should be
C able to approach 1.0 c/l, if aligned.  The basic and/andn/ior/xor might be
C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
C The others are 4 instructions per 2 limbs, and so can only approach 1.0
C because there's nowhere to hide some loop control.
C
C
C Structure of the code below:
C
C   1. size not above 1: one limb with integer instructions, return.
C   2. src1 at 4mod8: one limb with integer instructions to align src1.
C   3. Less than two limbs left: one limb with integer instructions, return.
C   4. MMX loop, two limbs per iteration, from the high end downwards.
C   5. Odd limb count: the remaining high limb with integer instructions.
C
C ebx and esi are pushed on entry to the paths using them and popped again
C on every exit path.

defframe(PARAM_SIZE,16)
defframe(PARAM_SRC2,12)
defframe(PARAM_SRC1,8)
defframe(PARAM_DST, 4)
deflit(`FRAME',0)

	TEXT
	ALIGN(32)
PROLOGUE(M4_function)
	movl	PARAM_SIZE, %ecx
	pushl	%ebx		FRAME_pushl()

	movl	PARAM_SRC1, %eax

	movl	PARAM_SRC2, %ebx
	cmpl	$1, %ecx

	movl	PARAM_DST, %edx
	ja	L(two_or_more)


	C size not above 1, do a single limb and return

	movl	(%ebx), %ecx
	popl	%ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
	M4_i	(%eax), %ecx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
	movl	%ecx, (%edx)

	ret


L(two_or_more):
	C eax	src1
	C ebx	src2
	C ecx	size
	C edx	dst
	C esi
	C edi
	C ebp

	pushl	%esi		FRAME_pushl()
	testl	$4, %eax
	jz	L(alignment_ok)

	C src1 is 4mod8, do the low limb separately and step all three
	C pointers past it

	movl	(%ebx), %esi
	addl	$4, %ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%esi)')
	M4_i	(%eax), %esi
	addl	$4, %eax
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%esi)')
	movl	%esi, (%edx)
	addl	$4, %edx
	decl	%ecx

L(alignment_ok):
	C esi keeps the limb count, ecx becomes the count of limb pairs, and
	C the shift leaves the low bit of the limb count in the carry flag

	movl	%ecx, %esi
	shrl	%ecx
	jnz	L(still_two_or_more)

	C no complete pair left, do the single remaining limb and return

	movl	(%ebx), %ecx
	popl	%esi
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
	M4_i	(%eax), %ecx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
	popl	%ebx
	movl	%ecx, (%edx)
	ret


L(still_two_or_more):
	C mm7 is only set up for the operations which negate something

ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
	pcmpeqd	%mm7, %mm7		C all ones
ifelse(GMP_NAIL_BITS,0,,`psrld	$GMP_NAIL_BITS, %mm7')	C clear nails
')

	ALIGN(16)
L(top):
	C eax	src1
	C ebx	src2
	C ecx	counter
	C edx	dst
	C esi
	C edi
	C ebp
	C
	C carry bit is low of size
	C
	C Nothing in this loop writes the integer flags, the loop instruction
	C included, so the carry from the shrl above reaches the jnc below.

	movq	-8(%ebx,%ecx,8), %mm0
ifelse(M4_p_neg_src2,1,`pxor	%mm7, %mm0')
	M4_p	-8(%eax,%ecx,8), %mm0
ifelse(M4_p_neg_dst,1,`	pxor	%mm7, %mm0')
	movq	%mm0, -8(%edx,%ecx,8)

	loop	L(top)


	jnc	L(no_extra)

	C odd limb count, do the limb at index esi-1 which the pairs above
	C did not reach

	movl	-4(%ebx,%esi,4), %ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
	M4_i	-4(%eax,%esi,4), %ebx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
	movl	%ebx, -4(%edx,%esi,4)
L(no_extra):

	popl	%esi
	popl	%ebx
	emms_or_femms
	ret

EPILOGUE()