dnl  AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
dnl  Copyright 1999-2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
32 include(`..
/config.m4
')
C           alignment dst/src1/src2, A=0mod8, N=4mod8
C      A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
C
C K6-2  1.2   1.5   1.5   1.2   1.2   1.5   1.5   1.2   and,andn,ior,xor
C K6-2  1.5   1.75  2.0   1.75  1.75  2.0   1.75  1.5   iorn,xnor
C K6-2  1.75  2.0   2.0   2.0   2.0   2.0   2.0   1.75  nand,nior
C
C K6    1.5   1.68  1.75  1.2   1.75  1.75  1.68  1.5   and,andn,ior,xor
C K6    2.0   2.0   2.25  2.25  2.25  2.25  2.0   2.0   iorn,xnor
C K6    2.0   2.25  2.25  2.25  2.25  2.25  2.25  2.0   nand,nior
49 dnl M4_p and M4_i are the MMX and integer instructions
50 dnl M4_*_neg_dst means whether to negate the final result before writing
51 dnl M4_*_neg_src2 means whether to negate the src2 values before using them
dnl  Body of the operation selector.  When the symbol OPERATION_<name> is
dnl  defined (<name> being the first argument) it records the entry point
dnl  name mpn_<name>, the operation name, and the four negate flags taken
dnl  from arguments 3, 4, 6 and 7.
dnl  NOTE(review): the enclosing macro header, whatever consumes arguments
dnl  2 and 5, and the closing quotes are not visible in this view -- confirm
dnl  against the complete file before relying on this description.
55 `ifdef(`OPERATION_$1',`
56 define
(`M4_function
', `mpn_$1')
57 define
(`M4_operation
', `$1')
59 define
(`M4_p_neg_dst
', `$3')
60 define
(`M4_p_neg_src2
',`$4')
62 define
(`M4_i_neg_dst
', `$6')
63 define
(`M4_i_neg_src2
',`$7')
dnl  xnor is done in "iorn" style because it's a touch faster than "nior"
dnl  style (the two are equivalent for xor).
dnl
dnl  pandn can't be used with nails.
dnl  One selector call per supported operation.  Argument order:
dnl    1  operation name (matched against OPERATION_<name>)
dnl    2  MMX instruction        5  integer instruction
dnl    3  MMX negate-dst flag    6  integer negate-dst flag
dnl    4  MMX negate-src2 flag   7  integer negate-src2 flag
dnl  (Arguments 2 and 5 are apparently what M4_p and M4_i expand to -- the
dnl  defines that bind them are not visible here, confirm.)
dnl
dnl  Each call must keep its opening parenthesis directly after the macro
dnl  name and each argument must end on the same line as its comma: m4 only
dnl  collects arguments when the parenthesis is adjacent, and it strips
dnl  leading but not trailing whitespace from arguments.
dnl
dnl  andn_n uses pandn only when there are no nail bits, otherwise it falls
dnl  back to pand with src2 negated explicitly.
M4_choose_op( and_n,  pand,0,0,  andl,0,0)
ifelse(GMP_NAIL_BITS,0,
`M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
`M4_choose_op(andn_n, pand,0,1,  andl,0,1)')
M4_choose_op( nand_n, pand,1,0,  andl,1,0)
M4_choose_op( ior_n,  por,0,0,   orl,0,0)
M4_choose_op( iorn_n, por,0,1,   orl,0,1)
M4_choose_op( nior_n, por,1,0,   orl,1,0)
M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)
M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)
83 `m4_error(`Unrecognised or undefined OPERATION symbol
C The eight names listed on the next line are the same eight operations
C offered to the operation selector earlier in this file (and_n, andn_n,
C nand_n, ior_n, iorn_n, nior_n, xor_n, xnor_n), each with the mpn_ prefix.
C NOTE(review): the macro that consumes this list is defined outside this
C file -- confirm how it is used.
86 MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
89 C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
92 C Do src1,size M4_operation src2,size, storing the result in dst,size.
94 C Unaligned movq loads and stores are a bit slower than aligned ones. The
95 C test at the start of the routine checks the alignment of src1 and if
96 C necessary processes one limb separately at the low end to make it aligned.
98 C The raw speeds without this alignment switch are as follows.
100 C alignment dst/src1/src2, A=0mod8, N=4mod8
101 C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
103 C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor
104 C K6 1.75 2.2 2.0 2.28 iorn,xnor
105 C K6 2.0 2.25 2.35 2.28 nand,nior
C K6 can do one 64-bit load per cycle so each of these routines should be
C able to approach 1.0 c/l, if aligned.  The basic and/andn/ior/xor might be
C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
C The others are 4 instructions per 2 limbs, and so can only approach 1.0
C because there's nowhere to hide some loop control.
C Frame offsets of the four incoming parameters: dst at 4, src1 at 8,
C src2 at 12 and size at 16.  Each declaration is kept on a single line
C with the parenthesis directly after the macro name so that m4 actually
C passes the arguments.
defframe(PARAM_SIZE,16)
defframe(PARAM_SRC2,12)
defframe(PARAM_SRC1,8)
defframe(PARAM_DST, 4)
124 PROLOGUE(M4_function)
125 movl PARAM_SIZE, %ecx
126 pushl %ebx FRAME_pushl()
128 movl PARAM_SRC1, %eax
130 movl PARAM_SRC2, %ebx
139 ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)')
141 ifelse
(M4_i_neg_dst
,1,` notl_or_xorl_GMP_NUMB_MASK
( %ecx)')
156 pushl %esi FRAME_pushl()
162 ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %esi)')
165 ifelse
(M4_i_neg_dst
,1,` notl_or_xorl_GMP_NUMB_MASK
( %esi)')
173 jnz L(still_two_or_more)
177 ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)')
179 ifelse
(M4_i_neg_dst
,1,` notl_or_xorl_GMP_NUMB_MASK
( %ecx)')
185 L(still_two_or_more):
186 ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
187 pcmpeqd %mm7, %mm7 C all ones
188 ifelse(GMP_NAIL_BITS,0,,`psrld $GMP_NAIL_BITS, %mm7') C clear nails
201 C carry bit is low of size
203 movq -8(%ebx,%ecx,8), %mm0
204 ifelse(M4_p_neg_src2,1,`pxor %mm7, %mm0')
205 M4_p
-8(%eax,%ecx,8), %mm0
206 ifelse
(M4_p_neg_dst
,1,` pxor
%mm7
, %mm0
')
207 movq %mm0, -8(%edx,%ecx,8)
214 movl -4(%ebx,%esi,4), %ebx
215 ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ebx)')
216 M4_i
-4(%eax,%esi,4), %ebx
217 ifelse
(M4_i_neg_dst
,1,` notl_or_xorl_GMP_NUMB_MASK
( %ebx)')
218 movl %ebx, -4(%edx,%esi,4)