source/libs/gmp/gmp-src/mpn/x86/k6/mmx/logops_n.asm

   1 dnl  AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
   2 dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
   3
   4 dnl  Copyright 1999-2002 Free Software Foundation, Inc.
   5
   6 dnl  This file is part of the GNU MP Library.
   7 dnl
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   9 dnl  it under the terms of either:
  10 dnl
  11 dnl    * the GNU Lesser General Public License as published by the Free
  12 dnl      Software Foundation; either version 3 of the License, or (at your
  13 dnl      option) any later version.
  14 dnl
  15 dnl  or
  16 dnl
  17 dnl    * the GNU General Public License as published by the Free Software
  18 dnl      Foundation; either version 2 of the License, or (at your option) any
  19 dnl      later version.
  20 dnl
  21 dnl  or both in parallel, as here.
  22 dnl
  23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  26 dnl  for more details.
  27 dnl
  28 dnl  You should have received copies of the GNU General Public License and the
  29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  30 dnl  see https://www.gnu.org/licenses/.
  31
  32 include(`../config.m4')
  33
     dnl  NOTE(review): NAILS_SUPPORT is defined outside this file.  Presumably
     dnl  it declares the range of GMP_NAIL_BITS values this file may be built
     dnl  with -- confirm against its definition.  The code in this file does
     dnl  test GMP_NAIL_BITS when choosing the MMX instruction for andn_n and
     dnl  when building the MMX complement mask.
  34 NAILS_SUPPORT(0-31)
  35
  36
  37 C         alignment dst/src1/src2, A=0mod8, N=4mod8
  38 C      A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
  39 C
  40 C K6-2  1.2   1.5   1.5   1.2   1.2   1.5   1.5   1.2   and,andn,ior,xor
  41 C K6-2  1.5   1.75  2.0   1.75  1.75  2.0   1.75  1.5   iorn,xnor
  42 C K6-2  1.75  2.0   2.0   2.0   2.0   2.0   2.0   1.75  nand,nior
  43 C
  44 C K6    1.5   1.68  1.75  1.2   1.75  1.75  1.68  1.5   and,andn,ior,xor
  45 C K6    2.0   2.0   2.25  2.25  2.25  2.25  2.0   2.0   iorn,xnor
  46 C K6    2.0   2.25  2.25  2.25  2.25  2.25  2.25  2.0   nand,nior
  47
  48
  49 dnl  M4_p and M4_i are the MMX and integer instructions
  50 dnl  M4_*_neg_dst means whether to negate the final result before writing
  51 dnl  M4_*_neg_src2 means whether to negate the src2 values before using them
     dnl
     dnl  Usage: M4_choose_op(name, p,p_neg_dst,p_neg_src2, i,i_neg_dst,i_neg_src2)
     dnl
     dnl  name        operation suffix such as and_n; the macro only takes
     dnl              effect when the symbol OPERATION_name is defined
     dnl  p           MMX instruction used in the two-limb loop (M4_p)
     dnl  p_neg_dst   1 to complement the MMX result before storing it
     dnl  p_neg_src2  1 to complement the src2 limbs before the MMX operation
     dnl  i           integer instruction used for single limbs (M4_i)
     dnl  i_neg_dst   1 to complement the integer result before storing it
     dnl  i_neg_src2  1 to complement the src2 limb before the integer operation
     dnl
     dnl  When OPERATION_name is defined the macro defines M4_function as
     dnl  mpn_name, M4_operation as name, and the six M4_p* / M4_i* symbols
     dnl  from the remaining arguments.  When it is not defined the macro
     dnl  expands to nothing, so of a list of calls only the matching one
     dnl  has any effect.
     dnl
     dnl  NOTE(review): m4_assert_numargs is defined outside this file;
     dnl  presumably it rejects calls that do not pass exactly 7 arguments.
  52
  53 define(M4_choose_op,
  54 m4_assert_numargs(7)
  55 `ifdef(`OPERATION_$1',`
  56 define(`M4_function',  `mpn_$1')
  57 define(`M4_operation', `$1')
  58 define(`M4_p',         `$2')
  59 define(`M4_p_neg_dst', `$3')
  60 define(`M4_p_neg_src2',`$4')
  61 define(`M4_i',         `$5')
  62 define(`M4_i_neg_dst', `$6')
  63 define(`M4_i_neg_src2',`$7')
  64 ')')
  65
  66 dnl  xnor is done in "iorn" style because it's a touch faster than "nior"
  67 dnl  style (the two are equivalent for xor).
  68 dnl
  69 dnl  pandn can't be used with nails.
     dnl
     dnl  Operation table.  Each M4_choose_op line is one candidate; only the
     dnl  line whose OPERATION_ symbol is defined sets M4_function and the
     dnl  instruction/negation symbols.  Columns are:
     dnl
     dnl      name,  MMX insn, neg_dst, neg_src2,  integer insn, neg_dst, neg_src2
     dnl
     dnl  and, ior, xor   plain operation, no complementing
     dnl  andn            src2 is complemented.  Without nails the MMX side
     dnl                  uses pandn, which complements its destination
     dnl                  register (loaded from src2 in the loop) by itself,
     dnl                  so no explicit negation is requested.  With nails
     dnl                  it uses pand plus an explicit src2 complement.
     dnl  iorn, xnor      src2 is complemented, then por / pxor is applied
     dnl  nand, nior      pand / por is applied, then the result is complemented
  70
  71 M4_choose_op( and_n,  pand,0,0,  andl,0,0)
  72 ifelse(GMP_NAIL_BITS,0,
  73 `M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
  74 `M4_choose_op(andn_n, pand,0,1,  andl,0,1)')
  75 M4_choose_op( nand_n, pand,1,0,  andl,1,0)
  76 M4_choose_op( ior_n,  por,0,0,   orl,0,0)
  77 M4_choose_op( iorn_n, por,0,1,   orl,0,1)
  78 M4_choose_op( nior_n, por,1,0,   orl,1,0)
  79 M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)
  80 M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)
  81
     dnl  M4_function is only defined when one of the lines above matched a
     dnl  defined OPERATION_ symbol.  If none did, stop with an error rather
     dnl  than assemble a routine with undefined instruction macros.
  82 ifdef(`M4_function',,
  83 `m4_error(`Unrecognised or undefined OPERATION symbol
  84 ')')
  85
     dnl  NOTE(review): MULFUNC_PROLOGUE is defined outside this file.  It is
     dnl  given the names of all eight functions this source can produce;
     dnl  presumably the build system reads this list -- confirm.
  86 MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
  87
  88
  89 C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
  90 C                   mp_size_t size);
  91 C
  92 C Do src1,size M4_operation src2,size, storing the result in dst,size.
  93 C
  94 C Unaligned movq loads and stores are a bit slower than aligned ones.  The
  95 C test at the start of the routine checks the alignment of src1 and if
  96 C necessary processes one limb separately at the low end to make it aligned.
  97 C
  98 C The raw speeds without this alignment switch are as follows.
  99 C
 100 C           alignment dst/src1/src2, A=0mod8, N=4mod8
 101 C     A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
 102 C
 103 C K6                 1.5    2.0                 1.5    2.0    and,andn,ior,xor
 104 C K6                 1.75   2.2                 2.0    2.28   iorn,xnor
 105 C K6                 2.0    2.25                2.35   2.28   nand,nior
 106 C
 107 C
 108 C Future:
 109 C
 110 C K6 can do one 64-bit load per cycle so each of these routines should be
 111 C able to approach 1.0 c/l, if aligned.  The basic and/andn/ior/xor might be
 112 C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
 113 C The others are 4 instructions per 2 limbs, and so can only approach 1.0
 114 C because there's nowhere to hide some loop control.
 115
 116 defframe(PARAM_SIZE,16)
 117 defframe(PARAM_SRC2,12)
 118 defframe(PARAM_SRC1,8)
 119 defframe(PARAM_DST, 4)
 120 deflit(`FRAME',0)
 121
     C The defframe lines above name the four stack arguments in prototype
     C order: dst at 4, src1 at 8, src2 at 12, size at 16.
     C NOTE(review): defframe, deflit and FRAME_pushl are defined outside this
     C file.  Presumably the PARAM_ names expand to esp-relative operands
     C corrected by FRAME, and each FRAME_pushl() accounts for one push --
     C confirm against their definitions.
     C
     C Outline:
     C   1. size <= 1: one limb is handled with the integer instruction.
     C   2. If bit 2 of the src1 address is set, the lowest limb is handled
     C      with the integer instruction and all three pointers advance by
     C      one limb, so the MMX loop reads src1 with bit 2 clear.
     C   3. If only one limb is left after that, it is handled with the
     C      integer instruction and the routine returns without using MMX.
     C   4. Otherwise the MMX loop handles the remaining limbs two at a time,
     C      from the high pair down to the low pair.
     C   5. If the remaining count was odd, the top limb is handled with the
     C      integer instruction after the loop.
     C
     C Registers: eax, ecx and edx are used freely; ebx and esi are pushed
     C and popped around their use.  mm0 is the working MMX register and mm7
     C holds the complement mask when one is needed.
     C
     C NOTE(review): notl_or_xorl_GMP_NUMB_MASK and emms_or_femms are defined
     C outside this file.  From their names and use here, the former
     C presumably complements the numb bits of a limb register and the
     C latter presumably leaves MMX state before returning -- confirm.
 122         TEXT
 123         ALIGN(32)
 124 PROLOGUE(M4_function)
 125                         movl    PARAM_SIZE, %ecx
 126                         pushl   %ebx            FRAME_pushl()
 127
 128                         movl    PARAM_SRC1, %eax
 129
 130                         movl    PARAM_SRC2, %ebx
 131                         cmpl    $1, %ecx
 132
 133                         movl    PARAM_DST, %edx
                             C unsigned test: taken when size is 2 or more
 134                         ja      L(two_or_more)
 135
 136
                             C Single limb.  ebx is not needed once the src2
                             C limb is in ecx, so it is restored straight away.
                             C NOTE(review): a size of 0 also falls through to
                             C here and would still read and write one limb;
                             C presumably callers never pass 0 -- confirm.
 137                         movl    (%ebx), %ecx
 138                         popl    %ebx
 139 ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(     %ecx)')
 140                         M4_i    (%eax), %ecx
 141 ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK(     %ecx)')
 142                         movl    %ecx, (%edx)
 143
 144                         ret
 145
 146
 147 L(two_or_more):
 148                         C eax   src1
 149                         C ebx   src2
 150                         C ecx   size
 151                         C edx   dst
 152                         C esi
 153                         C edi
 154                         C ebp
 155
 156                         pushl   %esi            FRAME_pushl()
                             C bit 2 of src1 clear: no alignment step needed
 157                         testl   $4, %eax
 158                         jz      L(alignment_ok)
 159
                             C Alignment step: do the lowest limb with the
                             C integer instruction, using esi as scratch, then
                             C advance src2, src1 and dst by one limb and drop
                             C the count by one.
 160                         movl    (%ebx), %esi
 161                         addl    $4, %ebx
 162 ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(     %esi)')
 163                         M4_i    (%eax), %esi
 164                         addl    $4, %eax
 165 ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK(     %esi)')
 166                         movl    %esi, (%edx)
 167                         addl    $4, %edx
 168                         decl    %ecx
 169
 170 L(alignment_ok):
                             C esi keeps the remaining limb count.  The shift
                             C leaves the number of limb pairs in ecx and the
                             C low bit of the count in the carry flag.
 171                         movl    %ecx, %esi
 172                         shrl    %ecx
 173                         jnz     L(still_two_or_more)
 174
                             C Zero pairs.  The count was at least 2 on entry
                             C to two_or_more and at most one limb has been
                             C consumed, so exactly one limb remains.  No MMX
                             C instruction has run on this path.
 175                         movl    (%ebx), %ecx
 176                         popl    %esi
 177 ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(     %ecx)')
 178                         M4_i    (%eax), %ecx
 179 ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK(     %ecx)')
 180                         popl    %ebx
 181                         movl    %ecx, (%edx)
 182                         ret
 183
 184
 185 L(still_two_or_more):
                             C When either MMX negation is requested, mm7 is
                             C set to all ones and, for a nails build, each
                             C 32-bit half is shifted right by GMP_NAIL_BITS.
                             C The loop then complements with pxor against mm7.
 186 ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
 187                         pcmpeqd %mm7, %mm7              C all ones
 188 ifelse(GMP_NAIL_BITS,0,,`psrld  $GMP_NAIL_BITS, %mm7')  C clear nails
 189 ')
 190
 191                         ALIGN(16)
 192 L(top):
 193                         C eax   src1
 194                         C ebx   src2
 195                         C ecx   counter
 196                         C edx   dst
 197                         C esi
 198                         C edi
 199                         C ebp
 200                         C
 201                         C carry bit is low of size
                             C
                             C ecx counts limb pairs down to 1 and the operands
                             C are addressed at -8 + 8*ecx, so pairs are done
                             C from the highest down to offset 0.  The carry
                             C flag set by shrl above must survive to the jnc
                             C below; the MMX instructions and loop in between
                             C do not modify it.
 202
 203                         movq    -8(%ebx,%ecx,8), %mm0
 204 ifelse(M4_p_neg_src2,1,`pxor    %mm7, %mm0')
 205                         M4_p    -8(%eax,%ecx,8), %mm0
 206 ifelse(M4_p_neg_dst,1,` pxor    %mm7, %mm0')
 207                         movq    %mm0, -8(%edx,%ecx,8)
 208
 209                         loop    L(top)
 210
 211
                             C carry clear: the remaining count was even, so
                             C the loop has covered every limb
 212                         jnc     L(no_extra)
 213
                             C Odd count: the top limb, at index esi-1, is
                             C still to do.  ebx is overwritten with the src2
                             C limb since the src2 pointer is not used again.
 214                         movl    -4(%ebx,%esi,4), %ebx
 215 ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(     %ebx)')
 216                         M4_i    -4(%eax,%esi,4), %ebx
 217 ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK(     %ebx)')
 218                         movl    %ebx, -4(%edx,%esi,4)
 219 L(no_extra):
 220
                             C restore the saved registers in reverse push order
 221                         popl    %esi
 222                         popl    %ebx
 223                         emms_or_femms
 224                         ret
 225
 226 EPILOGUE()