source/libs/gmp/gmp-src/mpn/x86/k6/k62mmx/rshift.asm

   1 dnl  AMD K6-2 mpn_rshift -- mpn right shift.
   2
   3 dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C K6-2: 1.75 cycles/limb
  35
  36
  37 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
  38 C                       unsigned shift);
  39 C
  40 dnl  Byte offsets of the four arguments, presumably from %esp at entry.
  41 defframe(PARAM_SHIFT,16)
  42 defframe(PARAM_SIZE, 12)
  43 defframe(PARAM_SRC,  8)
  44 defframe(PARAM_DST,  4)
  45 deflit(`FRAME',0)
  46
  47 dnl  Minimum 9, because the unrolled loop can't handle less.
  48 dnl  Sizes from 2 up to one below this use the simple loop instead.
  49 deflit(UNROLL_THRESHOLD, 9)
  50
  51         TEXT
  52         ALIGN(32)
  53
  54 PROLOGUE(mpn_rshift)
  55 deflit(`FRAME',0)
  56         C {dst,size} = {src,size} >> shift; %eax returns the bits shifted out.
  57         C The 1 limb case can be done without the push %ebx, but it's then
  58         C still the same speed.  The push is left as a free helping hand for
  59         C the two_or_more code.
  60
  61         movl    PARAM_SIZE, %eax        C size
  62         pushl   %ebx                    FRAME_pushl()
  63
  64         movl    PARAM_SRC, %ebx
  65         decl    %eax                    C size-1, ZF set if size is 1
  66
  67         movl    PARAM_SHIFT, %ecx
  68         jnz     L(two_or_more)          C flags still from decl
  69
  70         movl    (%ebx), %edx            C src limb
  71         movl    PARAM_DST, %ebx
  72
  73         shrdl(  %cl, %edx, %eax)        C return value, eax was 0
  74
  75         shrl    %cl, %edx
  76
  77         movl    %edx, (%ebx)            C dst limb
  78         popl    %ebx
  79
  80         ret                             C no MMX used on this path, no femms needed
  81
  82
  83 C -----------------------------------------------------------------------------
  84         ALIGN(16)       C avoid offset 0x1f
  85 L(two_or_more):
  86         C eax   size-1
  87         C ebx   src
  88         C ecx   shift
  89         C edx
  90
  91         movl    (%ebx), %edx    C src low limb
  92         negl    %ecx
  93
  94         addl    $32, %ecx               C 32-shift; NOTE(review): assumes shift is 1 to 31
  95         movd    PARAM_SHIFT, %mm6
  96
  97         shll    %cl, %edx               C retval, bits shifted out of src[0]
  98         cmpl    $UNROLL_THRESHOLD-1, %eax
  99
 100         jae     L(unroll)               C size at or above the threshold
 101
 102
 103         C eax   size-1
 104         C ebx   src
 105         C ecx   32-shift
 106         C edx   retval
 107         C
 108         C mm6   shift
 109
 110         movl    PARAM_DST, %ecx
 111         leal    (%ebx,%eax,4), %ebx     C &src[size-1]
 112
 113         leal    -4(%ecx,%eax,4), %ecx   C &dst[size-2]
 114         negl    %eax                    C -(size-1)
 115
 116         C This loop runs at about 3 cycles/limb, which is the amount of
 117         C decoding, and this is despite every second access being unaligned.
 118
 119 L(simple):
 120         C eax   counter, -(size-1) to -1
 121         C ebx   &src[size-1]
 122         C ecx   &dst[size-2]
 123         C edx   retval
 124         C
 125         C mm0   scratch
 126         C mm6   shift
 127
 128 Zdisp(  movq,   0,(%ebx,%eax,4), %mm0)  C two adjacent src limbs
 129         incl    %eax
 130
 131         psrlq   %mm6, %mm0              C low dword is the finished dst limb
 132
 133 Zdisp(  movd,   %mm0, 0,(%ecx,%eax,4))  C store it
 134         jnz     L(simple)               C flags from incl, MMX ops leave them alone
 135
 136
 137         movq    %mm0, (%ecx)            C dst[size-2,size-1] from the last qword
 138         movl    %edx, %eax              C retval
 139
 140         popl    %ebx
 141
 142         femms                           C leave MMX state before returning
 143         ret
 144
 145
 146 C -----------------------------------------------------------------------------
 147         ALIGN(16)
 148 L(unroll):
 149         C eax   size-1
 150         C ebx   src
 151         C ecx   32-shift
 152         C edx   retval
 153         C
 154         C mm6   shift
 155
 156         addl    $32, %ecx               C 64-shift
 157         subl    $7, %eax                C size-8
 158
 159         movd    %ecx, %mm7
 160         movl    PARAM_DST, %ecx
 161
 162         movq    (%ebx), %mm2            C src low qword
 163         leal    (%ebx,%eax,4), %ebx     C src end - 32
 164
 165         testb   $4, %cl                 C ZF set if bit 2 of the dst address is clear
 166         leal    (%ecx,%eax,4), %ecx     C dst end - 32
 167
 168         notl    %eax                    C -(size-7)
 169         jz      L(dst_aligned)          C flags from testb, leal and notl keep them
 170
 171         psrlq   %mm6, %mm2
 172         incl    %eax                    C -(size-8), one limb is stored here
 173
 174 Zdisp(  movd,   %mm2, 0,(%ecx,%eax,4))  C dst low limb
 175         movq    4(%ebx,%eax,4), %mm2    C new src low qword
 176 L(dst_aligned):
 177
 178         movq    12(%ebx,%eax,4), %mm0   C src second lowest qword
 179         nop     C avoid bad cache line crossing
 180
 181
 182         C This loop is the important bit, the rest is just support for it.
 183         C Four src limbs are held at the start, and four more will be read.
 184         C Four dst limbs will be written.  This schedule seems necessary for
 185         C full speed.
 186         C
 187         C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and
 188         C leaves 0 to 3 which can be tested with test $1 and $2.
 189
 190 L(top):
 191         C eax   counter, -(size-7) step by +4 until >=0
 192         C ebx   src end - 32
 193         C ecx   dst end - 32
 194         C edx   retval
 195         C
 196         C mm0   src next qword
 197         C mm1   scratch
 198         C mm2   src prev qword
 199         C mm6   shift
 200         C mm7   64-shift
 201
 202         psrlq   %mm6, %mm2
 203         addl    $4, %eax
 204
 205         movq    %mm0, %mm1
 206         psllq   %mm7, %mm0
 207
 208         por     %mm0, %mm2
 209         movq    4(%ebx,%eax,4), %mm0
 210
 211         psrlq   %mm6, %mm1
 212         movq    %mm2, -12(%ecx,%eax,4)
 213
 214         movq    %mm0, %mm2
 215         psllq   %mm7, %mm0
 216
 217         por     %mm0, %mm1
 218         movq    12(%ebx,%eax,4), %mm0
 219
 220         movq    %mm1, -4(%ecx,%eax,4)
 221         ja      L(top)          C jump if no carry and not zero
 222
 223
 224
 225         C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
 226         C to 3 representing respectively 3 to 0 further limbs.
 227
 228         testl   $2, %eax        C testl to avoid bad cache line crossings
 229         jnz     L(finish_nottwo)
 230
 231         C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
 232         C becomes new mm2 and a new mm0 is loaded.
 233
 234         psrlq   %mm6, %mm2
 235         movq    %mm0, %mm1
 236
 237         psllq   %mm7, %mm0
 238         addl    $2, %eax
 239
 240         por     %mm0, %mm2
 241         movq    12(%ebx,%eax,4), %mm0
 242
 243         movq    %mm2, -4(%ecx,%eax,4)
 244         movq    %mm1, %mm2
 245 L(finish_nottwo):
 246
 247
 248         testb   $1, %al                 C decides the jnz below, MMX ops keep flags
 249         psrlq   %mm6, %mm2
 250
 251         movq    %mm0, %mm1
 252         psllq   %mm7, %mm0
 253
 254         por     %mm0, %mm2
 255         psrlq   %mm6, %mm1
 256
 257         movq    %mm2, 4(%ecx,%eax,4)
 258         jnz     L(finish_even)          C flags from the testb above
 259
 260
 261         C one further extra limb to process
 262
 263         movd    32-4(%ebx), %mm0        C src[size-1], most significant limb
 264         popl    %ebx
 265
 266         movq    %mm0, %mm2
 267         psllq   %mm7, %mm0
 268
 269         por     %mm0, %mm1
 270         psrlq   %mm6, %mm2
 271
 272         movq    %mm1, 32-12(%ecx)       C dst[size-3,size-2]
 273         movd    %mm2, 32-4(%ecx)        C dst[size-1]
 274
 275         movl    %edx, %eax              C retval
 276
 277         femms
 278         ret
 279
 280
 281         nop     C avoid bad cache line crossing
 282 L(finish_even):
 283         C no further extra limbs
 284
 285         movq    %mm1, 32-8(%ecx)        C dst[size-2,size-1]
 286         movl    %edx, %eax              C retval
 287
 288         popl    %ebx
 289
 290         femms
 291         ret
 292
 293 EPILOGUE()