source/libs/gmp/gmp-src/mpn/x86/k6/mmx/lshift.asm

   1 dnl  AMD K6 mpn_lshift -- mpn left shift.
   2
   3 dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C K6: 3.0 cycles/limb
  35
  36
  37 C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
  38 C                       unsigned shift);
  39 C
  40 C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
  41 C instructions.  This is despite every second fetch being unaligned.
  42
  43
  44 defframe(PARAM_SHIFT,16)        C shift, the bit count argument
  45 defframe(PARAM_SIZE, 12)        C size, the limb count argument
  46 defframe(PARAM_SRC,  8)         C src, the source limb pointer
  47 defframe(PARAM_DST,  4)         C dst, the destination limb pointer
  48         C Offsets step by 4 in prototype order dst, src, size, shift (presumably from esp at entry)
  49         TEXT
  50         ALIGN(32)
  51
  52 PROLOGUE(mpn_lshift)
  53 deflit(`FRAME',0)
  54         C Stores src shifted left by shift bits at dst; eax returns the bits shifted out.
  55         C The 1 limb case can be done without the push %ebx, but it's then
  56         C still the same speed.  The push is left as a free helping hand for
  57         C the two_or_more code.
  58         C NOTE(review): size is assumed to be at least 1, size 0 is not checked -- confirm
  59         movl    PARAM_SIZE, %eax
  60         pushl   %ebx                    FRAME_pushl()
  61         C ebx is restored by a popl on both exit paths; FRAME_pushl presumably tracks the push
  62         movl    PARAM_SRC, %ebx
  63         decl    %eax                    C eax = size-1, zero only when size is 1
  64
  65         movl    PARAM_SHIFT, %ecx       C movl leaves the flags from decl intact
  66         jnz     L(two_or_more)
  67         C 1 limb case: eax is 0 here, so shldl leaves only the bits shifted out of edx
  68         movl    (%ebx), %edx            C src limb
  69         movl    PARAM_DST, %ebx
  70
  71         shldl(  %cl, %edx, %eax)        C return value
  72
  73         shll    %cl, %edx
  74
  75         movl    %edx, (%ebx)            C dst limb
  76         popl    %ebx
  77
  78         ret
  79
  80
  81         ALIGN(16)       C avoid offset 0x1f
  82         nop             C avoid bad cache line crossing
  83 L(two_or_more):
  84         C eax   size-1
  85         C ebx   src
  86         C ecx   shift
  87         C edx
  88         C Setup: edx becomes the return value, mm6 = shift, mm7 = 32-shift, ecx = dst
  89         movl    (%ebx,%eax,4), %edx     C src high limb
  90         negl    %ecx
  91         C NOTE(review): using 32-shift as a count assumes 1 <= shift <= 31 -- confirm
  92         movd    PARAM_SHIFT, %mm6
  93         addl    $32, %ecx               C 32-shift
  94
  95         shrl    %cl, %edx               C bits shifted out of the high limb
  96
  97         movd    %ecx, %mm7
  98         movl    PARAM_DST, %ecx
  99         C Main loop, working from the most significant limb downwards
 100 L(top):
 101         C eax   counter, size-1 to 1
 102         C ebx   src
 103         C ecx   dst
 104         C edx   retval
 105         C
 106         C mm0   scratch
 107         C mm6   shift
 108         C mm7   32-shift
 109         C Each pass forms dst[eax] from the 64-bit pair src[eax]:src[eax-1]
 110         movq    -4(%ebx,%eax,4), %mm0
 111         decl    %eax
 112
 113         psrlq   %mm7, %mm0              C low dword = hi<<shift | lo>>(32-shift)
 114         C eax is already decremented, hence the +4 displacement on the store
 115         movd    %mm0, 4(%ecx,%eax,4)
 116         jnz     L(top)                  C tests the flags set by decl above
 117
 118         C Lowest limb: movd zero extends it, so psllq shifts zeros in from below
 119         movd    (%ebx), %mm0
 120         popl    %ebx
 121
 122         psllq   %mm6, %mm0
 123         movl    %edx, %eax              C return value
 124
 125         movd    %mm0, (%ecx)            C dst low limb
 126
 127         emms                            C empty the mmx state before returning
 128         ret
 129
 130 EPILOGUE()