source/libs/gmp/gmp-src/mpn/x86/pentium/lshift.asm

   1 dnl  Intel Pentium mpn_lshift -- mpn left shift.
   2
   3 dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C         cycles/limb
  35 C P5,P54:    6.0
  36 C P55:       5.375
  37
  38
  39 C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
  40 C                       unsigned shift);
  41 C
  42 C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
  43 C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
  44
  45 defframe(PARAM_SHIFT,16)                C shift count
  46 defframe(PARAM_SIZE, 12)                C number of limbs
  47 defframe(PARAM_SRC,  8)                 C source limb pointer
  48 defframe(PARAM_DST,  4)                 C destination limb pointer
  49 C NOTE(review): defframe is defined outside this file; offsets presumably count from the return address -- confirm.
  50         TEXT
  51         ALIGN(8)
  52 PROLOGUE(mpn_lshift)
  53 C Save the four registers this routine uses; each exit path pops them again.
  54         pushl   %edi
  55         pushl   %esi
  56         pushl   %ebx
  57         pushl   %ebp
  58 deflit(`FRAME',16)                      C 16 = bytes pushed above (4 x pushl)
  59
  60         movl    PARAM_DST,%edi          C edi = dst
  61         movl    PARAM_SRC,%esi          C esi = src
  62         movl    PARAM_SIZE,%ebp         C ebp = size (limb count)
  63         movl    PARAM_SHIFT,%ecx        C ecx = shift (cl feeds shld/shl)
  64
  65 C We can use faster code for shift-by-1 under certain conditions.
  66         cmp     $1,%ecx
  67         jne     L(normal)               C shift != 1: general path
  68         leal    4(%esi),%eax            C eax = address of src limb 1
  69         cmpl    %edi,%eax
  70         jnc     L(special)              C jump if s_ptr + 1 >= res_ptr
  71         leal    (%esi,%ebp,4),%eax      C eax = src + 4*size (end of src)
  72         cmpl    %eax,%edi
  73         jnc     L(special)              C jump if res_ptr >= s_ptr + size
  74 C General path: process from the most significant limb downwards.
  75 L(normal):
  76         leal    -4(%edi,%ebp,4),%edi    C edi -> most significant dst limb
  77         leal    -4(%esi,%ebp,4),%esi    C esi -> most significant src limb
  78
  79         movl    (%esi),%edx             C edx = top source limb
  80         subl    $4,%esi
  81         xorl    %eax,%eax
  82         shldl(  %cl, %edx, %eax)        C compute carry limb
  83         pushl   %eax                    C push carry limb onto stack
  84
  85         decl    %ebp                    C ebp = size-1 limbs still to load
  86         pushl   %ebp                    C keep size-1 for the tail loop
  87         shrl    $3,%ebp                 C ebp = number of 8-limb passes
  88         jz      L(end)                  C fewer than 8 left: tail loop only
  89
  90         movl    (%edi),%eax             C fetch destination cache line
  91
  92         ALIGN(4)
  93 L(oop): movl    -28(%edi),%eax          C fetch destination cache line
  94         movl    %edx,%ebx               C ebx = pending higher limb
  95
  96         movl    (%esi),%eax
  97         movl    -4(%esi),%edx
  98         shldl(  %cl, %eax, %ebx)        C ebx = ebx<<cl | high bits of eax
  99         shldl(  %cl, %edx, %eax)
 100         movl    %ebx,(%edi)
 101         movl    %eax,-4(%edi)
 102
 103         movl    -8(%esi),%ebx
 104         movl    -12(%esi),%eax
 105         shldl(  %cl, %ebx, %edx)
 106         shldl(  %cl, %eax, %ebx)
 107         movl    %edx,-8(%edi)
 108         movl    %ebx,-12(%edi)
 109
 110         movl    -16(%esi),%edx
 111         movl    -20(%esi),%ebx
 112         shldl(  %cl, %edx, %eax)
 113         shldl(  %cl, %ebx, %edx)
 114         movl    %eax,-16(%edi)
 115         movl    %edx,-20(%edi)
 116
 117         movl    -24(%esi),%eax
 118         movl    -28(%esi),%edx
 119         shldl(  %cl, %eax, %ebx)
 120         shldl(  %cl, %edx, %eax)
 121         movl    %ebx,-24(%edi)
 122         movl    %eax,-28(%edi)
 123
 124         subl    $32,%esi                C step both pointers down 8 limbs
 125         subl    $32,%edi
 126         decl    %ebp
 127         jnz     L(oop)
 128 C Tail: the remaining (size-1) mod 8 limbs, one per pass; edx holds the pending limb.
 129 L(end): popl    %ebp                    C ebp = size-1 (pushed above)
 130         andl    $7,%ebp
 131         jz      L(end2)
 132 L(oop2):
 133         movl    (%esi),%eax
 134         shldl(  %cl,%eax,%edx)          C edx = shifted limb to store
 135         movl    %edx,(%edi)
 136         movl    %eax,%edx               C next lower limb becomes pending
 137         subl    $4,%esi
 138         subl    $4,%edi
 139         decl    %ebp
 140         jnz     L(oop2)
 141
 142 L(end2):
 143         shll    %cl,%edx                C compute least significant limb
 144         movl    %edx,(%edi)             C store it
 145
 146         popl    %eax                    C pop carry limb
 147
 148         popl    %ebp
 149         popl    %ebx
 150         popl    %esi
 151         popl    %edi
 152         ret
 153
 154
 155 C We loop from the least significant end of the arrays, which is only
 156 C permissible if the source and destination don't overlap, since the
 157 C function is documented to work for overlapping source and destination.
 158 C (Reached only for shift == 1 with dst <= src+4 or dst >= src+4*size, per the entry test.)
 159 L(special):
 160         movl    (%esi),%edx             C edx = least significant src limb
 161         addl    $4,%esi
 162
 163         decl    %ebp                    C ebp = size-1 limbs still to load
 164         pushl   %ebp                    C keep size-1 for the tail loop
 165         shrl    $3,%ebp                 C ebp = number of 8-limb passes
 166
 167         addl    %edx,%edx               C edx <<= 1, CF = bit shifted out
 168         incl    %ebp                    C inc+dec: set ZF from ebp while
 169         decl    %ebp                    C   leaving CF untouched
 170         jz      L(Lend)                 C no full 8-limb pass
 171
 172         movl    (%edi),%eax             C fetch destination cache line
 173
 174         ALIGN(4)
 175 L(Loop):
 176         movl    28(%edi),%eax           C fetch destination cache line
 177         movl    %edx,%ebx               C ebx = finished limb to store
 178
 179         movl    (%esi),%eax
 180         movl    4(%esi),%edx
 181         adcl    %eax,%eax               C eax = eax<<1 | CF, CF = old top bit
 182         movl    %ebx,(%edi)
 183         adcl    %edx,%edx
 184         movl    %eax,4(%edi)
 185
 186         movl    8(%esi),%ebx
 187         movl    12(%esi),%eax
 188         adcl    %ebx,%ebx
 189         movl    %edx,8(%edi)
 190         adcl    %eax,%eax
 191         movl    %ebx,12(%edi)
 192
 193         movl    16(%esi),%edx
 194         movl    20(%esi),%ebx
 195         adcl    %edx,%edx
 196         movl    %eax,16(%edi)
 197         adcl    %ebx,%ebx
 198         movl    %edx,20(%edi)
 199
 200         movl    24(%esi),%eax
 201         movl    28(%esi),%edx
 202         adcl    %eax,%eax
 203         movl    %ebx,24(%edi)
 204         adcl    %edx,%edx
 205         movl    %eax,28(%edi)
 206
 207         leal    32(%esi),%esi           C use leal not to clobber carry
 208         leal    32(%edi),%edi
 209         decl    %ebp                    C dec does not change CF
 210         jnz     L(Loop)
 211 C Tail: the remaining (size-1) mod 8 limbs; CF is parked in eax across the andl.
 212 L(Lend):
 213         popl    %ebp
 214         sbbl    %eax,%eax               C save carry in %eax
 215         andl    $7,%ebp                 C sets ZF for the jz, but clobbers CF
 216         jz      L(Lend2)
 217         addl    %eax,%eax               C restore carry from eax
 218 L(Loop2):
 219         movl    %edx,%ebx
 220         movl    (%esi),%edx
 221         adcl    %edx,%edx
 222         movl    %ebx,(%edi)
 223
 224         leal    4(%esi),%esi            C use leal not to clobber carry
 225         leal    4(%edi),%edi
 226         decl    %ebp
 227         jnz     L(Loop2)
 228
 229         jmp     L(L1)                   C CF already live; skip the restore
 230 L(Lend2):
 231         addl    %eax,%eax               C restore carry from eax
 232 L(L1):  movl    %edx,(%edi)             C store last limb
 233
 234         sbbl    %eax,%eax               C eax = -CF
 235         negl    %eax                    C return value: the bit shifted out
 236
 237         popl    %ebp
 238         popl    %ebx
 239         popl    %esi
 240         popl    %edi
 241         ret
 242
 243 EPILOGUE()