source/libs/gmp/gmp-src/mpn/x86/pentium/rshift.asm

   1 dnl  Intel Pentium mpn_rshift -- mpn right shift.
   2
   3 dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C         cycles/limb
  35 C P5,P54:    6.0
  36 C P55:       5.375
  37
  38
  39 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
  40 C                       unsigned shift);
  41 C
  42 C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
  43 C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
  44
  45 defframe(PARAM_SHIFT,16)        C 4th argument in the prototype above: shift count
  46 defframe(PARAM_SIZE, 12)        C 3rd argument: number of limbs
  47 defframe(PARAM_SRC,  8)         C 2nd argument: source pointer
  48 defframe(PARAM_DST,  4)         C 1st argument: destination pointer
     C NOTE(review): the numbers look like stack offsets of the arguments at
     C function entry - confirm against the defframe definition, which is not
     C part of this file.
  49
  50         TEXT
  51         ALIGN(8)
  52 PROLOGUE(mpn_rshift)
  53
     C Shift the size-limb number at src right by shift bits, store the result
     C at dst, and return the shifted-out bits in %eax.
     C
     C Register use once the parameters have been loaded:
     C   %edi   destination pointer
     C   %esi   source pointer
     C   %ebp   limb count, later reused as loop counter
     C   %ecx   shift count, used through %cl
     C   %eax, %ebx, %edx   limbs in flight
     C
     C The returned limb holds the bits shifted out of the least significant
     C source limb in its most significant end, the rest being zero.
     C
     C Two code paths follow: L(normal) handles any shift count and walks the
     C limbs upwards, L(special) handles a shift count of 1 and walks the limbs
     C downwards using rotate-through-carry.
     C
     C The four registers pushed here are popped again before each ret.
  54         pushl   %edi
  55         pushl   %esi
  56         pushl   %ebx
  57         pushl   %ebp
     C Four pushes of 4 bytes each.  NOTE(review): FRAME presumably lets the
     C PARAM_ macros compensate for the pushed bytes - confirm in the m4 support
     C files.
  58 deflit(`FRAME',16)
  59
  60         movl    PARAM_DST,%edi
  61         movl    PARAM_SRC,%esi
  62         movl    PARAM_SIZE,%ebp
  63         movl    PARAM_SHIFT,%ecx
  64
  65 C We can use faster code for shift-by-1 under certain conditions.
     C The faster code is entered only when the shift count is 1 and at least
     C one of the two unsigned pointer comparisons below succeeds.
  66         cmp     $1,%ecx
  67         jne     L(normal)
  68         leal    4(%edi),%eax
  69         cmpl    %esi,%eax
  70         jnc     L(special)              C jump if res_ptr + 1 >= s_ptr
  71         leal    (%edi,%ebp,4),%eax
  72         cmpl    %eax,%esi
  73         jnc     L(special)              C jump if s_ptr >= res_ptr + size
  74
     C General code, any shift count.  shrdl shifts its last operand right by
     C %cl bits and fills the vacated high bits from the low bits of the middle
     C operand, which is itself left unchanged.
  75 L(normal):
  76         movl    (%esi),%edx             C least significant source limb
  77         addl    $4,%esi
  78         xorl    %eax,%eax
  79         shrdl(  %cl, %edx, %eax)        C compute carry limb
  80         pushl   %eax                    C push carry limb onto stack
  81
  82         decl    %ebp                    C size-1 limbs are produced by the loops
  83         pushl   %ebp                    C keep size-1 for the leftover count
  84         shrl    $3,%ebp                 C number of 8-limb passes
  85         jz      L(end)
  86
  87         movl    (%edi),%eax             C fetch destination cache line
  88
     C Each pass stores eight result limbs.  On entry %edx holds the most
     C recently loaded source limb, which has not yet been shifted or stored,
     C and %esi points at the limb above it.  The three limb registers rotate
     C roles so that no extra register copies are needed inside the pass.
  89         ALIGN(4)
  90 L(oop): movl    28(%edi),%eax           C fetch destination cache line
  91         movl    %edx,%ebx
  92
  93         movl    (%esi),%eax             C overwrites the value fetched above
  94         movl    4(%esi),%edx
  95         shrdl(  %cl, %eax, %ebx)
  96         shrdl(  %cl, %edx, %eax)
  97         movl    %ebx,(%edi)
  98         movl    %eax,4(%edi)
  99
 100         movl    8(%esi),%ebx
 101         movl    12(%esi),%eax
 102         shrdl(  %cl, %ebx, %edx)
 103         shrdl(  %cl, %eax, %ebx)
 104         movl    %edx,8(%edi)
 105         movl    %ebx,12(%edi)
 106
 107         movl    16(%esi),%edx
 108         movl    20(%esi),%ebx
 109         shrdl(  %cl, %edx, %eax)
 110         shrdl(  %cl, %ebx, %edx)
 111         movl    %eax,16(%edi)
 112         movl    %edx,20(%edi)
 113
 114         movl    24(%esi),%eax
 115         movl    28(%esi),%edx           C carried into the next pass unshifted
 116         shrdl(  %cl, %eax, %ebx)
 117         shrdl(  %cl, %edx, %eax)
 118         movl    %ebx,24(%edi)
 119         movl    %eax,28(%edi)
 120
 121         addl    $32,%esi
 122         addl    $32,%edi
 123         decl    %ebp
 124         jnz     L(oop)
 125
 126 L(end): popl    %ebp                    C size-1 again
 127         andl    $7,%ebp                 C limbs left over after the 8-limb passes
 128         jz      L(end2)
     C One result limb per pass, %edx again carrying the pending source limb.
 129 L(oop2):
 130         movl    (%esi),%eax
 131         shrdl(  %cl,%eax,%edx)          C compute result limb
 132         movl    %edx,(%edi)
 133         movl    %eax,%edx
 134         addl    $4,%esi
 135         addl    $4,%edi
 136         decl    %ebp
 137         jnz     L(oop2)
 138
 139 L(end2):
 140         shrl    %cl,%edx                C compute most significant limb
 141         movl    %edx,(%edi)             C store it
 142
 143         popl    %eax                    C pop carry limb
 144
 145         popl    %ebp
 146         popl    %ebx
 147         popl    %esi
 148         popl    %edi
 149         ret
 150
 151
 152 C Shift-by-1 code.  We loop from the most significant end of the arrays,
 153 C which is only permissible for the operand layouts accepted by the pointer
 154 C tests at the entry point, since the function is documented to work for
     C overlapping source and destination.
     C
     C The bit dropped from each limb travels to the limb below it in the carry
     C flag, so between the first shrl and the final rcrl every instruction that
     C would change the carry flag is either avoided or bracketed by a
     C save and restore of the flag.
 155
 156 L(special):
 157         leal    -4(%edi,%ebp,4),%edi    C point at the last destination limb
 158         leal    -4(%esi,%ebp,4),%esi    C point at the last source limb
 159
 160         movl    (%esi),%edx             C most significant source limb
 161         subl    $4,%esi
 162
 163         decl    %ebp
 164         pushl   %ebp                    C keep size-1 for the leftover count
 165         shrl    $3,%ebp                 C number of 8-limb passes
 166
 167         shrl    %edx                    C shift top limb by one, low bit to carry
 168         incl    %ebp                    C inc then dec sets the zero flag from
 169         decl    %ebp                    C %ebp and leaves the carry flag alone
 170         jz      L(Lend)
 171
 172         movl    (%edi),%eax             C fetch destination cache line
 173
     C Each pass stores eight result limbs, working downwards.  On entry %edx
     C holds an already shifted limb that is still to be stored at the address
     C in %edi.  rcrl rotates right by one bit through the carry flag, so each
     C limb receives the bit dropped by the limb above it.  The interleaved movl
     C instructions do not change the flags.
 174         ALIGN(4)
 175 L(Loop):
 176         movl    -28(%edi),%eax          C fetch destination cache line
 177         movl    %edx,%ebx
 178
 179         movl    (%esi),%eax             C overwrites the value fetched above
 180         movl    -4(%esi),%edx
 181         rcrl    %eax
 182         movl    %ebx,(%edi)
 183         rcrl    %edx
 184         movl    %eax,-4(%edi)
 185
 186         movl    -8(%esi),%ebx
 187         movl    -12(%esi),%eax
 188         rcrl    %ebx
 189         movl    %edx,-8(%edi)
 190         rcrl    %eax
 191         movl    %ebx,-12(%edi)
 192
 193         movl    -16(%esi),%edx
 194         movl    -20(%esi),%ebx
 195         rcrl    %edx
 196         movl    %eax,-16(%edi)
 197         rcrl    %ebx
 198         movl    %edx,-20(%edi)
 199
 200         movl    -24(%esi),%eax
 201         movl    -28(%esi),%edx
 202         rcrl    %eax
 203         movl    %ebx,-24(%edi)
 204         rcrl    %edx                    C stored by the next pass or the tail code
 205         movl    %eax,-28(%edi)
 206
 207         leal    -32(%esi),%esi          C use leal not to clobber carry
 208         leal    -32(%edi),%edi
 209         decl    %ebp                    C decl leaves the carry flag unchanged
 210         jnz     L(Loop)
 211
 212 L(Lend):
 213         popl    %ebp                    C size-1 again
 214         sbbl    %eax,%eax               C save carry in %eax
 215         andl    $7,%ebp                 C leftover limbs, this clobbers carry
 216         jz      L(Lend2)
 217         addl    %eax,%eax               C restore carry from eax
     C One limb per pass, same delayed-store scheme as the unrolled loop.
 218 L(Loop2):
 219         movl    %edx,%ebx
 220         movl    (%esi),%edx
 221         rcrl    %edx
 222         movl    %ebx,(%edi)
 223
 224         leal    -4(%esi),%esi           C use leal not to clobber carry
 225         leal    -4(%edi),%edi
 226         decl    %ebp
 227         jnz     L(Loop2)
 228
 229         jmp     L(L1)
 230 L(Lend2):
 231         addl    %eax,%eax               C restore carry from eax
 232 L(L1):  movl    %edx,(%edi)             C store last limb
 233
 234         movl    $0,%eax                 C movl rather than xorl keeps the carry
 235         rcrl    %eax                    C return the last dropped bit as top bit
 236
 237         popl    %ebp
 238         popl    %ebx
 239         popl    %esi
 240         popl    %edi
 241         ret
 242
 243 EPILOGUE()