source/libs/gmp/gmp-src/mpn/x86/pentium/mmx/lshift.asm

   1 dnl  Intel P5 mpn_lshift -- mpn left shift.
   2
   3 dnl  Copyright 2000-2002 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C P5: 1.75 cycles/limb.
  35
  36
  37 C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
  38 C                       unsigned shift);
  39 C
  40 C Shift src,size left by shift many bits and store the result in dst,size.
  41 C Zeros are shifted in at the right.  Return the bits shifted out at the
  42 C left.
  43 C
  44 C The comments in mpn_rshift apply here too.
  45
  46 defframe(PARAM_SHIFT,16)
  47 defframe(PARAM_SIZE, 12)
  48 defframe(PARAM_SRC,  8)
  49 defframe(PARAM_DST,  4)
  50 deflit(`FRAME',0)
  51 C The four slots follow the C prototype order: dst 4, src 8, size 12, shift 16.
     C defframe and deflit are m4 macros defined outside this file; presumably
     C the offsets are relative to %esp at entry and FRAME is the number of
     C bytes pushed since then -- TODO confirm against the macro definitions.
  52 dnl  Sizes at or above this take the L(unroll) path (see the cmp/jae at entry);
     dnl  minimum 5, because the unrolled loop can't handle less
  53 deflit(UNROLL_THRESHOLD, 5)
  54 C TEXT and ALIGN are macros from outside this file; presumably they select
     C the code section and align the entry point to 8 bytes -- TODO confirm.
  55         TEXT
  56         ALIGN(8)
  57
  58 PROLOGUE(mpn_lshift)
  59 C Entry: save ebx/edi, load the four arguments, then dispatch on size.
     C size >= UNROLL_THRESHOLD goes to L(unroll), size == 1 is handled right
     C here with integer shifts, anything else goes to L(simple).
  60         pushl   %ebx
  61         pushl   %edi
  62 deflit(`FRAME',8)
  63
  64         movl    PARAM_SIZE, %eax        C eax = size, in limbs
  65         movl    PARAM_DST, %edx         C edx = dst
  66
  67         movl    PARAM_SRC, %ebx         C ebx = src
  68         movl    PARAM_SHIFT, %ecx       C ecx = shift count
  69
  70         cmp     $UNROLL_THRESHOLD, %eax
  71         jae     L(unroll)               C size >= UNROLL_THRESHOLD (unsigned)
  72
  73         movl    -4(%ebx,%eax,4), %edi   C src high limb, src[size-1]
  74         decl    %eax                    C eax = size-1, ZF set if size was 1
  75
  76         jnz     L(simple)               C more than one limb
  77
  78         shldl(  %cl, %edi, %eax)        C eax was decremented to zero, so it
  79                                         C receives just the bits shifted out
  80         shll    %cl, %edi               C the single limb, shifted left
  81
  82         movl    %edi, (%edx)            C dst low limb
  83         popl    %edi                    C risk of data cache bank clash
  84
  85         popl    %ebx
  86
  87         ret                             C return value is in eax
  88
  89
  90 C -----------------------------------------------------------------------------
  91 L(simple):
  92         C eax   size-1
  93         C ebx   src
  94         C ecx   shift
  95         C edx   dst
  96         C esi
  97         C edi   src high limb from the entry code, not read again here
  98         C ebp
  99 deflit(`FRAME',8)
 100 C Sizes above 1 and below UNROLL_THRESHOLD: one dst limb per loop iteration,
     C working from the high end of the operands down to the low end.
 101         movd    (%ebx,%eax,4), %mm5     C src high limb
 102
 103         movd    %ecx, %mm6              C lshift
 104         negl    %ecx
 105
 106         psllq   %mm6, %mm5              C high limb << shift, as a qword
 107         addl    $32, %ecx               C ecx = 32-shift
 108
 109         movd    %ecx, %mm7              C rshift used by the loop
 110         psrlq   $32, %mm5               C retval, the bits shifted out the top
 111
 112
 113 L(simple_top):
 114         C eax   counter, limbs, size-1 counting down to 0
 115         C ebx   src
 116         C ecx
 117         C edx   dst
 118         C esi
 119         C edi
 120         C
 121         C mm0   scratch
 122         C mm5   return value
 123         C mm6   shift
 124         C mm7   32-shift
 125
 126         movq    -4(%ebx,%eax,4), %mm0   C src[eax-1] low dword, src[eax] high
 127         decl    %eax                    C sets ZF for the jnz below
 128
 129         psrlq   %mm7, %mm0              C low dword is now the dst limb
 130
 131         C The MMX shift and store below do not change the flags from the decl.
 132
 133         movd    %mm0, 4(%edx,%eax,4)    C store at the pre-decrement index
 134         jnz     L(simple_top)           C loop until the counter reaches 0
 135
 136
 137         movd    (%ebx), %mm0            C src low limb
 138
 139         movd    %mm5, %eax              C return value
 140         psllq   %mm6, %mm0
 141
 142         popl    %edi
 143         popl    %ebx
 144
 145         movd    %mm0, (%edx)            C dst low limb, zeros shifted in
 146
 147         emms                            C empty the MMX state before returning
 148
 149         ret
 150
 151
 152 C -----------------------------------------------------------------------------
 153         ALIGN(8)
 154 L(unroll):
 155         C eax   size
 156         C ebx   src
 157         C ecx   shift
 158         C edx   dst
 159         C esi
 160         C edi
 161         C ebp
 162 deflit(`FRAME',8)
 163 C Set-up for the unrolled loop: peel off the high limb if needed so the
     C remaining src ends on an 8-byte boundary, then likewise arrange for the
     C dst qword stores to be 8-byte aligned, and prime mm2/mm3 for the loop.
 164         movd    -4(%ebx,%eax,4), %mm5   C src high limb
 165         leal    (%ebx,%eax,4), %edi     C address just past the src high limb
 166
 167         movd    %ecx, %mm6              C lshift
 168         andl    $4, %edi                C ZF set if that address is 0 mod 8
 169
 170         psllq   %mm6, %mm5              C high limb << shift, as a qword
 171         jz      L(start_src_aligned)    C flags are from the andl
 172
 173
 174         C src isn't aligned, process high limb separately (marked xxx) to
 175         C make it so.
 176         C
 177         C  source     -8(ebx,%eax,4)
 178         C                  |
 179         C  +-------+-------+-------+--
 180         C  |               |
 181         C  +-------+-------+-------+--
 182         C        0mod8   4mod8   0mod8
 183         C
 184         C  dest
 185         C     -4(edx,%eax,4)
 186         C          |
 187         C  +-------+-------+--
 188         C  |  xxx  |       |
 189         C  +-------+-------+--
 190
 191         movq    -8(%ebx,%eax,4), %mm0   C unaligned load of the top two limbs
 192
 193         psllq   %mm6, %mm0
 194         decl    %eax                    C one limb fewer left to process
 195
 196         psrlq   $32, %mm0               C high dword down: the dst high limb
 197
 198         C
 199
 200         movd    %mm0, (%edx,%eax,4)     C dst high limb, at the new eax index
 201 L(start_src_aligned):
 202
 203         movq    -8(%ebx,%eax,4), %mm1   C src high qword
 204         leal    (%edx,%eax,4), %edi     C address just past the remaining dst
 205
 206         andl    $4, %edi                C ZF set if that address is 0 mod 8
 207         psrlq   $32, %mm5               C return value
 208
 209         movq    -16(%ebx,%eax,4), %mm3  C src second highest qword
 210         jz      L(start_dst_aligned)    C flags are from the andl
 211
 212         C dst isn't aligned, subtract 4 to make it so, and pretend the shift
 213         C is 32 bits extra.  High limb of dst (marked xxx) handled here
 214         C separately.
 215         C
 216         C  source     -8(ebx,%eax,4)
 217         C                  |
 218         C  +-------+-------+--
 219         C  |      mm1      |
 220         C  +-------+-------+--
 221         C                0mod8   4mod8
 222         C
 223         C  dest
 224         C     -4(edx,%eax,4)
 225         C          |
 226         C  +-------+-------+-------+--
 227         C  |  xxx  |               |
 228         C  +-------+-------+-------+--
 229         C        0mod8   4mod8   0mod8
 230
 231         movq    %mm1, %mm0              C copy of the src high qword
 232         addl    $32, %ecx               C new shift
 233
 234         psllq   %mm6, %mm0              C mm6 is still the original shift here
 235
 236         movd    %ecx, %mm6              C mm6 = shift+32 from now on
 237         psrlq   $32, %mm0               C high dword down: the dst high limb
 238
 239         C wasted cycle here waiting for %mm0
 240
 241         movd    %mm0, -4(%edx,%eax,4)   C store dst high limb, before edx moves
 242         subl    $4, %edx                C qword stores via edx now 8-aligned
 243 L(start_dst_aligned):
 244
 245
 246         psllq   %mm6, %mm1              C high qword, left shifted
 247         negl    %ecx                    C -shift
 248
 249         addl    $64, %ecx               C 64-shift
 250         movq    %mm3, %mm2              C keep an unshifted copy for later
 251
 252         movd    %ecx, %mm7              C rshift = 64 minus the mm6 shift
 253         subl    $8, %eax                C size-8, borrow if under 8 limbs left
 254
 255         psrlq   %mm7, %mm3              C bits carried up from the second qword
 256
 257         por     %mm1, %mm3              C mm3 ready to store
 258         jc      L(finish)               C carry is from the subl, skip the loop
 259
 260
 261         C The comments in mpn_rshift apply here too.
 262
 263         ALIGN(8)
 264 L(unroll_loop):
 265         C eax   counter, limbs
 266         C ebx   src
 267         C ecx
 268         C edx   dst
 269         C esi
 270         C edi
 271         C
 272         C mm0   scratch
 273         C mm1   scratch
 274         C mm2   src qword from 16(%ebx,%eax,4)
 275         C mm3   dst qword ready to store to 24(%edx,%eax,4)
 276         C
 277         C mm5   return value
 278         C mm6   lshift
 279         C mm7   rshift
 280 C Each pass stores two dst qwords (four limbs), moving from high to low.
 281         movq    8(%ebx,%eax,4), %mm0    C next src qword down
 282         psllq   %mm6, %mm2              C previous src qword, left shifted
 283
 284         movq    %mm0, %mm1              C unshifted copy for the second half
 285         psrlq   %mm7, %mm0              C bits carried up into the qword above
 286
 287         movq    %mm3, 24(%edx,%eax,4)   C store the dst qword prepared earlier
 288         por     %mm2, %mm0              C dst qword for 16(%edx,%eax,4)
 289
 290         movq    (%ebx,%eax,4), %mm3     C next src qword down again
 291         psllq   %mm6, %mm1              C second half: left shifted part
 292
 293         movq    %mm0, 16(%edx,%eax,4)
 294         movq    %mm3, %mm2              C unshifted copy, becomes the new mm2
 295
 296         psrlq   %mm7, %mm3              C bits carried up into the qword above
 297         subl    $4, %eax                C four limbs done, borrow ends the loop
 298
 299         por     %mm1, %mm3              C new mm3, stored by the next pass
 300         jnc     L(unroll_loop)          C carry is from the subl
 301
 302
 303
 304 L(finish):
 305         C eax   -4 to -1 representing respectively 0 to 3 limbs remaining
 306
 307         testb   $2, %al                 C bit 1 is set for -2 and -1 only
 308
 309         jz      L(finish_no_two)        C 0 or 1 limbs remaining
 310
 311         movq    8(%ebx,%eax,4), %mm0    C next src qword down
 312         psllq   %mm6, %mm2              C previous src qword, left shifted
 313
 314         movq    %mm0, %mm1              C unshifted copy
 315         psrlq   %mm7, %mm0              C bits carried up into the qword above
 316
 317         movq    %mm3, 24(%edx,%eax,4)   C store the dst qword prepared earlier
 318         por     %mm2, %mm0              C next dst qword
 319
 320         movq    %mm1, %mm2              C becomes the new src prev qword
 321         movq    %mm0, %mm3              C becomes the new pending dst qword
 322
 323         subl    $2, %eax                C -2 or -1 becomes -4 or -3
 324 L(finish_no_two):
 325
 326
 327         C eax   -4 or -3 representing respectively 0 or 1 limbs remaining
 328         C
 329         C mm2   src prev qword, from 16(%ebx,%eax,4)
 330         C mm3   dst qword, for 24(%edx,%eax,4)
 331
 332         testb   $1, %al                 C bit 0 is set for -3, one limb left
 333         movd    %mm5, %eax      C retval, the counter is not needed any more
 334
 335         popl    %edi
 336         jz      L(finish_zero)          C flags are still from the testb
 337
 338
 339         C One extra src limb, destination was aligned.
 340         C
 341         C                 source                  ebx
 342         C                 --+---------------+-------+
 343         C                   |      mm2      |       |
 344         C                 --+---------------+-------+
 345         C
 346         C dest         edx+12           edx+4     edx
 347         C --+---------------+---------------+-------+
 348         C   |      mm3      |               |       |
 349         C --+---------------+---------------+-------+
 350         C
 351         C mm6 = shift
 352         C mm7 = ecx = 64-shift
 353
 354
 355         C One extra src limb, destination was unaligned.
 356         C
 357         C                 source                  ebx
 358         C                 --+---------------+-------+
 359         C                   |      mm2      |       |
 360         C                 --+---------------+-------+
 361         C
 362         C         dest         edx+12           edx+4
 363         C         --+---------------+---------------+
 364         C           |      mm3      |               |
 365         C         --+---------------+---------------+
 366         C
 367         C mm6 = shift+32
 368         C mm7 = ecx = 64-(shift+32)
 369
 370
 371         C In both cases there's one extra limb of src to fetch and combine
 372         C with mm2 to make a qword at 4(%edx), and in the aligned case
 373         C there's an extra limb of dst to be formed from that extra src limb
 374         C left shifted.
 375
 376
 377         movd    (%ebx), %mm0            C the extra (lowest) src limb
 378         psllq   %mm6, %mm2              C src prev qword, left shifted
 379
 380         movq    %mm3, 12(%edx)          C store the pending dst qword
 381         psllq   $32, %mm0               C limb into the high dword of a qword
 382
 383         movq    %mm0, %mm1              C copy, used for the aligned low limb
 384         psrlq   %mm7, %mm0              C bits carried up into the qword above
 385
 386         por     %mm2, %mm0              C dst qword for 4(%edx)
 387         psllq   %mm6, %mm1
 388
 389         movq    %mm0, 4(%edx)
 390         psrlq   $32, %mm1               C aligned case: extra limb << shift
 391
 392         andl    $32, %ecx               C nonzero for 64-shift, zero for
 393         popl    %ebx                    C 32-shift (assumes shift is 1 to 31)
 394
 395         jz      L(finish_one_unaligned)   C flags are from the andl
 396
 397         movd    %mm1, (%edx)            C aligned case only: dst low limb
 398 L(finish_one_unaligned):
 399
 400         emms                            C empty the MMX state before returning
 401
 402         ret
 403
 404
 405 L(finish_zero):
 406
 407         C No extra src limbs, destination was aligned.
 408         C
 409         C                 source          ebx
 410         C                 --+---------------+
 411         C                   |      mm2      |
 412         C                 --+---------------+
 413         C
 414         C dest          edx+8             edx
 415         C --+---------------+---------------+
 416         C   |      mm3      |               |
 417         C --+---------------+---------------+
 418         C
 419         C mm6 = shift
 420         C mm7 = ecx = 64-shift
 421
 422
 423         C No extra src limbs, destination was unaligned.
 424         C
 425         C               source            ebx
 426         C                 --+---------------+
 427         C                   |      mm2      |
 428         C                 --+---------------+
 429         C
 430         C         dest          edx+8   edx+4
 431         C         --+---------------+-------+
 432         C           |      mm3      |       |
 433         C         --+---------------+-------+
 434         C
 435         C mm6 = shift+32
 436         C mm7 = ecx = 64-(shift+32)
 437
 438
 439         C The movd for the unaligned case writes the same data to 4(%edx)
 440         C that the movq does for the aligned case.
 441
 442
 443         movq    %mm3, 8(%edx)           C store the pending dst qword
 444         andl    $32, %ecx               C nonzero for 64-shift, zero for
 445                                         C 32-shift (assumes shift is 1 to 31)
 446         psllq   %mm6, %mm2              C lowest src qword, left shifted
 447         jz      L(finish_zero_unaligned)   C flags are from the andl
 448
 449         movq    %mm2, (%edx)            C aligned case only: lowest dst qword
 450 L(finish_zero_unaligned):
 451
 452         psrlq   $32, %mm2               C high dword down for the movd below
 453         popl    %ebx
 454
 455         movd    %mm5, %eax      C retval, repeats the movd done before the jz here
 456
 457         movd    %mm2, 4(%edx)
 458
 459         emms                            C empty the MMX state before returning
 460
 461         ret
 462
 463 EPILOGUE()