source/libs/gmp/gmp-src/mpn/x86/k7/mmx/rshift.asm
dnl  AMD K7 mpn_rshift -- mpn right shift.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C K7: 1.21 cycles/limb (at 16 limbs/loop).


dnl  K7: UNROLL_COUNT cycles/limb
dnl           4           1.51
dnl           8           1.26
dnl          16           1.21
dnl          32           1.2
dnl  Maximum possible with the current code is 64.

deflit(UNROLL_COUNT, 16)


C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size right by shift many bits and store the result in dst,size.
C Zeros are shifted in at the left.  The bits shifted out at the right are
C the return value.
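C
C For reference, the operation corresponds to the following portable C
C sketch (illustrative only, not part of this file; ref_rshift is a
C hypothetical name; assumes 32-bit limbs and 1 <= shift <= 31, as
C mpn_rshift requires on this target):
C
C       mp_limb_t
C       ref_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift)
C       {
C         mp_limb_t retval = src[0] << (32 - shift);   /* bits shifted out */
C         mp_size_t i;
C         for (i = 0; i < size - 1; i++)
C           dst[i] = (src[i] >> shift) | (src[i + 1] << (32 - shift));
C         dst[size - 1] = src[size - 1] >> shift;
C         return retval;
C       }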
C
C This code uses 64-bit MMX operations, which makes it possible to handle
C two limbs at a time, for a theoretical 1.0 cycles/limb.  Plain integer
C code, on the other hand, suffers from shrd being a vector path decode and
C running at 3 cycles back-to-back.
C
C Full speed depends on source and destination being aligned, and some hairy
C setups and finish-ups are done to arrange this for the loop.
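C
C Per aligned 64-bit chunk of src, the main loop below implements roughly
C the following combine (a sketch of the psrlq/psllq/por pattern, not
C literal code; prev and cur are successive src qwords viewed as uint64_t,
C cur being the more significant):
C
C       /* dst qword = low 64 bits of ((cur:prev) >> shift) */
C       dst_qword = (prev >> shift) | (cur << (64 - shift));
C
C with cur carried over as the next iteration's prev, which is the role
C the alternating mm1/mm2 carry registers play in the unrolled loop.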

ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 10)
',`
deflit(UNROLL_THRESHOLD, 10)
')

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

defframe(SAVE_EDI, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
deflit(SAVE_SIZE, 12)

        TEXT
        ALIGN(32)

PROLOGUE(mpn_rshift)
deflit(`FRAME',0)

        movl    PARAM_SIZE, %eax
        movl    PARAM_SRC, %edx
        subl    $SAVE_SIZE, %esp
deflit(`FRAME',SAVE_SIZE)

        movl    PARAM_SHIFT, %ecx
        movl    %edi, SAVE_EDI

        movl    PARAM_DST, %edi
        decl    %eax
        jnz     L(more_than_one_limb)

        movl    (%edx), %edx            C src limb

        shrdl(  %cl, %edx, %eax)        C eax was decremented to zero

        shrl    %cl, %edx

        movl    %edx, (%edi)            C dst limb
        movl    SAVE_EDI, %edi
        addl    $SAVE_SIZE, %esp

        ret


C -----------------------------------------------------------------------------
L(more_than_one_limb):
        C eax   size-1
        C ebx
        C ecx   shift
        C edx   src
        C esi
        C edi   dst
        C ebp

        movd    PARAM_SHIFT, %mm6       C rshift
        movd    (%edx), %mm5            C src low limb
        cmp     $UNROLL_THRESHOLD-1, %eax

        jae     L(unroll)
        leal    (%edx,%eax,4), %edx     C &src[size-1]
        leal    -4(%edi,%eax,4), %edi   C &dst[size-2]

        movd    (%edx), %mm4            C src high limb
        negl    %eax


L(simple_top):
        C eax   loop counter, limbs, negative
        C ebx
        C ecx   shift
        C edx   &src[size-1]
        C esi
        C edi   &dst[size-2]
        C ebp
        C
        C mm0   scratch
        C mm4   src high limb
        C mm5   src low limb
        C mm6   shift
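        C
        C Each pass reads the qword at src[i] (limbs i and i+1), shifts it
        C right, and stores the low dword, giving
        C dst[i] = (src[i] >> shift) | (src[i+1] << (32-shift)).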

        movq    (%edx,%eax,4), %mm0
        incl    %eax

        psrlq   %mm6, %mm0

        movd    %mm0, (%edi,%eax,4)
        jnz     L(simple_top)


        psllq   $32, %mm5
        psrlq   %mm6, %mm4

        psrlq   %mm6, %mm5
        movd    %mm4, 4(%edi)           C dst high limb

        movd    %mm5, %eax              C return value

        movl    SAVE_EDI, %edi
        addl    $SAVE_SIZE, %esp
        emms

        ret


C -----------------------------------------------------------------------------
        ALIGN(16)
L(unroll):
        C eax   size-1
        C ebx
        C ecx   shift
        C edx   src
        C esi
        C edi   dst
        C ebp
        C
        C mm5   src low limb
        C mm6   rshift

        testb   $4, %dl
        movl    %esi, SAVE_ESI
        movl    %ebx, SAVE_EBX

        psllq   $32, %mm5
        jz      L(start_src_aligned)


        C src isn't aligned, process low limb separately (marked xxx) and
        C step src and dst by one limb, making src aligned.
        C
        C source                  edx
        C --+-------+-------+-------+
        C           |          xxx  |
        C --+-------+-------+-------+
        C 4mod8   0mod8   4mod8
        C
        C         dest            edi
        C         --+-------+-------+
        C           |       |  xxx  |
        C         --+-------+-------+

        movq    (%edx), %mm0            C src low two limbs
        addl    $4, %edx
        movl    %eax, PARAM_SIZE        C size-1

        addl    $4, %edi
        decl    %eax                    C size-2 is new size-1

        psrlq   %mm6, %mm0
        movl    %edi, PARAM_DST         C new dst

        movd    %mm0, -4(%edi)
L(start_src_aligned):

        movq    (%edx), %mm1            C src low two limbs
        decl    %eax                    C size-2, two last limbs handled at end
        testl   $4, %edi

        psrlq   %mm6, %mm5
        jz      L(start_dst_aligned)


        C dst isn't aligned, add 4 to make it so, and pretend the shift is
        C 32 bits extra.  Low limb of dst (marked xxx) handled here separately.
        C
        C source          edx
        C --+-------+-------+
        C   |      mm1      |
        C --+-------+-------+
        C 4mod8   0mod8
        C
        C dest                    edi
        C --+-------+-------+-------+
        C   |  xxx  |
        C --+-------+-------+-------+
        C 4mod8   0mod8   4mod8
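        C
        C (Shifting right by shift+32 moves what would be the high dword of
        C a plain shift-by-shift result down into the low dword, where the
        C 4-byte movd stores can pick it up, and the matching left shift
        C used for combining becomes 64-(shift+32) = 32-shift.)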

        movq    %mm1, %mm0
        psrlq   %mm6, %mm1
        addl    $32, %ecx               C shift+32

        movd    %mm1, (%edi)
        movq    %mm0, %mm1
        addl    $4, %edi                C new dst

        movd    %ecx, %mm6
L(start_dst_aligned):


        movq    %mm1, %mm2              C copy of src low two limbs
        negl    %ecx
        andl    $-2, %eax               C round size down to even

        movl    %eax, %ebx
        negl    %eax
        addl    $64, %ecx

        andl    $UNROLL_MASK, %eax
        decl    %ebx

        shll    %eax

        movd    %ecx, %mm7              C lshift = 64-rshift

ifdef(`PIC',`
        call    L(pic_calc)
L(here):
',`
        leal    L(entry) (%eax,%eax,4), %esi
        negl    %eax
')

        shrl    $UNROLL_LOG2, %ebx      C loop counter

        leal    ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
        leal    ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
        movl    PARAM_SIZE, %eax        C for use at end

        jmp     *%esi
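
        C The computed jump enters the unrolled loop part way through.
        C With N the limb count rounded down to even and
        C skip = (-N) mod UNROLL_COUNT, esi = L(entry) + 10*skip, since
        C each limb is 10 code bytes and eax holds 2*skip after the shll,
        C while edx and edi are biased back by 4*skip bytes so that the
        C fixed displacements in the first chunk executed still address
        C the first limbs.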

ifdef(`PIC',`
L(pic_calc):
        C See mpn/x86/README about old gas bugs
        leal    (%eax,%eax,4), %esi
        addl    $L(entry)-L(here), %esi
        addl    (%esp), %esi
        negl    %eax

        ret_internal
')


C -----------------------------------------------------------------------------
        ALIGN(64)
L(top):
        C eax   size, for use at end
        C ebx   loop counter
        C ecx   lshift
        C edx   src
        C esi   was computed jump
        C edi   dst
        C ebp
        C
        C mm0   scratch
        C mm1   \ carry (alternating)
        C mm2   /
        C mm6   rshift
        C mm7   lshift
        C
        C 10 code bytes/limb
        C
        C The two chunks differ in whether mm1 or mm2 hold the carry.
        C The computed jump puts the initial carry in both mm1 and mm2.

L(entry):
deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
        deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
        deflit(`disp1', eval(disp0 + 8))

Zdisp(  movq,   disp0,(%edx), %mm0)
        psrlq   %mm6, %mm2

        movq    %mm0, %mm1
        psllq   %mm7, %mm0

        por     %mm2, %mm0
Zdisp(  movq,   %mm0, disp0,(%edi))


Zdisp(  movq,   disp1,(%edx), %mm0)
        psrlq   %mm6, %mm1

        movq    %mm0, %mm2
        psllq   %mm7, %mm0

        por     %mm1, %mm0
Zdisp(  movq,   %mm0, disp1,(%edi))
')

        addl    $UNROLL_BYTES, %edx
        addl    $UNROLL_BYTES, %edi
        decl    %ebx

        jns     L(top)


deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
deflit(`disp1', eval(disp0-0 + 8))

        testb   $1, %al
        psrlq   %mm6, %mm2      C wanted rshifted in all cases below
        movl    SAVE_ESI, %esi

        movd    %mm5, %eax      C return value

        movl    SAVE_EBX, %ebx
        jz      L(end_even)


        C Size odd, destination was aligned.
        C
        C source
        C                edx
        C +-------+---------------+--
        C |       |      mm2      |
        C +-------+---------------+--
        C
        C dest                            edi
        C +-------+---------------+---------------+--
        C |       |               |    written    |
        C +-------+---------------+---------------+--
        C
        C mm6 = shift
        C mm7 = ecx = 64-shift
        C
        C
        C Size odd, destination was unaligned.
        C
        C source
        C                edx
        C +-------+---------------+--
        C |       |      mm2      |
        C +-------+---------------+--
        C
        C dest            edi
        C +---------------+---------------+--
        C |               |    written    |
        C +---------------+---------------+--
        C
        C mm6 = shift+32
        C mm7 = ecx = 64-(shift+32)
        C
        C
        C In both cases there's one extra limb of src to fetch and combine
        C with mm2 to make a qword to store, and in the aligned case there's
        C a further extra limb of dst to be formed.

        movd    disp0(%edx), %mm0
        movq    %mm0, %mm1

        psllq   %mm7, %mm0
        testb   $32, %cl

        por     %mm2, %mm0
        psrlq   %mm6, %mm1

        movq    %mm0, disp0(%edi)
        jz      L(finish_odd_unaligned)

        movd    %mm1, disp1(%edi)
L(finish_odd_unaligned):

        movl    SAVE_EDI, %edi
        addl    $SAVE_SIZE, %esp
        emms

        ret


L(end_even):

        C Size even, destination was aligned.
        C
        C source
        C +---------------+--
        C |      mm2      |
        C +---------------+--
        C
        C dest            edi
        C +---------------+---------------+--
        C |               |      mm2      |
        C +---------------+---------------+--
        C
        C mm6 = shift
        C mm7 = ecx = 64-shift
        C
        C
        C Size even, destination was unaligned.
        C
        C source
        C +---------------+--
        C |      mm2      |
        C +---------------+--
        C
        C dest    edi
        C +-------+---------------+--
        C |       |      mm2      |
        C +-------+---------------+--
        C
        C mm6 = shift+32
        C mm7 = 64-(shift+32)
        C
        C
        C The movd for the unaligned case is the same data as the movq for
        C the aligned case; it's just a choice of whether one or two limbs
        C should be written.

        testb   $32, %cl
        movd    %mm2, disp0(%edi)

        jz      L(end_even_unaligned)

        movq    %mm2, disp0(%edi)
L(end_even_unaligned):

        movl    SAVE_EDI, %edi
        addl    $SAVE_SIZE, %esp
        emms

        ret

EPILOGUE()