dnl  AMD K7 mpn_lshift -- mpn left shift.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')
C           K7: 1.21 cycles/limb (at 16 limbs/loop).


dnl  K7: UNROLL_COUNT cycles/limb
dnl           4           1.51
dnl           8           1.26
dnl          16           1.21
dnl          32           1.2
dnl  Maximum possible with the current code is 64.

deflit(UNROLL_COUNT, 16)
C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size left by shift many bits and store the result in dst,size.
C Zeros are shifted in at the right.  The bits shifted out at the left are
C the return value.
C
C The comments in mpn_rshift apply here too.
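C
C As a rough reference only (an illustrative sketch, not the code below;
C ref_lshift is a placeholder name, and 32-bit limbs with 1 <= shift <= 31
C are assumed), the operation is
C
C   mp_limb_t
C   ref_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift)
C   {
C     mp_limb_t retval = src[size-1] >> (32 - shift);
C     mp_size_t i;
C     for (i = size-1; i > 0; i--)
C       dst[i] = (src[i] << shift) | (src[i-1] >> (32 - shift));
C     dst[0] = src[0] << shift;
C     return retval;
C   }
C
C running from the high end down, as the code below also does.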
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 10)
',`
deflit(UNROLL_THRESHOLD, 10)
')
defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

defframe(SAVE_EDI, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
deflit(SAVE_SIZE, 12)
        TEXT
        ALIGN(32)

PROLOGUE(mpn_lshift)
deflit(`FRAME',0)

        movl    PARAM_SIZE, %eax
        movl    PARAM_SRC, %edx
        subl    $SAVE_SIZE, %esp
deflit(`FRAME',SAVE_SIZE)

        movl    PARAM_SHIFT, %ecx
        movl    %edi, SAVE_EDI

        movl    PARAM_DST, %edi
        decl    %eax
        jnz     L(more_than_one_limb)

        movl    (%edx), %edx

        shldl(  %cl, %edx, %eax)        C eax was decremented to zero

        shll    %cl, %edx

        movl    %edx, (%edi)
        movl    SAVE_EDI, %edi
        addl    $SAVE_SIZE, %esp

        ret
C -----------------------------------------------------------------------------
L(more_than_one_limb):
        C eax   size-1
        C ebx
        C ecx   shift
        C edx   src
        C esi
        C edi   dst
        C ebp
        movd    PARAM_SHIFT, %mm6
        movd    (%edx,%eax,4), %mm5     C src high limb
        cmp     $UNROLL_THRESHOLD-1, %eax

        jae     L(unroll)
        negl    %ecx
        movd    (%edx), %mm4            C src low limb

        addl    $32, %ecx

        movd    %ecx, %mm7
L(simple_top):
        C eax   loop counter, limbs
        C ebx
        C ecx
        C edx   src
        C esi
        C edi   dst
        C ebp
        C
        C mm0   scratch
        C mm4   src low limb
        C mm5   src high limb
        C mm6   shift
        C mm7   32-shift
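        C
        C In effect each pass below computes (illustrative note only)
        C
        C   dst[i] = (src[i] << shift) | (src[i-1] >> (32-shift));
        C
        C the movq fetches src[i-1],src[i] as one qword and the psrlq by
        C 32-shift leaves exactly that limb in the low half of mm0.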
        movq    -4(%edx,%eax,4), %mm0
        decl    %eax

        psrlq   %mm7, %mm0

        movd    %mm0, 4(%edi,%eax,4)
        jnz     L(simple_top)


        psllq   %mm6, %mm5
        psllq   %mm6, %mm4

        psrlq   $32, %mm5
        movd    %mm4, (%edi)            C dst low limb

        movd    %mm5, %eax              C return value

        movl    SAVE_EDI, %edi
        addl    $SAVE_SIZE, %esp
        emms

        ret
C -----------------------------------------------------------------------------
        ALIGN(16)
L(unroll):
        C eax   size-1
        C ebx   (saved)
        C ecx   shift
        C edx   src
        C esi
        C edi   dst
        C ebp
        C
        C mm5   src high limb, for return value
        C mm6   lshift
        movl    %esi, SAVE_ESI
        movl    %ebx, SAVE_EBX
        leal    -4(%edx,%eax,4), %edx   C &src[size-2]

        testb   $4, %dl
        movq    (%edx), %mm1            C src high qword

        jz      L(start_src_aligned)
        C src isn't aligned, process high limb (marked xxx) separately to
        C make it so
        C
        C  source     -4(edx,%eax,4)
        C                  |
        C  +-------+-------+-------+--
        C  |  xxx          |
        C  +-------+-------+-------+--
        C        0mod8   4mod8   0mod8
        C
        C  dest         -4(edi,%eax,4)
        C                  |
        C  +-------+-------+--
        C  |  xxx  |       |
        C  +-------+-------+--
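        C
        C In C terms this fixup amounts to (illustrative note only)
        C
        C   dst[size-1] = (src[size-1] << shift) | (src[size-2] >> (32-shift));
        C   size--;             /* the remaining high qword is now 0mod8 */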
        psllq   %mm6, %mm1
        subl    $4, %edx
        movl    %eax, PARAM_SIZE        C size-1

        psrlq   $32, %mm1
        decl    %eax                    C size-2 is new size-1

        movd    %mm1, 4(%edi,%eax,4)
        movq    (%edx), %mm1            C new src high qword
L(start_src_aligned):
        leal    -4(%edi,%eax,4), %edi   C &dst[size-2]
        psllq   %mm6, %mm5

        testl   $4, %edi
        psrlq   $32, %mm5               C return value

        jz      L(start_dst_aligned)
        C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
        C shift is 32 bits extra.  High limb of dst (marked xxx) handled
        C here separately.
        C
        C  source       %edx
        C  +-------+-------+--
        C  |      mm1      |
        C  +-------+-------+--
        C                0mod8   4mod8
        C
        C  dest         %edi
        C  +-------+-------+-------+--
        C  |  xxx          |
        C  +-------+-------+-------+--
        C        0mod8   4mod8   0mod8
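        C
        C Why this works (illustrative note only): stepping dst back one
        C limb makes every later qword store land one limb lower, while
        C adding 32 to the shift moves every bit of the result one limb
        C higher, since for any 64-bit piece x
        C
        C   x << (shift+32)  ==  (x << shift) << 32
        C
        C so the two adjustments cancel and the stored limbs are unchanged.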
        movq    %mm1, %mm0
        psllq   %mm6, %mm1
        addl    $32, %ecx               C shift+32

        psrlq   $32, %mm1

        movd    %mm1, 4(%edi)
        movq    %mm0, %mm1
        subl    $4, %edi

        movd    %ecx, %mm6              C new lshift
L(start_dst_aligned):
        decl    %eax                    C size-2, two last limbs handled at end
        movq    %mm1, %mm2              C copy of src high qword
        negl    %ecx

        andl    $-2, %eax               C round size down to even
        addl    $64, %ecx

        movl    %eax, %ebx
        negl    %eax

        andl    $UNROLL_MASK, %eax
        decl    %ebx

        shll    %eax

        movd    %ecx, %mm7              C rshift = 64-lshift
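        C
        C What follows computes the entry point for a partial first pass
        C (illustrative note only): %eax is now twice the number of unneeded
        C limbs, modulo UNROLL_COUNT, so L(entry) + 5*%eax skips 10 code
        C bytes per unneeded limb (see "10 code bytes/limb" at L(top)), and
        C the two leal adjustments below step %edx and %edi by a matching
        C number of limbs.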
ifdef(`PIC',`
        call    L(pic_calc)
L(here):
',`
        leal    L(entry) (%eax,%eax,4), %esi
')

        shrl    $UNROLL_LOG2, %ebx      C loop counter

        leal    ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
        leal    ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
        movl    PARAM_SIZE, %eax        C for use at end

        jmp     *%esi
ifdef(`PIC',`
L(pic_calc):
        C See mpn/x86/README about old gas bugs
        leal    (%eax,%eax,4), %esi
        addl    $L(entry)-L(here), %esi
        addl    (%esp), %esi

        ret_internal
')
C -----------------------------------------------------------------------------
        ALIGN(32)
L(top):
        C eax   size (for use at end)
        C ebx   loop counter
        C ecx   rshift
        C edx   src
        C esi   computed jump
        C edi   dst
        C ebp
        C
        C mm0   scratch
        C mm1   \ carry (alternating, mm2 first)
        C mm2   /
        C mm6   lshift
        C mm7   rshift
        C
        C 10 code bytes/limb
        C
        C The two chunks differ in whether mm1 or mm2 hold the carry.
        C The computed jump puts the initial carry in both mm1 and mm2.
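        C
        C In C terms, with s[] and d[] viewed as 64-bit pieces of src and
        C dst and q running from high to low, each half-chunk amounts to
        C (illustrative sketch only)
        C
        C   d[q] = (s[q] << lshift) | (s[q-1] >> rshift);  /* rshift = 64-lshift */
        C
        C where s[q] is the qword carried over in mm1 or mm2 from the
        C previous (higher) iteration and s[q-1] is the fresh movq load.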
L(entry):
deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
        deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
        deflit(`disp1', eval(disp0 - 8))

Zdisp(  movq,   disp0,(%edx), %mm0)
        psllq   %mm6, %mm2

        movq    %mm0, %mm1
        psrlq   %mm7, %mm0

        por     %mm2, %mm0
Zdisp(  movq,   %mm0, disp0,(%edi))


Zdisp(  movq,   disp1,(%edx), %mm0)
        psllq   %mm6, %mm1

        movq    %mm0, %mm2
        psrlq   %mm7, %mm0

        por     %mm1, %mm0
Zdisp(  movq,   %mm0, disp1,(%edi))
')

        subl    $UNROLL_BYTES, %edx
        subl    $UNROLL_BYTES, %edi
        decl    %ebx

        jns     L(top)
define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')

L(end):
        testb   $1, %al
        movl    SAVE_EBX, %ebx
        psllq   %mm6, %mm2      C wanted left shifted in all cases below

        movd    %mm5, %eax

        movl    SAVE_ESI, %esi
        jz      L(end_even)
L(end_odd):

        C Size odd, destination was aligned.
        C
        C                 source        edx+8   edx+4
        C                 --+---------------+-------+
        C                   |      mm2      |       |
        C                 --+---------------+-------+
        C
        C dest                            edi
        C --+---------------+---------------+-------+
        C   |    written    |               |       |
        C --+---------------+---------------+-------+
        C
        C mm6 = shift
        C mm7 = ecx = 64-shift


        C Size odd, destination was unaligned.
        C
        C                 source        edx+8   edx+4
        C                 --+---------------+-------+
        C                   |      mm2      |       |
        C                 --+---------------+-------+
        C
        C         dest                     edi
        C         --+---------------+---------------+
        C           |    written    |               |
        C         --+---------------+---------------+
        C
        C mm6 = shift+32
        C mm7 = ecx = 64-(shift+32)


        C In both cases there's one extra limb of src to fetch and combine
        C with mm2 to make a qword at (%edi), and in the aligned case
        C there's an extra limb of dst to be formed from that extra src limb
        C left shifted.
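        C
        C Concretely (illustrative note only): the qword stored at (%edi)
        C below is mm2 (already left shifted) combined with the extra src
        C limb placed in the high dword and shifted right by mm7, and in
        C the aligned case the movd also stores that extra src limb left
        C shifted by mm6 as one more dst limb just below it.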
        movd    disp(4) (%edx), %mm0
        testb   $32, %cl

        movq    %mm0, %mm1
        psllq   $32, %mm0

        psrlq   %mm7, %mm0
        psllq   %mm6, %mm1

        por     %mm2, %mm0

        movq    %mm0, disp(0) (%edi)
        jz      L(end_odd_unaligned)
        movd    %mm1, disp(-4) (%edi)
L(end_odd_unaligned):

        movl    SAVE_EDI, %edi
        addl    $SAVE_SIZE, %esp
        emms

        ret

L(end_even):

        C Size even, destination was aligned.
        C
        C source          edx+8
        C --+---------------+
        C   |      mm2      |
        C --+---------------+
        C
        C dest            edi
        C --+---------------+---------------+
        C   |    written    |               |
        C --+---------------+---------------+
        C
        C mm6 = shift
        C mm7 = ecx = 64-shift


        C Size even, destination was unaligned.
        C
        C  source          edx+8
        C  --+---------------+
        C    |      mm2      |
        C  --+---------------+
        C
        C  dest         edi+4
        C  --+---------------+-------+
        C    |    written    |       |
        C  --+---------------+-------+
        C
        C  mm6 = shift+32
        C  mm7 = ecx = 64-(shift+32)


        C The movq for the aligned case overwrites the movd for the
        C unaligned case.
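        C
        C That is (illustrative note only): the movd below unconditionally
        C writes the high half of mm2 as the top dst limb, which is all the
        C unaligned case needs; in the aligned case a full qword is wanted,
        C so the movq then stores both halves over it rather than branching
        C around the movd.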
        movq    %mm2, %mm0
        psrlq   $32, %mm2

        testb   $32, %cl
        movd    %mm2, disp(4) (%edi)

        jz      L(end_even_unaligned)
        movq    %mm0, disp(0) (%edi)
L(end_even_unaligned):

        movl    SAVE_EDI, %edi
        addl    $SAVE_SIZE, %esp
        emms

        ret

EPILOGUE()