source/libs/gmp/gmp-src/mpn/x86_64/fastsse/lshift.asm

   1 dnl  AMD64 mpn_lshift optimised for CPUs with fast SSE.
   2
   3 dnl  Contributed to the GNU project by David Harvey and Torbjorn Granlund.
   4
   5 dnl  Copyright 2010-2012 Free Software Foundation, Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8 dnl
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of either:
  11 dnl
  12 dnl    * the GNU Lesser General Public License as published by the Free
  13 dnl      Software Foundation; either version 3 of the License, or (at your
  14 dnl      option) any later version.
  15 dnl
  16 dnl  or
  17 dnl
  18 dnl    * the GNU General Public License as published by the Free Software
  19 dnl      Foundation; either version 2 of the License, or (at your option) any
  20 dnl      later version.
  21 dnl
  22 dnl  or both in parallel, as here.
  23 dnl
  24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  27 dnl  for more details.
  28 dnl
  29 dnl  You should have received copies of the GNU General Public License and the
  30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  31 dnl  see https://www.gnu.org/licenses/.
  32
  33 include(`../config.m4')
  34
  35
  36 C            cycles/limb             cycles/limb              good
  37 C          16-byte aligned         16-byte unaligned        for cpu?
  38 C AMD K8,K9      ?                       ?
  39 C AMD K10        1.68  (1.45)            1.75  (1.49)           Y
  40 C AMD bd1        1.82  (1.75)            1.82  (1.75)           Y
  41 C AMD bobcat     4                       4
  42 C Intel P4       3     (2.7)             3     (2.7)            Y
  43 C Intel core2    2.05  (1.67)            2.55  (1.75)
  44 C Intel NHM      2.05  (1.75)            2.09  (2)
  45 C Intel SBR      1.5   (1.3125)          1.5   (1.4375)         Y
  46 C Intel atom     ?                       ?
  47 C VIA nano       2.25  (2)               2.5   (2)              Y
  48
  49 C We try to do as many 16-byte operations as possible.  The top-most and
  50 C bottom-most writes might need 8-byte operations.
  51
  52 C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
  53 C not true.  The aligned case reads 16+8 bytes, the unaligned case reads
  54 C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
  55
  56 C This is not yet great code:
  57 C   (1) The unaligned case makes many reads.
  58 C   (2) We should do some unrolling, at least 2-way.
  59 C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
  60 C Nano.
  61
  62 C INPUT PARAMETERS
     C Symbolic names for the argument registers.  The body reads its arguments
     C straight from these registers.
  63 define(`rp',  `%rdi')  C destination: result limbs are stored through rp
  64 define(`ap',  `%rsi')  C source: operand limbs are loaded through ap
  65 define(`n',   `%rdx')  C limb count; also used as the running limb index
  66 define(`cnt', `%rcx')  C shift count in bits (the body names %rcx directly)
  67
  68 ASM_START()
  69         TEXT
  70         ALIGN(64)
  71 PROLOGUE(mpn_lshift)
     C Shift the n-limb operand at ap left by cnt bits and store the n result
     C limbs at rp.  Each result limb is (ap[i] << cnt) | (ap[i-1] >> (64-cnt)),
     C with zero shifted into the lowest limb.  Limbs are produced from the most
     C significant end downwards.  The bits shifted out of the top limb are left
     C in %rax.
     C
     C Register roles after the setup below:
     C   %xmm4 = cnt      (count for the psllq left shifts)
     C   %xmm5 = 64-cnt   (count for the psrlq right shifts)
     C   %rcx  = scratch  (cnt is no longer needed in a general register)
     C NOTE(review): the shift arithmetic assumes 1 <= cnt <= 63 and n >= 1;
     C confirm against the caller contract.
  72         movd    R32(%rcx), %xmm4            C xmm4 = cnt
  73         mov     $64, R32(%rax)
  74         sub     R32(%rcx), R32(%rax)
  75         movd    R32(%rax), %xmm5            C xmm5 = 64 - cnt
  76
  77         neg     R32(%rcx)                   C cl = -cnt, which acts as 64-cnt when used as a 64-bit shift count
  78         mov     -8(ap,n,8), %rax            C rax = ap[n-1], the top limb
  79         shr     R8(%rcx), %rax              C rax = bits shifted out of the top limb; not modified again
  80
  81         cmp     $2, n
  82         jle     L(le2)                      C n <= 2: handled with 8-byte operations only
  83
     C Only bit 3 of the address rp+8n is examined, so the 32-bit lea result is
     C enough.  This overwrites %rcx.
  84         lea     (rp,n,8), R32(%rcx)
  85         test    $8, R8(%rcx)
  86         je      L(rp_aligned)
  87
  88 C Do one initial limb in order to make rp aligned
     C rp[n-1] = (ap[n-1] << cnt) | (ap[n-2] >> (64-cnt)), then n is reduced by one.
  89         movq    -8(ap,n,8), %xmm0
  90         movq    -16(ap,n,8), %xmm1
  91         psllq   %xmm4, %xmm0
  92         psrlq   %xmm5, %xmm1
  93         por     %xmm1, %xmm0
  94         movq    %xmm0, -8(rp,n,8)
  95         dec     n
  96
  97 L(rp_aligned):
     C Pick the loop from bit 3 of ap+8n: clear selects the aligned loop, set
     C selects the unaligned loop.
  98         lea     (ap,n,8), R32(%rcx)
  99         test    $8, R8(%rcx)
 100         je      L(aent)
 101         jmp     L(uent)
 102 C *****************************************************************************
 103
 104 C Handle the case when ap != rp (mod 16).
     C In the register comments of both loops, n is the limb count before the
     C sub at the loop entry point; each pass stores rp[n-1] and rp[n-2].
 105
 106         ALIGN(16)
 107 L(utop):movdqa  -8(ap,n,8), %xmm0           C xmm0 = B*ap[n-2] + ap[n-3]
 108         movq    (ap,n,8), %xmm1             C xmm1 = ap[n-2]
 109         punpcklqdq  8(ap,n,8), %xmm1        C xmm1 = B*ap[n-1] + ap[n-2]; only the low quadword of the memory operand is used
 110         psllq   %xmm4, %xmm1
 111         psrlq   %xmm5, %xmm0
 112         por     %xmm1, %xmm0
 113         movdqa  %xmm0, (rp,n,8)
 114 L(uent):sub     $2, n
 115         ja      L(utop)                     C more than two limbs were left: take a 16-byte step
 116
 117         jne     L(end8)                     C exactly one limb was left
 118
     C Exactly two limbs were left.  There is no limb below ap[0], so zero is
     C used in its place.
 119         movq    (ap), %xmm1                 C xmm1 = ap[0]
 120         pxor    %xmm0, %xmm0
 121         punpcklqdq  %xmm1, %xmm0            C xmm0 = B*ap[0] + 0
 122         punpcklqdq  8(ap), %xmm1            C xmm1 = B*ap[1] + ap[0]
 123         psllq   %xmm4, %xmm1
 124         psrlq   %xmm5, %xmm0
 125         por     %xmm1, %xmm0
 126         movdqa  %xmm0, (rp)
 127         ret
 128 C *****************************************************************************
 129
 130 C Handle the case when ap = rp (mod 16).
 131
 132         ALIGN(16)
 133 L(atop):movdqa  (ap,n,8), %xmm0         C xmm0 = B*ap[n-1] + ap[n-2]
 134         movq    -8(ap,n,8), %xmm1       C xmm1 = ap[n-3]
 135         punpcklqdq  %xmm0, %xmm1        C xmm1 = B*ap[n-2] + ap[n-3]
 136         psllq   %xmm4, %xmm0
 137         psrlq   %xmm5, %xmm1
 138         por     %xmm1, %xmm0
 139         movdqa  %xmm0, (rp,n,8)
 140 L(aent):
 141         sub     $2, n
 142         ja      L(atop)                     C more than two limbs were left: take a 16-byte step
 143         jne     L(end8)                     C exactly one limb was left
 144
     C Exactly two limbs were left; zero stands in for the limb below ap[0].
 145         movdqa  (ap), %xmm1                 C xmm1 = B*ap[1] + ap[0]
 146         pxor    %xmm0, %xmm0
 147         punpcklqdq  %xmm1, %xmm0            C xmm0 = B*ap[0] + 0
 148         psllq   %xmm4, %xmm1
 149         psrlq   %xmm5, %xmm0
 150         por     %xmm1, %xmm0
 151         movdqa  %xmm0, (rp)
 152         ret
 153 C *****************************************************************************
 154
     C Small operands.  The flags tested here are still those of the cmp $2, n
     C at the top of the function.
 155         ALIGN(16)
 156 L(le2): jne     L(end8)                     C n != 2: only the lowest limb remains to be done
 157
     C n = 2: rp[1] = (ap[1] << cnt) | (ap[0] >> (64-cnt)), then fall through.
 158         movq    8(ap), %xmm0
 159         movq    (ap), %xmm1
 160         psllq   %xmm4, %xmm0
 161         psrlq   %xmm5, %xmm1
 162         por     %xmm1, %xmm0
 163         movq    %xmm0, 8(rp)
 164
     C Lowest limb: rp[0] = ap[0] << cnt.  Shared exit for every path with a
     C single limb left.
 165 L(end8):movq    (ap), %xmm0
 166         psllq   %xmm4, %xmm0
 167         movq    %xmm0, (rp)
 168         ret
 169 EPILOGUE()