dnl  AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb  cycles/limb  cycles/limb
C                  aligned      unaligned    best seen    good for cpu?
C AMD K8,K9        3            3            2.35         no, use shl/shr
C AMD K10          1.5-1.8      1.5-1.8      1.33         yes
C AMD bd1          1.7-1.9      1.7-1.9      1.33         yes
C AMD bobcat       3.17         3.17                      yes, bad for n < 20
C Intel P4         4.67         4.67         2.7          no, slow movdqu
C Intel core2      2.15         2.15         1.25         no, use shld/shrd
C Intel NHM        1.66         1.66         1.25         no, use shld/shrd
C Intel SBR        1.3          1.3          1.25         yes, bad for n = 4-6
C Intel atom       11.7         11.7         4.5          no
C VIA nano         5.7          5.95         2.0          no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.

C This variant relies on fast movdqu loads, and uses movdqu even for aligned
C operands, in order to avoid the need for two separate loops.

C TODO
C  * Could the 2-limb wind-down code be simplified?
C  * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
C    for other affected CPUs.

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')
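
C A rough C sketch of the operation performed below (a hedged illustration,
C assuming 64-bit limbs and 1 <= cnt <= 63; the return value is the bits
C shifted out of the top limb):
C
C   mp_limb_t mpn_lshift (mp_limb_t *rp, const mp_limb_t *ap, long n, unsigned cnt)
C   {
C     mp_limb_t retval = ap[n - 1] >> (64 - cnt);
C     for (long i = n - 1; i > 0; i--)      /* high-to-low, so rp may equal ap */
C       rp[i] = (ap[i] << cnt) | (ap[i - 1] >> (64 - cnt));
C     rp[0] = ap[0] << cnt;
C     return retval;
C   }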

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_lshift)
	FUNC_ENTRY(4)
	movd	R32(%rcx), %xmm4
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5
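
C xmm4 = cnt and xmm5 = 64-cnt; each output limb below is formed as
C (a[i] << cnt) | (a[i-1] >> (64-cnt)).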
	neg	R32(%rcx)
	mov	-8(ap,n,8), %rax
	shr	R8(%rcx), %rax
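
C With cl = -cnt the shr shifts by (64-cnt) mod 64, so rax now holds
C ap[n-1] >> (64-cnt), the out-shifted high bits, i.e. the return value.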

	cmp	$3, n
	jle	L(bc)
	lea	(rp,n,8), R32(%rcx)
	test	$8, R8(%rcx)
	jz	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	-8(ap,n,8), %xmm0
	movq	-16(ap,n,8), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movq	%xmm0, -8(rp,n,8)
	dec	n
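
C Branch into the 4-way unrolled loop at an entry point chosen from n mod 8,
C biasing n so that each full pass through L(top) shifts 8 limbs; the last one
C or two limbs are handled after L(end).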
L(rp_aligned):
	lea	1(n), %r8d
	and	$6, R32(%r8)
	jz	L(ba0)
	cmp	$4, R32(%r8)
	jz	L(ba4)
	jc	L(ba2)
L(ba6):	add	$-4, n
	jmp	L(i56)
L(ba0):	add	$-6, n
	jmp	L(i70)
L(ba4):	add	$-2, n
	jmp	L(i34)
L(ba2):	add	$-8, n
	jle	L(end)
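
C Main loop: each iteration shifts 8 limbs, combining pairs of unaligned
C 16-byte loads (movdqu) into aligned 16-byte stores (movdqa).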
	ALIGN(16)
L(top):	movdqu	40(ap,n,8), %xmm1
	movdqu	48(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 48(rp,n,8)
L(i70):
	movdqu	24(ap,n,8), %xmm1
	movdqu	32(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 32(rp,n,8)
L(i56):
	movdqu	8(ap,n,8), %xmm1
	movdqu	16(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 16(rp,n,8)
L(i34):
	movdqu	-8(ap,n,8), %xmm1
	movdqu	(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp,n,8)
	sub	$8, n
	jg	L(top)

L(end):	test	$1, R8(n)
	jnz	L(end8)
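
C Two limbs remain: one 16-byte store writes rp[1] = (ap[1] << cnt) | (ap[0] >> (64-cnt))
C and rp[0] = ap[0] << cnt.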
	movdqu	(ap), %xmm1
	pxor	%xmm0, %xmm0
	punpcklqdq %xmm1, %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret

C Basecase
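C Handle n <= 3 limbs with 64-bit operations, high limb first; the lowest
C limb is finished at L(end8).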
	ALIGN(16)
L(bc):	dec	R32(n)
	jz	L(end8)

	movq	(ap,n,8), %xmm1
	movq	-8(ap,n,8), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, (rp,n,8)
	sub	$2, R32(n)
	jl	L(end8)
	movq	8(ap), %xmm1
	movq	(ap), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, 8(rp)
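
C Final (lowest) limb: rp[0] = ap[0] << cnt.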
L(end8):movq	(ap), %xmm0
	psllq	%xmm4, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()