dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')
C                 cycles/limb   cycles/limb   cycles/limb   good
C                   aligned      unaligned     best seen    for cpu?
C AMD K8,K9          3            3             ?           no, use shl/shr
C AMD K10            1.8-2.0      1.8-2.0       ?           yes
C AMD bd1            1.9          1.9           ?           yes
C AMD bobcat         3.67         3.67                      yes, bad for n < 20
C Intel P4           4.75         4.75          ?           no, slow movdqu
C Intel core2        2.27         2.27          ?           no, use shld/shrd
C Intel NHM          2.15         2.15          ?           no, use shld/shrd
C Intel SBR          1.45         1.45          ?           yes, bad for n = 4-6
C Intel atom        12.9         12.9           ?           no
C VIA nano           6.18         6.44          ?           no, slow movdqu
C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.

C This variant relies on fast movdqu loads, and uses them even for aligned
C operands, in order to avoid the need for two separate loops.
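C
C As a hedged illustration (not used by the build): in each 16-byte step
C below, a pair of limbs loaded from ap is shifted left by cnt (psllq), the
C pair starting one limb lower is shifted right by 64-cnt (psrlq), the two are
C or-ed together (por), and the result is complemented by pxor against an
C all-ones register, i.e. two limbs of
C
C   rp[i] = ~((ap[i] << cnt) | (ap[i-1] >> (64 - cnt)))
C
C are produced per store, assuming 64-bit limbs and 1 <= cnt < 64.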
C TODO
C  * Could 2-limb wind-down code be simplified?
C  * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
C    for other affected CPUs.
C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')
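C
C The defines above map the arguments to the System V AMD64 argument
C registers.  For reference, a hedged C-level sketch of the whole operation
C (not part of the build), assuming 64-bit limbs, no nails and 1 <= cnt < 64;
C mpn_lshiftc shifts left, complements the result, and returns the bits
C shifted out of the top limb:
C
C   mp_limb_t
C   mpn_lshiftc (mp_ptr rp, mp_srcptr ap, mp_size_t n, unsigned int cnt)
C   {
C     unsigned int tnc = 64 - cnt;
C     mp_limb_t retval = ap[n - 1] >> tnc;
C     for (mp_size_t i = n - 1; i > 0; i--)
C       rp[i] = ~((ap[i] << cnt) | (ap[i - 1] >> tnc));
C     rp[0] = ~(ap[0] << cnt);
C     return retval;
C   }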
ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_lshiftc)
	FUNC_ENTRY(4)
	movd	R32(%rcx), %xmm4
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5
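C xmm4 = cnt and xmm5 = 64-cnt, the two shift counts used by the psllq/psrlq
C pairs below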
	neg	R32(%rcx)
	mov	-8(ap,n,8), %rax
	shr	R8(%rcx), %rax
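C %rax = ap[n-1] >> (64-cnt): the bits shifted out at the top, i.e. the value
C returned by the function (shr reduces the negated count mod 64)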
	pcmpeqb	%xmm3, %xmm3		C set to 111...111

	cmp	$3, n
	jle	L(bc)

	lea	(rp,n,8), R32(%rcx)
	test	$8, R8(%rcx)
	jz	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	-8(ap,n,8), %xmm0
	movq	-16(ap,n,8), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, -8(rp,n,8)
	dec	n

L(rp_aligned):
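C Dispatch on n mod 8: the lea/and below picks one of four entry points into
C the 8-limb unrolled loop, and n is biased so that the loop exits with one or
C two limbs left over for the wind-down code at L(end)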
	lea	1(n), %r8d
	and	$6, R32(%r8)
	jz	L(ba0)
	cmp	$4, R32(%r8)
	jz	L(ba4)
	jc	L(ba2)
L(ba6):	add	$-4, n
	jmp	L(i56)
L(ba0):	add	$-6, n
	jmp	L(i70)
L(ba4):	add	$-2, n
	jmp	L(i34)
L(ba2):	add	$-8, n
	jle	L(end)
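C Main loop: eight limbs (four 16-byte stores) per iteration.  Loads use
C movdqu so ap may be unaligned; stores use movdqa, which is why rp was made
C 16-byte aligned above.  pxor with the all-ones xmm3 forms the complement.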
	ALIGN(16)
L(top):	movdqu	40(ap,n,8), %xmm1
	movdqu	48(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, 48(rp,n,8)
L(i70):
	movdqu	24(ap,n,8), %xmm1
	movdqu	32(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, 32(rp,n,8)
L(i56):
	movdqu	8(ap,n,8), %xmm1
	movdqu	16(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, 16(rp,n,8)
L(i34):
	movdqu	-8(ap,n,8), %xmm1
	movdqu	(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, (rp,n,8)
	sub	$8, n
	jg	L(top)
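C Wind-down: at this point one or two limbs remain.  If n is odd here only the
C bottom limb is left and is handled at L(end8); otherwise two limbs are
C produced with a single 16-byte store below.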
L(end):	test	$1, R8(n)
	jnz	L(end8)

	movdqu	(ap), %xmm1
	pxor	%xmm0, %xmm0
	punpcklqdq %xmm1, %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret
C Basecase
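C The basecase handles n <= 3: limbs are produced one at a time with 64-bit
C movq operations, top limb first; L(end8) finally writes
C rp[0] = ~(ap[0] << cnt).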
	ALIGN(16)
L(bc):	dec	R32(n)
	jz	L(end8)

	movq	(ap,n,8), %xmm1
	movq	-8(ap,n,8), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, (rp,n,8)
	sub	$2, R32(n)
	jl	L(end8)
	movq	8(ap), %xmm1
	movq	(ap), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, 8(rp)
L(end8):movq	(ap), %xmm0
	psllq	%xmm4, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()