source/libs/gmp/gmp-src/mpn/x86_64/fastsse/sec_tabselect.asm

   1 dnl  AMD64 SSE mpn_sec_tabselect.
   2
   3 dnl  Contributed to the GNU project by Torbjörn Granlund.
   4
   5 dnl  Copyright 2011-2013 Free Software Foundation, Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8 dnl
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of either:
  11 dnl
  12 dnl    * the GNU Lesser General Public License as published by the Free
  13 dnl      Software Foundation; either version 3 of the License, or (at your
  14 dnl      option) any later version.
  15 dnl
  16 dnl  or
  17 dnl
  18 dnl    * the GNU General Public License as published by the Free Software
  19 dnl      Foundation; either version 2 of the License, or (at your option) any
  20 dnl      later version.
  21 dnl
  22 dnl  or both in parallel, as here.
  23 dnl
  24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  27 dnl  for more details.
  28 dnl
  29 dnl  You should have received copies of the GNU General Public License and the
  30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  31 dnl  see https://www.gnu.org/licenses/.
  32
  33 include(`../config.m4')
  34
  35
  36 C            cycles/limb     cycles/limb     cycles/limb
  37 C             ali,evn n      unal,evn n       other cases
  38 C AMD K8,K9      1.65           1.65             1.8
  39 C AMD K10        0.78           0.78             0.85
  40 C AMD bd1        0.80           0.91             1.25
  41 C AMD bobcat     2.15           2.15             2.37
  42 C Intel P4       2.5            2.5              2.95
  43 C Intel core2    1.17           1.25             1.25
  44 C Intel NHM      0.87           0.90             0.90
  45 C Intel SBR      0.63           0.79             0.77
  46 C Intel atom     4.3             4.3             4.3    slower than plain code
  47 C VIA nano       1.4             5.1             3.14   too alignment dependent
  48
  49 C NOTES
  50 C  * We only honour the least significant 32 bits of the `which' and `nents'
  51 C    arguments to allow efficient code using just SSE2.  Honouring all 64 bits
  52 C    would need either the SSE4_1 pcmpeqq, or some other SSE2 sequence.
  53 C  * We use movd for copying between xmm and plain registers, since old gas
  54 C    rejects movq.  But gas assembles movd as movq when given a 64-bit greg.
  55
  56 define(`rp',     `%rdi')          C destination pointer, receives the selected limbs
  57 define(`tp',     `%rsi')          C read pointer walking through the table
  58 define(`n',      `%rdx')          C limbs per entry, also the stride between entries
  59 define(`nents',  `%rcx')          C number of table entries scanned per column
  60 define(`which',  `%r8')           C index of the entry to select
  61
  62 define(`i',      `%r10')          C inner loop counter, runs from nents down to 0
  63 define(`j',      `%r9')           C outer loop counter, starts at n - 8 and steps by -8
  64
  65 C rax  rbx  rcx  rdx  rdi  rsi  rbp   r8   r9  r10  r11  r12  r13  r14  r15
  66 C          nents  n   rp   tab       which j    i   temp  *    *    *    *
  67
  68 ABI_SUPPORT(DOS64)
  69 ABI_SUPPORT(STD64)
  70
  71 ASM_START()
  72         TEXT
  73         ALIGN(16)
  74 C mpn_sec_tabselect (rp, tab, n, nents, which)
  75 C
  76 C Copy entry number which (counting from 0) of the table at tab to rp.  The
  77 C table has nents entries of n limbs each, stored back to back.  Every entry
  78 C is loaded and combined under an all-ones or all-zeros mask, so the memory
  79 C access pattern does not depend on which.
  80 C
  81 C Inputs:   rp, tp, n, nents, which in the registers named by the defines above
  82 C Output:   n limbs written at rp
  83 C Clobbers: rax, r9, r10, r11, xmm0-xmm9 and flags; rp and tp are advanced.
  84 C           Under DOS64, xmm6-xmm9 are saved on entry and restored on exit.
  85 C
  86 C Only the low 32 bits of which take part in the comparison, since the
  87 C counter and pcmpeqd work on 32-bit lanes.
  88 C NOTE(review): nents = 0 would make the inner loop counters wrap; callers are
  89 C assumed to pass nents >= 1 -- confirm.
  90
  91 PROLOGUE(mpn_sec_tabselect)
  92         FUNC_ENTRY(4)
  93 C Under DOS64, load which into r8d.  This must stay ahead of the rsp
  94 C adjustment below, because the offset 56 is relative to rsp as FUNC_ENTRY
  95 C leaves it.
  96 IFDOS(` mov     56(%rsp), %r8d  ')
  97
  98 C The Windows x64 ABI makes xmm6-xmm15 callee-saved.  Spill the ones this
  99 C function overwrites (xmm6-xmm9); they are reloaded at the exit label.
 100 C movdqu is used, so no particular rsp alignment is relied upon.  Only the
 101 C first 64 of the 88 bytes are used.
 102 C NOTE(review): 88 presumably keeps rsp 16-byte aligned after the pushes done
 103 C by FUNC_ENTRY -- confirm against its definition.
 104 IFDOS(` add     $-88, %rsp      ')
 105 IFDOS(` movdqu  %xmm6, (%rsp)   ')
 106 IFDOS(` movdqu  %xmm7, 16(%rsp) ')
 107 IFDOS(` movdqu  %xmm8, 32(%rsp) ')
 108 IFDOS(` movdqu  %xmm9, 48(%rsp) ')
 109
 110         movd    which, %xmm8
 111         pshufd  $0, %xmm8, %xmm8        C 4 `which' copies
 112         mov     $1, R32(%rax)
 113         movd    %rax, %xmm9
 114         pshufd  $0, %xmm9, %xmm9        C 4 copies of 1
 115
 116         mov     n, j
 117         add     $-8, j                  C j = n - 8
 118         js      L(outer_end)            C fewer than 8 limbs, only the tail runs
 119
 120 C Main loop: each pass selects one 8-limb (64-byte) column into xmm4-xmm7.
 121 L(outer_top):
 122         mov     nents, i
 123         mov     tp, %r11                C remember where this column starts
 124         pxor    %xmm1, %xmm1            C entry counter k = 0 in every lane
 125         pxor    %xmm4, %xmm4
 126         pxor    %xmm5, %xmm5
 127         pxor    %xmm6, %xmm6
 128         pxor    %xmm7, %xmm7
 129         ALIGN(16)
 130 L(top): movdqa  %xmm8, %xmm0
 131         pcmpeqd %xmm1, %xmm0            C mask = all ones iff k equals which
 132         paddd   %xmm9, %xmm1            C k = k + 1
 133         movdqu  0(tp), %xmm2
 134         movdqu  16(tp), %xmm3
 135         pand    %xmm0, %xmm2
 136         pand    %xmm0, %xmm3
 137         por     %xmm2, %xmm4
 138         por     %xmm3, %xmm5
 139         movdqu  32(tp), %xmm2
 140         movdqu  48(tp), %xmm3
 141         pand    %xmm0, %xmm2
 142         pand    %xmm0, %xmm3
 143         por     %xmm2, %xmm6
 144         por     %xmm3, %xmm7
 145         lea     (tp,n,8), tp            C same column in the next entry
 146         add     $-1, i
 147         jne     L(top)
 148
 149         movdqu  %xmm4, 0(rp)
 150         movdqu  %xmm5, 16(rp)
 151         movdqu  %xmm6, 32(rp)
 152         movdqu  %xmm7, 48(rp)
 153
 154         lea     64(%r11), tp            C next column, back at the first entry
 155         lea     64(rp), rp
 156         add     $-8, j
 157         jns     L(outer_top)
 158 L(outer_end):
 159
 160 C Tail: bits 2, 1 and 0 of n select a 4-limb, 2-limb and 1-limb column in turn.
 161         test    $4, R8(n)
 162         je      L(b0xx)
 163 L(b1xx):mov     nents, i
 164         mov     tp, %r11
 165         pxor    %xmm1, %xmm1            C restart the entry counter
 166         pxor    %xmm4, %xmm4
 167         pxor    %xmm5, %xmm5
 168         ALIGN(16)
 169 L(tp4): movdqa  %xmm8, %xmm0
 170         pcmpeqd %xmm1, %xmm0
 171         paddd   %xmm9, %xmm1
 172         movdqu  0(tp), %xmm2
 173         movdqu  16(tp), %xmm3
 174         pand    %xmm0, %xmm2
 175         pand    %xmm0, %xmm3
 176         por     %xmm2, %xmm4
 177         por     %xmm3, %xmm5
 178         lea     (tp,n,8), tp
 179         add     $-1, i
 180         jne     L(tp4)
 181         movdqu  %xmm4, 0(rp)
 182         movdqu  %xmm5, 16(rp)
 183         lea     32(%r11), tp
 184         lea     32(rp), rp
 185
 186 L(b0xx):test    $2, R8(n)
 187         je      L(b00x)
 188 L(b01x):mov     nents, i
 189         mov     tp, %r11
 190         pxor    %xmm1, %xmm1            C restart the entry counter
 191         pxor    %xmm4, %xmm4
 192         ALIGN(16)
 193 L(tp2): movdqa  %xmm8, %xmm0
 194         pcmpeqd %xmm1, %xmm0
 195         paddd   %xmm9, %xmm1
 196         movdqu  0(tp), %xmm2
 197         pand    %xmm0, %xmm2
 198         por     %xmm2, %xmm4
 199         lea     (tp,n,8), tp
 200         add     $-1, i
 201         jne     L(tp2)
 202         movdqu  %xmm4, 0(rp)
 203         lea     16(%r11), tp
 204         lea     16(rp), rp
 205
 206 L(b00x):test    $1, R8(n)
 207         je      L(b000)
 208 L(b001):mov     nents, i
 209         mov     tp, %r11
 210         pxor    %xmm1, %xmm1            C restart the entry counter
 211         pxor    %xmm4, %xmm4
 212         ALIGN(16)
 213 L(tp1): movdqa  %xmm8, %xmm0
 214         pcmpeqd %xmm1, %xmm0
 215         paddd   %xmm9, %xmm1
 216         movq    0(tp), %xmm2            C single limb, 8-byte load
 217         pand    %xmm0, %xmm2
 218         por     %xmm2, %xmm4
 219         lea     (tp,n,8), tp
 220         add     $-1, i
 221         jne     L(tp1)
 222         movq    %xmm4, 0(rp)            C single limb, 8-byte store
 223
 224 C Under DOS64, reload the callee-saved xmm registers and release the save area.
 225 L(b000):
 226 IFDOS(` movdqu  (%rsp), %xmm6   ')
 227 IFDOS(` movdqu  16(%rsp), %xmm7 ')
 228 IFDOS(` movdqu  32(%rsp), %xmm8 ')
 229 IFDOS(` movdqu  48(%rsp), %xmm9 ')
 230 IFDOS(` add     $88, %rsp       ')
 231         FUNC_EXIT()
 232         ret
 233 EPILOGUE()