source/libs/gmp/gmp-src/mpn/x86_64/fastsse/com.asm

   1 dnl  AMD64 mpn_com optimised for CPUs with fast SSE.
   2
   3 dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
   4 dnl  Inc.
   5
   6 dnl  Contributed to the GNU project by Torbjorn Granlund.
   7
   8 dnl  This file is part of the GNU MP Library.
   9 dnl
  10 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  11 dnl  it under the terms of either:
  12 dnl
  13 dnl    * the GNU Lesser General Public License as published by the Free
  14 dnl      Software Foundation; either version 3 of the License, or (at your
  15 dnl      option) any later version.
  16 dnl
  17 dnl  or
  18 dnl
  19 dnl    * the GNU General Public License as published by the Free Software
  20 dnl      Foundation; either version 2 of the License, or (at your option) any
  21 dnl      later version.
  22 dnl
  23 dnl  or both in parallel, as here.
  24 dnl
  25 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  26 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  27 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  28 dnl  for more details.
  29 dnl
  30 dnl  You should have received copies of the GNU General Public License and the
  31 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  32 dnl  see https://www.gnu.org/licenses/.
  33
  34 include(`../config.m4')
  35
  36 C            cycles/limb     cycles/limb     cycles/limb      good
  37 C              aligned        unaligned       best seen      for cpu?
  38 C AMD K8,K9      2.0             2.0                            N
  39 C AMD K10        0.85            1.3                            Y/N
  40 C AMD bull       1.40            1.40                           Y
  41 C AMD pile     0.9-1.4         0.9-1.4                          Y
  42 C AMD steam
  43 C AMD excavator
  44 C AMD bobcat     3.1             3.1                            N
  45 C AMD jaguar     0.91            0.91           opt/opt         Y
  46 C Intel P4       2.28            illop                          Y
  47 C Intel core2    1.02            1.02                           N
  48 C Intel NHM      0.53            0.68                           Y
  49 C Intel SBR      0.51            0.75           opt/0.65        Y/N
  50 C Intel IBR      0.50            0.57           opt/opt         Y
  51 C Intel HWL      0.51            0.64           opt/0.58        Y
  52 C Intel BWL      0.61            0.65           0.57/opt        Y
  53 C Intel atom     3.68            3.68                           N
  54 C Intel SLM      1.09            1.35                           N
  55 C VIA nano       1.17            5.09                           Y/N
  56
  57 C We try to do as many 16-byte operations as possible.  The top-most and
  58 C bottom-most writes might need 8-byte operations.  We can always write using
  59 C aligned 16-byte operations, we read with both aligned and unaligned 16-byte
  60 C operations.
  61
  62 C Instead of having separate loops for reading aligned and unaligned, we read
  63 C using MOVDQU.  This seems to work great except for core2; there performance
  64 C doubles when reading using MOVDQA (for aligned source).  It is unclear how to
  65 C best handle the unaligned case there.
  66
  67 C INPUT PARAMETERS
  68 define(`rp', `%rdi')
  69 define(`up', `%rsi')
  70 define(`n',  `%rdx')
  71
  72 ABI_SUPPORT(DOS64)
  73 ABI_SUPPORT(STD64)
  74
  75 ASM_START()
  76         TEXT
  77         ALIGN(16)
  78 PROLOGUE(mpn_com)
  79         FUNC_ENTRY(3)
  80
  81         pcmpeqb %xmm7, %xmm7            C set to 111...111
  82
  83         test    $8, R8(rp)              C is rp 16-byte aligned?
  84         jz      L(ali)                  C jump if rp aligned
  85         mov     (up), %rax
  86         lea     8(up), up
  87         not     %rax
  88         mov     %rax, (rp)
  89         lea     8(rp), rp
  90         dec     n
  91
  92         sub     $14, n
  93         jc      L(sma)
  94
  95         ALIGN(16)
  96 L(top): movdqu  (up), %xmm0
  97         movdqu  16(up), %xmm1
  98         movdqu  32(up), %xmm2
  99         movdqu  48(up), %xmm3
 100         movdqu  64(up), %xmm4
 101         movdqu  80(up), %xmm5
 102         movdqu  96(up), %xmm6
 103         lea     112(up), up
 104         pxor    %xmm7, %xmm0
 105         pxor    %xmm7, %xmm1
 106         pxor    %xmm7, %xmm2
 107         pxor    %xmm7, %xmm3
 108         pxor    %xmm7, %xmm4
 109         pxor    %xmm7, %xmm5
 110         pxor    %xmm7, %xmm6
 111         movdqa  %xmm0, (rp)
 112         movdqa  %xmm1, 16(rp)
 113         movdqa  %xmm2, 32(rp)
 114         movdqa  %xmm3, 48(rp)
 115         movdqa  %xmm4, 64(rp)
 116         movdqa  %xmm5, 80(rp)
 117         movdqa  %xmm6, 96(rp)
 118         lea     112(rp), rp
 119 L(ali): sub     $14, n
 120         jnc     L(top)
 121
 122 L(sma): add     $14, n
 123         test    $8, R8(n)
 124         jz      1f
 125         movdqu  (up), %xmm0
 126         movdqu  16(up), %xmm1
 127         movdqu  32(up), %xmm2
 128         movdqu  48(up), %xmm3
 129         lea     64(up), up
 130         pxor    %xmm7, %xmm0
 131         pxor    %xmm7, %xmm1
 132         pxor    %xmm7, %xmm2
 133         pxor    %xmm7, %xmm3
 134         movdqa  %xmm0, (rp)
 135         movdqa  %xmm1, 16(rp)
 136         movdqa  %xmm2, 32(rp)
 137         movdqa  %xmm3, 48(rp)
 138         lea     64(rp), rp
 139 1:
 140         test    $4, R8(n)
 141         jz      1f
 142         movdqu  (up), %xmm0
 143         movdqu  16(up), %xmm1
 144         lea     32(up), up
 145         pxor    %xmm7, %xmm0
 146         pxor    %xmm7, %xmm1
 147         movdqa  %xmm0, (rp)
 148         movdqa  %xmm1, 16(rp)
 149         lea     32(rp), rp
 150 1:
 151         test    $2, R8(n)
 152         jz      1f
 153         movdqu  (up), %xmm0
 154         lea     16(up), up
 155         pxor    %xmm7, %xmm0
 156         movdqa  %xmm0, (rp)
 157         lea     16(rp), rp
 158 1:
 159         test    $1, R8(n)
 160         jz      1f
 161         mov     (up), %rax
 162         not     %rax
 163         mov     %rax, (rp)
 164 1:
 165 L(don): FUNC_EXIT()
 166         ret
 167 EPILOGUE()