dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
dnl  Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                cycles/limb    cycles/limb    cycles/limb    good
C                  aligned       unaligned      best seen     for cpu?
C AMD K8,K9
C AMD K10           0.85           1.64                       Y/N
C AMD bull          1.4            1.4                        N
C AMD pile          0.77           0.93                       N
C AMD steam          ?              ?
C AMD excavator      ?              ?
C AMD bobcat
C AMD jaguar        0.65           1.02         opt/0.93      Y/N
C Intel P4          2.3            2.3                        Y
C Intel core        1.0            1.0          0.52/0.64     N
C Intel NHM         0.5            0.67                       Y
C Intel SBR          0.51           0.75         opt/0.54      Y/N
C Intel IBR          0.50           0.57         opt/0.54      Y
C Intel HWL          0.50           0.57         opt/0.51      Y
C Intel BWL          0.55           0.62         opt/0.55      Y
C Intel atom
C Intel SLM          1.02           1.27         opt/1.07      Y/N
C VIA nano           1.16           5.16                       Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
C operations.

C Instead of having separate loops for reading aligned and unaligned, we read
C using MOVDQU.  This seems to work great except for core2; there performance
C doubles when reading using MOVDQA (for aligned source).  It is unclear how to
C best handle the unaligned case there.

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
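
C The ABI_SUPPORT lines above record that this code handles both the System V
C (STD64) and Windows (DOS64) x86-64 calling conventions; FUNC_ENTRY(3) and
C FUNC_EXIT() below expand to the parameter shuffling DOS64 needs and to
C nothing for STD64.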

dnl define(`movdqu', lddqu)

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_copyi)
        FUNC_ENTRY(3)
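
C Operands of fewer than 3 limbs go to the basecase code at L(bc).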
        cmp     $3, n                   C NB: bc code below assumes this limit
        jc      L(bc)
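
C If rp is not 16-byte aligned, copy one limb with movsq so that all later
C 16-byte stores can use aligned movdqa.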
        test    $8, R8(rp)              C is rp 16-byte aligned?
        jz      L(ali)                  C jump if rp aligned
        movsq                           C copy single limb
        dec     n

        sub     $16, n
        jc      L(sma)
        ALIGN(16)
L(top): movdqu  (up), %xmm0
        movdqu  16(up), %xmm1
        movdqu  32(up), %xmm2
        movdqu  48(up), %xmm3
        movdqu  64(up), %xmm4
        movdqu  80(up), %xmm5
        movdqu  96(up), %xmm6
        movdqu  112(up), %xmm7
        lea     128(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        movdqa  %xmm4, 64(rp)
        movdqa  %xmm5, 80(rp)
        movdqa  %xmm6, 96(rp)
        movdqa  %xmm7, 112(rp)
        lea     128(rp), rp
L(ali): sub     $16, n
        jnc     L(top)
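
C Tail: copy any remaining 8, 4, 2 and finally 1 limbs, keyed off the
C corresponding bits of n.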
L(sma): test    $8, R8(n)
        jz      1f
        movdqu  (up), %xmm0
        movdqu  16(up), %xmm1
        movdqu  32(up), %xmm2
        movdqu  48(up), %xmm3
        lea     64(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        lea     64(rp), rp
1:      test    $4, R8(n)
        jz      1f
        movdqu  (up), %xmm0
        movdqu  16(up), %xmm1
        lea     32(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        lea     32(rp), rp
1:      test    $2, R8(n)
        jz      1f
        movdqu  (up), %xmm0
        lea     16(up), up
        movdqa  %xmm0, (rp)
        lea     16(rp), rp
        ALIGN(16)
1:
L(end): test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        mov     %r8, (rp)
1:      FUNC_EXIT()
        ret

C Basecase code.  Needed for good small operands speed, not for correctness as
C the above code is currently written.  The commented-out lines need to be
C reinstated if this code is to be used for n > 3, and then the post loop
C offsets need fixing.

L(bc):  sub     $2, n
        jc      L(end)
        ALIGN(16)
1:      mov     (up), %rax
        mov     8(up), %rcx
dnl     lea     16(up), up
        mov     %rax, (rp)
        mov     %rcx, 8(rp)
dnl     lea     16(rp), rp
dnl     sub     $2, n
dnl     jnc     1b

        test    $1, R8(n)
        jz      L(ret)
        mov     16(up), %rax
        mov     %rax, 16(rp)
L(ret): FUNC_EXIT()
        ret
EPILOGUE()