dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb   cycles/limb   cycles/limb   good
C                  aligned       unaligned     best seen     for cpu?
C AMD K8,K9          2.0           illop        1.0/1.0       N
C AMD K10            0.85          illop                      Y/N
C AMD bull           0.70          0.66                       Y
C AMD pile           0.68          0.66                       Y
C AMD steam          ?             ?
C AMD excavator      ?             ?
C AMD bobcat         1.97          8.16         1.5/1.5       N
C AMD jaguar         0.77          0.93         0.65/opt      N/Y
C Intel P4           2.26          illop                      Y/N
C Intel core         0.52          0.64         opt/opt       Y
C Intel NHM          0.52          0.71         opt/opt       Y
C Intel SBR          0.51          0.54         opt/0.51      Y
C Intel IBR          0.50          0.54         opt/opt       Y
C Intel HWL          0.50          0.51         opt/opt       Y
C Intel BWL          0.55          0.55         opt/opt       Y
C Intel atom         1.16          1.61         opt/opt       Y
C Intel SLM          1.02          1.07         opt/opt       Y
C VIA nano           1.09          1.08         opt/opt       Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
C instruction is better adapted to mpn_copyd's needs, so we need to contort the
C code to use it here.
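
C The sketch below is illustrative only (not part of the original code nor of
C the build): it shows how the palignr path assembles one aligned 16-byte
C store from two aligned loads, as the 2-limb tail under L(ued0) does:
C
C	movdqa	8(up), %xmm0		C aligned load of up[1],up[2]
C	movdqa	-8(up), %xmm3		C aligned load of up[-1],up[0]
C	palignr($8, %xmm3, %xmm0)	C xmm0 = up[1]:up[0]
C	movdqa	%xmm0, (rp)		C one aligned 2-limb store
C
C The store thus takes the high half of the lower load and the low half of
C the upper load.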

C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
C taken from the x86_64 default code.

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We
C use movaps, since it has the shortest coding.
dnl define(`movdqa', ``movaps'')

ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_copyi)
	FUNC_ENTRY(3)

	cmp	$COPYI_SSE_THRESHOLD, n
	jbe	L(bc)			C n <= threshold: plain 64-bit basecase

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(rp_aligned)		C jump if rp aligned
	movsq				C copy one limb
	dec	n

L(rp_aligned):
	test	$8, R8(up)
	jnz	L(uent)
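
C Fall through: up and rp are now both 16-byte aligned.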

ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`	sub	$8, n',
`	jmp	L(am)')
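
C Main aligned loop: copy eight limbs (64 bytes) per iteration.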

	ALIGN(16)
L(atop):movdqa	0(up), %xmm0
	movdqa	16(up), %xmm1
	movdqa	32(up), %xmm2
	movdqa	48(up), %xmm3
	lea	64(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp
L(am):	sub	$8, n
	jnc	L(atop)
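
C Fewer than eight limbs remain; their count sits in the low bits of n.
C Finish in 4-, 2-, then 1-limb steps.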

	test	$4, R8(n)
	jz	1f
	movdqa	(up), %xmm0
	movdqa	16(up), %xmm1
	lea	32(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	(up), %xmm0
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

L(uent):
C Code handling up - rp = 8 (mod 16)

	cmp	$16, n
	jc	L(ued0)
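
C The Win64 ABI treats xmm6-xmm15 as callee-saved; preserve the ones
C clobbered below.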

IFDOS(`	add	$-56, %rsp	')
IFDOS(`	movdqa	%xmm6, (%rsp)	')
IFDOS(`	movdqa	%xmm7, 16(%rsp)	')
IFDOS(`	movdqa	%xmm8, 32(%rsp)	')

	movaps	120(up), %xmm7
	movaps	104(up), %xmm6
	movaps	88(up), %xmm5
	movaps	72(up), %xmm4
	movaps	56(up), %xmm3
	movaps	40(up), %xmm2
	lea	128(up), up
	sub	$32, n
	jc	L(ued1)
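
C Software-pipelined main loop: 16 limbs (128 bytes) per iteration, each
C 16-byte store combining two aligned loads via palignr.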

	ALIGN(16)
L(utop):movaps	-104(up), %xmm1
	sub	$16, n
	movaps	-120(up), %xmm0
	palignr($8, %xmm6, %xmm7)
	movaps	-136(up), %xmm8
	movdqa	%xmm7, 112(rp)
	palignr($8, %xmm5, %xmm6)
	movaps	120(up), %xmm7
	movdqa	%xmm6, 96(rp)
	palignr($8, %xmm4, %xmm5)
	movaps	104(up), %xmm6
	movdqa	%xmm5, 80(rp)
	palignr($8, %xmm3, %xmm4)
	movaps	88(up), %xmm5
	movdqa	%xmm4, 64(rp)
	palignr($8, %xmm2, %xmm3)
	movaps	72(up), %xmm4
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movaps	56(up), %xmm3
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movaps	40(up), %xmm2
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm8, %xmm0)
	lea	128(up), up
	movdqa	%xmm0, (rp)
	lea	128(rp), rp
	jnc	L(utop)
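
C Wind down the pipeline: three last loads, then combine and store the
C final 16 limbs.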

L(ued1):movaps	-104(up), %xmm1
	movaps	-120(up), %xmm0
	movaps	-136(up), %xmm8
	palignr($8, %xmm6, %xmm7)
	movdqa	%xmm7, 112(rp)
	palignr($8, %xmm5, %xmm6)
	movdqa	%xmm6, 96(rp)
	palignr($8, %xmm4, %xmm5)
	movdqa	%xmm5, 80(rp)
	palignr($8, %xmm3, %xmm4)
	movdqa	%xmm4, 64(rp)
	palignr($8, %xmm2, %xmm3)
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm8, %xmm0)
	movdqa	%xmm0, (rp)
	lea	128(rp), rp

IFDOS(`	movdqa	(%rsp), %xmm6	')
IFDOS(`	movdqa	16(%rsp), %xmm7	')
IFDOS(`	movdqa	32(%rsp), %xmm8	')
IFDOS(`	add	$56, %rsp	')
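
C At most 15 limbs remain (this is also the entry point for n < 16).
C Finish in 8-, 4-, 2-, then 1-limb steps.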

L(ued0):test	$8, R8(n)
	jz	1f
	movaps	56(up), %xmm3
	movaps	40(up), %xmm2
	movaps	24(up), %xmm1
	movaps	8(up), %xmm0
	movaps	-8(up), %xmm4
	palignr($8, %xmm2, %xmm3)
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm4, %xmm0)
	lea	64(up), up
	movdqa	%xmm0, (rp)
	lea	64(rp), rp

1:	test	$4, R8(n)
	jz	1f
	movaps	24(up), %xmm1
	movaps	8(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movaps	-8(up), %xmm3
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm3, %xmm0)
	lea	32(up), up
	movdqa	%xmm0, (rp)
	lea	32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	8(up), %xmm0
	movdqa	-8(up), %xmm3
	palignr($8, %xmm3, %xmm0)
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

C Basecase code.  Needed for good small operands speed, not for
C correctness as the above code is currently written.
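
C rp is biased by -8 here; the store displacements below compensate.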

L(bc):	lea	-8(rp), rp
	sub	$4, R32(n)
	jc	L(end)
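
C Copy four limbs per iteration.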

	ALIGN(16)
L(top):	mov	(up), %r8
	mov	8(up), %r9
	lea	32(rp), rp
	mov	16(up), %r10
	mov	24(up), %r11
	lea	32(up), up
	mov	%r8, -24(rp)
	mov	%r9, -16(rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`	sub	$4, R32(n)')
	mov	%r10, -8(rp)
	mov	%r11, (rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`	jnc	L(top)')
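
C With the default COPYI_SSE_THRESHOLD of 7, at most seven limbs reach this
C code, so a single pass suffices and the ifelse above omits the sub/jnc.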

L(end):	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, 8(rp)
	lea	8(rp), rp
	lea	8(up), up
1:	test	$2, R8(n)
	jz	1f
	mov	(up), %r8
	mov	8(up), %r9
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
1:	FUNC_EXIT()
	ret
EPILOGUE()