source/libs/gmp/gmp-src/mpn/x86_64/fastsse/com-palignr.asm

dnl  AMD64 mpn_com optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                cycles/limb  cycles/limb  cycles/limb  good
C                aligned      unaligned    best seen    for cpu?
C AMD K8,K9      2.0          illop        1.0/1.0      N
C AMD K10        0.85         illop                     Y/N
C AMD bull       1.39         ? 1.45                    Y/N
C AMD pile       0.8-1.4      0.7-1.4                   Y
C AMD steam
C AMD excavator
C AMD bobcat     1.97         ? 8.17       1.5/1.5      N
C AMD jaguar     1.02         1.02         0.91/0.91    N
C Intel P4       2.26         illop                     Y/N
C Intel core     0.52         0.95         opt/0.74     Y
C Intel NHM      0.52         0.65         opt/opt      Y
C Intel SBR      0.51         0.65         opt/opt      Y
C Intel IBR      0.50         0.64         opt/0.57     Y
C Intel HWL      0.51         0.58         opt/opt      Y
C Intel BWL      0.57         0.69         opt/0.65     Y
C Intel atom     1.16         1.70         opt/opt      Y
C Intel SLM      1.02         1.52                      N
C VIA nano       1.09         1.10         opt/opt      Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
C instruction is better adapted to mpn_copyd's needs, so we need to contort the
C code to use it here.

C For operands of < COM_SSE_THRESHOLD limbs, we use a plain 64-bit loop, taken
C from the x86_64 default code.
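
C In C terms, the operation performed here is simply rp[i] = ~up[i] for each
C of the n limbs (a reference sketch for orientation, not taken from the GMP
C sources):
C
C   for (i = 0; i < n; i++)
C     rp[i] = ~ up[i];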

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest coding.
define(`movdqa', ``movaps'')
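
C COM_SSE_THRESHOLD is the operand size, in limbs, at or below which the plain
C 64-bit basecase loop is used.  If it is not defined already (presumably by a
C tuned parameter set), default it to 7 limbs.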
ifdef(`COM_SSE_THRESHOLD',`',`define(`COM_SSE_THRESHOLD', 7)')

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_com)
        FUNC_ENTRY(3)

        cmp     $COM_SSE_THRESHOLD, n
        jbe     L(bc)

        pcmpeqb %xmm7, %xmm7            C set to 111...111
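
C If rp is not 16-byte aligned, complement a single limb with 64-bit code
C first, so that all subsequent 16-byte stores to rp can be aligned.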
        test    $8, R8(rp)              C is rp 16-byte aligned?
        jz      L(rp_aligned)           C jump if rp aligned
        mov     (up), %r8
        lea     8(up), up
        not     %r8
        mov     %r8, (rp)
        lea     8(rp), rp
        dec     n

L(rp_aligned):
        test    $8, R8(up)
        jnz     L(uent)

ifelse(eval(COM_SSE_THRESHOLD >= 8),1,
`       sub     $8, n',
`       jmp     L(am)')
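
C Main loop for the case where both rp and up are now 16-byte aligned:
C complement 8 limbs (4 x 16 bytes) per iteration using only aligned loads
C and aligned stores.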
        ALIGN(16)
L(atop):movdqa  0(up), %xmm0
        movdqa  16(up), %xmm1
        movdqa  32(up), %xmm2
        movdqa  48(up), %xmm3
        lea     64(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        pxor    %xmm7, %xmm2
        pxor    %xmm7, %xmm3
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        lea     64(rp), rp
L(am):  sub     $8, n
        jnc     L(atop)
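
C Wind down: the remaining n mod 8 limbs are handled in binary fashion,
C first 4 limbs, then 2, then 1.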
        test    $4, R8(n)
        jz      1f
        movdqa  (up), %xmm0
        movdqa  16(up), %xmm1
        lea     32(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        lea     32(rp), rp

1:      test    $2, R8(n)
        jz      1f
        movdqa  (up), %xmm0
        lea     16(up), up
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)
        lea     16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        not     %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

L(uent):
C Code handling up - rp = 8 (mod 16)

C FIXME: The code below only handles overlap if it is close to complete, or
C quite separate: up-rp < 5 or up-rp > 15 limbs
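
C In this path up sits 8 bytes away from rp's 16-byte alignment.  All loads
C and stores below remain aligned 16-byte accesses; palignr with an 8-byte
C shift combines two adjacent aligned source words and extracts the 16 bytes
C straddling them, i.e. the block that belongs at the next aligned rp address.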
        lea     -40(up), %rax           C 40 = 5 * GMP_LIMB_BYTES
        sub     rp, %rax
        cmp     $80, %rax               C 80 = (15-5) * GMP_LIMB_BYTES
        jbe     L(bc)                   C deflect to plain loop

        sub     $16, n
        jc      L(uend)

        movdqa  120(up), %xmm3

        sub     $16, n
        jmp     L(um)
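
C The loop below is software pipelined: %xmm3 is loaded ahead of L(um), and
C the complement and store of %xmm0 are carried over to the top of the next
C iteration.  Each pass complements 16 limbs (128 bytes).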
        ALIGN(16)
L(utop):movdqa  120(up), %xmm3
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, -128(rp)
        sub     $16, n
L(um):  movdqa  104(up), %xmm2
        palignr($8, %xmm2, %xmm3)
        movdqa  88(up), %xmm1
        pxor    %xmm7, %xmm3
        movdqa  %xmm3, 112(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  72(up), %xmm0
        pxor    %xmm7, %xmm2
        movdqa  %xmm2, 96(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  56(up), %xmm3
        pxor    %xmm7, %xmm1
        movdqa  %xmm1, 80(rp)
        palignr($8, %xmm3, %xmm0)
        movdqa  40(up), %xmm2
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, 64(rp)
        palignr($8, %xmm2, %xmm3)
        movdqa  24(up), %xmm1
        pxor    %xmm7, %xmm3
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  8(up), %xmm0
        pxor    %xmm7, %xmm2
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  -8(up), %xmm3
        pxor    %xmm7, %xmm1
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm3, %xmm0)
        lea     128(up), up
        lea     128(rp), rp
        jnc     L(utop)

        pxor    %xmm7, %xmm0
        movdqa  %xmm0, -128(rp)
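
C Wind down the unaligned path: handle a remaining 8, 4, 2 and finally 1
C limbs, reusing the aligned-load/palignr scheme, with the last limb done
C in 64-bit code.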
L(uend):test    $8, R8(n)
        jz      1f
        movdqa  56(up), %xmm3
        movdqa  40(up), %xmm2
        palignr($8, %xmm2, %xmm3)
        movdqa  24(up), %xmm1
        pxor    %xmm7, %xmm3
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  8(up), %xmm0
        pxor    %xmm7, %xmm2
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  -8(up), %xmm3
        pxor    %xmm7, %xmm1
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm3, %xmm0)
        lea     64(up), up
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)
        lea     64(rp), rp

1:      test    $4, R8(n)
        jz      1f
        movdqa  24(up), %xmm1
        movdqa  8(up), %xmm0
        palignr($8, %xmm0, %xmm1)
        movdqa  -8(up), %xmm3
        pxor    %xmm7, %xmm1
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm3, %xmm0)
        lea     32(up), up
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)
        lea     32(rp), rp

1:      test    $2, R8(n)
        jz      1f
        movdqa  8(up), %xmm0
        movdqa  -8(up), %xmm3
        palignr($8, %xmm3, %xmm0)
        lea     16(up), up
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)
        lea     16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        not     %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

C Basecase code.  Needed for good small-operand speed, not for correctness as
C the above code is currently written.
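
C The basecase complements 4 limbs per iteration with 64-bit operations, then
C picks up a remainder of up to 3 limbs after L(end).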
L(bc):  lea     -8(rp), rp
        sub     $4, R32(n)
        jc      L(end)

ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`       ALIGN(16)')
L(top): mov     (up), %r8
        mov     8(up), %r9
        lea     32(rp), rp
        mov     16(up), %r10
        mov     24(up), %r11
        lea     32(up), up
        not     %r8
        not     %r9
        not     %r10
        not     %r11
        mov     %r8, -24(rp)
        mov     %r9, -16(rp)
ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`       sub     $4, R32(n)')
        mov     %r10, -8(rp)
        mov     %r11, (rp)
ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`       jnc     L(top)')

L(end): test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        not     %r8
        mov     %r8, 8(rp)
        lea     8(rp), rp
        lea     8(up), up
1:      test    $2, R8(n)
        jz      1f
        mov     (up), %r8
        mov     8(up), %r9
        not     %r8
        not     %r9
        mov     %r8, 8(rp)
        mov     %r9, 16(rp)
1:      FUNC_EXIT()
        ret
EPILOGUE()