sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S

   1 /* memcpy with AVX
   2    Copyright (C) 2014-2016 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 #if IS_IN (libc) \
  22     && (defined SHARED \
  23         || defined USE_AS_MEMMOVE \
  24         || !defined USE_MULTIARCH)
  25
  26 #include "asm-syntax.h"
  27 #ifndef MEMCPY
  28 # define MEMCPY         __memcpy_avx_unaligned
  29 # define MEMCPY_CHK     __memcpy_chk_avx_unaligned
  30 # define MEMPCPY        __mempcpy_avx_unaligned
  31 # define MEMPCPY_CHK    __mempcpy_chk_avx_unaligned
  32 #endif
  33
  34         .section .text.avx,"ax",@progbits
  35 #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
  36 ENTRY (MEMPCPY_CHK)
             /* Checked entry point: branch to __chk_fail when %rcx is
                (unsigned) below the byte count in %rdx.  %rcx is
                presumably the destination object size passed by the
                caller -- TODO confirm against the __mempcpy_chk
                prototype.
                There is no ret/jmp on the passing path, so execution
                continues into the code that follows END; this assumes
                ENTRY/END emit nothing executable besides alignment
                padding -- confirm in sysdep.h.  */
  37         cmpq    %rdx, %rcx
  38         jb      HIDDEN_JUMPTARGET (__chk_fail)
  39 END (MEMPCPY_CHK)
  40
  41 ENTRY (MEMPCPY)
             /* mempcpy flavour: set the return value %rax = %rdi + %rdx
                (destination pointer plus byte count), then jump to
                L(start), a label located inside MEMCPY, so the copy
                body is shared.  The copy code is entered with %rax
                already holding the value to return.  */
  42         movq    %rdi, %rax
  43         addq    %rdx, %rax
  44         jmp     L(start)
  45 END (MEMPCPY)
  46 #endif
  47
  48 #if !defined USE_AS_BCOPY
  49 ENTRY (MEMCPY_CHK)
             /* Checked entry point: branch to __chk_fail when %rcx is
                (unsigned) below the byte count in %rdx.  %rcx is
                presumably the destination object size passed by the
                caller -- TODO confirm against the __memcpy_chk
                prototype.
                There is no ret/jmp on the passing path, so execution
                continues into the code that follows END; this assumes
                ENTRY/END emit nothing executable besides alignment
                padding -- confirm in sysdep.h.  */
  50         cmpq    %rdx, %rcx
  51         jb      HIDDEN_JUMPTARGET (__chk_fail)
  52 END (MEMCPY_CHK)
  53 #endif
  54
  55 ENTRY (MEMCPY)
             /* Copy %rdx bytes from (%rsi) to (%rdi).
                Register roles as used below:
                  %rdi = destination, %rsi = source, %rdx = byte count,
                  %rax = return value (destination, or destination +
                         count when USE_AS_MEMPCPY is defined).
                Scratch: %rcx, %r8, %r10, %r11, %xmm0-%xmm15 / %ymm0-%ymm4.
                Size dispatch starting at L(start):
                  0..15     L(less_16bytes) and its sub-cases
                  16..127   L(less_128bytes) -> L(less_64bytes)
                            -> L(less_32bytes)
                  128..255  straight-line code right after the dispatch
                  >= 256    L(256bytesormore)
                Every path below 256 bytes copies a "head" chunk anchored
                at the start and a "tail" chunk anchored at the end; the
                two chunks overlap when the count is not exactly twice
                the chunk size.  All loads of such a path are issued
                before its first store.  */
  56         mov     %rdi, %rax
  57 #ifdef USE_AS_MEMPCPY
  58         add     %rdx, %rax
  59 #endif
  60 L(start):
             /* Below 256 the whole count fits in %dl, so the remaining
                size tests use 8-bit unsigned compares.  */
  61         cmp     $256, %rdx
  62         jae     L(256bytesormore)
  63         cmp     $16, %dl
  64         jb      L(less_16bytes)
  65         cmp     $128, %dl
  66         jb      L(less_128bytes)
             /* 128..255 bytes: first 128 bytes into %xmm0-%xmm7, last
                128 bytes into %xmm8-%xmm15.  %rcx = one past the end of
                the source.  */
  67         vmovdqu (%rsi), %xmm0
  68         lea     (%rsi, %rdx), %rcx
  69         vmovdqu 0x10(%rsi), %xmm1
  70         vmovdqu 0x20(%rsi), %xmm2
  71         vmovdqu 0x30(%rsi), %xmm3
  72         vmovdqu 0x40(%rsi), %xmm4
  73         vmovdqu 0x50(%rsi), %xmm5
  74         vmovdqu 0x60(%rsi), %xmm6
  75         vmovdqu 0x70(%rsi), %xmm7
  76         vmovdqu -0x80(%rcx), %xmm8
  77         vmovdqu -0x70(%rcx), %xmm9
  78         vmovdqu -0x60(%rcx), %xmm10
  79         vmovdqu -0x50(%rcx), %xmm11
  80         vmovdqu -0x40(%rcx), %xmm12
  81         vmovdqu -0x30(%rcx), %xmm13
  82         vmovdqu -0x20(%rcx), %xmm14
  83         vmovdqu -0x10(%rcx), %xmm15
             /* The count is no longer needed: %rdx becomes one past the
                end of the destination.  */
  84         lea     (%rdi, %rdx), %rdx
  85         vmovdqu %xmm0, (%rdi)
  86         vmovdqu %xmm1, 0x10(%rdi)
  87         vmovdqu %xmm2, 0x20(%rdi)
  88         vmovdqu %xmm3, 0x30(%rdi)
  89         vmovdqu %xmm4, 0x40(%rdi)
  90         vmovdqu %xmm5, 0x50(%rdi)
  91         vmovdqu %xmm6, 0x60(%rdi)
  92         vmovdqu %xmm7, 0x70(%rdi)
  93         vmovdqu %xmm8, -0x80(%rdx)
  94         vmovdqu %xmm9, -0x70(%rdx)
  95         vmovdqu %xmm10, -0x60(%rdx)
  96         vmovdqu %xmm11, -0x50(%rdx)
  97         vmovdqu %xmm12, -0x40(%rdx)
  98         vmovdqu %xmm13, -0x30(%rdx)
  99         vmovdqu %xmm14, -0x20(%rdx)
 100         vmovdqu %xmm15, -0x10(%rdx)
 101         ret
 102         .p2align 4
 103 L(less_128bytes):
             /* 16..127 bytes arrive here; 64..127 are handled in place:
                first 64 bytes in %xmm0-%xmm3, last 64 in %xmm4-%xmm7.
                %rcx = end of source, %rdx is turned into end of
                destination.  */
 104         cmp     $64, %dl
 105         jb      L(less_64bytes)
 106         vmovdqu (%rsi), %xmm0
 107         lea     (%rsi, %rdx), %rcx
 108         vmovdqu 0x10(%rsi), %xmm1
 109         vmovdqu 0x20(%rsi), %xmm2
 110         lea     (%rdi, %rdx), %rdx
 111         vmovdqu 0x30(%rsi), %xmm3
 112         vmovdqu -0x40(%rcx), %xmm4
 113         vmovdqu -0x30(%rcx), %xmm5
 114         vmovdqu -0x20(%rcx), %xmm6
 115         vmovdqu -0x10(%rcx), %xmm7
 116         vmovdqu %xmm0, (%rdi)
 117         vmovdqu %xmm1, 0x10(%rdi)
 118         vmovdqu %xmm2, 0x20(%rdi)
 119         vmovdqu %xmm3, 0x30(%rdi)
 120         vmovdqu %xmm4, -0x40(%rdx)
 121         vmovdqu %xmm5, -0x30(%rdx)
 122         vmovdqu %xmm6, -0x20(%rdx)
 123         vmovdqu %xmm7, -0x10(%rdx)
 124         ret
 125
 126         .p2align 4
 127 L(less_64bytes):
             /* 32..63 bytes: first 32 and last 32 bytes; the tail is
                addressed directly as base + count - offset.  */
 128         cmp     $32, %dl
 129         jb      L(less_32bytes)
 130         vmovdqu (%rsi), %xmm0
 131         vmovdqu 0x10(%rsi), %xmm1
 132         vmovdqu -0x20(%rsi, %rdx), %xmm6
 133         vmovdqu -0x10(%rsi, %rdx), %xmm7
 134         vmovdqu %xmm0, (%rdi)
 135         vmovdqu %xmm1, 0x10(%rdi)
 136         vmovdqu %xmm6, -0x20(%rdi, %rdx)
 137         vmovdqu %xmm7, -0x10(%rdi, %rdx)
 138         ret
 139
 140         .p2align 4
 141 L(less_32bytes):
             /* 16..31 bytes: first 16 and last 16 bytes.  */
 142         vmovdqu (%rsi), %xmm0
 143         vmovdqu -0x10(%rsi, %rdx), %xmm7
 144         vmovdqu %xmm0, (%rdi)
 145         vmovdqu %xmm7, -0x10(%rdi, %rdx)
 146         ret
 147
 148         .p2align 4
 149 L(less_16bytes):
             /* 8..15 bytes: first 8 and last 8 bytes through general
                purpose registers.  The tail is loaded first because the
                head load overwrites the source pointer %rsi with data.  */
 150         cmp     $8, %dl
 151         jb      L(less_8bytes)
 152         movq -0x08(%rsi, %rdx), %rcx
 153         movq (%rsi),    %rsi
 154         movq %rsi, (%rdi)
 155         movq %rcx, -0x08(%rdi, %rdx)
 156         ret
 157
 158         .p2align 4
 159 L(less_8bytes):
             /* 4..7 bytes: first 4 and last 4 bytes; same pattern, %rsi
                is again reused for the head data.  */
 160         cmp     $4, %dl
 161         jb      L(less_4bytes)
 162         mov -0x04(%rsi, %rdx), %ecx
 163         mov (%rsi),     %esi
 164         mov %esi, (%rdi)
 165         mov %ecx, -0x04(%rdi, %rdx)
 166         ret
 167
 168 L(less_4bytes):
             /* 2..3 bytes: first 2 and last 2 bytes.  Counts 0 and 1 go
                to L(less_2bytes) with the flags of this compare still
                live.  */
 169         cmp     $1, %dl
 170         jbe     L(less_2bytes)
 171         mov -0x02(%rsi, %rdx),  %cx
 172         mov (%rsi),     %si
 173         mov %si, (%rdi)
 174         mov %cx, -0x02(%rdi, %rdx)
 175         ret
 176
 177 L(less_2bytes):
             /* Flags still come from "cmp $1, %dl" (jbe does not modify
                them): below means count == 0, nothing to copy;
                otherwise count == 1, copy a single byte.  */
 178         jb      L(less_0bytes)
 179         mov     (%rsi), %cl
 180         mov     %cl,    (%rdi)
 181 L(less_0bytes):
 182         ret
 183
 184         .p2align 4
 185 L(256bytesormore):
 186 #ifdef USE_AS_MEMMOVE
             /* %rcx = dst - src.  An unsigned result below the count
                means the destination starts inside the source range, so
                a forward copy would overwrite source bytes that have not
                been read yet: copy backward instead.  */
 187         mov     %rdi, %rcx
 188         sub     %rsi, %rcx
 189         cmp     %rdx, %rcx
 190         jc      L(copy_backward)
 191 #endif
 192         cmp     $2048, %rdx
 193         jae     L(gobble_data_movsb)
             /* 256..2047 bytes, forward.
                  %r8       = saved return value (%rax is reused below as
                              the constant 0x80)
                  %rcx      = one past the end of the source
                  %r10      = original destination
                  %xmm5-12  = last 128 source bytes (tail)
                  %ymm4     = first 32 source bytes (head)
                %rdi is rounded up to the next 32-byte boundary (it
                always advances by 1..32 bytes); %r11 is that distance
                and is applied to %rsi and to the count as well.  The
                head store issued after the loop covers the skipped
                bytes.  */
 194         mov     %rax, %r8
 195         lea     (%rsi, %rdx), %rcx
 196         mov     %rdi, %r10
 197         vmovdqu -0x80(%rcx), %xmm5
 198         vmovdqu -0x70(%rcx), %xmm6
 199         mov     $0x80, %rax
 200         and     $-32, %rdi
 201         add     $32, %rdi
 202         vmovdqu -0x60(%rcx), %xmm7
 203         vmovdqu -0x50(%rcx), %xmm8
 204         mov     %rdi, %r11
 205         sub     %r10, %r11
 206         vmovdqu -0x40(%rcx), %xmm9
 207         vmovdqu -0x30(%rcx), %xmm10
 208         sub     %r11, %rdx
 209         vmovdqu -0x20(%rcx), %xmm11
 210         vmovdqu -0x10(%rcx), %xmm12
 211         vmovdqu (%rsi), %ymm4
 212         add     %r11, %rsi
             /* Bias the count by -128 so that the "sub" at the bottom of
                the loop borrows as soon as fewer than 128 bytes remain.
                32-bit arithmetic is enough because the count is below
                2048 on this path.  */
 213         sub     %eax, %edx
 214 L(goble_128_loop):
             /* 128 bytes per iteration: unaligned loads, aligned stores
                (%rdi is 32-byte aligned here).  */
 215         vmovdqu (%rsi), %ymm0
 216         vmovdqu 0x20(%rsi), %ymm1
 217         vmovdqu 0x40(%rsi), %ymm2
 218         vmovdqu 0x60(%rsi), %ymm3
 219         add     %rax, %rsi
 220         vmovdqa %ymm0, (%rdi)
 221         vmovdqa %ymm1, 0x20(%rdi)
 222         vmovdqa %ymm2, 0x40(%rdi)
 223         vmovdqa %ymm3, 0x60(%rdi)
 224         add     %rax, %rdi
 225         sub     %eax, %edx
 226         jae     L(goble_128_loop)
             /* Undo the bias: %edx = bytes not yet copied by the loop
                (0..127).  Adding %rdi turns %rdx into one past the end
                of the destination.  */
 227         add     %eax, %edx
 228         add     %rdi, %rdx
             /* Head to the original destination, then the 128-byte tail
                ending at the destination end; the tail overlaps whatever
                the loop already wrote.  */
 229         vmovdqu %ymm4, (%r10)
 230         vzeroupper
 231         vmovdqu %xmm5, -0x80(%rdx)
 232         vmovdqu %xmm6, -0x70(%rdx)
 233         vmovdqu %xmm7, -0x60(%rdx)
 234         vmovdqu %xmm8, -0x50(%rdx)
 235         vmovdqu %xmm9, -0x40(%rdx)
 236         vmovdqu %xmm10, -0x30(%rdx)
 237         vmovdqu %xmm11, -0x20(%rdx)
 238         vmovdqu %xmm12, -0x10(%rdx)
 239         mov     %r8, %rax
 240         ret
 241
 242         .p2align 4
 243 L(gobble_data_movsb):
             /* >= 2048 bytes, forward.  Threshold %rcx = 8 * the
                "shared cache size half" value (compile-time macro if
                defined, otherwise the __x86_shared_cache_size_half
                variable; presumably half of the shared cache size in
                bytes -- TODO confirm where it is initialised).
                Counts at or above the threshold use the non-temporal
                loop; smaller counts use "rep movsb" with %rcx = count.
                %rax still holds the return value set at entry.  */
 244 #ifdef SHARED_CACHE_SIZE_HALF
 245         mov     $SHARED_CACHE_SIZE_HALF, %rcx
 246 #else
 247         mov     __x86_shared_cache_size_half(%rip), %rcx
 248 #endif
 249         shl     $3, %rcx
 250         cmp     %rcx, %rdx
 251         jae     L(gobble_big_data_fwd)
 252         mov     %rdx, %rcx
 253         rep     movsb
 254         ret
 255
 256         .p2align 4
 257 L(gobble_big_data_fwd):
             /* Large forward copy with non-temporal stores.
                  %ymm4     = first 32 source bytes (head)
                  %xmm5-12  = last 128 source bytes (tail)
                  %r8       = original destination
                  %r10      = bytes skipped to round %rdi up to a 32-byte
                              boundary (1..32), also applied to %rsi and
                              the count
                  %rcx      = one past the end of the destination (set
                              after the alignment adjustment)  */
 258         lea     (%rsi, %rdx), %rcx
 259         vmovdqu (%rsi), %ymm4
 260         vmovdqu -0x80(%rsi,%rdx), %xmm5
 261         vmovdqu -0x70(%rcx), %xmm6
 262         vmovdqu -0x60(%rcx), %xmm7
 263         vmovdqu -0x50(%rcx), %xmm8
 264         vmovdqu -0x40(%rcx), %xmm9
 265         vmovdqu -0x30(%rcx), %xmm10
 266         vmovdqu -0x20(%rcx), %xmm11
 267         vmovdqu -0x10(%rcx), %xmm12
 268         mov     %rdi, %r8
 269         and     $-32, %rdi
 270         add     $32, %rdi
 271         mov     %rdi, %r10
 272         sub     %r8, %r10
 273         sub     %r10, %rdx
 274         add     %r10, %rsi
 275         lea     (%rdi, %rdx), %rcx
             /* Bias the count by -128; see the loop-exit test below.  */
 276         add     $-0x80, %rdx
 277 L(gobble_mem_fwd_loop):
             /* 128 bytes per iteration.  Source lines 0x1c0 and 0x280
                bytes ahead are prefetched with a non-temporal hint.
                "sub $-0x80" advances a pointer by 128 (presumably chosen
                because -0x80 fits a sign-extended 8-bit immediate while
                +0x80 does not).  */
 278         prefetchnta 0x1c0(%rsi)
 279         prefetchnta 0x280(%rsi)
 280         vmovdqu (%rsi), %ymm0
 281         vmovdqu 0x20(%rsi), %ymm1
 282         vmovdqu 0x40(%rsi), %ymm2
 283         vmovdqu 0x60(%rsi), %ymm3
 284         sub     $-0x80, %rsi
 285         vmovntdq        %ymm0, (%rdi)
 286         vmovntdq        %ymm1, 0x20(%rdi)
 287         vmovntdq        %ymm2, 0x40(%rdi)
 288         vmovntdq        %ymm3, 0x60(%rdi)
 289         sub     $-0x80, %rdi
             /* Adding -128 produces a carry exactly when the biased
                count was still >= 128, i.e. another full block
                remains.  */
 290         add     $-0x80, %rdx
 291         jb      L(gobble_mem_fwd_loop)
             /* Fence the non-temporal stores before the ordinary
                head/tail stores that follow.  */
 292         sfence
 293         vmovdqu %ymm4, (%r8)
 294         vzeroupper
 295         vmovdqu %xmm5, -0x80(%rcx)
 296         vmovdqu %xmm6, -0x70(%rcx)
 297         vmovdqu %xmm7, -0x60(%rcx)
 298         vmovdqu %xmm8, -0x50(%rcx)
 299         vmovdqu %xmm9, -0x40(%rcx)
 300         vmovdqu %xmm10, -0x30(%rcx)
 301         vmovdqu %xmm11, -0x20(%rcx)
 302         vmovdqu %xmm12, -0x10(%rcx)
 303         ret
 304
 305 #ifdef USE_AS_MEMMOVE
 306         .p2align 4
 307 L(copy_backward):
             /* Backward copy for >= 256 bytes when the destination
                starts inside the source range.
                  %rcx      = 8 * shared-cache-size-half threshold
                  %xmm5-12  = first 128 source bytes (head)
                  %ymm4     = last 32 source bytes (tail)
                  %r10      = destination end - 32 (where the tail goes)
                  %r11      = low 5 bits of the destination end
                %rdi becomes the destination end rounded down to a
                32-byte boundary (the xor clears exactly the bits held in
                %r11); %rsi becomes the source end moved back by the same
                %r11 bytes, and the count is reduced by %r11.
                NOTE(review): the head is stored through %rax after the
                loop, which relies on %rax still being the original
                destination; that holds only when USE_AS_MEMPCPY is not
                defined together with USE_AS_MEMMOVE -- confirm no build
                uses both.  */
 308 #ifdef SHARED_CACHE_SIZE_HALF
 309         mov     $SHARED_CACHE_SIZE_HALF, %rcx
 310 #else
 311         mov     __x86_shared_cache_size_half(%rip), %rcx
 312 #endif
 313         shl     $3, %rcx
 314         vmovdqu (%rsi), %xmm5
 315         vmovdqu 0x10(%rsi), %xmm6
 316         add     %rdx, %rdi
 317         vmovdqu 0x20(%rsi), %xmm7
 318         vmovdqu 0x30(%rsi), %xmm8
 319         lea     -0x20(%rdi), %r10
 320         mov %rdi, %r11
 321         vmovdqu 0x40(%rsi), %xmm9
 322         vmovdqu 0x50(%rsi), %xmm10
 323         and     $0x1f, %r11
 324         vmovdqu 0x60(%rsi), %xmm11
 325         vmovdqu 0x70(%rsi), %xmm12
 326         xor     %r11, %rdi
 327         add     %rdx, %rsi
 328         vmovdqu -0x20(%rsi), %ymm4
 329         sub     %r11, %rsi
 330         sub     %r11, %rdx
             /* Adjusted count strictly above the threshold: use the
                non-temporal variant.  */
 331         cmp     %rcx, %rdx
 332         ja      L(gobble_big_data_bwd)
             /* Bias the count by -128; the loop's "add $-0x80" carries
                while at least 128 more bytes remain.  */
 333         add     $-0x80, %rdx
 334 L(gobble_mem_bwd_llc):
             /* 128 bytes per iteration, walking downward: unaligned
                loads, aligned stores (%rdi is 32-byte aligned).  */
 335         vmovdqu -0x20(%rsi), %ymm0
 336         vmovdqu -0x40(%rsi), %ymm1
 337         vmovdqu -0x60(%rsi), %ymm2
 338         vmovdqu -0x80(%rsi), %ymm3
 339         lea     -0x80(%rsi), %rsi
 340         vmovdqa %ymm0, -0x20(%rdi)
 341         vmovdqa %ymm1, -0x40(%rdi)
 342         vmovdqa %ymm2, -0x60(%rdi)
 343         vmovdqa %ymm3, -0x80(%rdi)
 344         lea     -0x80(%rdi), %rdi
 345         add     $-0x80, %rdx
 346         jb      L(gobble_mem_bwd_llc)
             /* Tail (last 32 bytes) at destination end - 32, then the
                128-byte head at the destination start held in %rax.  */
 347         vmovdqu %ymm4, (%r10)
 348         vzeroupper
 349         vmovdqu %xmm5, (%rax)
 350         vmovdqu %xmm6, 0x10(%rax)
 351         vmovdqu %xmm7, 0x20(%rax)
 352         vmovdqu %xmm8, 0x30(%rax)
 353         vmovdqu %xmm9, 0x40(%rax)
 354         vmovdqu %xmm10, 0x50(%rax)
 355         vmovdqu %xmm11, 0x60(%rax)
 356         vmovdqu %xmm12, 0x70(%rax)
 357         ret
 358
 359         .p2align 4
 360 L(gobble_big_data_bwd):
             /* Same backward walk as L(gobble_mem_bwd_llc), but with
                non-temporal stores and prefetches 0x1c0 / 0x280 bytes
                below the current source position.  Register state is
                inherited from L(copy_backward).  */
 361         add     $-0x80, %rdx
 362 L(gobble_mem_bwd_loop):
 363         prefetchnta -0x1c0(%rsi)
 364         prefetchnta -0x280(%rsi)
 365         vmovdqu -0x20(%rsi), %ymm0
 366         vmovdqu -0x40(%rsi), %ymm1
 367         vmovdqu -0x60(%rsi), %ymm2
 368         vmovdqu -0x80(%rsi), %ymm3
 369         lea     -0x80(%rsi), %rsi
 370         vmovntdq        %ymm0, -0x20(%rdi)
 371         vmovntdq        %ymm1, -0x40(%rdi)
 372         vmovntdq        %ymm2, -0x60(%rdi)
 373         vmovntdq        %ymm3, -0x80(%rdi)
 374         lea     -0x80(%rdi), %rdi
 375         add     $-0x80, %rdx
 376         jb      L(gobble_mem_bwd_loop)
             /* Fence the non-temporal stores before the ordinary
                tail/head stores that follow.  */
 377         sfence
 378         vmovdqu %ymm4, (%r10)
 379         vzeroupper
 380         vmovdqu %xmm5, (%rax)
 381         vmovdqu %xmm6, 0x10(%rax)
 382         vmovdqu %xmm7, 0x20(%rax)
 383         vmovdqu %xmm8, 0x30(%rax)
 384         vmovdqu %xmm9, 0x40(%rax)
 385         vmovdqu %xmm10, 0x50(%rax)
 386         vmovdqu %xmm11, 0x60(%rax)
 387         vmovdqu %xmm12, 0x70(%rax)
 388         ret
 389 #endif
 390 END (MEMCPY)
 391 #endif