sysdeps/aarch64/memcpy.S

   1 /* Copyright (C) 2012-2017 Free Software Foundation, Inc.
   2
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library.  If not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* Assumptions:
  22  *
  23  * ARMv8-a, AArch64, unaligned accesses.
  24  *
  25  */
  26
  27 #define dstin   x0      /* Destination pointer as passed in; only used as a store base, never written by the instructions below.  */
  28 #define src     x1      /* Source pointer; adjusted and advanced in L(copy_long).  */
  29 #define count   x2      /* Number of bytes to copy; reused as the loop counter in the long paths.  */
  30 #define dst     x3      /* 16-byte-aligned running store pointer used by L(copy_long).  */
  31 #define srcend  x4      /* src + count (one past the last source byte).  */
  32 #define dstend  x5      /* dstin + count (one past the last destination byte).  */
  33 #define A_l     x6      /* A..D: data registers, used in low/high pairs by ldp/stp.  */
  34 #define A_lw    w6      /* 32-bit view of A_l for 4-byte and 1-byte accesses.  */
  35 #define A_h     x7
  36 #define A_hw    w7      /* 32-bit view of A_h.  */
  37 #define B_l     x8
  38 #define B_lw    w8      /* 32-bit view of B_l.  */
  39 #define B_h     x9
  40 #define C_l     x10
  41 #define C_h     x11
  42 #define D_l     x12
  43 #define D_h     x13
  44 #define E_l     src     /* E, F and G reuse pointer/count registers as extra data registers.  */
  45 #define E_h     count   /* They are loaded only at points where the aliased value is not read again.  */
  46 #define F_l     srcend
  47 #define F_h     dst
  48 #define G_l     count
  49 #define G_h     dst
  50 #define tmp1    x14     /* Scratch register.  */
  51
  52 /* Copies are split into 3 main cases: small copies of up to 16 bytes,
  53    medium copies of 17..96 bytes which are fully unrolled. Large copies
  54    of more than 96 bytes align the destination and use an unrolled loop
  55    processing 64 bytes per iteration.
  56    In order to share code with memmove, small and medium copies read all
  57    data before writing, allowing any kind of overlap. So small, medium
  58    and large backwards memmoves are handled by falling through into memcpy.
  59    Overlapping large forward memmoves use a loop that copies backwards.
  60 */
  61
  62 #ifndef MEMMOVE         /* The entry-point names can be overridden by defining MEMMOVE / MEMCPY before this point.  */
  63 # define MEMMOVE memmove
  64 #endif
  65 #ifndef MEMCPY          /* Default symbol name for the memcpy entry point.  */
  66 # define MEMCPY memcpy
  67 #endif
  68
  69 ENTRY_ALIGN (MEMMOVE, 6)        /* Entry macro defined outside this file; the 6 is presumably a log2 alignment -- confirm in sysdep.h.  */
  70         /* DELOUSE is defined outside this file and is applied to argument registers 0, 1 and 2 -- see sysdep.h for its effect.  */
  71         DELOUSE (0)
  72         DELOUSE (1)
  73         DELOUSE (2)
  74         /* Branch to L(move_long) only when count > 96 and (dstin - src) < count as unsigned values.  */
  75         sub     tmp1, dstin, src        /* tmp1 = dstin - src; still live when L(move_long) tests it.  */
  76         cmp     count, 96
  77         ccmp    tmp1, count, 2, hi      /* If count > 96 compare tmp1 with count; otherwise set NZCV to 0b0010 (C set) so b.lo is not taken.  */
  78         b.lo    L(move_long)            /* Taken when dstin starts inside [src, src + count), including dstin == src.  */
  79
  80         /* Common case falls through into memcpy.  */
  81 END (MEMMOVE)
  82 libc_hidden_builtin_def (MEMMOVE)
  83 ENTRY (MEMCPY)
  84         /* DELOUSE is defined outside this file and is applied to argument registers 0, 1 and 2 -- see sysdep.h for its effect.  */
  85         DELOUSE (0)
  86         DELOUSE (1)
  87         DELOUSE (2)
  88
  89         prfm    PLDL1KEEP, [src]        /* Prefetch hint for the start of the source.  */
  90         add     srcend, src, count      /* srcend = one past the last source byte.  */
  91         add     dstend, dstin, count    /* dstend = one past the last destination byte.  */
  92         cmp     count, 16
  93         b.ls    L(copy16)               /* count <= 16.  */
  94         cmp     count, 96
  95         b.hi    L(copy_long)            /* count > 96.  */
  96
  97         /* Medium copies: 17..96 bytes.  */
  98         sub     tmp1, count, 1          /* tmp1 = count - 1 (16..95); bits 6 and 5 select the size class.  */
  99         ldp     A_l, A_h, [src]         /* First 16 bytes; used by every medium case, including L(copy96).  */
 100         tbnz    tmp1, 6, L(copy96)      /* count - 1 >= 64, i.e. count is 65..96.  */
 101         ldp     D_l, D_h, [srcend, -16] /* Last 16 bytes; may overlap the first 16.  */
 102         tbz     tmp1, 5, 1f             /* count is 17..32: the first and last 16 bytes cover everything.  */
 103         ldp     B_l, B_h, [src, 16]     /* count is 33..64: also copy bytes 16..31 and the 16 bytes before the last 16.  */
 104         ldp     C_l, C_h, [srcend, -32]
 105         stp     B_l, B_h, [dstin, 16]   /* All loads for this path are complete before the first store.  */
 106         stp     C_l, C_h, [dstend, -32]
 107 1:
 108         stp     A_l, A_h, [dstin]
 109         stp     D_l, D_h, [dstend, -16]
 110         ret
 111
 112         .p2align 4
 113         /* Small copies: 0..16 bytes.  */
 114 L(copy16):
 115         cmp     count, 8
 116         b.lo    1f                      /* count < 8.  */
 117         ldr     A_l, [src]              /* 8..16 bytes: copy the first 8 and the last 8 bytes, which may overlap.  */
 118         ldr     A_h, [srcend, -8]
 119         str     A_l, [dstin]
 120         str     A_h, [dstend, -8]
 121         ret
 122         .p2align 4
 123 1:
 124         tbz     count, 2, 1f            /* count < 8 with bit 2 clear means count is 0..3.  */
 125         ldr     A_lw, [src]             /* 4..7 bytes: copy the first 4 and the last 4 bytes, which may overlap.  */
 126         ldr     A_hw, [srcend, -4]
 127         str     A_lw, [dstin]
 128         str     A_hw, [dstend, -4]
 129         ret
 130
 131         /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
 132            byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 133 1:
 134         cbz     count, 2f               /* Nothing to copy.  */
 135         lsr     tmp1, count, 1          /* tmp1 = count / 2: 0 when count is 1, 1 when count is 2 or 3.  */
 136         ldrb    A_lw, [src]             /* First byte.  */
 137         ldrb    A_hw, [srcend, -1]      /* Last byte.  */
 138         ldrb    B_lw, [src, tmp1]       /* Byte at index count / 2.  */
 139         strb    A_lw, [dstin]
 140         strb    B_lw, [dstin, tmp1]
 141         strb    A_hw, [dstend, -1]
 142 2:      ret
 143
 144         .p2align 4
 145         /* Copy 65..96 bytes (bit 6 of count - 1 is set).  Copy 64 bytes from
 146            the start and 32 bytes from the end.  */
 147 L(copy96):
 148         ldp     B_l, B_h, [src, 16]     /* A_l/A_h were already loaded from [src] before the branch to this label.  */
 149         ldp     C_l, C_h, [src, 32]
 150         ldp     D_l, D_h, [src, 48]
 151         ldp     E_l, E_h, [srcend, -32] /* E_l/E_h alias src/count, which are not read again on this path.  */
 152         ldp     F_l, F_h, [srcend, -16] /* F_l aliases srcend; this load is the last use of srcend as a pointer.  */
 153         stp     A_l, A_h, [dstin]       /* All loads are complete before the first store.  */
 154         stp     B_l, B_h, [dstin, 16]
 155         stp     C_l, C_h, [dstin, 32]
 156         stp     D_l, D_h, [dstin, 48]
 157         stp     E_l, E_h, [dstend, -32]
 158         stp     F_l, F_h, [dstend, -16]
 159         ret
 160
 161         /* Align DST to 16 byte alignment so that we don't cross cache line
 162            boundaries on both loads and stores.  There are more than 96 bytes
 163            to copy, so copy 16 bytes unaligned and then align.  The loop
 164            copies 64 bytes per iteration and loads one iteration ahead.  */
 165
 166         .p2align 4
 167 L(copy_long):
 168         and     tmp1, dstin, 15         /* tmp1 = misalignment of dstin (0..15).  */
 169         bic     dst, dstin, 15          /* dst = dstin rounded down to a multiple of 16.  */
 170         ldp     D_l, D_h, [src]         /* First 16 source bytes, stored unaligned at dstin below.  */
 171         sub     src, src, tmp1          /* Move src back by the same amount so src and dst stay in step.  */
 172         add     count, count, tmp1      /* Count is now 16 too large.  */
 173         ldp     A_l, A_h, [src, 16]     /* A..D are loaded with the 64 bytes destined for dst + 16 .. dst + 79.  */
 174         stp     D_l, D_h, [dstin]
 175         ldp     B_l, B_h, [src, 32]
 176         ldp     C_l, C_h, [src, 48]
 177         ldp     D_l, D_h, [src, 64]!    /* Pre-index with writeback: src += 64.  */
 178         subs    count, count, 128 + 16  /* Test and readjust count.  */
 179         b.ls    L(last64)               /* No more than 128 bytes remain beyond dst + 16: skip the loop.  */
 180 L(loop64):
 181         stp     A_l, A_h, [dst, 16]     /* Store the 64 bytes loaded earlier while loading the next 64.  */
 182         ldp     A_l, A_h, [src, 16]
 183         stp     B_l, B_h, [dst, 32]
 184         ldp     B_l, B_h, [src, 32]
 185         stp     C_l, C_h, [dst, 48]
 186         ldp     C_l, C_h, [src, 48]
 187         stp     D_l, D_h, [dst, 64]!    /* Pre-index with writeback: dst += 64.  */
 188         ldp     D_l, D_h, [src, 64]!    /* Pre-index with writeback: src += 64.  */
 189         subs    count, count, 64
 190         b.hi    L(loop64)               /* Continue while count is still above zero after the decrement.  */
 191
 192         /* Write the last full set of 64 bytes.  The remainder is at most 64
 193            bytes, so it is safe to always copy 64 bytes from the end even if
 194            there is just 1 byte left.  */
 195 L(last64):
 196         ldp     E_l, E_h, [srcend, -64] /* E_l/E_h alias src/count, which are not read again from here on.  */
 197         stp     A_l, A_h, [dst, 16]     /* Stores of the pending A..D are interleaved with loads of the final 64 bytes.  */
 198         ldp     A_l, A_h, [srcend, -48]
 199         stp     B_l, B_h, [dst, 32]
 200         ldp     B_l, B_h, [srcend, -32]
 201         stp     C_l, C_h, [dst, 48]
 202         ldp     C_l, C_h, [srcend, -16]
 203         stp     D_l, D_h, [dst, 64]
 204         stp     E_l, E_h, [dstend, -64] /* Final 64 bytes, addressed from the end; may overlap bytes already stored.  */
 205         stp     A_l, A_h, [dstend, -48]
 206         stp     B_l, B_h, [dstend, -32]
 207         stp     C_l, C_h, [dstend, -16]
 208         ret
 209
 210         .p2align 4
 211 L(move_long):
 212         cbz     tmp1, 3f                /* tmp1 = dstin - src from the MEMMOVE entry; zero means nothing needs to move.  */
 213
 214         add     srcend, src, count      /* The branch here was taken before MEMCPY computed srcend/dstend.  */
 215         add     dstend, dstin, count
 216
 217         /* Align dstend to 16 byte alignment so that we don't cross cache line
 218            boundaries on both loads and stores.  There are more than 96 bytes
 219            to copy, so copy 16 bytes unaligned and then align.  The loop
 220            copies 64 bytes per iteration, working backwards, and loads one iteration ahead.  */
 221
 222         and     tmp1, dstend, 15        /* tmp1 = misalignment of dstend (0..15).  */
 223         ldp     D_l, D_h, [srcend, -16] /* Last 16 source bytes, stored unaligned at dstend - 16 below.  */
 224         sub     srcend, srcend, tmp1    /* Move srcend back so it stays in step with the aligned dstend.  */
 225         sub     count, count, tmp1      /* count = bytes between dstin and the aligned dstend.  */
 226         ldp     A_l, A_h, [srcend, -16]
 227         stp     D_l, D_h, [dstend, -16]
 228         ldp     B_l, B_h, [srcend, -32]
 229         ldp     C_l, C_h, [srcend, -48]
 230         ldp     D_l, D_h, [srcend, -64]!        /* Pre-index with writeback: srcend -= 64.  */
 231         sub     dstend, dstend, tmp1    /* dstend is now 16-byte aligned.  */
 232         subs    count, count, 128
 233         b.ls    2f                      /* No more than 128 bytes remain below the aligned dstend: skip the loop.  */
 234
 235         nop                             /* NOTE(review): presumably padding to position the loop label below -- confirm.  */
 236 1:
 237         stp     A_l, A_h, [dstend, -16] /* Store the 64 bytes loaded earlier while loading the next 64, moving downwards.  */
 238         ldp     A_l, A_h, [srcend, -16]
 239         stp     B_l, B_h, [dstend, -32]
 240         ldp     B_l, B_h, [srcend, -32]
 241         stp     C_l, C_h, [dstend, -48]
 242         ldp     C_l, C_h, [srcend, -48]
 243         stp     D_l, D_h, [dstend, -64]!        /* Pre-index with writeback: dstend -= 64.  */
 244         ldp     D_l, D_h, [srcend, -64]!        /* Pre-index with writeback: srcend -= 64.  */
 245         subs    count, count, 64
 246         b.hi    1b                      /* Continue while count is still above zero after the decrement.  */
 247
 248         /* Write the last full set of 64 bytes.  The remainder is at most 64
 249            bytes, so it is safe to always copy 64 bytes from the start even if
 250            there is just 1 byte left.  */
 251 2:
 252         ldp     G_l, G_h, [src, 48]     /* G_l/G_h alias count/dst, which are not read again from here on.  */
 253         stp     A_l, A_h, [dstend, -16] /* Stores of the pending A..D are interleaved with loads of the first 64 bytes.  */
 254         ldp     A_l, A_h, [src, 32]
 255         stp     B_l, B_h, [dstend, -32]
 256         ldp     B_l, B_h, [src, 16]
 257         stp     C_l, C_h, [dstend, -48]
 258         ldp     C_l, C_h, [src]
 259         stp     D_l, D_h, [dstend, -64]
 260         stp     G_l, G_h, [dstin, 48]   /* First 64 bytes, addressed from the start; may overlap bytes already stored.  */
 261         stp     A_l, A_h, [dstin, 32]
 262         stp     B_l, B_h, [dstin, 16]
 263         stp     C_l, C_h, [dstin]
 264 3:      ret
 265
 266 END (MEMCPY)
 267 libc_hidden_builtin_def (MEMCPY)