sysdeps/aarch64/memcpy.S

   1 /* Copyright (C) 2012-2020 Free Software Foundation, Inc.
   2
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library.  If not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* Assumptions:
  22  *
  23  * ARMv8-a, AArch64, unaligned accesses.
  24  *
  25  */
  26
  27 #define dstin   x0      /* Destination pointer argument; no instruction in this file writes it.  */
  28 #define src     x1      /* Source pointer argument; moved by L(copy_long).  */
  29 #define count   x2      /* Byte count argument; adjusted by the large-copy paths.  */
  30 #define dst     x3      /* Working destination: dstin rounded down to 16 in L(copy_long).  */
  31 #define srcend  x4      /* src + count, one past the last source byte.  */
  32 #define dstend  x5      /* dstin + count, one past the last destination byte.  */
     /* A..H name 16-byte data pairs: X_l is the first and X_h the second
        register of each ldp/stp.  The *w names are the 32-bit views of the
        same registers, used for the 4-byte and single-byte accesses.  */
  33 #define A_l     x6
  34 #define A_lw    w6
  35 #define A_h     x7
  36 #define A_hw    w7
  37 #define B_l     x8
  38 #define B_lw    w8
  39 #define B_h     x9
  40 #define C_l     x10
  41 #define C_h     x11
  42 #define D_l     x12
  43 #define D_h     x13
  44 #define E_l     x14
  45 #define E_h     x15
  46 #define F_l     x16
  47 #define F_h     x17
     /* G and H are not extra registers: they reuse count, dst, src and
        srcend, so loading them destroys those values.  They may only be
        loaded once the pointer/count they replace is no longer needed.  */
  48 #define G_l     count
  49 #define G_h     dst
  50 #define H_l     src
  51 #define H_h     srcend
  52 #define tmp1    x14     /* Scratch; same register as E_l, so the two must never be live together.  */
  53
  54 /* Copies are split into 3 main cases: small copies of up to 32 bytes,
  55    medium copies of 33..128 bytes which are fully unrolled. Large copies
  56    of more than 128 bytes align the destination and use an unrolled loop
  57    processing 64 bytes per iteration.
  58    In order to share code with memmove, small and medium copies read all
  59    data before writing, allowing any kind of overlap. So small, medium
  60    and large backwards memmoves are handled by falling through into memcpy.
  61    Overlapping large forward memmoves use a loop that copies backwards.
  62 */
  63
  64 #ifndef MEMMOVE
     /* Default symbol name for the memmove entry point, used only when
        MEMMOVE has not been defined before this point.
        NOTE(review): presumably this lets another file define the name and
        then include this one to build a renamed variant -- confirm.  */
  65 # define MEMMOVE memmove
  66 #endif
  67 #ifndef MEMCPY
     /* Same default/override arrangement for the memcpy entry point.  */
  68 # define MEMCPY memcpy
  69 #endif
  70
  71 ENTRY_ALIGN (MEMMOVE, 6)
  72
             /* memmove entry point: dstin = destination, src = source,
                count = length in bytes.  Only one case is treated specially:
                a copy of more than 128 bytes whose destination starts inside
                the source range goes to L(move_long).  Every other call runs
                on into the memcpy code that follows.
                NOTE(review): ENTRY_ALIGN, DELOUSE, END and
                libc_hidden_builtin_def are macros not defined in this file
                (presumably from <sysdep.h>).  DELOUSE (n) looks like a
                fix-up of argument register n, and the 6 like a log2
                alignment -- confirm both there.  */
  73         DELOUSE (0)
  74         DELOUSE (1)
  75         DELOUSE (2)
  76
  77         sub     tmp1, dstin, src        /* tmp1 = dstin - src; wraps to a large unsigned value if dstin < src.  */
  78         cmp     count, 128
             /* If count > 128 (hi), set the flags from comparing tmp1 with
                count.  Otherwise load NZCV with 2, i.e. carry set, which
                makes the b.lo below fall through.  */
  79         ccmp    tmp1, count, 2, hi
             /* Taken only when count > 128 and dstin - src < count (unsigned).
                tmp1 stays live: L(move_long) tests it for zero.  */
  80         b.lo    L(move_long)
  81
  82         /* Common case falls through into memcpy.  There is no ret before
                END, so execution continues with the code after ENTRY (MEMCPY).  */
  83 END (MEMMOVE)
  84 libc_hidden_builtin_def (MEMMOVE)
  85 ENTRY (MEMCPY)
  86
             /* memcpy entry point, also reached by falling through from
                MEMMOVE.  Dispatch on count:
                  0..32    -> L(copy32)
                  33..64   -> copied inline below
                  65..128  -> L(copy128), after the first four loads below
                  > 128    -> L(copy_long)
                srcend and dstend are set up here for all of those paths.  */
  87         DELOUSE (0)
  88         DELOUSE (1)
  89         DELOUSE (2)
  90
  91         prfm    PLDL1KEEP, [src]        /* Prefetch hint for the start of the source.  */
  92         add     srcend, src, count      /* One past the last source byte.  */
  93         add     dstend, dstin, count    /* One past the last destination byte.  */
  94         cmp     count, 32
  95         b.ls    L(copy32)               /* count <= 32 (unsigned).  */
  96         cmp     count, 128
  97         b.hi    L(copy_long)            /* count > 128 (unsigned).  */
  98
  99         /* Medium copies: 33..128 bytes.  */
             /* Load the first 32 and the last 32 bytes before storing
                anything.  For count < 64 the two halves overlap in the
                middle, which only means some bytes are copied twice.  */
 100         ldp     A_l, A_h, [src]
 101         ldp     B_l, B_h, [src, 16]
 102         ldp     C_l, C_h, [srcend, -32]
 103         ldp     D_l, D_h, [srcend, -16]
 104         cmp     count, 64
 105         b.hi    L(copy128)              /* 65..128 bytes; A..D stay live for L(copy128).  */
             /* 33..64 bytes: the four pairs already cover the whole buffer.  */
 106         stp     A_l, A_h, [dstin]
 107         stp     B_l, B_h, [dstin, 16]
 108         stp     C_l, C_h, [dstend, -32]
 109         stp     D_l, D_h, [dstend, -16]
 110         ret
 111
 112         .p2align 4
 113         /* Small copies: 0..32 bytes.  */
 114 L(copy32):
             /* Entered with count <= 32 and srcend/dstend already set.  Each
                size class copies one chunk from the start and one chunk that
                ends at the last byte; the two chunks may overlap.  Within a
                class all loads come before the stores.  The numeric label 1
                is reused: every 1f refers to the next 1: below it.  */
 115         /* 16-32 bytes.  */
 116         cmp     count, 16
 117         b.lo    1f
 118         ldp     A_l, A_h, [src]
 119         ldp     B_l, B_h, [srcend, -16]
 120         stp     A_l, A_h, [dstin]
 121         stp     B_l, B_h, [dstend, -16]
 122         ret
 123         .p2align 4
 124 1:
 125         /* 8-15 bytes.  */
 126         tbz     count, 3, 1f            /* count < 16 here, so bit 3 clear means count < 8.  */
 127         ldr     A_l, [src]
 128         ldr     A_h, [srcend, -8]
 129         str     A_l, [dstin]
 130         str     A_h, [dstend, -8]
 131         ret
 132         .p2align 4
 133 1:
 134         /* 4-7 bytes.  */
 135         tbz     count, 2, 1f            /* count < 8 here, so bit 2 clear means count < 4.  */
 136         ldr     A_lw, [src]
 137         ldr     A_hw, [srcend, -4]
 138         str     A_lw, [dstin]
 139         str     A_hw, [dstend, -4]
 140         ret
 141
 142         /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
 143            byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 144 1:
 145         cbz     count, 2f               /* count == 0: nothing to copy.  */
 146         lsr     tmp1, count, 1          /* Middle index: 0 for count 1, 1 for count 2 or 3.  */
 147         ldrb    A_lw, [src]             /* First byte.  */
 148         ldrb    A_hw, [srcend, -1]      /* Last byte.  */
 149         ldrb    B_lw, [src, tmp1]       /* Middle byte.  */
 150         strb    A_lw, [dstin]
 151         strb    B_lw, [dstin, tmp1]
 152         strb    A_hw, [dstend, -1]
 153 2:      ret
 154
 155         .p2align 4
 156         /* Copy 65..128 bytes.  Copy 64 bytes from the start and
 157            64 bytes from the end.  */
 158 L(copy128):
             /* Reached from the medium path with A, B = first 32 bytes and
                C, D = last 32 bytes already loaded.  E, F complete the first
                64 bytes; G, H complete the last 64.  */
 159         ldp     E_l, E_h, [src, 32]
 160         ldp     F_l, F_h, [src, 48]
             /* G aliases count/dst and H aliases src/srcend, so the next two
                loads destroy those registers.  The H load must stay last: it
                overwrites its own base register, and no later instruction
                reads src or srcend.  */
 161         ldp     G_l, G_h, [srcend, -64]
 162         ldp     H_l, H_h, [srcend, -48]
             /* All 128 bytes are in registers; only dstin and dstend are
                needed from here on.  */
 163         stp     A_l, A_h, [dstin]
 164         stp     B_l, B_h, [dstin, 16]
 165         stp     E_l, E_h, [dstin, 32]
 166         stp     F_l, F_h, [dstin, 48]
 167         stp     G_l, G_h, [dstend, -64]
 168         stp     H_l, H_h, [dstend, -48]
 169         stp     C_l, C_h, [dstend, -32]
 170         stp     D_l, D_h, [dstend, -16]
 171         ret
 172
 173         /* Align DST to 16 byte alignment so that we don't cross cache line
 174            boundaries on both loads and stores.  There are at least 128 bytes
 175            to copy, so copy 16 bytes unaligned and then align.  The loop
 176            copies 64 bytes per iteration and prefetches one iteration ahead.  */
 177
 178         .p2align 4
 179 L(copy_long):
             /* Entered with count > 128 and srcend/dstend set.  "Prefetches"
                above refers to the ldp loads, which run one 64-byte block
                ahead of the matching stp stores.  */
 180         and     tmp1, dstin, 15         /* tmp1 = misalignment of dstin (0..15).  */
 181         bic     dst, dstin, 15          /* dst = dstin rounded down to a multiple of 16.  */
 182         ldp     D_l, D_h, [src]         /* First 16 bytes; stored unaligned at dstin below.  */
 183         sub     src, src, tmp1          /* Back src up as well, so src + k pairs with dst + k.  */
 184         add     count, count, tmp1      /* Count is now 16 too large.  */
 185         ldp     A_l, A_h, [src, 16]
 186         stp     D_l, D_h, [dstin]
 187         ldp     B_l, B_h, [src, 32]
 188         ldp     C_l, C_h, [src, 48]
 189         ldp     D_l, D_h, [src, 64]!    /* Pre-index writeback: src += 64, then load.  */
             /* A..D now hold the 64 bytes destined for dst + 16 .. dst + 79.
                The 16 corrects the earlier excess; the 128 leaves count > 0
                only if more than 64 bytes remain beyond the block in A..D.  */
 190         subs    count, count, 128 + 16  /* Test and readjust count.  */
 191         b.ls    L(last64)
 192 L(loop64):
             /* Each pass stores the 64 bytes loaded on the previous pass and
                loads the next 64.  The last stp/ldp pair advances dst and src
                by 64 through pre-index writeback.  */
 193         stp     A_l, A_h, [dst, 16]
 194         ldp     A_l, A_h, [src, 16]
 195         stp     B_l, B_h, [dst, 32]
 196         ldp     B_l, B_h, [src, 32]
 197         stp     C_l, C_h, [dst, 48]
 198         ldp     C_l, C_h, [src, 48]
 199         stp     D_l, D_h, [dst, 64]!
 200         ldp     D_l, D_h, [src, 64]!
 201         subs    count, count, 64
 202         b.hi    L(loop64)               /* Repeat while more than 64 bytes remain beyond the block just loaded.  */
 203
 204         /* Write the last full set of 64 bytes.  The remainder is at most 64
 205            bytes, so it is safe to always copy 64 bytes from the end even if
 206            there is just 1 byte left.  */
 207 L(last64):
             /* Stores of the pending A..D are interleaved with loads of the
                final 64 source bytes, addressed from srcend.  E_l shares x14
                with tmp1, which is no longer needed here.  */
 208         ldp     E_l, E_h, [srcend, -64]
 209         stp     A_l, A_h, [dst, 16]
 210         ldp     A_l, A_h, [srcend, -48]
 211         stp     B_l, B_h, [dst, 32]
 212         ldp     B_l, B_h, [srcend, -32]
 213         stp     C_l, C_h, [dst, 48]
 214         ldp     C_l, C_h, [srcend, -16]
 215         stp     D_l, D_h, [dst, 64]
 216         stp     E_l, E_h, [dstend, -64]
 217         stp     A_l, A_h, [dstend, -48]
 218         stp     B_l, B_h, [dstend, -32]
 219         stp     C_l, C_h, [dstend, -16]
 220         ret
 221
 222         .p2align 4
 223 L(move_long):
             /* Reached only from MEMMOVE, with count > 128 and
                tmp1 = dstin - src < count (unsigned): the destination starts
                inside the source range.  The copy therefore runs from the end
                of the buffers towards the start.  */
 224         cbz     tmp1, 3f                /* dstin == src: nothing to do.  */
 225
             /* MEMMOVE branched here before reaching the memcpy prologue, so
                the end pointers are computed here.  */
 226         add     srcend, src, count
 227         add     dstend, dstin, count
 228
 229         /* Align dstend to 16 byte alignment so that we don't cross cache line
 230            boundaries on both loads and stores.  There are at least 128 bytes
 231            to copy, so copy 16 bytes unaligned and then align.  The loop
 232            copies 64 bytes per iteration and prefetches one iteration ahead.  */
 233
 234         and     tmp1, dstend, 15        /* tmp1 = bytes by which dstend exceeds a 16-byte boundary.  */
 235         ldp     D_l, D_h, [srcend, -16] /* Last 16 bytes; stored unaligned below.  */
 236         sub     srcend, srcend, tmp1    /* Keep srcend paired with the aligned dstend.  */
 237         sub     count, count, tmp1      /* Bytes that lie below the aligned dstend.  */
 238         ldp     A_l, A_h, [srcend, -16]
 239         stp     D_l, D_h, [dstend, -16] /* dstend is still the unaligned end here.  */
 240         ldp     B_l, B_h, [srcend, -32]
 241         ldp     C_l, C_h, [srcend, -48]
 242         ldp     D_l, D_h, [srcend, -64]!        /* Pre-index writeback: srcend -= 64, then load.  */
 243         sub     dstend, dstend, tmp1    /* dstend is now 16-byte aligned.  */
             /* A..D hold the 64 bytes just below the aligned dstend.  Skip the
                loop if at most 64 bytes remain beyond them.  */
 244         subs    count, count, 128
 245         b.ls    2f
 246
 247         nop                             /* NOTE(review): presumably padding to position the loop label -- confirm.  */
 248 1:
             /* Mirror image of the forward loop: store the previous 64 bytes,
                load the next 64, stepping dstend and srcend down by 64 through
                pre-index writeback.  */
 249         stp     A_l, A_h, [dstend, -16]
 250         ldp     A_l, A_h, [srcend, -16]
 251         stp     B_l, B_h, [dstend, -32]
 252         ldp     B_l, B_h, [srcend, -32]
 253         stp     C_l, C_h, [dstend, -48]
 254         ldp     C_l, C_h, [srcend, -48]
 255         stp     D_l, D_h, [dstend, -64]!
 256         ldp     D_l, D_h, [srcend, -64]!
 257         subs    count, count, 64
 258         b.hi    1b                      /* Repeat while more than 64 bytes remain beyond the block just loaded.  */
 259
 260         /* Write the last full set of 64 bytes.  The remainder is at most 64
 261            bytes, so it is safe to always copy 64 bytes from the start even if
 262            there is just 1 byte left.  */
 263 2:
             /* Stores of the pending A..D are interleaved with loads of the
                first 64 source bytes, addressed from the unmodified src.
                G aliases count/dst; count is not read again and dst is not
                used on this path.  */
 264         ldp     G_l, G_h, [src, 48]
 265         stp     A_l, A_h, [dstend, -16]
 266         ldp     A_l, A_h, [src, 32]
 267         stp     B_l, B_h, [dstend, -32]
 268         ldp     B_l, B_h, [src, 16]
 269         stp     C_l, C_h, [dstend, -48]
 270         ldp     C_l, C_h, [src]
 271         stp     D_l, D_h, [dstend, -64]
 272         stp     G_l, G_h, [dstin, 48]
 273         stp     A_l, A_h, [dstin, 32]
 274         stp     B_l, B_h, [dstin, 16]
 275         stp     C_l, C_h, [dstin]
 276 3:      ret
 277
 278 END (MEMCPY)
 279 libc_hidden_builtin_def (MEMCPY)