sysdeps/aarch64/memcpy.S

   1 /* Copyright (C) 2012-2016 Free Software Foundation, Inc.
   2
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library.  If not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* Assumptions:
  22  *
  23  * ARMv8-a, AArch64, unaligned accesses.
  24  *
  25  */
  26
  27 #define dstin   x0      /* Destination pointer as passed in; x0 is never written by the code below.  */
  28 #define src     x1      /* Source pointer.  */
  29 #define count   x2      /* Number of bytes to copy.  */
  30 #define dst     x3      /* As a pointer: 16-byte-aligned destination cursor in L(copy_long).  */
  31 #define srcend  x4      /* src + count, one past the last source byte.  */
  32 #define dstend  x5      /* dstin + count, one past the last destination byte.  */
  33 #define A_l     x6      /* A..D are 16-byte data pairs moved with ldp/stp.  */
  34 #define A_lw    w6      /* The w views are used for the 4-byte and 1-byte copies.  */
  35 #define A_h     x7
  36 #define A_hw    w7
  37 #define B_l     x8
  38 #define B_lw    w8
  39 #define B_h     x9
  40 #define C_l     x10
  41 #define C_h     x11
  42 #define D_l     x12
  43 #define D_h     x13
  44 #define E_l     src     /* E..G are extra data pairs that reuse pointer/count registers.  */
  45 #define E_h     count   /* Loading one of them destroys src, count, srcend or dst, so each  */
  46 #define F_l     srcend  /* is loaded only after the aliased value has been used for the last time.  */
  47 #define F_h     dst
  48 #define G_l     count
  49 #define G_h     dst
  50 #define tmp1    x14     /* Scratch register.  */
  51
  52 /* Copies are split into 3 main cases: small copies of up to 16 bytes,
  53    medium copies of 17..96 bytes which are fully unrolled. Large copies
  54    of more than 96 bytes align the destination and use an unrolled loop
  55    processing 64 bytes per iteration.
  56    In order to share code with memmove, small and medium copies read all
  57    data before writing, allowing any kind of overlap. So small, medium
  58    and large backwards memmoves are handled by falling through into memcpy.
  59    Overlapping large forward memmoves use a loop that copies backwards.
  60 */
  61
  62 ENTRY_ALIGN (memmove, 6)
  63         /* memmove entry: only a copy of more than 96 bytes whose destination starts inside the source is diverted.  */
  64         sub     tmp1, dstin, src        /* tmp1 = dstin - src; wraps to a large unsigned value when dstin < src.  */
  65         cmp     count, 96
  66         ccmp    tmp1, count, 2, hi      /* If count > 96 compare tmp1 with count; otherwise NZCV = 2 (C set), so b.lo is not taken.  */
  67         b.lo    L(move_long)            /* Taken when count > 96 and, unsigned, tmp1 < count.  */
  68
  69         /* Common case falls through into memcpy.  */
  70 END (memmove)
  71 libc_hidden_builtin_def (memmove)
  72 ENTRY (memcpy)
  73         /* Dispatch on count: 0..16 to L(copy16), 17..96 handled inline below, more than 96 to L(copy_long).  */
  74         prfm    PLDL1KEEP, [src]
  75         add     srcend, src, count      /* One past the last source byte.  */
  76         add     dstend, dstin, count    /* One past the last destination byte.  */
  77         cmp     count, 16
  78         b.ls    L(copy16)
  79         cmp     count, 96
  80         b.hi    L(copy_long)
  81
  82         /* Medium copies: 17..96 bytes.  */
  83         sub     tmp1, count, 1          /* tmp1 = 16..95: bit 6 set means count >= 65; otherwise bit 5 set means count >= 33.  */
  84         ldp     A_l, A_h, [src]         /* First 16 bytes; L(copy96) relies on these being loaded already.  */
  85         tbnz    tmp1, 6, L(copy96)
  86         ldp     D_l, D_h, [srcend, -16] /* Last 16 bytes.  */
  87         tbz     tmp1, 5, 1f             /* 17..32 bytes: the first and last 16 bytes cover the whole copy.  */
  88         ldp     B_l, B_h, [src, 16]     /* 33..64 bytes: also load source bytes 16..31.  */
  89         ldp     C_l, C_h, [srcend, -32] /* And the 16 bytes that end 16 before the end.  */
  90         stp     B_l, B_h, [dstin, 16]   /* On this path every load precedes the first store.  */
  91         stp     C_l, C_h, [dstend, -32]
  92 1:
  93         stp     A_l, A_h, [dstin]
  94         stp     D_l, D_h, [dstend, -16]
  95         ret
  96
  97         .p2align 4
  98         /* Small copies: 0..16 bytes.  */
  99 L(copy16):
 100         cmp     count, 8
 101         b.lo    1f                      /* Fewer than 8 bytes.  */
 102         ldr     A_l, [src]              /* 8..16 bytes: copy the first 8 and the last 8 bytes.  */
 103         ldr     A_h, [srcend, -8]       /* The two accesses overlap when count < 16.  */
 104         str     A_l, [dstin]
 105         str     A_h, [dstend, -8]
 106         ret
 107         .p2align 4
 108 1:
 109         tbz     count, 2, 1f            /* count < 8 here, so bit 2 set means 4..7 bytes.  */
 110         ldr     A_lw, [src]             /* 4..7 bytes: copy the first 4 and the last 4 bytes.  */
 111         ldr     A_hw, [srcend, -4]      /* The two accesses overlap when count < 8.  */
 112         str     A_lw, [dstin]
 113         str     A_hw, [dstend, -4]
 114         ret
 115
 116         /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
 117            byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 118 1:
 119         cbz     count, 2f               /* Nothing to copy.  */
 120         lsr     tmp1, count, 1          /* tmp1 = 0 for count 1, 1 for count 2 or 3.  */
 121         ldrb    A_lw, [src]             /* First byte.  */
 122         ldrb    A_hw, [srcend, -1]      /* Last byte.  */
 123         ldrb    B_lw, [src, tmp1]       /* Byte at index tmp1.  */
 124         strb    A_lw, [dstin]
 125         strb    B_lw, [dstin, tmp1]
 126         strb    A_hw, [dstend, -1]
 127 2:      ret
 128
 129         .p2align 4
 130         /* Copy 65..96 bytes.  Copy 64 bytes from the start and
 131            32 bytes from the end.  */
 132 L(copy96):
 133         ldp     B_l, B_h, [src, 16]     /* A_l/A_h already hold bytes 0..15, loaded before the branch to here.  */
 134         ldp     C_l, C_h, [src, 32]
 135         ldp     D_l, D_h, [src, 48]     /* Last use of src as a pointer.  */
 136         ldp     E_l, E_h, [srcend, -32] /* E aliases src and count; neither is read again on this path.  */
 137         ldp     F_l, F_h, [srcend, -16] /* F aliases srcend and dst: srcend is the base here and is then overwritten.  */
 138         stp     A_l, A_h, [dstin]       /* All loads are done; store 64 bytes from the start.  */
 139         stp     B_l, B_h, [dstin, 16]
 140         stp     C_l, C_h, [dstin, 32]
 141         stp     D_l, D_h, [dstin, 48]
 142         stp     E_l, E_h, [dstend, -32] /* Then 32 bytes from the end, overlapping the above when count < 96.  */
 143         stp     F_l, F_h, [dstend, -16]
 144         ret
 145
 146         /* Align DST to 16 byte alignment so that we don't cross cache line
 147            boundaries on both loads and stores.  There are at least 96 bytes
 148            to copy, so copy 16 bytes unaligned and then align.  The loop
 149            copies 64 bytes per iteration and prefetches one iteration ahead.  */
 150
 151         .p2align 4
 152 L(copy_long):
 153         and     tmp1, dstin, 15         /* tmp1 = misalignment of dstin (0..15).  */
 154         bic     dst, dstin, 15          /* dst = dstin rounded down to a 16-byte boundary.  */
 155         ldp     D_l, D_h, [src]         /* First 16 bytes, stored unaligned at dstin below.  */
 156         sub     src, src, tmp1          /* Rebase src so that [src, N] is the source data for [dst, N].  */
 157         add     count, count, tmp1      /* Count is now 16 too large.  */
 158         ldp     A_l, A_h, [src, 16]
 159         stp     D_l, D_h, [dstin]
 160         ldp     B_l, B_h, [src, 32]
 161         ldp     C_l, C_h, [src, 48]
 162         ldp     D_l, D_h, [src, 64]!    /* Pre-index: src += 64 before the load.  */
 163         subs    count, count, 128 + 16  /* Test and readjust count.  */
 164         b.ls    2f                      /* The 64 bytes held in A..D plus the 64-byte tail already finish the copy.  */
 165 1:
 166         stp     A_l, A_h, [dst, 16]     /* Each iteration stores the 64 bytes loaded previously  */
 167         ldp     A_l, A_h, [src, 16]     /* and loads the next 64 bytes.  */
 168         stp     B_l, B_h, [dst, 32]
 169         ldp     B_l, B_h, [src, 32]
 170         stp     C_l, C_h, [dst, 48]
 171         ldp     C_l, C_h, [src, 48]
 172         stp     D_l, D_h, [dst, 64]!    /* Pre-index: dst += 64.  */
 173         ldp     D_l, D_h, [src, 64]!    /* Pre-index: src += 64.  */
 174         subs    count, count, 64
 175         b.hi    1b                      /* Loop while count is still above zero after the subtraction.  */
 176
 177         /* Write the last full set of 64 bytes.  The remainder is at most 64
 178            bytes, so it is safe to always copy 64 bytes from the end even if
 179            there is just 1 byte left.  */
 180 2:
 181         ldp     E_l, E_h, [srcend, -64] /* E aliases src and count; neither is read again.  */
 182         stp     A_l, A_h, [dst, 16]     /* Store the 64 bytes still held in A..D.  */
 183         ldp     A_l, A_h, [srcend, -48]
 184         stp     B_l, B_h, [dst, 32]
 185         ldp     B_l, B_h, [srcend, -32]
 186         stp     C_l, C_h, [dst, 48]
 187         ldp     C_l, C_h, [srcend, -16]
 188         stp     D_l, D_h, [dst, 64]
 189         stp     E_l, E_h, [dstend, -64] /* Final 64 bytes, addressed from the end; may overlap bytes stored above.  */
 190         stp     A_l, A_h, [dstend, -48]
 191         stp     B_l, B_h, [dstend, -32]
 192         stp     C_l, C_h, [dstend, -16]
 193         ret
 194
 195         .p2align 4
 196 L(move_long):
 197         cbz     tmp1, 3f                /* tmp1 = dstin - src, set at the memmove entry; zero means src == dstin, so return.  */
 198
 199         add     srcend, src, count      /* One past the last source byte.  */
 200         add     dstend, dstin, count    /* One past the last destination byte.  */
 201
 202         /* Align dstend to 16 byte alignment so that we don't cross cache line
 203            boundaries on both loads and stores.  There are at least 96 bytes
 204            to copy, so copy 16 bytes unaligned and then align.  The loop
 205            copies 64 bytes per iteration and prefetches one iteration ahead.  */
 206
 207         and     tmp1, dstend, 15        /* tmp1 = misalignment of dstend (0..15).  */
 208         ldp     D_l, D_h, [srcend, -16] /* Last 16 bytes, stored unaligned at dstend - 16 below.  */
 209         sub     srcend, srcend, tmp1    /* Rebase srcend to match the aligned dstend computed below.  */
 210         sub     count, count, tmp1      /* Bytes from dstin up to the aligned dstend.  */
 211         ldp     A_l, A_h, [srcend, -16]
 212         stp     D_l, D_h, [dstend, -16]
 213         ldp     B_l, B_h, [srcend, -32]
 214         ldp     C_l, C_h, [srcend, -48]
 215         ldp     D_l, D_h, [srcend, -64]! /* Pre-index: srcend -= 64 before the load.  */
 216         sub     dstend, dstend, tmp1    /* dstend is now 16-byte aligned.  */
 217         subs    count, count, 128
 218         b.ls    2f                      /* The 64 bytes held in A..D plus the 64-byte tail already finish the copy.  */
 219
 220         nop                             /* 16th instruction (64 bytes) since the .p2align above; presumably pads the loop entry - TODO confirm.  */
 221 1:
 222         stp     A_l, A_h, [dstend, -16] /* Each iteration stores the 64 bytes loaded previously  */
 223         ldp     A_l, A_h, [srcend, -16] /* and loads the next 64 bytes, moving towards lower addresses.  */
 224         stp     B_l, B_h, [dstend, -32]
 225         ldp     B_l, B_h, [srcend, -32]
 226         stp     C_l, C_h, [dstend, -48]
 227         ldp     C_l, C_h, [srcend, -48]
 228         stp     D_l, D_h, [dstend, -64]! /* Pre-index: dstend -= 64.  */
 229         ldp     D_l, D_h, [srcend, -64]! /* Pre-index: srcend -= 64.  */
 230         subs    count, count, 64
 231         b.hi    1b                      /* Loop while count is still above zero after the subtraction.  */
 232
 233         /* Write the last full set of 64 bytes.  The remainder is at most 64
 234            bytes, so it is safe to always copy 64 bytes from the start even if
 235            there is just 1 byte left.  */
 236 2:
 237         ldp     G_l, G_h, [src, 48]     /* G aliases count and dst; neither is read again.  */
 238         stp     A_l, A_h, [dstend, -16] /* Store the 64 bytes still held in A..D.  */
 239         ldp     A_l, A_h, [src, 32]
 240         stp     B_l, B_h, [dstend, -32]
 241         ldp     B_l, B_h, [src, 16]
 242         stp     C_l, C_h, [dstend, -48]
 243         ldp     C_l, C_h, [src]
 244         stp     D_l, D_h, [dstend, -64]
 245         stp     G_l, G_h, [dstin, 48]   /* First 64 bytes, addressed from the start; may overlap bytes stored above.  */
 246         stp     A_l, A_h, [dstin, 32]
 247         stp     B_l, B_h, [dstin, 16]
 248         stp     C_l, C_h, [dstin]
 249 3:      ret
 250
 251 END (memcpy)
 252 libc_hidden_builtin_def (memcpy)