sysdeps/aarch64/memcpy.S

   1 /* Copyright (C) 2012-2016 Free Software Foundation, Inc.
   2
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library.  If not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* Assumptions:
  22  *
  23  * ARMv8-a, AArch64, unaligned accesses.
  24  *
  25  */
  26
  27 #define dstin   x0  /* Destination argument; never written, so x0 still holds it at every ret.  */
  28 #define src     x1  /* Source argument; L(copy_long) rewinds and then advances it.  */
  29 #define count   x2  /* Byte count argument; reused as the remaining-bytes counter in the long loops.  */
  30 #define dst     x3  /* dstin rounded down to 16 bytes; store cursor in L(copy_long).  */
  31 #define srcend  x4  /* src + count; load cursor in L(move_long).  */
  32 #define dstend  x5  /* dstin + count; store cursor in L(move_long).  */
  33 #define A_l     x6  /* A..D: four 16-byte data register pairs (_l and _h are the two halves of a pair).  */
  34 #define A_lw    w6  /* 32-bit view of A_l, used for 4-byte and 1-byte accesses.  */
  35 #define A_h     x7
  36 #define A_hw    w7  /* 32-bit view of A_h.  */
  37 #define B_l     x8
  38 #define B_lw    w8  /* 32-bit view of B_l.  */
  39 #define B_h     x9
  40 #define C_l     x10
  41 #define C_h     x11
  42 #define D_l     x12
  43 #define D_h     x13
  44 #define E_l     src     /* E, F and G are extra data pairs that alias pointer/count registers:  */
  45 #define E_h     count   /* loading into them overwrites src, count, srcend or dst, so they  */
  46 #define F_l     srcend  /* may only be loaded once those values are no longer needed.  */
  47 #define F_h     dst
  48 #define G_l     count
  49 #define G_h     dst
  50 #define tmp1    x14  /* Scratch: dstin - src, count - 1, count / 2 or an alignment offset, depending on the path.  */
  51
  52 /* Copies are split into 3 main cases: small copies of up to 16 bytes,
  53    medium copies of 17..96 bytes which are fully unrolled. Large copies
  54    of more than 96 bytes align the destination and use an unrolled loop
  55    processing 64 bytes per iteration.
  56    In order to share code with memmove, small and medium copies read all
  57    data before writing, allowing any kind of overlap. So small, medium
  58    and large backwards memmoves are handled by falling through into memcpy.
  59    Overlapping large forward memmoves use a loop that copies backwards.
  60 */
  61
  62 ENTRY_ALIGN (memmove, 6)
  63         /* In: x0 = dstin, x1 = src, x2 = count.  ENTRY_ALIGN and DELOUSE are defined outside this file; presumably 6 is log2 of the entry alignment and DELOUSE (n) sanitises argument register n -- TODO confirm in sysdep.h.  */
  64         DELOUSE (0)
  65         DELOUSE (1)
  66         DELOUSE (2)
  67         /* Only a copy of more than 96 bytes whose destination starts inside the source range needs the backwards loop.  */
  68         sub     tmp1, dstin, src        /* tmp1 = dstin - src, compared as unsigned below; wraps to a huge value when dstin < src.  */
  69         cmp     count, 96
  70         ccmp    tmp1, count, 2, hi      /* If count > 96: flags from tmp1 - count.  Otherwise NZCV = 0b0010 (C set), so b.lo falls through.  */
  71         b.lo    L(move_long)            /* Taken only when count > 96 and dstin - src < count (unsigned).  */
  72
  73         /* Common case falls through into memcpy.  */
  74 END (memmove)
  75 libc_hidden_builtin_def (memmove)
  76 ENTRY (memcpy)
  77         /* In: x0 = dstin, x1 = src, x2 = count.  memmove falls through to here for every case except L(move_long).  */
  78         DELOUSE (0)
  79         DELOUSE (1)
  80         DELOUSE (2)
  81
  82         prfm    PLDL1KEEP, [src]
  83         add     srcend, src, count      /* One past the last source byte.  */
  84         add     dstend, dstin, count    /* One past the last destination byte.  */
  85         cmp     count, 16
  86         b.ls    L(copy16)               /* count <= 16.  */
  87         cmp     count, 96
  88         b.hi    L(copy_long)            /* count > 96.  */
  89
  90         /* Medium copies: 17..96 bytes.  */
  91         sub     tmp1, count, 1          /* tmp1 is in 16..95, so its bits 6 and 5 classify the size.  */
  92         ldp     A_l, A_h, [src]         /* Bytes 0..15; L(copy96) relies on this load too.  */
  93         tbnz    tmp1, 6, L(copy96)      /* Bit 6 set: count is 65..96.  */
  94         ldp     D_l, D_h, [srcend, -16] /* Last 16 bytes.  */
  95         tbz     tmp1, 5, 1f             /* Bit 5 clear: count is 17..32, the two pairs already loaded cover it.  */
  96         ldp     B_l, B_h, [src, 16]     /* count is 33..64: first 32 plus last 32 bytes.  */
  97         ldp     C_l, C_h, [srcend, -32]
  98         stp     B_l, B_h, [dstin, 16]   /* Every load on this path has been issued before this first store.  */
  99         stp     C_l, C_h, [dstend, -32]
 100 1:
 101         stp     A_l, A_h, [dstin]       /* The head and tail stores may overlap each other in the middle.  */
 102         stp     D_l, D_h, [dstend, -16]
 103         ret
 104
 105         .p2align 4
 106         /* Small copies: 0..16 bytes.  */
 107 L(copy16):
 108         cmp     count, 8
 109         b.lo    1f
 110         ldr     A_l, [src]              /* 8..16 bytes: first 8 and last 8 bytes, possibly overlapping.  */
 111         ldr     A_h, [srcend, -8]
 112         str     A_l, [dstin]            /* Both loads precede both stores.  */
 113         str     A_h, [dstend, -8]
 114         ret
 115         .p2align 4
 116 1:
 117         tbz     count, 2, 1f            /* count < 8 here; bit 2 clear means 0..3 bytes.  */
 118         ldr     A_lw, [src]             /* 4..7 bytes: first 4 and last 4 bytes, possibly overlapping.  */
 119         ldr     A_hw, [srcend, -4]
 120         str     A_lw, [dstin]
 121         str     A_hw, [dstend, -4]
 122         ret
 123
 124         /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
 125            byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 126 1:
 127         cbz     count, 2f               /* Nothing to copy.  */
 128         lsr     tmp1, count, 1          /* Middle index: 0 for count == 1, 1 for count == 2 or 3.  */
 129         ldrb    A_lw, [src]             /* First byte.  */
 130         ldrb    A_hw, [srcend, -1]      /* Last byte.  */
 131         ldrb    B_lw, [src, tmp1]       /* Middle byte.  */
 132         strb    A_lw, [dstin]
 133         strb    B_lw, [dstin, tmp1]
 134         strb    A_hw, [dstend, -1]
 135 2:      ret
 136
 137         .p2align 4
 138         /* Copy 65..96 bytes (bit 6 of count - 1 is set).  Copy 64 bytes from
 139            the start and 32 bytes from the end.  A_l/A_h were loaded by the caller.  */
 140 L(copy96):
 141         ldp     B_l, B_h, [src, 16]
 142         ldp     C_l, C_h, [src, 32]
 143         ldp     D_l, D_h, [src, 48]
 144         ldp     E_l, E_h, [srcend, -32] /* E aliases src and count: both are overwritten here, after their last use.  */
 145         ldp     F_l, F_h, [srcend, -16] /* F aliases srcend and dst: srcend is the base of this load and is overwritten by it.  */
 146         stp     A_l, A_h, [dstin]       /* All 96 bytes are in registers before this first store.  */
 147         stp     B_l, B_h, [dstin, 16]
 148         stp     C_l, C_h, [dstin, 32]
 149         stp     D_l, D_h, [dstin, 48]
 150         stp     E_l, E_h, [dstend, -32] /* The tail stores may overlap the head stores.  */
 151         stp     F_l, F_h, [dstend, -16]
 152         ret
 153
 154         /* Align DST to 16 byte alignment so that we don't cross cache line
 155            boundaries on both loads and stores.  There are more than 96 bytes
 156            to copy, so copy 16 bytes unaligned and then align.  The loop
 157            copies 64 bytes per iteration, loading one iteration ahead of its stores.  */
 158
 159         .p2align 4
 160 L(copy_long):
 161         and     tmp1, dstin, 15         /* tmp1 = offset of dstin within its 16-byte block (0..15).  */
 162         bic     dst, dstin, 15          /* dst = dstin rounded down to a 16-byte boundary.  */
 163         ldp     D_l, D_h, [src]         /* First 16 bytes, stored unaligned below.  */
 164         sub     src, src, tmp1          /* Rewind src by the same offset so [src, N] supplies the data for [dst, N].  */
 165         add     count, count, tmp1      /* Count is now 16 too large.  */
 166         ldp     A_l, A_h, [src, 16]
 167         stp     D_l, D_h, [dstin]       /* Unaligned head store; the aligned stores start at dst + 16.  */
 168         ldp     B_l, B_h, [src, 32]
 169         ldp     C_l, C_h, [src, 48]
 170         ldp     D_l, D_h, [src, 64]!    /* Pre-index writeback: src += 64.  A..D now hold the data for dst + 16 .. dst + 79.  */
 171         subs    count, count, 128 + 16  /* Test and readjust count.  */
 172         b.ls    2f                      /* At most 64 bytes lie beyond the 64 already loaded: skip the loop.  */
 173 1:
 174         stp     A_l, A_h, [dst, 16]     /* Each store writes data loaded on the previous pass; the paired load refills the register pair.  */
 175         ldp     A_l, A_h, [src, 16]
 176         stp     B_l, B_h, [dst, 32]
 177         ldp     B_l, B_h, [src, 32]
 178         stp     C_l, C_h, [dst, 48]
 179         ldp     C_l, C_h, [src, 48]
 180         stp     D_l, D_h, [dst, 64]!    /* Writeback: dst += 64.  */
 181         ldp     D_l, D_h, [src, 64]!    /* Writeback: src += 64.  */
 182         subs    count, count, 64
 183         b.hi    1b                      /* Loop while more than 64 bytes remain beyond the loaded data.  */
 184
 185         /* Write the last full set of 64 bytes.  The remainder is at most 64
 186            bytes, so it is safe to always copy 64 bytes from the end even if
 187            there is just 1 byte left.  */
 188 2:
 189         ldp     E_l, E_h, [srcend, -64] /* E aliases src and count, which are not read again on this path.  */
 190         stp     A_l, A_h, [dst, 16]     /* Drain the pending 64 bytes while loading the final 64 from the end.  */
 191         ldp     A_l, A_h, [srcend, -48]
 192         stp     B_l, B_h, [dst, 32]
 193         ldp     B_l, B_h, [srcend, -32]
 194         stp     C_l, C_h, [dst, 48]
 195         ldp     C_l, C_h, [srcend, -16]
 196         stp     D_l, D_h, [dst, 64]
 197         stp     E_l, E_h, [dstend, -64] /* The final 64 bytes are addressed from dstend and may overlap the stores above.  */
 198         stp     A_l, A_h, [dstend, -48]
 199         stp     B_l, B_h, [dstend, -32]
 200         stp     C_l, C_h, [dstend, -16]
 201         ret
 202
 203         .p2align 4
 204 L(move_long):
 205         cbz     tmp1, 3f                /* tmp1 = dstin - src from memmove; zero means source and destination coincide, nothing to do.  */
 206
 207         add     srcend, src, count
 208         add     dstend, dstin, count
 209
 210         /* Align dstend to 16 byte alignment so that we don't cross cache line
 211            boundaries on both loads and stores.  There are more than 96 bytes
 212            to copy, so copy 16 bytes unaligned and then align.  The loop
 213            copies 64 bytes per iteration, loading one iteration ahead of its stores.  */
 214
 215         and     tmp1, dstend, 15        /* tmp1 = offset of dstend within its 16-byte block (0..15).  */
 216         ldp     D_l, D_h, [srcend, -16] /* Last 16 bytes, stored unaligned below.  */
 217         sub     srcend, srcend, tmp1    /* Move srcend down by the same offset that dstend is moved below.  */
 218         sub     count, count, tmp1      /* count = bytes between dstin and the aligned dstend.  */
 219         ldp     A_l, A_h, [srcend, -16]
 220         stp     D_l, D_h, [dstend, -16] /* Unaligned tail store; dstend is still the original end here.  */
 221         ldp     B_l, B_h, [srcend, -32]
 222         ldp     C_l, C_h, [srcend, -48]
 223         ldp     D_l, D_h, [srcend, -64]!        /* Pre-index writeback: srcend -= 64.  */
 224         sub     dstend, dstend, tmp1    /* dstend is now 16-byte aligned.  */
 225         subs    count, count, 128
 226         b.ls    2f                      /* At most 64 bytes lie below the 64 already loaded: skip the loop.  */
 227
 228         nop                             /* Padding: the 16th instruction since the 16-byte-aligned L(move_long), so the loop starts on a 16-byte boundary.  */
 229 1:
 230         stp     A_l, A_h, [dstend, -16] /* Each store writes data loaded on the previous pass; the paired load refills the register pair.  */
 231         ldp     A_l, A_h, [srcend, -16]
 232         stp     B_l, B_h, [dstend, -32]
 233         ldp     B_l, B_h, [srcend, -32]
 234         stp     C_l, C_h, [dstend, -48]
 235         ldp     C_l, C_h, [srcend, -48]
 236         stp     D_l, D_h, [dstend, -64]!        /* Writeback: dstend -= 64.  */
 237         ldp     D_l, D_h, [srcend, -64]!        /* Writeback: srcend -= 64.  */
 238         subs    count, count, 64
 239         b.hi    1b                      /* Loop while more than 64 bytes remain below the loaded data.  */
 240
 241         /* Write the last full set of 64 bytes.  The remainder is at most 64
 242            bytes, so it is safe to always copy 64 bytes from the start even if
 243            there is just 1 byte left.  */
 244 2:
 245         ldp     G_l, G_h, [src, 48]     /* G aliases count and dst, which are not read again on this path.  */
 246         stp     A_l, A_h, [dstend, -16] /* Drain the pending 64 bytes while loading the first 64 from the start.  */
 247         ldp     A_l, A_h, [src, 32]
 248         stp     B_l, B_h, [dstend, -32]
 249         ldp     B_l, B_h, [src, 16]
 250         stp     C_l, C_h, [dstend, -48]
 251         ldp     C_l, C_h, [src]
 252         stp     D_l, D_h, [dstend, -64]
 253         stp     G_l, G_h, [dstin, 48]   /* The first 64 bytes are addressed from dstin and may overlap the stores above.  */
 254         stp     A_l, A_h, [dstin, 32]
 255         stp     B_l, B_h, [dstin, 16]
 256         stp     C_l, C_h, [dstin]
 257 3:      ret
 258
 259 END (memcpy)
 260 libc_hidden_builtin_def (memcpy)