sysdeps/aarch64/memcpy.S

   1 /* Copyright (C) 2012-2016 Free Software Foundation, Inc.
   2
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library.  If not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* Assumptions:
  22  *
  23  * ARMv8-a, AArch64, unaligned accesses.
  24  *
  25  */
  26
   27 #define dstin   x0      /* Destination start; only used as a store base.  */
   28 #define src     x1      /* Source start; rebased and advanced in L(copy_long).  */
   29 #define count   x2      /* Number of bytes to copy.  */
   30 #define dst     x3      /* 16-byte aligned store cursor in L(copy_long).  */
   31 #define srcend  x4      /* src + count.  */
   32 #define dstend  x5      /* dstin + count.  */
   33 #define A_l     x6      /* A..D: register pairs, each holding 16 bytes of data.  */
   34 #define A_lw    w6      /* 32-bit view of A_l for the sub-8-byte copies.  */
   35 #define A_h     x7
   36 #define A_hw    w7      /* 32-bit view of A_h for the sub-8-byte copies.  */
   37 #define B_l     x8
   38 #define B_h     x9
   39 #define C_l     x10
   40 #define C_h     x11
   41 #define D_l     x12
   42 #define D_h     x13
   43 #define E_l     src     /* E, F and G are extra data pairs that alias the  */
   44 #define E_h     count   /* address/count registers.  Each is loaded only at a  */
   45 #define F_l     srcend  /* point where the aliased register is not read by  */
   46 #define F_h     dst     /* any later instruction on that path.  */
   47 #define G_l     count
   48 #define G_h     dst
   49 #define tmp1    x14     /* Scratch; carries dstin - src from memmove into L(move_long).  */
  50
   51 /* Copies are split into 3 main cases: small copies of up to 15 bytes,
   52    medium copies of 16..96 bytes which are fully unrolled. Large copies
   53    of more than 96 bytes align the destination and use an unrolled loop
   54    processing 64 bytes per iteration.
   55    In order to share code with memmove, small and medium copies read all
   56    data before writing, allowing any kind of overlap. So small, medium
   57    and large backwards memmoves are handled by falling through into memcpy.
   58    Overlapping large forward memmoves use a loop that copies backwards.
   59 */
  60
   61 ENTRY_ALIGN (memmove, 6)
   62
   63         sub     tmp1, dstin, src        /* tmp1 = dstin - src; still live in L(move_long).  */
   64         cmp     count, 96
   65         ccmp    tmp1, count, 2, hi      /* count > 96: compare tmp1 with count.  Otherwise NZCV = 2 (C set), so b.lo falls through.  */
   66         b.lo    L(move_long)            /* Taken when count > 96 and dstin - src < count (unsigned), i.e. dstin lies in [src, src + count).  */
   67
   68         /* Common case falls through into memcpy.  */
   69 END (memmove)
   70 libc_hidden_builtin_def (memmove)
   71 ENTRY (memcpy)
   72
   73         add     srcend, src, count      /* One past the last source byte.  */
   74         add     dstend, dstin, count    /* One past the last destination byte.  */
   75         cmp     count, 96
   76         b.hi    L(copy_long)            /* count > 96.  */
   77         cmp     count, 16
   78         b.hs    L(copy_medium)          /* 16 <= count <= 96.  */
   79
   80         /* Small copies: 0..15 bytes (count >= 16 has branched away above).  */
   81 L(copy16):
   82         tbz     count, 3, 1f            /* Bit 3 clear: count < 8.  */
   83         ldr     A_l, [src]              /* 8..15 bytes: first 8 and last 8 bytes, which may overlap.  */
   84         ldr     A_h, [srcend, -8]
   85         str     A_l, [dstin]            /* Both loads are done before the first store.  */
   86         str     A_h, [dstend, -8]
   87         ret
   88 1:
   89         tbz     count, 2, 1f            /* Bit 2 clear: count < 4.  */
   90         ldr     A_lw, [src]             /* 4..7 bytes: first 4 and last 4 bytes, which may overlap.  */
   91         ldr     A_hw, [srcend, -4]
   92         str     A_lw, [dstin]
   93         str     A_hw, [dstend, -4]
   94         ret
   95         .p2align 4
   96 1:
   97         cbz     count, 2f               /* count == 0: nothing to copy.  */
   98         ldrb    A_lw, [src]             /* 1..3 bytes: always copy the first byte...  */
   99         tbz     count, 1, 1f            /* Bit 1 clear: count == 1.  */
  100         ldrh    A_hw, [srcend, -2]      /* ...plus the last 2 bytes when count is 2 or 3.  */
  101         strh    A_hw, [dstend, -2]
  102 1:      strb    A_lw, [dstin]
  103 2:      ret
 104
  105         .p2align 4
  106         /* Medium copies: 16..96 bytes.  */
  107 L(copy_medium):
  108         ldp     A_l, A_h, [src]                 /* First 16 bytes; also used by L(copy96).  */
  109         tbnz    count, 6, L(copy96)             /* Bit 6 set: count is 64..96.  */
  110         ldp     D_l, D_h, [srcend, -16]         /* Last 16 bytes.  */
  111         tbz     count, 5, 1f                    /* Bit 5 clear: count is 16..31, so A and D cover it.  */
  112         ldp     B_l, B_h, [src, 16]             /* 32..63 bytes: 32 from the start and 32 from the end.  */
  113         ldp     C_l, C_h, [srcend, -32]
  114         stp     B_l, B_h, [dstin, 16]           /* All loads are done before the first store.  */
  115         stp     C_l, C_h, [dstend, -32]
  116 1:
  117         stp     A_l, A_h, [dstin]
  118         stp     D_l, D_h, [dstend, -16]
  119         ret
 120
  121         .p2align 4
  122         /* Copy 64..96 bytes.  Copy 64 bytes from the start and
  123            32 bytes from the end.  */
  124 L(copy96):
  125         ldp     B_l, B_h, [src, 16]     /* A_l/A_h already hold bytes 0..15, loaded in L(copy_medium).  */
  126         ldp     C_l, C_h, [src, 32]
  127         ldp     D_l, D_h, [src, 48]     /* Last read of src.  */
  128         ldp     E_l, E_h, [srcend, -32] /* E aliases src/count, which are overwritten here.  */
  129         ldp     F_l, F_h, [srcend, -16] /* F aliases srcend/dst: this load overwrites its own base register.  */
  130         stp     A_l, A_h, [dstin]       /* All loads are done before the first store.  */
  131         stp     B_l, B_h, [dstin, 16]
  132         stp     C_l, C_h, [dstin, 32]
  133         stp     D_l, D_h, [dstin, 48]
  134         stp     E_l, E_h, [dstend, -32]
  135         stp     F_l, F_h, [dstend, -16]
  136         ret
 137
  138         /* Align DST to 16 byte alignment so that we don't cross cache line
  139            boundaries on both loads and stores.  There are more than 96 bytes
  140            to copy, so copy 16 bytes unaligned and then align.  The loop
  141            copies 64 bytes per iteration and loads one iteration ahead.  */
  142
  143         .p2align 4
  144 L(copy_long):
  145         and     tmp1, dstin, 15         /* tmp1 = misalignment of dstin (0..15).  */
  146         bic     dst, dstin, 15          /* dst = dstin rounded down to a multiple of 16.  */
  147         ldp     D_l, D_h, [src]         /* First 16 bytes; stored unaligned at dstin below.  */
  148         sub     src, src, tmp1          /* Rebase src so that src - dst equals the original src - dstin.  */
  149         add     count, count, tmp1      /* Count is now 16 too large.  */
  150         ldp     A_l, A_h, [src, 16]
  151         stp     D_l, D_h, [dstin]
  152         ldp     B_l, B_h, [src, 32]
  153         ldp     C_l, C_h, [src, 48]
  154         ldp     D_l, D_h, [src, 64]!    /* Pre-index with writeback: src += 64.  */
  155         subs    count, count, 128 + 16  /* Test and readjust count.  */
  156         b.ls    2f                      /* A..D plus the final 64 bytes already cover everything.  */
  157 1:
  158         stp     A_l, A_h, [dst, 16]     /* Store the 64 bytes loaded earlier...  */
  159         ldp     A_l, A_h, [src, 16]     /* ...while loading the next 64 bytes.  */
  160         stp     B_l, B_h, [dst, 32]
  161         ldp     B_l, B_h, [src, 32]
  162         stp     C_l, C_h, [dst, 48]
  163         ldp     C_l, C_h, [src, 48]
  164         stp     D_l, D_h, [dst, 64]!    /* Writeback: dst += 64.  */
  165         ldp     D_l, D_h, [src, 64]!    /* Writeback: src += 64.  */
  166         subs    count, count, 64
  167         b.hi    1b
  168
  169         /* Write the last full set of 64 bytes.  The remainder is at most 64
  170            bytes, so it is safe to always copy 64 bytes from the end even if
  171            there is just 1 byte left.  */
  172 2:
  173         ldp     E_l, E_h, [srcend, -64] /* E aliases src/count, neither of which is read again.  */
  174         stp     A_l, A_h, [dst, 16]     /* Pending A..D stores are interleaved with the loads of the final 64 bytes.  */
  175         ldp     A_l, A_h, [srcend, -48]
  176         stp     B_l, B_h, [dst, 32]
  177         ldp     B_l, B_h, [srcend, -32]
  178         stp     C_l, C_h, [dst, 48]
  179         ldp     C_l, C_h, [srcend, -16]
  180         stp     D_l, D_h, [dst, 64]
  181         stp     E_l, E_h, [dstend, -64]
  182         stp     A_l, A_h, [dstend, -48]
  183         stp     B_l, B_h, [dstend, -32]
  184         stp     C_l, C_h, [dstend, -16]
  185         ret
 186
  187         .p2align 4
  188 L(move_long):
  189         cbz     tmp1, 3f                /* tmp1 = dstin - src from the memmove entry; zero means source and destination coincide.  */
  190
  191         add     srcend, src, count
  192         add     dstend, dstin, count
  193
  194         /* Align dstend to 16 byte alignment so that we don't cross cache line
  195            boundaries on both loads and stores.  There are more than 96 bytes
  196            to copy, so copy 16 bytes unaligned and then align.  The loop
  197            copies 64 bytes per iteration and loads one iteration ahead.  */
  198
  199         and     tmp1, dstend, 15        /* tmp1 = dstend modulo 16 (0..15).  */
  200         ldp     D_l, D_h, [srcend, -16] /* Last 16 bytes; stored unaligned at dstend - 16 below.  */
  201         sub     srcend, srcend, tmp1    /* Move srcend back by the same amount dstend is moved below.  */
  202         sub     count, count, tmp1
  203         ldp     A_l, A_h, [srcend, -16]
  204         stp     D_l, D_h, [dstend, -16]
  205         ldp     B_l, B_h, [srcend, -32]
  206         ldp     C_l, C_h, [srcend, -48]
  207         ldp     D_l, D_h, [srcend, -64]!        /* Pre-index with writeback: srcend -= 64.  */
  208         sub     dstend, dstend, tmp1    /* dstend is now 16-byte aligned.  */
  209         subs    count, count, 128
  210         b.ls    2f                      /* A..D plus the first 64 bytes already cover everything.  */
  211
  212         nop                             /* NOTE(review): presumably padding to position the loop label; confirm.  */
  213 1:
  214         stp     A_l, A_h, [dstend, -16] /* Store the 64 bytes loaded earlier, working downwards...  */
  215         ldp     A_l, A_h, [srcend, -16] /* ...while loading the next lower 64 bytes.  */
  216         stp     B_l, B_h, [dstend, -32]
  217         ldp     B_l, B_h, [srcend, -32]
  218         stp     C_l, C_h, [dstend, -48]
  219         ldp     C_l, C_h, [srcend, -48]
  220         stp     D_l, D_h, [dstend, -64]!        /* Writeback: dstend -= 64.  */
  221         ldp     D_l, D_h, [srcend, -64]!        /* Writeback: srcend -= 64.  */
  222         subs    count, count, 64
  223         b.hi    1b
  224
  225         /* Write the last full set of 64 bytes.  The remainder is at most 64
  226            bytes, so it is safe to always copy 64 bytes from the start even if
  227            there is just 1 byte left.  */
  228 2:
  229         ldp     G_l, G_h, [src, 48]     /* G aliases count/dst; count is not read again and dst is unused on this path.  */
  230         stp     A_l, A_h, [dstend, -16] /* Pending A..D stores are interleaved with the loads of the first 64 bytes.  */
  231         ldp     A_l, A_h, [src, 32]
  232         stp     B_l, B_h, [dstend, -32]
  233         ldp     B_l, B_h, [src, 16]
  234         stp     C_l, C_h, [dstend, -48]
  235         ldp     C_l, C_h, [src]
  236         stp     D_l, D_h, [dstend, -64]
  237         stp     G_l, G_h, [dstin, 48]
  238         stp     A_l, A_h, [dstin, 32]
  239         stp     B_l, B_h, [dstin, 16]
  240         stp     C_l, C_h, [dstin]
  241 3:      ret
  242
  243 END (memcpy)
  244 libc_hidden_builtin_def (memcpy)