sysdeps/aarch64/strcpy.S

   1 /* strcpy/stpcpy - copy a string returning pointer to start/end.
   2    Copyright (C) 2013-2017 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 /* To build as stpcpy, define BUILD_STPCPY before compiling this file.
  20
  21    To test the page crossing code path more thoroughly, compile with
  22    -DSTRCPY_TEST_PAGE_CROSS - this will force all unaligned copies through
  23    the slower entry path.  This option is not intended for production use.  */
  24
  25 #include <sysdep.h>
  26
  27 /* Assumptions:
  28  *
  29  * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
  30  */
  31
  32 /* Arguments and results.  */
  33 #define dstin           x0
  34 #define srcin           x1
  35
  36 /* Locals and temporaries.  */
  37 #define src             x2
  38 #define dst             x3
  39 #define data1           x4
  40 #define data1w          w4
  41 #define data2           x5
  42 #define data2w          w5
  43 #define has_nul1        x6
  44 #define has_nul2        x7
  45 #define tmp1            x8
  46 #define tmp2            x9
  47 #define tmp3            x10
  48 #define tmp4            x11
  49 #define zeroones        x12
  50 #define data1a          x13
  51 #define data2a          x14
  52 #define pos             x15
  53 #define len             x16
  54 #define to_align        x17
  55
  56 #ifdef BUILD_STPCPY
  57 #define STRCPY __stpcpy
  58 #else
  59 #define STRCPY strcpy
  60 #endif
  61
  62         /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
  63            (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
  64            can be done in parallel across the entire word.  */
  65
  66 #define REP8_01 0x0101010101010101
  67 #define REP8_7f 0x7f7f7f7f7f7f7f7f
  68 #define REP8_80 0x8080808080808080
  69
  70         /* AArch64 systems have a minimum page size of 4k.  We can do a quick
  71            check on entry for whether the first fetch crosses such a boundary
  72            and, if it does not, short-circuit much of the entry code.  We
  73            expect early page-crossing strings to be rare (probability of
  74            16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
  75            predictable, even with random strings.
  76
  77            We don't bother checking for larger page sizes, the cost of setting
  78            up the correct page size is just not worth the extra gain from
  79            a small reduction in the cases taking the slow path.  Note that
  80            we only care about whether the first fetch, which may be
  81            misaligned, crosses a page boundary - after that we move to aligned
  82            fetches for the remainder of the string.  */
  83
  84 #ifdef STRCPY_TEST_PAGE_CROSS
  85         /* Make everything that isn't Qword aligned look like a page cross.  */
  86 #define MIN_PAGE_P2 4
  87 #else
  88 #define MIN_PAGE_P2 12
  89 #endif
  90
  91 #define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
  92
  93 ENTRY_ALIGN (STRCPY, 6)
  94         DELOUSE (0)
  95         DELOUSE (1)
  96         /* For moderately short strings, the fastest way to do the copy is to
  97            calculate the length of the string in the same way as strlen, then
  98            essentially do a memcpy of the result.  This avoids the need for
  99            multiple byte copies and further means that by the time we
 100            reach the bulk copy loop we know we can always use DWord
 101            accesses.  We expect strcpy to rarely be called repeatedly
 102            with the same source string, so branch prediction is likely to
 103            always be difficult - we mitigate against this by preferring
 104            conditional select operations over branches whenever this is
 105            feasible.  */
 106         and     tmp2, srcin, #(MIN_PAGE_SIZE - 1)
 107         mov     zeroones, #REP8_01
 108         and     to_align, srcin, #15
 109         cmp     tmp2, #(MIN_PAGE_SIZE - 16)
 110         neg     tmp1, to_align
 111         /* The first fetch will straddle a (possible) page boundary iff
 112            srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
 113            aligned string will never fail the page align check, so will
 114            always take the fast path.  */
 115         b.gt    L(page_cross)
 116
 117 L(page_cross_ok):
 118         ldp     data1, data2, [srcin]
 119 #ifdef __AARCH64EB__
 120         /* Because we expect the end to be found within 16 characters
 121            (profiling shows this is the most common case), it's worth
 122            swapping the bytes now to save having to recalculate the
 123            termination syndrome later.  We preserve data1 and data2
 124            so that we can re-use the values later on.  */
 125         rev     tmp2, data1
 126         sub     tmp1, tmp2, zeroones
 127         orr     tmp2, tmp2, #REP8_7f
 128         bics    has_nul1, tmp1, tmp2
 129         b.ne    L(fp_le8)
 130         rev     tmp4, data2
 131         sub     tmp3, tmp4, zeroones
 132         orr     tmp4, tmp4, #REP8_7f
 133 #else
 134         sub     tmp1, data1, zeroones
 135         orr     tmp2, data1, #REP8_7f
 136         bics    has_nul1, tmp1, tmp2
 137         b.ne    L(fp_le8)
 138         sub     tmp3, data2, zeroones
 139         orr     tmp4, data2, #REP8_7f
 140 #endif
 141         bics    has_nul2, tmp3, tmp4
 142         b.eq    L(bulk_entry)
 143
 144         /* The string is short (<=16 bytes).  We don't know exactly how
 145            short though, yet.  Work out the exact length so that we can
 146            quickly select the optimal copy strategy.  */
 147 L(fp_gt8):
 148         rev     has_nul2, has_nul2
 149         clz     pos, has_nul2
 150         mov     tmp2, #56
 151         add     dst, dstin, pos, lsr #3         /* Bits to bytes.  */
 152         sub     pos, tmp2, pos
 153 #ifdef __AARCH64EB__
 154         lsr     data2, data2, pos
 155 #else
 156         lsl     data2, data2, pos
 157 #endif
 158         str     data2, [dst, #1]
 159         str     data1, [dstin]
 160 #ifdef BUILD_STPCPY
 161         add     dstin, dst, #8
 162 #endif
 163         ret
 164
 165 L(fp_le8):
 166         rev     has_nul1, has_nul1
 167         clz     pos, has_nul1
 168         add     dst, dstin, pos, lsr #3         /* Bits to bytes.  */
 169         subs    tmp2, pos, #24                  /* Pos in bits. */
 170         b.lt    L(fp_lt4)
 171 #ifdef __AARCH64EB__
 172         mov     tmp2, #56
 173         sub     pos, tmp2, pos
 174         lsr     data2, data1, pos
 175         lsr     data1, data1, #32
 176 #else
 177         lsr     data2, data1, tmp2
 178 #endif
 179         /* 4->7 bytes to copy.  */
 180         str     data2w, [dst, #-3]
 181         str     data1w, [dstin]
 182 #ifdef BUILD_STPCPY
 183         mov     dstin, dst
 184 #endif
 185         ret
 186 L(fp_lt4):
 187         cbz     pos, L(fp_lt2)
 188         /* 2->3 bytes to copy.  */
 189 #ifdef __AARCH64EB__
 190         lsr     data1, data1, #48
 191 #endif
 192         strh    data1w, [dstin]
 193         /* Fall-through, one byte (max) to go.  */
 194 L(fp_lt2):
 195         /* Null-terminated string.  Last character must be zero!  */
 196         strb    wzr, [dst]
 197 #ifdef BUILD_STPCPY
 198         mov     dstin, dst
 199 #endif
 200         ret
 201
 202         .p2align 6
 203         /* Aligning here ensures that the entry code and main loop all lies
 204            within one 64-byte cache line.  */
 205 L(bulk_entry):
 206         sub     to_align, to_align, #16
 207         stp     data1, data2, [dstin]
 208         sub     src, srcin, to_align
 209         sub     dst, dstin, to_align
 210         b       L(entry_no_page_cross)
 211
 212         /* The inner loop deals with two Dwords at a time.  This has a
 213            slightly higher start-up cost, but we should win quite quickly,
 214            especially on cores with a high number of issue slots per
 215            cycle, as we get much better parallelism out of the operations.  */
 216 L(main_loop):
 217         stp     data1, data2, [dst], #16
 218 L(entry_no_page_cross):
 219         ldp     data1, data2, [src], #16
 220         sub     tmp1, data1, zeroones
 221         orr     tmp2, data1, #REP8_7f
 222         sub     tmp3, data2, zeroones
 223         orr     tmp4, data2, #REP8_7f
 224         bic     has_nul1, tmp1, tmp2
 225         bics    has_nul2, tmp3, tmp4
 226         ccmp    has_nul1, #0, #0, eq    /* NZCV = 0000  */
 227         b.eq    L(main_loop)
 228
 229         /* Since we know we are copying at least 16 bytes, the fastest way
 230            to deal with the tail is to determine the location of the
 231            trailing NUL, then (re)copy the 16 bytes leading up to that.  */
 232         cmp     has_nul1, #0
 233 #ifdef __AARCH64EB__
 234         /* For big-endian, carry propagation (if the final byte in the
 235            string is 0x01) means we cannot use has_nul directly.  The
 236            easiest way to get the correct byte is to byte-swap the data
 237            and calculate the syndrome a second time.  */
 238         csel    data1, data1, data2, ne
 239         rev     data1, data1
 240         sub     tmp1, data1, zeroones
 241         orr     tmp2, data1, #REP8_7f
 242         bic     has_nul1, tmp1, tmp2
 243 #else
 244         csel    has_nul1, has_nul1, has_nul2, ne
 245 #endif
 246         rev     has_nul1, has_nul1
 247         clz     pos, has_nul1
 248         add     tmp1, pos, #72
 249         add     pos, pos, #8
 250         csel    pos, pos, tmp1, ne
 251         add     src, src, pos, lsr #3
 252         add     dst, dst, pos, lsr #3
 253         ldp     data1, data2, [src, #-32]
 254         stp     data1, data2, [dst, #-16]
 255 #ifdef BUILD_STPCPY
 256         sub     dstin, dst, #1
 257 #endif
 258         ret
 259
 260 L(page_cross):
 261         bic     src, srcin, #15
 262         /* Start by loading two words at [srcin & ~15], then forcing the
 263            bytes that precede srcin to 0xff.  This means they never look
 264            like termination bytes.  */
 265         ldp     data1, data2, [src]
 266         lsl     tmp1, tmp1, #3  /* Bytes beyond alignment -> bits.  */
 267         tst     to_align, #7
 268         csetm   tmp2, ne
 269 #ifdef __AARCH64EB__
 270         lsl     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
 271 #else
 272         lsr     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
 273 #endif
 274         orr     data1, data1, tmp2
 275         orr     data2a, data2, tmp2
 276         cmp     to_align, #8
 277         csinv   data1, data1, xzr, lt
 278         csel    data2, data2, data2a, lt
 279         sub     tmp1, data1, zeroones
 280         orr     tmp2, data1, #REP8_7f
 281         sub     tmp3, data2, zeroones
 282         orr     tmp4, data2, #REP8_7f
 283         bic     has_nul1, tmp1, tmp2
 284         bics    has_nul2, tmp3, tmp4
 285         ccmp    has_nul1, #0, #0, eq    /* NZCV = 0000  */
 286         b.eq    L(page_cross_ok)
 287         /* We now need to make data1 and data2 look like they've been
 288            loaded directly from srcin.  Do a rotate on the 128-bit value.  */
 289         lsl     tmp1, to_align, #3      /* Bytes->bits.  */
 290         neg     tmp2, to_align, lsl #3
 291 #ifdef __AARCH64EB__
 292         lsl     data1a, data1, tmp1
 293         lsr     tmp4, data2, tmp2
 294         lsl     data2, data2, tmp1
 295         orr     tmp4, tmp4, data1a
 296         cmp     to_align, #8
 297         csel    data1, tmp4, data2, lt
 298         rev     tmp2, data1
 299         rev     tmp4, data2
 300         sub     tmp1, tmp2, zeroones
 301         orr     tmp2, tmp2, #REP8_7f
 302         sub     tmp3, tmp4, zeroones
 303         orr     tmp4, tmp4, #REP8_7f
 304 #else
 305         lsr     data1a, data1, tmp1
 306         lsl     tmp4, data2, tmp2
 307         lsr     data2, data2, tmp1
 308         orr     tmp4, tmp4, data1a
 309         cmp     to_align, #8
 310         csel    data1, tmp4, data2, lt
 311         sub     tmp1, data1, zeroones
 312         orr     tmp2, data1, #REP8_7f
 313         sub     tmp3, data2, zeroones
 314         orr     tmp4, data2, #REP8_7f
 315 #endif
 316         bic     has_nul1, tmp1, tmp2
 317         cbnz    has_nul1, L(fp_le8)
 318         bic     has_nul2, tmp3, tmp4
 319         b       L(fp_gt8)
 320 END (STRCPY)
 321
 322 #ifdef BUILD_STPCPY
 323 weak_alias (__stpcpy, stpcpy)
 324 libc_hidden_def (__stpcpy)
 325 libc_hidden_builtin_def (stpcpy)
 326 #else
 327 libc_hidden_builtin_def (strcpy)
 328 #endif