sysdeps/aarch64/strncmp.S

   1 /* Copyright (C) 2013-2023 Free Software Foundation, Inc.
   2
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library.  If not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* Assumptions:
  22  *
  23  * ARMv8-a, AArch64
  24  */
  25
  26 #define REP8_01 0x0101010101010101
  27 #define REP8_7f 0x7f7f7f7f7f7f7f7f
  28
  29 /* Parameters and result.  */
  30 #define src1            x0
  31 #define src2            x1
  32 #define limit           x2
  33 #define result          x0
  34
  35 /* Internal variables.  */
  36 #define data1           x3
  37 #define data1w          w3
  38 #define data2           x4
  39 #define data2w          w4
  40 #define has_nul         x5
  41 #define diff            x6
  42 #define syndrome        x7
  43 #define tmp1            x8
  44 #define tmp2            x9
  45 #define tmp3            x10
  46 #define zeroones        x11
  47 #define pos             x12
  48 #define mask            x13
  49 #define endloop         x14
  50 #define count           mask
  51 #define offset          pos
  52 #define neg_offset      x15
  53
  54 /* Define endian dependent shift operations.
  55    On big-endian early bytes are at MSB and on little-endian LSB.
  56    LS_FW means shifting towards early bytes.
  57    LS_BK means shifting towards later bytes.
  58    */
  59 #ifdef __AARCH64EB__
  60 #define LS_FW lsl
  61 #define LS_BK lsr
  62 #else
  63 #define LS_FW lsr
  64 #define LS_BK lsl
  65 #endif
  66
  67         .text
  68         .p2align 6
  69         .rep 9
  70         nop     /* Pad so that the loop below fits a cache line.  */
  71         .endr
  72 ENTRY_ALIGN (strncmp, 0)
  73         cbz     limit, L(ret0)
  74         eor     tmp1, src1, src2
  75         mov     zeroones, #REP8_01
  76         tst     tmp1, #7
  77         and     count, src1, #7
  78         b.ne    L(misaligned8)
  79         cbnz    count, L(mutual_align)
  80
  81         /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
  82            (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
  83            can be done in parallel across the entire word.  */
  84         /* Start of performance-critical section  -- one 64B cache line.  */
  85 L(loop_aligned):
  86         ldr     data1, [src1], #8
  87         ldr     data2, [src2], #8
  88 L(start_realigned):
  89         subs    limit, limit, #8
  90         sub     tmp1, data1, zeroones
  91         orr     tmp2, data1, #REP8_7f
  92         eor     diff, data1, data2      /* Non-zero if differences found.  */
  93         csinv   endloop, diff, xzr, hi  /* Last Dword or differences.  */
  94         bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
  95         ccmp    endloop, #0, #0, eq
  96         b.eq    L(loop_aligned)
  97         /* End of performance-critical section  -- one 64B cache line.  */
  98
  99 L(full_check):
 100 #ifndef __AARCH64EB__
 101         orr     syndrome, diff, has_nul
 102         add     limit, limit, 8 /* Rewind limit to before last subs. */
 103 L(syndrome_check):
 104         /* Limit was reached. Check if the NUL byte or the difference
 105            is before the limit. */
 106         rev     syndrome, syndrome
 107         rev     data1, data1
 108         clz     pos, syndrome
 109         rev     data2, data2
 110         lsl     data1, data1, pos
 111         cmp     limit, pos, lsr #3
 112         lsl     data2, data2, pos
 113         /* But we need to zero-extend (char is unsigned) the value and then
 114            perform a signed 32-bit subtraction.  */
 115         lsr     data1, data1, #56
 116         sub     result, data1, data2, lsr #56
 117         csel result, result, xzr, hi
 118         ret
 119 #else
 120         /* Not reached the limit, must have found the end or a diff.  */
 121         tbz     limit, #63, L(not_limit)
 122         add     tmp1, limit, 8
 123         cbz     limit, L(not_limit)
 124
 125         lsl     limit, tmp1, #3 /* Bits -> bytes.  */
 126         mov     mask, #~0
 127         lsr     mask, mask, limit
 128         bic     data1, data1, mask
 129         bic     data2, data2, mask
 130
 131         /* Make sure that the NUL byte is marked in the syndrome.  */
 132         orr     has_nul, has_nul, mask
 133
 134 L(not_limit):
 135         /* For big-endian we cannot use the trick with the syndrome value
 136            as carry-propagation can corrupt the upper bits if the trailing
 137            bytes in the string contain 0x01.  */
 138         /* However, if there is no NUL byte in the dword, we can generate
 139            the result directly.  We can't just subtract the bytes as the
 140            MSB might be significant.  */
 141         cbnz    has_nul, 1f
 142         cmp     data1, data2
 143         cset    result, ne
 144         cneg    result, result, lo
 145         ret
 146 1:
 147         /* Re-compute the NUL-byte detection, using a byte-reversed value.  */
 148         rev     tmp3, data1
 149         sub     tmp1, tmp3, zeroones
 150         orr     tmp2, tmp3, #REP8_7f
 151         bic     has_nul, tmp1, tmp2
 152         rev     has_nul, has_nul
 153         orr     syndrome, diff, has_nul
 154         clz     pos, syndrome
 155         /* The most-significant-non-zero bit of the syndrome marks either the
 156            first bit that is different, or the top bit of the first zero byte.
 157            Shifting left now will bring the critical information into the
 158            top bits.  */
 159 L(end_quick):
 160         lsl     data1, data1, pos
 161         lsl     data2, data2, pos
 162         /* But we need to zero-extend (char is unsigned) the value and then
 163            perform a signed 32-bit subtraction.  */
 164         lsr     data1, data1, #56
 165         sub     result, data1, data2, lsr #56
 166         ret
 167 #endif
 168
 169 L(mutual_align):
 170         /* Sources are mutually aligned, but are not currently at an
 171            alignment boundary.  Round down the addresses and then mask off
 172            the bytes that precede the start point.
 173            We also need to adjust the limit calculations, but without
 174            overflowing if the limit is near ULONG_MAX.  */
 175         bic     src1, src1, #7
 176         bic     src2, src2, #7
 177         ldr     data1, [src1], #8
 178         neg     tmp3, count, lsl #3     /* 64 - bits(bytes beyond align). */
 179         ldr     data2, [src2], #8
 180         mov     tmp2, #~0
 181         LS_FW   tmp2, tmp2, tmp3        /* Shift (count & 63).  */
 182         /* Adjust the limit and ensure it doesn't overflow.  */
 183         adds    limit, limit, count
 184         csinv   limit, limit, xzr, lo
 185         orr     data1, data1, tmp2
 186         orr     data2, data2, tmp2
 187         b       L(start_realigned)
 188
 189         .p2align 6
 190         /* Don't bother with dwords for up to 16 bytes.  */
 191 L(misaligned8):
 192         cmp     limit, #16
 193         b.hs    L(try_misaligned_words)
 194
 195 L(byte_loop):
 196         /* Perhaps we can do better than this.  */
 197         ldrb    data1w, [src1], #1
 198         ldrb    data2w, [src2], #1
 199         subs    limit, limit, #1
 200         ccmp    data1w, #1, #0, hi      /* NZCV = 0b0000.  */
 201         ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
 202         b.eq    L(byte_loop)
 203 L(done):
 204         sub     result, data1, data2
 205         ret
 206         /* Align the SRC1 to a dword by doing a bytewise compare and then do
 207            the dword loop.  */
 208 L(try_misaligned_words):
 209         cbz     count, L(src1_aligned)
 210
 211         neg     count, count
 212         and     count, count, #7
 213         sub     limit, limit, count
 214
 215 L(page_end_loop):
 216         ldrb    data1w, [src1], #1
 217         ldrb    data2w, [src2], #1
 218         cmp     data1w, #1
 219         ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
 220         b.ne    L(done)
 221         subs    count, count, #1
 222         b.hi    L(page_end_loop)
 223
 224         /* The following diagram explains the comparison of misaligned strings.
 225            The bytes are shown in natural order. For little-endian, it is
 226            reversed in the registers. The "x" bytes are before the string.
 227            The "|" separates data that is loaded at one time.
 228            src1     | a a a a a a a a | b b b c c c c c | . . .
 229            src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .
 230            After shifting in each step, the data looks like this:
 231                         STEP_A              STEP_B              STEP_C
 232            data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
 233            data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c
 234            The bytes with "0" are eliminated from the syndrome via mask.
 235            Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
 236            time from SRC2. The comparison happens in 3 steps. After each step
 237            the loop can exit, or read from SRC1 or SRC2. */
 238 L(src1_aligned):
 239         /* Calculate offset from 8 byte alignment to string start in bits. No
 240            need to mask offset since shifts are ignoring upper bits. */
 241         lsl     offset, src2, #3
 242         bic     src2, src2, #0xf
 243         mov     mask, -1
 244         neg     neg_offset, offset
 245         ldr     data1, [src1], #8
 246         ldp     tmp1, tmp2, [src2], #16
 247         LS_BK   mask, mask, neg_offset
 248         and     neg_offset, neg_offset, #63     /* Need actual value for cmp later. */
 249         /* Skip the first compare if data in tmp1 is irrelevant. */
 250         tbnz    offset, 6, L(misaligned_mid_loop)
 251
 252 L(loop_misaligned):
 253         /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
 254         LS_FW   data2, tmp1, offset
 255         LS_BK   tmp1, tmp2, neg_offset
 256         subs    limit, limit, #8
 257         orr     data2, data2, tmp1      /* 8 bytes from SRC2 combined from two regs.*/
 258         sub     has_nul, data1, zeroones
 259         eor     diff, data1, data2      /* Non-zero if differences found.  */
 260         orr     tmp3, data1, #REP8_7f
 261         csinv   endloop, diff, xzr, hi  /* If limit, set to all ones. */
 262         bic     has_nul, has_nul, tmp3  /* Non-zero if NUL byte found in SRC1. */
 263         orr     tmp3, endloop, has_nul
 264         cbnz    tmp3, L(full_check)
 265
 266         ldr     data1, [src1], #8
 267 L(misaligned_mid_loop):
 268         /* STEP_B: Compare first part of data1 to second part of tmp2. */
 269         LS_FW   data2, tmp2, offset
 270 #ifdef __AARCH64EB__
 271         /* For big-endian we do a byte reverse to avoid carry-propagation
 272         problem described above. This way we can reuse the has_nul in the
 273         next step and also use syndrome value trick at the end. */
 274         rev     tmp3, data1
 275         #define data1_fixed tmp3
 276 #else
 277         #define data1_fixed data1
 278 #endif
 279         sub     has_nul, data1_fixed, zeroones
 280         orr     tmp3, data1_fixed, #REP8_7f
 281         eor     diff, data2, data1      /* Non-zero if differences found.  */
 282         bic     has_nul, has_nul, tmp3  /* Non-zero if NUL terminator.  */
 283 #ifdef __AARCH64EB__
 284         rev     has_nul, has_nul
 285 #endif
 286         cmp     limit, neg_offset, lsr #3
 287         orr     syndrome, diff, has_nul
 288         bic     syndrome, syndrome, mask        /* Ignore later bytes. */
 289         csinv   tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
 290         cbnz    tmp3, L(syndrome_check)
 291
 292         /* STEP_C: Compare second part of data1 to first part of tmp1. */
 293         ldp     tmp1, tmp2, [src2], #16
 294         cmp     limit, #8
 295         LS_BK   data2, tmp1, neg_offset
 296         eor     diff, data2, data1      /* Non-zero if differences found.  */
 297         orr     syndrome, diff, has_nul
 298         and     syndrome, syndrome, mask        /* Ignore earlier bytes. */
 299         csinv   tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
 300         cbnz    tmp3, L(syndrome_check)
 301
 302         ldr     data1, [src1], #8
 303         sub     limit, limit, #8
 304         b       L(loop_misaligned)
 305
 306 #ifdef  __AARCH64EB__
 307 L(syndrome_check):
 308         clz     pos, syndrome
 309         cmp     pos, limit, lsl #3
 310         b.lo    L(end_quick)
 311 #endif
 312
 313 L(ret0):
 314         mov     result, #0
 315         ret
 316
 317 END (strncmp)
 318 libc_hidden_builtin_def (strncmp)