1 /* Copyright (C) 2013-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library. If not, see
17 <https://www.gnu.org/licenses/>. */
26 #define REP8_01 0x0101010101010101
27 #define REP8_7f 0x7f7f7f7f7f7f7f7f
29 /* Parameters and result. */
35 /* Internal variables. */
52 #define neg_offset x15
54 /* Define endian dependent shift operations.
55 On big-endian early bytes are at MSB and on little-endian LSB.
56 LS_FW means shifting towards early bytes.
57 LS_BK means shifting towards later bytes.
70 nop /* Pad so that the loop below fits a cache line. */
72 ENTRY_ALIGN (strncmp, 0)
75 mov zeroones, #REP8_01
79 cbnz count, L(mutual_align)
81 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
82 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
83 can be done in parallel across the entire word. */
84 /* Start of performance-critical section -- one 64B cache line. */
90 sub tmp1, data1, zeroones
91 orr tmp2, data1, #REP8_7f
92 eor diff, data1, data2 /* Non-zero if differences found. */
93 csinv endloop, diff, xzr, hi /* Last Dword or differences. */
94 bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
95 ccmp endloop, #0, #0, eq
97 /* End of performance-critical section -- one 64B cache line. */
100 #ifndef __AARCH64EB__
101 orr syndrome, diff, has_nul
102 add limit, limit, 8 /* Rewind limit to before last subs. */
104 /* Limit was reached. Check if the NUL byte or the difference
105 is before the limit. */
106 rev syndrome, syndrome
110 lsl data1, data1, pos
111 cmp limit, pos, lsr #3
112 lsl data2, data2, pos
113 /* But we need to zero-extend (char is unsigned) the value and then
114 perform a signed 32-bit subtraction. */
115 lsr data1, data1, #56
116 sub result, data1, data2, lsr #56
117 csel result, result, xzr, hi
120 /* Not reached the limit, must have found the end or a diff. */
121 tbz limit, #63, L(not_limit)
123 cbz limit, L(not_limit)
125 lsl limit, tmp1, #3 /* Bits -> bytes. */
127 lsr mask, mask, limit
128 bic data1, data1, mask
129 bic data2, data2, mask
131 /* Make sure that the NUL byte is marked in the syndrome. */
132 orr has_nul, has_nul, mask
135 /* For big-endian we cannot use the trick with the syndrome value
136 as carry-propagation can corrupt the upper bits if the trailing
137 bytes in the string contain 0x01. */
138 /* However, if there is no NUL byte in the dword, we can generate
139 the result directly. We can't just subtract the bytes as the
140 MSB might be significant. */
144 cneg result, result, lo
147 /* Re-compute the NUL-byte detection, using a byte-reversed value. */
149 sub tmp1, tmp3, zeroones
150 orr tmp2, tmp3, #REP8_7f
151 bic has_nul, tmp1, tmp2
153 orr syndrome, diff, has_nul
155 /* The most-significant-non-zero bit of the syndrome marks either the
156 first bit that is different, or the top bit of the first zero byte.
157 Shifting left now will bring the critical information into the
160 lsl data1, data1, pos
161 lsl data2, data2, pos
162 /* But we need to zero-extend (char is unsigned) the value and then
163 perform a signed 32-bit subtraction. */
164 lsr data1, data1, #56
165 sub result, data1, data2, lsr #56
170 /* Sources are mutually aligned, but are not currently at an
171 alignment boundary. Round down the addresses and then mask off
172 the bytes that precede the start point.
173 We also need to adjust the limit calculations, but without
174 overflowing if the limit is near ULONG_MAX. */
177 ldr data1, [src1], #8
178 neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
179 ldr data2, [src2], #8
181 LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
182 /* Adjust the limit and ensure it doesn't overflow. */
183 adds limit, limit, count
184 csinv limit, limit, xzr, lo
185 orr data1, data1, tmp2
186 orr data2, data2, tmp2
190 /* Don't bother with dwords for up to 16 bytes. */
193 b.hs L(try_misaligned_words)
196 /* Perhaps we can do better than this. */
197 ldrb data1w, [src1], #1
198 ldrb data2w, [src2], #1
199 subs limit, limit, #1
200 ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
201 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
204 sub result, data1, data2
206 /* Align the SRC1 to a dword by doing a bytewise compare and then do
208 L(try_misaligned_words):
209 cbz count, L(src1_aligned)
213 sub limit, limit, count
216 ldrb data1w, [src1], #1
217 ldrb data2w, [src2], #1
219 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
221 subs count, count, #1
222 b.hi L(page_end_loop)
224 /* The following diagram explains the comparison of misaligned strings.
225 The bytes are shown in natural order. For little-endian, it is
226 reversed in the registers. The "x" bytes are before the string.
227 The "|" separates data that is loaded at one time.
228 src1 | a a a a a a a a | b b b c c c c c | . . .
229 src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
230 After shifting in each step, the data looks like this:
232 data1 a a a a a a a a b b b c c c c c b b b c c c c c
233 data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
234 The bytes with "0" are eliminated from the syndrome via mask.
235 Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
236 time from SRC2. The comparison happens in 3 steps. After each step
237 the loop can exit, or read from SRC1 or SRC2. */
239 /* Calculate offset from 8 byte alignment to string start in bits. No
240 need to mask offset since shifts are ignoring upper bits. */
244 neg neg_offset, offset
245 ldr data1, [src1], #8
246 ldp tmp1, tmp2, [src2], #16
247 LS_BK mask, mask, neg_offset
248 and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
249 /* Skip the first compare if data in tmp1 is irrelevant. */
250 tbnz offset, 6, L(misaligned_mid_loop)
253 /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
254 LS_FW data2, tmp1, offset
255 LS_BK tmp1, tmp2, neg_offset
256 subs limit, limit, #8
257 orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
258 sub has_nul, data1, zeroones
259 eor diff, data1, data2 /* Non-zero if differences found. */
260 orr tmp3, data1, #REP8_7f
261 csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
262 bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
263 orr tmp3, endloop, has_nul
264 cbnz tmp3, L(full_check)
266 ldr data1, [src1], #8
267 L(misaligned_mid_loop):
268 /* STEP_B: Compare first part of data1 to second part of tmp2. */
269 LS_FW data2, tmp2, offset
271 /* For big-endian we do a byte reverse to avoid carry-propagation
272 problem described above. This way we can reuse the has_nul in the
273 next step and also use syndrome value trick at the end. */
275 #define data1_fixed tmp3
277 #define data1_fixed data1
279 sub has_nul, data1_fixed, zeroones
280 orr tmp3, data1_fixed, #REP8_7f
281 eor diff, data2, data1 /* Non-zero if differences found. */
282 bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
286 cmp limit, neg_offset, lsr #3
287 orr syndrome, diff, has_nul
288 bic syndrome, syndrome, mask /* Ignore later bytes. */
289 csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
290 cbnz tmp3, L(syndrome_check)
292 /* STEP_C: Compare second part of data1 to first part of tmp1. */
293 ldp tmp1, tmp2, [src2], #16
295 LS_BK data2, tmp1, neg_offset
296 eor diff, data2, data1 /* Non-zero if differences found. */
297 orr syndrome, diff, has_nul
298 and syndrome, syndrome, mask /* Ignore earlier bytes. */
299 csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
300 cbnz tmp3, L(syndrome_check)
302 ldr data1, [src1], #8
309 cmp pos, limit, lsl #3
318 libc_hidden_builtin_def (strncmp)