/* Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <isa-level.h>
/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation,
   so this file must also be built for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)
# define MEMCMP __memcmp_sse2
# ifdef USE_AS_WMEMCMP
#  define PCMPEQ pcmpeqd
#  define SIZE_OFFSET (0)
#  define PCMPEQ pcmpeqb
# ifdef USE_AS_MEMCMPEQ
#  define SIZE_OFFSET (0)
#  define CHECK_CMP(x, y) subl x, y
#  define SIZE_OFFSET (CHAR_PER_VEC * 2)
#  define CHECK_CMP(x, y) cmpl x, y
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
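/* For the 16-byte SSE2 vectors used here this works out to 16 chars
   per vector for memcmp/memcmpeq (CHAR_SIZE == 1) and 4 for wmemcmp
   (CHAR_SIZE == 4).  */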
# define MEMCMP memcmp
	/* Clear the upper 32 bits.  */
# ifdef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches on the pmovmskb bitmask.
	   Store it in ecx for code size.  This is preferable to using
	   `incw` as it avoids partial register stalls on older hardware
	   (pre SnB).  */
	cmpq $CHAR_PER_VEC, %rdx
# ifdef USE_AS_WMEMCMP
	/* Saves a byte of code by keeping the fall-through path n = [2, 4]
	   in the initial cache line.  */
	jnz L(ret_nonzero_vec_start_0)
	movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0
	movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1
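	/* These 8-byte loads pick up the last two wchars; for n in [2, 4]
	   they can overlap the bytes already checked above, so the whole
	   range is covered by two compares without a loop.  */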
	jnz L(ret_nonzero_vec_end_0_adj)
# ifdef USE_AS_MEMCMPEQ
	movl -4(%rsi, %rdx), %esi
	subl -4(%rdi, %rdx), %esi
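	/* memcmpeq only needs a zero/non-zero result, so the last four
	   bytes can be compared with a plain subtract; no byte-order
	   fix-up for a signed return is needed.  */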
	/* Combine comparisons for lo and hi 4-byte comparisons.  */
	movl -4(%rsi, %rdx), %ecx
	movl -4(%rdi, %rdx), %eax
	/* Only compute proper return if not-equal.  */
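	/* An illustrative C sketch of the lo/hi combine above (load32 is a
	   hypothetical 4-byte load helper, not part of this file):

	       uint64_t a = ((uint64_t) load32 (s1 + n - 4) << 32) | load32 (s1);
	       uint64_t b = ((uint64_t) load32 (s2 + n - 4) << 32) | load32 (s2);
	       if (a == b)
	         return 0;
	       ... otherwise derive the signed return from the differing bytes ...

	   Two overlapping 4-byte loads per buffer cover the whole short
	   length range handled here with a single 8-byte compare.  */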
# ifdef USE_AS_MEMCMPEQ
	movq -8(%rsi, %rdx), %rcx
	subq -8(%rdi, %rdx), %rcx
	/* Convert 64 bit -> 32 bit boolean (we should have made the ABI
	   return long).  */
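	/* The 64-bit difference cannot simply be truncated to the 32-bit
	   int return: a value such as 1 << 32 is non-zero but would
	   truncate to zero, so it has to be reduced to a flag first.  */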
	/* Only compute proper return if not-equal.  */
	movq -8(%rsi, %rdx, CHAR_SIZE), %rcx
	movq -8(%rdi, %rdx, CHAR_SIZE), %rax
	/* Only compute proper return if not-equal.  */
	/* Flag set by earlier comparison against 1.  */
# ifdef USE_AS_WMEMCMP
	leal -1(%rdx, %rdx), %eax
	/* Fits in aligning bytes.  */
# ifdef USE_AS_WMEMCMP
L(ret_nonzero_vec_start_0):
	movl (%rdi, %rax), %ecx
	cmpl (%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	leal -1(%rdx, %rdx), %eax
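	/* %edx was zeroed by the xorl idiom and its low byte is set only
	   when the dword from s1 compares greater, so 2 * %rdx - 1 yields
	   the +1/-1 return without a branch.  */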
# ifndef USE_AS_MEMCMPEQ
	/* Need to bswap to get proper return without branch.  */
# ifdef USE_AS_MEMCMPEQ
	/* No reason to add to the dependency chain on rdx.  Saving the
	   bytes here doesn't change the number of fetch blocks.  */
	/* We need the code size to prevent taking an extra fetch block.  */
# ifdef USE_AS_MEMCMPEQ
	movzbl -1(%rsi, %rdx), %esi
	movzbl -1(%rdi, %rdx), %edi
	/* Implicit right shift by one.  We just need to displace the
	   sign bits.  */
	/* Eat a partial register stall here.  Saves code by stopping
	   L(cmp_0_3) from bleeding into the next fetch block and saves
	   an ALU.  */
	movb (%rsi, %rdx), %cl
	movzbl (%rdi, %rdx), %edi
# ifndef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches on the pmovmskb bitmask.
	   Store it in ecx for code size.  This is preferable to using
	   `incw` as it avoids partial register stalls on older hardware
	   (pre SnB).  */
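	/* An illustrative C-intrinsics sketch (not part of this file) of
	   the mismatch test the code below performs with PCMPEQ +
	   pmovmskb.  Each equal byte sets one bit of the mask, so 0xffff
	   means all 16 bytes match (s1/s2 stand for the two buffers):

	       // #include <emmintrin.h>
	       __m128i a = _mm_loadu_si128 ((const __m128i *) s1);
	       __m128i b = _mm_loadu_si128 ((const __m128i *) s2);
	       int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, b));
	       if (mask != 0xffff)
	         ... some byte differed ...  */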
	jnz L(ret_nonzero_vec_start_0)
# if SIZE_OFFSET == 0
	cmpq $(CHAR_PER_VEC * 2), %rdx
	/* Offset rdx.  Saves just enough code size to keep the
	   L(last_2x_vec) case and the non-zero return in a single
	   cache line.  */
	subq $(CHAR_PER_VEC * 2), %rdx
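	/* Because SIZE_OFFSET equals the amount subtracted from %rdx, the
	   bias cancels out in the (VEC_SIZE * -1 + SIZE_OFFSET)(...,
	   %rdx, CHAR_SIZE) style addressing below, so the loads still hit
	   the last vector of the buffers.  */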
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
# ifndef USE_AS_MEMCMPEQ
	/* Don't use `incw ax` as the machines this code runs on are liable
	   to have a partial register stall.  */
	jnz L(ret_nonzero_vec_end_0)
	/* Various return targets for memcmpeq.  Will always be hot in
	   the Icache and get short encoding.  */
L(ret_nonzero_vec_start_1):
L(ret_nonzero_vec_start_0):
L(ret_nonzero_vec_end_0):
# ifndef USE_AS_MEMCMPEQ
# ifdef USE_AS_WMEMCMP
L(ret_nonzero_vec_end_0_adj):
L(ret_nonzero_vec_end_0):
# ifdef USE_AS_WMEMCMP
	leal (%rax, %rdx, CHAR_SIZE), %eax
	movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
	cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	leal -1(%rdx, %rdx), %eax
	/* Use `addq` instead of `addl` here so that even if `rax` + `rdx`
	   is negative, the value of the sum is still usable as a 64-bit
	   offset (negative 32-bit numbers zero-extend to large, and often
	   out-of-bounds, 64-bit offsets).  Note that `rax` + `rdx` >= 0 is
	   an invariant when `memcmp` is used correctly, but if the input
	   strings `rsi`/`rdi` are concurrently modified while the function
	   runs (i.e. there is a data race) it is possible for `rax` + `rdx`
	   to be negative.  Given that there is virtually no extra cost to
	   using `addq` instead of `addl`, we may as well protect the
	   data-race case.  */
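	/* For example, if a racy mismatch left %eax + %edx == -1, `addl`
	   would give %rax == 0xffffffff (an offset of almost 4 GiB past
	   the buffer), while `addq` keeps %rax == -1, so the subsequent
	   access stays next to the buffer instead of far away from it.  */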
	movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
# ifndef USE_AS_WMEMCMP
L(ret_nonzero_vec_start_0):
	movzbl (%rsi, %rax), %ecx
	movzbl (%rdi, %rax), %eax
	movups (VEC_SIZE * 1)(%rsi), %xmm0
	movups (VEC_SIZE * 1)(%rdi), %xmm1
	jnz L(ret_nonzero_vec_start_1)
	cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
	cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
	/* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
	   This can harm performance if the non-zero return is in [65, 80]
	   or [97, 112], but helps performance otherwise.  Generally the
	   zero (all-equal) case is the one that matters most for
	   performance.  */
	movups (VEC_SIZE * 2)(%rsi), %xmm0
	movups (VEC_SIZE * 2)(%rdi), %xmm1
	movups (VEC_SIZE * 3)(%rsi), %xmm2
	movups (VEC_SIZE * 3)(%rdi), %xmm3
	CHECK_CMP (%ecx, %eax)
	jnz L(ret_nonzero_vec_start_2_3)
	cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
	movups (VEC_SIZE * 4)(%rsi), %xmm0
	movups (VEC_SIZE * 4)(%rdi), %xmm1
	movups (VEC_SIZE * 5)(%rsi), %xmm2
	movups (VEC_SIZE * 5)(%rdi), %xmm3
	CHECK_CMP (%ecx, %eax)
# ifdef USE_AS_MEMCMPEQ
	jnz L(ret_nonzero_vec_start_4_5)
	movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
# ifdef USE_AS_MEMCMPEQ
	/* Various return targets for memcmpeq.  Will always be hot in
	   the Icache and get short encoding.  */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
	jnz L(ret_nonzero_vec_end_1)
L(ret_nonzero_vec_end_1):
	/* High 16 bits of eax guaranteed to be all ones.  Rotate them in
	   so we can do `or + not` with just `xor`.  */
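	/* xor against all-ones bits is equivalent to `not`, which is why
	   the invert-and-merge of the new mask can be a single `xor`
	   here.  */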
	/* Partial register stall.  */
# ifdef USE_AS_WMEMCMP
	leal (%rax, %rdx, CHAR_SIZE), %eax
	movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
	cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	leal -1(%rdx, %rdx), %eax
	movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
L(ret_nonzero_vec_start_4_5):
	leal 1(%rax, %rdx), %eax
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 4)(%rdi, %rax), %ecx
	cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	leal -1(%rdx, %rdx), %eax
	movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax
L(ret_nonzero_vec_start_1):
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 1)(%rdi, %rax), %ecx
	cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	leal -1(%rdx, %rdx), %eax
	movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax
	leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
	andq $(VEC_SIZE * -1), %rdi
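	/* %rdx is converted from a length into a pointer-based loop bound,
	   and %rdi is rounded down to a VEC_SIZE boundary so the PCMPEQ
	   memory operands in the loop below are aligned; %rsi keeps
	   whatever misalignment remains and is loaded with movups.  */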
	movups (VEC_SIZE * 2)(%rsi), %xmm0
	movups (VEC_SIZE * 3)(%rsi), %xmm1
	PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0
	PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1
	movups (VEC_SIZE * 4)(%rsi), %xmm2
	movups (VEC_SIZE * 5)(%rsi), %xmm3
	PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2
	PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3
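	/* Four vectors (4 * VEC_SIZE bytes) are compared per iteration;
	   the results are combined so a single test decides whether any
	   byte in the block differed.  */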
	jnz L(ret_nonzero_loop)
	addq $(VEC_SIZE * 4), %rdi
	addq $(VEC_SIZE * 4), %rsi
	/* Get remaining length in edx.  */
	/* Restore offset so we can reuse L(last_2x_vec).  */
	addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx
# ifdef USE_AS_WMEMCMP
	cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
	movups (VEC_SIZE * 2)(%rsi), %xmm0
	movups (VEC_SIZE * 2)(%rdi), %xmm1
	movups (VEC_SIZE * 3)(%rsi), %xmm2
	movups (VEC_SIZE * 3)(%rdi), %xmm3
	CHECK_CMP (%ecx, %eax)
# ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_vec_start_2_3):
	leal 1(%rax, %rdx), %eax
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
	cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	leal -1(%rdx, %rdx), %eax
	movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
	sall $(VEC_SIZE * 1), %edx
	leal 1(%rcx, %rdx), %edx
	/* High 16 bits of eax guaranteed to be all ones.  Rotate them in
	   so we can do `or + not` with just `xor`.  */
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
	cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	leal -1(%rdx, %rdx), %eax
	movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax