/* memcmp/wmemcmp optimized with AVX2.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)
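
/* ISA level 3 corresponds to the x86-64-v3 feature level, which
   guarantees the AVX2, BMI2 and MOVBE extensions this implementation
   relies on.  */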
/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible. The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * VEC_SIZE one VEC at a
      time to check for early mismatches. Only do this if it is
      guaranteed the loads are safe.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   9. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
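
/* As a rough illustration only (not part of the build), the basic ymm
   compare used throughout corresponds to the following C for the byte
   (memcmp) variant, assuming both 32-byte loads are known to be safe
   and an AVX2-enabled compiler; the helper name is made up for this
   sketch:

     #include <immintrin.h>

     static int
     cmp_one_vec (const unsigned char *s1, const unsigned char *s2)
     {
       __m256i a = _mm256_loadu_si256 ((const __m256i *) s1);
       __m256i b = _mm256_loadu_si256 ((const __m256i *) s2);
       // One mask bit per byte position that compared equal.
       unsigned int eq = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (a, b));
       if (eq == 0xffffffffU)
         return 0;                           // all 32 bytes equal
       unsigned int i = __builtin_ctz (~eq); // index of first mismatch
       return s1[i] - s2[i];
     }  */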
# define MEMCMP __memcmp_avx2_movbe

# ifdef USE_AS_WMEMCMP
#  define VPCMPEQ vpcmpeqd
# else
#  define VPCMPEQ vpcmpeqb
# endif

# define VZEROUPPER vzeroupper

# define SECTION(p) p##.avx

# define PAGE_SIZE 4096

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.  */
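
/* For example, with one 4-byte element in each buffer:

     wchar_t a = -1;   // bytes ff ff ff ff
     wchar_t b = 0;    // bytes 00 00 00 00

   wmemcmp (&a, &b, 1) must be negative (-1 < 0 as signed elements),
   while memcmp (&a, &b, 4) must be positive (0xff > 0x00 as unsigned
   bytes).  The shared vector code only tests equality with VPCMPEQ;
   the scalar code at the return labels applies the comparison that
   matches the element type.  */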
	.section SECTION(.text),"ax",@progbits
# ifdef USE_AS_WMEMCMP
# elif defined __ILP32__
	/* Clear the upper 32 bits.  */
	cmp $VEC_SIZE, %RDX_LP
	/* From VEC to 2 * VEC. No branch when size == VEC_SIZE.  */
	VPCMPEQ (%rdi), %ymm1, %ymm1
	/* NB: eax must be destination register if going to
	   L(return_vec_[0,2]). For L(return_vec_3) the destination
	   register must be ecx.  */
	cmpq $(VEC_SIZE * 2), %rdx
	/* Check second VEC no matter what.  */
	vmovdqu VEC_SIZE(%rsi), %ymm2
	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
	/* If the compared VEC was entirely equal, eax will be all 1s, so
	   incl will overflow and set the zero flag.  */
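	/* Concretely: the byte mask of an all-equal compare is
	   0xffffffff, which incl wraps to 0 (setting ZF); any mismatch
	   leaves at least one 0 bit, so the increment cannot wrap.  */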
	/* Less than 4 * VEC.  */
	cmpq $(VEC_SIZE * 4), %rdx
	/* Check third and fourth VEC no matter what.  */
	vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vpmovmskb %ymm3, %eax
	vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
	vpmovmskb %ymm4, %ecx
	/* Go to 4x VEC loop.  */
	cmpq $(VEC_SIZE * 8), %rdx
	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */
	/* Load first two VEC from s2 before adjusting addresses.  */
	vmovdqu -(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
	vmovdqu -(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
	leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
	leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
	/* Wait to load from s1 until after the addresses are adjusted:
	   the complex addressing mode would otherwise cause un-lamination
	   of the micro-fused load + compare.  */
	VPCMPEQ (%rdi), %ymm1, %ymm1
	VPCMPEQ (VEC_SIZE)(%rdi), %ymm2, %ymm2
	vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
	/* Reduce the four compare results into a single mask.  */
	vpand %ymm1, %ymm2, %ymm5
	vpand %ymm3, %ymm4, %ymm6
	vpand %ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	jnz L(return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
# ifdef USE_AS_WMEMCMP
	movl (%rdi, %rax), %ecx
	cmpl (%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	leal -1(%rdx, %rdx), %eax
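	/* leal -1(%rdx, %rdx), %eax computes eax = 2 * edx - 1; edx is 0
	   or 1 from the preceding signed element comparison, so the
	   wmemcmp result is -1 or +1 as required.  */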
	movzbl (%rsi, %rax), %ecx
	movzbl (%rdi, %rax), %eax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN
# ifdef USE_AS_WMEMCMP
	movl VEC_SIZE(%rdi, %rax), %ecx
	cmpl VEC_SIZE(%rsi, %rax), %ecx
	leal -1(%rdx, %rdx), %eax
	movzbl VEC_SIZE(%rsi, %rax), %ecx
	movzbl VEC_SIZE(%rdi, %rax), %eax
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
	cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	leal -1(%rdx, %rdx), %eax
	movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
	/* NB: p2align 5 here to ensure the 4x loop is 32-byte aligned.  */
L(8x_return_vec_0_1_2_3):
	/* Returning from L(more_8x_vec) requires restoring rsi.  */
L(return_vec_0_1_2_3):
	vpmovmskb %ymm1, %eax
	vpmovmskb %ymm2, %eax
	vpmovmskb %ymm3, %eax
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 3)(%rdi, %rcx), %eax
	cmpl (VEC_SIZE * 3)(%rsi, %rcx), %eax
	leal -1(%rdx, %rdx), %eax
	movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
	/* Set end of s1 in rdx.  */
	leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1. This allows the loop to only update one
	   pointer.  */
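	/* Keeping s2 - s1 in rsi means only rdi has to advance in the
	   loop: for any offset i, s2 + i == (s2 - s1) + (s1 + i), so
	   loads from s2 can use the (%rsi, %rdi) addressing mode
	   directly.  */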
	/* Align s1 pointer.  */
	andq $-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq $-(VEC_SIZE * 4), %rdi
	/* rsi has s2 - s1 so get the correct address by adding s1 (in
	   rdi).  */
	vmovdqu (%rsi, %rdi), %ymm1
	VPCMPEQ (%rdi), %ymm1, %ymm1
	vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2
	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
	vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
	vpand %ymm1, %ymm2, %ymm5
	vpand %ymm3, %ymm4, %ymm6
	vpand %ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	jnz L(8x_return_vec_0_1_2_3)
	subq $-(VEC_SIZE * 4), %rdi
	/* Check if the s1 pointer is at the end.  */
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl $(VEC_SIZE * 3), %edi
	jae L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3
	cmpl $(VEC_SIZE * 2), %edi
	jae L(8x_last_2x_vec)
	/* Check last 4 VEC.  */
	vmovdqu (%rsi, %rdx), %ymm1
	VPCMPEQ (%rdx), %ymm1, %ymm1
	vmovdqu VEC_SIZE(%rsi, %rdx), %ymm2
	VPCMPEQ VEC_SIZE(%rdx), %ymm2, %ymm2
	VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
	vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
	vpand %ymm1, %ymm2, %ymm5
	vpand %ymm3, %ymm4, %ymm6
	vpand %ymm5, %ymm6, %ymm7
	vpmovmskb %ymm7, %ecx
	/* Restore s1 pointer to rdi.  */
	jnz L(8x_return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	/* Only entry is from L(more_8x_vec).  */
	/* Check the second to last VEC. rdx stores the end pointer of
	   s1, and ymm3 has already been loaded with the second to last
	   VEC from s2.  */
	VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
	vpmovmskb %ymm3, %eax
	jnz L(8x_return_vec_2)
	/* Check last VEC.  */
	vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
	vpmovmskb %ymm4, %eax
	jnz L(8x_return_vec_3)
	/* Check second to last VEC.  */
	vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
	VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	jnz L(return_vec_1_end)
	/* Check last VEC.  */
	vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
	VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	jnz L(return_vec_0_end)
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 3)(%rax), %ecx
	cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
	leal -1(%rdx, %rdx), %eax
	movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 3)(%rax), %eax
# ifdef USE_AS_WMEMCMP
	movl -(VEC_SIZE * 2)(%rdi, %rax), %ecx
	cmpl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
	leal -1(%rdx, %rdx), %eax
	movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
# ifdef USE_AS_WMEMCMP
	movl -VEC_SIZE(%rdi, %rax), %ecx
	cmpl -VEC_SIZE(%rsi, %rax), %ecx
	leal -1(%rdx, %rdx), %eax
	movzbl -VEC_SIZE(%rsi, %rax), %ecx
	movzbl -VEC_SIZE(%rdi, %rax), %eax
	/* Check if size is one CHAR or less. This is necessary for
	   size = 0, but is also faster for size = CHAR_SIZE.  */
	cmpl $CHAR_SIZE, %edx
	/* Check if loading one VEC from either s1 or s2 could cause a
	   page cross. This can have false positives but is by far the
	   fastest method.  */
	andl $(PAGE_SIZE - 1), %eax
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	jg L(page_cross_less_vec)
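	/* An unaligned VEC_SIZE load starting at address p stays within
	   one page iff (p & (PAGE_SIZE - 1)) <= PAGE_SIZE - VEC_SIZE; a
	   larger page offset means the load could spill into the next
	   4 KiB page.  */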
	/* No page cross possible.  */
	vmovdqu (%rsi), %ymm2
	VPCMPEQ (%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	/* Result will be zero if s1 and s2 match. Otherwise the first
	   set bit will be the first mismatch.  */
	bzhil %edx, %eax, %edx
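	/* bzhil %edx, %eax, %edx writes to edx a copy of eax with every
	   bit at position >= size (edx) cleared, so only mismatches
	   within the first size bytes can make the result nonzero; for
	   size < 32 this matches the C expression
	   eax & ((1U << size) - 1).  */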
L(page_cross_less_vec):
	/* If USE_AS_WMEMCMP the size can only be 0, 4, 8, 12, 16, 20, 24
	   or 28 bytes.  */
# ifndef USE_AS_WMEMCMP
	/* Fall through for [4, 7].  */
	movbe -4(%rdi, %rdx), %edi
	movbe -4(%rsi, %rdx), %esi
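	/* A rough C sketch (not from this file) of the overlapping
	   big-endian compare used for sizes 4 to 7: load the first four
	   and the last four bytes of each buffer (the two windows may
	   overlap), byte-swap so that memory order matches integer
	   order, and compare the concatenated 64-bit values.  The helper
	   name and the use of memcpy/__builtin_bswap32 are illustrative
	   only:

	     #include <stdint.h>
	     #include <string.h>

	     static int
	     cmp_4_to_7 (const unsigned char *s1, const unsigned char *s2,
	                 size_t n)
	     {
	       // Assumes 4 <= n <= 7, so the two 4-byte windows cover
	       // every byte.
	       uint32_t a0, a1, b0, b1;
	       memcpy (&a0, s1, 4);
	       memcpy (&b0, s2, 4);
	       memcpy (&a1, s1 + n - 4, 4);
	       memcpy (&b1, s2 + n - 4, 4);
	       // __builtin_bswap32 plays the role of movbe/bswap: after
	       // the swap, the byte that comes first in memory is the
	       // most significant.
	       uint64_t a = ((uint64_t) __builtin_bswap32 (a0) << 32)
	                    | __builtin_bswap32 (a1);
	       uint64_t b = ((uint64_t) __builtin_bswap32 (b0) << 32)
	                    | __builtin_bswap32 (b1);
	       return a < b ? -1 : a > b;
	     }  */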
	/* Fast path for return zero.  */
	/* No ymm register was touched.  */
	/* No ymm register was touched.  */
	/* No ymm register was touched.  */
	/* No ymm register was touched.  */
	movbe -8(%rdi, %rdx), %rax
	movbe -8(%rsi, %rdx), %rcx
	/* Fast path for return zero.  */
	/* No ymm register was touched.  */
	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
	VPCMPEQ %xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	/* Use overlapping loads to avoid branches.  */
	leaq -8(%rdi, %rdx), %rdi
	leaq -8(%rsi, %rdx), %rsi
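	/* With 8 <= size <= 15, comparing the first 8 bytes and then the
	   last 8 bytes (two overlapping windows) covers every byte
	   without branching on the exact size.  */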
	VPCMPEQ %xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	/* Fast path for return zero.  */
	/* No ymm register was touched.  */
	/* From 16 to 31 bytes. No branch when size == 16.  */
	vmovdqu (%rsi), %xmm2
	VPCMPEQ (%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	/* Use overlapping loads to avoid branches.  */
	vmovdqu -16(%rsi, %rdx), %xmm2
	leaq -16(%rdi, %rdx), %rdi
	leaq -16(%rsi, %rdx), %rsi
	VPCMPEQ (%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	/* Fast path for return zero.  */
	/* No ymm register was touched.  */
# ifdef USE_AS_WMEMCMP
	leal -1(%rdx, %rdx), %eax
	/* No ymm register was touched.  */
	/* Load as big endian to avoid branches.  */
	movzbl -1(%rdi, %rdx), %edi
	movzbl -1(%rsi, %rdx), %esi
	/* Subtraction is okay because the upper bit is zero.  */
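	/* Both operands are zero-extended values with the top bit clear,
	   so the 32-bit subtraction cannot wrap and its sign is already
	   the correct memcmp result.  */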
	/* No ymm register was touched.  */