/* __memcmpeq optimized with EVEX.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)
/* __memcmpeq is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
   2. Use xmm vector compare when size >= 8 bytes.
   3. Optimistically compare up to the first 4 * VEC_SIZE one VEC at a
      time to check for early mismatches.  Only do this if it is
      guaranteed the loads will not cross a page.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
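
/* Rough C model of the contract (a sketch for orientation only; the
   actual routine returns whatever nonzero value falls out of the mask
   reduction, not necessarily 1):

	#include <stddef.h>
	#include <string.h>

	int
	__memcmpeq (const void *s1, const void *s2, size_t n)
	{
	  return memcmp (s1, s2, n) != 0;
	}

   Callers may only test the result against zero, so any nonzero value
   is a valid "unequal" return.  That freedom is what lets the code
   below return a raw mismatch mask, its popcount, or an sbb-generated
   all-ones word directly.  */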
# define MEMCMPEQ __memcmpeq_evex

# include "x86-evex256-vecs.h"
# include "reg-macros.h"
# define TEST_ZERO_VCMP(reg) inc %VGPR(reg)
# define TEST_ZERO(reg) test %VGPR(reg), %VGPR(reg)

# define TO_32BIT_P1(reg) /* Do nothing.  */
# define TO_32BIT_P2(reg) /* Do nothing.  */
# define TO_32BIT(reg) /* Do nothing.  */

# define VEC_CMP VPCMPEQ
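
/* For the VEC_SIZE == 32 build the compare mask moved into a GPR
   already fits in 32 bits, so the TO_32BIT helpers are no-ops.  With
   VEC_CMP being the equals-compare, an all-equal vector produces an
   all-ones 32-bit mask, and TEST_ZERO_VCMP's `inc` wraps that to
   zero; a following ZF-based branch therefore distinguishes
   "all bytes equal" from "some byte differs".  */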
# define TEST_ZERO_VCMP(reg) TEST_ZERO(reg)
# define TEST_ZERO(reg) neg %VGPR(reg)
/* VEC_SIZE == 64 needs to reduce the 64-bit mask to a 32-bit
   int.  We have two methods for this.  If the mask was branched
   on, we use `neg` for the branch then `sbb` to get the 32-bit
   return.  If the mask was not branched on, we just use
   `popcntq`.  */
# define TO_32BIT_P1(reg) TEST_ZERO(reg)
# define TO_32BIT_P2(reg) sbb %VGPR_SZ(reg, 32), %VGPR_SZ(reg, 32)
# define TO_32BIT(reg) popcntq %reg, %reg
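
/* Worked example of the two reductions: with a 64-bit mismatch mask
   in `reg`, `neg` sets CF exactly when the mask is nonzero, and
   `sbb reg32, reg32` then leaves 0 (equal) or 0xffffffff (mismatch)
   as the 32-bit return.  On the unbranched path, `popcntq` instead
   returns the number of mismatching bytes, which is nonzero on
   mismatch and always fits in 32 bits.  */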
# define VEC_CMP VPCMPNEQ

# error "Unsupported VEC_SIZE"
# define VMOVU_MASK vmovdqu8
# define VPCMPNEQ vpcmpneqb
# define VPCMPEQ vpcmpeqb
# define VPTEST vptestmb
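
/* NB: vptestmb sets a bit in its destination k-mask for every byte
   position where the AND of the two sources is nonzero, so testing a
   register against itself yields a nonzero mask exactly when some
   byte is nonzero.  This is how the vpxorq accumulators below are
   checked for any mismatch.  */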

# define PAGE_SIZE 4096
	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
	/* Clear the upper 32 bits.  */
	cmp $VEC_SIZE, %RDX_LP
	/* Fall through for [0, VEC_SIZE] as it's the hottest.  */
	/* Create mask of bytes that are guaranteed to be valid because
	   of length (edx).  Using masked movs allows us to skip checks
	   for page crosses/zero size.  */
	bzhi %VRDX, %VRAX, %VRAX
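	/* NB: `bzhi` copies %VRAX while clearing every bit at position
	   >= %VRDX, so (assuming %VRAX was primed to all-ones just
	   before this point, which is not shown in this excerpt) the
	   result has exactly the low `length` bits set, e.g.
	   length = 5 -> 0x1f.  This is the mask consumed as %k2 by the
	   masked load/compare below.  */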
	/* NB: A `jz` might be useful here.  Page-faults that are
	   invalidated by predicated execution (the evex mask) can be
	   very slow.  The expectation is that this is not the norm and
	   "most" code will not regularly call 'memcmp' with length = 0
	   and memory that is not wired up.  */
	/* Use masked loads, as reading a full VEC_SIZE could cross a
	   page when length < VEC_SIZE.  */
	VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
	VPCMPNEQ (%rdi), %VMM(2), %k1{%k2}
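	/* NB: the {%k2}{z} load only touches bytes whose mask bit is
	   set and zeroes the rest, and EVEX masking suppresses faults
	   for masked-off elements, so loading a full VEC at (%rsi) is
	   safe even if the bytes past `length` extend into an unmapped
	   page.  The {%k2}-masked compare likewise prevents those
	   out-of-range bytes from ever setting a bit in %k1.  */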
	VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
	VPCMPNEQ -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %k1
	/* From VEC + 1 to 2 * VEC.  */
	VMOVU (%rsi), %VMM(1)
	/* Use compare-not-equals to directly check for mismatch.  */
	VPCMPNEQ (%rdi), %VMM(1), %k1

	cmpq $(VEC_SIZE * 2), %rdx
	/* Check second VEC no matter what.  */
	VMOVU VEC_SIZE(%rsi), %VMM(2)
	VPCMPNEQ VEC_SIZE(%rdi), %VMM(2), %k1

	/* Less than 4 * VEC.  */
	cmpq $(VEC_SIZE * 4), %rdx
	/* Check third and fourth VEC no matter what.  */
	VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3)
	VEC_CMP (VEC_SIZE * 2)(%rdi), %VMM(3), %k1

	VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4)
	VEC_CMP (VEC_SIZE * 3)(%rdi), %VMM(4), %k1

	/* Go to 4x VEC loop.  */
	cmpq $(VEC_SIZE * 8), %rdx
	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */
	VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
	VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
	/* Wait to load from s1 until the addresses have been adjusted,
	   to avoid unlamination of micro-fusion with the complex
	   addressing mode.  */
	/* vpxor will be all 0s if s1 and s2 are equal.  Otherwise it
	   will have some 1s.  */
	vpxorq -(VEC_SIZE * 1)(%rdi), %VMM(1), %VMM(1)
	/* Ternary logic to xor -(VEC_SIZE * 2)(%rdi) with VEC(2) while
	   oring with VEC(1).  Result is stored in VEC(2).  */
	vpternlogd $0xde, -(VEC_SIZE * 2)(%rdi), %VMM(1), %VMM(2)
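	/* Derivation of the 0xde immediate: in AT&T operand order this
	   is (imm, C = mem, B = VEC(1), A = VEC(2) = dest), and
	   vpternlogd stores bit (4*A + 2*B + C) of the immediate for
	   each bit triple.  B | (A ^ C) evaluates to 0,1,1,1,1,0,1,1
	   for inputs 000..111, i.e. bits 7..0 are 11011110 = 0xde.  */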
	cmpl $(VEC_SIZE * 6), %edx
	jbe L(4x_last_2x_vec)
	VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(3)
	vpxorq -(VEC_SIZE * 3)(%rdi), %VMM(3), %VMM(3)

	VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(4)
	vpxorq -(VEC_SIZE * 4)(%rdi), %VMM(4), %VMM(4)

	/* Or together VEC(4), VEC(3), and VEC(2) into VEC(2).  */
	vpternlogd $0xfe, %VMM(4), %VMM(3), %VMM(2)
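	/* NB: 0xfe is the three-input OR (only the 000 row of the truth
	   table is 0), so this single instruction folds VEC(4) and
	   VEC(3) into the VEC(2) accumulator.  */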
	/* Compare VEC(2) with 0.  If there are any 1s, s1 and s2 don't
	   match.  */
	VPTEST %VMM(2), %VMM(2), %k1
	/* Set end of s1 in rdx.  */
	leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1.  This allows the loop to only update one
	   pointer.  */

	/* Align s1 pointer.  */
	andq $-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq $-(VEC_SIZE * 4), %rdi
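	/* NB: `subq` with a negative immediate rather than `addq` is
	   presumably an encoding choice: for the evex256 build -128
	   fits in a sign-extended 8-bit immediate while +128 does not
	   (for evex512 it makes no difference).  */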
	VMOVU (%rsi, %rdi), %VMM(1)
	vpxorq (%rdi), %VMM(1), %VMM(1)

	VMOVU VEC_SIZE(%rsi, %rdi), %VMM(2)
	vpternlogd $0xde, (VEC_SIZE)(%rdi), %VMM(1), %VMM(2)

	VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
	vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)

	VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
	vpxorq (VEC_SIZE * 3)(%rdi), %VMM(4), %VMM(4)

	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
	VPTEST %VMM(4), %VMM(4), %k1

	subq $-(VEC_SIZE * 4), %rdi
	VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
	vpxorq (VEC_SIZE * 3)(%rdx), %VMM(4), %VMM(4)
	/* rdi has 4 * VEC_SIZE - remaining length.  */

	/* Load regardless of branch.  */
	VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with VEC(3) while
	   oring with VEC(4).  Result is stored in VEC(4).  */
	vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(4)
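	/* NB: the immediate differs from the earlier 0xde because here
	   the accumulator VEC(4) is the destination (A) and the xor
	   pair is VEC(3) (B) with memory (C); A | (B ^ C) has
	   truth-table bits 11110110 = 0xf6.  */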
	/* Separate logic as we can only use testb for VEC_SIZE == 64.  */
	cmpl $(VEC_SIZE * 2), %edi
	jge L(8x_last_2x_vec)
	VMOVU VEC_SIZE(%rsi, %rdx), %VMM(2)
	vpxorq VEC_SIZE(%rdx), %VMM(2), %VMM(2)

	VMOVU (%rsi, %rdx), %VMM(1)
	vpxorq (%rdx), %VMM(1), %VMM(1)

	vpternlogd $0xfe, %VMM(1), %VMM(2), %VMM(4)

	VPTEST %VMM(4), %VMM(4), %k1
	VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(1)
	vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %VMM(1), %VMM(1)
	VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(2)
	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %VMM(2)
	VPTEST %VMM(2), %VMM(2), %k1
	/* evex256: 1 byte from next cache line.  evex512: 15 bytes from
	   next cache line.  */