/* __memcmpeq optimized with EVEX.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)
/* __memcmpeq is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
   2. Use xmm vector compare when size >= 8 bytes.
   3. Optimistically compare up to the first 4 * VEC_SIZE one VEC at a
      time to check for early mismatches.  Only do this if it is
      guaranteed the loads will not cross a page.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
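
/* Rough C model of the contract (a sketch for orientation only; the
   actual routine returns whatever nonzero value falls out of the mask
   reduction, not necessarily 1):

	#include <stddef.h>
	#include <string.h>

	int
	__memcmpeq (const void *s1, const void *s2, size_t n)
	{
	  return memcmp (s1, s2, n) != 0;
	}

   Callers may only test the result against zero, so any nonzero value
   is a valid "unequal" return.  That freedom is what lets the code
   below return a raw mismatch mask, its popcount, or an sbb-generated
   all-ones word directly.  */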
# define MEMCMPEQ __memcmpeq_evex

# include "x86-evex256-vecs.h"
# include "reg-macros.h"
# define TEST_ZERO_VCMP(reg) inc %VGPR(reg)
# define TEST_ZERO(reg) test %VGPR(reg), %VGPR(reg)

# define TO_32BIT_P1(reg) /* Do nothing.  */
# define TO_32BIT_P2(reg) /* Do nothing.  */
# define TO_32BIT(reg) /* Do nothing.  */

# define VEC_CMP VPCMPEQ
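
/* For the VEC_SIZE == 32 build the compare mask moved into a GPR
   already fits in 32 bits, so the TO_32BIT helpers are no-ops.  With
   VEC_CMP being the equals-compare, an all-equal vector produces an
   all-ones 32-bit mask, and TEST_ZERO_VCMP's `inc` wraps that to
   zero; a following ZF-based branch therefore distinguishes
   "all bytes equal" from "some byte differs".  */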
# define TEST_ZERO_VCMP(reg) TEST_ZERO(reg)
# define TEST_ZERO(reg) neg %VGPR(reg)
/* VEC_SIZE == 64 needs to reduce the 64-bit mask to a 32-bit
   int.  We have two methods for this.  If the mask was branched
   on, we use `neg` for the branch then `sbb` to get the 32-bit
   return.  If the mask was not branched on, we just use
   `popcntq`.  */
# define TO_32BIT_P1(reg) TEST_ZERO(reg)
# define TO_32BIT_P2(reg) sbb %VGPR_SZ(reg, 32), %VGPR_SZ(reg, 32)
# define TO_32BIT(reg) popcntq %reg, %reg
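
/* Worked example of the two reductions: with a 64-bit mismatch mask
   in `reg`, `neg` sets CF exactly when the mask is nonzero, and
   `sbb reg32, reg32` then leaves 0 (equal) or 0xffffffff (mismatch)
   as the 32-bit return.  On the unbranched path, `popcntq` instead
   returns the number of mismatching bytes, which is nonzero on
   mismatch and always fits in 32 bits.  */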
# define VEC_CMP VPCMPNEQ

# error "Unsupported VEC_SIZE"
# define VMOVU_MASK vmovdqu8
# define VPCMPNEQ vpcmpneqb
# define VPCMPEQ vpcmpeqb
# define VPTEST vptestmb
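
/* NB: vptestmb sets a bit in its destination k-mask for every byte
   position where the AND of the two sources is nonzero, so testing a
   register against itself yields a nonzero mask exactly when some
   byte is nonzero.  This is how the vpxorq accumulators below are
   checked for any mismatch.  */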

# define PAGE_SIZE 4096
	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
	/* Clear the upper 32 bits.  */
	cmp $VEC_SIZE, %RDX_LP
	/* Fall through for [0, VEC_SIZE] as it's the hottest.  */
	/* Create mask of bytes that are guaranteed to be valid because
	   of length (edx).  Using masked movs allows us to skip checks
	   for page crosses/zero size.  */
	bzhi %VRDX, %VRAX, %VRAX
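	/* NB: `bzhi` copies %VRAX while clearing every bit at position
	   >= %VRDX, so (assuming %VRAX was primed to all-ones just
	   before this point, which is not shown in this excerpt) the
	   result has exactly the low `length` bits set, e.g.
	   length = 5 -> 0x1f.  This is the mask consumed as %k2 by the
	   masked load/compare below.  */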
	/* NB: A `jz` might be useful here.  Page-faults that are
	   invalidated by predicated execution (the evex mask) can be
	   very slow.  The expectation is that this is not the norm and
	   "most" code will not regularly call 'memcmp' with length = 0
	   and memory that is not wired up.  */
	/* Use masked loads, as reading a full VEC_SIZE could cross a
	   page when length < VEC_SIZE.  */
	VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
	VPCMPNEQ (%rdi), %VMM(2), %k1{%k2}
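	/* NB: the {%k2}{z} load only touches bytes whose mask bit is
	   set and zeroes the rest, and EVEX masking suppresses faults
	   for masked-off elements, so loading a full VEC at (%rsi) is
	   safe even if the bytes past `length` extend into an unmapped
	   page.  The {%k2}-masked compare likewise prevents those
	   out-of-range bytes from ever setting a bit in %k1.  */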
	VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
	VPCMPNEQ -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %k1
	/* From VEC + 1 to 2 * VEC.  */
	VMOVU (%rsi), %VMM(1)
	/* Use compare-not-equals to directly check for mismatch.  */
	VPCMPNEQ (%rdi), %VMM(1), %k1

	cmpq $(VEC_SIZE * 2), %rdx
	/* Check second VEC no matter what.  */
	VMOVU VEC_SIZE(%rsi), %VMM(2)
	VPCMPNEQ VEC_SIZE(%rdi), %VMM(2), %k1

	/* Less than 4 * VEC.  */
	cmpq $(VEC_SIZE * 4), %rdx
	/* Check third and fourth VEC no matter what.  */
	VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3)
	VEC_CMP (VEC_SIZE * 2)(%rdi), %VMM(3), %k1

	VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4)
	VEC_CMP (VEC_SIZE * 3)(%rdi), %VMM(4), %k1

	/* Go to 4x VEC loop.  */
	cmpq $(VEC_SIZE * 8), %rdx
	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */
	VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
	VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
	/* Wait to load from s1 until the addresses have been adjusted,
	   to avoid unlamination of micro-fusion with the complex
	   addressing mode.  */
	/* vpxor will be all 0s if s1 and s2 are equal.  Otherwise it
	   will have some 1s.  */
	vpxorq -(VEC_SIZE * 1)(%rdi), %VMM(1), %VMM(1)
	/* Ternary logic to xor -(VEC_SIZE * 2)(%rdi) with VEC(2) while
	   oring with VEC(1).  Result is stored in VEC(2).  */
	vpternlogd $0xde, -(VEC_SIZE * 2)(%rdi), %VMM(1), %VMM(2)
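	/* Derivation of the 0xde immediate: in AT&T operand order this
	   is (imm, C = mem, B = VEC(1), A = VEC(2) = dest), and
	   vpternlogd stores bit (4*A + 2*B + C) of the immediate for
	   each bit triple.  B | (A ^ C) evaluates to 0,1,1,1,1,0,1,1
	   for inputs 000..111, i.e. bits 7..0 are 11011110 = 0xde.  */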
	cmpl $(VEC_SIZE * 6), %edx
	jbe L(4x_last_2x_vec)
	VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(3)
	vpxorq -(VEC_SIZE * 3)(%rdi), %VMM(3), %VMM(3)

	VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(4)
	vpxorq -(VEC_SIZE * 4)(%rdi), %VMM(4), %VMM(4)

	/* Or together VEC(4), VEC(3), and VEC(2) into VEC(2).  */
	vpternlogd $0xfe, %VMM(4), %VMM(3), %VMM(2)
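	/* NB: 0xfe is the three-input OR (only the 000 row of the truth
	   table is 0), so this single instruction folds VEC(4) and
	   VEC(3) into the VEC(2) accumulator.  */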
	/* Compare VEC(2) with 0.  If there are any 1s, s1 and s2 don't
	   match.  */
	VPTEST %VMM(2), %VMM(2), %k1
	/* Set end of s1 in rdx.  */
	leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1.  This allows the loop to only update one
	   pointer.  */

	/* Align s1 pointer.  */
	andq $-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq $-(VEC_SIZE * 4), %rdi
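	/* NB: `subq` with a negative immediate rather than `addq` is
	   presumably an encoding choice: for the evex256 build -128
	   fits in a sign-extended 8-bit immediate while +128 does not
	   (for evex512 it makes no difference).  */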
	VMOVU (%rsi, %rdi), %VMM(1)
	vpxorq (%rdi), %VMM(1), %VMM(1)

	VMOVU VEC_SIZE(%rsi, %rdi), %VMM(2)
	vpternlogd $0xde, (VEC_SIZE)(%rdi), %VMM(1), %VMM(2)

	VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
	vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)

	VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
	vpxorq (VEC_SIZE * 3)(%rdi), %VMM(4), %VMM(4)

	vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
	VPTEST %VMM(4), %VMM(4), %k1

	subq $-(VEC_SIZE * 4), %rdi
	VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
	vpxorq (VEC_SIZE * 3)(%rdx), %VMM(4), %VMM(4)
	/* rdi has 4 * VEC_SIZE - remaining length.  */

	/* Load regardless of branch.  */
	VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with VEC(3) while
	   oring with VEC(4).  Result is stored in VEC(4).  */
	vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(4)
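	/* NB: the immediate differs from the earlier 0xde because here
	   the accumulator VEC(4) is the destination (A) and the xor
	   pair is VEC(3) (B) with memory (C); A | (B ^ C) has
	   truth-table bits 11110110 = 0xf6.  */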
	/* Separate logic as we can only use testb for VEC_SIZE == 64.  */
	cmpl $(VEC_SIZE * 2), %edi
	jge L(8x_last_2x_vec)
	VMOVU VEC_SIZE(%rsi, %rdx), %VMM(2)
	vpxorq VEC_SIZE(%rdx), %VMM(2), %VMM(2)

	VMOVU (%rsi, %rdx), %VMM(1)
	vpxorq (%rdx), %VMM(1), %VMM(1)

	vpternlogd $0xfe, %VMM(1), %VMM(2), %VMM(4)

	VPTEST %VMM(4), %VMM(4), %k1
	VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(1)
	vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %VMM(1), %VMM(1)
	VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(2)
	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %VMM(2)
	VPTEST %VMM(2), %VMM(2), %k1
	/* evex256: 1 byte from next cache line.  evex512: 15 bytes from
	   next cache line.  */