sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S

   1 /* memcmp/wmemcmp optimized with AVX2.
   2    Copyright (C) 2017-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <isa-level.h>
  20
  21 #if ISA_SHOULD_BUILD (3)
  22
  23 /* memcmp/wmemcmp is implemented as:
  24    1. Use ymm vector compares when possible. The only case where
  25       vector compares is not possible for when size < VEC_SIZE
  26       and loading from either s1 or s2 would cause a page cross.
  27    2. For size from 2 to 7 bytes on page cross, load as big endian
  28       with movbe and bswap to avoid branches.
  29    3. Use xmm vector compare when size >= 4 bytes for memcmp or
  30       size >= 8 bytes for wmemcmp.
  31    4. Optimistically compare up to first 4 * VEC_SIZE one at a
  32       to check for early mismatches. Only do this if its guaranteed the
  33       work is not wasted.
  34    5. If size is 8 * VEC_SIZE or less, unroll the loop.
  35    6. Compare 4 * VEC_SIZE at a time with the aligned first memory
  36       area.
  37    7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
  38    8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
  39    9. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
  40
  41
  42 # include <sysdep.h>
  43
  44 # ifndef MEMCMP
  45 #  define MEMCMP        __memcmp_avx2_movbe
  46 # endif
  47
  48 # ifdef USE_AS_WMEMCMP
  49 #  define CHAR_SIZE     4
  50 #  define VPCMPEQ       vpcmpeqd
  51 # else
  52 #  define CHAR_SIZE     1
  53 #  define VPCMPEQ       vpcmpeqb
  54 # endif
  55
  56 # ifndef VZEROUPPER
  57 #  define VZEROUPPER    vzeroupper
  58 # endif
  59
  60 # ifndef SECTION
  61 #  define SECTION(p)    p##.avx
  62 # endif
  63
  64 # define VEC_SIZE 32
  65 # define PAGE_SIZE      4096
  66
  67 /* Warning!
  68            wmemcmp has to use SIGNED comparison for elements.
  69            memcmp has to use UNSIGNED comparison for elements.
  70 */
  71
  72         .section SECTION(.text),"ax",@progbits
  73 ENTRY (MEMCMP)
  74 # ifdef USE_AS_WMEMCMP
  75         shl     $2, %RDX_LP
  76 # elif defined __ILP32__
  77         /* Clear the upper 32 bits.  */
  78         movl    %edx, %edx
  79 # endif
  80         cmp     $VEC_SIZE, %RDX_LP
  81         jb      L(less_vec)
  82
  83         /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
  84         vmovdqu (%rsi), %ymm1
  85         VPCMPEQ (%rdi), %ymm1, %ymm1
  86         vpmovmskb %ymm1, %eax
  87         /* NB: eax must be destination register if going to
  88            L(return_vec_[0,2]). For L(return_vec_3 destination register
  89            must be ecx.  */
  90         incl    %eax
  91         jnz     L(return_vec_0)
  92
  93         cmpq    $(VEC_SIZE * 2), %rdx
  94         jbe     L(last_1x_vec)
  95
  96         /* Check second VEC no matter what.  */
  97         vmovdqu VEC_SIZE(%rsi), %ymm2
  98         VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
  99         vpmovmskb %ymm2, %eax
 100         /* If all 4 VEC where equal eax will be all 1s so incl will
 101            overflow and set zero flag.  */
 102         incl    %eax
 103         jnz     L(return_vec_1)
 104
 105         /* Less than 4 * VEC.  */
 106         cmpq    $(VEC_SIZE * 4), %rdx
 107         jbe     L(last_2x_vec)
 108
 109         /* Check third and fourth VEC no matter what.  */
 110         vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
 111         VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
 112         vpmovmskb %ymm3, %eax
 113         incl    %eax
 114         jnz     L(return_vec_2)
 115         vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
 116         VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
 117         vpmovmskb %ymm4, %ecx
 118         incl    %ecx
 119         jnz     L(return_vec_3)
 120
 121         /* Go to 4x VEC loop.  */
 122         cmpq    $(VEC_SIZE * 8), %rdx
 123         ja      L(more_8x_vec)
 124
 125         /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
 126            branches.  */
 127
 128         /* Load first two VEC from s2 before adjusting addresses.  */
 129         vmovdqu -(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
 130         vmovdqu -(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
 131         leaq    -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
 132         leaq    -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
 133
 134         /* Wait to load from s1 until addressed adjust due to
 135            unlamination of microfusion with complex address mode.  */
 136         VPCMPEQ (%rdi), %ymm1, %ymm1
 137         VPCMPEQ (VEC_SIZE)(%rdi), %ymm2, %ymm2
 138
 139         vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
 140         VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
 141         vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
 142         VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
 143
 144         /* Reduce VEC0 - VEC4.  */
 145         vpand   %ymm1, %ymm2, %ymm5
 146         vpand   %ymm3, %ymm4, %ymm6
 147         vpand   %ymm5, %ymm6, %ymm7
 148         vpmovmskb %ymm7, %ecx
 149         incl    %ecx
 150         jnz     L(return_vec_0_1_2_3)
 151         /* NB: eax must be zero to reach here.  */
 152         VZEROUPPER_RETURN
 153
 154         .p2align 4
 155 L(return_vec_0):
 156         tzcntl  %eax, %eax
 157 # ifdef USE_AS_WMEMCMP
 158         movl    (%rdi, %rax), %ecx
 159         xorl    %edx, %edx
 160         cmpl    (%rsi, %rax), %ecx
 161         /* NB: no partial register stall here because xorl zero idiom
 162            above.  */
 163         setg    %dl
 164         leal    -1(%rdx, %rdx), %eax
 165 # else
 166         movzbl  (%rsi, %rax), %ecx
 167         movzbl  (%rdi, %rax), %eax
 168         subl    %ecx, %eax
 169 # endif
 170 L(return_vzeroupper):
 171         ZERO_UPPER_VEC_REGISTERS_RETURN
 172
 173         .p2align 4
 174 L(return_vec_1):
 175         tzcntl  %eax, %eax
 176 # ifdef USE_AS_WMEMCMP
 177         movl    VEC_SIZE(%rdi, %rax), %ecx
 178         xorl    %edx, %edx
 179         cmpl    VEC_SIZE(%rsi, %rax), %ecx
 180         setg    %dl
 181         leal    -1(%rdx, %rdx), %eax
 182 # else
 183         movzbl  VEC_SIZE(%rsi, %rax), %ecx
 184         movzbl  VEC_SIZE(%rdi, %rax), %eax
 185         subl    %ecx, %eax
 186 # endif
 187         VZEROUPPER_RETURN
 188
 189         .p2align 4
 190 L(return_vec_2):
 191         tzcntl  %eax, %eax
 192 # ifdef USE_AS_WMEMCMP
 193         movl    (VEC_SIZE * 2)(%rdi, %rax), %ecx
 194         xorl    %edx, %edx
 195         cmpl    (VEC_SIZE * 2)(%rsi, %rax), %ecx
 196         setg    %dl
 197         leal    -1(%rdx, %rdx), %eax
 198 # else
 199         movzbl  (VEC_SIZE * 2)(%rsi, %rax), %ecx
 200         movzbl  (VEC_SIZE * 2)(%rdi, %rax), %eax
 201         subl    %ecx, %eax
 202 # endif
 203         VZEROUPPER_RETURN
 204
 205         /* NB: p2align 5 here to ensure 4x loop is 32 byte aligned.  */
 206         .p2align 5
 207 L(8x_return_vec_0_1_2_3):
 208         /* Returning from L(more_8x_vec) requires restoring rsi.  */
 209         addq    %rdi, %rsi
 210 L(return_vec_0_1_2_3):
 211         vpmovmskb %ymm1, %eax
 212         incl    %eax
 213         jnz     L(return_vec_0)
 214
 215         vpmovmskb %ymm2, %eax
 216         incl    %eax
 217         jnz     L(return_vec_1)
 218
 219         vpmovmskb %ymm3, %eax
 220         incl    %eax
 221         jnz     L(return_vec_2)
 222 L(return_vec_3):
 223         tzcntl  %ecx, %ecx
 224 # ifdef USE_AS_WMEMCMP
 225         movl    (VEC_SIZE * 3)(%rdi, %rcx), %eax
 226         xorl    %edx, %edx
 227         cmpl    (VEC_SIZE * 3)(%rsi, %rcx), %eax
 228         setg    %dl
 229         leal    -1(%rdx, %rdx), %eax
 230 # else
 231         movzbl  (VEC_SIZE * 3)(%rdi, %rcx), %eax
 232         movzbl  (VEC_SIZE * 3)(%rsi, %rcx), %ecx
 233         subl    %ecx, %eax
 234 # endif
 235         VZEROUPPER_RETURN
 236
 237         .p2align 4
 238 L(more_8x_vec):
 239         /* Set end of s1 in rdx.  */
 240         leaq    -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
 241         /* rsi stores s2 - s1. This allows loop to only update one
 242            pointer.  */
 243         subq    %rdi, %rsi
 244         /* Align s1 pointer.  */
 245         andq    $-VEC_SIZE, %rdi
 246         /* Adjust because first 4x vec where check already.  */
 247         subq    $-(VEC_SIZE * 4), %rdi
 248         .p2align 4
 249 L(loop_4x_vec):
 250         /* rsi has s2 - s1 so get correct address by adding s1 (in rdi).
 251          */
 252         vmovdqu (%rsi, %rdi), %ymm1
 253         VPCMPEQ (%rdi), %ymm1, %ymm1
 254
 255         vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2
 256         VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
 257
 258         vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3
 259         VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
 260
 261         vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4
 262         VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
 263
 264         vpand   %ymm1, %ymm2, %ymm5
 265         vpand   %ymm3, %ymm4, %ymm6
 266         vpand   %ymm5, %ymm6, %ymm7
 267         vpmovmskb %ymm7, %ecx
 268         incl    %ecx
 269         jnz     L(8x_return_vec_0_1_2_3)
 270         subq    $-(VEC_SIZE * 4), %rdi
 271         /* Check if s1 pointer at end.  */
 272         cmpq    %rdx, %rdi
 273         jb      L(loop_4x_vec)
 274
 275         subq    %rdx, %rdi
 276         /* rdi has 4 * VEC_SIZE - remaining length.  */
 277         cmpl    $(VEC_SIZE * 3), %edi
 278         jae     L(8x_last_1x_vec)
 279         /* Load regardless of branch.  */
 280         vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3
 281         cmpl    $(VEC_SIZE * 2), %edi
 282         jae     L(8x_last_2x_vec)
 283
 284         /* Check last 4 VEC.  */
 285         vmovdqu (%rsi, %rdx), %ymm1
 286         VPCMPEQ (%rdx), %ymm1, %ymm1
 287
 288         vmovdqu VEC_SIZE(%rsi, %rdx), %ymm2
 289         VPCMPEQ VEC_SIZE(%rdx), %ymm2, %ymm2
 290
 291         VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
 292
 293         vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
 294         VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
 295
 296         vpand   %ymm1, %ymm2, %ymm5
 297         vpand   %ymm3, %ymm4, %ymm6
 298         vpand   %ymm5, %ymm6, %ymm7
 299         vpmovmskb %ymm7, %ecx
 300         /* Restore s1 pointer to rdi.  */
 301         movq    %rdx, %rdi
 302         incl    %ecx
 303         jnz     L(8x_return_vec_0_1_2_3)
 304         /* NB: eax must be zero to reach here.  */
 305         VZEROUPPER_RETURN
 306
 307         /* Only entry is from L(more_8x_vec).  */
 308         .p2align 4
 309 L(8x_last_2x_vec):
 310         /* Check second to last VEC. rdx store end pointer of s1 and
 311            ymm3 has already been loaded with second to last VEC from s2.
 312          */
 313         VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
 314         vpmovmskb %ymm3, %eax
 315         incl    %eax
 316         jnz     L(8x_return_vec_2)
 317         /* Check last VEC.  */
 318         .p2align 4
 319 L(8x_last_1x_vec):
 320         vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
 321         VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
 322         vpmovmskb %ymm4, %eax
 323         incl    %eax
 324         jnz     L(8x_return_vec_3)
 325         VZEROUPPER_RETURN
 326
 327         .p2align 4
 328 L(last_2x_vec):
 329         /* Check second to last VEC.  */
 330         vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
 331         VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
 332         vpmovmskb %ymm1, %eax
 333         incl    %eax
 334         jnz     L(return_vec_1_end)
 335         /* Check last VEC.  */
 336 L(last_1x_vec):
 337         vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
 338         VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
 339         vpmovmskb %ymm1, %eax
 340         incl    %eax
 341         jnz     L(return_vec_0_end)
 342         VZEROUPPER_RETURN
 343
 344         .p2align 4
 345 L(8x_return_vec_2):
 346         subq    $VEC_SIZE, %rdx
 347 L(8x_return_vec_3):
 348         tzcntl  %eax, %eax
 349         addq    %rdx, %rax
 350 # ifdef USE_AS_WMEMCMP
 351         movl    (VEC_SIZE * 3)(%rax), %ecx
 352         xorl    %edx, %edx
 353         cmpl    (VEC_SIZE * 3)(%rsi, %rax), %ecx
 354         setg    %dl
 355         leal    -1(%rdx, %rdx), %eax
 356 # else
 357         movzbl  (VEC_SIZE * 3)(%rsi, %rax), %ecx
 358         movzbl  (VEC_SIZE * 3)(%rax), %eax
 359         subl    %ecx, %eax
 360 # endif
 361         VZEROUPPER_RETURN
 362
 363         .p2align 4
 364 L(return_vec_1_end):
 365         tzcntl  %eax, %eax
 366         addl    %edx, %eax
 367 # ifdef USE_AS_WMEMCMP
 368         movl    -(VEC_SIZE * 2)(%rdi, %rax), %ecx
 369         xorl    %edx, %edx
 370         cmpl    -(VEC_SIZE * 2)(%rsi, %rax), %ecx
 371         setg    %dl
 372         leal    -1(%rdx, %rdx), %eax
 373 # else
 374         movzbl  -(VEC_SIZE * 2)(%rsi, %rax), %ecx
 375         movzbl  -(VEC_SIZE * 2)(%rdi, %rax), %eax
 376         subl    %ecx, %eax
 377 # endif
 378         VZEROUPPER_RETURN
 379
 380         .p2align 4
 381 L(return_vec_0_end):
 382         tzcntl  %eax, %eax
 383         addl    %edx, %eax
 384 # ifdef USE_AS_WMEMCMP
 385         movl    -VEC_SIZE(%rdi, %rax), %ecx
 386         xorl    %edx, %edx
 387         cmpl    -VEC_SIZE(%rsi, %rax), %ecx
 388         setg    %dl
 389         leal    -1(%rdx, %rdx), %eax
 390 # else
 391         movzbl  -VEC_SIZE(%rsi, %rax), %ecx
 392         movzbl  -VEC_SIZE(%rdi, %rax), %eax
 393         subl    %ecx, %eax
 394 # endif
 395         VZEROUPPER_RETURN
 396
 397         .p2align 4
 398 L(less_vec):
 399         /* Check if one or less CHAR. This is necessary for size = 0 but
 400            is also faster for size = CHAR_SIZE.  */
 401         cmpl    $CHAR_SIZE, %edx
 402         jbe     L(one_or_less)
 403
 404         /* Check if loading one VEC from either s1 or s2 could cause a
 405            page cross. This can have false positives but is by far the
 406            fastest method.  */
 407         movl    %edi, %eax
 408         orl     %esi, %eax
 409         andl    $(PAGE_SIZE - 1), %eax
 410         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
 411         jg      L(page_cross_less_vec)
 412
 413         /* No page cross possible.  */
 414         vmovdqu (%rsi), %ymm2
 415         VPCMPEQ (%rdi), %ymm2, %ymm2
 416         vpmovmskb %ymm2, %eax
 417         incl    %eax
 418         /* Result will be zero if s1 and s2 match. Otherwise first set
 419            bit will be first mismatch.  */
 420         bzhil   %edx, %eax, %edx
 421         jnz     L(return_vec_0)
 422         xorl    %eax, %eax
 423         VZEROUPPER_RETURN
 424
 425         .p2align 4
 426 L(page_cross_less_vec):
 427         /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
 428            bytes.  */
 429         cmpl    $16, %edx
 430         jae     L(between_16_31)
 431 # ifndef USE_AS_WMEMCMP
 432         cmpl    $8, %edx
 433         jae     L(between_8_15)
 434         /* Fall through for [4, 7].  */
 435         cmpl    $4, %edx
 436         jb      L(between_2_3)
 437
 438         movbe   (%rdi), %eax
 439         movbe   (%rsi), %ecx
 440         shlq    $32, %rax
 441         shlq    $32, %rcx
 442         movbe   -4(%rdi, %rdx), %edi
 443         movbe   -4(%rsi, %rdx), %esi
 444         orq     %rdi, %rax
 445         orq     %rsi, %rcx
 446         subq    %rcx, %rax
 447         /* Fast path for return zero.  */
 448         jnz     L(ret_nonzero)
 449         /* No ymm register was touched.  */
 450         ret
 451
 452         .p2align 4
 453 L(one_or_less):
 454         jb      L(zero)
 455         movzbl  (%rsi), %ecx
 456         movzbl  (%rdi), %eax
 457         subl    %ecx, %eax
 458         /* No ymm register was touched.  */
 459         ret
 460
 461         .p2align 4,, 5
 462 L(ret_nonzero):
 463         sbbl    %eax, %eax
 464         orl     $1, %eax
 465         /* No ymm register was touched.  */
 466         ret
 467
 468         .p2align 4,, 2
 469 L(zero):
 470         xorl    %eax, %eax
 471         /* No ymm register was touched.  */
 472         ret
 473
 474         .p2align 4
 475 L(between_8_15):
 476         movbe   (%rdi), %rax
 477         movbe   (%rsi), %rcx
 478         subq    %rcx, %rax
 479         jnz     L(ret_nonzero)
 480         movbe   -8(%rdi, %rdx), %rax
 481         movbe   -8(%rsi, %rdx), %rcx
 482         subq    %rcx, %rax
 483         /* Fast path for return zero.  */
 484         jnz     L(ret_nonzero)
 485         /* No ymm register was touched.  */
 486         ret
 487 # else
 488         /* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
 489         vmovq   (%rdi), %xmm1
 490         vmovq   (%rsi), %xmm2
 491         VPCMPEQ %xmm1, %xmm2, %xmm2
 492         vpmovmskb %xmm2, %eax
 493         subl    $0xffff, %eax
 494         jnz     L(return_vec_0)
 495         /* Use overlapping loads to avoid branches.  */
 496         leaq    -8(%rdi, %rdx), %rdi
 497         leaq    -8(%rsi, %rdx), %rsi
 498         vmovq   (%rdi), %xmm1
 499         vmovq   (%rsi), %xmm2
 500         VPCMPEQ %xmm1, %xmm2, %xmm2
 501         vpmovmskb %xmm2, %eax
 502         subl    $0xffff, %eax
 503         /* Fast path for return zero.  */
 504         jnz     L(return_vec_0)
 505         /* No ymm register was touched.  */
 506         ret
 507 # endif
 508
 509         .p2align 4,, 10
 510 L(between_16_31):
 511         /* From 16 to 31 bytes.  No branch when size == 16.  */
 512         vmovdqu (%rsi), %xmm2
 513         VPCMPEQ (%rdi), %xmm2, %xmm2
 514         vpmovmskb %xmm2, %eax
 515         subl    $0xffff, %eax
 516         jnz     L(return_vec_0)
 517
 518         /* Use overlapping loads to avoid branches.  */
 519
 520         vmovdqu -16(%rsi, %rdx), %xmm2
 521         leaq    -16(%rdi, %rdx), %rdi
 522         leaq    -16(%rsi, %rdx), %rsi
 523         VPCMPEQ (%rdi), %xmm2, %xmm2
 524         vpmovmskb %xmm2, %eax
 525         subl    $0xffff, %eax
 526         /* Fast path for return zero.  */
 527         jnz     L(return_vec_0)
 528         /* No ymm register was touched.  */
 529         ret
 530
 531 # ifdef USE_AS_WMEMCMP
 532         .p2align 4,, 2
 533 L(zero):
 534         xorl    %eax, %eax
 535         ret
 536
 537         .p2align 4
 538 L(one_or_less):
 539         jb      L(zero)
 540         movl    (%rdi), %ecx
 541         xorl    %edx, %edx
 542         cmpl    (%rsi), %ecx
 543         je      L(zero)
 544         setg    %dl
 545         leal    -1(%rdx, %rdx), %eax
 546         /* No ymm register was touched.  */
 547         ret
 548 # else
 549
 550         .p2align 4
 551 L(between_2_3):
 552         /* Load as big endian to avoid branches.  */
 553         movzwl  (%rdi), %eax
 554         movzwl  (%rsi), %ecx
 555         bswap   %eax
 556         bswap   %ecx
 557         shrl    %eax
 558         shrl    %ecx
 559         movzbl  -1(%rdi, %rdx), %edi
 560         movzbl  -1(%rsi, %rdx), %esi
 561         orl     %edi, %eax
 562         orl     %esi, %ecx
 563         /* Subtraction is okay because the upper bit is zero.  */
 564         subl    %ecx, %eax
 565         /* No ymm register was touched.  */
 566         ret
 567 # endif
 568
 569 END (MEMCMP)
 570 #endif