/* Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <isa-level.h>
/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation,
   so this file must also be built for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)
# define MEMCMP __memcmp_sse2
# ifdef USE_AS_WMEMCMP
#  define PCMPEQ pcmpeqd
#  define SIZE_OFFSET (0)
#  define PCMPEQ pcmpeqb
# ifdef USE_AS_MEMCMPEQ
#  define SIZE_OFFSET (0)
#  define CHECK_CMP(x, y) subl x, y
#  define SIZE_OFFSET (CHAR_PER_VEC * 2)
#  define CHECK_CMP(x, y) cmpl x, y
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
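/* For the 16-byte SSE2 vectors used here this works out to 16 chars
   per vector for memcmp/memcmpeq (CHAR_SIZE == 1) and 4 for wmemcmp
   (CHAR_SIZE == 4).  */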
# define MEMCMP memcmp
	/* Clear the upper 32 bits.  */
# ifdef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches on the pmovmskb bitmask.
	   Store it in ecx for code size.  This is preferable to using
	   `incw` as it avoids partial register stalls on older hardware
	   (pre SnB).  */
	cmpq $CHAR_PER_VEC, %rdx
# ifdef USE_AS_WMEMCMP
	/* Saves a byte of code by keeping the fall-through path n = [2, 4]
	   in the initial cache line.  */
	jnz L(ret_nonzero_vec_start_0)
	movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0
	movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1
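	/* These 8-byte loads pick up the last two wchars; for n in [2, 4]
	   they can overlap the bytes already checked above, so the whole
	   range is covered by two compares without a loop.  */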
	jnz L(ret_nonzero_vec_end_0_adj)
# ifdef USE_AS_MEMCMPEQ
	movl -4(%rsi, %rdx), %esi
	subl -4(%rdi, %rdx), %esi
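	/* memcmpeq only needs a zero/non-zero result, so the last four
	   bytes can be compared with a plain subtract; no byte-order
	   fix-up for a signed return is needed.  */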
	/* Combine comparisons for lo and hi 4-byte comparisons.  */
	movl -4(%rsi, %rdx), %ecx
	movl -4(%rdi, %rdx), %eax
	/* Only compute proper return if not-equal.  */
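	/* An illustrative C sketch of the lo/hi combine above (load32 is a
	   hypothetical 4-byte load helper, not part of this file):

	       uint64_t a = ((uint64_t) load32 (s1 + n - 4) << 32) | load32 (s1);
	       uint64_t b = ((uint64_t) load32 (s2 + n - 4) << 32) | load32 (s2);
	       if (a == b)
	         return 0;
	       ... otherwise derive the signed return from the differing bytes ...

	   Two overlapping 4-byte loads per buffer cover the whole short
	   length range handled here with a single 8-byte compare.  */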
# ifdef USE_AS_MEMCMPEQ
	movq -8(%rsi, %rdx), %rcx
	subq -8(%rdi, %rdx), %rcx
	/* Convert 64 bit -> 32 bit boolean (we should have made the ABI
	   return long).  */
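	/* The 64-bit difference cannot simply be truncated to the 32-bit
	   int return: a value such as 1 << 32 is non-zero but would
	   truncate to zero, so it has to be reduced to a flag first.  */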
	/* Only compute proper return if not-equal.  */
	movq -8(%rsi, %rdx, CHAR_SIZE), %rcx
	movq -8(%rdi, %rdx, CHAR_SIZE), %rax
	/* Only compute proper return if not-equal.  */
	/* Flag set by earlier comparison against 1.  */
# ifdef USE_AS_WMEMCMP
	leal -1(%rdx, %rdx), %eax
	/* Fits in aligning bytes.  */
# ifdef USE_AS_WMEMCMP
L(ret_nonzero_vec_start_0):
	movl (%rdi, %rax), %ecx
	cmpl (%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	leal -1(%rdx, %rdx), %eax
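	/* %edx was zeroed by the xorl idiom and its low byte is set only
	   when the dword from s1 compares greater, so 2 * %rdx - 1 yields
	   the +1/-1 return without a branch.  */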
# ifndef USE_AS_MEMCMPEQ
	/* Need to bswap to get proper return without branch.  */
# ifdef USE_AS_MEMCMPEQ
	/* No reason to add to the dependency chain on rdx.  Saving the
	   bytes here doesn't change the number of fetch blocks.  */
	/* We need the code size to prevent taking an extra fetch block.  */
# ifdef USE_AS_MEMCMPEQ
	movzbl -1(%rsi, %rdx), %esi
	movzbl -1(%rdi, %rdx), %edi
	/* Implicit right shift by one.  We just need to displace the
	   sign bits.  */
	/* Eat a partial register stall here.  Saves code by stopping
	   L(cmp_0_3) from bleeding into the next fetch block and saves
	   an ALU.  */
	movb (%rsi, %rdx), %cl
	movzbl (%rdi, %rdx), %edi
# ifndef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches on the pmovmskb bitmask.
	   Store it in ecx for code size.  This is preferable to using
	   `incw` as it avoids partial register stalls on older hardware
	   (pre SnB).  */
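	/* An illustrative C-intrinsics sketch (not part of this file) of
	   the mismatch test the code below performs with PCMPEQ +
	   pmovmskb.  Each equal byte sets one bit of the mask, so 0xffff
	   means all 16 bytes match (s1/s2 stand for the two buffers):

	       // #include <emmintrin.h>
	       __m128i a = _mm_loadu_si128 ((const __m128i *) s1);
	       __m128i b = _mm_loadu_si128 ((const __m128i *) s2);
	       int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, b));
	       if (mask != 0xffff)
	         ... some byte differed ...  */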
	jnz L(ret_nonzero_vec_start_0)
# if SIZE_OFFSET == 0
	cmpq $(CHAR_PER_VEC * 2), %rdx
	/* Offset rdx.  Saves just enough code size to keep the
	   L(last_2x_vec) case and the non-zero return in a single
	   cache line.  */
	subq $(CHAR_PER_VEC * 2), %rdx
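	/* Because SIZE_OFFSET equals the amount subtracted from %rdx, the
	   bias cancels out in the (VEC_SIZE * -1 + SIZE_OFFSET)(...,
	   %rdx, CHAR_SIZE) style addressing below, so the loads still hit
	   the last vector of the buffers.  */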
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
# ifndef USE_AS_MEMCMPEQ
	/* Don't use `incw ax` as the machines this code runs on are liable
	   to have a partial register stall.  */
	jnz L(ret_nonzero_vec_end_0)
	/* Various return targets for memcmpeq.  Will always be hot in
	   the Icache and get short encoding.  */
L(ret_nonzero_vec_start_1):
L(ret_nonzero_vec_start_0):
L(ret_nonzero_vec_end_0):
# ifndef USE_AS_MEMCMPEQ
# ifdef USE_AS_WMEMCMP
L(ret_nonzero_vec_end_0_adj):
L(ret_nonzero_vec_end_0):
# ifdef USE_AS_WMEMCMP
	leal (%rax, %rdx, CHAR_SIZE), %eax
	movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
	cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	leal -1(%rdx, %rdx), %eax
	/* Use `addq` instead of `addl` here so that even if `rax` + `rdx`
	   is negative, the value of the sum is still usable as a 64-bit
	   offset (negative 32-bit numbers zero-extend to large, and often
	   out-of-bounds, 64-bit offsets).  Note that `rax` + `rdx` >= 0 is
	   an invariant when `memcmp` is used correctly, but if the input
	   strings `rsi`/`rdi` are concurrently modified while the function
	   runs (i.e. there is a data race) it is possible for `rax` + `rdx`
	   to be negative.  Given that there is virtually no extra cost to
	   using `addq` instead of `addl`, we may as well protect the
	   data-race case.  */
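	/* For example, if a racy mismatch left %eax + %edx == -1, `addl`
	   would give %rax == 0xffffffff (an offset of almost 4 GiB past
	   the buffer), while `addq` keeps %rax == -1, so the subsequent
	   access stays next to the buffer instead of far away from it.  */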
	movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
# ifndef USE_AS_WMEMCMP
L(ret_nonzero_vec_start_0):
	movzbl (%rsi, %rax), %ecx
	movzbl (%rdi, %rax), %eax
	movups (VEC_SIZE * 1)(%rsi), %xmm0
	movups (VEC_SIZE * 1)(%rdi), %xmm1
	jnz L(ret_nonzero_vec_start_1)
	cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
	cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
	/* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
	   This can harm performance if the non-zero return is in [65, 80]
	   or [97, 112], but helps performance otherwise.  Generally the
	   zero (all-equal) case is the one that matters most for
	   performance.  */
	movups (VEC_SIZE * 2)(%rsi), %xmm0
	movups (VEC_SIZE * 2)(%rdi), %xmm1
	movups (VEC_SIZE * 3)(%rsi), %xmm2
	movups (VEC_SIZE * 3)(%rdi), %xmm3
	CHECK_CMP (%ecx, %eax)
	jnz L(ret_nonzero_vec_start_2_3)
	cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
	movups (VEC_SIZE * 4)(%rsi), %xmm0
	movups (VEC_SIZE * 4)(%rdi), %xmm1
	movups (VEC_SIZE * 5)(%rsi), %xmm2
	movups (VEC_SIZE * 5)(%rdi), %xmm3
	CHECK_CMP (%ecx, %eax)
# ifdef USE_AS_MEMCMPEQ
	jnz L(ret_nonzero_vec_start_4_5)
	movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
# ifdef USE_AS_MEMCMPEQ
	/* Various return targets for memcmpeq.  Will always be hot in
	   the Icache and get short encoding.  */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
	jnz L(ret_nonzero_vec_end_1)
L(ret_nonzero_vec_end_1):
	/* High 16 bits of eax guaranteed to be all ones.  Rotate them in
	   so we can do `or + not` with just `xor`.  */
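	/* xor against all-ones bits is equivalent to `not`, which is why
	   the invert-and-merge of the new mask can be a single `xor`
	   here.  */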
	/* Partial register stall.  */
# ifdef USE_AS_WMEMCMP
	leal (%rax, %rdx, CHAR_SIZE), %eax
	movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
	cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	leal -1(%rdx, %rdx), %eax
	movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
L(ret_nonzero_vec_start_4_5):
	leal 1(%rax, %rdx), %eax
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 4)(%rdi, %rax), %ecx
	cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	leal -1(%rdx, %rdx), %eax
	movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax
L(ret_nonzero_vec_start_1):
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 1)(%rdi, %rax), %ecx
	cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	leal -1(%rdx, %rdx), %eax
	movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax
	leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
	andq $(VEC_SIZE * -1), %rdi
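	/* %rdx is converted from a length into a pointer-based loop bound,
	   and %rdi is rounded down to a VEC_SIZE boundary so the PCMPEQ
	   memory operands in the loop below are aligned; %rsi keeps
	   whatever misalignment remains and is loaded with movups.  */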
	movups (VEC_SIZE * 2)(%rsi), %xmm0
	movups (VEC_SIZE * 3)(%rsi), %xmm1
	PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0
	PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1
	movups (VEC_SIZE * 4)(%rsi), %xmm2
	movups (VEC_SIZE * 5)(%rsi), %xmm3
	PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2
	PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3
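	/* Four vectors (4 * VEC_SIZE bytes) are compared per iteration;
	   the results are combined so a single test decides whether any
	   byte in the block differed.  */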
	jnz L(ret_nonzero_loop)
	addq $(VEC_SIZE * 4), %rdi
	addq $(VEC_SIZE * 4), %rsi
	/* Get remaining length in edx.  */
	/* Restore offset so we can reuse L(last_2x_vec).  */
	addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx
# ifdef USE_AS_WMEMCMP
	cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
	movups (VEC_SIZE * 2)(%rsi), %xmm0
	movups (VEC_SIZE * 2)(%rdi), %xmm1
	movups (VEC_SIZE * 3)(%rsi), %xmm2
	movups (VEC_SIZE * 3)(%rdi), %xmm3
	CHECK_CMP (%ecx, %eax)
# ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_vec_start_2_3):
	leal 1(%rax, %rdx), %eax
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
	cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	leal -1(%rdx, %rdx), %eax
	movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
	sall $(VEC_SIZE * 1), %edx
	leal 1(%rcx, %rdx), %edx
	/* High 16 bits of eax guaranteed to be all ones.  Rotate them in
	   so we can do `or + not` with just `xor`.  */
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
	cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	leal -1(%rdx, %rdx), %eax
	movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax