sysdeps/aarch64/memcmp.S

   1 /* memcmp - compare memory
   2
   3    Copyright (C) 2013-2017 Free Software Foundation, Inc.
   4
   5    This file is part of the GNU C Library.
   6
   7    The GNU C Library is free software; you can redistribute it and/or
   8    modify it under the terms of the GNU Lesser General Public
   9    License as published by the Free Software Foundation; either
  10    version 2.1 of the License, or (at your option) any later version.
  11
  12    The GNU C Library is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    Lesser General Public License for more details.
  16
  17    You should have received a copy of the GNU Lesser General Public
  18    License along with the GNU C Library.  If not, see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include <sysdep.h>
  22
  23 /* Assumptions:
  24  *
  25  * ARMv8-a, AArch64
  26  */
  27
  28 /* Parameters and result.  */
  29 #define src1            x0      /* Pointer into the first buffer.  */
  30 #define src2            x1      /* Pointer into the second buffer.  */
  31 #define limit           x2      /* Number of bytes to compare.  */
  32 #define result          x0      /* Return value, shares x0 with src1.  */
  33
  34 /* Internal variables.  */
  35 #define data1           x3      /* Data loaded from src1.  */
  36 #define data1w          w3      /* 32-bit view of data1, used by the byte loop.  */
  37 #define data2           x4      /* Data loaded from src2.  */
  38 #define data2w          w4      /* 32-bit view of data2, used by the byte loop.  */
  39 #define has_nul         x5      /* NOTE(review): not referenced anywhere in this file.  */
  40 #define diff            x6      /* data1 XOR data2, non-zero where they differ.  */
  41 #define endloop         x7      /* Non-zero when the dword loop must stop.  */
  42 #define tmp1            x8      /* Scratch.  */
  43 #define tmp2            x9      /* Scratch.  */
  44 #define tmp3            x10     /* NOTE(review): not referenced anywhere in this file.  */
  45 #define pos             x11     /* Leading-zero count of diff.  */
  46 #define limit_wd        x12     /* Number of 8-byte dwords still to compare.  */
  47 #define mask            x13     /* Mask of the bytes beyond the limit in the last dword.  */
  48
  49 ENTRY_ALIGN (memcmp, 6)
             /* Compare LIMIT bytes at SRC1 with LIMIT bytes at SRC2.
                RESULT is 0 when every compared byte matches (or LIMIT is 0),
                otherwise a non-zero value whose sign tells which buffer holds
                the larger unsigned byte at the first mismatch.

                Three paths are used:
                  - both pointers 8-byte aligned: compare one dword (8 bytes)
                    per iteration in L(loop_aligned)
                  - both pointers share the same misalignment: round them
                    down, neutralise the leading bytes, then join the dword
                    loop at L(start_realigned)
                  - low three address bits differ: byte loop at
                    L(misaligned8).

                NOTE(review): ENTRY_ALIGN, DELOUSE, RET and END are macros
                defined outside this file, presumably in sysdep.h.  The 6
                presumably requests 2^6 = 64 byte alignment of the entry
                point - confirm there.  */
  50         DELOUSE (0)
  51         DELOUSE (1)
  52         DELOUSE (2)
  53         cbz     limit, L(ret0)          /* Nothing to compare: equal.  */
  54         eor     tmp1, src1, src2
  55         tst     tmp1, #7                /* Do the low three address bits match?  */
  56         b.ne    L(misaligned8)
  57         ands    tmp1, src1, #7          /* tmp1 = bytes past an 8-byte boundary.  */
  58         b.ne    L(mutual_align)
  59         add     limit_wd, limit, #7     /* limit_wd = limit rounded up to whole dwords.  */
  60         lsr     limit_wd, limit_wd, #3
  61         /* Start of performance-critical section  -- one 64B cache line.  */
  62 L(loop_aligned):
  63         ldr     data1, [src1], #8       /* Load 8 bytes, then advance the pointer.  */
  64         ldr     data2, [src2], #8
  65 L(start_realigned):
  66         subs    limit_wd, limit_wd, #1  /* Z is set once the last dword is consumed.  */
  67         eor     diff, data1, data2      /* Non-zero if differences found.  */
             /* endloop = diff while dwords remain (ne), all ones otherwise,
                so the loop also stops after the last dword.  */
  68         csinv   endloop, diff, xzr, ne  /* Last Dword or differences.  */
  69         cbz     endloop, L(loop_aligned)
  70         /* End of performance-critical section  -- one 64B cache line.  */
  71
  72         /* Not reached the limit, must have found a diff.  */
  73         cbnz    limit_wd, L(not_limit)
  74
  75         /* Limit % 8 == 0 => all bytes significant.  */
  76         ands    limit, limit, #7
  77         b.eq    L(not_limit)
  78
  79         lsl     limit, limit, #3        /* Bytes -> bits.  */
             /* Build a mask covering the bytes of the last dword that lie
                beyond the limit: the high bits on little-endian, the low
                bits on big-endian.  */
  80         mov     mask, #~0
  81 #ifdef __AARCH64EB__
  82         lsr     mask, mask, limit
  83 #else
  84         lsl     mask, mask, limit
  85 #endif
  86         bic     data1, data1, mask      /* Zero the bytes beyond the limit.  */
  87         bic     data2, data2, mask
  88
             /* Flag the out-of-range bytes as a difference so that the
                scan below stops at the end of the significant data.  Both
                data words are zero there, so that case yields 0.  */
  89         orr     diff, diff, mask
  90 L(not_limit):
  91
             /* On little-endian the earliest byte is at the LSB.  Reverse the
                bytes so that the earliest byte is at the MSB, as it already is
                on big-endian.  */
  92 #ifndef __AARCH64EB__
  93         rev     diff, diff
  94         rev     data1, data1
  95         rev     data2, data2
  96 #endif
  97         /* The MS-non-zero bit of DIFF marks either the first bit
  98            that is different, or the end of the significant data.
  99            Shifting left now will bring the critical information into the
 100            top bits.  */
 101         clz     pos, diff
 102         lsl     data1, data1, pos
 103         lsl     data2, data2, pos
 104         /* But we need to zero-extend (char is unsigned) the value and then
 105            perform a signed 32-bit subtraction.  */
 106         lsr     data1, data1, #56       /* Keep only the top 8 bits.  */
 107         sub     result, data1, data2, lsr #56
 108         RET
 109
 110 L(mutual_align):
 111         /* Sources are mutually aligned, but are not currently at an
 112            alignment boundary.  Round down the addresses and then mask off
 113            the bytes that precede the start point.  */
 114         bic     src1, src1, #7
 115         bic     src2, src2, #7
 116         add     limit, limit, tmp1      /* Adjust the limit for the extra.  */
 117         lsl     tmp1, tmp1, #3          /* Bytes beyond alignment -> bits.  */
 118         ldr     data1, [src1], #8
 119         neg     tmp1, tmp1              /* Bits to alignment -64.  */
 120         ldr     data2, [src2], #8
 121         mov     tmp2, #~0
             /* After the shift tmp2 has ones exactly over the bytes that
                precede the original start address.  */
 122 #ifdef __AARCH64EB__
 123         /* Big-endian.  Early bytes are at MSB.  */
 124         lsl     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
 125 #else
 126         /* Little-endian.  Early bytes are at LSB.  */
 127         lsr     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
 128 #endif
 129         add     limit_wd, limit, #7     /* Dword count for the adjusted limit.  */
             /* Force the leading bytes to all ones in both words so that
                they always compare equal.  */
 130         orr     data1, data1, tmp2
 131         orr     data2, data2, tmp2
 132         lsr     limit_wd, limit_wd, #3
 133         b       L(start_realigned)
 134
 135 L(ret0):
 136         mov     result, #0
 137         RET
 138
 139         .p2align 6
 140 L(misaligned8):
             /* The pointers differ in their low three bits, so they can
                never both be 8-byte aligned: compare one byte at a time.
                The limit is pre-decremented so that the subs in the loop
                borrows (clears C) on the last byte.  */
 141         sub     limit, limit, #1
 142 1:
 143         /* Perhaps we can do better than this.  */
 144         ldrb    data1w, [src1], #1
 145         ldrb    data2w, [src2], #1
 146         subs    limit, limit, #1
             /* While bytes remain (C set) compare the two bytes.  Otherwise
                set the flags to not-equal so that the loop exits.  */
 147         ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
 148         b.eq    1b
 149         sub     result, data1, data2    /* Difference of the last pair of bytes loaded.  */
 150         RET
 151 END (memcmp)
      /* Drop any preprocessor definition of bcmp so that the plain name
         reaches the alias macro below.  */
 152 #undef bcmp
      /* NOTE(review): weak_alias and libc_hidden_builtin_def are macros
         defined outside this file.  Presumably the first makes bcmp a weak
         alias of memcmp and the second marks memcmp for internal use inside
         the library - confirm against their definitions.  */
 153 weak_alias (memcmp, bcmp)
 154 libc_hidden_builtin_def (memcmp)