ports/sysdeps/aarch64/memcmp.S

   1 /* memcmp - compare memory
   2
   3    Copyright (C) 2013-2014 Free Software Foundation, Inc.
   4
   5    This file is part of the GNU C Library.
   6
   7    The GNU C Library is free software; you can redistribute it and/or
   8    modify it under the terms of the GNU Lesser General Public
   9    License as published by the Free Software Foundation; either
  10    version 2.1 of the License, or (at your option) any later version.
  11
  12    The GNU C Library is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    Lesser General Public License for more details.
  16
  17    You should have received a copy of the GNU Lesser General Public
  18    License along with the GNU C Library.  If not, see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include <sysdep.h>
  22
  23 /* Assumptions:
  24  *
  25  * ARMv8-a, AArch64
  26  */
  27
   28 /* Parameters and result.  The three arguments arrive in x0, x1 and
         x2.  RESULT is the same register as SRC1 (x0), so SRC1 must no
         longer be needed once RESULT is written.  */
   29 #define src1            x0
   30 #define src2            x1
   31 #define limit           x2
   32 #define result          x0
   33
   34 /* Internal variables.  DATA1W and DATA2W name the 32-bit (w) views of
         the registers behind DATA1 and DATA2.  HAS_NUL and TMP3 are defined
         here but are not referenced by the code visible in this file.  */
   35 #define data1           x3
   36 #define data1w          w3
   37 #define data2           x4
   38 #define data2w          w4
   39 #define has_nul         x5
   40 #define diff            x6
   41 #define endloop         x7
   42 #define tmp1            x8
   43 #define tmp2            x9
   44 #define tmp3            x10
   45 #define pos             x11
   46 #define limit_wd        x12
   47 #define mask            x13
  48
   49 ENTRY_ALIGN (memcmp, 6)
              /* Compare LIMIT bytes starting at SRC1 with LIMIT bytes starting
                 at SRC2.  RESULT is zero when LIMIT is zero or no byte differs;
                 otherwise it is a value whose sign gives the order of the first
                 mismatching position, with the data treated as unsigned.

                 Three paths are used:
                   - both pointers 8-byte aligned: compare one Dword (8 bytes)
                     per iteration in L(loop_aligned);
                   - both pointers share the same non-zero offset within a
                     Dword: L(mutual_align) rounds them down, neutralises the
                     leading bytes and joins the Dword loop;
                   - the offsets differ: L(misaligned8) compares byte by byte.

                 ENTRY_ALIGN comes from outside this file (presumably
                 <sysdep.h>); the 6 is presumably a log2 alignment, i.e. a
                 64-byte aligned entry point -- confirm against its
                 definition.  */
   50         cbz     limit, L(ret0)          /* Nothing to compare: return 0.  */
   51         eor     tmp1, src1, src2
   52         tst     tmp1, #7                /* Do the low 3 address bits differ?  */
   53         b.ne    L(misaligned8)          /* Yes: no common Dword alignment.  */
   54         ands    tmp1, src1, #7          /* tmp1 = offset of src1 within its Dword.  */
   55         b.ne    L(mutual_align)
   56         add     limit_wd, limit, #7
   57         lsr     limit_wd, limit_wd, #3  /* limit_wd = Dwords to examine, rounded up.  */
   58         /* Start of performance-critical section  -- one 64B cache line.  */
   59 L(loop_aligned):
   60         ldr     data1, [src1], #8       /* Load a Dword, then advance the pointer.  */
   61         ldr     data2, [src2], #8
   62 L(start_realigned):
   63         subs    limit_wd, limit_wd, #1  /* Z is set when this is the last Dword.  */
   64         eor     diff, data1, data2      /* Non-zero if differences found.  */
   65         csinv   endloop, diff, xzr, ne  /* Last Dword (all ones) or differences.  */
   66         cbz     endloop, L(loop_aligned)
   67         /* End of performance-critical section  -- one 64B cache line.  */
   68
   69         /* Not reached the limit, must have found a diff.  */
   70         cbnz    limit_wd, L(not_limit)
   71
   72         /* Limit % 8 == 0 => all bytes significant.  */
   73         ands    limit, limit, #7
   74         b.eq    L(not_limit)
   75
   76         lsl     limit, limit, #3        /* Bytes -> bits.  */
   77         mov     mask, #~0
   78 #ifdef __AARCH64EB__
   79         lsr     mask, mask, limit
   80 #else
   81         lsl     mask, mask, limit
   82 #endif
              /* MASK now covers the bytes of the last Dword that lie beyond
                 the limit.  Clear them in both data words so they compare
                 equal, and set them in DIFF so that the bit scan below stops
                 at the end of the significant data if nothing differed
                 earlier.  */
   83         bic     data1, data1, mask
   84         bic     data2, data2, mask
   85
   86         orr     diff, diff, mask
   87 L(not_limit):
   88
   89 #ifndef __AARCH64EB__
              /* Little-endian: byte-reverse so that the byte at the lowest
                 address ends up at the most significant end.  */
   90         rev     diff, diff
   91         rev     data1, data1
   92         rev     data2, data2
   93 #endif
   94         /* The MS-non-zero bit of DIFF marks either the first bit
   95            that is different, or the end of the significant data.
   96            Shifting left now will bring the critical information into the
   97            top bits.  */
   98         clz     pos, diff
   99         lsl     data1, data1, pos
  100         lsl     data2, data2, pos
  101         /* But we need to zero-extend (char is unsigned) the value and then
  102            perform a signed 32-bit subtraction.  */
  103         lsr     data1, data1, #56
  104         sub     result, data1, data2, lsr #56
  105         RET
  106
  107 L(mutual_align):
  108         /* Sources are mutually aligned, but are not currently at an
  109            alignment boundary.  Round down the addresses and then mask off
  110            the bytes that precede the start point.  */
  111         bic     src1, src1, #7
  112         bic     src2, src2, #7
  113         add     limit, limit, tmp1      /* Adjust the limit for the extra.  */
  114         lsl     tmp1, tmp1, #3          /* Bytes beyond alignment -> bits.  */
  115         ldr     data1, [src1], #8
  116         neg     tmp1, tmp1              /* Bits to alignment -64.  */
  117         ldr     data2, [src2], #8
  118         mov     tmp2, #~0
  119 #ifdef __AARCH64EB__
  120         /* Big-endian.  Early bytes are at MSB.  */
  121         lsl     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
  122 #else
  123         /* Little-endian.  Early bytes are at LSB.  */
  124         lsr     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
  125 #endif
              /* TMP2 now has ones in exactly the bytes that precede the start
                 point.  ORing it into both words forces those bytes to the
                 same value, so they can never show up as a difference.  */
  126         add     limit_wd, limit, #7
  127         orr     data1, data1, tmp2
  128         orr     data2, data2, tmp2
  129         lsr     limit_wd, limit_wd, #3  /* Dword count for the adjusted limit.  */
  130         b       L(start_realigned)      /* Join the loop after its loads.  */
  131
  132 L(ret0):
  133         mov     result, #0
  134         RET
  135
  136         .p2align 6
  137 L(misaligned8):
              /* Byte-at-a-time fallback.  LIMIT is pre-decremented so that the
                 SUBS in the loop borrows (C clear) while handling the final
                 byte.  */
  138         sub     limit, limit, #1
  139 1:
  140         /* Perhaps we can do better than this.  */
  141         ldrb    data1w, [src1], #1
  142         ldrb    data2w, [src2], #1
  143         subs    limit, limit, #1
  144         ccmp    data1w, data2w, #0, cs  /* If C is set (bytes remain) compare the bytes, else NZCV = 0b0000 (reads as NE).  */
  145         b.eq    1b                      /* Loop while equal and bytes remain.  */
  146         sub     result, data1, data2    /* Difference of the last bytes loaded.  */
  147         RET
  148 END (memcmp)
  149 #undef bcmp
      /* The #undef above drops any preprocessor macro named bcmp so that
         the name can be used as a plain symbol below.  weak_alias and
         libc_hidden_builtin_def are macros defined outside this file:
         weak_alias presumably publishes bcmp as a weak alias of memcmp, and
         libc_hidden_builtin_def presumably sets up the library-internal
         hidden definition -- confirm against their definitions.  */
  150 weak_alias (memcmp, bcmp)
  151 libc_hidden_builtin_def (memcmp)