sysdeps/alpha/strncmp.S

   1 /* Copyright (C) 1996-2021 Free Software Foundation, Inc.
   2    Contributed by Richard Henderson (rth@tamu.edu)
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library.  If not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 /* Bytewise compare two null-terminated strings of length no longer than N.  */
  20
  21 #include <sysdep.h>
  22
  23         .set noat               # AT is named explicitly in the PROF prologue
  24         .set noreorder          # ask the assembler not to reorder instructions
  25
  26 /* EV6 only predicts one branch per octaword.  We'll use these to push
  27    subsequent branches back to the next bundle.  This will generally add
  28    a fetch+decode cycle to older machines, so skip in that case.
        ev6_unop expands to a single unop when __alpha_fix__ is defined and
        to nothing otherwise.  NOTE(review): __alpha_fix__ is not defined in
        this file; presumably the compiler provides it for EV6-class
        targets -- confirm.  */
  29 #ifdef __alpha_fix__
  30 # define ev6_unop       unop
  31 #else
  32 # define ev6_unop
  33 #endif
  34
  35         .text
  36
  37 ENTRY(strncmp)
     /* Register contract, as used by the code below:
          a0 == s1, a1 == s2, a2 == maximum number of bytes to compare.
          v0 == result: 0 if the strings are equal up to the first null
                or the end of the count, otherwise 1 or -1 according to
                the first differing byte, compared as unsigned.
        t0-t10 and a3 are used as scratch; a0-a2 are modified.
        The strings are processed one aligned 8-byte word at a time.  */
  38 #ifdef PROF
  39         ldgp    gp, 0(pv)       # establish gp from pv
  40         lda     AT, _mcount     # address of the profiling hook
  41         jsr     AT, (AT), _mcount # call it, return address in AT
  42         .prologue 1
  43 #else
  44         .prologue 0
  45 #endif
  46
  47         xor     a0, a1, t2      # are s1 and s2 co-aligned?
  48         beq     a2, $zerolength # count == 0: return 0
  49         ldq_u   t0, 0(a0)       # load asap to give cache time to catch up
  50         ldq_u   t1, 0(a1)       # same for the first s2 word
  51         lda     t3, -1          # all ones, used to build byte masks
  52         and     t2, 7, t2       # nonzero iff the alignments differ
  53         srl     t3, 1, t6       # t6 = LONG_MAX
  54         and     a0, 7, t4       # find s1 misalignment
  55         and     a1, 7, t5       # find s2 misalignment
  56         cmovlt  a2, t6, a2      # bound neg count to LONG_MAX
  57         addq    a1, a2, a3      # s2+count
  58         addq    a2, t4, a2      # bias count by s1 misalignment
  59         and     a2, 7, t10      # ofs of last byte in s1 last word
  60         srl     a2, 3, a2       # remaining full words in s1 count
  61         bne     t2, $unaligned  # not co-aligned
  62
  63         /* On entry to this basic block:
  64            t0 == the first word of s1.
  65            t1 == the first word of s2.
  66            t3 == -1.  */
  67 $aligned:
  68         mskqh   t3, a1, t8      # mask off leading garbage
  69         ornot   t1, t8, t1      # set the bytes before s2 to 0xff
  70         ornot   t0, t8, t0      # same for s1, so they compare equal
  71         cmpbge  zero, t1, t7    # bits set iff null found
  72         beq     a2, $eoc        # check end of count
  73         bne     t7, $eos        # null in the first s2 word
  74         beq     t10, $ant_loop  # count ends on a word boundary
  75
  76         /* Aligned compare main loop.
  77            On entry to this basic block:
  78            t0 == an s1 word.
  79            t1 == an s2 word not containing a null.  */
  80
  81         .align 4
  82 $a_loop:
  83         xor     t0, t1, t2      # e0    :
  84         bne     t2, $wordcmp    # .. e1 (zdb)
  85         ldq_u   t1, 8(a1)       # e0    :
  86         ldq_u   t0, 8(a0)       # .. e1 :
  87
  88         subq    a2, 1, a2       # e0    :
  89         addq    a1, 8, a1       # .. e1 :
  90         addq    a0, 8, a0       # e0    :
  91         beq     a2, $eoc        # .. e1 :
  92
  93         cmpbge  zero, t1, t7    # e0    :
  94         beq     t7, $a_loop     # .. e1 :
  95
  96         br      $eos            # null found in the s2 word
  97
  98         /* Alternate aligned compare loop, for when there's no trailing
  99            bytes on the count.  We have to avoid reading too much data.  */
 100         .align 4
 101 $ant_loop:
 102         xor     t0, t1, t2      # e0    :
 103         ev6_unop
 104         ev6_unop
 105         bne     t2, $wordcmp    # .. e1 (zdb)
 106
 107         subq    a2, 1, a2       # e0    :
 108         beq     a2, $zerolength # .. e1 : count used up, all equal
 109         ldq_u   t1, 8(a1)       # e0    :
 110         ldq_u   t0, 8(a0)       # .. e1 :
 111
 112         addq    a1, 8, a1       # e0    :
 113         addq    a0, 8, a0       # .. e1 :
 114         cmpbge  zero, t1, t7    # e0    :
 115         beq     t7, $ant_loop   # .. e1 :
 116
 117         br      $eos            # null found in the s2 word
 118
 119         /* The two strings are not co-aligned.  Align s1 and cope.  */
 120         /* On entry to this basic block:
 121            t0 == the first word of s1.
 122            t1 == the first word of s2.
 123            t3 == -1.
 124            t4 == misalignment of s1.
 125            t5 == misalignment of s2.
 126           t10 == misalignment of s1 end.  */
 127         .align  4
 128 $unaligned:
 129         /* If s2 misalignment is larger than s1 misalignment (t4 < t5),
 130            we need extra startup checks to avoid SEGV.  */
 131         subq    a1, t4, a1      # adjust s2 for s1 misalignment
 132         cmpult  t4, t5, t9      # t9 = s1 misalignment < s2 misalignment
 133         subq    a3, 1, a3       # last byte of s2
 134         bic     a1, 7, t8       # word address of adjusted s2
 135         mskqh   t3, t5, t7      # mask garbage in s2
 136         subq    a3, t8, a3      # last byte as offset from that word
 137         ornot   t1, t7, t7      # t7 = first s2 word, garbage set to 0xff
 138         srl     a3, 3, a3       # remaining full words in s2 count
 139         beq     t9, $u_head     # no: go straight to the second load
 140
 141         /* Failing that, we need to look for both eos and eoc within the
 142            first word of s2.  If we find either, we can continue by
 143            pretending that the next word of s2 is all zeros.  */
 144         lda     t2, 0           # next = zero
 145         cmpeq   a3, 0, t8       # eoc in the first word of s2?
 146         cmpbge  zero, t7, t7    # eos in the first word of s2?
 147         or      t7, t8, t8      # either one?
 148         bne     t8, $u_head_nl  # yes: skip the load, t2 stays zero
 149
 150         /* We know just enough now to be able to assemble the first
 151            full word of s2.  We can still find a zero at the end of it.
 152
 153            On entry to this basic block:
 154            t0 == first word of s1
 155            t1 == first partial word of s2.
 156            t3 == -1.
 157            t10 == ofs of last byte in s1 last word.
 158            t11 == ofs of last byte in s2 last word.
                NOTE(review): no instruction in this routine reads or writes
                t11; this entry looks stale -- confirm.  */
 159 $u_head:
 160         ldq_u   t2, 8(a1)       # load second partial s2 word
 161         subq    a3, 1, a3       # one fewer s2 word remaining
 162 $u_head_nl:
 163         extql   t1, a1, t1      # create first s2 word
 164         mskqh   t3, a0, t8      # mask of valid bytes in first s1 word
 165         extqh   t2, a1, t4      # contribution from second s2 word
 166         ornot   t0, t8, t0      # kill s1 garbage
 167         or      t1, t4, t1      # s2 word now complete
 168         cmpbge  zero, t0, t7    # find eos in first s1 word
 169         ornot   t1, t8, t1      # kill s2 garbage
 170         beq     a2, $eoc        # count ends inside the first word
 171         subq    a2, 1, a2       # one fewer s1 word remaining
 172         bne     t7, $eos        # s1 ends inside the first word
 173         mskql   t3, a1, t8      # mask out s2[1] bits we have seen
 174         xor     t0, t1, t4      # compare aligned words
 175         or      t2, t8, t8      # seen bytes become 0xff, rest is s2[1]
 176         bne     t4, $wordcmp    # words differ
 177         cmpbge  zero, t8, t7    # eos in high bits of s2[1]?
 178         cmpeq   a3, 0, t8       # eoc in s2[1]?
 179         or      t7, t8, t7      # either one?
 180         bne     t7, $u_final    # yes: no further s2 loads
 181
 182         /* Unaligned compare main loop.  In order to avoid reading too much,
 183            the loop is structured to detect zeros in aligned words from s2.
 184            This has, unfortunately, effectively pulled half of a loop
 185            iteration out into the head and half into the tail, but it does
 186            prevent nastiness from accumulating in the very thing we want
 187            to run as fast as possible.
 188
 189            On entry to this basic block:
 190            t2 == the unshifted low-bits from the next s2 word.
 191            t10 == ofs of last byte in s1 last word.
 192            t11 == ofs of last byte in s2 last word.
                NOTE(review): no instruction in this routine reads or writes
                t11; this entry looks stale -- confirm.  */
 193         .align 4
 194 $u_loop:
 195         extql   t2, a1, t3      # e0    :
 196         ldq_u   t2, 16(a1)      # .. e1 : load next s2 high bits
 197         ldq_u   t0, 8(a0)       # e0    : load next s1 word
 198         addq    a1, 8, a1       # .. e1 :
 199
 200         addq    a0, 8, a0       # e0    :
 201         subq    a3, 1, a3       # .. e1 :
 202         extqh   t2, a1, t1      # e0    :
 203         cmpbge  zero, t0, t7    # .. e1 : eos in current s1 word
 204
 205         or      t1, t3, t1      # e0    :
 206         beq     a2, $eoc        # .. e1 : eoc in current s1 word
 207         subq    a2, 1, a2       # e0    :
 208         cmpbge  zero, t2, t4    # .. e1 : eos in s2[1]
 209
 210         xor     t0, t1, t3      # e0    : compare the words
 211         ev6_unop
 212         ev6_unop
 213         bne     t7, $eos        # .. e1 :
 214
 215         cmpeq   a3, 0, t5       # e0    : eoc in s2[1]
 216         ev6_unop
 217         ev6_unop
 218         bne     t3, $wordcmp    # .. e1 :
 219
 220         or      t4, t5, t4      # e0    : eos or eoc in s2[1].
 221         beq     t4, $u_loop     # .. e1 (zdb)
 222
 223         /* We've found a zero in the low bits of the last s2 word.  Get
 224            the next s1 word and align them.  */
 225         .align 3
 226 $u_final:
 227         ldq_u   t0, 8(a0)       # next s1 word
 228         extql   t2, a1, t1      # remaining s2 bytes, shifted into place
 229         cmpbge  zero, t1, t7    # find null bytes in that s2 word
 230         bne     a2, $eos        # count not used up yet
 231
 232         /* We've hit end of count.  Zero everything after the count
 233            and compare what's left.  */
 234         .align 3
 235 $eoc:
 236         mskql   t0, t10, t0     # keep only the bytes below offset t10
 237         mskql   t1, t10, t1     # same for the s2 word
 238         cmpbge  zero, t1, t7    # cleared bytes now count as nulls
 239
 240         /* We've found a zero somewhere in a word we just read.
 241            On entry to this basic block:
 242            t0 == s1 word
 243            t1 == s2 word
 244            t7 == cmpbge mask containing the zero.  */
 245         .align 3
 246 $eos:
 247         negq    t7, t6          # create bytemask of valid data
 248         and     t6, t7, t8      # t8 = lowest set bit of t7
 249         subq    t8, 1, t6       # t6 = all bits below it
 250         or      t6, t8, t7      # t7 = bytes up to and including it
 251         zapnot  t0, t7, t0      # kill the garbage
 252         zapnot  t1, t7, t1
 253         xor     t0, t1, v0      # ... and compare
 254         beq     v0, $done       # equal: v0 is already 0
 255
 256         /* Here we have two differing co-aligned words in t0 & t1.
 257            Bytewise compare them and return (t0 > t1 ? 1 : -1).  */
 258         .align 3
 259 $wordcmp:
 260         cmpbge  t0, t1, t2      # comparison yields bit mask of ge
 261         cmpbge  t1, t0, t3      # mask of bytes where t1 >= t0
 262         xor     t2, t3, t0      # bits set iff t0/t1 bytes differ
 263         negq    t0, t1          # clear all but least bit
 264         and     t0, t1, t0
 265         lda     v0, -1          # default result: -1
 266         and     t0, t2, t1      # was bit set in t0 > t1?
 267         cmovne  t1, 1, v0       # yes: result is 1
 268 $done:
 269         ret
 270
 271         .align 3
 272 $zerolength:
 273         clr     v0              # return 0
 274         ret
 275
 276         END(strncmp)
 277 libc_hidden_builtin_def (strncmp)