sysdeps/alpha/stxncpy.S

   1 /* Copyright (C) 1996, 1997 Free Software Foundation, Inc.
   2    Contributed by Richard Henderson (rth@tamu.edu)
   3
   4 This file is part of the GNU C Library.
   5
   6 The GNU C Library is free software; you can redistribute it and/or
   7 modify it under the terms of the GNU Library General Public License as
   8 published by the Free Software Foundation; either version 2 of the
   9 License, or (at your option) any later version.
  10
  11 The GNU C Library is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 Library General Public License for more details.
  15
  16 You should have received a copy of the GNU Library General Public
  17 License along with the GNU C Library; see the file COPYING.LIB.  If
  18 not, write to the Free Software Foundation, Inc., 675 Mass Ave,
  19 Cambridge, MA 02139, USA.  */
  20
  21 /* Copy no more than COUNT bytes of the null-terminated string from
  22    SRC to DST.
  23
  24    This is an internal routine used by strncpy, stpncpy, and strncat.
  25    As such, it uses special linkage conventions to make implementation
  26    of these public functions more efficient.
  27
  28    On input:
  29         t9 = return address
  30         a0 = DST
  31         a1 = SRC
  32         a2 = COUNT
  33
  34    Furthermore, COUNT may not be zero.
  35
  36    On output:
  37         t0  = last word written
  38         t8  = bitmask (with one bit set) indicating the last byte written
  39         t10 = bitmask (with one bit set) indicating the byte position of
  40               the end of the range specified by COUNT
  41         a0  = unaligned address of the last *word* written
  42         a2  = the number of full words left in COUNT
  43
  44    Furthermore, v0, a3-a5, t11, and t12 are untouched.
  45 */
  46
  47
  48 /* This is generally scheduled for the EV5, but should still be pretty
  49    good for the EV4 too.  */
  50
  51 #include <sysdep.h>
  52
  53         .set noat
  54         .set noreorder
  55
  56         .text
  57
  58 /* Either gdb (as of 4.16) or gas (as of 2.7) mishandles a procedure whose
  59    entry point lies somewhere in the middle of its procedure descriptor.
  60    Work around this by giving the aligned copy its own procedure
  61    descriptor.  */
  62
  63         .ent stxncpy_aligned
  64         .align 3
  65 stxncpy_aligned:
  66         .frame sp, 0, t9, 0
  67         .prologue 0
  68         /* Copy path for co-aligned SRC and DST; __stxncpy branches here.  */
  69         /* On entry to this basic block:
  70            t0 == the first destination word for masking back in
  71            t1 == the first source word (a1 already points 8 bytes past it).  */
  72
  73         /* Create the 1st output word and detect 0's in the 1st input word.  */
  74         lda     t2, -1          # e1    : build a mask against false zero
  75         mskqh   t2, a1, t2      # e0    :   detection in the src word
  76         mskqh   t1, a1, t3      # e0    : t3 = src word with bytes before the start cleared
  77         ornot   t1, t2, t2      # .. e1 : t2 = src word with bytes before the start forced nonzero
  78         mskql   t0, a1, t0      # e0    : assemble the first output word
  79         cmpbge  zero, t2, t7    # .. e1 : bits set iff null found
  80         or      t0, t3, t0      # e0    : kept dst bytes merged with src bytes
  81         beq     a2, $a_eoc      # .. e1 : count ends within this word
  82         bne     t7, $a_eos      # .. e1 : null found in the first word
  83
  84         /* On entry to this basic block:
  85            t0 == a source word not containing a null.  */
  86
  87 $a_loop:
  88         stq_u   t0, 0(a0)       # e0    : store the word just checked
  89         addq    a0, 8, a0       # .. e1 : advance destination
  90         ldq_u   t0, 0(a1)       # e0    : fetch the next source word
  91         addq    a1, 8, a1       # .. e1 : advance source
  92         subq    a2, 1, a2       # e0    : one fewer full word left in COUNT
  93         cmpbge  zero, t0, t7    # .. e1 (stall) : bits set iff null found
  94         beq     a2, $a_eoc      # e1    : count ends within this word
  95         beq     t7, $a_loop     # e1    : no null yet, keep copying
  96
  97         /* Take care of the final (partial) word store.  At this point
  98            the end-of-count bit is set in t7 iff it applies.
  99
 100            On entry to this basic block we have:
 101            t0 == the source word containing the null
 102            t7 == the cmpbge mask that found it.  */
 103
 104 $a_eos:
 105         negq    t7, t8          # e0    : find low bit set
 106         and     t7, t8, t8      # e1 (stall) : t8 = lowest set bit of t7
 107
 108         /* For the sake of the cache, don't read a destination word
 109            if we're not going to need it.  */
 110         and     t8, 0x80, t6    # e0    : is the last byte written byte 7?
 111         bne     t6, 1f          # .. e1 (zdb) : yes, the whole word is stored
 112
 113         /* We're doing a partial word store and so need to combine
 114            our source and original destination words.  */
 115         ldq_u   t1, 0(a0)       # e0    : original destination word
 116         subq    t8, 1, t6       # .. e1 : bits below t8
 117         or      t8, t6, t7      # e0    : byte mask up to and including t8
 118         unop                    #
 119         zapnot  t0, t7, t0      # e0    : clear src bytes > null
 120         zap     t1, t7, t1      # .. e1 : clear dst bytes <= null
 121         or      t0, t1, t0      # e1    : combined final word
 122
 123 1:      stq_u   t0, 0(a0)       # e0    : write the final word
 124         ret     (t9)            # e1    : t9 holds the return address
 125
 126         /* Add the end-of-count bit to the eos detection bitmask.  */
 127 $a_eoc:
 128         or      t10, t7, t7     # treat the count limit like a terminator
 129         br      $a_eos
 130
 131         .end stxncpy_aligned
 132
 133         .align 3
 134         .ent __stxncpy
 135         .globl __stxncpy
 136 __stxncpy:
 137         .frame sp, 0, t9, 0
 138         .prologue 0
 139
 140         /* Entry point; the register interface is described at the top of
 141            this file.  Pick the co-aligned or the unaligned path, and turn
 142            COUNT into a2 (index of the destination word holding the last
 143            byte COUNT allows) and t10 (a one-bit mask selecting that byte).
 144
 145            A COUNT with the sign bit set is first clamped to LONG_MAX.
 146            Without the clamp, biasing such a count by the destination
 147            misalignment can wrap around to a tiny value and cut the copy
 148            short.  The clamp was added after the pipe annotations below
 149            were written; they describe the original schedule only.  */
 150
 151         lda     t2, -1          #       : t2 = all ones
 152         srl     t2, 1, t2       #       : t2 = LONG_MAX
 153         cmovlt  a2, t2, a2      #       : bound negative count to LONG_MAX
 154
 155         /* Are source and destination co-aligned?  */
 156         xor     a0, a1, t1      # e0    :
 157         and     a0, 7, t0       # .. e1 : find dest misalignment
 158         and     t1, 7, t1       # e0    : nonzero iff not co-aligned
 159         addq    a2, t0, a2      # .. e1 : bias count by dest misalignment
 160         subq    a2, 1, a2       # e0    :
 161         and     a2, 7, t2       # e1    :
 162         srl     a2, 3, a2       # e0    : a2 = loop counter = (count - 1)/8
 163         addq    zero, 1, t10    # .. e1 :
 164         sll     t10, t2, t10    # e0    : t10 = bitmask of last count byte
 165         bne     t1, $unaligned  # .. e1 :
 166
 167         /* We are co-aligned; take care of a partial first word.  */
 168
 169         ldq_u   t1, 0(a1)       # e0    : load first src word
 170         addq    a1, 8, a1       # .. e1 :
 171
 172         beq     t0, stxncpy_aligned     # avoid loading dest word if not needed
 173         ldq_u   t0, 0(a0)       # e0    :
 174         br      stxncpy_aligned # .. e1 :
 175
 176
 177 /* The source and destination are not co-aligned.  Align the destination
 178    and cope.  We have to be very careful about not reading too much and
 179    causing a SEGV.  */
 180
 181         .align 3
 182 $u_head:
 183         /* We know just enough now to be able to assemble the first
 184            full source word.  We can still find a zero at the end of it
 185            that prevents us from outputting the whole thing.
 186
 187            On entry to this basic block:
 188            t0 == the first dest word, unmasked
 189            t1 == the shifted low bits of the first source word
 190            t6 == bytemask that is -1 in dest word bytes */
 191
 192         ldq_u   t2, 8(a1)       # e0    : load second src word
 193         addq    a1, 8, a1       # .. e1 :
 194         mskql   t0, a0, t0      # e0    : mask trailing garbage in dst
 195         extqh   t2, a1, t4      # e0    :
 196         or      t1, t4, t1      # e1    : first aligned src word complete
 197         mskqh   t1, a0, t1      # e0    : mask leading garbage in src
 198         or      t0, t1, t0      # e0    : first output word complete
 199         or      t0, t6, t6      # e1    : mask original data for zero test
 200         cmpbge  zero, t6, t7    # e0    :
 201         beq     a2, $u_eocfin   # .. e1 :
 202         bne     t7, $u_final    # e1    :
 203
 204         lda     t6, -1                  # e1    : mask out the bits we have
 205         mskql   t6, a1, t6              # e0    :   already seen
 206         stq_u   t0, 0(a0)               # e0    : store first output word
 207         or      t6, t2, t2              # .. e1 :
 208         cmpbge  zero, t2, t7            # e0    : find nulls in second partial
 209         addq    a0, 8, a0               # .. e1 :
 210         subq    a2, 1, a2               # e0    :
 211         bne     t7, $u_late_head_exit   # .. e1 :
 212
 213         /* Finally, we've got all the stupid leading edge cases taken care
 214            of and we can set up to enter the main loop.  If COUNT ends in the
 215            next destination word, branch away before reading another source
 216            word; $u_eoc works out whether that word is needed at all.  */
 217
 218         extql   t2, a1, t1      #       : position hi-bits of lo word
 219         beq     a2, $u_eoc      #       : next dst word is the last COUNT allows
 220         ldq_u   t2, 8(a1)       #       : read next high-order source word
 221         addq    a1, 8, a1       #       : a1 follows the word now in t2
 222         cmpbge  zero, t2, t7    #       : test new word for eos
 223         bne     t7, $u_eos      #       :
 224
 225         /* Unaligned copy main loop.  In order to avoid reading too much,
 226            the loop is structured to detect zeros in aligned source words,
 227            and the remaining count is tested before each load so that no
 228            source word beyond the one holding the last counted byte is read.
 229            This has, unfortunately, effectively pulled half of a loop
 230            iteration out into the head and half into the tail, but it does
 231            prevent nastiness from accumulating in the very thing we want
 232            to run as fast as possible.
 233
 234            On entry to this basic block:
 235            t1 == the shifted high-order bits from the previous source word
 236            t2 == the unshifted current source word
 237            a1 == an address within the source word held in t2
 238
 239            We further know that t2 does not contain a null terminator.  */
 240
 241         .align 3
 242 $u_loop:
 243         extqh   t2, a1, t0      #       : extract high bits for current word
 244         or      t0, t1, t0      #       : current dst word now complete
 245         extql   t2, a1, t1      #       : extract low bits for next time
 246         stq_u   t0, 0(a0)       #       : save the current word
 247         addq    a0, 8, a0       #       :
 248         subq    a2, 1, a2       #       : one fewer full word left in COUNT
 249         beq     a2, $u_eoc      #       : stop before loading past COUNT
 250         ldq_u   t2, 8(a1)       #       : load high word for next time
 251         addq    a1, 8, a1       #       :
 252         cmpbge  zero, t2, t7    #       : test new word for eos
 253         beq     t7, $u_loop     #       :
 254
 255         /* We've found a zero somewhere in the source word we just read.
 256            If it resides in the lower half, we have one (probably partial)
 257            word to write out, and if it resides in the upper half, we
 258            have one full and one partial word left to write out.
 259
 260            On entry to this basic block:
 261            t1 == the shifted high-order bits from the previous source word
 262            t2 == the unshifted current source word.  */
 263 $u_eos:
 264         extqh   t2, a1, t0      # e0    :
 265         or      t0, t1, t0      # e1    : first (partial) source word complete
 266
 267         cmpbge  zero, t0, t7    # e0    : is the null in this first bit?
 268         bne     t7, $u_final    # .. e1 (zdb)
 269
 270         stq_u   t0, 0(a0)       # e0    : the null was in the high-order bits
 271         addq    a0, 8, a0       # .. e1 :
 272         subq    a2, 1, a2       # e1    :
 273
 274 $u_late_head_exit:
 275         extql   t2, a1, t0      # .. e0 :
 276         cmpbge  zero, t0, t7    # e0    :
 277         or      t7, t10, t6     # e1    : null mask plus end-of-count bit
 278         cmoveq  a2, t6, t7      # e0    : use it only if COUNT ends in this word
 279         nop                     # .. e1 :
 280
 281         /* Take care of a final (probably partial) result word.
 282            On entry to this basic block:
 283            t0 == assembled source word
 284            t7 == cmpbge mask that found the null.  */
 285 $u_final:
 286         negq    t7, t6          # e0    : isolate low bit set
 287         and     t6, t7, t8      # e1    :
 288
 289         and     t8, 0x80, t6    # e0    : avoid dest word load if we can
 290         bne     t6, 1f          # .. e1 (zdb)
 291
 292         ldq_u   t1, 0(a0)       # e0    :
 293         subq    t8, 1, t6       # .. e1 :
 294         or      t6, t8, t7      # e0    :
 295         zapnot  t0, t7, t0      # .. e1 : kill source bytes > null
 296         zap     t1, t7, t1      # e0    : kill dest bytes <= null
 297         or      t0, t1, t0      # e1    :
 298
 299 1:      stq_u   t0, 0(a0)       # e0    :
 300         ret     (t9)            # .. e1 :
 301
 302         /* COUNT ends in the destination word about to be assembled.
 303            On entry to this basic block:
 304            t1 == the shifted high-order bits from the previous source word
 305            a1 == an address within that previous source word; the bytes
 306                  it contributed to t1 are known not to contain a null
 307
 308            t1 already supplies the low 8 - (a1 & 7) bytes of the output word.
 309            If the byte selected by t10 is among them, the next source word
 310            is not needed and must not be read: it may lie beyond the end of
 311            the readable source.  */
 312 $u_eoc:                         # end-of-count
 313         and     a1, 7, t6       #       : byte skew between source and dest
 314         sll     t10, t6, t6     #       : move the end-of-count bit up by the skew
 315         and     t6, 0xff, t6    #       : nonzero iff that byte is already in t1
 316         bne     t6, 1f          #       : yes, skip the extra load
 317
 318         ldq_u   t2, 8(a1)       #       : load final src word
 319         extqh   t2, a1, t0      #       : extract its bits for the last word
 320         or      t1, t0, t1      #       : final source word complete
 321
 322 1:      cmpbge  zero, t1, t7    #       : find nulls; $u_final keeps the lowest bit
 323         mov     t1, t0          #       : $u_final expects the word in t0
 324
 325 $u_eocfin:                      # end-of-count, final word
 326         or      t10, t7, t7     #       : add the end-of-count bit
 327         br      $u_final
 328
 329         /* Unaligned copy entry point.  */
 330         .align 3
 331 $unaligned:
 332
 333         ldq_u   t1, 0(a1)       # e0    : load first source word
 334
 335         and     a0, 7, t4       # .. e1 : find dest misalignment
 336         and     a1, 7, t5       # e0    : find src misalignment
 337
 338         /* Conditionally load the first destination word and a bytemask
 339            with 0xff indicating that the destination byte is sacrosanct.  */
 340
 341         mov     zero, t0        # .. e1 :
 342         mov     zero, t6        # e0    :
 343         beq     t4, 1f          # .. e1 :
 344         ldq_u   t0, 0(a0)       # e0    :
 345         lda     t6, -1          # .. e1 :
 346         mskql   t6, a0, t6      # e0    :
 347 1:
 348         subq    a1, t4, a1      # .. e1 : sub dest misalignment from src addr
 349
 350         /* If source misalignment is larger than dest misalignment, we need
 351            extra startup checks to avoid SEGV.  */
 352
 353         cmplt   t4, t5, t8      # e1    :
 354         extql   t1, a1, t1      # .. e0 : shift src into place
 355         lda     t2, -1          # e0    : for creating masks later
 356         beq     t8, $u_head     # e1    :
 357
 358         mskqh   t2, t5, t2      # e0    : begin src byte validity mask
 359         cmpbge  zero, t1, t7    # .. e1 : is there a zero?
 360         extql   t2, a1, t2      # e0    :
 361         or      t7, t10, t5     # .. e1 : test for end-of-count too
 362         cmpbge  zero, t2, t3    # e0    :
 363         cmoveq  a2, t5, t7      # .. e1 :
 364         andnot  t7, t3, t7      # e0    :
 365         beq     t7, $u_head     # .. e1 (zdb)
 366
 367         /* At this point we've found a zero in the first partial word of
 368            the source.  We need to isolate the valid source data and mask
 369            it into the original destination data.  (Incidentally, we know
 370            that we'll need at least one byte of that original dest word.) */
 371
 372         ldq_u   t0, 0(a0)       # e0    :
 373         negq    t7, t6          # .. e1 : build bitmask of bytes <= zero
 374         mskqh   t1, t4, t1      # e0    :
 375         and     t6, t7, t8      # .. e1 :
 376         subq    t8, 1, t6       # e0    :
 377         or      t6, t8, t7      # e1    :
 378
 379         zapnot  t2, t7, t2      # e0    : prepare source word; mirror changes
 380         zapnot  t1, t7, t1      # .. e1 : to source validity mask
 381
 382         andnot  t0, t2, t0      # e0    : zero place for source to reside
 383         or      t0, t1, t0      # e1    : and put it there
 384         stq_u   t0, 0(a0)       # e0    :
 385         ret     (t9)            # .. e1 :
 386
 387         .end __stxncpy